Recommendation Systems (RS) are a subclass of information filtering systems that provide suggestions for the items most pertinent to a particular user. Typically, the suggestions support various decision-making processes, such as what product to purchase, what music to listen to, or what online news to read. Recommender systems are particularly useful when an individual needs to choose an item from a potentially overwhelming number of items that a service may offer. There are many types of RS; this project explores three of them: a simple content-based approach built on rating correlations, model-based Collaborative Filtering via SVD matrix factorization, and memory-based Collaborative Filtering via cosine similarity.
We begin the project by importing the libraries and files to work with. The files used here contain data from many user evaluations of movie titles, rated from 1 (bad) to 5 (excellent) stars.
Before starting to look into the data, we have to combine two files to translate each movie_id into its movie title. Then, we can perform an exploratory analysis of the DataFrame.
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
movies = pd.read_csv('Refactored_Py_DS_ML_Bootcamp-master/19-Recommender-Systems/u.data',
sep = '\t', names=['user_id', 'item_id', 'rating', 'timestamp'])
movies.head()
 | user_id | item_id | rating | timestamp |
---|---|---|---|---|
0 | 0 | 50 | 5 | 881250949 |
1 | 0 | 172 | 5 | 881250949 |
2 | 0 | 133 | 1 | 881250949 |
3 | 196 | 242 | 3 | 881250949 |
4 | 186 | 302 | 3 | 891717742 |
titles = pd.read_csv('Refactored_Py_DS_ML_Bootcamp-master/19-Recommender-Systems/Movie_Id_Titles')
titles.head()
 | item_id | title |
---|---|---|
0 | 1 | Toy Story (1995) |
1 | 2 | GoldenEye (1995) |
2 | 3 | Four Rooms (1995) |
3 | 4 | Get Shorty (1995) |
4 | 5 | Copycat (1995) |
df = pd.merge(movies, titles, on='item_id', how = 'inner')
df.head()
 | user_id | item_id | rating | timestamp | title |
---|---|---|---|---|---|
0 | 0 | 50 | 5 | 881250949 | Star Wars (1977) |
1 | 290 | 50 | 5 | 880473582 | Star Wars (1977) |
2 | 79 | 50 | 4 | 891271545 | Star Wars (1977) |
3 | 2 | 50 | 5 | 888552084 | Star Wars (1977) |
4 | 8 | 50 | 5 | 879362124 | Star Wars (1977) |
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 100003 entries, 0 to 100002
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   user_id    100003 non-null  int64
 1   item_id    100003 non-null  int64
 2   rating     100003 non-null  int64
 3   timestamp  100003 non-null  int64
 4   title      100003 non-null  object
dtypes: int64(4), object(1)
memory usage: 4.6+ MB
A quick look shows that the DataFrame contains ~100,000 non-null records of user-movie ratings. Let's continue the process with EDA:
Here we search for insights within the data: we check distributions, aggregate information, and engineer features to analyze before building the first recommender system.
n_titles = df['title'].nunique()
n_users = df['user_id'].nunique()
print('movies: ',n_titles)
print('users: ', n_users)
movies:  1664
users:  944
#checking how ratings are distributed in DF:
plt.figure(figsize=(8,5))
sns.set_style('whitegrid')
sns.histplot(data = df, x = 'rating', color = 'green', discrete = True)
[Histogram: distribution of ratings (x = rating, y = count)]
#aggregating relevant information by title
ByTitle = df.groupby('title')['rating'].agg(['count', 'mean', 'std'])
#To ease understanding, we rename the columns
ByTitle.rename(columns = {'count': 'num of rates', 'mean': 'rating', 'std': 'rate_std'}, inplace = True)
# Quick look to new data
ByTitle[ByTitle['num of rates'] >= 5].sort_values('rating', ascending = False).head()
title | num of rates | rating | rate_std |
---|---|---|---|
Pather Panchali (1955) | 8 | 4.625000 | 0.517549 |
Close Shave, A (1995) | 112 | 4.491071 | 0.771047 |
Schindler's List (1993) | 298 | 4.466443 | 0.829109 |
Wrong Trousers, The (1993) | 118 | 4.466102 | 0.823607 |
Casablanca (1942) | 243 | 4.456790 | 0.728114 |
# Can we see any correlations inside this data?
#pairplot draws its own figure grid, so no plt.figure() is needed
sns.set_style('whitegrid')
sns.set_context('notebook')
sns.pairplot(ByTitle)
[Pairplot: pairwise relationships and distributions of num of rates, rating and rate_std]
For this first model, we build a User-Item Matrix holding every user's rating for every movie (and NaN where no rating exists). We then choose a movie that is likely a blockbuster (one with many reviews) and build recommendations from it.
UI_mat = df.pivot_table(index = 'user_id', columns='title', values = 'rating')
UI_mat.head()
user_id \ title | 'Til There Was You (1997) | 1-900 (1994) | 101 Dalmatians (1996) | 12 Angry Men (1957) | 187 (1997) | 2 Days in the Valley (1996) | 20,000 Leagues Under the Sea (1954) | 2001: A Space Odyssey (1968) | 3 Ninjas: High Noon At Mega Mountain (1998) | 39 Steps, The (1935) | ... | Yankee Zulu (1994) | Year of the Horse (1997) | You So Crazy (1994) | Young Frankenstein (1974) | Young Guns (1988) | Young Guns II (1990) | Young Poisoner's Handbook, The (1995) | Zeus and Roxanne (1997) | unknown | Á köldum klaka (Cold Fever) (1994) |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | NaN | NaN | 2.0 | 5.0 | NaN | NaN | 3.0 | 4.0 | NaN | NaN | ... | NaN | NaN | NaN | 5.0 | 3.0 | NaN | NaN | NaN | 4.0 | NaN |
2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | NaN | NaN | NaN | NaN | 2.0 | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 1664 columns
rates = pd.DataFrame(df['title'].value_counts().sort_values(ascending = False).head(10))
plt.figure(figsize=(20,10))
sns.barplot(data = rates, x = rates.index, y = 'title')
plt.tick_params(labelsize = 15, labelrotation = 45)
#set the label after the plot so seaborn doesn't overwrite it
plt.ylabel('num of rates')
[Bar plot: num of rates for the 10 most-rated movies]
#selecting Toy Story (1995) as the movie to base recommendations on
TS_rates = UI_mat['Toy Story (1995)']
similarTo_TS = UI_mat.corrwith(TS_rates)
C:\ProgramData\Anaconda3\lib\site-packages\numpy\lib\function_base.py:2634: RuntimeWarning: Degrees of freedom <= 0 for slice
  c = cov(x, y, rowvar, dtype=dtype)
C:\ProgramData\Anaconda3\lib\site-packages\numpy\lib\function_base.py:2493: RuntimeWarning: divide by zero encountered in true_divide
  c *= np.true_divide(1, fact)
cor_TS = pd.DataFrame(similarTo_TS, columns = ['correl'])
cor_TS.sort_values('correl',ascending = False)
title | correl |
---|---|
Scarlet Letter, The (1995) | 1.0 |
Substance of Fire, The (1996) | 1.0 |
Stranger, The (1994) | 1.0 |
Wooden Man's Bride, The (Wu Kui) (1994) | 1.0 |
Newton Boys, The (1998) | 1.0 |
... | ... |
Woman in Question, The (1950) | NaN |
Wonderland (1997) | NaN |
Yankee Zulu (1994) | NaN |
You So Crazy (1994) | NaN |
Á köldum klaka (Cold Fever) (1994) | NaN |
1664 rows × 1 columns
#since the index is the movie titles, we can join 'num of rates' from the ByTitle DF
cor_TS['num of rates'] = ByTitle['num of rates']
cor_TS[cor_TS['num of rates'] >= 25].sort_values('correl', ascending = False)['correl'].head(10)
title
Toy Story (1995)                       1.000000
Transformers: The Movie, The (1986)    0.753673
Mouse Hunt (1997)                      0.736826
Farewell My Concubine (1993)           0.672918
Paris Is Burning (1990)                0.668410
Fallen (1998)                          0.654585
187 (1997)                             0.651857
Raise the Red Lantern (1991)           0.641535
Eddie (1996)                           0.623460
Associate, The (1996)                  0.620767
Name: correl, dtype: float64
This is a list of the 10 movies most similar to Toy Story. We can simplify this process with a function that prints the most similar movies for any given title:
def CBsys(Mov_title, UImat, movie_rates_count, minViews):
    ''' Doc:
    Mov_title: str, name of the movie as it appears in the User-Item matrix
    ---
    UImat: User-Item Matrix (movies in columns)
    ---
    movie_rates_count: pandas Series with the num of rates of each movie
    ---
    minViews: the minimum number of reviews (ratings) required before the correlation is reported
    '''
    myMov = UImat[Mov_title]
    correl = pd.DataFrame(UImat.corrwith(myMov), columns = ['cor'])
    correl['views'] = movie_rates_count
    print(correl[correl['views'] >= minViews].sort_values('cor', ascending = False).head(10))
#testing for Air Force One (1997)
CBsys('Air Force One (1997)', UI_mat, ByTitle['num of rates'], 20)
C:\ProgramData\Anaconda3\lib\site-packages\numpy\lib\function_base.py:2634: RuntimeWarning: Degrees of freedom <= 0 for slice
  c = cov(x, y, rowvar, dtype=dtype)
C:\ProgramData\Anaconda3\lib\site-packages\numpy\lib\function_base.py:2493: RuntimeWarning: divide by zero encountered in true_divide
  c *= np.true_divide(1, fact)
                                              cor  views
title
Air Force One (1997)                     1.000000    431
House of the Spirits, The (1993)         0.774597     24
Houseguest (1994)                        0.774597     24
Black Sheep (1996)                       0.766259     55
Little Buddha (1993)                     0.746729     22
Copycat (1995)                           0.711744     86
Higher Learning (1995)                   0.696826     30
Associate, The (1996)                    0.696300     41
It's My Party (1995)                     0.689100     21
Ma vie en rose (My Life in Pink) (1997)  0.681221     20
Model-based Collaborative Filtering finds underlying patterns in the data in the form of latent variables, and uses them to predict how any user would be expected to rate any item.
To prepare the model, we need to create user-item matrices for training and testing. First, we fill both matrices with zeros, one row per user and one column per movie. Then, we use the pandas itertuples() function to replace the corresponding entries with the ratings from each DataFrame sample (one for training and one for testing).
After we have built the train and test matrices, we rely on a technique called matrix factorization (MF), which restructures the user-item matrix into a low-rank form, representing it as the product of two low-rank matrices whose rows contain the latent vectors. We fit these factors so that their product approximates the original matrix as closely as possible; multiplying the low-rank matrices together also fills in the entries missing from the original matrix.
df2 = pd.merge(movies, titles, on= 'item_id')
n_users = df2['user_id'].nunique()
n_titles = df2['item_id'].nunique()
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(df2,
test_size = 0.2,
random_state=101)
#creating user-item matrices
#train
train_data_matrix = np.zeros((n_users, n_titles))
for line in train_data.itertuples():
train_data_matrix[line[1]-1, line[2]-1] = line[3]
#test
test_data_matrix = np.zeros((n_users, n_titles))
for line in test_data.itertuples():
test_data_matrix[line[1]-1, line[2]-1] = line[3]
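Most entries in these matrices stay at zero, since each user rates only a small fraction of the catalogue. As a quick sanity check (a small sketch, nothing beyond NumPy assumed), we can measure the sparsity of the training matrix:
sparsity = 1.0 - np.count_nonzero(train_data_matrix) / train_data_matrix.size
print(f'train matrix sparsity: {sparsity:.1%}')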
A well-known matrix factorization method is Singular Value Decomposition (SVD). Collaborative Filtering can be formulated as approximating a matrix $X$ via singular value decomposition. The general equation can be expressed as follows:

$$X = U S V^T$$

Given an $m \times n$ matrix $X$:

  * $U$ is an $(m \times r)$ orthogonal matrix
  * $S$ is an $(r \times r)$ diagonal matrix with non-negative real numbers on the diagonal
  * $V^T$ is an $(r \times n)$ orthogonal matrix

The elements on the diagonal of $S$ are known as the singular values of $X$. Matrix $X$ can thus be factorized into $U$, $S$ and $V$. The $U$ matrix represents the feature vectors corresponding to the users in the hidden feature space, and the $V$ matrix represents the feature vectors corresponding to the items in the hidden feature space. Now we can make predictions by taking the dot product of $U$, $S$ and $V^T$.
import scipy.sparse as sp
from scipy.sparse.linalg import svds
#truncated SVD: keep only the k largest singular values (latent factors)
u, s, vt = svds(train_data_matrix, k = 5)
#svds returns s as a 1D array, so we rebuild the diagonal matrix S
s_diag = np.diag(s)
#predictions: X ~ U.S.V^T
pred_matrix = np.dot(np.dot(u, s_diag), vt)
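To see what the factorization gives us in practice, here is a sketch of a hypothetical helper, top_n_unseen (not part of the original notebook), that ranks the movies a user has not rated in training by their predicted score, mapping matrix columns back to titles via item_id = column index + 1, mirroring how the matrices were filled above:
def top_n_unseen(user_idx, n=5):
    #hypothetical helper: best predicted scores among movies unseen in training
    scores = pred_matrix[user_idx].copy()
    scores[train_data_matrix[user_idx] > 0] = -np.inf   #mask movies already rated
    best_cols = np.argsort(scores)[::-1][:n]
    id_to_title = titles.set_index('item_id')['title']
    return [id_to_title.get(col + 1, 'unknown') for col in best_cols]

top_n_unseen(0)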
To evaluate this model, we flatten the prediction matrix and the test matrix into 1D arrays and compare how similar they are. To do so, we also need to drop the entries with no evaluation in the test matrix, i.e. the zeros, keeping only the positions that actually hold a test rating.
from sklearn.metrics import mean_squared_error
from math import sqrt
#To calculate the model precision, we flatten both matrices into 1D arrays and compare only the positions of the test matrix that have been evaluated!
MSE = mean_squared_error(pred_matrix[test_data_matrix.nonzero()].flatten(),
test_data_matrix[test_data_matrix.nonzero()].flatten())
RMSE = sqrt(MSE)
#Lastly, let's print the results:
print(f'MSE is: {MSE:.2f}')
print(f'RMSE is: {RMSE:.2f}')
MSE is: 7.19
RMSE is: 2.68
Considering the results above, the predictions deviate from the true ratings by 2.68 stars on average (the RMSE). We might also try grouping movies, e.g. by genre or by decade of release, to improve the model's precision.
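For example, the release year is already embedded in each title string, so a decade feature can be extracted directly; a minimal sketch (the regex and the new column names are illustrative, not from the original notebook):
#pull the 4-digit year out of titles like 'Toy Story (1995)' and bucket by decade
df['year'] = df['title'].str.extract(r'\((\d{4})\)', expand=False).astype(float)
df['decade'] = (df['year'] // 10) * 10
df.groupby('decade')['rating'].agg(['count', 'mean']).tail()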
Memory-Based Collaborative Filtering approaches can be divided into two main sections:
User-item filtering will take a particular user, find users that are similar to that user based on similarity of ratings, and recommend items that those similar users liked.
Item-item filtering will take an item, find users who liked that item, and find other items that those users or similar users also liked. It takes items and outputs other items as recommendations.
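The item-item variant can be sketched directly from the train_data_matrix built earlier (a minimal sketch, not run in this notebook):
from sklearn.metrics import pairwise_distances
#cosine distance between items: transpose so rows are movies
item_sim = pairwise_distances(train_data_matrix.T, metric='cosine')
#item-based prediction: weighted combination of the user's own ratings
item_pred = train_data_matrix.dot(item_sim) / np.array([np.abs(item_sim).sum(axis=1)])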
Here we choose user-item filtering to predict movie ratings.
For this model, we use a distance metric to calculate the similarity between users.
A distance metric commonly used in recommender systems is cosine similarity, where the ratings are seen as vectors in $n$-dimensional space and the similarity is calculated based on the angle between these vectors.
The cosine similarity for users $k$ and $a$ can be calculated with the formula below, where we take the dot product of the user vectors $u_k$ and $u_a$ and divide it by the product of their Euclidean lengths:

$$s_u^{cos}(u_k, u_a) = \frac{u_k \cdot u_a}{\left\|u_k\right\| \left\|u_a\right\|}$$
from sklearn.metrics import pairwise_distances
#note: with metric='cosine' this returns the cosine *distance* (1 - cosine similarity)
u_sim = pairwise_distances(train_data_matrix, metric ='cosine')
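Since pairwise_distances yields the cosine distance rather than the similarity itself, a quick sketch can confirm the relationship against sklearn's cosine_similarity:
from sklearn.metrics.pairwise import cosine_similarity
#cosine distance and cosine similarity of the same pair of users sum to 1
np.allclose(1 - u_sim, cosine_similarity(train_data_matrix))  #-> True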
Next, we can use the metric to make predictions, applying the following formula for user-based CF:

$$\hat{x}_{k,m} = \bar{x}_{u_k} + \frac{\sum_{u_a} sim_u(u_k, u_a)\,(x_{a,m} - \bar{x}_{u_a})}{\sum_{u_a} \left|sim_u(u_k, u_a)\right|}$$

We can look at the similarity between users $k$ and $a$ as weights that are multiplied by the ratings of the similar user $a$ (corrected for the average rating of that user). Before applying the formula, we normalize it so that the ratings stay between 1 and 5, and, as a final step, we add back the average rating of the user we are trying to predict.
The idea here is that some users may tend always to give high or low ratings to all movies. The relative difference in the ratings that these users give is more important than the absolute values.
mean_rate = train_data_matrix.mean(axis=1)
#to take the difference between each rating and the user's mean, broadcast the mean back to the matrix shape
rate_diff = (train_data_matrix - mean_rate[:,np.newaxis])
#use the above similarity to calculate the formula
pred = mean_rate[:,np.newaxis] +(u_sim.dot(rate_diff) / np.array([np.abs(u_sim).sum(axis = 1)]).T)
#EVALUATION
score = sqrt(mean_squared_error(pred[test_data_matrix.nonzero()].flatten(), test_data_matrix[test_data_matrix.nonzero()].flatten()))
print(f'RMSE for the memory-based CF:\n {score:.2f}')
RMSE for the memory-based CF:
 3.09
To conclude, every recommender system has its pros and cons, and each can suit a given problem and dataset better than another. Collaborative Filtering usually performs better, but requires more computational capacity (and sometimes more data). In this case, the model-based system with SVD matrix factorization performed better in terms of RMSE than the memory-based system with cosine similarity. The content-based system is simpler, though it cannot be evaluated with a metric like RMSE and is less likely to outperform the former ones.