Project: MovieLens Case Study
In [1]: #Import the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
In [2]: df1 = pd.read_csv("movies.dat", sep='::', header=None, engine='python')
df1.columns = ['MovieID','Title','Genres']
df1.head()
Out[2]:
MovieID Title Genres
0 1 Toy Story (1995) Animation|Children's|Comedy
1 2 Jumanji (1995) Adventure|Children's|Fantasy
2 3 Grumpier Old Men (1995) Comedy|Romance
3 4 Waiting to Exhale (1995) Comedy|Drama
4 5 Father of the Bride Part II (1995) Comedy
In [3]: df1.info()  # column dtypes, non-null counts and memory usage of the movies table
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 3 columns):
MovieID 3883 non-null int64
Title 3883 non-null object
Genres 3883 non-null object
dtypes: int64(1), object(2)
memory usage: 91.1+ KB
In [4]: df2 = pd.read_csv("ratings.dat", sep='::', header=None, engine='python')
df2.columns = ['UserID','MovieID','Rating','Timestamp']
df2.head()
Out[4]:
UserID MovieID Rating Timestamp
0 1 1193 5 978300760
1 1 661 3 978302109
2 1 914 3 978301968
3 1 3408 4 978300275
4 1 2355 5 978824291
In [5]: df2.info()  # column dtypes, non-null counts and memory usage of the ratings table
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
UserID 1000209 non-null int64
MovieID 1000209 non-null int64
Rating 1000209 non-null int64
Timestamp 1000209 non-null int64
dtypes: int64(4)
memory usage: 30.5 MB
In [6]: df3 = pd.read_csv("users.dat", sep='::', header=None, engine='python')
df3.columns = ['UserID','Gender','Age','Occupation','Zip-code']
df3.head()
Out[6]:
UserID Gender Age Occupation Zip-code
0 1 F 1 10 48067
1 2 M 56 16 70072
2 3 M 25 15 55117
3 4 M 45 7 02460
4 5 M 25 20 55455
In [7]: df3.info()  # column dtypes, non-null counts and memory usage of the users table
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6040 entries, 0 to 6039
Data columns (total 5 columns):
UserID 6040 non-null int64
Gender 6040 non-null object
Age 6040 non-null int64
Occupation 6040 non-null int64
Zip-code 6040 non-null object
dtypes: int64(3), object(2)
memory usage: 236.0+ KB
Merging datasets
In [8]: finalDF = pd.merge(pd.merge(df1,df2, on = 'MovieID'),df3, on = 'UserID')
finalDF.head()
Out[8]:
Zip-
MovieID Title Genres UserID Rating Timestamp Gender Age Occupation
code
Toy Story
0 1 Animation|Children's|Comedy 1 5 978824268 F 1 10 48067
(1995)
Pocahontas
1 48 Animation|Children's|Musical|Romance 1 5 978824351 F 1 10 48067
(1995)
Apollo 13
2 150 Drama 1 5 978301777 F 1 10 48067
(1995)
Star Wars:
Episode IV -
3 260 Action|Adventure|Fantasy|Sci-Fi 1 4 978300760 F 1 10 48067
A New Hope
(1977)
Schindler's
4 527 Drama|War 1 5 978824195 F 1 10 48067
List (1993)
User Age Distribution
In [9]: users_Age = df3.groupby(['Age']).size()
users_Age
Out[9]: Age
1 222
18 1103
25 2096
35 1193
45 550
50 496
56 380
dtype: int64
In [10]: plt.figure(figsize = (4,6))
users_Age.plot.bar(color='r',width=.4,alpha=0.8)
plt.title("User Age Distribution")
plt.xlabel("Age")
plt.ylabel("No of Users")
plt.show()
In [11]: plt.figure(figsize = (4,6))
plt.hist(df3['Age'])
plt.xlabel('Age Distribution')
plt.ylabel('No. of Users')
plt.show()
Overall rating by users
In [12]: df2['Rating'].unique()  # distinct rating values present in the ratings table
Out[12]: array([5, 3, 4, 2, 1], dtype=int64)
In [13]: users_Overall_Ratings = df2.groupby(['Rating'],axis = 0).UserID.size()
print (users_Overall_Ratings)
Rating
1 56174
2 107557
3 261197
4 348971
5 226310
Name: UserID, dtype: int64
In [14]: plt.figure(figsize = (4,6))
users_Overall_Ratings.plot.bar(color='r',width=.4,alpha=0.8)
plt.xlabel('Ratings')
plt.ylabel('No. of Users')
plt.show()
User rating of the movie “Toy Story”
In [15]: MovieTitles= finalDF["Title"].unique()
toyMovie = []
for i in MovieTitles:
if i.startswith("Toy Story") == True:
toyMovie.append(i)
toyMovie
Out[15]: ['Toy Story (1995)', 'Toy Story 2 (1999)']
In [16]: Toy1995 = finalDF[finalDF['Title'] == 'Toy Story (1995)']
Toy1995_rating = Toy1995.groupby('Rating')['UserID'].count()
print(Toy1995_rating)
Toy1995_rating.plot(kind='bar')
Rating
1 16
2 61
3 345
4 835
5 820
Name: UserID, dtype: int64
Out[16]: <matplotlib.axes._subplots.AxesSubplot at 0xb8771d0>
In [17]: Toy1995_rating.plot(kind='pie')
Out[17]: <matplotlib.axes._subplots.AxesSubplot at 0xb873c50>
In [18]: Toy1999 = finalDF[finalDF['Title'] == 'Toy Story 2 (1999)']
Toy1999_rating = Toy1999.groupby('Rating')['UserID'].count()
print(Toy1999_rating)
Toy1999_rating.plot(kind='bar')
Rating
1 25
2 44
3 214
4 578
5 724
Name: UserID, dtype: int64
Out[18]: <matplotlib.axes._subplots.AxesSubplot at 0x1c97f0f0>
In [19]: Toy1999_rating.plot(kind='pie')
Out[19]: <matplotlib.axes._subplots.AxesSubplot at 0x1c98dd68>
In [20]: Toy = finalDF[(finalDF['Title'].isin(['Toy Story 2 (1999)','Toy Story (1995)']))]
Toy_rating = Toy.groupby('Rating')['UserID'].count()
print(Toy_rating)
Toy_rating.plot(kind='bar')
Rating
1 41
2 105
3 559
4 1413
5 1544
Name: UserID, dtype: int64
Out[20]: <matplotlib.axes._subplots.AxesSubplot at 0xb842550>
In [21]: Toy_rating.plot(kind='pie')
Out[21]: <matplotlib.axes._subplots.AxesSubplot at 0xb860358>
Viewership of the movie “Toy Story” by age group
In [22]: Rating_Toy_Story = (Toy).groupby('Age',axis=0).Rating.count()
plt.figure(figsize = (4,6))
Rating_Toy_Story.plot.bar(color='magenta',width=.4,alpha=0.8)
plt.xlabel('User Age rates Toy Story')
plt.ylabel('No. of Users')
plt.show()
Top 25 movies by viewership rating
In [23]: Top = finalDF.groupby('MovieID')['Rating'].count().nlargest(25)
print(Top)
Top.plot(kind='bar')
plt.xlabel('Movie ID Num')
plt.ylabel('Ratings Count')
plt.show()
MovieID
2858 3428
260 2991
1196 2990
1210 2883
480 2672
2028 2653
589 2649
2571 2590
1270 2583
593 2578
1580 2538
1198 2514
608 2513
2762 2459
110 2443
2396 2369
1197 2318
527 2304
1617 2288
1265 2278
1097 2269
2628 2250
2997 2241
318 2227
858 2223
Name: Rating, dtype: int64
In [24]: Top.plot(kind ='pie')
Out[24]: <matplotlib.axes._subplots.AxesSubplot at 0x1c9654a8>
Ratings given by the user with UserID = 2696
In [25]: Rating_of_2696 = finalDF[finalDF['UserID'] == 2696].groupby('Rating')['Rating'].count()
print(Rating_of_2696)
Rating_of_2696.plot(kind='bar')
plt.xlabel('Ratings by User 2696')
plt.ylabel('Rating Counts')
plt.show()
Rating
1 2
2 3
3 3
4 11
5 1
Name: Rating, dtype: int64
In [26]: Rating_of_2696.plot(kind='pie')
Out[26]: <matplotlib.axes._subplots.AxesSubplot at 0x1c99dfd0>
In [27]: #Find out all the unique genres
finalDF['Genres'].unique
Out[27]: <bound method Series.unique of 0 Animation|Children's|Comedy
1 Animation|Children's|Musical|Romance
2 Drama
3 Action|Adventure|Fantasy|Sci-Fi
4 Drama|War
5 Children's|Drama
6 Animation|Children's|Comedy|Musical
7 Animation|Children's|Musical
8 Animation|Children's|Musical
9 Crime|Drama|Thriller
10 Animation|Children's|Musical
11 Animation
12 Animation|Comedy|Thriller
13 Animation|Children's|Musical
14 Musical|Romance
15 Adventure|Children's|Drama|Musical
16 Musical
17 Animation|Children's|Musical
18 Children's|Comedy|Musical
19 Animation|Children's|Musical
20 Musical
21 Children's|Drama|Fantasy|Sci-Fi
22 Drama
23 Action|Adventure|Comedy|Romance
24 Drama
25 Drama
26 Comedy|Sci-Fi
27 Action|Adventure|Drama
28 Drama
29 Adventure|Animation|Children's|Comedy|Musical
...
1000179 Action|Thriller
1000180 Comedy
1000181 Comedy|Romance
1000182 Sci-Fi|Thriller
1000183 Thriller
1000184 Thriller
1000185 Comedy|Drama
1000186 Action|Thriller
1000187 Comedy
1000188 Thriller
1000189 Drama|War
1000190 Horror|Romance
1000191 Action|Thriller
1000192 Animation|Children's|Comedy
1000193 Drama|Mystery|Thriller
1000194 Comedy
1000195 Horror|Mystery|Thriller
1000196 Comedy|Crime
1000197 Action|Thriller
1000198 Comedy|Drama
1000199 Drama
1000200 Drama|Thriller
1000201 Comedy
1000202 Animation|Children's
1000203 Thriller
1000204 Drama|Thriller
1000205 Comedy|Horror|Thriller
1000206 Comedy|Romance
1000207 Action|Thriller
1000208 Action|Drama
Name: Genres, Length: 1000209, dtype: object>
In [28]: #one-hot encoding for genre
ohe_genre=pd.concat([pd.get_dummies(finalDF['Genres']),finalDF.iloc[:,[0,1,3,4,5,6,7,8,9]]],axis=1)
ohe_genre.head()
Out[28]:
Action|Adventure|Anima
Action Action|Adventure Action|Adventure|Animation Action|Adventure|Animation|Children's|Fantasy
0 0 0 0 0
1 0 0 0 0
2 0 0 0 0
3 0 0 0 0
4 0 0 0 0
5 rows × 310 columns
Machine Learning
In [29]: #creating features and label by taking sample data
features = finalDF.iloc[:500,[0,7,8]]
label = finalDF.iloc[:500,4]
In [30]: features.info()  # confirm the sample has 500 rows and three integer feature columns
<class 'pandas.core.frame.DataFrame'>
Int64Index: 500 entries, 0 to 499
Data columns (total 3 columns):
MovieID 500 non-null int64
Age 500 non-null int64
Occupation 500 non-null int64
dtypes: int64(3)
memory usage: 15.6 KB
In [31]: #train_test_split
X_train, X_test, y_train, y_test = train_test_split(features,label,test_size=0.2,random_state=912)
In [32]: #create model
model = KNeighborsClassifier(n_neighbors=15)
model.fit(X_train,y_train)
Out[32]: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=None, n_neighbors=15, p=2,
weights='uniform')
In [33]: print(model.score(X_train,y_train))
print(model.score(X_test,y_test))
0.4575
0.47
Histogram for movie
In [34]: plt.figure(figsize = (4,6))
plt.hist(finalDF['MovieID'],color = 'b')
plt.xlabel('movie id')
plt.show()
Histogram for age
In [35]: plt.figure(figsize = (4,6))
plt.hist(finalDF['Age'],color = 'g')
plt.xlabel('Age Distribution')
plt.show()
Histogram for Occupation
In [36]: plt.figure(figsize = (4,6))
plt.hist(finalDF['Occupation'],color = 'r')
plt.xlabel('occupation id')
plt.show()