# Load the user-applied tags table (comma is the reader's default delimiter).
tags = read_csv("./ml/tags.csv")
# Preview the first rows to check the columns parsed as expected.
tags.head()
Out[26]:
userIdmovieIdtagtimestamp
0260756funny1445714994
1260756Highly quotable1445714996
2260756will ferrell1445714992
3289774Boxing story1445715207
4289774MMA1445715200
In [27]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns only; the text `tag` column is left out of the result.
# NOTE(review): userId / movieId are identifiers, so their mean/std carry little
# meaning -- count, min and max are the informative rows here.
tags.describe()
Out[27]:
userIdmovieIdtimestamp
count3683.0000003683.0000003.683000e+03
mean431.14933527252.0135761.320032e+09
std158.47255343490.5588031.721025e+08
min2.0000001.0000001.137179e+09
25%424.0000001262.5000001.137521e+09
50%474.0000004454.0000001.269833e+09
75%477.00000039263.0000001.498457e+09
max610.000000193565.0000001.537099e+09
In [28]:
# (number of rows, number of columns) of the tags table.
tags.shape
Out[28]:
(3683, 4)
In [29]:
# Load the movie catalogue (comma is the reader's default delimiter).
movies = read_csv("./ml/movies.csv")
# Preview the first rows; `genres` is a single pipe-separated string per movie.
movies.head()
Out[29]:
movieIdtitlegenres
01Toy Story (1995)Adventure|Animation|Children|Comedy|Fantasy
12Jumanji (1995)Adventure|Children|Fantasy
23Grumpier Old Men (1995)Comedy|Romance
34Waiting to Exhale (1995)Comedy|Drama|Romance
45Father of the Bride Part II (1995)Comedy
In [30]:
# (number of rows, number of columns) of the movies table.
movies.shape
Out[30]:
(9742, 3)
In [31]:
# Load the ratings table; no `sep` is passed, so the reader's default delimiter applies.
ratings = read_csv("./ml/ratings.csv")
# Preview the first rows.
ratings.head()
Out[31]:
userIdmovieIdratingtimestamp
0114.0964982703
1134.0964981247
2164.0964982224
31475.0964983815
41505.0964982931
In [32]:
# (number of rows, number of columns) of the ratings table.
ratings.shape
Out[32]:
(100836, 4)
In [38]:
# Last ten rows of the ratings table. `.tail()` states the intent directly and
# avoids bare `[]` slicing, which on a DataFrame is easy to confuse with
# column selection.
ratings.tail(10)
Out[38]:
userIdmovieIdratingtimestamp
1008266101623503.51493849971
1008276101639373.51493848789
1008286101639813.51493850155
1008296101641795.01493845631
1008306101665284.01493879365
1008316101665344.01493848402
1008326101682485.01493850091
1008336101682505.01494273047
1008346101682525.01493846352
1008356101708753.01493846415
In [39]:
# How many times each distinct tag was applied, sorted from most to least frequent.
tag_counts = tags['tag'].value_counts()
# The ten most frequently used tags.
tag_counts.head(10)
Out[39]:
In Netflix queue     131
atmospheric           36
superhero             24
thought-provoking     24
funny                 23
surreal               23
Disney                23
religion              22
psychology            21
quirky                21
Name: tag, dtype: int64
In [40]:
# The ten entries at the bottom of the frequency ranking (least-used tags).
tag_counts.tail(10)
Out[40]:
brilliant              1
Insurance              1
parrots                1
President              1
Neil Patrick Harris    1
Renee Zellweger        1
Classic                1
crucifixion            1
Boston                 1
tricky                 1
Name: tag, dtype: int64
In [42]:
# `tag_counts` is a Series indexed by tag text, so a single tag's count is
# retrieved by its label.
tag_counts.loc['sci-fi']
Out[42]:
21
In [44]:
# Bar chart of the ten most frequent tags. A title and axis labels are set so
# the figure can be read on its own when the notebook is skimmed.
ax = tag_counts.head(10).plot.bar(figsize=(15, 10))
ax.set_title("Ten most frequent tags")
ax.set_xlabel("tag")
ax.set_ylabel("number of times applied")
# Keep the Axes as the cell's value, as in the original cell.
ax
Out[44]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fbac652c320>
In [45]:
# Boolean mask over the ratings rows: True where the rating is 4.0 or higher.
is_highly_rated = ratings['rating'].ge(4.0)
# Preview the first few mask values.
is_highly_rated.head()
Out[45]:
0    True
1    True
2    True
3    True
4    True
Name: rating, dtype: bool
In [46]:
# Keep only the highly rated rows, then show the last five of them.
ratings.loc[is_highly_rated].tail(5)
Out[46]:
userIdmovieIdratingtimestamp
1008306101665284.01493879365
1008316101665344.01493848402
1008326101682485.01493850091
1008336101682505.01494273047
1008346101682525.01493846352
In [49]:
# Boolean mask over the movies rows: True where the pipe-separated `genres`
# string contains "Animation".
#   regex=False -> plain substring match; the pattern is not meant as a regex.
#   na=False    -> a missing genre counts as "not animation", so the mask stays
#                  strictly boolean and is always safe to index with.
is_animation = movies['genres'].str.contains('Animation', regex=False, na=False)
# Preview the first few mask values.
is_animation.head()
Out[49]:
0     True
1    False
2    False
3    False
4    False
Name: genres, dtype: bool
In [50]:
# Filter to animation movies, then take the 6th through 15th of them by position.
movies.loc[is_animation].iloc[5:15]
Out[50]:
movieIdtitlegenres
322364Lion King, The (1994)Adventure|Animation|Children|Drama|Musical|IMAX
483551Nightmare Before Christmas, The (1993)Animation|Children|Fantasy|Musical
488558Pagemaster, The (1994)Action|Adventure|Animation|Children|Fantasy
506588Aladdin (1992)Adventure|Animation|Children|Comedy|Musical
511594Snow White and the Seven Dwarfs (1937)Animation|Children|Drama|Fantasy|Musical
512595Beauty and the Beast (1991)Animation|Children|Fantasy|Musical|Romance|IMAX
513596Pinocchio (1940)Animation|Children|Fantasy|Musical
522610Heavy Metal (1981)Action|Adventure|Animation|Horror|Sci-Fi
527616Aristocats, The (1970)Animation|Children
534631All Dogs Go to Heaven 2 (1996)Adventure|Animation|Children|Fantasy|Musical|R...
In [51]:
# Number of ratings given at each rating value; the `movieId` column holds the
# count and the rating value is the index.
ratings_count = ratings.groupby('rating')[['movieId']].count()
ratings_count.head()
Out[51]:
movieId
rating
0.51370
1.02811
1.51791
2.07551
2.55550
In [54]:
# Mean rating per movie, indexed by movieId.
average_rating = ratings.groupby('movieId')[['rating']].mean()
# Show the last few movies (highest movieId values).
average_rating.tail()
Out[54]:
rating
movieId
1935814.0
1935833.5
1935853.5
1935873.5
1936094.0
In [55]:
# Number of ratings each movie received, indexed by movieId. This counts rows of
# the ratings table per movie; it is not the number of movies in movies.csv.
movie_count = ratings.groupby('movieId')[['rating']].count()
movie_count.head()
Out[55]:
rating
movieId
1215
2110
352
47
549
In [ ]:


+ Recent posts