MovieLens 1M Dataset
In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sys
import os
import shutil
from pathlib import Path
sys.path.append("../")
from helpers.utils import download_by_url
In [2]:
# Download the MovieLens 1M archive and unpack it. Per the logged output the
# helper returns the directory the archive was extracted into.
DATASET_DIR = download_by_url(
    "https://files.grouplens.org/datasets/movielens/ml-1m.zip",
    "../input/movielens_1m_ds/ml-1m.zip",
    unzip=True,
)

# The archive wraps its files in an "ml-1m" folder; hoist them up one level so
# the .dat files sit directly under DATASET_DIR.
src_dir = os.path.join(DATASET_DIR, "ml-1m")

# Guard for re-runs: if the folder was already flattened and the helper did not
# re-extract it (NOTE(review): helper caching behaviour is not visible here),
# os.listdir would raise FileNotFoundError. Skip the move in that case.
if os.path.isdir(src_dir):
    for f in os.listdir(src_dir):
        shutil.move(os.path.join(src_dir, f), os.path.join(DATASET_DIR, f))
    shutil.rmtree(src_dir)
Downloading https://files.grouplens.org/datasets/movielens/ml-1m.zip to ../input/movielens_1m_ds/ml-1m.zip ✅ Download complete: ../input/movielens_1m_ds/ml-1m.zip Unzipping ../input/movielens_1m_ds/ml-1m.zip ✅ Unzipped to: ../input/movielens_1m_ds
In [3]:
# Walk the dataset directory and print the path of every file found, one per line.
for root, _subdirs, names in os.walk(DATASET_DIR):
    paths = [os.path.join(root, name) for name in names]
    if paths:
        print("\n".join(paths))
../input/movielens_1m_ds\movies.dat ../input/movielens_1m_ds\ratings.dat ../input/movielens_1m_ds\README ../input/movielens_1m_ds\users.dat
In [4]:
# Show the dataset directory path returned by the download step.
DATASET_DIR
Out[4]:
'../input/movielens_1m_ds'
In [5]:
# users.dat has no header row, so the column labels are supplied here.
unames = ["user_id", "gender", "age", "occupation", "zip"]
users = pd.read_csv(
    os.path.join(DATASET_DIR, "users.dat"),
    sep="::",  # multi-character separator, which requires the python engine
    header=None,
    names=unames,
    # ZIP codes are labels, not numbers: pin them to str so leading zeros
    # (e.g. "02460") never depend on dtype inference.
    dtype={"zip": str},
    engine="python",
)
users.head()
Out[5]:
user_id | gender | age | occupation | zip | |
---|---|---|---|---|---|
0 | 1 | F | 1 | 10 | 48067 |
1 | 2 | M | 56 | 16 | 70072 |
2 | 3 | M | 25 | 15 | 55117 |
3 | 4 | M | 45 | 7 | 02460 |
4 | 5 | M | 25 | 20 | 55455 |
In [6]:
# Load the ratings table. The file ships without a header row, so the column
# labels come from `rnames`.
rnames = ["user_id", "movie_id", "rating", "timestamp"]
ratings = pd.read_csv(
    filepath_or_buffer=os.path.join(DATASET_DIR, "ratings.dat"),
    names=rnames,
    header=None,
    sep="::",  # two-character delimiter, parsed by the python engine
    engine="python",
)
ratings.head()
Out[6]:
user_id | movie_id | rating | timestamp | |
---|---|---|---|---|
0 | 1 | 1193 | 5 | 978300760 |
1 | 1 | 661 | 3 | 978302109 |
2 | 1 | 914 | 3 | 978301968 |
3 | 1 | 3408 | 4 | 978300275 |
4 | 1 | 2355 | 5 | 978824291 |
In [7]:
# Load the movie catalogue. The file ships without a header row, so the column
# labels come from `mnames`.
mnames = ["movie_id", "title", "genres"]
movies = pd.read_csv(
    filepath_or_buffer=os.path.join(DATASET_DIR, "movies.dat"),
    names=mnames,
    header=None,
    sep="::",  # two-character delimiter, parsed by the python engine
    encoding="ISO-8859-1",  # the file is read as Latin-1 rather than the UTF-8 default
    engine="python",
)
movies.head()
Out[7]:
movie_id | title | genres | |
---|---|---|---|
0 | 1 | Toy Story (1995) | Animation|Children's|Comedy |
1 | 2 | Jumanji (1995) | Adventure|Children's|Fantasy |
2 | 3 | Grumpier Old Men (1995) | Comedy|Romance |
3 | 4 | Waiting to Exhale (1995) | Comedy|Drama |
4 | 5 | Father of the Bride Part II (1995) | Comedy |
In [8]:
# Build one wide table: every rating enriched with its user's demographics and
# its movie's metadata.
# Join keys are spelled out rather than inferred from shared column names, and
# `validate` makes pandas raise MergeError if a lookup table ever contains
# duplicate keys (which would silently duplicate ratings).
data = (
    ratings
    .merge(users, on="user_id", validate="many_to_one")
    .merge(movies, on="movie_id", validate="many_to_one")
)
data.head()
Out[8]:
user_id | movie_id | rating | timestamp | gender | age | occupation | zip | title | genres | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1193 | 5 | 978300760 | F | 1 | 10 | 48067 | One Flew Over the Cuckoo's Nest (1975) | Drama |
1 | 1 | 661 | 3 | 978302109 | F | 1 | 10 | 48067 | James and the Giant Peach (1996) | Animation|Children's|Musical |
2 | 1 | 914 | 3 | 978301968 | F | 1 | 10 | 48067 | My Fair Lady (1964) | Musical|Romance |
3 | 1 | 3408 | 4 | 978300275 | F | 1 | 10 | 48067 | Erin Brockovich (2000) | Drama |
4 | 1 | 2355 | 5 | 978824291 | F | 1 | 10 | 48067 | Bug's Life, A (1998) | Animation|Children's|Comedy |
In [9]:
# Average rating per title, with one column per gender value.
mean_ratings = pd.pivot_table(
    data,
    values="rating",
    index="title",
    columns="gender",
    aggfunc="mean",
)
mean_ratings.head()
Out[9]:
gender | F | M |
---|---|---|
title | ||
$1,000,000 Duck (1971) | 3.375000 | 2.761905 |
'Night Mother (1986) | 3.388889 | 3.352941 |
'Til There Was You (1997) | 2.675676 | 2.733333 |
'burbs, The (1989) | 2.793478 | 2.962085 |
...And Justice for All (1979) | 3.828571 | 3.689024 |
In [10]:
# Number of ratings each title received (row count per title group).
title_groups = data.groupby(by="title")
ratings_by_title = title_groups.size()
ratings_by_title.head()
Out[10]:
title $1,000,000 Duck (1971) 37 'Night Mother (1986) 70 'Til There Was You (1997) 52 'burbs, The (1989) 303 ...And Justice for All (1979) 199 dtype: int64
In [11]:
# Minimum number of ratings a title needs before its statistics are used;
# below this, per-title means are too noisy. Tune here rather than inline.
MIN_RATINGS = 250

# Titles that received at least MIN_RATINGS ratings.
active_titles = ratings_by_title.index[ratings_by_title >= MIN_RATINGS]
active_titles
Out[11]:
Index([''burbs, The (1989)', '10 Things I Hate About You (1999)', '101 Dalmatians (1961)', '101 Dalmatians (1996)', '12 Angry Men (1957)', '13th Warrior, The (1999)', '2 Days in the Valley (1996)', '20,000 Leagues Under the Sea (1954)', '2001: A Space Odyssey (1968)', '2010 (1984)', ... 'X-Men (2000)', 'Year of Living Dangerously (1982)', 'Yellow Submarine (1968)', 'You've Got Mail (1998)', 'Young Frankenstein (1974)', 'Young Guns (1988)', 'Young Guns II (1990)', 'Young Sherlock Holmes (1985)', 'Zero Effect (1998)', 'eXistenZ (1999)'], dtype='object', name='title', length=1216)
In [12]:
# Keep only the rows for the frequently rated titles (all columns retained).
mean_ratings = mean_ratings.loc[active_titles, :]
mean_ratings.head()
Out[12]:
gender | F | M |
---|---|---|
title | ||
'burbs, The (1989) | 2.793478 | 2.962085 |
10 Things I Hate About You (1999) | 3.646552 | 3.311966 |
101 Dalmatians (1961) | 3.791444 | 3.500000 |
101 Dalmatians (1996) | 3.240000 | 2.911215 |
12 Angry Men (1957) | 4.184397 | 4.328421 |
In [13]:
# Rank titles by the mean rating in the "F" column, highest first.
top_female_ratings = mean_ratings.sort_values("F", ascending=False)
top_female_ratings.head()
Out[13]:
gender | F | M |
---|---|---|
title | ||
Close Shave, A (1995) | 4.644444 | 4.473795 |
Wrong Trousers, The (1993) | 4.588235 | 4.478261 |
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950) | 4.572650 | 4.464589 |
Wallace & Gromit: The Best of Aardman Animation (1996) | 4.563107 | 4.385075 |
Schindler's List (1993) | 4.562602 | 4.491415 |
Measuring Rating Disagreement
In [14]:
# diff = M - F: negative values are titles the "F" group rated higher,
# positive values are titles the "M" group rated higher.
mean_ratings["diff"] = mean_ratings["M"].sub(mean_ratings["F"])

# Ascending sort, so the most negative differences come first.
sorted_by_diff = mean_ratings.sort_values("diff")
sorted_by_diff.head()
Out[14]:
gender | F | M | diff |
---|---|---|---|
title | |||
Dirty Dancing (1987) | 3.790378 | 2.959596 | -0.830782 |
Jumpin' Jack Flash (1986) | 3.254717 | 2.578358 | -0.676359 |
Grease (1978) | 3.975265 | 3.367041 | -0.608224 |
Little Women (1994) | 3.870588 | 3.321739 | -0.548849 |
Steel Magnolias (1989) | 3.901734 | 3.365957 | -0.535777 |
In [15]:
# Reverse the row order so the largest positive differences are shown first.
sorted_by_diff.iloc[::-1].head()
Out[15]:
gender | F | M | diff |
---|---|---|---|
title | |||
Good, The Bad and The Ugly, The (1966) | 3.494949 | 4.221300 | 0.726351 |
Kentucky Fried Movie, The (1977) | 2.878788 | 3.555147 | 0.676359 |
Dumb & Dumber (1994) | 2.697987 | 3.336595 | 0.638608 |
Longest Day, The (1962) | 3.411765 | 4.031447 | 0.619682 |
Cable Guy, The (1996) | 2.250000 | 2.863787 | 0.613787 |
In [16]:
# Standard deviation of ratings per title, restricted to the frequently rated
# titles; a higher value means raters disagreed more.
rating_std_by_title = (
    data
    .groupby("title")["rating"]
    .std()
    .loc[active_titles]
)
rating_std_by_title.head()
Out[16]:
title 'burbs, The (1989) 1.107760 10 Things I Hate About You (1999) 0.989815 101 Dalmatians (1961) 0.982103 101 Dalmatians (1996) 1.098717 12 Angry Men (1957) 0.812731 Name: rating, dtype: float64
In [17]:
# The ten titles with the largest rating standard deviation.
rating_std_by_title.sort_values(ascending=False).head(10)
Out[17]:
title Dumb & Dumber (1994) 1.321333 Blair Witch Project, The (1999) 1.316368 Natural Born Killers (1994) 1.307198 Tank Girl (1995) 1.277695 Rocky Horror Picture Show, The (1975) 1.260177 Eyes Wide Shut (1999) 1.259624 Evita (1996) 1.253631 Billy Madison (1995) 1.249970 Fear and Loathing in Las Vegas (1998) 1.246408 Bicentennial Man (1999) 1.245533 Name: rating, dtype: float64
In [18]:
# Peek at the raw genre strings: each movie lists its genres pipe-separated.
movies.loc[:, "genres"].head()
Out[18]:
0 Animation|Children's|Comedy 1 Adventure|Children's|Fantasy 2 Comedy|Romance 3 Comedy|Drama 4 Comedy Name: genres, dtype: object
In [19]:
# Preview splitting the first five genre strings into Python lists.
movies["genres"].iloc[:5].str.split(r"|")
Out[19]:
0 [Animation, Children's, Comedy] 1 [Adventure, Children's, Fantasy] 2 [Comedy, Romance] 3 [Comedy, Drama] 4 [Comedy] Name: genres, dtype: object
In [20]:
# Replace the pipe-delimited "genres" string column with a "genre" column that
# holds a list of genre names per movie.
# `pop` removes "genres" from `movies` in place, so a second run of this cell
# would raise KeyError; the guard makes the cell safe to re-run and leaves the
# same end state.
if "genres" in movies.columns:
    movies["genre"] = movies.pop("genres").str.split(r"|")
movies.head()
Out[20]:
movie_id | title | genre | |
---|---|---|---|
0 | 1 | Toy Story (1995) | [Animation, Children's, Comedy] |
1 | 2 | Jumanji (1995) | [Adventure, Children's, Fantasy] |
2 | 3 | Grumpier Old Men (1995) | [Comedy, Romance] |
3 | 4 | Waiting to Exhale (1995) | [Comedy, Drama] |
4 | 5 | Father of the Bride Part II (1995) | [Comedy] |
In [21]:
# One row per (movie, genre) pair: each list element in "genre" becomes its own
# row, repeating the original row's index label.
movies_exploded = movies.explode(column="genre")
movies_exploded.head(10)
Out[21]:
movie_id | title | genre | |
---|---|---|---|
0 | 1 | Toy Story (1995) | Animation |
0 | 1 | Toy Story (1995) | Children's |
0 | 1 | Toy Story (1995) | Comedy |
1 | 2 | Jumanji (1995) | Adventure |
1 | 2 | Jumanji (1995) | Children's |
1 | 2 | Jumanji (1995) | Fantasy |
2 | 3 | Grumpier Old Men (1995) | Comedy |
2 | 3 | Grumpier Old Men (1995) | Romance |
3 | 4 | Waiting to Exhale (1995) | Comedy |
3 | 4 | Waiting to Exhale (1995) | Drama |
In [22]:
# Attach every rating to each genre of its movie, then add the rater's
# demographics. Join keys are explicit rather than inferred from shared column
# names. The first join is many-to-many (movie_id repeats on both sides after
# the explode); the users lookup must be unique per user_id, so it is validated.
ratings_with_genre = (
    movies_exploded
    .merge(ratings, on="movie_id")
    .merge(users, on="user_id", validate="many_to_one")
)
ratings_with_genre.iloc[0]
Out[22]:
movie_id 1 title Toy Story (1995) genre Animation user_id 1 rating 5 timestamp 978824268 gender F age 1 occupation 10 zip 48067 Name: 0, dtype: object
In [23]:
# Mean rating for every (genre, age) pair, reshaped so genres are rows and
# age codes are columns.
genre_ratings = (
    ratings_with_genre
    .groupby(["genre", "age"])["rating"]
    .mean()
    .unstack(level="age")
)
genre_ratings.head(10)
Out[23]:
age | 1 | 18 | 25 | 35 | 45 | 50 | 56 |
---|---|---|---|---|---|---|---|
genre | |||||||
Action | 3.506385 | 3.447097 | 3.453358 | 3.538107 | 3.528543 | 3.611333 | 3.610709 |
Adventure | 3.449975 | 3.408525 | 3.443163 | 3.515291 | 3.528963 | 3.628163 | 3.649064 |
Animation | 3.476113 | 3.624014 | 3.701228 | 3.740545 | 3.734856 | 3.780020 | 3.756233 |
Children's | 3.241642 | 3.294257 | 3.426873 | 3.518423 | 3.527593 | 3.556555 | 3.621822 |
Comedy | 3.497491 | 3.460417 | 3.490385 | 3.561984 | 3.591789 | 3.646868 | 3.650949 |
Crime | 3.710170 | 3.668054 | 3.680321 | 3.733736 | 3.750661 | 3.810688 | 3.832549 |
Documentary | 3.730769 | 3.865865 | 3.946690 | 3.953747 | 3.966521 | 3.908108 | 3.961538 |
Drama | 3.794735 | 3.721930 | 3.726428 | 3.782512 | 3.784356 | 3.878415 | 3.933465 |
Fantasy | 3.317647 | 3.353778 | 3.452484 | 3.482301 | 3.532468 | 3.581570 | 3.532700 |
Film-Noir | 4.145455 | 3.997368 | 4.058725 | 4.064910 | 4.105376 | 4.175401 | 4.125932 |
In [ ]: