MovieLens 1M Dataset

In [1]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import sys
import os
import shutil
from pathlib import Path

# Add the parent directory to the import search path before importing from
# `helpers` (presumably that is where the package lives -- confirm against the
# repository layout).
sys.path.append("../")

from helpers.utils import download_by_url
In [2]:
# Fetch the MovieLens 1M archive and unzip it. The helper's return value is used
# below as the directory that holds the extracted files.
DATASET_DIR = download_by_url(
    "https://files.grouplens.org/datasets/movielens/ml-1m.zip",
    "../input/movielens_1m_ds/ml-1m.zip",
    unzip=True,
)

# The extracted files sit inside an "ml-1m" sub-folder. Hoist them into
# DATASET_DIR and delete the wrapper so later cells can read
# DATASET_DIR/<name>.dat directly.
src_dir = os.path.join(DATASET_DIR, "ml-1m")

# Guard for re-runs: once the wrapper has been flattened and removed, it only
# exists again if the helper re-extracts the archive (not visible from this
# notebook). Without the check, os.listdir would raise FileNotFoundError.
if os.path.isdir(src_dir):
    for entry in os.listdir(src_dir):
        shutil.move(os.path.join(src_dir, entry), os.path.join(DATASET_DIR, entry))
    shutil.rmtree(src_dir)
Downloading https://files.grouplens.org/datasets/movielens/ml-1m.zip to ../input/movielens_1m_ds/ml-1m.zip
✅ Download complete: ../input/movielens_1m_ds/ml-1m.zip
Unzipping ../input/movielens_1m_ds/ml-1m.zip
✅ Unzipped to: ../input/movielens_1m_ds
In [3]:
# Walk the dataset directory and print the path of every file found.
dataset_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk(DATASET_DIR)
    for name in names
)
for file_path in dataset_files:
    print(file_path)
../input/movielens_1m_ds\movies.dat
../input/movielens_1m_ds\ratings.dat
../input/movielens_1m_ds\README
../input/movielens_1m_ds\users.dat
In [4]:
# Display the dataset directory value returned by download_by_url.
DATASET_DIR
Out[4]:
'../input/movielens_1m_ds'
In [5]:
# Column labels for users.dat. header=None below makes pandas read the file's
# first line as data, so the names are supplied here.
unames = ["user_id", "gender", "age", "occupation", "zip"]
users_path = os.path.join(DATASET_DIR, "users.dat")
users = pd.read_csv(
    users_path,
    names=unames,
    header=None,
    sep="::",
    engine="python",  # separators longer than one character need the python engine
)
users.head()
Out[5]:
user_id gender age occupation zip
0 1 F 1 10 48067
1 2 M 56 16 70072
2 3 M 25 15 55117
3 4 M 45 7 02460
4 5 M 25 20 55455
In [6]:
# Column labels for ratings.dat. header=None below makes pandas read the file's
# first line as data, so the names are supplied here.
rnames = ["user_id", "movie_id", "rating", "timestamp"]
ratings_path = os.path.join(DATASET_DIR, "ratings.dat")
ratings = pd.read_csv(
    ratings_path,
    names=rnames,
    header=None,
    sep="::",
    engine="python",  # separators longer than one character need the python engine
)
ratings.head()
Out[6]:
user_id movie_id rating timestamp
0 1 1193 5 978300760
1 1 661 3 978302109
2 1 914 3 978301968
3 1 3408 4 978300275
4 1 2355 5 978824291
In [7]:
# Column labels for movies.dat. header=None below makes pandas read the file's
# first line as data, so the names are supplied here.
mnames = ["movie_id", "title", "genres"]
movies_path = os.path.join(DATASET_DIR, "movies.dat")
movies = pd.read_csv(
    movies_path,
    names=mnames,
    header=None,
    sep="::",
    engine="python",  # separators longer than one character need the python engine
    encoding="ISO-8859-1",  # the file is decoded as Latin-1 rather than the UTF-8 default
)
movies.head()
Out[7]:
movie_id title genres
0 1 Toy Story (1995) Animation|Children's|Comedy
1 2 Jumanji (1995) Adventure|Children's|Fantasy
2 3 Grumpier Old Men (1995) Comedy|Romance
3 4 Waiting to Exhale (1995) Comedy|Drama
4 5 Father of the Bride Part II (1995) Comedy
In [8]:
# Attach user attributes and movie metadata to every rating row.
#
# The join keys are spelled out instead of relying on merge's default of
# "every column name the two frames share", so an unrelated shared column can
# never silently change the join. validate="many_to_one" raises
# pandas.errors.MergeError if user_id / movie_id is duplicated in the lookup
# table, which would otherwise silently multiply rating rows.
data = ratings.merge(users, on="user_id", validate="many_to_one").merge(
    movies, on="movie_id", validate="many_to_one"
)
data.head()
Out[8]:
user_id movie_id rating timestamp gender age occupation zip title genres
0 1 1193 5 978300760 F 1 10 48067 One Flew Over the Cuckoo's Nest (1975) Drama
1 1 661 3 978302109 F 1 10 48067 James and the Giant Peach (1996) Animation|Children's|Musical
2 1 914 3 978301968 F 1 10 48067 My Fair Lady (1964) Musical|Romance
3 1 3408 4 978300275 F 1 10 48067 Erin Brockovich (2000) Drama
4 1 2355 5 978824291 F 1 10 48067 Bug's Life, A (1998) Animation|Children's|Comedy
In [9]:
# Mean rating per title, one column per gender value.
mean_ratings = data.pivot_table(
    values="rating",
    index="title",
    columns="gender",
    aggfunc="mean",
)
mean_ratings.head()
Out[9]:
gender F M
title
$1,000,000 Duck (1971) 3.375000 2.761905
'Night Mother (1986) 3.388889 3.352941
'Til There Was You (1997) 2.675676 2.733333
'burbs, The (1989) 2.793478 2.962085
...And Justice for All (1979) 3.828571 3.689024
In [10]:
# Number of rating rows per title.
title_groups = data.groupby(by="title")
ratings_by_title = title_groups.size()
ratings_by_title.head()
Out[10]:
title
$1,000,000 Duck (1971)            37
'Night Mother (1986)              70
'Til There Was You (1997)         52
'burbs, The (1989)               303
...And Justice for All (1979)    199
dtype: int64
In [11]:
# Keep only the titles that received at least 250 ratings.
has_enough_ratings = ratings_by_title >= 250
active_titles = ratings_by_title.index[has_enough_ratings]
active_titles
Out[11]:
Index([''burbs, The (1989)', '10 Things I Hate About You (1999)',
       '101 Dalmatians (1961)', '101 Dalmatians (1996)', '12 Angry Men (1957)',
       '13th Warrior, The (1999)', '2 Days in the Valley (1996)',
       '20,000 Leagues Under the Sea (1954)', '2001: A Space Odyssey (1968)',
       '2010 (1984)',
       ...
       'X-Men (2000)', 'Year of Living Dangerously (1982)',
       'Yellow Submarine (1968)', 'You've Got Mail (1998)',
       'Young Frankenstein (1974)', 'Young Guns (1988)',
       'Young Guns II (1990)', 'Young Sherlock Holmes (1985)',
       'Zero Effect (1998)', 'eXistenZ (1999)'],
      dtype='object', name='title', length=1216)
In [12]:
# Restrict the per-gender means to the active titles (rows selected by label).
mean_ratings = mean_ratings.loc[active_titles, :]
mean_ratings.head()
Out[12]:
gender F M
title
'burbs, The (1989) 2.793478 2.962085
10 Things I Hate About You (1999) 3.646552 3.311966
101 Dalmatians (1961) 3.791444 3.500000
101 Dalmatians (1996) 3.240000 2.911215
12 Angry Men (1957) 4.184397 4.328421
In [13]:
# Titles ordered by the "F" column, highest mean first.
top_female_ratings = mean_ratings.sort_values("F", ascending=False)
top_female_ratings.head()
Out[13]:
gender F M
title
Close Shave, A (1995) 4.644444 4.473795
Wrong Trousers, The (1993) 4.588235 4.478261
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950) 4.572650 4.464589
Wallace & Gromit: The Best of Aardman Animation (1996) 4.563107 4.385075
Schindler's List (1993) 4.562602 4.491415

Measuring Rating Disagreement

In [14]:
# diff = M mean minus F mean, so negative values are titles the "F" group
# rated higher. Sorting ascending puts those titles first.
mean_ratings["diff"] = mean_ratings["M"].sub(mean_ratings["F"])
sorted_by_diff = mean_ratings.sort_values("diff")
sorted_by_diff.head()
Out[14]:
gender F M diff
title
Dirty Dancing (1987) 3.790378 2.959596 -0.830782
Jumpin' Jack Flash (1986) 3.254717 2.578358 -0.676359
Grease (1978) 3.975265 3.367041 -0.608224
Little Women (1994) 3.870588 3.321739 -0.548849
Steel Magnolias (1989) 3.901734 3.365957 -0.535777
In [15]:
# Reverse the row order to show the largest positive diff (M higher than F) first.
sorted_by_diff.iloc[::-1].head()
Out[15]:
gender F M diff
title
Good, The Bad and The Ugly, The (1966) 3.494949 4.221300 0.726351
Kentucky Fried Movie, The (1977) 2.878788 3.555147 0.676359
Dumb & Dumber (1994) 2.697987 3.336595 0.638608
Longest Day, The (1962) 3.411765 4.031447 0.619682
Cable Guy, The (1996) 2.250000 2.863787 0.613787
In [16]:
# Standard deviation of the ratings for each title, restricted to active titles.
rating_std_by_title = data.groupby("title")["rating"].std().loc[active_titles]
rating_std_by_title.head()
Out[16]:
title
'burbs, The (1989)                   1.107760
10 Things I Hate About You (1999)    0.989815
101 Dalmatians (1961)                0.982103
101 Dalmatians (1996)                1.098717
12 Angry Men (1957)                  0.812731
Name: rating, dtype: float64
In [17]:
# The ten active titles with the largest rating standard deviation.
rating_std_by_title.sort_values(ascending=False).head(10)
Out[17]:
title
Dumb & Dumber (1994)                     1.321333
Blair Witch Project, The (1999)          1.316368
Natural Born Killers (1994)              1.307198
Tank Girl (1995)                         1.277695
Rocky Horror Picture Show, The (1975)    1.260177
Eyes Wide Shut (1999)                    1.259624
Evita (1996)                             1.253631
Billy Madison (1995)                     1.249970
Fear and Loathing in Las Vegas (1998)    1.246408
Bicentennial Man (1999)                  1.245533
Name: rating, dtype: float64
In [18]:
# Peek at the raw, pipe-delimited genre strings.
movies["genres"].head(5)
Out[18]:
0     Animation|Children's|Comedy
1    Adventure|Children's|Fantasy
2                  Comedy|Romance
3                    Comedy|Drama
4                          Comedy
Name: genres, dtype: object
In [19]:
# Preview the split: each genre string becomes a list of genre names.
genres_preview = movies["genres"].head()
genres_preview.str.split(r"|")
Out[19]:
0     [Animation, Children's, Comedy]
1    [Adventure, Children's, Fantasy]
2                   [Comedy, Romance]
3                     [Comedy, Drama]
4                            [Comedy]
Name: genres, dtype: object
In [20]:
# Replace the pipe-delimited "genres" string column with a "genre" column that
# holds a list of genre names per movie.
#
# pop() removes "genres" from `movies` in place, so an unguarded second run of
# this cell would raise KeyError: 'genres'. The membership check makes the cell
# safe to re-run: once converted, it leaves `movies` unchanged.
if "genres" in movies.columns:
    movies["genre"] = movies.pop("genres").str.split(r"|")
movies.head()
Out[20]:
movie_id title genre
0 1 Toy Story (1995) [Animation, Children's, Comedy]
1 2 Jumanji (1995) [Adventure, Children's, Fantasy]
2 3 Grumpier Old Men (1995) [Comedy, Romance]
3 4 Waiting to Exhale (1995) [Comedy, Drama]
4 5 Father of the Bride Part II (1995) [Comedy]
In [21]:
# One row per (movie, genre): each list element in "genre" gets its own row and
# the original row label is repeated.
movies_exploded = movies.explode(column="genre")
movies_exploded.head(10)
Out[21]:
movie_id title genre
0 1 Toy Story (1995) Animation
0 1 Toy Story (1995) Children's
0 1 Toy Story (1995) Comedy
1 2 Jumanji (1995) Adventure
1 2 Jumanji (1995) Children's
1 2 Jumanji (1995) Fantasy
2 3 Grumpier Old Men (1995) Comedy
2 3 Grumpier Old Men (1995) Romance
3 4 Waiting to Exhale (1995) Comedy
3 4 Waiting to Exhale (1995) Drama
In [22]:
# Join ratings onto the exploded movies (one row per rating per genre of the
# rated movie), then attach the user attributes.
#
# The join keys are spelled out instead of relying on merge's default of
# "every column name the two frames share". The movies/ratings join repeats
# movie_id on both sides, so it is not validated. The users join is checked to
# be many-to-one so a duplicated user_id cannot silently multiply rows.
ratings_with_genre = movies_exploded.merge(ratings, on="movie_id").merge(
    users, on="user_id", validate="many_to_one"
)
ratings_with_genre.iloc[0]
Out[22]:
movie_id                     1
title         Toy Story (1995)
genre                Animation
user_id                      1
rating                       5
timestamp            978824268
gender                       F
age                          1
occupation                  10
zip                      48067
Name: 0, dtype: object
In [23]:
# Mean rating for every (genre, age) pair, reshaped so genres are rows and the
# distinct age values are columns.
mean_by_genre_age = ratings_with_genre.groupby(["genre", "age"])["rating"].mean()
genre_ratings = mean_by_genre_age.unstack(level="age")
genre_ratings.head(10)
Out[23]:
age 1 18 25 35 45 50 56
genre
Action 3.506385 3.447097 3.453358 3.538107 3.528543 3.611333 3.610709
Adventure 3.449975 3.408525 3.443163 3.515291 3.528963 3.628163 3.649064
Animation 3.476113 3.624014 3.701228 3.740545 3.734856 3.780020 3.756233
Children's 3.241642 3.294257 3.426873 3.518423 3.527593 3.556555 3.621822
Comedy 3.497491 3.460417 3.490385 3.561984 3.591789 3.646868 3.650949
Crime 3.710170 3.668054 3.680321 3.733736 3.750661 3.810688 3.832549
Documentary 3.730769 3.865865 3.946690 3.953747 3.966521 3.908108 3.961538
Drama 3.794735 3.721930 3.726428 3.782512 3.784356 3.878415 3.933465
Fantasy 3.317647 3.353778 3.452484 3.482301 3.532468 3.581570 3.532700
Film-Noir 4.145455 3.997368 4.058725 4.064910 4.105376 4.175401 4.125932
In [ ]: