YouTube Video Analysis¶

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import json
import os
import sys

sys.path.append("../")

from helpers.utils import dataframe_summary, download_from_kaggle, get_working_dir
In [2]:
# Kaggle coordinates of the dataset: owner account and dataset slug
# (https://www.kaggle.com/datasets/datasnaek/youtube-new).
username, dataset_name = "datasnaek", "youtube-new"
In [3]:
# Fetch the Kaggle dataset through the project helper. The returned value is
# used by the cells below as the directory holding the extracted CSV/JSON files.
DATASET_DIR = download_from_kaggle(username, dataset_name)
Setting the base path to: c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies
Dataset URL: https://www.kaggle.com/datasets/datasnaek/youtube-new
Downloaded and extracted to: c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new
In [4]:
# Print the full path of every file found anywhere under the dataset directory.
dataset_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk(DATASET_DIR)
    for name in names
)
for dataset_file_path in dataset_file_paths:
    print(dataset_file_path)
c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\CAvideos.csv
c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\CA_category_id.json
c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\DEvideos.csv
c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\DE_category_id.json
c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\FRvideos.csv
c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\FR_category_id.json
c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\GBvideos.csv
c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\GB_category_id.json
c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\INvideos.csv
c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\IN_category_id.json
c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\JPvideos.csv
c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\JP_category_id.json
c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\KRvideos.csv
c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\KR_category_id.json
c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\MXvideos.csv
c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\MX_category_id.json
c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\RUvideos.csv
c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\RU_category_id.json
c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\USvideos.csv
c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\US_category_id.json

Data Preparation¶

  • Merge input/youtube-new/**videos.csv into a single dataframe (videos_df).
  • Merge input/youtube-new/**_category_id.json into a single dataframe (categories_df).
In [5]:
## Loading the videos files into a single frame
# Decode each CSV as UTF-8 first so non-ASCII titles, tags and descriptions are
# read correctly instead of being turned into mojibake. latin-1 accepts any
# byte sequence, so it is kept only as a fallback for files that are not valid
# UTF-8.
# NOTE(review): latin-1 was presumably chosen originally because some of the
# country files contain invalid UTF-8 bytes -- confirm against the raw data.
videos_df_list = []
# sorted() keeps the concatenation order stable; glob order is not guaranteed.
for filename in sorted(glob.glob(os.path.join(DATASET_DIR, "**videos.csv"))):
    print("Loading file {}...".format(filename))
    try:
        video_df = pd.read_csv(filename, encoding="utf-8")
    except UnicodeDecodeError:
        video_df = pd.read_csv(filename, encoding="latin-1")
    # Keep the source path so the country code can be derived from it later.
    video_df["filename"] = filename
    videos_df_list.append(video_df)
    print("Loaded file {}.".format(filename))

videos_df = pd.concat(videos_df_list, ignore_index=True)
Loading file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\CAvideos.csv...
Loaded file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\CAvideos.csv.
Loading file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\DEvideos.csv...
Loaded file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\DEvideos.csv.
Loading file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\FRvideos.csv...
Loaded file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\FRvideos.csv.
Loading file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\GBvideos.csv...
Loaded file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\GBvideos.csv.
Loading file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\INvideos.csv...
Loaded file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\INvideos.csv.
Loading file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\JPvideos.csv...
Loaded file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\JPvideos.csv.
Loading file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\KRvideos.csv...
Loaded file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\KRvideos.csv.
Loading file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\MXvideos.csv...
Loaded file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\MXvideos.csv.
Loading file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\RUvideos.csv...
Loaded file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\RUvideos.csv.
Loading file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\USvideos.csv...
Loaded file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\USvideos.csv.
In [6]:
## Loading the categories files into a single dataframe
categories_df_list = []
# sorted() keeps the concatenation order stable; glob order is not guaranteed.
for filename in sorted(glob.glob(os.path.join(DATASET_DIR, "**_category_id.json"))):
    print("Loading file {}...".format(filename))
    # Use a context manager so the file handle is closed promptly, and read the
    # JSON as UTF-8 explicitly rather than with the platform default encoding.
    with open(filename, "r", encoding="utf-8") as category_file:
        category_data = json.load(category_file)
    # Flatten each entry under "items"; nested keys are joined with "_"
    # (e.g. snippet_title).
    category_df = pd.json_normalize(category_data, record_path="items", sep="_")
    # Keep the source path so the country code can be derived from it later.
    category_df["filename"] = filename
    categories_df_list.append(category_df)

categories_df = pd.concat(categories_df_list, ignore_index=True)
Loading file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\CA_category_id.json...
Loading file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\DE_category_id.json...
Loading file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\FR_category_id.json...
Loading file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\GB_category_id.json...
Loading file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\IN_category_id.json...
Loading file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\JP_category_id.json...
Loading file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\KR_category_id.json...
Loading file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\MX_category_id.json...
Loading file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\RU_category_id.json...
Loading file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\US_category_id.json...

Data Preprocessing & Cleaning¶

  • Extract the country_cd from the filename.
  • Convert the category_id column into int.
  • Convert the trending_date column into datetime.
  • Drop the unnecessary columns (filename, video_id, kind, etag, snippet_channelId)
  • Merge the categories_df into the videos_df on category_id and country_cd, then drop the redundant id and category_id key columns.
  • Rename columns snippet_title to category_title, snippet_assignable to category_assignable.
  • Write the cleaned dataframe to videos.csv in the dataset's working directory (as returned by get_working_dir).
In [7]:
# Extracting the country from the filename
# The patterns are raw strings with the dots escaped and are anchored to the end
# of the path, so only the file name itself can match, never a parent directory.
# expand=False returns a Series, which is what a single-column assignment expects.
videos_df['country_cd'] = videos_df.filename.str.extract(r"([A-Z]+)videos\.csv$", expand=False)
print("Unique countries: {}".format(list(videos_df.country_cd.unique())))

categories_df['country_cd'] = categories_df.filename.str.extract(r"([A-Z]+)_category_id\.json$", expand=False)
print("Unique countries: {}".format(list(categories_df.country_cd.unique())))
Unique countries: ['CA', 'DE', 'FR', 'GB', 'IN', 'JP', 'KR', 'MX', 'RU', 'US']
Unique countries: ['CA', 'DE', 'FR', 'GB', 'IN', 'JP', 'KR', 'MX', 'RU', 'US']
In [8]:
# Cast the category key in both frames to integers so the later merge compares
# values of the same type.
categories_df = categories_df.astype({"id": "int"})
videos_df = videos_df.astype({"category_id": "int"})
In [9]:
# Parse the trending_date strings (two-digit year, day, month, dot-separated)
# into pandas datetimes.
videos_df = videos_df.assign(
    trending_date=pd.to_datetime(videos_df["trending_date"], format="%y.%d.%m")
)
In [10]:
# Preview the first five rows of the combined videos frame.
videos_df.head(n=5)
Out[10]:
video_id trending_date title channel_title category_id publish_time tags views likes dislikes comment_count thumbnail_link comments_disabled ratings_disabled video_error_or_removed description filename country_cd
0 n1WpP7iowLc 2017-11-14 Eminem - Walk On Water (Audio) ft. Beyoncé EminemVEVO 10 2017-11-10T17:00:03.000Z Eminem|"Walk"|"On"|"Water"|"Aftermath/Shady/In... 17158579 787425 43420 125882 https://i.ytimg.com/vi/n1WpP7iowLc/default.jpg False False False Eminem's new track Walk on Water ft. Beyoncé ... c:\Users\purch\Documents\Projects\Github\shivi... CA
1 0dBIkQ4Mz1M 2017-11-14 PLUSH - Bad Unboxing Fan Mail iDubbbzTV 23 2017-11-13T17:00:00.000Z plush|"bad unboxing"|"unboxing"|"fan mail"|"id... 1014651 127794 1688 13030 https://i.ytimg.com/vi/0dBIkQ4Mz1M/default.jpg False False False STill got a lot of packages. Probably will las... c:\Users\purch\Documents\Projects\Github\shivi... CA
2 5qpjK5DgCt4 2017-11-14 Racist Superman | Rudy Mancuso, King Bach & Le... Rudy Mancuso 23 2017-11-12T19:05:24.000Z racist superman|"rudy"|"mancuso"|"king"|"bach"... 3191434 146035 5339 8181 https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg False False False WATCH MY PREVIOUS VIDEO ▶ \n\nSUBSCRIBE ► ... c:\Users\purch\Documents\Projects\Github\shivi... CA
3 d380meD0W0M 2017-11-14 I Dare You: GOING BALD!? nigahiga 24 2017-11-12T18:01:41.000Z ryan|"higa"|"higatv"|"nigahiga"|"i dare you"|"... 2095828 132239 1989 17518 https://i.ytimg.com/vi/d380meD0W0M/default.jpg False False False I know it's been a while since we did this sho... c:\Users\purch\Documents\Projects\Github\shivi... CA
4 2Vv-BfVoq4g 2017-11-14 Ed Sheeran - Perfect (Official Music Video) Ed Sheeran 10 2017-11-09T11:04:14.000Z edsheeran|"ed sheeran"|"acoustic"|"live"|"cove... 33523622 1634130 21082 85067 https://i.ytimg.com/vi/2Vv-BfVoq4g/default.jpg False False False 🎧: https://ad.gt/yt-perfect\n💰: https://... c:\Users\purch\Documents\Projects\Github\shivi... CA
In [11]:
# Project helper from helpers.utils: renders the summary of the combined videos
# frame shown in the output below (shape, column list, per-column statistics).
dataframe_summary(videos_df)

DataFrame Summary¶


Shape: (375942, 18)
Columns: ['video_id', 'trending_date', 'title', 'channel_title', 'category_id', 'publish_time', 'tags', 'views', 'likes', 'dislikes', 'comment_count', 'thumbnail_link', 'comments_disabled', 'ratings_disabled', 'video_error_or_removed', 'description', 'filename', 'country_cd']

Column Summary:¶

Column Type Missing Unique Min Max Mean
video_id object 0 184287
trending_date datetime64[ns] 0 205
title object 0 186272
channel_title object 0 37824
category_id int64 0 18 1 44 20.232
publish_time object 0 169286
tags object 0 142211
views int64 0 240399 117 4.2454e+08 1.3266e+06
likes int64 0 73693 0 5.6138e+06 37884
dislikes int64 0 16715 0 1.945e+06 2126.1
comment_count int64 0 24477 0 1.6265e+06 4253.8
thumbnail_link object 0 185690
comments_disabled bool 0 2
ratings_disabled bool 0 2
video_error_or_removed bool 0 2
description object 19478 156165
filename object 0 10
country_cd object 0 10
In [12]:
# Preview the first five rows of the combined categories frame.
categories_df.head(n=5)
Out[12]:
kind etag id snippet_channelId snippet_title snippet_assignable filename country_cd
0 youtube#videoCategory "ld9biNPKjAjgjV7EZ4EKeEGrhao/Xy1mB4_yLrHy_BmKm... 1 UCBR8-60-B28hp2BmDPdntcQ Film & Animation True c:\Users\purch\Documents\Projects\Github\shivi... CA
1 youtube#videoCategory "ld9biNPKjAjgjV7EZ4EKeEGrhao/UZ1oLIIz2dxIhO45Z... 2 UCBR8-60-B28hp2BmDPdntcQ Autos & Vehicles True c:\Users\purch\Documents\Projects\Github\shivi... CA
2 youtube#videoCategory "ld9biNPKjAjgjV7EZ4EKeEGrhao/nqRIq97-xe5XRZTxb... 10 UCBR8-60-B28hp2BmDPdntcQ Music True c:\Users\purch\Documents\Projects\Github\shivi... CA
3 youtube#videoCategory "ld9biNPKjAjgjV7EZ4EKeEGrhao/HwXKamM1Q20q9BN-o... 15 UCBR8-60-B28hp2BmDPdntcQ Pets & Animals True c:\Users\purch\Documents\Projects\Github\shivi... CA
4 youtube#videoCategory "ld9biNPKjAjgjV7EZ4EKeEGrhao/9GQMSRjrZdHeb1OEM... 17 UCBR8-60-B28hp2BmDPdntcQ Sports True c:\Users\purch\Documents\Projects\Github\shivi... CA
In [13]:
# Project helper from helpers.utils: renders the summary of the categories frame
# shown in the output below (shape, column list, per-column statistics).
dataframe_summary(categories_df)

DataFrame Summary¶


Shape: (311, 8)
Columns: ['kind', 'etag', 'id', 'snippet_channelId', 'snippet_title', 'snippet_assignable', 'filename', 'country_cd']

Column Summary:¶

Column Type Missing Unique Min Max Mean
kind object 0 1
etag object 0 94
id int64 0 32 1 44 27.521
snippet_channelId object 0 1
snippet_title object 0 31
snippet_assignable bool 0 2
filename object 0 10
country_cd object 0 10
In [14]:
# Remove the columns that are not needed for the merge or the later analysis.
videos_df = videos_df.drop(columns=["filename", "video_id"])
categories_df = categories_df.drop(
    columns=["filename", "kind", "etag", "snippet_channelId"]
)
In [15]:
# Merge the dataframes
# Left join: every video row is kept, and videos whose (category_id, country_cd)
# pair has no entry in categories_df get NaN in the category columns.
# validate="many_to_one" makes pandas raise a MergeError if categories_df ever
# holds a duplicated (id, country_cd) key, which would otherwise silently
# duplicate video rows.
videos_df = videos_df.merge(
    categories_df,
    how="left",
    left_on=["category_id", "country_cd"],
    right_on=["id", "country_cd"],
    validate="many_to_one",
)
videos_df.head()
Out[15]:
trending_date title channel_title category_id publish_time tags views likes dislikes comment_count thumbnail_link comments_disabled ratings_disabled video_error_or_removed description country_cd id snippet_title snippet_assignable
0 2017-11-14 Eminem - Walk On Water (Audio) ft. Beyoncé EminemVEVO 10 2017-11-10T17:00:03.000Z Eminem|"Walk"|"On"|"Water"|"Aftermath/Shady/In... 17158579 787425 43420 125882 https://i.ytimg.com/vi/n1WpP7iowLc/default.jpg False False False Eminem's new track Walk on Water ft. Beyoncé ... CA 10.0 Music True
1 2017-11-14 PLUSH - Bad Unboxing Fan Mail iDubbbzTV 23 2017-11-13T17:00:00.000Z plush|"bad unboxing"|"unboxing"|"fan mail"|"id... 1014651 127794 1688 13030 https://i.ytimg.com/vi/0dBIkQ4Mz1M/default.jpg False False False STill got a lot of packages. Probably will las... CA 23.0 Comedy True
2 2017-11-14 Racist Superman | Rudy Mancuso, King Bach & Le... Rudy Mancuso 23 2017-11-12T19:05:24.000Z racist superman|"rudy"|"mancuso"|"king"|"bach"... 3191434 146035 5339 8181 https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg False False False WATCH MY PREVIOUS VIDEO ▶ \n\nSUBSCRIBE ► ... CA 23.0 Comedy True
3 2017-11-14 I Dare You: GOING BALD!? nigahiga 24 2017-11-12T18:01:41.000Z ryan|"higa"|"higatv"|"nigahiga"|"i dare you"|"... 2095828 132239 1989 17518 https://i.ytimg.com/vi/d380meD0W0M/default.jpg False False False I know it's been a while since we did this sho... CA 24.0 Entertainment True
4 2017-11-14 Ed Sheeran - Perfect (Official Music Video) Ed Sheeran 10 2017-11-09T11:04:14.000Z edsheeran|"ed sheeran"|"acoustic"|"live"|"cove... 33523622 1634130 21082 85067 https://i.ytimg.com/vi/2Vv-BfVoq4g/default.jpg False False False 🎧: https://ad.gt/yt-perfect\n💰: https://... CA 10.0 Music True
In [16]:
# The numeric join keys are redundant once the category columns are attached.
videos_df = videos_df.drop(columns=["id", "category_id"])
In [17]:
# Give the category columns brought in by the merge self-explanatory names.
category_column_names = dict(
    snippet_title="category_title",
    snippet_assignable="category_assignable",
)
videos_df = videos_df.rename(columns=category_column_names)
In [18]:
# Project helper from helpers.utils: renders the summary of the cleaned, merged
# videos frame shown in the output below, including per-column missing counts.
dataframe_summary(videos_df)

DataFrame Summary¶


Shape: (375942, 17)
Columns: ['trending_date', 'title', 'channel_title', 'publish_time', 'tags', 'views', 'likes', 'dislikes', 'comment_count', 'thumbnail_link', 'comments_disabled', 'ratings_disabled', 'video_error_or_removed', 'description', 'country_cd', 'category_title', 'category_assignable']

Column Summary:¶

Column Type Missing Unique Min Max Mean
trending_date datetime64[ns] 0 205
title object 0 186272
channel_title object 0 37824
publish_time object 0 169286
tags object 0 142211
views int64 0 240399 117 4.2454e+08 1.3266e+06
likes int64 0 73693 0 5.6138e+06 37884
dislikes int64 0 16715 0 1.945e+06 2126.1
comment_count int64 0 24477 0 1.6265e+06 4253.8
thumbnail_link object 0 185690
comments_disabled bool 0 2
ratings_disabled bool 0 2
video_error_or_removed bool 0 2
description object 19478 156165
country_cd object 0 10
category_title object 2738 18
category_assignable object 2738 2
In [19]:
# Preview the first five rows of the cleaned videos frame.
videos_df.head(n=5)
Out[19]:
trending_date title channel_title publish_time tags views likes dislikes comment_count thumbnail_link comments_disabled ratings_disabled video_error_or_removed description country_cd category_title category_assignable
0 2017-11-14 Eminem - Walk On Water (Audio) ft. Beyoncé EminemVEVO 2017-11-10T17:00:03.000Z Eminem|"Walk"|"On"|"Water"|"Aftermath/Shady/In... 17158579 787425 43420 125882 https://i.ytimg.com/vi/n1WpP7iowLc/default.jpg False False False Eminem's new track Walk on Water ft. Beyoncé ... CA Music True
1 2017-11-14 PLUSH - Bad Unboxing Fan Mail iDubbbzTV 2017-11-13T17:00:00.000Z plush|"bad unboxing"|"unboxing"|"fan mail"|"id... 1014651 127794 1688 13030 https://i.ytimg.com/vi/0dBIkQ4Mz1M/default.jpg False False False STill got a lot of packages. Probably will las... CA Comedy True
2 2017-11-14 Racist Superman | Rudy Mancuso, King Bach & Le... Rudy Mancuso 2017-11-12T19:05:24.000Z racist superman|"rudy"|"mancuso"|"king"|"bach"... 3191434 146035 5339 8181 https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg False False False WATCH MY PREVIOUS VIDEO ▶ \n\nSUBSCRIBE ► ... CA Comedy True
3 2017-11-14 I Dare You: GOING BALD!? nigahiga 2017-11-12T18:01:41.000Z ryan|"higa"|"higatv"|"nigahiga"|"i dare you"|"... 2095828 132239 1989 17518 https://i.ytimg.com/vi/d380meD0W0M/default.jpg False False False I know it's been a while since we did this sho... CA Entertainment True
4 2017-11-14 Ed Sheeran - Perfect (Official Music Video) Ed Sheeran 2017-11-09T11:04:14.000Z edsheeran|"ed sheeran"|"acoustic"|"live"|"cove... 33523622 1634130 21082 85067 https://i.ytimg.com/vi/2Vv-BfVoq4g/default.jpg False False False 🎧: https://ad.gt/yt-perfect\n💰: https://... CA Music True
In [20]:
# Write the cleaned frame, without the index column, to videos.csv inside the
# directory returned by the project's get_working_dir helper.
working_dir = get_working_dir(dataset_name=dataset_name)
output_path = "{}/videos.csv".format(working_dir)
videos_df.to_csv(output_path, index=False)
Setting the base path to: c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies
In [ ]: