import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import json
import os
import sys

sys.path.append("../")

from helpers.utils import dataframe_summary, download_from_kaggle, get_working_dir

# Kaggle coordinates of the dataset: owner handle and dataset slug.
username, dataset_name = "datasnaek", "youtube-new"

# Fetch the dataset through the project helper and keep the location it
# returns; the rest of the notebook reads its CSV/JSON files from DATASET_DIR.
DATASET_DIR = download_from_kaggle(username, dataset_name)

Setting the base path to: c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies
Dataset URL: https://www.kaggle.com/datasets/datasnaek/youtube-new
Downloaded and extracted to: c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new

# Print the full path of every file found anywhere under the dataset directory.
dataset_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk(DATASET_DIR)
    for name in names
)
for path in dataset_file_paths:
    print(path)

c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\CAvideos.csv
c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\CA_category_id.json
c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\DEvideos.csv
c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\DE_category_id.json
c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\FRvideos.csv
c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\FR_category_id.json
c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\GBvideos.csv
c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\GB_category_id.json
c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\INvideos.csv
c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\IN_category_id.json
c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\JPvideos.csv
c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\JP_category_id.json
c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\KRvideos.csv
c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\KR_category_id.json
c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\MXvideos.csv
c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\MX_category_id.json
c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\RUvideos.csv
c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\RU_category_id.json
c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\USvideos.csv
c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\US_category_id.json

## Load every per-country "<CC>videos.csv" file into a single dataframe
videos_df_list = []
# Without recursive=True a "**" glob behaves exactly like "*", so a single
# "*" states the intent. sorted() makes the concatenation order independent
# of the filesystem's directory listing order.
for filename in sorted(glob.glob(os.path.join(DATASET_DIR, "*videos.csv"))):
    print("Loading file {}...".format(filename))
    # Decode as UTF-8: reading these files as latin-1 turns multi-byte
    # characters into mojibake (e.g. "Beyoncé" is loaded as "BeyoncÃ©").
    # Bytes that are not valid UTF-8 are replaced with U+FFFD instead of
    # aborting the load. NOTE: `encoding_errors` requires pandas >= 1.3.
    video_df = pd.read_csv(filename, encoding="utf-8", encoding_errors="replace")
    # Keep the source path on every row; the country code is extracted
    # from this column further down.
    video_df["filename"] = filename
    videos_df_list.append(video_df)
    print("Loaded file {}.".format(filename))

# Stack the per-country frames and renumber the rows 0..N-1.
videos_df = pd.concat(videos_df_list, ignore_index=True)

Loading file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\CAvideos.csv...
Loaded file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\CAvideos.csv.
Loading file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\DEvideos.csv...
Loaded file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\DEvideos.csv.
Loading file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\FRvideos.csv...
Loaded file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\FRvideos.csv.
Loading file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\GBvideos.csv...
Loaded file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\GBvideos.csv.
Loading file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\INvideos.csv...
Loaded file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\INvideos.csv.
Loading file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\JPvideos.csv...
Loaded file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\JPvideos.csv.
Loading file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\KRvideos.csv...
Loaded file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\KRvideos.csv.
Loading file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\MXvideos.csv...
Loaded file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\MXvideos.csv.
Loading file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\RUvideos.csv...
Loaded file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\RUvideos.csv.
Loading file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\USvideos.csv...
Loaded file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\USvideos.csv.

## Load every per-country "<CC>_category_id.json" file into a single dataframe
categories_df_list = []
# Without recursive=True a "**" glob behaves exactly like "*", so a single
# "*" states the intent. sorted() keeps the concatenation order deterministic.
for filename in sorted(glob.glob(os.path.join(DATASET_DIR, "*_category_id.json"))):
    print("Loading file {}...".format(filename))
    # Open via a context manager so the handle is closed promptly, and read
    # as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(filename, "r", encoding="utf-8") as category_file:
        category_data = json.load(category_file)
    # One row per entry of the top-level "items" list; nested keys are
    # flattened with "_" (e.g. snippet.title -> snippet_title).
    category_df = pd.json_normalize(category_data, record_path="items", sep="_")
    # Keep the source path on every row; the country code is extracted
    # from this column further down.
    category_df["filename"] = filename
    categories_df_list.append(category_df)

# Stack the per-country frames and renumber the rows 0..N-1.
categories_df = pd.concat(categories_df_list, ignore_index=True)

Loading file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\CA_category_id.json...
Loading file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\DE_category_id.json...
Loading file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\FR_category_id.json...
Loading file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\GB_category_id.json...
Loading file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\IN_category_id.json...
Loading file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\JP_category_id.json...
Loading file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\KR_category_id.json...
Loading file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\MX_category_id.json...
Loading file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\RU_category_id.json...
Loading file c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies\input\youtube-new\US_category_id.json...

# Extract the country code (the run of capital letters that prefixes the file
# name) from the source path stored in `filename`.
# The patterns are raw strings with the dots escaped so "." only matches a
# literal dot, and are anchored at the end of the path so nothing earlier in
# the directory part can match. expand=False returns a Series, which is what
# a single-column assignment expects.
videos_df["country_cd"] = videos_df["filename"].str.extract(
    r"([A-Z]+)videos\.csv$", expand=False
)
print("Unique countries: {}".format(list(videos_df.country_cd.unique())))

categories_df["country_cd"] = categories_df["filename"].str.extract(
    r"([A-Z]+)_category_id\.json$", expand=False
)
print("Unique countries: {}".format(list(categories_df.country_cd.unique())))

Unique countries: ['CA', 'DE', 'FR', 'GB', 'IN', 'JP', 'KR', 'MX', 'RU', 'US']
Unique countries: ['CA', 'DE', 'FR', 'GB', 'IN', 'JP', 'KR', 'MX', 'RU', 'US']

# Cast both sides of the upcoming category join key to integers so that
# `categories_df.id` and `videos_df.category_id` compare equal.
categories_df = categories_df.astype({"id": "int"})
videos_df = videos_df.astype({"category_id": "int"})

# Parse the trending date, stored as two-digit year, day, month separated by
# dots (format "%y.%d.%m"), into a pandas datetime.
parsed_trending_date = pd.to_datetime(videos_df["trending_date"], format="%y.%d.%m")
videos_df["trending_date"] = parsed_trending_date

# Preview the first rows of the combined videos frame. This is a bare
# expression: it only renders when it is the last statement of a notebook cell.
videos_df.head()

# Per-column overview of the videos frame via the project helper.
# NOTE(review): judging by the captured output it reports type, missing and
# unique counts plus min/max/mean — confirm against helpers.utils.
dataframe_summary(videos_df)

# Preview the first rows of the combined categories frame (notebook display).
categories_df.head()

# Same per-column overview for the categories frame.
dataframe_summary(categories_df)

# Remove the columns that are not needed for the rest of the analysis.
videos_df = videos_df.drop(columns=["filename", "video_id"])
categories_df = categories_df.drop(
    columns=["filename", "kind", "etag", "snippet_channelId"]
)

# Attach the category details to each video. The join is per country, and a
# left join keeps every video row even when no category matches.
videos_df = pd.merge(
    videos_df,
    categories_df,
    how="left",
    left_on=["category_id", "country_cd"],
    right_on=["id", "country_cd"],
)
videos_df.head()

# The join keys are redundant once the category details are attached; the
# remaining `snippet_*` columns get category-specific names.
category_column_names = {
    "snippet_title": "category_title",
    "snippet_assignable": "category_assignable",
}
videos_df = videos_df.drop(columns=["id", "category_id"]).rename(
    columns=category_column_names
)

# Final inspection of the merged frame before it is written out.
dataframe_summary(videos_df)

videos_df.head()

# Write the prepared frame as videos.csv into the directory returned by the
# project helper for this dataset, without the row index.
output_dir = get_working_dir(dataset_name=dataset_name)
videos_df.to_csv("{}/videos.csv".format(output_dir), index=False)

Setting the base path to: c:\Users\purch\Documents\Projects\Github\shivishbrahma\data-science-case-studies

	video_id	trending_date	title	channel_title	category_id	publish_time	tags	views	likes	dislikes	comment_count	thumbnail_link	comments_disabled	ratings_disabled	video_error_or_removed	description	filename	country_cd
0	n1WpP7iowLc	2017-11-14	Eminem - Walk On Water (Audio) ft. BeyoncÃ©	EminemVEVO	10	2017-11-10T17:00:03.000Z	Eminem\|"Walk"\|"On"\|"Water"\|"Aftermath/Shady/In...	17158579	787425	43420	125882	https://i.ytimg.com/vi/n1WpP7iowLc/default.jpg	False	False	False	Eminem's new track Walk on Water ft. BeyoncÃ© ...	c:\Users\purch\Documents\Projects\Github\shivi...	CA
1	0dBIkQ4Mz1M	2017-11-14	PLUSH - Bad Unboxing Fan Mail	iDubbbzTV	23	2017-11-13T17:00:00.000Z	plush\|"bad unboxing"\|"unboxing"\|"fan mail"\|"id...	1014651	127794	1688	13030	https://i.ytimg.com/vi/0dBIkQ4Mz1M/default.jpg	False	False	False	STill got a lot of packages. Probably will las...	c:\Users\purch\Documents\Projects\Github\shivi...	CA
2	5qpjK5DgCt4	2017-11-14	Racist Superman \| Rudy Mancuso, King Bach & Le...	Rudy Mancuso	23	2017-11-12T19:05:24.000Z	racist superman\|"rudy"\|"mancuso"\|"king"\|"bach"...	3191434	146035	5339	8181	https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg	False	False	False	WATCH MY PREVIOUS VIDEO â¶ \n\nSUBSCRIBE âº ...	c:\Users\purch\Documents\Projects\Github\shivi...	CA
3	d380meD0W0M	2017-11-14	I Dare You: GOING BALD!?	nigahiga	24	2017-11-12T18:01:41.000Z	ryan\|"higa"\|"higatv"\|"nigahiga"\|"i dare you"\|"...	2095828	132239	1989	17518	https://i.ytimg.com/vi/d380meD0W0M/default.jpg	False	False	False	I know it's been a while since we did this sho...	c:\Users\purch\Documents\Projects\Github\shivi...	CA
4	2Vv-BfVoq4g	2017-11-14	Ed Sheeran - Perfect (Official Music Video)	Ed Sheeran	10	2017-11-09T11:04:14.000Z	edsheeran\|"ed sheeran"\|"acoustic"\|"live"\|"cove...	33523622	1634130	21082	85067	https://i.ytimg.com/vi/2Vv-BfVoq4g/default.jpg	False	False	False	ð§: https://ad.gt/yt-perfect\nð°: https://...	c:\Users\purch\Documents\Projects\Github\shivi...	CA

Column	Type	Missing	Unique	Min	Max	Mean
video_id	object	0	184287
trending_date	datetime64[ns]	0	205
title	object	0	186272
channel_title	object	0	37824
category_id	int64	0	18	1	44	20.232
publish_time	object	0	169286
tags	object	0	142211
views	int64	0	240399	117	4.2454e+08	1.3266e+06
likes	int64	0	73693	0	5.6138e+06	37884
dislikes	int64	0	16715	0	1.945e+06	2126.1
comment_count	int64	0	24477	0	1.6265e+06	4253.8
thumbnail_link	object	0	185690
comments_disabled	bool	0	2
ratings_disabled	bool	0	2
video_error_or_removed	bool	0	2
description	object	19478	156165
filename	object	0	10
country_cd	object	0	10

	kind	etag	id	snippet_channelId	snippet_title	snippet_assignable	filename	country_cd
0	youtube#videoCategory	"ld9biNPKjAjgjV7EZ4EKeEGrhao/Xy1mB4_yLrHy_BmKm...	1	UCBR8-60-B28hp2BmDPdntcQ	Film & Animation	True	c:\Users\purch\Documents\Projects\Github\shivi...	CA
1	youtube#videoCategory	"ld9biNPKjAjgjV7EZ4EKeEGrhao/UZ1oLIIz2dxIhO45Z...	2	UCBR8-60-B28hp2BmDPdntcQ	Autos & Vehicles	True	c:\Users\purch\Documents\Projects\Github\shivi...	CA
2	youtube#videoCategory	"ld9biNPKjAjgjV7EZ4EKeEGrhao/nqRIq97-xe5XRZTxb...	10	UCBR8-60-B28hp2BmDPdntcQ	Music	True	c:\Users\purch\Documents\Projects\Github\shivi...	CA
3	youtube#videoCategory	"ld9biNPKjAjgjV7EZ4EKeEGrhao/HwXKamM1Q20q9BN-o...	15	UCBR8-60-B28hp2BmDPdntcQ	Pets & Animals	True	c:\Users\purch\Documents\Projects\Github\shivi...	CA
4	youtube#videoCategory	"ld9biNPKjAjgjV7EZ4EKeEGrhao/9GQMSRjrZdHeb1OEM...	17	UCBR8-60-B28hp2BmDPdntcQ	Sports	True	c:\Users\purch\Documents\Projects\Github\shivi...	CA

Column	Type	Unique	Min	Max	Mean
kind	object	1
etag	object	94
id	int64	32	1	44	27.521
snippet_channelId	object	1
snippet_title	object	31
snippet_assignable	bool	2
filename	object	10
country_cd	object	10

	trending_date	title	channel_title	category_id	publish_time	tags	views	likes	dislikes	comment_count	thumbnail_link	comments_disabled	ratings_disabled	video_error_or_removed	description	country_cd	id	snippet_title	snippet_assignable
0	2017-11-14	Eminem - Walk On Water (Audio) ft. BeyoncÃ©	EminemVEVO	10	2017-11-10T17:00:03.000Z	Eminem\|"Walk"\|"On"\|"Water"\|"Aftermath/Shady/In...	17158579	787425	43420	125882	https://i.ytimg.com/vi/n1WpP7iowLc/default.jpg	False	False	False	Eminem's new track Walk on Water ft. BeyoncÃ© ...	CA	10.0	Music	True
1	2017-11-14	PLUSH - Bad Unboxing Fan Mail	iDubbbzTV	23	2017-11-13T17:00:00.000Z	plush\|"bad unboxing"\|"unboxing"\|"fan mail"\|"id...	1014651	127794	1688	13030	https://i.ytimg.com/vi/0dBIkQ4Mz1M/default.jpg	False	False	False	STill got a lot of packages. Probably will las...	CA	23.0	Comedy	True
2	2017-11-14	Racist Superman \| Rudy Mancuso, King Bach & Le...	Rudy Mancuso	23	2017-11-12T19:05:24.000Z	racist superman\|"rudy"\|"mancuso"\|"king"\|"bach"...	3191434	146035	5339	8181	https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg	False	False	False	WATCH MY PREVIOUS VIDEO â¶ \n\nSUBSCRIBE âº ...	CA	23.0	Comedy	True
3	2017-11-14	I Dare You: GOING BALD!?	nigahiga	24	2017-11-12T18:01:41.000Z	ryan\|"higa"\|"higatv"\|"nigahiga"\|"i dare you"\|"...	2095828	132239	1989	17518	https://i.ytimg.com/vi/d380meD0W0M/default.jpg	False	False	False	I know it's been a while since we did this sho...	CA	24.0	Entertainment	True
4	2017-11-14	Ed Sheeran - Perfect (Official Music Video)	Ed Sheeran	10	2017-11-09T11:04:14.000Z	edsheeran\|"ed sheeran"\|"acoustic"\|"live"\|"cove...	33523622	1634130	21082	85067	https://i.ytimg.com/vi/2Vv-BfVoq4g/default.jpg	False	False	False	ð§: https://ad.gt/yt-perfect\nð°: https://...	CA	10.0	Music	True

YouTube Video Analysis¶

Data Preparation¶

Data Preprocessing & Cleaning¶

DataFrame Summary¶

Column Summary:¶

DataFrame Summary¶

Column Summary:¶

DataFrame Summary¶

Column Summary:¶