US Baby Names 1880-2010¶

In [1]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import sys
import os
import shutil
from pathlib import Path

sys.path.append("../")

from helpers.utils import download_by_url
In [2]:
DATASET_DIR = download_by_url(
    "https://www.ssa.gov/oact/babynames/names.zip",
    "../input/us_baby_names/names.zip",
    unzip=True,
)
Downloading https://www.ssa.gov/oact/babynames/names.zip to ../input/us_baby_names/names.zip
✅ Download complete: ../input/us_baby_names/names.zip
Unzipping ../input/us_baby_names/names.zip
✅ Unzipped to: ../input/us_baby_names
In [3]:
for root, _, f in os.walk(DATASET_DIR):
    print(f)
['NationalReadMe.pdf', 'yob1880.txt', 'yob1881.txt', 'yob1882.txt', 'yob1883.txt', 'yob1884.txt', 'yob1885.txt', 'yob1886.txt', 'yob1887.txt', 'yob1888.txt', 'yob1889.txt', 'yob1890.txt', 'yob1891.txt', 'yob1892.txt', 'yob1893.txt', 'yob1894.txt', 'yob1895.txt', 'yob1896.txt', 'yob1897.txt', 'yob1898.txt', 'yob1899.txt', 'yob1900.txt', 'yob1901.txt', 'yob1902.txt', 'yob1903.txt', 'yob1904.txt', 'yob1905.txt', 'yob1906.txt', 'yob1907.txt', 'yob1908.txt', 'yob1909.txt', 'yob1910.txt', 'yob1911.txt', 'yob1912.txt', 'yob1913.txt', 'yob1914.txt', 'yob1915.txt', 'yob1916.txt', 'yob1917.txt', 'yob1918.txt', 'yob1919.txt', 'yob1920.txt', 'yob1921.txt', 'yob1922.txt', 'yob1923.txt', 'yob1924.txt', 'yob1925.txt', 'yob1926.txt', 'yob1927.txt', 'yob1928.txt', 'yob1929.txt', 'yob1930.txt', 'yob1931.txt', 'yob1932.txt', 'yob1933.txt', 'yob1934.txt', 'yob1935.txt', 'yob1936.txt', 'yob1937.txt', 'yob1938.txt', 'yob1939.txt', 'yob1940.txt', 'yob1941.txt', 'yob1942.txt', 'yob1943.txt', 'yob1944.txt', 'yob1945.txt', 'yob1946.txt', 'yob1947.txt', 'yob1948.txt', 'yob1949.txt', 'yob1950.txt', 'yob1951.txt', 'yob1952.txt', 'yob1953.txt', 'yob1954.txt', 'yob1955.txt', 'yob1956.txt', 'yob1957.txt', 'yob1958.txt', 'yob1959.txt', 'yob1960.txt', 'yob1961.txt', 'yob1962.txt', 'yob1963.txt', 'yob1964.txt', 'yob1965.txt', 'yob1966.txt', 'yob1967.txt', 'yob1968.txt', 'yob1969.txt', 'yob1970.txt', 'yob1971.txt', 'yob1972.txt', 'yob1973.txt', 'yob1974.txt', 'yob1975.txt', 'yob1976.txt', 'yob1977.txt', 'yob1978.txt', 'yob1979.txt', 'yob1980.txt', 'yob1981.txt', 'yob1982.txt', 'yob1983.txt', 'yob1984.txt', 'yob1985.txt', 'yob1986.txt', 'yob1987.txt', 'yob1988.txt', 'yob1989.txt', 'yob1990.txt', 'yob1991.txt', 'yob1992.txt', 'yob1993.txt', 'yob1994.txt', 'yob1995.txt', 'yob1996.txt', 'yob1997.txt', 'yob1998.txt', 'yob1999.txt', 'yob2000.txt', 'yob2001.txt', 'yob2002.txt', 'yob2003.txt', 'yob2004.txt', 'yob2005.txt', 'yob2006.txt', 'yob2007.txt', 'yob2008.txt', 'yob2009.txt', 'yob2010.txt', 'yob2011.txt', 'yob2012.txt', 'yob2013.txt', 'yob2014.txt', 'yob2015.txt', 'yob2016.txt', 'yob2017.txt', 'yob2018.txt', 'yob2019.txt', 'yob2020.txt', 'yob2021.txt', 'yob2022.txt', 'yob2023.txt', 'yob2024.txt']
In [4]:
names1880 = pd.read_csv(
    os.path.join(DATASET_DIR, "yob1880.txt"), names=["name", "sex", "births"]
)
names1880.head()
Out[4]:
name sex births
0 Mary F 7065
1 Anna F 2604
2 Emma F 2003
3 Elizabeth F 1939
4 Minnie F 1746
In [5]:
names1880.groupby("sex")["births"].sum()
Out[5]:
sex
F     90994
M    110490
Name: births, dtype: int64
In [6]:
pieces = []
for year in range(1880, 2011):
    path = os.path.join(DATASET_DIR, f"yob{year}.txt")
    frame = pd.read_csv(path, names=["name", "sex", "births"])
    # Add a column for the year
    frame["year"] = year
    pieces.append(frame)

# Concatenate everything into a single DataFrame
names = pd.concat(pieces, ignore_index=True)
In [7]:
names.head()
Out[7]:
name sex births year
0 Mary F 7065 1880
1 Anna F 2604 1880
2 Emma F 2003 1880
3 Elizabeth F 1939 1880
4 Minnie F 1746 1880
In [8]:
total_births = names.pivot_table("births", index="year", columns="sex", aggfunc="sum")
total_births.tail()
Out[8]:
sex F M
year
2006 1901070 2055119
2007 1922448 2075235
2008 1890718 2039947
2009 1836870 1983305
2010 1777327 1918412
In [9]:
total_births.plot(title="Total births by sex and year")
Out[9]:
<Axes: title={'center': 'Total births by sex and year'}, xlabel='year'>
No description has been provided for this image
In [10]:
def add_prop(group):
    group["prop"] = group["births"] / group["births"].sum()
    return group


names = (
    names.groupby(["year", "sex"]).apply(add_prop, include_groups=False).reset_index()
)
In [11]:
names.head()
Out[11]:
year sex level_2 name births prop
0 1880 F 0 Mary 7065 0.077642
1 1880 F 1 Anna 2604 0.028617
2 1880 F 2 Emma 2003 0.022012
3 1880 F 3 Elizabeth 1939 0.021309
4 1880 F 4 Minnie 1746 0.019188
In [12]:
names.groupby(["year", "sex"])["prop"].sum()
Out[12]:
year  sex
1880  F      1.0
      M      1.0
1881  F      1.0
      M      1.0
1882  F      1.0
            ... 
2008  M      1.0
2009  F      1.0
      M      1.0
2010  F      1.0
      M      1.0
Name: prop, Length: 262, dtype: float64
In [13]:
def get_top1000(group):
    return group.sort_values(by="births", ascending=False).head(1000)


grouped = names.groupby(["year", "sex"])
top1000 = grouped.apply(get_top1000)
top1000.head()
C:\Users\purch\AppData\Local\Temp\ipykernel_24316\1970094788.py:6: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  top1000 = grouped.apply(get_top1000)
Out[13]:
year sex level_2 name births prop
year sex
1880 F 0 1880 F 0 Mary 7065 0.077642
1 1880 F 1 Anna 2604 0.028617
2 1880 F 2 Emma 2003 0.022012
3 1880 F 3 Elizabeth 1939 0.021309
4 1880 F 4 Minnie 1746 0.019188
In [14]:
top1000 = top1000.reset_index(drop=True)
top1000.head()
Out[14]:
year sex level_2 name births prop
0 1880 F 0 Mary 7065 0.077642
1 1880 F 1 Anna 2604 0.028617
2 1880 F 2 Emma 2003 0.022012
3 1880 F 3 Elizabeth 1939 0.021309
4 1880 F 4 Minnie 1746 0.019188

Analyzing Naming Trends¶

In [15]:
boys = top1000[top1000["sex"] == "M"]
girls = top1000[top1000["sex"] == "F"]

total_births = top1000.pivot_table(
    "births", index="year", columns="name", aggfunc="sum"
)
total_births.tail()
Out[15]:
name Aaden Aaliyah Aarav Aaron Aarush Ab Abagail Abb Abbey Abbie ... Zoa Zoe Zoey Zoie Zola Zollie Zona Zora Zula Zuri
year
2006 NaN 3745.0 NaN 8309.0 NaN NaN 298.0 NaN 404.0 440.0 ... NaN 5153.0 2858.0 531.0 NaN NaN NaN NaN NaN NaN
2007 NaN 3960.0 NaN 8947.0 NaN NaN 314.0 NaN 349.0 469.0 ... NaN 4937.0 3044.0 527.0 NaN NaN NaN NaN NaN NaN
2008 958.0 4045.0 219.0 8545.0 NaN NaN 321.0 NaN 344.0 401.0 ... NaN 4781.0 3449.0 494.0 NaN NaN NaN NaN NaN NaN
2009 1268.0 4378.0 270.0 7983.0 NaN NaN 297.0 NaN 307.0 370.0 ... NaN 5152.0 3995.0 500.0 NaN NaN NaN NaN NaN NaN
2010 450.0 4672.0 438.0 7468.0 227.0 NaN 282.0 NaN 299.0 329.0 ... NaN 6274.0 5219.0 508.0 NaN NaN NaN NaN NaN 259.0

5 rows × 6848 columns

In [16]:
total_births.info()
<class 'pandas.core.frame.DataFrame'>
Index: 131 entries, 1880 to 2010
Columns: 6848 entries, Aaden to Zuri
dtypes: float64(6848)
memory usage: 6.8 MB
In [17]:
subset = total_births[["John", "Harry", "Mary", "Marilyn"]]
In [18]:
subset.plot(subplots=True, figsize=(12, 10), title="Number of births per year")
Out[18]:
array([<Axes: xlabel='year'>, <Axes: xlabel='year'>,
       <Axes: xlabel='year'>, <Axes: xlabel='year'>], dtype=object)
No description has been provided for this image

Measuring the increase in naming diversity¶

In [19]:
table = top1000.pivot_table("prop", index="year", columns="sex", aggfunc="sum")
In [20]:
table.plot(
    title="Sum of table1000.prop by year and sex", yticks=np.linspace(0, 1.2, 13)
)
Out[20]:
<Axes: title={'center': 'Sum of table1000.prop by year and sex'}, xlabel='year'>
No description has been provided for this image
In [21]:
df = boys[boys["year"] == 2010]
df
Out[21]:
year sex level_2 name births prop
260876 2010 M 1678364 Jacob 22154 0.011548
260877 2010 M 1678365 Ethan 18017 0.009392
260878 2010 M 1678366 Michael 17381 0.009060
260879 2010 M 1678367 Jayden 17200 0.008966
260880 2010 M 1678368 William 17071 0.008899
... ... ... ... ... ... ...
261871 2010 M 1679358 Taj 197 0.000103
261872 2010 M 1679361 Destin 196 0.000102
261873 2010 M 1679365 Joziah 196 0.000102
261874 2010 M 1679366 Keshawn 196 0.000102
261875 2010 M 1679363 Enoch 196 0.000102

1000 rows × 6 columns

In [22]:
prop_cumsum = df["prop"].sort_values(ascending=False).cumsum()
prop_cumsum[:10]
Out[22]:
260876    0.011548
260877    0.020940
260878    0.030000
260879    0.038966
260880    0.047864
260881    0.056604
260882    0.065194
260883    0.073463
260884    0.081567
260885    0.089651
Name: prop, dtype: float64
In [23]:
prop_cumsum.searchsorted(0.5)
Out[23]:
np.int64(116)
In [24]:
df = boys[boys.year == 1900]
in1900 = df.sort_values("prop", ascending=False).prop.cumsum()
in1900.searchsorted(0.5) + 1
Out[24]:
np.int64(25)
In [25]:
def get_quantile_count(group, q=0.5):
    group = group.sort_values("prop", ascending=False)
    return group.prop.cumsum().searchsorted(q) + 1

diversity = top1000.groupby(["year", "sex"]).apply(get_quantile_count)
diversity = diversity.unstack()
diversity.head()
C:\Users\purch\AppData\Local\Temp\ipykernel_24316\1524694453.py:5: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  diversity = top1000.groupby(["year", "sex"]).apply(get_quantile_count)
Out[25]:
sex F M
year
1880 38 14
1881 38 14
1882 38 15
1883 39 15
1884 39 16
In [26]:
diversity.plot(title="Number of popular names in top 50%")
Out[26]:
<Axes: title={'center': 'Number of popular names in top 50%'}, xlabel='year'>
No description has been provided for this image

The “last letter” revolution¶

In [27]:
def get_last_letter(x):
    return x[-1]

last_letters = names["name"].map(get_last_letter)
In [28]:
last_letters.name = "last_letter"
table = names.pivot_table(
    "births", index=last_letters, columns=["sex", "year"], aggfunc=sum
)
table.head()
C:\Users\purch\AppData\Local\Temp\ipykernel_24316\686817189.py:2: FutureWarning: The provided callable <built-in function sum> is currently using DataFrameGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "sum" instead.
  table = names.pivot_table(
Out[28]:
sex F ... M
year 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 ... 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010
last_letter
a 31446.0 31581.0 36536.0 38330.0 43680.0 45408.0 49100.0 48941.0 59440.0 58631.0 ... 39176.0 38862.0 37878.0 38727.0 36926.0 36271.0 34780.0 33074.0 31659.0 28906.0
b NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... 51028.0 49344.0 48140.0 45985.0 43232.0 42715.0 42282.0 40124.0 39131.0 39320.0
c NaN NaN 5.0 5.0 NaN NaN NaN NaN NaN NaN ... 27135.0 27269.0 27734.0 26810.0 26133.0 26694.0 26933.0 25414.0 24180.0 23378.0
d 609.0 607.0 734.0 810.0 916.0 862.0 1007.0 1027.0 1298.0 1374.0 ... 60941.0 55969.0 53499.0 51890.0 50815.0 51581.0 50796.0 48121.0 46402.0 44876.0
e 33381.0 34080.0 40399.0 41914.0 48089.0 49616.0 53883.0 54353.0 66748.0 66662.0 ... 145560.0 144833.0 144984.0 142315.0 141380.0 143326.0 144126.0 141487.0 136172.0 130395.0

5 rows × 262 columns

In [29]:
subtable = table.reindex(columns=[1910, 1960, 2010], level="year")
subtable.head()
Out[29]:
sex F M
year 1910 1960 2010 1910 1960 2010
last_letter
a 108399.0 691254.0 677566.0 977.0 5216.0 28906.0
b NaN 694.0 455.0 411.0 3913.0 39320.0
c 5.0 49.0 958.0 482.0 15456.0 23378.0
d 6751.0 3732.0 2645.0 22111.0 262105.0 44876.0
e 133599.0 435000.0 317138.0 28665.0 178731.0 130395.0
In [30]:
subtable.sum()
Out[30]:
sex  year
F    1910     396503.0
     1960    2021900.0
     2010    1777327.0
M    1910     194208.0
     1960    2131915.0
     2010    1918412.0
dtype: float64
In [31]:
letter_prop = subtable / subtable.sum()
letter_prop.head()
Out[31]:
sex F M
year 1910 1960 2010 1910 1960 2010
last_letter
a 0.273388 0.341883 0.381228 0.005031 0.002447 0.015068
b NaN 0.000343 0.000256 0.002116 0.001835 0.020496
c 0.000013 0.000024 0.000539 0.002482 0.007250 0.012186
d 0.017026 0.001846 0.001488 0.113852 0.122943 0.023392
e 0.336943 0.215144 0.178435 0.147599 0.083836 0.067970
In [32]:
import matplotlib.pyplot as plt

fig, axes = plt.subplots(2, 1, figsize=(10, 8))
letter_prop["M"].plot(kind="bar", rot=0, ax=axes[0], title="Male")
letter_prop["F"].plot(kind="bar", rot=0, ax=axes[1], title="Female", legend=False)
Out[32]:
<Axes: title={'center': 'Female'}, xlabel='last_letter'>
No description has been provided for this image
In [33]:
letter_prop = table / table.sum()
letter_prop.head()
Out[33]:
sex F ... M
year 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 ... 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010
last_letter
a 0.345583 0.343447 0.338776 0.341254 0.338555 0.341272 0.339715 0.335256 0.332770 0.328712 ... 0.020164 0.020017 0.019176 0.019512 0.018489 0.017649 0.016760 0.016213 0.015963 0.015068
b NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... 0.026265 0.025416 0.024371 0.023169 0.021647 0.020785 0.020375 0.019669 0.019730 0.020496
c NaN NaN 0.000046 0.000045 NaN NaN NaN NaN NaN NaN ... 0.013967 0.014046 0.014041 0.013508 0.013085 0.012989 0.012978 0.012458 0.012192 0.012186
d 0.006693 0.006601 0.006806 0.007211 0.007100 0.006479 0.006967 0.007035 0.007267 0.007703 ... 0.031367 0.028828 0.027084 0.026144 0.025444 0.025099 0.024477 0.023589 0.023396 0.023392
e 0.366848 0.370624 0.374595 0.373163 0.372728 0.372898 0.372808 0.372329 0.373683 0.373737 ... 0.074922 0.074600 0.073400 0.071702 0.070791 0.069741 0.069450 0.069358 0.068659 0.067970

5 rows × 262 columns

In [34]:
dny_ts = letter_prop.loc[["d", "n", "y"], "M"].T
dny_ts.head()
Out[34]:
last_letter d n y
year
1880 0.083057 0.153217 0.075763
1881 0.083238 0.153214 0.077461
1882 0.085332 0.149561 0.077538
1883 0.084053 0.151656 0.079149
1884 0.086122 0.149927 0.080408
In [35]:
dny_ts.plot()
Out[35]:
<Axes: xlabel='year'>
No description has been provided for this image

Boy names that became girl names (and vice versa)¶

In [36]:
all_names = pd.Series(top1000["name"].unique())
lesley_like = all_names[all_names.str.contains("Lesl")]
lesley_like.head()
Out[36]:
645     Leslie
2324    Lesley
4238    Leslee
4709     Lesli
6084     Lesly
dtype: object
In [37]:
filtered = top1000[top1000["name"].isin(lesley_like)]
filtered.groupby("name")["births"].sum()
Out[37]:
name
Leslee       994
Lesley     35029
Lesli        929
Leslie    370585
Lesly      10073
Name: births, dtype: int64
In [38]:
table = filtered.pivot_table("births", index="year", columns="sex", aggfunc="sum")
table = table.div(table.sum(axis="columns"), axis="index")
table.tail()
Out[38]:
sex F M
year
2006 1.0 NaN
2007 1.0 NaN
2008 1.0 NaN
2009 1.0 NaN
2010 1.0 NaN
In [39]:
table.plot(style={"M": "k-", "F": "k--"})
Out[39]:
<Axes: xlabel='year'>
No description has been provided for this image
In [ ]: