2012 Federal Election Commission Database¶
In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sys
import os
import shutil
from pathlib import Path
sys.path.append("../")
from helpers.utils import download_by_url
In [2]:
url = "https://raw.githubusercontent.com/wesm/pydata-book/refs/heads/3rd-edition/datasets/fec/P00000001-ALL.csv"
DATASET_DIR = download_by_url(url, "../input/federal-election-commission-2012/P00000001-ALL.csv")
File already downloaded to: ../input/federal-election-commission-2012/P00000001-ALL.csv
In [3]:
fec = pd.read_csv(Path(DATASET_DIR) / "P00000001-ALL.csv", low_memory=False)
fec.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1001731 entries, 0 to 1001730 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 cmte_id 1001731 non-null object 1 cand_id 1001731 non-null object 2 cand_nm 1001731 non-null object 3 contbr_nm 1001731 non-null object 4 contbr_city 1001712 non-null object 5 contbr_st 1001727 non-null object 6 contbr_zip 1001620 non-null object 7 contbr_employer 988002 non-null object 8 contbr_occupation 993301 non-null object 9 contb_receipt_amt 1001731 non-null float64 10 contb_receipt_dt 1001731 non-null object 11 receipt_desc 14166 non-null object 12 memo_cd 92482 non-null object 13 memo_text 97770 non-null object 14 form_tp 1001731 non-null object 15 file_num 1001731 non-null int64 dtypes: float64(1), int64(1), object(14) memory usage: 122.3+ MB
In [4]:
fec.iloc[123456]
Out[4]:
cmte_id C00431445 cand_id P80003338 cand_nm Obama, Barack contbr_nm ELLMAN, IRA contbr_city TEMPE contbr_st AZ contbr_zip 852816719 contbr_employer ARIZONA STATE UNIVERSITY contbr_occupation PROFESSOR contb_receipt_amt 50.0 contb_receipt_dt 01-DEC-11 receipt_desc NaN memo_cd NaN memo_text NaN form_tp SA17A file_num 772372 Name: 123456, dtype: object
In [5]:
unique_cands = fec["cand_nm"].unique()
unique_cands
Out[5]:
array(['Bachmann, Michelle', 'Romney, Mitt', 'Obama, Barack',
"Roemer, Charles E. 'Buddy' III", 'Pawlenty, Timothy',
'Johnson, Gary Earl', 'Paul, Ron', 'Santorum, Rick',
'Cain, Herman', 'Gingrich, Newt', 'McCotter, Thaddeus G',
'Huntsman, Jon', 'Perry, Rick'], dtype=object)
In [6]:
unique_cands[2]
Out[6]:
'Obama, Barack'
In [7]:
parties = {"Bachmann, Michelle": "Republican",
"Cain, Herman": "Republican",
"Gingrich, Newt": "Republican",
"Huntsman, Jon": "Republican",
"Johnson, Gary Earl": "Republican",
"McCotter, Thaddeus G": "Republican",
"Obama, Barack": "Democrat",
"Paul, Ron": "Republican",
"Pawlenty, Timothy": "Republican",
"Perry, Rick": "Republican",
"Roemer, Charles E. 'Buddy' III": "Republican",
"Romney, Mitt": "Republican",
"Santorum, Rick": "Republican"}
In [8]:
fec["cand_nm"][123456:123461]
Out[8]:
123456 Obama, Barack 123457 Obama, Barack 123458 Obama, Barack 123459 Obama, Barack 123460 Obama, Barack Name: cand_nm, dtype: object
In [9]:
fec["cand_nm"][123456:123461].map(parties)
Out[9]:
123456 Democrat 123457 Democrat 123458 Democrat 123459 Democrat 123460 Democrat Name: cand_nm, dtype: object
In [10]:
fec["party"] = fec["cand_nm"].map(parties)
fec["party"].value_counts()
Out[10]:
party Democrat 593746 Republican 407985 Name: count, dtype: int64
In [11]:
(fec["contb_receipt_amt"] > 0).value_counts()
Out[11]:
contb_receipt_amt True 991475 False 10256 Name: count, dtype: int64
In [12]:
fec = fec[fec["contb_receipt_amt"] > 0]
In [13]:
fec_mrbo = fec[fec["cand_nm"].isin(["Obama, Barack", "Romney, Mitt"])]
Donation Statistics by Occupation and Employer¶
In [14]:
fec["contbr_occupation"].value_counts()[:10]
Out[14]:
contbr_occupation RETIRED 233990 INFORMATION REQUESTED 35107 ATTORNEY 34286 HOMEMAKER 29931 PHYSICIAN 23432 INFORMATION REQUESTED PER BEST EFFORTS 21138 ENGINEER 14334 TEACHER 13990 CONSULTANT 13273 PROFESSOR 12555 Name: count, dtype: int64
In [15]:
occ_mapping = {
"INFORMATION REQUESTED PER BEST EFFORTS": "NOT PROVIDED",
"INFORMATION REQUESTED": "NOT PROVIDED",
"INFORMATION REQUESTED (BEST EFFORTS)": "NOT PROVIDED",
"C.E.O.": "CEO",
}
def get_occ(x):
# If no mapping provided, return x
return occ_mapping.get(x, x)
fec["contbr_occupation"] = fec["contbr_occupation"].map(get_occ)
In [16]:
emp_mapping = {
"INFORMATION REQUESTED PER BEST EFFORTS": "NOT PROVIDED",
"INFORMATION REQUESTED": "NOT PROVIDED",
"SELF": "SELF-EMPLOYED",
"SELF EMPLOYED": "SELF-EMPLOYED",
}
def get_emp(x):
# If no mapping provided, return x
return emp_mapping.get(x, x)
fec["contbr_employer"] = fec["contbr_employer"].map(get_emp)
In [17]:
by_occupation = fec.pivot_table(
"contb_receipt_amt", index="contbr_occupation", columns="party", aggfunc="sum"
)
over_2mm = by_occupation[by_occupation.sum(axis="columns") > 2000000]
over_2mm
Out[17]:
| party | Democrat | Republican |
|---|---|---|
| contbr_occupation | ||
| ATTORNEY | 11141982.97 | 7477194.43 |
| CEO | 2074974.79 | 4211040.52 |
| CONSULTANT | 2459912.71 | 2544725.45 |
| ENGINEER | 951525.55 | 1818373.70 |
| EXECUTIVE | 1355161.05 | 4138850.09 |
| HOMEMAKER | 4248875.80 | 13634275.78 |
| INVESTOR | 884133.00 | 2431768.92 |
| LAWYER | 3160478.87 | 391224.32 |
| MANAGER | 762883.22 | 1444532.37 |
| NOT PROVIDED | 4866973.96 | 20565473.01 |
| OWNER | 1001567.36 | 2408286.92 |
| PHYSICIAN | 3735124.94 | 3594320.24 |
| PRESIDENT | 1878509.95 | 4720923.76 |
| PROFESSOR | 2165071.08 | 296702.73 |
| REAL ESTATE | 528902.09 | 1625902.25 |
| RETIRED | 25305116.38 | 23561244.49 |
| SELF-EMPLOYED | 672393.40 | 1640252.54 |
In [18]:
over_2mm.plot(kind="barh")
Out[18]:
<Axes: ylabel='contbr_occupation'>
In [19]:
def get_top_amounts(group, key, n=5):
totals = group.groupby(key)["contb_receipt_amt"].sum()
return totals.nlargest(n)
grouped = fec_mrbo.groupby("cand_nm")
grouped.apply(get_top_amounts, "contbr_occupation", n=7)
C:\Users\purch\AppData\Local\Temp\ipykernel_6632\1928039006.py:6: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning. grouped.apply(get_top_amounts, "contbr_occupation", n=7)
Out[19]:
cand_nm contbr_occupation
Obama, Barack RETIRED 25305116.38
ATTORNEY 11141982.97
INFORMATION REQUESTED 4866973.96
HOMEMAKER 4248875.80
PHYSICIAN 3735124.94
LAWYER 3160478.87
CONSULTANT 2459912.71
Romney, Mitt RETIRED 11508473.59
INFORMATION REQUESTED PER BEST EFFORTS 11396894.84
HOMEMAKER 8147446.22
ATTORNEY 5364718.82
PRESIDENT 2491244.89
EXECUTIVE 2300947.03
C.E.O. 1968386.11
Name: contb_receipt_amt, dtype: float64
In [20]:
grouped.apply(get_top_amounts, "contbr_employer", n=10)
C:\Users\purch\AppData\Local\Temp\ipykernel_6632\4213707751.py:1: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning. grouped.apply(get_top_amounts, "contbr_employer", n=10)
Out[20]:
cand_nm contbr_employer
Obama, Barack RETIRED 22694358.85
SELF-EMPLOYED 17080985.96
NOT EMPLOYED 8586308.70
INFORMATION REQUESTED 5053480.37
HOMEMAKER 2605408.54
SELF 1076531.20
SELF EMPLOYED 469290.00
STUDENT 318831.45
VOLUNTEER 257104.00
MICROSOFT 215585.36
Romney, Mitt INFORMATION REQUESTED PER BEST EFFORTS 12059527.24
RETIRED 11506225.71
HOMEMAKER 8147196.22
SELF-EMPLOYED 7409860.98
STUDENT 496490.94
CREDIT SUISSE 281150.00
MORGAN STANLEY 267266.00
GOLDMAN SACH & CO. 238250.00
BARCLAYS CAPITAL 162750.00
H.I.G. CAPITAL 139500.00
Name: contb_receipt_amt, dtype: float64
Bucketing Donation Amounts¶
In [21]:
bins = np.array([0, 1, 10, 100, 1000, 10000, 100_000, 1_000_000, 10_000_000])
In [22]:
labels = pd.cut(fec_mrbo["contb_receipt_amt"], bins)
labels
Out[22]:
411 (10, 100]
412 (100, 1000]
413 (100, 1000]
414 (10, 100]
415 (10, 100]
...
701381 (10, 100]
701382 (100, 1000]
701383 (1, 10]
701384 (10, 100]
701385 (100, 1000]
Name: contb_receipt_amt, Length: 694282, dtype: category
Categories (8, interval[int64, right]): [(0, 1] < (1, 10] < (10, 100] < (100, 1000] < (1000, 10000] < (10000, 100000] < (100000, 1000000] < (1000000, 10000000]]
In [23]:
grouped = fec_mrbo.groupby(["cand_nm", labels])
grouped.size().unstack(level=0)
C:\Users\purch\AppData\Local\Temp\ipykernel_6632\3276222007.py:1: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped = fec_mrbo.groupby(["cand_nm", labels])
Out[23]:
| cand_nm | Obama, Barack | Romney, Mitt |
|---|---|---|
| contb_receipt_amt | ||
| (0, 1] | 493 | 77 |
| (1, 10] | 40070 | 3681 |
| (10, 100] | 372280 | 31853 |
| (100, 1000] | 153991 | 43357 |
| (1000, 10000] | 22284 | 26186 |
| (10000, 100000] | 2 | 1 |
| (100000, 1000000] | 3 | 0 |
| (1000000, 10000000] | 4 | 0 |
In [24]:
bucket_sums = grouped["contb_receipt_amt"].sum().unstack(level=0)
normed_sums = bucket_sums.div(bucket_sums.sum(axis="columns"), axis="index")
normed_sums
Out[24]:
| cand_nm | Obama, Barack | Romney, Mitt |
|---|---|---|
| contb_receipt_amt | ||
| (0, 1] | 0.805182 | 0.194818 |
| (1, 10] | 0.918767 | 0.081233 |
| (10, 100] | 0.910769 | 0.089231 |
| (100, 1000] | 0.710176 | 0.289824 |
| (1000, 10000] | 0.447326 | 0.552674 |
| (10000, 100000] | 0.823120 | 0.176880 |
| (100000, 1000000] | 1.000000 | 0.000000 |
| (1000000, 10000000] | 1.000000 | 0.000000 |
In [25]:
normed_sums[:-2].plot(kind="barh")
Out[25]:
<Axes: ylabel='contb_receipt_amt'>
Donation Statistics by State¶
In [26]:
grouped = fec_mrbo.groupby(["cand_nm", "contbr_st"])
totals = grouped["contb_receipt_amt"].sum().unstack(level=0).fillna(0)
totals = totals[totals.sum(axis="columns") > 100000]
totals.head()
Out[26]:
| cand_nm | Obama, Barack | Romney, Mitt |
|---|---|---|
| contbr_st | ||
| AK | 281840.15 | 86204.24 |
| AL | 543123.48 | 527303.51 |
| AR | 359247.28 | 105556.00 |
| AZ | 1506476.98 | 1888436.23 |
| CA | 23824984.24 | 11237636.60 |
In [27]:
percent = totals.div(totals.sum(axis="columns"), axis="index")
percent.head()
Out[27]:
| cand_nm | Obama, Barack | Romney, Mitt |
|---|---|---|
| contbr_st | ||
| AK | 0.765778 | 0.234222 |
| AL | 0.507390 | 0.492610 |
| AR | 0.772902 | 0.227098 |
| AZ | 0.443745 | 0.556255 |
| CA | 0.679498 | 0.320502 |
In [ ]: