2012 Federal Election Commission Database¶

In [1]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import sys
import os
import shutil
from pathlib import Path

sys.path.append("../")

from helpers.utils import download_by_url
In [2]:
url = "https://raw.githubusercontent.com/wesm/pydata-book/refs/heads/3rd-edition/datasets/fec/P00000001-ALL.csv"
DATASET_DIR = download_by_url(url, "../input/federal-election-commission-2012/P00000001-ALL.csv")
File already downloaded to: ../input/federal-election-commission-2012/P00000001-ALL.csv
In [3]:
fec = pd.read_csv(Path(DATASET_DIR) / "P00000001-ALL.csv", low_memory=False)
fec.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001731 entries, 0 to 1001730
Data columns (total 16 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   cmte_id            1001731 non-null  object 
 1   cand_id            1001731 non-null  object 
 2   cand_nm            1001731 non-null  object 
 3   contbr_nm          1001731 non-null  object 
 4   contbr_city        1001712 non-null  object 
 5   contbr_st          1001727 non-null  object 
 6   contbr_zip         1001620 non-null  object 
 7   contbr_employer    988002 non-null   object 
 8   contbr_occupation  993301 non-null   object 
 9   contb_receipt_amt  1001731 non-null  float64
 10  contb_receipt_dt   1001731 non-null  object 
 11  receipt_desc       14166 non-null    object 
 12  memo_cd            92482 non-null    object 
 13  memo_text          97770 non-null    object 
 14  form_tp            1001731 non-null  object 
 15  file_num           1001731 non-null  int64  
dtypes: float64(1), int64(1), object(14)
memory usage: 122.3+ MB
In [4]:
fec.iloc[123456]
Out[4]:
cmte_id                             C00431445
cand_id                             P80003338
cand_nm                         Obama, Barack
contbr_nm                         ELLMAN, IRA
contbr_city                             TEMPE
contbr_st                                  AZ
contbr_zip                          852816719
contbr_employer      ARIZONA STATE UNIVERSITY
contbr_occupation                   PROFESSOR
contb_receipt_amt                        50.0
contb_receipt_dt                    01-DEC-11
receipt_desc                              NaN
memo_cd                                   NaN
memo_text                                 NaN
form_tp                                 SA17A
file_num                               772372
Name: 123456, dtype: object
In [5]:
unique_cands = fec["cand_nm"].unique()
unique_cands
Out[5]:
array(['Bachmann, Michelle', 'Romney, Mitt', 'Obama, Barack',
       "Roemer, Charles E. 'Buddy' III", 'Pawlenty, Timothy',
       'Johnson, Gary Earl', 'Paul, Ron', 'Santorum, Rick',
       'Cain, Herman', 'Gingrich, Newt', 'McCotter, Thaddeus G',
       'Huntsman, Jon', 'Perry, Rick'], dtype=object)
In [6]:
unique_cands[2]
Out[6]:
'Obama, Barack'
In [7]:
parties = {"Bachmann, Michelle": "Republican",
"Cain, Herman": "Republican",
"Gingrich, Newt": "Republican",
"Huntsman, Jon": "Republican",
"Johnson, Gary Earl": "Republican",
"McCotter, Thaddeus G": "Republican",
"Obama, Barack": "Democrat",
"Paul, Ron": "Republican",
"Pawlenty, Timothy": "Republican",
"Perry, Rick": "Republican",
"Roemer, Charles E. 'Buddy' III": "Republican",
"Romney, Mitt": "Republican",
"Santorum, Rick": "Republican"}
In [8]:
fec["cand_nm"][123456:123461]
Out[8]:
123456    Obama, Barack
123457    Obama, Barack
123458    Obama, Barack
123459    Obama, Barack
123460    Obama, Barack
Name: cand_nm, dtype: object
In [9]:
fec["cand_nm"][123456:123461].map(parties)
Out[9]:
123456    Democrat
123457    Democrat
123458    Democrat
123459    Democrat
123460    Democrat
Name: cand_nm, dtype: object
In [10]:
fec["party"] = fec["cand_nm"].map(parties)
fec["party"].value_counts()
Out[10]:
party
Democrat      593746
Republican    407985
Name: count, dtype: int64
In [11]:
(fec["contb_receipt_amt"] > 0).value_counts()
Out[11]:
contb_receipt_amt
True     991475
False     10256
Name: count, dtype: int64
In [12]:
fec = fec[fec["contb_receipt_amt"] > 0]
In [13]:
fec_mrbo = fec[fec["cand_nm"].isin(["Obama, Barack", "Romney, Mitt"])]

Donation Statistics by Occupation and Employer¶

In [14]:
fec["contbr_occupation"].value_counts()[:10]
Out[14]:
contbr_occupation
RETIRED                                   233990
INFORMATION REQUESTED                      35107
ATTORNEY                                   34286
HOMEMAKER                                  29931
PHYSICIAN                                  23432
INFORMATION REQUESTED PER BEST EFFORTS     21138
ENGINEER                                   14334
TEACHER                                    13990
CONSULTANT                                 13273
PROFESSOR                                  12555
Name: count, dtype: int64
In [15]:
occ_mapping = {
    "INFORMATION REQUESTED PER BEST EFFORTS": "NOT PROVIDED",
    "INFORMATION REQUESTED": "NOT PROVIDED",
    "INFORMATION REQUESTED (BEST EFFORTS)": "NOT PROVIDED",
    "C.E.O.": "CEO",
}

def get_occ(x):
    # If no mapping provided, return x
    return occ_mapping.get(x, x)

fec["contbr_occupation"] = fec["contbr_occupation"].map(get_occ)
In [16]:
emp_mapping = {
    "INFORMATION REQUESTED PER BEST EFFORTS": "NOT PROVIDED",
    "INFORMATION REQUESTED": "NOT PROVIDED",
    "SELF": "SELF-EMPLOYED",
    "SELF EMPLOYED": "SELF-EMPLOYED",
}


def get_emp(x):
    # If no mapping provided, return x
    return emp_mapping.get(x, x)


fec["contbr_employer"] = fec["contbr_employer"].map(get_emp)
In [17]:
by_occupation = fec.pivot_table(
    "contb_receipt_amt", index="contbr_occupation", columns="party", aggfunc="sum"
)
over_2mm = by_occupation[by_occupation.sum(axis="columns") > 2000000]
over_2mm
Out[17]:
party Democrat Republican
contbr_occupation
ATTORNEY 11141982.97 7477194.43
CEO 2074974.79 4211040.52
CONSULTANT 2459912.71 2544725.45
ENGINEER 951525.55 1818373.70
EXECUTIVE 1355161.05 4138850.09
HOMEMAKER 4248875.80 13634275.78
INVESTOR 884133.00 2431768.92
LAWYER 3160478.87 391224.32
MANAGER 762883.22 1444532.37
NOT PROVIDED 4866973.96 20565473.01
OWNER 1001567.36 2408286.92
PHYSICIAN 3735124.94 3594320.24
PRESIDENT 1878509.95 4720923.76
PROFESSOR 2165071.08 296702.73
REAL ESTATE 528902.09 1625902.25
RETIRED 25305116.38 23561244.49
SELF-EMPLOYED 672393.40 1640252.54
In [18]:
over_2mm.plot(kind="barh")
Out[18]:
<Axes: ylabel='contbr_occupation'>
No description has been provided for this image
In [19]:
def get_top_amounts(group, key, n=5):
    totals = group.groupby(key)["contb_receipt_amt"].sum()
    return totals.nlargest(n)

grouped = fec_mrbo.groupby("cand_nm")
grouped.apply(get_top_amounts, "contbr_occupation", n=7)
C:\Users\purch\AppData\Local\Temp\ipykernel_6632\1928039006.py:6: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  grouped.apply(get_top_amounts, "contbr_occupation", n=7)
Out[19]:
cand_nm        contbr_occupation                     
Obama, Barack  RETIRED                                   25305116.38
               ATTORNEY                                  11141982.97
               INFORMATION REQUESTED                      4866973.96
               HOMEMAKER                                  4248875.80
               PHYSICIAN                                  3735124.94
               LAWYER                                     3160478.87
               CONSULTANT                                 2459912.71
Romney, Mitt   RETIRED                                   11508473.59
               INFORMATION REQUESTED PER BEST EFFORTS    11396894.84
               HOMEMAKER                                  8147446.22
               ATTORNEY                                   5364718.82
               PRESIDENT                                  2491244.89
               EXECUTIVE                                  2300947.03
               C.E.O.                                     1968386.11
Name: contb_receipt_amt, dtype: float64
In [20]:
grouped.apply(get_top_amounts, "contbr_employer", n=10)
C:\Users\purch\AppData\Local\Temp\ipykernel_6632\4213707751.py:1: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  grouped.apply(get_top_amounts, "contbr_employer", n=10)
Out[20]:
cand_nm        contbr_employer                       
Obama, Barack  RETIRED                                   22694358.85
               SELF-EMPLOYED                             17080985.96
               NOT EMPLOYED                               8586308.70
               INFORMATION REQUESTED                      5053480.37
               HOMEMAKER                                  2605408.54
               SELF                                       1076531.20
               SELF EMPLOYED                               469290.00
               STUDENT                                     318831.45
               VOLUNTEER                                   257104.00
               MICROSOFT                                   215585.36
Romney, Mitt   INFORMATION REQUESTED PER BEST EFFORTS    12059527.24
               RETIRED                                   11506225.71
               HOMEMAKER                                  8147196.22
               SELF-EMPLOYED                              7409860.98
               STUDENT                                     496490.94
               CREDIT SUISSE                               281150.00
               MORGAN STANLEY                              267266.00
               GOLDMAN SACH & CO.                          238250.00
               BARCLAYS CAPITAL                            162750.00
               H.I.G. CAPITAL                              139500.00
Name: contb_receipt_amt, dtype: float64

Bucketing Donation Amounts¶

In [21]:
bins = np.array([0, 1, 10, 100, 1000, 10000, 100_000, 1_000_000, 10_000_000])
In [22]:
labels = pd.cut(fec_mrbo["contb_receipt_amt"], bins)
labels
Out[22]:
411         (10, 100]
412       (100, 1000]
413       (100, 1000]
414         (10, 100]
415         (10, 100]
             ...     
701381      (10, 100]
701382    (100, 1000]
701383        (1, 10]
701384      (10, 100]
701385    (100, 1000]
Name: contb_receipt_amt, Length: 694282, dtype: category
Categories (8, interval[int64, right]): [(0, 1] < (1, 10] < (10, 100] < (100, 1000] < (1000, 10000] < (10000, 100000] < (100000, 1000000] < (1000000, 10000000]]
In [23]:
grouped = fec_mrbo.groupby(["cand_nm", labels])
grouped.size().unstack(level=0)
C:\Users\purch\AppData\Local\Temp\ipykernel_6632\3276222007.py:1: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped = fec_mrbo.groupby(["cand_nm", labels])
Out[23]:
cand_nm Obama, Barack Romney, Mitt
contb_receipt_amt
(0, 1] 493 77
(1, 10] 40070 3681
(10, 100] 372280 31853
(100, 1000] 153991 43357
(1000, 10000] 22284 26186
(10000, 100000] 2 1
(100000, 1000000] 3 0
(1000000, 10000000] 4 0
In [24]:
bucket_sums = grouped["contb_receipt_amt"].sum().unstack(level=0)
normed_sums = bucket_sums.div(bucket_sums.sum(axis="columns"), axis="index")
normed_sums
Out[24]:
cand_nm Obama, Barack Romney, Mitt
contb_receipt_amt
(0, 1] 0.805182 0.194818
(1, 10] 0.918767 0.081233
(10, 100] 0.910769 0.089231
(100, 1000] 0.710176 0.289824
(1000, 10000] 0.447326 0.552674
(10000, 100000] 0.823120 0.176880
(100000, 1000000] 1.000000 0.000000
(1000000, 10000000] 1.000000 0.000000
In [25]:
normed_sums[:-2].plot(kind="barh")
Out[25]:
<Axes: ylabel='contb_receipt_amt'>
No description has been provided for this image

Donation Statistics by State¶

In [26]:
grouped = fec_mrbo.groupby(["cand_nm", "contbr_st"])
totals = grouped["contb_receipt_amt"].sum().unstack(level=0).fillna(0)
totals = totals[totals.sum(axis="columns") > 100000]
totals.head()
Out[26]:
cand_nm Obama, Barack Romney, Mitt
contbr_st
AK 281840.15 86204.24
AL 543123.48 527303.51
AR 359247.28 105556.00
AZ 1506476.98 1888436.23
CA 23824984.24 11237636.60
In [27]:
percent  = totals.div(totals.sum(axis="columns"), axis="index")
percent.head()
Out[27]:
cand_nm Obama, Barack Romney, Mitt
contbr_st
AK 0.765778 0.234222
AL 0.507390 0.492610
AR 0.772902 0.227098
AZ 0.443745 0.556255
CA 0.679498 0.320502
In [ ]: