Senate Votes ETL and Database Population¶

By Nigel Story

Introduction¶

This is the ETL notebook that populates my locally hosted senate MySQL database, used for the machine learning notebook Analysis of U.S. Senate Polarization. Here, we scrape publicly available voting records for each U.S. senator, along with data about the proceedings and issues at vote.

Packages¶

import requests
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import re
import os
from io import StringIO
from tqdm import tqdm
from helpers.db_connection import DBConnect

Proceedings ETL¶

We can extract data for the proceedings from senate.gov using a BASH script and the wget command. This script can be found in "./bash_scripts/get_data.bash" within this repository.

#!mkdir /Users/nastory/repos/senate_votes/xml_data
#!cd /Users/nastory/repos/senate_votes/xml_data
#!bash ../bash_scripts/get_data.bash

After retrieving the data as xml, we will need to convert it to a pandas data frame. This will make it easier to import into the database.

xml_path = './senate_votes/xml_data/'
vote_roll_call_url = 'https://www.senate.gov/legislative/LIS/roll_call_lists/roll_call_vote_cfm.cfm?congress={}&session={}&vote={}'

data = []
files = os.listdir(xml_path)
files.sort()
xml_check = re.compile('\.xml')

def check_value(element_name, parent):
    # coalesce missing values with empty string
    check = parent.find(element_name)
    if check is None:
        return ''
    else:
        return check.text

for f in files:
    if xml_check.search(f):
        tree = ET.parse(xml_path + f)
        root = tree.getroot()
        
        congress = root[0].text
        session = root[1].text
        year = root[2].text
        
        for vote in root.iter('vote'):
            data.append([
                congress, 
                session, 
                year, 
                vote.find('vote_number').text,
                vote.find('vote_date').text,
                check_value('issue', vote),
                check_value('question', vote),
                vote.find('title').text,
                vote.find('vote_tally').find('yeas').text,
                vote.find('vote_tally').find('nays').text,
                check_value('result', vote)
            ])
        
    else:
        pass

column_names = ['congress', 'session', 'year', 'vote_number', 'vote_date', 'issue',
                'question', 'title', 'yeas', 'nays', 'result']
df = pd.DataFrame(data, columns=column_names)

num_cols = ['congress', 'session', 'year', 'yeas', 'nays']
df[num_cols] = df[num_cols].astype(float)
df[num_cols] = df[num_cols].fillna(0)

df['vote_date'] = df['vote_date'].astype(str) + '-' + df['year'].astype(str)
df['vote_date'] = pd.to_datetime(df['vote_date'], format='%d-%b-%Y.0')

df['question'] = df['question'].str.replace(r'\n\s*', '')

with DBConnect('senate', autocommit=True) as cnx:
    cnx.df2db(df, 'proceedings', method='insert', if_dup_key_replace=True)

Senators' Votes ETL¶

I attempted two methods of retrieving senators' voting records. The first was to scrape the data from senate.gov, but the volume of requests to the server that I was making was too high, so I was temporarily banned from the site. The alternative was a site called govtrack.us, which had the data in a more accessable format to scrape anyway, so it all worked out.

Method 1¶

senate.gov banned me, so the below won't work anymore...

# def html2votes(response_text, congress, session, vote_number):
#     soup = BeautifulSoup(response_text)
#     votes = soup.find_all('div', {'class': 'newspaperDisplay_3column'})[0]

#     vote_text = re.sub(r'<[\s\S]*>', '', votes.text)
#     vote_text = vote_text.replace(' (', ',').replace('), ', ',').replace('-', ',')
#     vote_text = 'senator,party,state,vote\n' + vote_text

#     out = pd.read_csv(StringIO(vote_text), delimiter=',')
#     out['congress'] = congress
#     out['session'] = session
#     out['vote_number'] = vote_number
    
#     cols = ['congress', 'session', 'vote_number', 'senator', 'party', 'state', 'vote']
#     out = out[cols]
    
#     return out

# failed = []
# df2 = pd.DataFrame()
# for row in params:
#     url = vote_roll_call_url.format(int(row[0]), int(row[1]), row[2])
#     res = requests.get(url)
    
#     if res.status_code != 200:
#         failed.append([url, res.status_code])
        
#     else:
#         df2 = pd.concat([df2, html2votes(res.text, int(row[0]), int(row[1]), row[2])], axis=0)

Method 2¶

gt_url = 'https://www.govtrack.us/congress/votes/{}-{}/s{}/export/csv'
params = df[['congress', 'session', 'vote_number', 'year']].to_numpy()

Download data

# df2 = pd.DataFrame()
# failed = []
# for row in params:
#     res = requests.get(gt_url.format(int(row[0]), int(row[3]), int(row[2])))
#     if res.status_code == 200:
#         with open('./csv_data/{}-{}-{}.txt'.format(int(row[0]), int(row[3]), int(row[2])), 'w') as f:
#             f.write(res.text)
#     else:
#         failed.append([gt_url.format(int(row[0]), int(row[3]), int(row[2])), res.status_code])

#len(failed)

Read files and import to db.

path = './csv_data/'
regex = re.compile(r'person,state,district,vote,name,party[\s\S]*')
failed = []
senator_set = set()

with DBConnect('senate', autocommit=True) as cnx:
    for file in os.listdir(path):
        try:
            c, y, v = file.split('-')
            with open(path + file, 'r') as f:
                txt = f.read()
                match = regex.search(txt)
                if match:
                    data_txt = match.group(0)
                    
            dummy = pd.read_csv(StringIO(data_txt))
            dummy['name'] = dummy['name'].str.replace(r'\s\[[\S\s]*\]', '')

            tups = [tuple(x) for x in dummy[['person', 'name', 'state', 'district', 'party']].to_numpy()]
            for t in tups:
                senator_set.add(t)

            dummy['congress'] = c
            dummy['year'] = y
            dummy['vote_number'] = v.replace('.txt', '').zfill(5)
            dummy = dummy[['congress', 'year', 'vote_number', 'person', 'vote']]
            dummy.rename(columns={'person': 'senator_id'}, inplace=True)

            cnx.df2db(dummy, 'votes', method='insert', if_dup_key_replace=True)
            
        except Exception as e:
            failed.append([file, str(e)])

len(failed)

0

sen_df = pd.DataFrame(senator_ls, columns=['senator_id', 'name', 'state', 'district', 'party'])

sen_df['name'] = sen_df['name'].str.strip()

with DBConnect('senate', autocommit=True) as cnx:
    cnx.df2db(sen_df, 'senators', method='insert', if_dup_key_replace=True)

Test¶

Now that the data is in the database, we'll perform a few test queries.

with DBConnect('senate') as cnx:
    proceedings = pd.read_sql(con=cnx.cnx, sql='select * from proceedings limit 100')
    votes = pd.read_sql(con=cnx.cnx, sql='select * from votes limit 100')
    senators = pd.read_sql(con=cnx.cnx, sql='select * from senators limit 100')

proceedings.head()

votes.head()

senators.head()

	congress	year	vote_number	session	vote_date	issue	question	title	yeas	result
0	101	1989	00001	1	1989-01-25	PN128	On the Nomination	Nomination - Baker	99	Confirmed
1	101	1989	00002	1	1989-01-25	PN133	On the Nomination	Nomination - Dole	99	Confirmed
2	101	1989	00003	1	1989-01-25	PN139	On the Nomination	Nomination - Darman	99	Confirmed
3	101	1989	00004	1	1989-01-31	PN140	On the Nomination	Nomination - Hills	100	Confirmed
4	101	1989	00005	1	1989-01-31	PN132	On the Nomination	Nomination - Mosbacher	100	Confirmed

	congress	year	vote_number	senator_id	vote
0	101	1989	00001	300005	Yea
1	101	1989	00001	300008	Yea
2	101	1989	00001	300009	Yea
3	101	1989	00001	300010	Yea
4	101	1989	00001	300012	Yea

	senator_id	name	state	district	party
0	300001	Sen. Daniel Akaka	HI	None	Democrat
1	300002	Sen. Lamar Alexander	TN	None	Republican
2	300003	Sen. Wayne Allard	CO	None	Republican
3	300004	Sen. George Allen	VA	None	Republican
4	300005	Sen. Max Baucus	MT	None	Democrat