This is the ETL notebook that populates my locally hosted senate MySQL database, which feeds the machine learning notebook Analysis of U.S. Senate Polarization. Here, we scrape publicly available voting records for each U.S. senator, along with data about the proceedings and the issues being voted on.
import requests
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import re
import os
from io import StringIO
from tqdm import tqdm
from helpers.db_connection import DBConnect
We can extract data for the proceedings from senate.gov using a Bash script and the wget command. The script can be found at "./bash_scripts/get_data.bash" in this repository.
#!mkdir /Users/nastory/repos/senate_votes/xml_data
#!cd /Users/nastory/repos/senate_votes/xml_data
#!bash ../bash_scripts/get_data.bash
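The script itself isn't reproduced in this notebook; roughly speaking, it just pulls one vote-menu XML file per congress and session from senate.gov into the directory read below. The Python sketch here illustrates the same idea; the vote_menu_{congress}_{session}.xml URL pattern and the range of congresses are assumptions for illustration, not taken from get_data.bash.
# Rough Python equivalent of the download step (illustrative only).
# The vote-menu URL pattern and the congress range below are assumed.
menu_url = 'https://www.senate.gov/legislative/LIS/roll_call_lists/vote_menu_{}_{}.xml'
os.makedirs('./senate_votes/xml_data', exist_ok=True)
for congress_num in range(101, 118):  # assumed range of congresses
    for session_num in (1, 2):
        res = requests.get(menu_url.format(congress_num, session_num), timeout=30)
        if res.status_code == 200:
            out_file = './senate_votes/xml_data/vote_menu_{}_{}.xml'.format(congress_num, session_num)
            with open(out_file, 'w') as f:
                f.write(res.text)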
After retrieving the data as XML, we will need to convert it to a pandas DataFrame. This will make it easier to import into the database.
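For reference, the parsing loop below assumes each vote-menu file is laid out roughly like the illustrative snippet here: the first three children of the root hold the congress, session, and year, and each vote element carries the vote number, date, tally, and related metadata. The wrapper element names and values are made up for illustration; the field names are the ones the parser actually reads.
# Illustrative (made-up) vote-menu layout assumed by the parser below.
sample_xml = '''<vote_summary>
  <congress>116</congress>
  <session>1</session>
  <congress_year>2019</congress_year>
  <votes>
    <vote>
      <vote_number>00001</vote_number>
      <vote_date>03-Jan</vote_date>
      <issue>PN123</issue>
      <question>On the Nomination</question>
      <title>Example nomination title</title>
      <vote_tally><yeas>53</yeas><nays>47</nays></vote_tally>
      <result>Confirmed</result>
    </vote>
  </votes>
</vote_summary>'''
sample_root = ET.fromstring(sample_xml)
first_vote = next(sample_root.iter('vote'))
print(sample_root[0].text, first_vote.find('vote_number').text, first_vote.find('vote_tally').find('yeas').text)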
xml_path = './senate_votes/xml_data/'
vote_roll_call_url = 'https://www.senate.gov/legislative/LIS/roll_call_lists/roll_call_vote_cfm.cfm?congress={}&session={}&vote={}'
data = []
files = os.listdir(xml_path)
files.sort()
xml_check = re.compile(r'\.xml')
def check_value(element_name, parent):
    # coalesce missing values with empty string
    check = parent.find(element_name)
    if check is None:
        return ''
    else:
        return check.text
for f in files:
    if xml_check.search(f):
        tree = ET.parse(xml_path + f)
        root = tree.getroot()
        # congress, session, and year are the first three children of the root
        congress = root[0].text
        session = root[1].text
        year = root[2].text
        # one row per roll-call vote in the file
        for vote in root.iter('vote'):
            data.append([
                congress,
                session,
                year,
                vote.find('vote_number').text,
                vote.find('vote_date').text,
                check_value('issue', vote),
                check_value('question', vote),
                vote.find('title').text,
                vote.find('vote_tally').find('yeas').text,
                vote.find('vote_tally').find('nays').text,
                check_value('result', vote)
            ])
column_names = ['congress', 'session', 'year', 'vote_number', 'vote_date', 'issue',
'question', 'title', 'yeas', 'nays', 'result']
df = pd.DataFrame(data, columns=column_names)
num_cols = ['congress', 'session', 'year', 'yeas', 'nays']
df[num_cols] = df[num_cols].astype(float)
df[num_cols] = df[num_cols].fillna(0)
df['vote_date'] = df['vote_date'].astype(str) + '-' + df['year'].astype(str)
# year was cast to float above, so the string ends in '.0'; the format accounts for that
df['vote_date'] = pd.to_datetime(df['vote_date'], format='%d-%b-%Y.0')
df['question'] = df['question'].str.replace(r'\n\s*', '', regex=True)
with DBConnect('senate', autocommit=True) as cnx:
    cnx.df2db(df, 'proceedings', method='insert', if_dup_key_replace=True)
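DBConnect and df2db come from a small helper module (helpers/db_connection.py) that isn't shown in this notebook. As a rough idea of what df2db(..., method='insert', if_dup_key_replace=True) is doing, here is a minimal sketch using mysql-connector-python and INSERT ... ON DUPLICATE KEY UPDATE; the actual helper may differ.
# Rough sketch of a df2db-style helper: batched INSERT ... ON DUPLICATE KEY UPDATE.
# This is an assumption about the helper's behavior, not its actual implementation.
import mysql.connector

def df2db_sketch(frame, table, cnx):
    cols = list(frame.columns)
    placeholders = ', '.join(['%s'] * len(cols))
    updates = ', '.join('{0} = VALUES({0})'.format(c) for c in cols)
    sql = 'INSERT INTO {} ({}) VALUES ({}) ON DUPLICATE KEY UPDATE {}'.format(
        table, ', '.join(cols), placeholders, updates)
    # convert numpy scalars to native Python types so the connector can bind them
    rows = [tuple(v.item() if isinstance(v, np.generic) else v for v in r)
            for r in frame.to_numpy()]
    cur = cnx.cursor()
    cur.executemany(sql, rows)
    cnx.commit()
    cur.close()

# hypothetical usage:
# raw_cnx = mysql.connector.connect(user='...', password='...', database='senate')
# df2db_sketch(df, 'proceedings', raw_cnx)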
I attempted two methods of retrieving senators' voting records. The first was to scrape the data from senate.gov, but I was making too many requests to the server and got temporarily banned from the site. The alternative was a site called govtrack.us, which had the data in a more accessible format to scrape anyway, so it all worked out.
senate.gov banned me, so the code below won't work anymore...
# def html2votes(response_text, congress, session, vote_number):
# soup = BeautifulSoup(response_text)
# votes = soup.find_all('div', {'class': 'newspaperDisplay_3column'})[0]
# vote_text = re.sub(r'<[\s\S]*>', '', votes.text)
# vote_text = vote_text.replace(' (', ',').replace('), ', ',').replace('-', ',')
# vote_text = 'senator,party,state,vote\n' + vote_text
# out = pd.read_csv(StringIO(vote_text), delimiter=',')
# out['congress'] = congress
# out['session'] = session
# out['vote_number'] = vote_number
# cols = ['congress', 'session', 'vote_number', 'senator', 'party', 'state', 'vote']
# out = out[cols]
# return out
# failed = []
# df2 = pd.DataFrame()
# for row in params:
# url = vote_roll_call_url.format(int(row[0]), int(row[1]), row[2])
# res = requests.get(url)
# if res.status_code != 200:
# failed.append([url, res.status_code])
# else:
# df2 = pd.concat([df2, html2votes(res.text, int(row[0]), int(row[1]), row[2])], axis=0)
gt_url = 'https://www.govtrack.us/congress/votes/{}-{}/s{}/export/csv'
params = df[['congress', 'session', 'vote_number', 'year']].to_numpy()
Download data
# df2 = pd.DataFrame()
# failed = []
# for row in params:
# res = requests.get(gt_url.format(int(row[0]), int(row[3]), int(row[2])))
# if res.status_code == 200:
# with open('./csv_data/{}-{}-{}.txt'.format(int(row[0]), int(row[3]), int(row[2])), 'w') as f:
# f.write(res.text)
# else:
# failed.append([gt_url.format(int(row[0]), int(row[3]), int(row[2])), res.status_code])
#len(failed)
Read files and import to db.
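Each govtrack export isn't pure CSV: there are a few descriptive lines above the actual header row, which is why the regex below searches for the person,state,district,vote,name,party header and keeps everything from there onward. A tiny illustrative example (the preamble lines and the data row are made up):
# Illustrative example of stripping the (made-up) preamble above the CSV header.
example_txt = ('116th Congress, Session 1\n'
               'Vote 1: On the Nomination\n'
               'person,state,district,vote,name,party\n'
               '412345,XX,,Yea,Sen. Jane Doe [D],Democrat\n')
example_match = re.compile(r'person,state,district,vote,name,party[\s\S]*').search(example_txt)
print(pd.read_csv(StringIO(example_match.group(0))))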
path = './csv_data/'
regex = re.compile(r'person,state,district,vote,name,party[\s\S]*')
failed = []
senator_set = set()
with DBConnect('senate', autocommit=True) as cnx:
    for file in os.listdir(path):
        try:
            # file names look like {congress}-{year}-{vote_number}.txt
            c, y, v = file.split('-')
            with open(path + file, 'r') as f:
                txt = f.read()
            match = regex.search(txt)
            if match:
                data_txt = match.group(0)
                dummy = pd.read_csv(StringIO(data_txt))
                # drop the bracketed suffix from the senator's name
                dummy['name'] = dummy['name'].str.replace(r'\s\[[\S\s]*\]', '', regex=True)
                # collect the unique senators seen across all files for the senators table
                tups = [tuple(x) for x in dummy[['person', 'name', 'state', 'district', 'party']].to_numpy()]
                for t in tups:
                    senator_set.add(t)
                dummy['congress'] = c
                dummy['year'] = y
                dummy['vote_number'] = v.replace('.txt', '').zfill(5)
                dummy = dummy[['congress', 'year', 'vote_number', 'person', 'vote']]
                dummy.rename(columns={'person': 'senator_id'}, inplace=True)
                cnx.df2db(dummy, 'votes', method='insert', if_dup_key_replace=True)
        except Exception as e:
            failed.append([file, str(e)])
len(failed)
# build the senators table from the deduplicated (id, name, state, district, party) tuples
sen_df = pd.DataFrame(list(senator_set), columns=['senator_id', 'name', 'state', 'district', 'party'])
sen_df['name'] = sen_df['name'].str.strip()
with DBConnect('senate', autocommit=True) as cnx:
    cnx.df2db(sen_df, 'senators', method='insert', if_dup_key_replace=True)
Now that the data is in the database, we'll perform a few test queries.
with DBConnect('senate') as cnx:
    proceedings = pd.read_sql(con=cnx.cnx, sql='select * from proceedings limit 100')
    votes = pd.read_sql(con=cnx.cnx, sql='select * from votes limit 100')
    senators = pd.read_sql(con=cnx.cnx, sql='select * from senators limit 100')
proceedings.head()
votes.head()
senators.head()
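As one more sanity check, the three tables should join cleanly on their shared keys. A sketch, assuming the column names written above are the column names in the database:
# Join votes to senators and proceedings (column names assumed from the inserts above).
join_sql = '''
select p.vote_date, p.question, s.name, s.party, v.vote
from votes v
join senators s on s.senator_id = v.senator_id
join proceedings p on p.congress = v.congress
    and p.year = v.year
    and p.vote_number = v.vote_number
limit 10
'''
with DBConnect('senate') as cnx:
    print(pd.read_sql(con=cnx.cnx, sql=join_sql))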