#! python3
# This is a scraper for the 2016 Indiana governor's candidates.
from bs4 import BeautifulSoup
import requests
import csv
import time
# Global variables
contribs_base_url = "http://campaignfinance.in.gov/PublicSite/Filings/Schedules/ViewContributionSchedule.aspx?FilingID="
expenses_base_url = "http://campaignfinance.in.gov/PublicSite/Filings/Schedules/ViewExpenditureSchedule.aspx?FilingID="
contrib_headers = ["date", "type", "contributor", "address", "occupation", "amount", "aggregate", "election_comm", "explanation"]
expense_headers = ["date", "disbursement_type", "expenditure_type", "payee", "address", "occupation", "amount", "explanation"]
pence_filings = [58176, 58175, 54573, 54572, 50042, 50041, 45265, 45264, 45263, 45262, 45261, 44726, 44727]
gregg_filings = [58162, 58161, 54567, 54566, 50028, 50027, 51099, 45245, 45244, 45243, 45242, 45241, 44734]
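# Note: contrib_headers and expense_headers above are defined but never written
# to the output CSVs. If header rows are wanted, one optional approach (using the
# csv.writer objects created in the functions below) would be to call, before writerows():
#     writer.writerow(contrib_headers)   # or expense_headers for the expense files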

def main():
    """
    Our function names are a table of contents for what we want to do.
    """
    start_time = time.time()
    get_pence_expenses()
    get_pence_contribs()
    get_gregg_expenses()
    get_gregg_contribs()
    print("All files are scraped. It took %s seconds." % (time.time() - start_time))

def get_pence_expenses():
    # Loop through each of the filings for Pence.
    for filing in pence_filings:
        filing_page = expenses_base_url + str(filing)
        r = requests.get(filing_page)
        soup = BeautifulSoup(r.content, "html.parser")
        table = soup.find('table', attrs={'class': 'frmDataGrid'})
        # Collect all of the rows from the table.
        try:
            list_of_rows = []
            for row in table.findAll('tr'):
                list_of_cells = []
                for cell in row.findAll('td'):
                    text = cell.text.strip().replace('\n', ' ')
                    list_of_cells.append(text)
                list_of_rows.append(list_of_cells)
            # Write the results to a csv file.
            with open("pence/expense-" + str(filing) + ".csv", "w", newline='') as outfile:
                writer = csv.writer(outfile)
                writer.writerows(list_of_rows)
        # Exception in case there aren't any entries for that filing. Be sure to manually check when bulletproofing.
        except:
            print("There might not be anything here for file " + str(filing) + ".")
            continue
        print("Finished file number %s." % filing)
        time.sleep(3)
    print("Pence's expenditures have been scraped.")

def get_pence_contribs():
    # Loop through each of the filings for Pence.
    for filing in pence_filings:
        filing_page = contribs_base_url + str(filing)
        r = requests.get(filing_page)
        soup = BeautifulSoup(r.content, "html.parser")
        table = soup.find('table', attrs={'class': 'frmDataGrid'})
        # Collect all of the rows from the table.
        try:
            list_of_rows = []
            for row in table.findAll('tr'):
                list_of_cells = []
                for cell in row.findAll('td'):
                    text = cell.text.strip().replace('\n', ' ')
                    list_of_cells.append(text)
                list_of_rows.append(list_of_cells)
            # Write the results to a csv file.
            with open("pence/contrib-" + str(filing) + ".csv", "w", newline='') as outfile:
                writer = csv.writer(outfile)
                writer.writerows(list_of_rows)
        except:
            print("There might not be anything here for file " + str(filing) + ".")
            continue
        print("Finished file number %s." % filing)
        time.sleep(3)
    print("Pence's contributions have been scraped.")

def get_gregg_expenses():
    # Loop through each of the filings for Gregg.
    for filing in gregg_filings:
        filing_page = expenses_base_url + str(filing)
        r = requests.get(filing_page)
        soup = BeautifulSoup(r.content, "html.parser")
        table = soup.find('table', attrs={'class': 'frmDataGrid'})
        # Collect all of the rows from the table.
        try:
            list_of_rows = []
            for row in table.findAll('tr'):
                list_of_cells = []
                for cell in row.findAll('td'):
                    text = cell.text.strip().replace('\n', ' ')
                    list_of_cells.append(text)
                list_of_rows.append(list_of_cells)
            # Write the results to a csv file.
            with open("gregg/expense-" + str(filing) + ".csv", "w", newline='') as outfile:
                writer = csv.writer(outfile)
                writer.writerows(list_of_rows)
        except:
            print("There might not be anything here for file " + str(filing) + ".")
            continue
        print("Finished file number %s." % filing)
        time.sleep(3)
    print("Gregg's expenditures have been scraped.")

def get_gregg_contribs():
    # Loop through each of the filings for Gregg.
    for filing in gregg_filings:
        filing_page = contribs_base_url + str(filing)
        r = requests.get(filing_page)
        soup = BeautifulSoup(r.content, "html.parser")
        table = soup.find('table', attrs={'class': 'frmDataGrid'})
        try:
            # Collect all of the rows from the table.
            list_of_rows = []
            for row in table.findAll('tr'):
                list_of_cells = []
                for cell in row.findAll('td'):
                    text = cell.text.strip().replace('\n', ' ')
                    list_of_cells.append(text)
                list_of_rows.append(list_of_cells)
            # Write the results to a csv file.
            with open("gregg/contrib-" + str(filing) + ".csv", "w", newline='') as outfile:
                writer = csv.writer(outfile)
                writer.writerows(list_of_rows)
            print("Finished file number %s." % filing)
        except:
            print("There might not be anything here for file " + str(filing) + ".")
            continue
        time.sleep(3)
    print("Gregg's contributions have been scraped.")

if __name__ == "__main__":
    # This block runs when you execute "filingScraper.py" from the command line.
    print("Campaign finance snooping commencing...")
    main()