# sorn_scraper.py
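"""Scraper for GSA System of Records Notices (SORNs).

Collects the Federal Register links on the GSA privacy program page,
downloads each SORN's full-text XML, extracts selected sections (system
name and authority by default), and writes the results to gsa_sorns.csv.
"""
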
import csv
import time

import requests
from bs4 import BeautifulSoup

GSA_SORNS_URL = "https://www.gsa.gov/reference/gsa-privacy-program/systems-of-records-privacy-act/system-of-records-notices-sorns-privacy-act"

class Agency:
    def __init__(self):
        '''
        Agency holds the url to the list of SORNs and the gathered SORNs.
        '''
        self.url = GSA_SORNS_URL
        self.sorns = []

    def get_sorn_urls(self):
        result = requests.get(self.url)
        soup = BeautifulSoup(result.text, 'html.parser')
        for link in soup.find_all('a'):  # find all anchor tags
            # link.get('href') is None for anchors without an href,
            # so default to an empty string before the substring test
            if "https://www.federalregister.gov" in link.get('href', ''):
                sorn = Sorn(link.get('href'))
                self.sorns.append(sorn)

    def get_all_data(self):
        self.get_sorn_urls()
        for sorn in self.sorns:
            time.sleep(0.1)  # small delay between requests
            sorn.get_all_data()

    def write_all_to_csv(self):
        with open("gsa_sorns.csv", "w", newline="") as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['System Name', 'URL', 'Authority'])
            for sorn in self.sorns:
                writer.writerow([sorn.system_name, sorn.html_url, sorn.authority])


class Sorn:
    def __init__(self, html_url):
        self.html_url = html_url
        self.xml_url = self.build_xml_url()
        self.full_xml = None
        self.system_name = None
        self.pii = None
        self.purpose = None
        self.retention = None
        self.routine_uses = None
        self.doc_title = None
        self.authority = None

    def build_xml_url(self):
        '''
        Transform the HTML url into the matching full-text XML url.
        Example html and xml urls:
        https://www.federalregister.gov/documents/2015/06/04/2015-13701/privacy-act-of-1974-notice-of-an-updated-system-of-records
        https://www.federalregister.gov/documents/full_text/xml/2015/06/04/2015-13701.xml
        '''
        split_url = self.html_url.split("/")
        first_half = "/".join(split_url[0:4])   # https://www.federalregister.gov/documents
        second_half = "/".join(split_url[4:8])  # e.g. 2015/06/04/2015-13701
        return first_half + "/full_text/xml/" + second_half + ".xml"

    def get_full_xml(self):
        response = requests.get(self.xml_url)
        response.encoding = 'utf-8'  # force utf-8 so special characters decode cleanly
        self.full_xml = response.text
        if response.status_code != 200:
            print("XML url not working: " + self.xml_url)

    def get_first_child(self, section, sorn_attribute):
        '''
        Grab the first child element of the section provided.
        TODO: delete all the '\n' newline elements from contents.
        '''
        soup = BeautifulSoup(self.full_xml, 'xml')
        try:
            section = soup.find(section)
            # contents[0] is usually a bare newline; the first real child is at index 1
            first_child = section.contents[1].get_text()
            setattr(self, sorn_attribute, first_child)
        except (AttributeError, IndexError):
            print("%s not found for %s" % (sorn_attribute, self.xml_url))

    def get_sorn_text_after_a_given_heading(self, heading, sorn_attribute):
        '''
        heading is the text we search for in the XML.
        We grab all the text after that heading until we hit the next heading.
        sorn_attribute is the attribute on which we save the scraped text.
        '''
        soup = BeautifulSoup(self.full_xml, 'xml')
        html = u""
        try:
            # Many SORNs have a trailing space after their heading.
            # We want to match with or without the space, so we pass both.
            # We could also use a regex here.
            headings = [heading, heading + " "]
            for tag in soup.find('HD', text=headings).next_siblings:
                if tag.name == "HD":  # stop at the next heading
                    break
                else:
                    html += str(tag)
            # put the text back into BeautifulSoup to strip out the xml tags
            new_soup = BeautifulSoup(html, 'html.parser')
            tagless_content = new_soup.get_text().strip()
            setattr(self, sorn_attribute, tagless_content)
        except AttributeError:
            print("%s not found for %s" % (sorn_attribute, self.xml_url))

    def get_all_data(self):
        self.get_full_xml()
        self.get_system_name()
        # self.get_pii()
        # self.get_purpose()
        # self.get_retention()
        # self.get_routine_uses()
        # self.get_doc_title()
        self.get_authority()
        # self.write_to_csv()

    def get_system_name(self):
        self.get_sorn_text_after_a_given_heading("SYSTEM NAME:", "system_name")

    def get_pii(self):
        self.get_sorn_text_after_a_given_heading("CATEGORIES OF RECORDS IN THE SYSTEM:", "pii")

    def get_purpose(self):
        self.get_sorn_text_after_a_given_heading("PURPOSE:", "purpose")

    def get_retention(self):
        self.get_sorn_text_after_a_given_heading("RETENTION AND DISPOSAL:", "retention")

    def get_routine_uses(self):
        header = "ROUTINE USES OF RECORDS MAINTAINED IN THE SYSTEM INCLUDING CATEGORIES OF USERS AND THE PURPOSES OF SUCH USES:"
        self.get_sorn_text_after_a_given_heading(header, "routine_uses")

    def get_authority(self):
        header = "AUTHORITIES FOR MAINTENANCE OF THE SYSTEM:"
        self.get_sorn_text_after_a_given_heading(header, "authority")
        # some SORNs use the singular form of the heading, so fall back to it
        if not self.authority:
            header = "AUTHORITY FOR MAINTENANCE OF THE SYSTEM:"
            self.get_sorn_text_after_a_given_heading(header, "authority")

    def get_doc_title(self):
        self.get_first_child("PRIACT", "doc_title")

    def write_to_csv(self):
        # currently unused (see get_all_data); appends one row per SORN
        with open("gsa_sorns.csv", 'a', newline="") as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow([self.system_name, self.html_url, self.pii,
                             self.purpose, self.retention, self.routine_uses])


if __name__ == '__main__':
    agency = Agency()
    agency.get_all_data()
    agency.write_all_to_csv()

    # sorn_sample = Sorn('https://www.federalregister.gov/documents/2011/07/25/2011-18637/privacy-act-of-1974-notice-of-updated-systems-of-records')
    # sorn_sample.get_all_data()
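    # To check a single SORN without writing the CSV, uncomment the sample
    # above and print the attributes that get_all_data() populates, e.g.:
    # print(sorn_sample.system_name)
    # print(sorn_sample.authority)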