Skip to content

Commit

Permalink
Update bills scrape for new AL site structure, Bills
Browse files Browse the repository at this point in the history
  • Loading branch information
showerst committed Aug 9, 2016
1 parent f886e39 commit d862c15
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 28 deletions.
19 changes: 15 additions & 4 deletions openstates/al/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
'name': '2015-2018',
'start_year': 2015,
'end_year': 2018,
'sessions': ['2015os','2015rs', '2015fs', '2015ss', '2016rs'],
'sessions': ['2015os','2015rs', '2015fs', '2015ss', '2016rs','2016ss','2017rs'],
}
],
'session_details': {
Expand Down Expand Up @@ -88,6 +88,18 @@
'internal_id': '1065',
'_scraped_name': 'Regular Session 2016',
},
'2016ss': {
'type': 'special',
'display_name': 'First Special Session 2016',
'internal_id': '1068',
'_scraped_name': 'First Special Session 2016',
},
'2017rs': {
'type': 'primary',
'display_name': '2017 Regular Session',
'internal_id': '1069',
'_scraped_name': 'Regular Session 2017',
},
},
'feature_flags': ['subjects', 'influenceexplorer'],
'_ignored_scraped_sessions': [
Expand Down Expand Up @@ -132,10 +144,9 @@ def session_list():
import requests

s = requests.Session()
r = s.get('http://alisondb.legislature.state.al.us/alison/alisonlogin.aspx')
r = s.get('http://alisondb.legislature.state.al.us/alison/SelectSession.aspx')
doc = lxml.html.fromstring(r.text)
options = doc.xpath('//option/text()')

options = doc.xpath('//*[@id="ContentPlaceHolder1_gvSessions"]/tr/td/font/a/font/text()')
return options


Expand Down
78 changes: 54 additions & 24 deletions openstates/al/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,16 +74,29 @@ def _set_session(self, session):
''' Activate an ASP.NET session, and set the legislative session '''

SESSION_SET_URL = ('http://alisondb.legislature.state.al.us/'
'Alison/ALISONLogin.aspx')
'Alison/SelectSession.aspx')

doc = lxml.html.fromstring(self.get(url=SESSION_SET_URL).text)
(current_session, ) = doc.xpath('//option[@selected]/text()')
(viewstate, ) = doc.xpath('//input[@id="__VIEWSTATE"]/@value')
(viewstategenerator, ) = doc.xpath(
'//input[@id="__VIEWSTATEGENERATOR"]/@value')

# Find the link on the session list page whose text matches the session metadata _scraped_name.
# The __EVENTARGUMENT form value we need to set the session is the second argument
# to the __doPostBack JS call contained in that link's href.
(target_session, ) = doc.xpath('//table[@id="ContentPlaceHolder1_gvSessions"]//tr//a/font'
'[text()="{}"]/parent::a/@href'.format(self.session_name))
target_session = target_session.replace("javascript:__doPostBack('ctl00$ContentPlaceHolder1$gvSessions','",'')
target_session = target_session.replace("')",'')

form = {
'__EVENTTARGET': 'ctl00$cboSession',
'ctl00$cboSession': self.session_name
'__EVENTTARGET': 'ctl00$ContentPlaceHolder1$gvSessions',
'__EVENTARGUMENT': target_session,
'__VIEWSTATE': viewstate,
'__VIEWSTATEGENERATOR': viewstategenerator,

}
self.post(url=SESSION_SET_URL, data=form, allow_redirects=False)
self.post(url=SESSION_SET_URL, data=form, allow_redirects=True)

def _get_bill_list(self, url):
'''
Expand All @@ -93,23 +106,37 @@ def _get_bill_list(self, url):

for _retry in range(self.retry_attempts):
html = self.get(url=url).text

#print html

doc = lxml.html.fromstring(html)
bills = doc.xpath('//table[@class="box_billstatusresults"]/tr')[1:]

bills = doc.xpath('//table[@id="ContentPlaceHolder1_gvBills"]/tr')[1:]

resolutions = doc.xpath(
'//table[@class="box_resostatusgrid "]/tr')[1:]
'//table[@id="ContentPlaceHolder1_gvResolutions"]/tr')[1:]

print doc.xpath(
'//span[@id="ContentPlaceHolder1_lblCount"]/font/text()'
)

if bills and resolutions:
raise AssertionError("Found multiple bill types")
elif bills or resolutions:
return bills or resolutions
elif doc.xpath(
'//span[@class="ctl00_MainDefaultContent_lblCount"]/text()'
'//span[@id="ContentPlaceHolder1_lblCount"]/font/text()'
) == ["0 Instruments", ]:
self.warning("Missing either bills or resolutions")
return []
else:
print "Attempt"
print doc.xpath(
'//span[@id="ContentPlaceHolder1_lblCount"]/text()'
)
continue
else:
#print html
raise AssertionError("Bill list not found")

def _get_bill_response(self, url):
Expand All @@ -118,7 +145,7 @@ def _get_bill_response(self, url):
try:
html = self.get(url=url, allow_redirects=False).text
if lxml.html.fromstring(html).xpath(
'//div[@class="shorttitle"]'):
'//span[@id="ContentPlaceHolder1_lblShotTitle"]'):
return html
# If a bill page doesn't exist yet, ignore redirects and timeouts
except scrapelib.HTTPError:
Expand Down Expand Up @@ -148,51 +175,54 @@ def scrape(self, session, chambers):
(viewstategenerator, ) = doc.xpath(
'//input[@id="__VIEWSTATEGENERATOR"]/@value')
form = {
'__EVENTTARGET': 'ctl00$MainDefaultContent$gvStatus$ctl02$ctl00',
'__EVENTTARGET': 'ctl00$ContentPlaceHolder1$gvStatus$ctl02$ctl00',
'__EVENTARGUMENT': 'Select$0',
'__VIEWSTATE': viewstate,
'__VIEWSTATEGENERATOR': viewstategenerator,
'ctl00$cboSession': self.session_name,
'ctl00$ScriptManager1': 'ctl00$UpdatePanel1|ctl00$'
'MainDefaultContent$gvStatus$ctl02$ctl00'
}
self.post(url=BILL_TYPE_URL, data=form, allow_redirects=False)
self.post(url=BILL_TYPE_URL, data=form, allow_redirects=True)

self.scrape_bill_list(BILL_LIST_URL)

#self._set_session(session)

# Acquire and process a list of all resolutions
RESOLUTION_TYPE_URL = (
'http://alisondb.legislature.state.al.us/Alison/'
'SESSResBySelectedStatus.aspx')
'SESSResosBySelectedStatus.aspx')
RESOLUTION_LIST_URL = (
'http://alisondb.legislature.state.al.us/Alison/'
'SESSResList.aspx?STATUSCODES=Had%20First%20Reading'
'SESSResosList.aspx?STATUSCODES=Had%20First%20Reading'
'%20House%20of%20Origin&BODY=999999')

doc = lxml.html.fromstring(self.get(url=RESOLUTION_TYPE_URL).text)
doc = lxml.html.fromstring(self.get(url=BILL_TYPE_URL).text)
(viewstate, ) = doc.xpath('//input[@id="__VIEWSTATE"]/@value')
(viewstategenerator, ) = doc.xpath(
'//input[@id="__VIEWSTATEGENERATOR"]/@value')

form = {
'__EVENTTARGET': 'ctl00$MainDefaultContent$gvStatus$ctl02$ctl00',
'__EVENTTARGET': 'ctl00$ContentPlaceHolder1$gvStatus$ctl02$ctl00',
'__EVENTARGUMENT': 'Select$0',
'__VIEWSTATE': viewstate,
'__VIEWSTATEGENERATOR': viewstategenerator,
'ctl00$cboSession': self.session_name,
'ctl00$ScriptManager1': 'ctl00$UpdatePanel1|ctl00$'
'MainDefaultContent$gvStatus$ctl02$ctl00'
}
self.post(url=RESOLUTION_TYPE_URL, data=form, allow_redirects=False)

deb = self.post(url=RESOLUTION_TYPE_URL, data=form, allow_redirects=True)
self.scrape_bill_list(RESOLUTION_LIST_URL)

def scrape_bill_list(self, url):
bill_list = self._get_bill_list(url)

for bill_info in bill_list:

(bill_id, ) = bill_info.xpath('td[1]/font/input/@value')
(sponsor, ) = bill_info.xpath('td[2]/font/input/@value')
subject = bill_info.xpath('td[3]/font/text()')[0].strip()
(subject, ) = bill_info.xpath('td[3]//text()')
subject = subject.strip()
chamber = self.CHAMBERS[bill_id[0]]

if 'B' in bill_id:
Expand All @@ -219,7 +249,7 @@ def scrape_bill_list(self, url):
bill.add_source(url)

bill_url = ('http://alisondb.legislature.state.al.us/Alison/'
'SESSBillResult.aspx?BILL={}'.format(bill_id))
'SESSBillStatusResult.aspx?BILL={}'.format(bill_id))
bill.add_source(bill_url)

bill_html = self._get_bill_response(bill_url)
Expand All @@ -230,7 +260,7 @@ def scrape_bill_list(self, url):
bill_doc = lxml.html.fromstring(bill_html)

title = bill_doc.xpath(
'//div[@class="shorttitle"]')[0].text_content().strip()
'//span[@id="ContentPlaceHolder1_lblShotTitle"]//text()')[0].strip()
if not title:
title = "[No title given by state]"
bill['title'] = title
Expand Down Expand Up @@ -310,7 +340,7 @@ def scrape_bill_list(self, url):
action_text=bir_text
)

actions = bill_doc.xpath('//table[@class="box_history"]/tr')[1:]
actions = bill_doc.xpath('//table[@id="ContentPlaceHolder1_gvHistory"]/tr')[1:]
action_date = None
for action in actions:
# If actions occur on the same day, only one date will exist
Expand Down

0 comments on commit d862c15

Please sign in to comment.