diff --git a/openstates/al/__init__.py b/openstates/al/__init__.py index 84976399d6..57a3681421 100644 --- a/openstates/al/__init__.py +++ b/openstates/al/__init__.py @@ -24,7 +24,7 @@ 'name': '2015-2018', 'start_year': 2015, 'end_year': 2018, - 'sessions': ['2015os','2015rs', '2015fs', '2015ss', '2016rs'], + 'sessions': ['2015os','2015rs', '2015fs', '2015ss', '2016rs','2016ss','2017rs'], } ], 'session_details': { @@ -88,6 +88,18 @@ 'internal_id': '1065', '_scraped_name': 'Regular Session 2016', }, + '2016ss': { + 'type': 'special', + 'display_name': 'First Special Session 2016', + 'internal_id': '1068', + '_scraped_name': 'First Special Session 2016', + }, + '2017rs': { + 'type': 'primary', + 'display_name': '2017 Regular Session', + 'internal_id': '1069', + '_scraped_name': 'Regular Session 2017', + }, }, 'feature_flags': ['subjects', 'influenceexplorer'], '_ignored_scraped_sessions': [ @@ -132,10 +144,9 @@ def session_list(): import requests s = requests.Session() - r = s.get('http://alisondb.legislature.state.al.us/alison/alisonlogin.aspx') + r = s.get('http://alisondb.legislature.state.al.us/alison/SelectSession.aspx') doc = lxml.html.fromstring(r.text) - options = doc.xpath('//option/text()') - + options = doc.xpath('//*[@id="ContentPlaceHolder1_gvSessions"]/tr/td/font/a/font/text()') return options diff --git a/openstates/al/bills.py b/openstates/al/bills.py index 9217c164eb..3ddde46292 100644 --- a/openstates/al/bills.py +++ b/openstates/al/bills.py @@ -74,16 +74,29 @@ def _set_session(self, session): ''' Activate an ASP.NET session, and set the legislative session ''' SESSION_SET_URL = ('http://alisondb.legislature.state.al.us/' - 'Alison/ALISONLogin.aspx') + 'Alison/SelectSession.aspx') doc = lxml.html.fromstring(self.get(url=SESSION_SET_URL).text) - (current_session, ) = doc.xpath('//option[@selected]/text()') + (viewstate, ) = doc.xpath('//input[@id="__VIEWSTATE"]/@value') + (viewstategenerator, ) = doc.xpath( + 
'//input[@id="__VIEWSTATEGENERATOR"]/@value') + + # Find the link whose text matches the session metadata _scraped_name on the session list page + # The __EVENTARGUMENT form value we need to set the session is the second argument + # to the __doPostBack JS function, which is embedded in the href of that link + (target_session, ) = doc.xpath('//table[@id="ContentPlaceHolder1_gvSessions"]//tr//a/font' + '[text()="{}"]/parent::a/@href'.format(self.session_name)) + target_session = target_session.replace("javascript:__doPostBack('ctl00$ContentPlaceHolder1$gvSessions','",'') + target_session = target_session.replace("')",'') form = { - '__EVENTTARGET': 'ctl00$cboSession', - 'ctl00$cboSession': self.session_name + '__EVENTTARGET': 'ctl00$ContentPlaceHolder1$gvSessions', + '__EVENTARGUMENT': target_session, + '__VIEWSTATE': viewstate, + '__VIEWSTATEGENERATOR': viewstategenerator, + } - self.post(url=SESSION_SET_URL, data=form, allow_redirects=False) + self.post(url=SESSION_SET_URL, data=form, allow_redirects=True) def _get_bill_list(self, url): ''' @@ -93,23 +106,37 @@ def _get_bill_list(self, url): for _retry in range(self.retry_attempts): html = self.get(url=url).text + + #print html + doc = lxml.html.fromstring(html) - bills = doc.xpath('//table[@class="box_billstatusresults"]/tr')[1:] + + bills = doc.xpath('//table[@id="ContentPlaceHolder1_gvBills"]/tr')[1:] + resolutions = doc.xpath( - '//table[@class="box_resostatusgrid "]/tr')[1:] + '//table[@id="ContentPlaceHolder1_gvResolutions"]/tr')[1:] + + print doc.xpath( + '//span[@id="ContentPlaceHolder1_lblCount"]/font/text()' + ) if bills and resolutions: raise AssertionError("Found multiple bill types") elif bills or resolutions: return bills or resolutions elif doc.xpath( - '//span[@class="ctl00_MainDefaultContent_lblCount"]/text()' + '//span[@id="ContentPlaceHolder1_lblCount"]/font/text()' ) == ["0 Instruments", ]: self.warning("Missing either bills or resolutions") return [] else: + print "Attempt" + print doc.xpath( + 
'//span[@id="ContentPlaceHolder1_lblCount"]/text()' + ) continue else: + #print html raise AssertionError("Bill list not found") def _get_bill_response(self, url): @@ -118,7 +145,7 @@ def _get_bill_response(self, url): try: html = self.get(url=url, allow_redirects=False).text if lxml.html.fromstring(html).xpath( - '//div[@class="shorttitle"]'): + '//span[@id="ContentPlaceHolder1_lblShotTitle"]'): return html # If a bill page doesn't exist yet, ignore redirects and timeouts except scrapelib.HTTPError: @@ -148,51 +175,54 @@ def scrape(self, session, chambers): (viewstategenerator, ) = doc.xpath( '//input[@id="__VIEWSTATEGENERATOR"]/@value') form = { - '__EVENTTARGET': 'ctl00$MainDefaultContent$gvStatus$ctl02$ctl00', + '__EVENTTARGET': 'ctl00$ContentPlaceHolder1$gvStatus$ctl02$ctl00', '__EVENTARGUMENT': 'Select$0', '__VIEWSTATE': viewstate, '__VIEWSTATEGENERATOR': viewstategenerator, - 'ctl00$cboSession': self.session_name, 'ctl00$ScriptManager1': 'ctl00$UpdatePanel1|ctl00$' 'MainDefaultContent$gvStatus$ctl02$ctl00' } - self.post(url=BILL_TYPE_URL, data=form, allow_redirects=False) + self.post(url=BILL_TYPE_URL, data=form, allow_redirects=True) self.scrape_bill_list(BILL_LIST_URL) + #self._set_session(session) + # Acquire and process a list of all resolutions RESOLUTION_TYPE_URL = ( 'http://alisondb.legislature.state.al.us/Alison/' - 'SESSResBySelectedStatus.aspx') + 'SESSResosBySelectedStatus.aspx') RESOLUTION_LIST_URL = ( 'http://alisondb.legislature.state.al.us/Alison/' - 'SESSResList.aspx?STATUSCODES=Had%20First%20Reading' + 'SESSResosList.aspx?STATUSCODES=Had%20First%20Reading' '%20House%20of%20Origin&BODY=999999') - doc = lxml.html.fromstring(self.get(url=RESOLUTION_TYPE_URL).text) + doc = lxml.html.fromstring(self.get(url=BILL_TYPE_URL).text) (viewstate, ) = doc.xpath('//input[@id="__VIEWSTATE"]/@value') (viewstategenerator, ) = doc.xpath( '//input[@id="__VIEWSTATEGENERATOR"]/@value') + form = { - '__EVENTTARGET': 
'ctl00$MainDefaultContent$gvStatus$ctl02$ctl00', + '__EVENTTARGET': 'ctl00$ContentPlaceHolder1$gvStatus$ctl02$ctl00', '__EVENTARGUMENT': 'Select$0', '__VIEWSTATE': viewstate, '__VIEWSTATEGENERATOR': viewstategenerator, - 'ctl00$cboSession': self.session_name, 'ctl00$ScriptManager1': 'ctl00$UpdatePanel1|ctl00$' 'MainDefaultContent$gvStatus$ctl02$ctl00' } - self.post(url=RESOLUTION_TYPE_URL, data=form, allow_redirects=False) - + deb = self.post(url=RESOLUTION_TYPE_URL, data=form, allow_redirects=True) + self.scrape_bill_list(RESOLUTION_LIST_URL) def scrape_bill_list(self, url): bill_list = self._get_bill_list(url) + for bill_info in bill_list: - + (bill_id, ) = bill_info.xpath('td[1]/font/input/@value') (sponsor, ) = bill_info.xpath('td[2]/font/input/@value') - subject = bill_info.xpath('td[3]/font/text()')[0].strip() + (subject, ) = bill_info.xpath('td[3]//text()') + subject = subject.strip() chamber = self.CHAMBERS[bill_id[0]] if 'B' in bill_id: @@ -219,7 +249,7 @@ def scrape_bill_list(self, url): bill.add_source(url) bill_url = ('http://alisondb.legislature.state.al.us/Alison/' - 'SESSBillResult.aspx?BILL={}'.format(bill_id)) + 'SESSBillStatusResult.aspx?BILL={}'.format(bill_id)) bill.add_source(bill_url) bill_html = self._get_bill_response(bill_url) @@ -230,7 +260,7 @@ def scrape_bill_list(self, url): bill_doc = lxml.html.fromstring(bill_html) title = bill_doc.xpath( - '//div[@class="shorttitle"]')[0].text_content().strip() + '//span[@id="ContentPlaceHolder1_lblShotTitle"]//text()')[0].strip() if not title: title = "[No title given by state]" bill['title'] = title @@ -310,7 +340,7 @@ def scrape_bill_list(self, url): action_text=bir_text ) - actions = bill_doc.xpath('//table[@class="box_history"]/tr')[1:] + actions = bill_doc.xpath('//table[@id="ContentPlaceHolder1_gvHistory"]/tr')[1:] action_date = None for action in actions: # If actions occur on the same day, only one date will exist