From 8d5ca8e18832955f915d5ae29d42ec9dbc1a8607 Mon Sep 17 00:00:00 2001
From: Tim Showers
Date: Thu, 1 Dec 2016 12:28:52 -0500
Subject: [PATCH] Added 2017 session and fixed house bill# and action parsing

---
 openstates/mo/__init__.py | 12 ++++++++++++
 openstates/mo/bills.py    | 19 +++++++++++++------
 2 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/openstates/mo/__init__.py b/openstates/mo/__init__.py
index 9ae609c346..48fcb78115 100644
--- a/openstates/mo/__init__.py
+++ b/openstates/mo/__init__.py
@@ -41,6 +41,12 @@
             'start_year': 2015,
             'end_year': 2016,
         },
+        {
+            'name': '2017-2018',
+            'sessions': ['2017'],
+            'start_year': 2017,
+            'end_year': 2018,
+        },
     ],
     # General Assembly sessions convene the Wed. following the first Mon.
     # of January and adjourn May 30.
@@ -80,6 +86,12 @@
             'end_date': datetime.date(2016,5,30),
             'display_name': '2016 Regular Session',
         },
+        '2017': {
+            'type': 'primary',
+            'start_date': datetime.date(2017,1,4),
+            'end_date': datetime.date(2017,5,12),
+            'display_name': '2017 Regular Session',
+        },
     },
     feature_flags = ['subjects', 'influenceexplorer'],
     _ignored_scraped_sessions = [
diff --git a/openstates/mo/bills.py b/openstates/mo/bills.py
index 3c42f4ae09..56459de8dd 100644
--- a/openstates/mo/bills.py
+++ b/openstates/mo/bills.py
@@ -275,13 +275,12 @@ def _scrape_house_subjects(self, session):
                 self._subjects[bill_id].append(subject.text)
 
     def _parse_house_actions(self, bill, url):
-        url = re.sub("BillActions", "BillActionsPrn", url)
         bill.add_source(url)
         actions_page = self.get(url).text
         actions_page = lxml.html.fromstring(actions_page)
         rows = actions_page.xpath('//table/tr')
 
-        for row in rows[1:]:
+        for row in rows:
             # new actions are represented by having dates in the first td
             # otherwise, it's a continuation of the description from the
             # previous action
@@ -337,11 +336,19 @@ def _parse_house_bill(self, url, session):
         url = re.sub("billsummary", "billsummaryprn", url)
         url = '%s/%s' % (self._senate_base_url,url)
 
+        #the URL is an iframed version now, so swap in for the actual bill page
+
+        url = url.replace('Bill.aspx','BillContent.aspx')
+        url = url.replace('&code=R','&code=R&style=new')
+
+        # http://www.house.mo.gov/Bill.aspx?bill=HB26&year=2017&code=R
+        # http://www.house.mo.gov/BillContent.aspx?bill=HB26&year=2017&code=R&style=new
+
         bill_page = self.get(url).text
         bill_page = lxml.html.fromstring(bill_page)
         bill_page.make_links_absolute(url)
 
-        bill_id = bill_page.xpath('//*[@class="entry-title"]')
+        bill_id = bill_page.xpath('//*[@class="entry-title"]/div')
         if len(bill_id) == 0:
             self.log("WARNING: bill summary page is blank! (%s)" % url)
             self._bad_urls.append(url)
@@ -440,7 +447,7 @@ def _parse_house_bill(self, url, session):
         # actions_link = re.sub("content", "print", actions_link)
 
         actions_link, = bill_page.xpath(
-            "//a[contains(@href, 'BillActions.aspx')]/@href")
+            "//a[contains(@href, 'BillActionsPrn.aspx')]/@href")
         self._parse_house_actions(bill, actions_link)
 
         # get bill versions
@@ -512,6 +519,6 @@ def scrape(self, chamber, year):
         getattr(self, '_scrape_' + chamber + '_chamber')(year)
 
         if len(self._bad_urls) > 0:
-            self.warn('WARNINGS:')
+            self.warning('WARNINGS:')
             for url in self._bad_urls:
-                self.warn('{}'.format(url))
+                self.warning('{}'.format(url))