diff --git a/openstates/va/bills.py b/openstates/va/bills.py index 243739439f..8f7cc981fc 100644 --- a/openstates/va/bills.py +++ b/openstates/va/bills.py @@ -9,10 +9,13 @@ BASE_URL = 'http://lis.virginia.gov' + class VABillScraper(BillScraper): jurisdiction = 'va' - vote_strip_re = re.compile(r'(.+)\((\d{1,2})-Y (\d{1,2})-N(?: (\d{1,2})-A)?\)') + # There's a weird catch-all for numerals after the dash in the Yes + # count. That's because we've actually encountered this. + vote_strip_re = re.compile(r'(.+)\((\d+)-[\d]*Y (\d+)-N(?: (\d+)-A)?\)') actor_map = {'House': 'lower', 'Senate': 'upper', 'Governor': 'governor', 'Conference': 'conference'} @@ -44,14 +47,14 @@ class VABillScraper(BillScraper): link_xpath = '//ul[@class="linkSect"]/li/a' - def accept_response(self, response): + def _accept_response(self, response): # check for rate limit pages - normal = super(VABillScraper, self).accept_response(response) + normal = super(VABillScraper, self)._accept_response(response) return (normal and - 'Sorry, your query could not be processed' not in response.text - and 'the source database is temporarily unavailable' not in response.text) + 'Sorry, your query could not be processed' not in response.text + and 'the source database is temporarily unavailable' not in response.text) - def get_page_bills(self, issue_name, href): + def _get_page_bills(self, issue_name, href): issue_html = self.get('http://lis.virginia.gov' + href, retry_on_404=True).text idoc = lxml.html.fromstring(issue_html) @@ -60,9 +63,9 @@ def get_page_bills(self, issue_name, href): more_links = idoc.xpath('//a/b[text()="More..."]/../@href') if more_links: - self.get_page_bills(issue_name, more_links[0]) + self._get_page_bills(issue_name, more_links[0]) - def build_subject_map(self): + def _build_subject_map(self): url = 'http://lis.virginia.gov/cgi-bin/legp604.exe?%s+sbj+SBJ' % self.site_id self.subject_map = defaultdict(list) @@ -71,58 +74,68 @@ def build_subject_map(self): doc = lxml.html.fromstring(html) for link in doc.xpath(self.link_xpath): # get bills from page - self.get_page_bills(link.text, link.get('href')) - - - def scrape(self, chamber, session): - self.user_agent = 'openstates +mozilla' - # internal id for the session, store on self so all methods have access - self.site_id = self.metadata['session_details'][session]['site_id'] - - self.build_subject_map() - - # used for skipping bills from opposite chamber - start_letter = 'H' if chamber == 'lower' else 'S' + self._get_page_bills(link.text, link.get('href')) - url = 'http://lis.virginia.gov/cgi-bin/legp604.exe?%s+lst+ALL' % self.site_id + def _fetch_sponsors(self, bill): + url = "http://lis.virginia.gov/cgi-bin/legp604.exe?%s+mbr+%s" % ( + self.site_id, bill['bill_id'].replace(' ', '')) - while url: - html = self.get(url, retry_on_404=True).text - doc = lxml.html.fromstring(html) + html = self.get(url, retry_on_404=True).text + doc = lxml.html.fromstring(html) - url = None # no more unless we encounter 'More...' + for slist in doc.xpath('//ul[@class="linkSect"]'): + # note that first ul is origin chamber + for sponsor in slist.xpath('li'): + name = sponsor.text_content().strip() + if name.endswith(u' (chief\xa0patron)'): + name = name[:-15] + type = 'primary' + elif name.endswith(u' (chief\xa0co-patron)'): + name = name[:-18] + type = 'cosponsor' + else: + type = 'cosponsor' + bill.add_sponsor(type, name) - bills = doc.xpath('//ul[@class="linkSect"]/li') - for bill in bills: - link = bill.getchildren()[0] - bill_id = str(link.text_content()) + def _split_vote(self, block): + if block: + block = block[0].text.replace('\r\n', ' ') - # check if this is the 'More...' link - if bill_id.startswith('More'): - url = BASE_URL + link.get('href') + pieces = block.split('--') + # if there are only two pieces, there are no abstentions + if len(pieces) <= 2: + return [] + else: + # lookahead and don't split if comma precedes initials + # Also, Bell appears as Bell, Richard B. and Bell, Robert P. + # and so needs the lookbehind assertion. + return [x.strip() for x in re.split('(? int(n), - int(y), int(n), 0) + vote_action, y, n, o = vrematch.groups() + y = int(y) + n = int(n) + # Set default count for "other" votes to 0. We have to + # do this explicitly as it's excluded from the action + # text when there were no abstentions (the only type of + # "other" vote encountered thus far). + if o is None: + o = 0 + else: + o = int(o) + vote_url = ali.xpath('a/@href') - if vote_url: - self.parse_vote(vote, vote_url[0]) - vote.add_source(BASE_URL + vote_url[0]) - # set other count, it isn't provided - vote['other_count'] = len(vote['other_votes']) + + # Caches relevant information from the current action if + # vote count encountered, then searches for the presence + # of identical counts in the next entry (we assume that + # it's probably there). If matching votes are found, it + # pulls the cached data to create a unified vote record. + # + # This is because Virginia usually publishes two lines + # of history data for a single vote, without guaranteed + # order, so we cache and unsafely attempt to match on + # identical vote counts in the next line. + if cached_vote is None: + cached_action = action + cached_vote = Vote(actor, date, vote_action, y > n, y, n, + o) + if vote_url: + cached_vote.add_source(BASE_URL + vote_url[0]) + continue + elif cached_vote is not None: + if vote_action.startswith(u'VOTE:'): + if (vote_url + and cached_vote['yes_count'] == y + and cached_vote['no_count'] == n + and cached_vote['other_count'] == o): + vote = cached_vote + self._parse_vote(vote, vote_url[0]) + vote.add_source(BASE_URL + vote_url[0]) + action = cached_action + elif cached_vote['motion'].startswith('VOTE:'): + if (cached_vote['yes_count'] == y + and cached_vote['no_count'] == n + and cached_vote['other_count'] == o): + vote = cached_vote + vote['motion'] = vote_action + else: + # Cached vote doesn't match up to the current + # one. Save, then cache the current vote to + # begin the next search. + bill.add_vote(cached_vote) + cached_vote = Vote(actor, date, vote_action, y > n, y, + n, o) + if vote_url: + cached_vote.add_source(BASE_URL + vote_url[0]) + cached_action = action + continue + + if vote is None: + raise ValueError('Cannot save an empty vote.') #vote.validate() bill.add_vote(vote) + else: + # If this action isn't a vote, but the last one was, + # there's obviously no additional vote data to match. + # Go ahead and save the cached data. + if cached_vote is not None: + bill.add_vote(cached_vote) + + cached_vote = cached_action = None # categorize actions for pattern, atype in self._action_classifiers: @@ -184,67 +271,49 @@ def scrape_bill_details(self, url, bill): bill.add_action(actor, action, date, type=atype) - def fetch_sponsors(self, bill): - url = "http://lis.virginia.gov/cgi-bin/legp604.exe?%s+mbr+%s" % ( - self.site_id, bill['bill_id'].replace(' ', '')) + def scrape(self, chamber, session): + self.user_agent = 'openstates +mozilla' + # internal id for the session, store on self so all methods have access + self.site_id = self.metadata['session_details'][session]['site_id'] - # order of chamber uls - #if bill['chamber'] == 'lower': - # order = ['lower', 'upper'] - #else: - # order = ['upper', 'lower'] + self._build_subject_map() - html = self.get(url, retry_on_404=True).text - doc = lxml.html.fromstring(html) + # used for skipping bills from opposite chamber + start_letter = 'H' if chamber == 'lower' else 'S' - for slist in doc.xpath('//ul[@class="linkSect"]'): - # note that first ul is origin chamber - for sponsor in slist.xpath('li'): - name = sponsor.text_content().strip() - if name.endswith(u' (chief\xa0patron)'): - name = name[:-15] - type = 'primary' - elif name.endswith(u' (chief\xa0co-patron)'): - name = name[:-18] - type = 'cosponsor' - else: - type = 'cosponsor' - bill.add_sponsor(type, name) + url = 'http://lis.virginia.gov/cgi-bin/legp604.exe?%s+lst+ALL' % self.site_id - def split_vote(self, block): - if block: - block = block[0].text.replace('\r\n', ' ') + while url: + html = self.get(url, retry_on_404=True).text + doc = lxml.html.fromstring(html) - pieces = block.split('--') - # if there are only two pieces, there are no abstentions - if len(pieces) <= 2: - return [] - else: - # lookahead and don't split if comma precedes initials - # Also, Bell appears as Bell, Richard B. and Bell, Robert P. - # and so needs the lookbehind assertion. - return [x.strip() for x in re.split('(?