Skip to content

Commit

Permalink
VA: Implemented more complex parsing for bill histories.
Browse files Browse the repository at this point in the history
This changeset should largely prevent duplicate vote processing.
  • Loading branch information
Andy Lo committed Jun 15, 2016
1 parent b7f5512 commit 928a1f1
Showing 1 changed file with 183 additions and 114 deletions.
297 changes: 183 additions & 114 deletions openstates/va/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,13 @@

BASE_URL = 'http://lis.virginia.gov'


class VABillScraper(BillScraper):
jurisdiction = 'va'

vote_strip_re = re.compile(r'(.+)\((\d{1,2})-Y (\d{1,2})-N(?: (\d{1,2})-A)?\)')
# There's a weird catch-all for numerals after the dash in the Yes
# count. That's because we've actually encountered this.
vote_strip_re = re.compile(r'(.+)\((\d+)-[\d]*Y (\d+)-N(?: (\d+)-A)?\)')
actor_map = {'House': 'lower', 'Senate': 'upper', 'Governor': 'governor',
'Conference': 'conference'}

Expand Down Expand Up @@ -44,14 +47,14 @@ class VABillScraper(BillScraper):

link_xpath = '//ul[@class="linkSect"]/li/a'

def _accept_response(self, response):
    # A 200 response can still be a rate-limit or outage page; treat
    # those the same as a failed request so the scraper retries.
    normal = super(VABillScraper, self)._accept_response(response)
    body = response.text
    return (normal
            and 'Sorry, your query could not be processed' not in body
            and 'the source database is temporarily unavailable' not in body)

def get_page_bills(self, issue_name, href):
def _get_page_bills(self, issue_name, href):
issue_html = self.get('http://lis.virginia.gov' + href,
retry_on_404=True).text
idoc = lxml.html.fromstring(issue_html)
Expand All @@ -60,9 +63,9 @@ def get_page_bills(self, issue_name, href):

more_links = idoc.xpath('//a/b[text()="More..."]/../@href')
if more_links:
self.get_page_bills(issue_name, more_links[0])
self._get_page_bills(issue_name, more_links[0])

def build_subject_map(self):
def _build_subject_map(self):
url = 'http://lis.virginia.gov/cgi-bin/legp604.exe?%s+sbj+SBJ' % self.site_id
self.subject_map = defaultdict(list)

Expand All @@ -71,58 +74,68 @@ def build_subject_map(self):
doc = lxml.html.fromstring(html)
for link in doc.xpath(self.link_xpath):
# get bills from page
self.get_page_bills(link.text, link.get('href'))


def scrape(self, chamber, session):
self.user_agent = 'openstates +mozilla'
# internal id for the session, store on self so all methods have access
self.site_id = self.metadata['session_details'][session]['site_id']

self.build_subject_map()

# used for skipping bills from opposite chamber
start_letter = 'H' if chamber == 'lower' else 'S'
self._get_page_bills(link.text, link.get('href'))

url = 'http://lis.virginia.gov/cgi-bin/legp604.exe?%s+lst+ALL' % self.site_id
def _fetch_sponsors(self, bill):
    """Fetch the member (patron) page for *bill* and attach sponsors."""
    url = "http://lis.virginia.gov/cgi-bin/legp604.exe?%s+mbr+%s" % (
        self.site_id, bill['bill_id'].replace(' ', ''))

    page = lxml.html.fromstring(self.get(url, retry_on_404=True).text)

    # note that first ul is origin chamber
    for sponsor_list in page.xpath('//ul[@class="linkSect"]'):
        for entry in sponsor_list.xpath('li'):
            name = entry.text_content().strip()
            # Strip the "(chief patron)" / "(chief co-patron)" suffixes
            # (the space before the parenthetical is a non-breaking one).
            if name.endswith(u' (chief\xa0patron)'):
                name = name[:-15]
                sponsor_type = 'primary'
            elif name.endswith(u' (chief\xa0co-patron)'):
                name = name[:-18]
                sponsor_type = 'cosponsor'
            else:
                sponsor_type = 'cosponsor'
            bill.add_sponsor(sponsor_type, name)

bills = doc.xpath('//ul[@class="linkSect"]/li')
for bill in bills:
link = bill.getchildren()[0]
bill_id = str(link.text_content())
def _split_vote(self, block):
if block:
block = block[0].text.replace('\r\n', ' ')

# check if this is the 'More...' link
if bill_id.startswith('More'):
url = BASE_URL + link.get('href')
pieces = block.split('--')
# if there are only two pieces, there are no abstentions
if len(pieces) <= 2:
return []
else:
# lookahead and don't split if comma precedes initials
# Also, Bell appears as Bell, Richard B. and Bell, Robert P.
# and so needs the lookbehind assertion.
return [x.strip() for x in re.split('(?<!Bell), (?!\w\.\w?\.?)', pieces[1]) if x.strip()]
else:
return []

# skip bills from the other chamber
elif not bill_id.startswith(start_letter):
continue
def _parse_vote(self, vote, url):
    """Fetch the vote page at *url* and record individual voters on *vote*."""
    url = BASE_URL + url

    html = self.get(url, retry_on_404=True).text
    doc = lxml.html.fromstring(html)

    yeas = doc.xpath('//p[contains(text(), "YEAS--")]')
    nays = doc.xpath('//p[contains(text(), "NAYS--")]')
    # We capture "other" types of votes separately just in case we
    # want to have the granularity later.
    rule36 = doc.xpath('//p[contains(text(), "RULE 36--")]')
    abstaining = doc.xpath('//p[contains(text(), "ABSTENTIONS--")]')
    notvoting = doc.xpath('//p[contains(text(), "NOT VOTING--")]')

    # Use plain loops rather than map(): map() is lazy on Python 3, so
    # side-effecting calls inside it would silently never execute.
    for name in self._split_vote(yeas):
        vote.yes(name)
    for name in self._split_vote(nays):
        vote.no(name)
    # Flatten all "other" vote types into the single 'other' bucket.
    for block in (rule36, abstaining, notvoting):
        for name in self._split_vote(block):
            vote.other(name)

def scrape_bill_details(self, url, bill):
def _scrape_bill_details(self, url, bill):
html = self.get(url, retry_on_404=True).text
doc = lxml.html.fromstring(html)

Expand Down Expand Up @@ -150,27 +163,101 @@ def scrape_bill_details(self, url, bill):
on_duplicate='use_old')

# actions
for ali in doc.xpath('//h4[text()="HISTORY"]/following-sibling::ul[1]/li'):
cached_vote = None
cached_action = None
for ali in doc.xpath('//h4[text()="HISTORY"]/following-sibling::ul[1]/'
'li'):
vote = None

date, action = ali.text_content().split(u' \xa0')
actor, action = action.split(': ', 1)

# Bill history entries purely in parentheses tend to be
# notes and not actions, so we'll skip them.
if action.startswith('(') and action.endswith(')'):
continue

actor = self.actor_map[actor]
date = datetime.datetime.strptime(date.strip(), '%m/%d/%y')

# if action ends in (##-Y ##-N) remove that part
vrematch = self.vote_strip_re.match(action)
# The following conditional logic is messy to handle
# Virginia's crazy and inconsistently formatted bill
# histories. Someone less harried and tired than me
# could probably make this much cleaner. - alo
if vrematch:
action, y, n, o = vrematch.groups()
vote = Vote(actor, date, action, int(y) > int(n),
int(y), int(n), 0)
vote_action, y, n, o = vrematch.groups()
y = int(y)
n = int(n)
# Set default count for "other" votes to 0. We have to
# do this explicitly as it's excluded from the action
# text when there were no abstentions (the only type of
# "other" vote encountered thus far).
if o is None:
o = 0
else:
o = int(o)

vote_url = ali.xpath('a/@href')
if vote_url:
self.parse_vote(vote, vote_url[0])
vote.add_source(BASE_URL + vote_url[0])
# set other count, it isn't provided
vote['other_count'] = len(vote['other_votes'])

# Caches relevant information from the current action if
# vote count encountered, then searches for the presence
# of identical counts in the next entry (we assume that
# it's probably there). If matching votes are found, it
# pulls the cached data to create a unified vote record.
#
# This is because Virginia usually publishes two lines
# of history data for a single vote, without guaranteed
# order, so we cache and unsafely attempt to match on
# identical vote counts in the next line.
if cached_vote is None:
cached_action = action
cached_vote = Vote(actor, date, vote_action, y > n, y, n,
o)
if vote_url:
cached_vote.add_source(BASE_URL + vote_url[0])
continue
elif cached_vote is not None:
if vote_action.startswith(u'VOTE:'):
if (vote_url
and cached_vote['yes_count'] == y
and cached_vote['no_count'] == n
and cached_vote['other_count'] == o):
vote = cached_vote
self._parse_vote(vote, vote_url[0])
vote.add_source(BASE_URL + vote_url[0])
action = cached_action
elif cached_vote['motion'].startswith('VOTE:'):
if (cached_vote['yes_count'] == y
and cached_vote['no_count'] == n
and cached_vote['other_count'] == o):
vote = cached_vote
vote['motion'] = vote_action
else:
# Cached vote doesn't match up to the current
# one. Save, then cache the current vote to
# begin the next search.
bill.add_vote(cached_vote)
cached_vote = Vote(actor, date, vote_action, y > n, y,
n, o)
if vote_url:
cached_vote.add_source(BASE_URL + vote_url[0])
cached_action = action
continue

if vote is None:
raise ValueError('Cannot save an empty vote.')
#vote.validate()
bill.add_vote(vote)
else:
# If this action isn't a vote, but the last one was,
# there's obviously no additional vote data to match.
# Go ahead and save the cached data.
if cached_vote is not None:
bill.add_vote(cached_vote)

cached_vote = cached_action = None

# categorize actions
for pattern, atype in self._action_classifiers:
Expand All @@ -184,67 +271,49 @@ def scrape_bill_details(self, url, bill):
bill.add_action(actor, action, date, type=atype)


def fetch_sponsors(self, bill):
url = "http://lis.virginia.gov/cgi-bin/legp604.exe?%s+mbr+%s" % (
self.site_id, bill['bill_id'].replace(' ', ''))
def scrape(self, chamber, session):
    """Scrape all bills for *chamber* in *session*.

    Walks the paginated "all bills" listing, creating a Bill for each
    entry belonging to the requested chamber and saving it once
    sponsors, details, and subjects have been attached.
    """
    self.user_agent = 'openstates +mozilla'
    # internal id for the session, store on self so all methods have access
    self.site_id = self.metadata['session_details'][session]['site_id']

    self._build_subject_map()

    # used for skipping bills from opposite chamber
    start_letter = 'H' if chamber == 'lower' else 'S'

    url = 'http://lis.virginia.gov/cgi-bin/legp604.exe?%s+lst+ALL' % self.site_id

    while url:
        html = self.get(url, retry_on_404=True).text
        doc = lxml.html.fromstring(html)

        url = None  # no more pages unless we encounter 'More...'

        # Loop variable renamed from `bill` to `bill_li`: the original
        # rebound `bill` (the <li> element) to the Bill object mid-loop,
        # shadowing the element it was still reading from.
        for bill_li in doc.xpath('//ul[@class="linkSect"]/li'):
            link = bill_li.getchildren()[0]
            bill_id = str(link.text_content())

            # check if this is the 'More...' pagination link
            if bill_id.startswith('More'):
                url = BASE_URL + link.get('href')

            # skip bills from the other chamber
            elif not bill_id.startswith(start_letter):
                continue

            else:
                # create a bill
                desc = bill_li.xpath('text()')[0].strip()
                bill_type = {'B': 'bill',
                             'J': 'joint resolution',
                             'R': 'resolution'}[bill_id[1]]
                bill = Bill(session, chamber, bill_id, desc,
                            type=bill_type)

                bill_url = BASE_URL + link.get('href')
                self._fetch_sponsors(bill)
                self._scrape_bill_details(bill_url, bill)
                bill['subjects'] = self.subject_map[bill_id]
                bill.add_source(bill_url)
                self.save_bill(bill)

0 comments on commit 928a1f1

Please sign in to comment.