Skip to content

Commit

Permalink
VA: Implemented more complex parsing for bill histories.
Browse files Browse the repository at this point in the history
This changeset should largely prevent duplicate vote processing.
  • Loading branch information
Andy Lo committed Jun 15, 2016
1 parent b7f5512 commit 928a1f1
Showing 1 changed file with 183 additions and 114 deletions.
297 changes: 183 additions & 114 deletions openstates/va/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,13 @@

BASE_URL = 'http://lis.virginia.gov'


class VABillScraper(BillScraper):
jurisdiction = 'va'

vote_strip_re = re.compile(r'(.+)\((\d{1,2})-Y (\d{1,2})-N(?: (\d{1,2})-A)?\)')
# There's a weird catch-all for numerals after the dash in the Yes
# count. That's because we've actually encountered this.
vote_strip_re = re.compile(r'(.+)\((\d+)-[\d]*Y (\d+)-N(?: (\d+)-A)?\)')
actor_map = {'House': 'lower', 'Senate': 'upper', 'Governor': 'governor',
'Conference': 'conference'}

Expand Down Expand Up @@ -44,14 +47,14 @@ class VABillScraper(BillScraper):

link_xpath = '//ul[@class="linkSect"]/li/a'

def _accept_response(self, response):
    # A 200 response can still be a rate-limit or outage page; treat
    # those the same as a failed request so the scraper retries.
    normal = super(VABillScraper, self)._accept_response(response)
    body = response.text
    return (normal
            and 'Sorry, your query could not be processed' not in body
            and 'the source database is temporarily unavailable' not in body)

def get_page_bills(self, issue_name, href):
def _get_page_bills(self, issue_name, href):
issue_html = self.get('http://lis.virginia.gov' + href,
retry_on_404=True).text
idoc = lxml.html.fromstring(issue_html)
Expand All @@ -60,9 +63,9 @@ def get_page_bills(self, issue_name, href):

more_links = idoc.xpath('//a/b[text()="More..."]/../@href')
if more_links:
self.get_page_bills(issue_name, more_links[0])
self._get_page_bills(issue_name, more_links[0])

def build_subject_map(self):
def _build_subject_map(self):
url = 'http://lis.virginia.gov/cgi-bin/legp604.exe?%s+sbj+SBJ' % self.site_id
self.subject_map = defaultdict(list)

Expand All @@ -71,58 +74,68 @@ def build_subject_map(self):
doc = lxml.html.fromstring(html)
for link in doc.xpath(self.link_xpath):
# get bills from page
self.get_page_bills(link.text, link.get('href'))


def scrape(self, chamber, session):
self.user_agent = 'openstates +mozilla'
# internal id for the session, store on self so all methods have access
self.site_id = self.metadata['session_details'][session]['site_id']

self.build_subject_map()

# used for skipping bills from opposite chamber
start_letter = 'H' if chamber == 'lower' else 'S'
self._get_page_bills(link.text, link.get('href'))

url = 'http://lis.virginia.gov/cgi-bin/legp604.exe?%s+lst+ALL' % self.site_id
def _fetch_sponsors(self, bill):
    """Fetch the member (patron) page for *bill* and attach sponsors."""
    url = "http://lis.virginia.gov/cgi-bin/legp604.exe?%s+mbr+%s" % (
        self.site_id, bill['bill_id'].replace(' ', ''))

    page = lxml.html.fromstring(self.get(url, retry_on_404=True).text)

    # note that first ul is origin chamber
    for sponsor_list in page.xpath('//ul[@class="linkSect"]'):
        for entry in sponsor_list.xpath('li'):
            name = entry.text_content().strip()
            # Strip the "(chief patron)" / "(chief co-patron)" suffixes
            # (the space before the parenthetical is a non-breaking one).
            if name.endswith(u' (chief\xa0patron)'):
                name = name[:-15]
                sponsor_type = 'primary'
            elif name.endswith(u' (chief\xa0co-patron)'):
                name = name[:-18]
                sponsor_type = 'cosponsor'
            else:
                sponsor_type = 'cosponsor'
            bill.add_sponsor(sponsor_type, name)

bills = doc.xpath('//ul[@class="linkSect"]/li')
for bill in bills:
link = bill.getchildren()[0]
bill_id = str(link.text_content())
def _split_vote(self, block):
if block:
block = block[0].text.replace('\r\n', ' ')

# check if this is the 'More...' link
if bill_id.startswith('More'):
url = BASE_URL + link.get('href')
pieces = block.split('--')
# if there are only two pieces, there are no abstentions
if len(pieces) <= 2:
return []
else:
# lookahead and don't split if comma precedes initials
# Also, Bell appears as Bell, Richard B. and Bell, Robert P.
# and so needs the lookbehind assertion.
return [x.strip() for x in re.split('(?<!Bell), (?!\w\.\w?\.?)', pieces[1]) if x.strip()]
else:
return []

# skip bills from the other chamber
elif not bill_id.startswith(start_letter):
continue
def _parse_vote(self, vote, url):
    """Fetch the vote page at *url* and record individual voters on *vote*."""
    url = BASE_URL + url

    html = self.get(url, retry_on_404=True).text
    doc = lxml.html.fromstring(html)

    yeas = doc.xpath('//p[contains(text(), "YEAS--")]')
    nays = doc.xpath('//p[contains(text(), "NAYS--")]')
    # We capture "other" types of votes separately just in case we
    # want to have the granularity later.
    rule36 = doc.xpath('//p[contains(text(), "RULE 36--")]')
    abstaining = doc.xpath('//p[contains(text(), "ABSTENTIONS--")]')
    notvoting = doc.xpath('//p[contains(text(), "NOT VOTING--")]')

    # Use plain loops rather than map(): map() is lazy on Python 3, so
    # side-effecting calls inside it would silently never execute.
    for name in self._split_vote(yeas):
        vote.yes(name)
    for name in self._split_vote(nays):
        vote.no(name)
    # Flatten all "other" vote types into the single 'other' bucket.
    for block in (rule36, abstaining, notvoting):
        for name in self._split_vote(block):
            vote.other(name)

def scrape_bill_details(self, url, bill):
def _scrape_bill_details(self, url, bill):
html = self.get(url, retry_on_404=True).text
doc = lxml.html.fromstring(html)

Expand Down Expand Up @@ -150,27 +163,101 @@ def scrape_bill_details(self, url, bill):
on_duplicate='use_old')

# actions
for ali in doc.xpath('//h4[text()="HISTORY"]/following-sibling::ul[1]/li'):
cached_vote = None
cached_action = None
for ali in doc.xpath('//h4[text()="HISTORY"]/following-sibling::ul[1]/'
'li'):
vote = None

date, action = ali.text_content().split(u' \xa0')
actor, action = action.split(': ', 1)

# Bill history entries purely in parentheses tend to be
# notes and not actions, so we'll skip them.
if action.startswith('(') and action.endswith(')'):
continue

actor = self.actor_map[actor]
date = datetime.datetime.strptime(date.strip(), '%m/%d/%y')

# if action ends in (##-Y ##-N) remove that part
vrematch = self.vote_strip_re.match(action)
# The following conditional logic is messy to handle
# Virginia's crazy and inconsistently formatted bill
# histories. Someone less harried and tired than me
# could probably make this much cleaner. - alo
if vrematch:
action, y, n, o = vrematch.groups()
vote = Vote(actor, date, action, int(y) > int(n),
int(y), int(n), 0)
vote_action, y, n, o = vrematch.groups()
y = int(y)
n = int(n)
# Set default count for "other" votes to 0. We have to
# do this explicitly as it's excluded from the action
# text when there were no abstentions (the only type of
# "other" vote encountered thus far).
if o is None:
o = 0
else:
o = int(o)

vote_url = ali.xpath('a/@href')
if vote_url:
self.parse_vote(vote, vote_url[0])
vote.add_source(BASE_URL + vote_url[0])
# set other count, it isn't provided
vote['other_count'] = len(vote['other_votes'])

# Caches relevant information from the current action if
# vote count encountered, then searches for the presence
# of identical counts in the next entry (we assume that
# it's probably there). If matching votes are found, it
# pulls the cached data to create a unified vote record.
#
# This is because Virginia usually publishes two lines
# of history data for a single vote, without guaranteed
# order, so we cache and unsafely attempt to match on
# identical vote counts in the next line.
if cached_vote is None:
cached_action = action
cached_vote = Vote(actor, date, vote_action, y > n, y, n,
o)
if vote_url:
cached_vote.add_source(BASE_URL + vote_url[0])
continue
elif cached_vote is not None:
if vote_action.startswith(u'VOTE:'):
if (vote_url
and cached_vote['yes_count'] == y
and cached_vote['no_count'] == n
and cached_vote['other_count'] == o):
vote = cached_vote
self._parse_vote(vote, vote_url[0])
vote.add_source(BASE_URL + vote_url[0])
action = cached_action
elif cached_vote['motion'].startswith('VOTE:'):
if (cached_vote['yes_count'] == y
and cached_vote['no_count'] == n
and cached_vote['other_count'] == o):
vote = cached_vote
vote['motion'] = vote_action
else:
# Cached vote doesn't match up to the current
# one. Save, then cache the current vote to
# begin the next search.
bill.add_vote(cached_vote)
cached_vote = Vote(actor, date, vote_action, y > n, y,
n, o)
if vote_url:
cached_vote.add_source(BASE_URL + vote_url[0])
cached_action = action
continue

if vote is None:
raise ValueError('Cannot save an empty vote.')
#vote.validate()
bill.add_vote(vote)
else:
# If this action isn't a vote, but the last one was,
# there's obviously no additional vote data to match.
# Go ahead and save the cached data.
if cached_vote is not None:
bill.add_vote(cached_vote)

cached_vote = cached_action = None

# categorize actions
for pattern, atype in self._action_classifiers:
Expand All @@ -184,67 +271,49 @@ def scrape_bill_details(self, url, bill):
bill.add_action(actor, action, date, type=atype)


def fetch_sponsors(self, bill):
url = "http://lis.virginia.gov/cgi-bin/legp604.exe?%s+mbr+%s" % (
self.site_id, bill['bill_id'].replace(' ', ''))
def scrape(self, chamber, session):
    """Scrape all bills for *chamber* in *session*.

    Walks the paginated "all bills" listing, creating a Bill for each
    entry belonging to the requested chamber and saving it once
    sponsors, details, and subjects have been attached.
    """
    self.user_agent = 'openstates +mozilla'
    # internal id for the session, store on self so all methods have access
    self.site_id = self.metadata['session_details'][session]['site_id']

    self._build_subject_map()

    # used for skipping bills from opposite chamber
    start_letter = 'H' if chamber == 'lower' else 'S'

    url = 'http://lis.virginia.gov/cgi-bin/legp604.exe?%s+lst+ALL' % self.site_id

    while url:
        html = self.get(url, retry_on_404=True).text
        doc = lxml.html.fromstring(html)

        url = None  # no more pages unless we encounter 'More...'

        # Loop variable renamed from `bill` to `bill_li`: the original
        # rebound `bill` (the <li> element) to the Bill object mid-loop,
        # shadowing the element it was still reading from.
        for bill_li in doc.xpath('//ul[@class="linkSect"]/li'):
            link = bill_li.getchildren()[0]
            bill_id = str(link.text_content())

            # check if this is the 'More...' pagination link
            if bill_id.startswith('More'):
                url = BASE_URL + link.get('href')

            # skip bills from the other chamber
            elif not bill_id.startswith(start_letter):
                continue

            else:
                # create a bill
                desc = bill_li.xpath('text()')[0].strip()
                bill_type = {'B': 'bill',
                             'J': 'joint resolution',
                             'R': 'resolution'}[bill_id[1]]
                bill = Bill(session, chamber, bill_id, desc,
                            type=bill_type)

                bill_url = BASE_URL + link.get('href')
                self._fetch_sponsors(bill)
                self._scrape_bill_details(bill_url, bill)
                bill['subjects'] = self.subject_map[bill_id]
                bill.add_source(bill_url)
                self.save_bill(bill)

0 comments on commit 928a1f1

Please sign in to comment.