Skip to content

Commit

Permalink
NH: Created committee scraper.
Browse files Browse the repository at this point in the history
  • Loading branch information
Andy Lo committed May 9, 2016
1 parent 384b726 commit f652e68
Show file tree
Hide file tree
Showing 2 changed files with 93 additions and 1 deletion.
3 changes: 2 additions & 1 deletion openstates/nh/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import lxml.html
from .bills import NHBillScraper
from .legislators import NHLegislatorScraper
from .committees import NHCommitteeScraper
from . import utils

metadata = {
Expand All @@ -18,7 +19,7 @@
'start_year': 2011, 'end_year': 2012},
{'name': '2013-2014', 'sessions': ['2013', '2014'],
'start_year': 2013, 'end_year': 2014},
{'name': '2015-2016', 'sessions': ['2015','2016'],
{'name': '2015-2016', 'sessions': ['2015', '2016'],
'start_year': 2015, 'end_year': 2016}
],
'session_details': {
Expand Down
91 changes: 91 additions & 0 deletions openstates/nh/committees.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
import re

from billy.scrape.committees import CommitteeScraper, Committee

from .utils import db_cursor


class NHCommitteeScraper(CommitteeScraper):
jurisdiction = 'nh'

_chamber_map = {
'upper': 'S',
'lower': 'H',
}

def _parse_committees(self, chamber):
"""Queries and returns a dict for legislative committees."""
chamber_code = NHCommitteeScraper._chamber_map[chamber]

query = "SELECT "\
"Committees.CommitteeCode AS committee_code, "\
"Committees.committeename AS committee_name "\
"FROM Committees "\
"WHERE Committees.CommitteeCode LIKE '{}%' "\
"AND DATALENGTH(Committees.CommitteeEmailAddress) > 0"\
.format(chamber_code)

self.db_cursor.execute(query)

committees = {}
for row in self.db_cursor.fetchall():
# There is technically a potential for duplicated committee
# records to cause issues here. Haven't decided how to
# handle this yet, so we're throwing an exception instead.
if row['committee_code'] not in committees.keys():
committee = Committee(chamber, row['committee_name'])
committee_url = 'http://www.gencourt.state.nh.us/house/'\
'committees/committeedetails.aspx?code={}'\
.format(row['committee_code'])
committee.add_source(committee_url)
committees[row['committee_code']] = committee
else:
raise ValueError('{} committee is already present.'.format(
row['committee_name']))

return committees

def _parse_committee_members(self, committees):
"""Queries and parses the members for a given dict of committees."""
# Adopting the following method of parsing committee members to
# reduce querying. Grabs a list of members for all committees
# given instead of querying for each committee individually.
query = "SELECT "\
"CommitteeMembers.CommitteeCode AS committee_code, "\
"CommitteeMembers.comments AS comment, "\
"Legislators.LastName AS last_name, "\
"Legislators.FirstName AS first_name, "\
"Legislators.MiddleName AS middle_name "\
"FROM CommitteeMembers "\
"LEFT OUTER JOIN Legislators "\
"ON CommitteeMembers.EmployeeNumber = Legislators.Employeeno "\
"WHERE CommitteeMembers.CommitteeCode IN ('{}') "\
"AND CommitteeMembers.ActiveMember = 1 "\
"ORDER BY CommitteeCode ASC, SequenceNumber ASC"\
.format('\', \''.join(committees))

self.db_cursor.execute(query)

for row in self.db_cursor.fetchall():
committee = committees[row['committee_code']]
member_name = '{} {} {}'.format(row['first_name'],
row['middle_name'], row['last_name'])
member_name = re.sub(r'[\s]{2,}', ' ', member_name)

if row['comment'] in ('Chairman', 'V Chairman'):
member_role = 'chair'
else:
member_role = 'member'

committee.add_member(member_name, member_role)

return committees

def scrape(self, chamber, term):
self.db_cursor = db_cursor()

committees = self._parse_committee_members(
self._parse_committees(chamber))

for committee_code, committee in committees.iteritems():
self.save_committee(committee)

0 comments on commit f652e68

Please sign in to comment.