Skip to content

Commit

Permalink
LA: Overhauled committee scraper due to site change.
Browse files Browse the repository at this point in the history
  • Loading branch information
Andy Lo committed Jun 8, 2016
1 parent f1bf8f6 commit b480f1b
Showing 1 changed file with 117 additions and 163 deletions.
280 changes: 117 additions & 163 deletions openstates/la/committees.py
Original file line number Diff line number Diff line change
@@ -1,49 +1,46 @@
from billy.scrape import NoDataForPeriod
from billy.scrape.committees import CommitteeScraper, Committee

import re
import lxml.html
from scrapelib import HTTPError
import name_tools

class LACommitteeScraper(CommitteeScraper):
jurisdiction = 'la'
from billy.scrape import NoDataForPeriod
from billy.scrape.committees import CommitteeScraper, Committee
from openstates.utils import LXMLMixin

def scrape(self, chamber, term):
if term != self.metadata['terms'][-1]['name']:
raise NoDataForPeriod(term)

if chamber == 'upper':
self.scrape_senate()
else:
self.scrape_house()
class LACommitteeScraper(CommitteeScraper, LXMLMixin):
jurisdiction = 'la'

def scrape_senate(self):
committee_types = {
'Standing': 'http://senate.la.gov/Committees/Assignments.asp?type=Standing',
'Select': 'http://senate.la.gov/Committees/Assignments.asp?type=Select'
def _normalize_committee_name(self, name):
committees = {
'House Executive Cmte': 'House Executive Committee',
'Atchafalaya Basin Oversight': 'Atchafalaya Basin Program Oversight Committee',
'Homeland Security': 'House Select Committee on Homeland Security',
'Hurricane Recovery': 'Select Committee on Hurricane Recovery',
'Legislative Budgetary Control': 'Legislative Budgetary Control Council',
'Military and Veterans Affairs': 'Special Committee on Military and Veterans Affairs'
}

for name, url in committee_types.items():
text = self.get(url).text
page = lxml.html.fromstring(text)
page.make_links_absolute(url)
return committees[name] if name in committees else name

committees = page.xpath('//td[@bgcolor="#EBEAEC"]//a')

for link in committees:
name = link.xpath('string()').strip()
url2 = link.attrib['href']
self.scrape_senate_committee(name, url2)
def _normalize_member_role(self, member_role):
if member_role in ['Chairman', 'Co-Chairmain', 'Vice Chair',
'Ex Officio']:
role = member_role.lower()
elif member_role == 'Interim Member':
role = 'interim'
else:
role = 'member'

def scrape_senate_committee(self, name, url2):
return role

def _scrape_upper_committee(self, name, url2):
cat = "Assignments.asp"
url3 = "".join((url2, cat))

committee = Committee('upper', name)
committee.add_source(url2)

text = self.get(url3).text
page = lxml.html.fromstring(text)
page = self.lxmlize(url3)

members = page.xpath('//table[@id="table38"]//font/a/b')

Expand All @@ -62,139 +59,96 @@ def scrape_senate_committee(self, name, url2):

self.save_committee(committee)

def scrape_house(self):
url = "http://house.louisiana.gov/H_Reps/H_Reps_CmtesFull.asp"
comm_cache = {}
text = self.get(url).text
page = lxml.html.fromstring(text)

for row in page.xpath("//table[@bordercolorlight='#EAEAEA']/tr"):
cells = row.xpath('td')

name = cells[0].xpath('string()').strip()

if name.startswith('Vacant'):
continue

font = cells[1]
committees = []

if font is not None and font.text:
committees.append(font.text.strip())
for br in font.xpath('br'):
if br.text:
committees.append(br.text.strip())
if br.tail:
committees.append(br.tail)

for comm_name in committees:
mtype = 'member'
if comm_name.endswith(', Chairman'):
mtype = 'chairman'
comm_name = comm_name.replace(', Chairman', '')
elif comm_name.endswith(', Co-Chairmain'):
mtype = 'co-chairmain'
comm_name = comm_name.replace(', Co-Chairmain', '')
elif comm_name.endswith(', Vice Chair'):
mtype = 'vice chair'
comm_name = comm_name.replace(', Vice Chair', '')
elif comm_name.endswith(', Ex Officio'):
mtype = 'ex officio'
comm_name = comm_name.replace(', Ex Officio', '')
elif comm_name.endswith(", Interim Member"):
mtype = 'interim'
comm_name = comm_name.replace(", Interim Member", "")


if comm_name.startswith('Joint'):
chamber = 'joint'
else:
chamber = 'lower'

try:
committee = comm_cache[comm_name]
except KeyError:
if comm_name.strip() == "":
continue

committee = Committee(chamber, comm_name)
committee.add_source(url)
comm_cache[comm_name] = committee

committee.add_member(name, mtype)

special = self.scrape_house_special(comm_cache.keys())
for name, comm in special.items():
comm_cache[name] = comm

for committee in comm_cache.values():
self.save_committee(committee)

def scrape_house_special(self, scraped_committees):
url = 'http://house.louisiana.gov/H_Reps/H_Reps_SpecialCmtes.asp'
text = self.get(url).text
page = lxml.html.fromstring(text)
page.make_links_absolute('http://house.louisiana.gov')

committees = {}
for el in page.xpath("//a[contains(@href,'H_Cmtes/')]"):
comm_name = el.xpath('normalize-space(string())')
comm_name = self.normalize_committee_name(comm_name)

# skip committees that have already been scraped from
# http://house.louisiana.gov/H_Reps/H_Reps_CmtesFull.asp
if comm_name not in scraped_committees:
comm_url = el.get('href').replace('../','')

try:
text = self.get(comm_url).text
except HTTPError:
self.logger.warning("Link not working, skipping.")
continue

# check for no record found
if re.search('No records returned.', text):
self.logger.warning("No record found, skipping.")
continue

chamber = 'joint' if comm_name.startswith('Joint') else 'lower'
committee = Committee(chamber, comm_name)
committee.add_source(url)


page = lxml.html.fromstring(text)
page.make_links_absolute('http://house.louisiana.gov')

for row in page.xpath('//table[@id="table1"]//tbody/tr'):
member_info = row.xpath('./td')
mname = member_info[0].xpath('normalize-space(string())')
mtype = member_info[1].xpath('normalize-space(string())')
if mtype == 'Chairman':
mtype = 'chairman'
elif mtype == 'Co-Chairmain':
mtype = 'co-chairmain'
elif mtype == 'Vice Chair':
mtype = 'vice chair'
elif mtype == 'Ex Officio':
mtype = 'ex officio'
elif mtype == 'Interim Member':
mtype = 'interim'
else:
mtype = 'member'
committee.add_member(mname, mtype)

committees[comm_name] = committee
def _scrape_lower_standing_committee(self, committee_name, url):
    """Scrape the member roster of one House standing committee page."""
    page = self.lxmlize(url)

    committee = Committee('lower', committee_name)
    committee.add_source(url)

    member_rows = page.xpath('//table[@id="body_ListView1_itemPlaceholder'
        'Container"]/tr[@class="linkStyle2"]')

    for member_row in member_rows:
        raw_name = member_row.xpath('normalize-space(string(./td[1]/a))')
        # name_tools.split breaks the name into pieces; drop the empty
        # ones and rejoin so spacing is normalized.
        member_name = ' '.join(filter(None, name_tools.split(raw_name)))
        raw_role = member_row.xpath('normalize-space(string(./td[2]))')
        member_role = self._normalize_member_role(raw_role)

        committee.add_member(member_name, member_role)

    self.save_committee(committee)

def _scrape_lower_standing_committees(self):
    """Find every House standing committee link and scrape each roster."""
    url = 'http://house.louisiana.gov/H_Reps/H_Reps_StandCmtees.aspx'
    page = self.lxmlize(url)

    cells = page.xpath('//table[@id="table11"]/tr/td[@class='
        '"auto-style1"]')

    for cell in cells:
        # Each cell holds one committee link; the first anchor is it.
        link = cell.xpath('.//a')[0]
        committee_name = link.xpath('normalize-space(string())').strip()
        committee_url = link.get('href')

        self._scrape_lower_standing_committee(committee_name, committee_url)

def _scrape_lower_special_committees(self):
    """Scrape House special (and joint) committees from the accordion page."""
    url = 'http://house.louisiana.gov/H_Cmtes/SpecialCommittees.aspx'
    page = self.lxmlize(url)

    # Committees appear as <h3> headers inside an accordion widget, each
    # followed by a "pane" <div> holding the membership table.
    accordion = page.xpath('//table[@id="table106"]//div[@class='
        '"exBody1A"]/div[@class="accordion"]')[0]

    for header in accordion.xpath('./h3'):
        raw_name = header.xpath('string()').strip()
        committee_name = self._normalize_committee_name(raw_name)

        # Joint committees are listed on this page alongside House-only ones.
        chamber = 'joint' if committee_name.startswith('Joint') else 'lower'

        committee = Committee(chamber, committee_name)
        committee.add_source(url)

        member_rows = header.xpath('./following-sibling::div['
            '@class="pane"]//tr[@class="linkStyle2"]')

        for member_row in member_rows:
            name_text = member_row.xpath('normalize-space(string(./td[1]))')
            # Rejoin the non-empty name pieces to normalize spacing.
            member_name = ' '.join(filter(None, name_tools.split(name_text)))
            role_text = member_row.xpath('normalize-space(string(./td[2]))')
            member_role = self._normalize_member_role(role_text)

            committee.add_member(member_name, member_role)

        self.save_committee(committee)

def _scrape_upper_chamber(self):
    """Scrape Senate standing and select committee rosters."""
    committee_types = {
        'Standing': 'http://senate.la.gov/Committees/Assignments.asp?type=Standing',
        'Select': 'http://senate.la.gov/Committees/Assignments.asp?type=Select',
    }

    for type_name, listing_url in committee_types.items():
        listing = self.lxmlize(listing_url)

        # Committee links live in the shaded (#EBEAEC) table cells.
        for anchor in listing.xpath('//td[@bgcolor="#EBEAEC"]//a'):
            committee_name = anchor.xpath('string()').strip()
            committee_url = anchor.attrib['href']
            self._scrape_upper_committee(committee_name, committee_url)

def _scrape_lower_chamber(self):
    """Scrape all House committees: standing first, then special."""
    for scrape_group in (self._scrape_lower_standing_committees,
                         self._scrape_lower_special_committees):
        scrape_group()

def scrape(self, chamber, term):
    """Entry point: dispatch to the chamber-specific scraper.

    Raises NoDataForPeriod for any term other than the current one,
    since only the current term is available on the state sites.
    """
    current_term = self.metadata['terms'][-1]['name']
    if term != current_term:
        raise NoDataForPeriod(term)

    # Resolve e.g. 'upper' -> self._scrape_upper_chamber and invoke it.
    getattr(self, '_scrape_{0}_chamber'.format(chamber))()

0 comments on commit b480f1b

Please sign in to comment.