diff --git a/openstates/la/committees.py b/openstates/la/committees.py index 06ca74500f..281ede0bce 100644 --- a/openstates/la/committees.py +++ b/openstates/la/committees.py @@ -1,49 +1,46 @@ -from billy.scrape import NoDataForPeriod -from billy.scrape.committees import CommitteeScraper, Committee - import re import lxml.html from scrapelib import HTTPError +import name_tools -class LACommitteeScraper(CommitteeScraper): - jurisdiction = 'la' +from billy.scrape import NoDataForPeriod +from billy.scrape.committees import CommitteeScraper, Committee +from openstates.utils import LXMLMixin - def scrape(self, chamber, term): - if term != self.metadata['terms'][-1]['name']: - raise NoDataForPeriod(term) - if chamber == 'upper': - self.scrape_senate() - else: - self.scrape_house() +class LACommitteeScraper(CommitteeScraper, LXMLMixin): + jurisdiction = 'la' - def scrape_senate(self): - committee_types = { - 'Standing': 'http://senate.la.gov/Committees/Assignments.asp?type=Standing', - 'Select': 'http://senate.la.gov/Committees/Assignments.asp?type=Select' + def _normalize_committee_name(self, name): + committees = { + 'House Executive Cmte': 'House Executive Committee', + 'Atchafalaya Basin Oversight': 'Atchafalaya Basin Program Oversight Committee', + 'Homeland Security': 'House Select Committee on Homeland Security', + 'Hurricane Recovery': 'Select Committee on Hurricane Recovery', + 'Legislative Budgetary Control': 'Legislative Budgetary Control Council', + 'Military and Veterans Affairs': 'Special Committee on Military and Veterans Affairs' } - - for name, url in committee_types.items(): - text = self.get(url).text - page = lxml.html.fromstring(text) - page.make_links_absolute(url) + return committees[name] if name in committees else name - committees = page.xpath('//td[@bgcolor="#EBEAEC"]//a') - - for link in committees: - name = link.xpath('string()').strip() - url2 = link.attrib['href'] - self.scrape_senate_committee(name, url2) + def _normalize_member_role(self, member_role): + if member_role in ['Chairman', 'Co-Chairmain', 'Vice Chair', + 'Ex Officio']: + role = member_role.lower() + elif member_role == 'Interim Member': + role = 'interim' + else: + role = 'member' - def scrape_senate_committee(self, name, url2): + return role + + def _scrape_upper_committee(self, name, url2): cat = "Assignments.asp" url3 = "".join((url2, cat)) committee = Committee('upper', name) committee.add_source(url2) - text = self.get(url3).text - page = lxml.html.fromstring(text) + page = self.lxmlize(url3) members = page.xpath('//table[@id="table38"]//font/a/b') @@ -62,139 +59,96 @@ def scrape_senate_committee(self, name, url2): self.save_committee(committee) - def scrape_house(self): - url = "http://house.louisiana.gov/H_Reps/H_Reps_CmtesFull.asp" - comm_cache = {} - text = self.get(url).text - page = lxml.html.fromstring(text) - - for row in page.xpath("//table[@bordercolorlight='#EAEAEA']/tr"): - cells = row.xpath('td') - - name = cells[0].xpath('string()').strip() - - if name.startswith('Vacant'): - continue - - font = cells[1] - committees = [] - - if font is not None and font.text: - committees.append(font.text.strip()) - for br in font.xpath('br'): - if br.text: - committees.append(br.text.strip()) - if br.tail: - committees.append(br.tail) - - for comm_name in committees: - mtype = 'member' - if comm_name.endswith(', Chairman'): - mtype = 'chairman' - comm_name = comm_name.replace(', Chairman', '') - elif comm_name.endswith(', Co-Chairmain'): - mtype = 'co-chairmain' - comm_name = comm_name.replace(', Co-Chairmain', '') - elif comm_name.endswith(', Vice Chair'): - mtype = 'vice chair' - comm_name = comm_name.replace(', Vice Chair', '') - elif comm_name.endswith(', Ex Officio'): - mtype = 'ex officio' - comm_name = comm_name.replace(', Ex Officio', '') - elif comm_name.endswith(", Interim Member"): - mtype = 'interim' - comm_name = comm_name.replace(", Interim Member", "") - - - if comm_name.startswith('Joint'): - chamber = 'joint' - else: - chamber = 'lower' - - try: - committee = comm_cache[comm_name] - except KeyError: - if comm_name.strip() == "": - continue - - committee = Committee(chamber, comm_name) - committee.add_source(url) - comm_cache[comm_name] = committee - - committee.add_member(name, mtype) - - special = self.scrape_house_special(comm_cache.keys()) - for name, comm in special.items(): - comm_cache[name] = comm - - for committee in comm_cache.values(): - self.save_committee(committee) - - def scrape_house_special(self, scraped_committees): - url = 'http://house.louisiana.gov/H_Reps/H_Reps_SpecialCmtes.asp' - text = self.get(url).text - page = lxml.html.fromstring(text) - page.make_links_absolute('http://house.louisiana.gov') - - committees = {} - for el in page.xpath("//a[contains(@href,'H_Cmtes/')]"): - comm_name = el.xpath('normalize-space(string())') - comm_name = self.normalize_committee_name(comm_name) - - # skip committees that have already been scraped from - # http://house.louisiana.gov/H_Reps/H_Reps_CmtesFull.asp - if comm_name not in scraped_committees: - comm_url = el.get('href').replace('../','') - - try: - text = self.get(comm_url).text - except HTTPError: - self.logger.warning("Link not working, skipping.") - continue - - # check for no record found - if re.search('No records returned.', text): - self.logger.warning("No record found, skipping.") - continue - - chamber = 'joint' if comm_name.startswith('Joint') else 'lower' - committee = Committee(chamber, comm_name) - committee.add_source(url) - - - page = lxml.html.fromstring(text) - page.make_links_absolute('http://house.louisiana.gov') - - for row in page.xpath('//table[@id="table1"]//tbody/tr'): - member_info = row.xpath('./td') - mname = member_info[0].xpath('normalize-space(string())') - mtype = member_info[1].xpath('normalize-space(string())') - if mtype == 'Chairman': - mtype = 'chairman' - elif mtype == 'Co-Chairmain': - mtype = 'co-chairmain' - elif mtype == 'Vice Chair': - mtype = 'vice chair' - elif mtype == 'Ex Officio': - mtype = 'ex officio' - elif mtype == 'Interim Member': - mtype = 'interim' - else: - mtype = 'member' - committee.add_member(mname, mtype) - - committees[comm_name] = committee + def _scrape_lower_standing_committee(self, committee_name, url): + page = self.lxmlize(url) + + committee = Committee('lower', committee_name) + committee.add_source(url) + + rows = page.xpath('//table[@id="body_ListView1_itemPlaceholder' + 'Container"]/tr[@class="linkStyle2"]') + + for row in rows: + member_name = row.xpath('normalize-space(string(./td[1]/a))') + member_name = ' '.join(filter(None, name_tools.split(member_name))) + member_role = row.xpath('normalize-space(string(./td[2]))') + + member_role = self._normalize_member_role(member_role) + + committee.add_member(member_name, member_role) + + self.save_committee(committee) + + def _scrape_lower_standing_committees(self): + url = 'http://house.louisiana.gov/H_Reps/H_Reps_StandCmtees.aspx' + page = self.lxmlize(url) - return committees + committee_cells = page.xpath('//table[@id="table11"]/tr/td[@class=' + '"auto-style1"]') + + for committee_cell in committee_cells: + committee_link = committee_cell.xpath('.//a')[0] + + committee_url = committee_link.get('href') + committee_name = committee_link.xpath('normalize-space(string())').strip() + + self._scrape_lower_standing_committee(committee_name, + committee_url) + + def _scrape_lower_special_committees(self): + url = 'http://house.louisiana.gov/H_Cmtes/SpecialCommittees.aspx' + page = self.lxmlize(url) - - def normalize_committee_name(self, name): - committees = { - 'House Executive Cmte': 'House Executive Committee', - 'Atchafalaya Basin Oversight': 'Atchafalaya Basin Program Oversight Committee', - 'Homeland Security': 'House Select Committee on Homeland Security', - 'Hurricane Recovery': 'Select Committee on Hurricane Recovery', - 'Legislative Budgetary Control': 'Legislative Budgetary Control Council', - 'Military and Veterans Affairs': 'Special Committee on Military and Veterans Affairs' + committee_list = page.xpath('//table[@id="table106"]//div[@class=' + '"exBody1A"]/div[@class="accordion"]')[0] + headers = committee_list.xpath('./h3') + + for header in headers: + committee_name_text = header.xpath('string()') + committee_name = committee_name_text.strip() + committee_name = self._normalize_committee_name(committee_name) + + chamber = 'joint' if committee_name.startswith('Joint') else 'lower' + + committee = Committee(chamber, committee_name) + committee.add_source(url) + + committee_memberlist = header.xpath('./following-sibling::div[' + '@class="pane"]//tr[@class="linkStyle2"]') + + for row in committee_memberlist: + member_name = row.xpath('normalize-space(string(./td[1]))') + member_name = ' '.join(filter(None, name_tools.split(member_name))) + member_role = row.xpath('normalize-space(string(./td[2]))') + + member_role = self._normalize_member_role(member_role) + + committee.add_member(member_name, member_role) + + self.save_committee(committee) + + def _scrape_upper_chamber(self): + committee_types = { + 'Standing': 'http://senate.la.gov/Committees/Assignments.asp?type=Standing', + 'Select': 'http://senate.la.gov/Committees/Assignments.asp?type=Select' } - return committees[name] if name in committees else name + + for name, url in committee_types.items(): + page = self.lxmlize(url) + + committees = page.xpath('//td[@bgcolor="#EBEAEC"]//a') + + for link in committees: + name = link.xpath('string()').strip() + url2 = link.attrib['href'] + self._scrape_upper_committee(name, url2) + + def _scrape_lower_chamber(self): + self._scrape_lower_standing_committees() + self._scrape_lower_special_committees() + + def scrape(self, chamber, term): + if term != self.metadata['terms'][-1]['name']: + raise NoDataForPeriod(term) + + getattr(self, '_scrape_' + chamber + '_chamber')()