Skip to content

Commit

Permalink
LA: Overhauled committee scraper due to site change.
Browse files Browse the repository at this point in the history
  • Loading branch information
Andy Lo committed Jun 8, 2016
1 parent f1bf8f6 commit b480f1b
Showing 1 changed file with 117 additions and 163 deletions.
280 changes: 117 additions & 163 deletions openstates/la/committees.py
Original file line number Diff line number Diff line change
@@ -1,49 +1,46 @@
from billy.scrape import NoDataForPeriod
from billy.scrape.committees import CommitteeScraper, Committee

import re
import lxml.html
from scrapelib import HTTPError
import name_tools

class LACommitteeScraper(CommitteeScraper):
jurisdiction = 'la'
from billy.scrape import NoDataForPeriod
from billy.scrape.committees import CommitteeScraper, Committee
from openstates.utils import LXMLMixin

def scrape(self, chamber, term):
if term != self.metadata['terms'][-1]['name']:
raise NoDataForPeriod(term)

if chamber == 'upper':
self.scrape_senate()
else:
self.scrape_house()
class LACommitteeScraper(CommitteeScraper, LXMLMixin):
jurisdiction = 'la'

def scrape_senate(self):
committee_types = {
'Standing': 'http://senate.la.gov/Committees/Assignments.asp?type=Standing',
'Select': 'http://senate.la.gov/Committees/Assignments.asp?type=Select'
def _normalize_committee_name(self, name):
committees = {
'House Executive Cmte': 'House Executive Committee',
'Atchafalaya Basin Oversight': 'Atchafalaya Basin Program Oversight Committee',
'Homeland Security': 'House Select Committee on Homeland Security',
'Hurricane Recovery': 'Select Committee on Hurricane Recovery',
'Legislative Budgetary Control': 'Legislative Budgetary Control Council',
'Military and Veterans Affairs': 'Special Committee on Military and Veterans Affairs'
}

for name, url in committee_types.items():
text = self.get(url).text
page = lxml.html.fromstring(text)
page.make_links_absolute(url)
return committees[name] if name in committees else name

committees = page.xpath('//td[@bgcolor="#EBEAEC"]//a')

for link in committees:
name = link.xpath('string()').strip()
url2 = link.attrib['href']
self.scrape_senate_committee(name, url2)
def _normalize_member_role(self, member_role):
if member_role in ['Chairman', 'Co-Chairmain', 'Vice Chair',
'Ex Officio']:
role = member_role.lower()
elif member_role == 'Interim Member':
role = 'interim'
else:
role = 'member'

def scrape_senate_committee(self, name, url2):
return role

def _scrape_upper_committee(self, name, url2):
cat = "Assignments.asp"
url3 = "".join((url2, cat))

committee = Committee('upper', name)
committee.add_source(url2)

text = self.get(url3).text
page = lxml.html.fromstring(text)
page = self.lxmlize(url3)

members = page.xpath('//table[@id="table38"]//font/a/b')

Expand All @@ -62,139 +59,96 @@ def scrape_senate_committee(self, name, url2):

self.save_committee(committee)

def scrape_house(self):
url = "http://house.louisiana.gov/H_Reps/H_Reps_CmtesFull.asp"
comm_cache = {}
text = self.get(url).text
page = lxml.html.fromstring(text)

for row in page.xpath("//table[@bordercolorlight='#EAEAEA']/tr"):
cells = row.xpath('td')

name = cells[0].xpath('string()').strip()

if name.startswith('Vacant'):
continue

font = cells[1]
committees = []

if font is not None and font.text:
committees.append(font.text.strip())
for br in font.xpath('br'):
if br.text:
committees.append(br.text.strip())
if br.tail:
committees.append(br.tail)

for comm_name in committees:
mtype = 'member'
if comm_name.endswith(', Chairman'):
mtype = 'chairman'
comm_name = comm_name.replace(', Chairman', '')
elif comm_name.endswith(', Co-Chairmain'):
mtype = 'co-chairmain'
comm_name = comm_name.replace(', Co-Chairmain', '')
elif comm_name.endswith(', Vice Chair'):
mtype = 'vice chair'
comm_name = comm_name.replace(', Vice Chair', '')
elif comm_name.endswith(', Ex Officio'):
mtype = 'ex officio'
comm_name = comm_name.replace(', Ex Officio', '')
elif comm_name.endswith(", Interim Member"):
mtype = 'interim'
comm_name = comm_name.replace(", Interim Member", "")


if comm_name.startswith('Joint'):
chamber = 'joint'
else:
chamber = 'lower'

try:
committee = comm_cache[comm_name]
except KeyError:
if comm_name.strip() == "":
continue

committee = Committee(chamber, comm_name)
committee.add_source(url)
comm_cache[comm_name] = committee

committee.add_member(name, mtype)

special = self.scrape_house_special(comm_cache.keys())
for name, comm in special.items():
comm_cache[name] = comm

for committee in comm_cache.values():
self.save_committee(committee)

def scrape_house_special(self, scraped_committees):
url = 'http://house.louisiana.gov/H_Reps/H_Reps_SpecialCmtes.asp'
text = self.get(url).text
page = lxml.html.fromstring(text)
page.make_links_absolute('http://house.louisiana.gov')

committees = {}
for el in page.xpath("//a[contains(@href,'H_Cmtes/')]"):
comm_name = el.xpath('normalize-space(string())')
comm_name = self.normalize_committee_name(comm_name)

# skip committees that have already been scraped from
# http://house.louisiana.gov/H_Reps/H_Reps_CmtesFull.asp
if comm_name not in scraped_committees:
comm_url = el.get('href').replace('../','')

try:
text = self.get(comm_url).text
except HTTPError:
self.logger.warning("Link not working, skipping.")
continue

# check for no record found
if re.search('No records returned.', text):
self.logger.warning("No record found, skipping.")
continue

chamber = 'joint' if comm_name.startswith('Joint') else 'lower'
committee = Committee(chamber, comm_name)
committee.add_source(url)


page = lxml.html.fromstring(text)
page.make_links_absolute('http://house.louisiana.gov')

for row in page.xpath('//table[@id="table1"]//tbody/tr'):
member_info = row.xpath('./td')
mname = member_info[0].xpath('normalize-space(string())')
mtype = member_info[1].xpath('normalize-space(string())')
if mtype == 'Chairman':
mtype = 'chairman'
elif mtype == 'Co-Chairmain':
mtype = 'co-chairmain'
elif mtype == 'Vice Chair':
mtype = 'vice chair'
elif mtype == 'Ex Officio':
mtype = 'ex officio'
elif mtype == 'Interim Member':
mtype = 'interim'
else:
mtype = 'member'
committee.add_member(mname, mtype)

committees[comm_name] = committee
def _scrape_lower_standing_committee(self, committee_name, url):
    """Scrape the member roster of one House standing committee page."""
    page = self.lxmlize(url)

    committee = Committee('lower', committee_name)
    committee.add_source(url)

    member_rows = page.xpath('//table[@id="body_ListView1_itemPlaceholder'
        'Container"]/tr[@class="linkStyle2"]')

    for member_row in member_rows:
        raw_name = member_row.xpath('normalize-space(string(./td[1]/a))')
        # name_tools.split breaks the name into pieces; drop the empty
        # ones and rejoin so spacing is normalized.
        member_name = ' '.join(filter(None, name_tools.split(raw_name)))
        raw_role = member_row.xpath('normalize-space(string(./td[2]))')
        member_role = self._normalize_member_role(raw_role)

        committee.add_member(member_name, member_role)

    self.save_committee(committee)

def _scrape_lower_standing_committees(self):
    """Find every House standing committee link and scrape each roster."""
    url = 'http://house.louisiana.gov/H_Reps/H_Reps_StandCmtees.aspx'
    page = self.lxmlize(url)

    cells = page.xpath('//table[@id="table11"]/tr/td[@class='
        '"auto-style1"]')

    for cell in cells:
        # Each cell holds one committee link; the first anchor is it.
        link = cell.xpath('.//a')[0]
        committee_name = link.xpath('normalize-space(string())').strip()
        committee_url = link.get('href')

        self._scrape_lower_standing_committee(committee_name, committee_url)

def _scrape_lower_special_committees(self):
    """Scrape House special (and joint) committees from the accordion page."""
    url = 'http://house.louisiana.gov/H_Cmtes/SpecialCommittees.aspx'
    page = self.lxmlize(url)

    # Committees appear as <h3> headers inside an accordion widget, each
    # followed by a "pane" <div> holding the membership table.
    accordion = page.xpath('//table[@id="table106"]//div[@class='
        '"exBody1A"]/div[@class="accordion"]')[0]

    for header in accordion.xpath('./h3'):
        raw_name = header.xpath('string()').strip()
        committee_name = self._normalize_committee_name(raw_name)

        # Joint committees are listed on this page alongside House-only ones.
        chamber = 'joint' if committee_name.startswith('Joint') else 'lower'

        committee = Committee(chamber, committee_name)
        committee.add_source(url)

        member_rows = header.xpath('./following-sibling::div['
            '@class="pane"]//tr[@class="linkStyle2"]')

        for member_row in member_rows:
            name_text = member_row.xpath('normalize-space(string(./td[1]))')
            # Rejoin the non-empty name pieces to normalize spacing.
            member_name = ' '.join(filter(None, name_tools.split(name_text)))
            role_text = member_row.xpath('normalize-space(string(./td[2]))')
            member_role = self._normalize_member_role(role_text)

            committee.add_member(member_name, member_role)

        self.save_committee(committee)

def _scrape_upper_chamber(self):
    """Scrape Senate standing and select committee rosters."""
    committee_types = {
        'Standing': 'http://senate.la.gov/Committees/Assignments.asp?type=Standing',
        'Select': 'http://senate.la.gov/Committees/Assignments.asp?type=Select',
    }

    for type_name, listing_url in committee_types.items():
        listing = self.lxmlize(listing_url)

        # Committee links live in the shaded (#EBEAEC) table cells.
        for anchor in listing.xpath('//td[@bgcolor="#EBEAEC"]//a'):
            committee_name = anchor.xpath('string()').strip()
            committee_url = anchor.attrib['href']
            self._scrape_upper_committee(committee_name, committee_url)

def _scrape_lower_chamber(self):
    """Scrape all House committees: standing first, then special."""
    for scrape_group in (self._scrape_lower_standing_committees,
                         self._scrape_lower_special_committees):
        scrape_group()

def scrape(self, chamber, term):
    """Entry point: dispatch to the chamber-specific scraper.

    Raises NoDataForPeriod for any term other than the current one,
    since only the current term is available on the state sites.
    """
    current_term = self.metadata['terms'][-1]['name']
    if term != current_term:
        raise NoDataForPeriod(term)

    # Resolve e.g. 'upper' -> self._scrape_upper_chamber and invoke it.
    getattr(self, '_scrape_{0}_chamber'.format(chamber))()

0 comments on commit b480f1b

Please sign in to comment.