From 6a599fef2febaf4322a2950ea7418b4b8df36d3c Mon Sep 17 00:00:00 2001 From: Tim Showers Date: Thu, 15 Dec 2016 10:25:16 -0500 Subject: [PATCH] Updated SC bills to scrape prefiles --- openstates/sc/bills.py | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/openstates/sc/bills.py b/openstates/sc/bills.py index 9cb3083538..eef2671500 100644 --- a/openstates/sc/bills.py +++ b/openstates/sc/bills.py @@ -63,9 +63,11 @@ class SCBillScraper(BillScraper): urls = { 'lower' : { 'daily-bill-index': "http://www.scstatehouse.gov/hintro/hintros.php", + 'prefile-index': "http://www.scstatehouse.gov/sessphp/prefil17.php", }, 'upper' : { 'daily-bill-index': "http://www.scstatehouse.gov/sintro/sintros.php", + 'prefile-index': "http://www.scstatehouse.gov/sessphp/prefil17.php", } } @@ -252,7 +254,7 @@ def scrape_details(self, bill_detail_url, session, chamber, bill_id): def scrape(self, chamber, session): # start with subjects session_code = self.metadata['session_details'][session]['_code'] - self.scrape_subjects(session_code) + #self.scrape_subjects(session_code) # get bill index index_url = self.urls[chamber]['daily-bill-index'] @@ -278,3 +280,30 @@ def scrape(self, chamber, session): if bill_id.startswith(chamber_letter): self.scrape_details(bill_a.get('href'), session, chamber, bill_id) + + prefile_url = self.urls[chamber]['prefile-index'] + page = self.get(prefile_url).text + doc = lxml.html.fromstring(page) + doc.make_links_absolute(prefile_url) + + # visit each day and extract bill ids + days = '' + if chamber == 'lower': + days = doc.xpath('//dd[contains(text(),"House")]/a/@href') + else: + days = doc.xpath('//dd[contains(text(),"Senate")]/a/@href') + + for day_url in days: + try: + data = self.get(day_url).text + except scrapelib.HTTPError: + continue + + doc = lxml.html.fromstring(data) + doc.make_links_absolute(day_url) + + for bill_a in doc.xpath('//p/a[1]'): + bill_id = bill_a.text.replace('.', '') + if bill_id.startswith(chamber_letter): + self.scrape_details(bill_a.get('href'), session, chamber, + bill_id) \ No newline at end of file