diff --git a/scripts/ak/get_legislation.py b/scripts/ak/get_legislation.py index f026b93ef6..7d449fe403 100755 --- a/scripts/ak/get_legislation.py +++ b/scripts/ak/get_legislation.py @@ -191,4 +191,4 @@ def scrape_bills(self, chamber, year): self.scrape_session(chamber, year) if __name__ == '__main__': - AKLegislationScraper().run() + AKLegislationScraper.run() diff --git a/scripts/al/get_legislation.py b/scripts/al/get_legislation.py index c38effceb9..ee815a31e8 100755 --- a/scripts/al/get_legislation.py +++ b/scripts/al/get_legislation.py @@ -211,4 +211,4 @@ def unescape(self,s): return re.sub('&(%s);' % '|'.join(name2codepoint), lambda m: unichr(name2codepoint[m.group(1)]), s) if __name__ == '__main__': - ALLegislationScraper().run() + ALLegislationScraper.run() diff --git a/scripts/ct/get_legislation.py b/scripts/ct/get_legislation.py index 1db83af9ea..af86f77e56 100755 --- a/scripts/ct/get_legislation.py +++ b/scripts/ct/get_legislation.py @@ -319,4 +319,4 @@ def get_baby(soup): pass if __name__ == '__main__': - CTLegislationScraper().run() + CTLegislationScraper.run() diff --git a/scripts/fl/get_legislation.py b/scripts/fl/get_legislation.py index f33bf39678..8334e7ee6e 100755 --- a/scripts/fl/get_legislation.py +++ b/scripts/fl/get_legislation.py @@ -219,4 +219,4 @@ def split_name(self, full): return (last, first, middle) if __name__ == '__main__': - FLLegislationScraper().run() + FLLegislationScraper.run() diff --git a/scripts/ga/get_legislation.py b/scripts/ga/get_legislation.py index 47723c1ec5..efa0df749e 100755 --- a/scripts/ga/get_legislation.py +++ b/scripts/ga/get_legislation.py @@ -350,4 +350,4 @@ def scrape2009(self, url, year, chamberName, session, number): self.add_bill(bill) if __name__ == '__main__': - GALegislationScraper().run() + GALegislationScraper.run() diff --git a/scripts/ky/get_legislation.py b/scripts/ky/get_legislation.py index 52908217ae..cbd12b6823 100755 --- a/scripts/ky/get_legislation.py +++ b/scripts/ky/get_legislation.py @@ -187,4 +187,4 @@ def parse_legislator(self, chamber, year, full_name, district, url): self.add_legislator(legislator) if __name__ == '__main__': - KYLegislationScraper().run() + KYLegislationScraper.run() diff --git a/scripts/la/get_legislation.py b/scripts/la/get_legislation.py index 5d44e5617f..673eb8fe91 100755 --- a/scripts/la/get_legislation.py +++ b/scripts/la/get_legislation.py @@ -254,4 +254,4 @@ def unescape(self,s): return re.sub('&(%s);' % '|'.join(name2codepoint), lambda m: unichr(name2codepoint[m.group(1)]), s).encode('ascii', 'ignore') if __name__ == '__main__': - LouisianaScraper().run({'upper': LANameMatcher, 'lower': LANameMatcher}) + LouisianaScraper.run({'upper': LANameMatcher, 'lower': LANameMatcher}) diff --git a/scripts/me/get_legislation.py b/scripts/me/get_legislation.py index 1252111daf..617fe4d9f4 100755 --- a/scripts/me/get_legislation.py +++ b/scripts/me/get_legislation.py @@ -50,4 +50,4 @@ def scrape_bills(self, chamber, year): self.scrape_session(chamber, year, special) if __name__ == '__main__': - MELegislationScraper().run() + MELegislationScraper.run() diff --git a/scripts/mn/get_legislation.py b/scripts/mn/get_legislation.py index 34860edc9b..cb12ecaac0 100755 --- a/scripts/mn/get_legislation.py +++ b/scripts/mn/get_legislation.py @@ -286,4 +286,4 @@ def scrape_bills(self, chamber, year): self.scrape_session(chamber, session, session_year, session_number, legislative_session) if __name__ == '__main__': - MNLegislationScraper().run() + MNLegislationScraper.run() diff --git a/scripts/mo/get_legislation.py b/scripts/mo/get_legislation.py index d4fa9352c5..13c5545e48 100755 --- a/scripts/mo/get_legislation.py +++ b/scripts/mo/get_legislation.py @@ -339,4 +339,4 @@ def parse_house_cosponsors(self, bill, cell): raise e if __name__ == '__main__': - MOLegislationScraper().run() + MOLegislationScraper.run() diff --git a/scripts/nc/get_legislation.py b/scripts/nc/get_legislation.py index 955ae308fe..033f8fba28 100755 --- a/scripts/nc/get_legislation.py +++ b/scripts/nc/get_legislation.py @@ -196,4 +196,4 @@ def scrape_legislators(self, chamber, year): self.add_legislator(legislator) if __name__ == '__main__': - NCLegislationScraper().run() + NCLegislationScraper.run() diff --git a/scripts/nd/get_legislation.py b/scripts/nd/get_legislation.py index cf9d728523..601e3f91e4 100755 --- a/scripts/nd/get_legislation.py +++ b/scripts/nd/get_legislation.py @@ -537,4 +537,4 @@ def scrape_bill_sponsors(self, assembly_url): return (bill_sponsors, url) if __name__ == '__main__': - NDLegislationScraper().run() + NDLegislationScraper.run() diff --git a/scripts/nh/get_legislation.py b/scripts/nh/get_legislation.py index 32e52a5cb8..8188e656a8 100755 --- a/scripts/nh/get_legislation.py +++ b/scripts/nh/get_legislation.py @@ -114,4 +114,4 @@ def scrape_bills(self, chamber, year): if __name__ == '__main__': - NHLegislationScraper().run() + NHLegislationScraper.run() diff --git a/scripts/pa/get_legislation.py b/scripts/pa/get_legislation.py index 06da98d522..bbc0c428c2 100755 --- a/scripts/pa/get_legislation.py +++ b/scripts/pa/get_legislation.py @@ -283,4 +283,4 @@ def scrape_legislators(self, chamber, year): self.add_legislator(legislator) if __name__ == '__main__': - PALegislationScraper().run() + PALegislationScraper.run() diff --git a/scripts/python_template/get_legislation.py b/scripts/python_template/get_legislation.py index 240e43c8bf..070f808809 100644 --- a/scripts/python_template/get_legislation.py +++ b/scripts/python_template/get_legislation.py @@ -14,4 +14,4 @@ def scrape_bills(self,chamber,year): pass if __name__ == '__main__': - MyScraper().run() + MyScraper.run() diff --git a/scripts/pyutils/legislation.py b/scripts/pyutils/legislation.py index f6f99a0bed..cdaa96bea6 100755 --- a/scripts/pyutils/legislation.py +++ b/scripts/pyutils/legislation.py @@ -83,20 +83,29 @@ class LegislationScraper(object): # state's logs): user_agent = 'robot: http://fiftystates-dev.sunlightlabs.com/' - def __init__(self): + def __init__(self, verbosity=logging.INFO, sleep=False, + no_cache=False, output_dir=None, **kwargs): if not hasattr(self, 'state'): raise Exception('LegislationScrapers must have a state attribute') self._cookie_jar = cookielib.CookieJar() + self.reset_name_matchers() + + self.sleep = sleep + self.no_cache = no_cache + self.requests = 0 + + self.output_dir = output_dir or os.path.join('data', self.state) self.cache_dir = os.path.join('cache', self.state) - self.output_dir = os.path.join('data', self.state) self.error_dir = os.path.join('errors', self.state) + self._init_dirs() self.logger = logging.getLogger("fiftystates") formatter = logging.Formatter("%(asctime)s %(levelname)s " + self.state + " %(message)s") console = logging.StreamHandler() console.setFormatter(formatter) self.logger.addHandler(console) + self.logger.setLevel(verbosity) # Convenience methods self.log = self.logger.info @@ -201,7 +210,7 @@ def soup_context(self, url): def _make_headers(self): return {'User-Agent': self.user_agent} - def init_dirs(self): + def _init_dirs(self): def makedir(path): try: @@ -303,27 +312,31 @@ def write_metadata(self): 'w') as f: json.dump(metadata, f, cls=DateEncoder) - def run(self, matcher=None): + def reset_name_matchers(self, upper=None, lower=None): + self.matcher = {} + self.matcher['upper'] = upper or NameMatcher() + self.matcher['lower'] = lower or NameMatcher() + + @classmethod + def run(cls, matcher=None): + """ + Create and run a scraper for this state, based on + command line options. + """ parser = OptionParser( - option_list=self.option_list) + option_list=cls.option_list) options, spares = parser.parse_args() - self.no_cache = options.no_cache - self.sleep = options.sleep - self.requests = 0 if options.verbose == 0: - level = logging.WARNING + verbosity = logging.WARNING elif options.verbose == 1: - level = logging.INFO + verbosity = logging.INFO else: - level = logging.DEBUG - self.logger.setLevel(level) + verbosity = logging.DEBUG - if options.output_dir: - self.output_dir = options.output_dir + scraper = cls(verbosity=verbosity, **vars(options)) - self.init_dirs() - self.write_metadata() + scraper.write_metadata() years = options.years if options.all_years: @@ -341,15 +354,16 @@ def run(self, matcher=None): chambers = ['upper', 'lower'] for year in years: if matcher is None: - self.matcher = {'upper': NameMatcher(), 'lower': NameMatcher()} + scraper.reset_name_matchers() else: - self.matcher = {'upper': matcher['upper'](), 'lower': matcher['lower']()} + scraper.reset_name_matchers(upper=matcher['upper'](), + lower=matcher['lower']()) try: for chamber in chambers: - self.scrape_legislators(chamber, year) + scraper.scrape_legislators(chamber, year) for chamber in chambers: - self.old_bills = {} - self.scrape_bills(chamber, year) + scraper.old_bills = {} + scraper.scrape_bills(chamber, year) except NoDataForYear, e: if options.all_years: pass diff --git a/scripts/sd/get_legislation.py b/scripts/sd/get_legislation.py index bd6288b35f..423584a343 100755 --- a/scripts/sd/get_legislation.py +++ b/scripts/sd/get_legislation.py @@ -441,4 +441,4 @@ def scrape_legislators(self, chamber, year): self.scrape_old_legislators(chamber, year) if __name__ == '__main__': - SDLegislationScraper().run() + SDLegislationScraper.run() diff --git a/scripts/tx/get_legislation.py b/scripts/tx/get_legislation.py index 4a4acb4f3b..e8ef5be3d3 100755 --- a/scripts/tx/get_legislation.py +++ b/scripts/tx/get_legislation.py @@ -122,4 +122,4 @@ def scrape_bills(self, chamber, year): self.scrape_session(chamber, session) if __name__ == '__main__': - TXLegislationScraper().run() + TXLegislationScraper.run() diff --git a/scripts/ut/get_legislation.py b/scripts/ut/get_legislation.py index d5cc7ef08c..977a35a609 100755 --- a/scripts/ut/get_legislation.py +++ b/scripts/ut/get_legislation.py @@ -226,4 +226,4 @@ def scrape_bills(self, chamber, year): self.scrape_session(chamber, sub_session) if __name__ == '__main__': - UTLegislationScraper().run() + UTLegislationScraper.run() diff --git a/scripts/va/get_legislation.py b/scripts/va/get_legislation.py index d8a0aa5859..c9802d2476 100755 --- a/scripts/va/get_legislation.py +++ b/scripts/va/get_legislation.py @@ -307,4 +307,4 @@ def unescape(self,s): return s.replace(' ', ' ') if __name__ == '__main__': - VALegislationScraper().run({'upper': VANameMatcher, 'lower': VANameMatcher}) + VALegislationScraper.run(matcher={'upper': VANameMatcher, 'lower': VANameMatcher}) diff --git a/scripts/vt/get_legislation.py b/scripts/vt/get_legislation.py index 17b9ffca5d..e1f6000ac1 100755 --- a/scripts/vt/get_legislation.py +++ b/scripts/vt/get_legislation.py @@ -302,4 +302,4 @@ def scrape_legislators(self, chamber, year): self.add_legislator(leg) if __name__ == '__main__': - VTLegislationScraper().run() + VTLegislationScraper.run() diff --git a/scripts/wv/get_legislation.py b/scripts/wv/get_legislation.py index 0c79d26218..0921cfe546 100755 --- a/scripts/wv/get_legislation.py +++ b/scripts/wv/get_legislation.py @@ -156,4 +156,4 @@ def scrape_bill(self, chamber, session, billid, histurl, year): self.add_bill(bill) if __name__ == '__main__': - WVLegislationScraper().run() + WVLegislationScraper.run()