Skip to content

Commit

Permalink
move initialization out of run() into __init__() so that it is easier to create and run scrapers outside of command line context
Browse files Browse the repository at this point in the history
  • Loading branch information
mikejs committed Jan 4, 2010
1 parent 518a9eb commit c5188bf
Show file tree
Hide file tree
Showing 22 changed files with 56 additions and 42 deletions.
2 changes: 1 addition & 1 deletion scripts/ak/get_legislation.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,4 +191,4 @@ def scrape_bills(self, chamber, year):
self.scrape_session(chamber, year)

if __name__ == '__main__':
AKLegislationScraper().run()
AKLegislationScraper.run()
2 changes: 1 addition & 1 deletion scripts/al/get_legislation.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,4 +211,4 @@ def unescape(self,s):
return re.sub('&(%s);' % '|'.join(name2codepoint), lambda m: unichr(name2codepoint[m.group(1)]), s)

if __name__ == '__main__':
ALLegislationScraper().run()
ALLegislationScraper.run()
2 changes: 1 addition & 1 deletion scripts/ct/get_legislation.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,4 +319,4 @@ def get_baby(soup):
pass

if __name__ == '__main__':
CTLegislationScraper().run()
CTLegislationScraper.run()
2 changes: 1 addition & 1 deletion scripts/fl/get_legislation.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,4 +219,4 @@ def split_name(self, full):
return (last, first, middle)

if __name__ == '__main__':
FLLegislationScraper().run()
FLLegislationScraper.run()
2 changes: 1 addition & 1 deletion scripts/ga/get_legislation.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,4 +350,4 @@ def scrape2009(self, url, year, chamberName, session, number):
self.add_bill(bill)

if __name__ == '__main__':
GALegislationScraper().run()
GALegislationScraper.run()
2 changes: 1 addition & 1 deletion scripts/ky/get_legislation.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,4 +187,4 @@ def parse_legislator(self, chamber, year, full_name, district, url):
self.add_legislator(legislator)

if __name__ == '__main__':
KYLegislationScraper().run()
KYLegislationScraper.run()
2 changes: 1 addition & 1 deletion scripts/la/get_legislation.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,4 +254,4 @@ def unescape(self,s):
return re.sub('&(%s);' % '|'.join(name2codepoint), lambda m: unichr(name2codepoint[m.group(1)]), s).encode('ascii', 'ignore')

if __name__ == '__main__':
LouisianaScraper().run({'upper': LANameMatcher, 'lower': LANameMatcher})
LouisianaScraper.run({'upper': LANameMatcher, 'lower': LANameMatcher})
2 changes: 1 addition & 1 deletion scripts/me/get_legislation.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,4 +50,4 @@ def scrape_bills(self, chamber, year):
self.scrape_session(chamber, year, special)

if __name__ == '__main__':
MELegislationScraper().run()
MELegislationScraper.run()
2 changes: 1 addition & 1 deletion scripts/mn/get_legislation.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,4 +286,4 @@ def scrape_bills(self, chamber, year):
self.scrape_session(chamber, session, session_year, session_number, legislative_session)

if __name__ == '__main__':
MNLegislationScraper().run()
MNLegislationScraper.run()
2 changes: 1 addition & 1 deletion scripts/mo/get_legislation.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,4 +339,4 @@ def parse_house_cosponsors(self, bill, cell):
raise e

if __name__ == '__main__':
MOLegislationScraper().run()
MOLegislationScraper.run()
2 changes: 1 addition & 1 deletion scripts/nc/get_legislation.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,4 +196,4 @@ def scrape_legislators(self, chamber, year):
self.add_legislator(legislator)

if __name__ == '__main__':
NCLegislationScraper().run()
NCLegislationScraper.run()
2 changes: 1 addition & 1 deletion scripts/nd/get_legislation.py
Original file line number Diff line number Diff line change
Expand Up @@ -537,4 +537,4 @@ def scrape_bill_sponsors(self, assembly_url):
return (bill_sponsors, url)

if __name__ == '__main__':
NDLegislationScraper().run()
NDLegislationScraper.run()
2 changes: 1 addition & 1 deletion scripts/nh/get_legislation.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,4 +114,4 @@ def scrape_bills(self, chamber, year):


if __name__ == '__main__':
NHLegislationScraper().run()
NHLegislationScraper.run()
2 changes: 1 addition & 1 deletion scripts/pa/get_legislation.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,4 +283,4 @@ def scrape_legislators(self, chamber, year):
self.add_legislator(legislator)

if __name__ == '__main__':
PALegislationScraper().run()
PALegislationScraper.run()
2 changes: 1 addition & 1 deletion scripts/python_template/get_legislation.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,4 @@ def scrape_bills(self,chamber,year):
pass

if __name__ == '__main__':
MyScraper().run()
MyScraper.run()
56 changes: 35 additions & 21 deletions scripts/pyutils/legislation.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,20 +83,29 @@ class LegislationScraper(object):
# state's logs):
user_agent = 'robot: http://fiftystates-dev.sunlightlabs.com/'

def __init__(self):
def __init__(self, verbosity=logging.INFO, sleep=False,
no_cache=False, output_dir=None, **kwargs):
if not hasattr(self, 'state'):
raise Exception('LegislationScrapers must have a state attribute')
self._cookie_jar = cookielib.CookieJar()

self.reset_name_matchers()

self.sleep = sleep
self.no_cache = no_cache
self.requests = 0

self.output_dir = output_dir or os.path.join('data', self.state)
self.cache_dir = os.path.join('cache', self.state)
self.output_dir = os.path.join('data', self.state)
self.error_dir = os.path.join('errors', self.state)
self._init_dirs()

self.logger = logging.getLogger("fiftystates")
formatter = logging.Formatter("%(asctime)s %(levelname)s " + self.state + " %(message)s")
console = logging.StreamHandler()
console.setFormatter(formatter)
self.logger.addHandler(console)
self.logger.setLevel(verbosity)

# Convenience methods
self.log = self.logger.info
Expand Down Expand Up @@ -201,7 +210,7 @@ def soup_context(self, url):
def _make_headers(self):
return {'User-Agent': self.user_agent}

def init_dirs(self):
def _init_dirs(self):

def makedir(path):
try:
Expand Down Expand Up @@ -303,27 +312,31 @@ def write_metadata(self):
'w') as f:
json.dump(metadata, f, cls=DateEncoder)

def run(self, matcher=None):
def reset_name_matchers(self, upper=None, lower=None):
self.matcher = {}
self.matcher['upper'] = upper or NameMatcher()
self.matcher['lower'] = lower or NameMatcher()

@classmethod
def run(cls, matcher=None):
"""
Create and run a scraper for this state, based on
command line options.
"""
parser = OptionParser(
option_list=self.option_list)
option_list=cls.option_list)
options, spares = parser.parse_args()
self.no_cache = options.no_cache
self.sleep = options.sleep
self.requests = 0

if options.verbose == 0:
level = logging.WARNING
verbosity = logging.WARNING
elif options.verbose == 1:
level = logging.INFO
verbosity = logging.INFO
else:
level = logging.DEBUG
self.logger.setLevel(level)
verbosity = logging.DEBUG

if options.output_dir:
self.output_dir = options.output_dir
scraper = cls(verbosity=verbosity, **vars(options))

self.init_dirs()
self.write_metadata()
scraper.write_metadata()

years = options.years
if options.all_years:
Expand All @@ -341,15 +354,16 @@ def run(self, matcher=None):
chambers = ['upper', 'lower']
for year in years:
if matcher is None:
self.matcher = {'upper': NameMatcher(), 'lower': NameMatcher()}
scraper.reset_name_matchers()
else:
self.matcher = {'upper': matcher['upper'](), 'lower': matcher['lower']()}
scraper.reset_name_matchers(upper=matcher['upper'](),
lower=matcher['lower']())
try:
for chamber in chambers:
self.scrape_legislators(chamber, year)
scraper.scrape_legislators(chamber, year)
for chamber in chambers:
self.old_bills = {}
self.scrape_bills(chamber, year)
scraper.old_bills = {}
scraper.scrape_bills(chamber, year)
except NoDataForYear, e:
if options.all_years:
pass
Expand Down
2 changes: 1 addition & 1 deletion scripts/sd/get_legislation.py
Original file line number Diff line number Diff line change
Expand Up @@ -441,4 +441,4 @@ def scrape_legislators(self, chamber, year):
self.scrape_old_legislators(chamber, year)

if __name__ == '__main__':
SDLegislationScraper().run()
SDLegislationScraper.run()
2 changes: 1 addition & 1 deletion scripts/tx/get_legislation.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,4 +122,4 @@ def scrape_bills(self, chamber, year):
self.scrape_session(chamber, session)

if __name__ == '__main__':
TXLegislationScraper().run()
TXLegislationScraper.run()
2 changes: 1 addition & 1 deletion scripts/ut/get_legislation.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,4 +226,4 @@ def scrape_bills(self, chamber, year):
self.scrape_session(chamber, sub_session)

if __name__ == '__main__':
UTLegislationScraper().run()
UTLegislationScraper.run()
2 changes: 1 addition & 1 deletion scripts/va/get_legislation.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,4 +307,4 @@ def unescape(self,s):
return s.replace(' ', ' ')

if __name__ == '__main__':
VALegislationScraper().run({'upper': VANameMatcher, 'lower': VANameMatcher})
VALegislationScraper.run(matcher={'upper': VANameMatcher, 'lower': VANameMatcher})
2 changes: 1 addition & 1 deletion scripts/vt/get_legislation.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,4 +302,4 @@ def scrape_legislators(self, chamber, year):
self.add_legislator(leg)

if __name__ == '__main__':
VTLegislationScraper().run()
VTLegislationScraper.run()
2 changes: 1 addition & 1 deletion scripts/wv/get_legislation.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,4 +156,4 @@ def scrape_bill(self, chamber, session, billid, histurl, year):
self.add_bill(bill)

if __name__ == '__main__':
WVLegislationScraper().run()
WVLegislationScraper.run()

0 comments on commit c5188bf

Please sign in to comment.