From 419761c259d90c15cdeae494411283c99f638bd6 Mon Sep 17 00:00:00 2001 From: Christophe Vandeplas Date: Mon, 15 Oct 2012 13:58:05 +0200 Subject: [PATCH] fixes bugs loads proxies and user agents from files --- .gitignore | 1 + README.md | 3 +- proxies.txt | 2 + pystemon.py | 133 +++++++++++----- pystemon.yaml | 411 +----------------------------------------------- user-agents.txt | 401 ++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 501 insertions(+), 450 deletions(-) create mode 100644 proxies.txt create mode 100644 user-agents.txt diff --git a/.gitignore b/.gitignore index 80fc7f9..a83dc2a 100644 --- a/.gitignore +++ b/.gitignore @@ -24,3 +24,4 @@ pip-log.txt /.project /.pydevproject /archive +/alerts diff --git a/README.md b/README.md index a222372..08fd860 100644 --- a/README.md +++ b/README.md @@ -22,4 +22,5 @@ Python Dependencies Default configuration file: /etc/pystemon.yaml or pystemon.yaml in current directory Limitations: -- Only HTTP proxies are allowed \ No newline at end of file +- Only HTTP proxies are allowed +- Only HTTP urls will use proxies \ No newline at end of file diff --git a/proxies.txt b/proxies.txt new file mode 100644 index 0000000..ce36ea6 --- /dev/null +++ b/proxies.txt @@ -0,0 +1,2 @@ +http://127.0.0.1:8080 +http://127.0.0.1:9000 \ No newline at end of file diff --git a/pystemon.py b/pystemon.py index 5f04b79..0f875fc 100755 --- a/pystemon.py +++ b/pystemon.py @@ -69,9 +69,11 @@ def run(self): logger.info("Downloading pasties from {name}. Next download scheduled in {time} seconds".format(name=self.name, time=sleep_time)) # get the list of last pasties, but reverse it so we first have the old # entries and then the new ones - for pastie in reversed(self.getLastPasties()): - queues[self.name].put(pastie) # add pastie to queue - time.sleep(sleep_time) + last_pasties = self.getLastPasties() + if last_pasties: + for pastie in reversed(last_pasties): + queues[self.name].put(pastie) # add pastie to queue + time.sleep(sleep_time) def getLastPasties(self): # reset the pasties list @@ -96,7 +98,7 @@ def getLastPasties(self): pasties.append(pastie) logger.debug("Found {amount} pasties for site {site}".format(amount=len(pasties_ids), site=self.name)) return pasties - logger.error("No last pasties matches for regular expression site:{site} regex:{regex}. Error in your regex?".format(site=self.name, regex=self.archive_regex)) + logger.error("No last pasties matches for regular expression site:{site} regex:{regex}. Error in your regex? Dumping htmlPage \n {html}".format(site=self.name, regex=self.archive_regex, html=htmlPage.encode('utf8'))) return False def seenPastie(self, pastie_id): @@ -228,14 +230,19 @@ def __init__(self, site, pastie_id): def fetchPastie(self): validation_form_page, headers = downloadUrl(self.url) - htmlDom = BeautifulSoup(validation_form_page) - content_left = htmlDom.find(id='full-width') - plain_confirm = content_left.find('input')['value'] - # build a form with plainConfirm = value and the cookie - data = urllib.urlencode({'plainConfirm': plain_confirm}) - url = "http://pastesite.com/plain/{id}".format(id=self.id) - cookie = headers.dict['set-cookie'] - self.pastie_content, headers = downloadUrl(url, data, cookie) + if validation_form_page: + htmlDom = BeautifulSoup(validation_form_page) + if not htmlDom: + return self.pastie_content + content_left = htmlDom.find(id='full-width') + if not content_left: + return self.pastie_content + plain_confirm = content_left.find('input')['value'] + # build a form with plainConfirm = value and the cookie + data = urllib.urlencode({'plainConfirm': plain_confirm}) + url = "http://pastesite.com/plain/{id}".format(id=self.id) + cookie = headers.dict['set-cookie'] + self.pastie_content, headers = downloadUrl(url, data, cookie) return self.pastie_content @@ -249,13 +256,14 @@ def __init__(self, site, pastie_id): def fetchPastie(self): downloaded_page, headers = downloadUrl(self.url) - # make the json valid: strip json1( ) - downloaded_page = u'[' + downloaded_page[6:-2] + u']' - # convert to json object - json_pastie = json.loads(downloaded_page) - if json_pastie: - # and extract the code - self.pastie_content = json_pastie[0]['code_record'] + if downloaded_page: + # make the json valid: strip json1( ) + downloaded_page = u'[' + downloaded_page[6:-2] + u']' + # convert to json object + json_pastie = json.loads(downloaded_page) + if json_pastie: + # and extract the code + self.pastie_content = json_pastie[0]['code_record'] return self.pastie_content @@ -269,13 +277,14 @@ def __init__(self, site, pastie_id): def fetchPastie(self): downloaded_page, headers = downloadUrl(self.url) - htmlDom = BeautifulSoup(downloaded_page) - # search for