Skip to content
This repository has been archived by the owner on Jun 13, 2023. It is now read-only.

Commit

Permalink
add image handler
Browse files Browse the repository at this point in the history
  • Loading branch information
INCHMAN1900 committed Aug 11, 2017
1 parent 5679d99 commit c024748
Show file tree
Hide file tree
Showing 8 changed files with 2,560 additions and 55 deletions.
48 changes: 48 additions & 0 deletions ImgHandler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# -*- coding: utf-8 -*-

import config
import re
import math
import json
import random
import requests
import time

class ImageHandler(object):
    """Download article images and store them under a configurable base path.

    The base directory defaults to ``config.img_base_path`` and can be
    overridden with the ``path`` keyword argument.
    """

    def __init__(self, **kwargs):
        super(ImageHandler, self).__init__()
        # Directory downloaded images are written into.
        self.path = kwargs.get('path', config.img_base_path)

    def write_image(self, image):
        """Download ``image`` (a WeChat image URL) and save it locally.

        Returns the web-facing path for the stored file, or '' when
        ``image`` is falsy.  The file extension is taken from the
        ``wx_fmt`` query parameter of the URL; a missing parameter or
        ``wx_fmt=1`` is treated as PNG.
        """
        if not image:
            return ''
        _id = self._generate_image_id()
        _types = re.findall(r'wx_fmt=\w+', image)
        print(image)
        if len(_types) == 0 or _types[0] == 'wx_fmt=1':
            end = '.png'
        else:
            end = '.' + _types[0].replace('wx_fmt=', '')
        path = self.path + str(_id) + end
        r = requests.get(image)
        # BUG FIX: the response body (r.content) is bytes, so the file must
        # be opened in binary mode; the original 'w' text mode raises
        # TypeError under Python 3.  'with' also guarantees the handle is
        # closed if the write fails.
        with open(path, 'wb') as f_img:
            f_img.write(r.content)
        # Throttle downloads so the image host does not rate-limit us.
        time.sleep(3)
        # Web path stored in the database; if the image directory moves,
        # change the config file and this prefix together.
        # NOTE(review): there is no '/' between 'imgs' and the id — kept
        # byte-identical to the original, but it looks like a missing
        # separator; confirm against the web server's routing.
        path = '/imgs' + str(_id) + end
        return path

    def _generate_image_id(self):
        """Return a random 15-character id from an unambiguous alphabet.

        The alphabet deliberately omits look-alike characters such as
        'I', 'O', 'o', 'i' and '0'.
        """
        chars = 'QWERTYUPLKJHGFDSAZXCVBNMqwertyuplkjhgfdsazxcvbnm123456789'
        # random.choice is clearer and unbiased compared with the original
        # math.trunc(random.random() * len(chars)) indexing.
        return ''.join(random.choice(chars) for _ in range(15))
10 changes: 2 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,4 @@
# WechatScraper
A Python WeChat official-account crawler that uses the Sogou search engine.

### Needed
Python + Selenium + PhantomJS + MySQL

### MySQL
sql folder is for mysql.

Connect to MySQL, type ``` source ```, then drag the file onto the command line; the table will be created automatically.
# A Python WeChat official-account crawler that uses the Sogou search engine.
# selenium needed
21 changes: 13 additions & 8 deletions WechatScraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,24 @@

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from pyvirtualdisplay import Display
import time
import re
import json

import config
import utils

browser = webdriver.PhantomJS()
display = Display(visible=0, size=(1366, 768))
display.start()
print('Display start')

browser = webdriver.Firefox()

class WechatScraper():

def __init__(self, **kwargs):
    """Create a scraper that reads its URL templates from the module config.

    Reconstructed from the interleaved diff: the old ws_config parameter
    and the utils.merge call were removed in this commit.
    """
    # Keep a reference so instance methods can use self.config.
    self.config = config

"""
query: keyword
Expand All @@ -24,7 +29,7 @@ def __init__(self, ws_config, **kwargs):
def get_article_list_by_keyword(self, query, page=1):
query = 'query=' + query
page = 'page=' + str(page)
built_url = self._build_url(config.article_search_url, ['query', 'page'], [query, page])
built_url = self._build_url(self.config.article_search_url, ['query', 'page'], [query, page])
article_list = []
browser.get(built_url)

Expand Down Expand Up @@ -63,7 +68,7 @@ def get_article_by_url(self, url):
if(raw_avatar):
avatar = re.sub(re.compile(r'[^"]+"'), '', raw_avatar[0], 1).replace('";', '')
page_content = browser.find_element_by_id('img-content')
ems = page_content.find_elements_by_css_selector('.rich_media_meta_list>em')
ems = page_content.find_elements_by_css_selector('.rich_media_meta_list>em')
author = ''
if(len(ems)>1):
author = ems[1].text
Expand All @@ -80,7 +85,7 @@ def search_gzh_by_keyword(self, query, **kwargs):
page = kwargs.get('page', 1)
query = 'query=' + query
page = 'page=' + str(page)
built_url = self._build_url(config.gzh_search_url, ['query', 'page'], [query, page])
built_url = self._build_url(self.config.gzh_search_url, ['query', 'page'], [query, page])
browser.get(built_url)
gzh_list = browser.find_elements_by_css_selector('.news-list2 li')
for i in range(len(gzh_list)):
Expand Down Expand Up @@ -109,7 +114,7 @@ def search_gzh_by_keyword(self, query, **kwargs):
def get_gzh_message(self, wechatid):
query = 'query=' + str(wechatid)
page = 'page=' + str(1)
built_url = self._build_url(config.gzh_search_url, ['query', 'page'], [query, page])
built_url = self._build_url(self.config.gzh_search_url, ['query', 'page'], [query, page])
browser.get(built_url)
gzh_list = browser.find_elements_by_css_selector('.news-list2 li')
gzh_url = gzh_list[0].find_element_by_css_selector('.img-box a').get_attribute('href')
Expand Down Expand Up @@ -154,7 +159,7 @@ def get_gzh_message(self, wechatid):


"""
below here are some private functions
below here are some common functions
"""

Expand Down
6 changes: 4 additions & 2 deletions config.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,20 @@
# -*- coding: utf-8 -*-
# Configuration for the scraper: Sogou search URL templates, the local
# image storage path, and the MySQL connection settings.

import pymysql.cursors

# Article-search URL template; the bare 'query' and 'page' parameters are
# filled in by the scraper's URL builder.
article_search_url = 'http://weixin.sogou.com/weixin?oq=&query&_sug_type_=1&sut=0&lkt=0%2C0%2C0&s_from=input&ri=17&_sug_=n&type=2&sst0=1499142762773&page&ie=utf8&p=40040108&dp=1&w=01015002&dr=1'

# Official-account search URL template (type=1).
gzh_search_url = 'http://weixin.sogou.com/weixin?type=1&s_from=input&query&ie=utf8&_sug_=n&_sug_type_=&w=01019900&sut=10144&sst0=1500528877030&lkt=1%2C1500528876895%2C1500528876895&page'

# Test path: local directory downloaded images are written into.
img_base_path = './imgs/'

# MySQL connection settings, passed straight to pymysql.connect(**db_config).
# BUG FIX: a stale duplicate 'db' entry ('HUANLEYE', left over from the diff)
# was removed; the later 'DATABASE' value was the one taking effect anyway.
db_config = {
    'host': '127.0.0.1',   # host
    'port': 3306,          # port
    'user': 'root',        # database username
    'password': '123456',  # database password
    'db': 'DATABASE',      # database name
    'charset': 'utf8mb4',  # database charset
    'cursorclass': pymysql.cursors.DictCursor
}
40 changes: 12 additions & 28 deletions db.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,66 +10,50 @@

import config
import utils
from ImgHandler import ImgHandler

img_handler = ImgHandler()

class DB(object):
"""docstring for DB"""
def __init__(self, **kwargs):
    """Create a DB wrapper; optional ``db_config`` kwarg overrides defaults.

    Reconstructed from the interleaved diff: db_config moved from a
    positional parameter to a keyword argument in this commit.
    """
    super(DB, self).__init__()
    db_config = kwargs.get('db_config')
    # Merge caller overrides on top of the defaults in config.py.
    # NOTE(review): assumes utils.merge tolerates db_config=None when the
    # caller passes nothing — confirm against utils.merge.
    self._db = utils.merge(db_config, config.db_config)
    print(self._db)

def store_gzh_list(self, gzhList, **kwargs):
    """Insert every official-account record in ``gzhList`` into ``table``.

    Returns False when ``gzhList`` is not a list; otherwise returns None.
    """
    # isinstance instead of the type()-comparison anti-pattern.
    if not isinstance(gzhList, list):
        return False
    table = kwargs.get('table', 'gzh')
    # Plain for-loop: the original used a list comprehension purely for
    # its side effects.
    for info in gzhList:
        self._store_gzh_info(info, table=table)

def _store_gzh_info(self, info, **kwargs):
    """Insert a single official-account record into ``table``.

    WARNING(review): the statement is built by string concatenation, so a
    double quote in any scraped field breaks it (SQL injection).  Move to
    parameterized queries when _execute supports passing args.
    """
    table = kwargs.get('table')
    query = 'insert into ' + table + ' (`title`, `wechatid`, `avatar`, `qrcode`, `introduction`, `verification`) values ("' + info.get('title', '') + '", "' + info.get('wechatid', '') + '", "' + info.get('avatar', '') + '", "' + info.get('qrcode', '') + '", "' + info.get('introduction', '') + '", "' + info.get('verification', '') + '")'
    self._execute(query)

def store_article(self, article, **kwargs):
    """Escape every field of ``article`` and insert it into ``table``.

    NOTE(review): this mutates the caller's ``article`` dict in place
    (fields are replaced by their escaped versions) — confirm callers do
    not reuse the raw values afterwards.  Values are still concatenated
    into the SQL text; _escape only covers quoting, so prefer
    parameterized queries once _execute supports them.
    """
    for key in article:
        article[key] = self._escape(article[key])
    # 'articles' is the post-rename default table (the stale
    # 'article_copy' diff line was dropped).
    table = kwargs.get('table', 'articles')
    query = 'insert into ' + table + ' (`title`, `poster`, `authorId`, `authorAvatar`, `authorName`, `col`, `description`, `content`, `updateTime`, `tag`, `likes`, `type`) values ("' + article.get('title', '') + '", "' + article.get('poster', '') + '", NULL, "' + article.get('authorAvatar', '') + '", "' + article.get('authorName', '') + '", "' + article.get('col', '') + '", "' + article.get('description', '') + '", "' + article.get('content', '').strip() + '", DATE_ADD(\'1970-01-01 00:00:00\', INTERVAL ' + str(article.get('updateTime', '')) + ' SECOND), 0, 0, 0)'
    self._execute(query)
    print(article['title'] + ' inserted')

# replace poster or author avatar or all imgs in content
def replace_imgs(self, imgs, position, **kwargs):
table = kwargs.get('table', 'articles')
multi = kwargs.get('multi', False)
if(multi or type(imgs) == 'list'):
imgs_copy = imgs[:]
paths = [img_handler.write_img(img) for img in imgs_copy]
for i in range(len(paths)):
self._replace_img(position, path[i], imgs[i], table)
else:
path = img_handler.write_img(imgs)
effect_rows = self._replace_img(position, path, imgs, table)
return effect_rows

def check_exist(self, title, **kwargs):
    """Return the number of rows in ``table`` whose title equals ``title``.

    A stale duplicate ``query`` assignment (left over from renaming the
    default table from 'article_copy' to 'articles') was removed; the
    second assignment already overwrote the first.
    """
    print('checking exist')
    query = 'select * from ' + kwargs.get('table', 'articles') + ' where title="' + self._escape(title) + '"'
    effect_rows = self._execute(query)
    return effect_rows


def _execute(self, query):
    """Open a connection, run ``query``, commit, and return affected rows.

    BUG FIX: the original leaked the cursor and the connection whenever
    execute()/commit() raised; try/finally guarantees both are closed on
    every path.
    """
    conn = pymysql.connect(**self._db)
    try:
        cursor = conn.cursor()
        try:
            effect_rows = cursor.execute(query)
            conn.commit()
        finally:
            cursor.close()
    finally:
        conn.close()
    return effect_rows

def _replace_img(self, position, newPath, oldPath, table, **kwargs):
    """UPDATE ``table``, setting column ``position`` from oldPath to newPath.

    BUG FIX: the path values are string columns and must be quoted in the
    SQL text; the original emitted e.g. ``set poster=/imgs/x.png``, a
    MySQL syntax error.  The stray ``print('connection closed')`` line
    (diff residue from the old _execute body) was dropped.
    WARNING(review): still built by concatenation — a double quote in a
    path would break the statement; parameterize when possible.
    """
    query = 'update ' + table + ' set ' + position + '="' + newPath + '" where ' + position + '="' + oldPath + '"'
    effect_rows = self._execute(query)
    return effect_rows

def _escape(self, string):
Expand All @@ -86,4 +70,4 @@ def _escape(self, string):





Binary file added geckodriver
Binary file not shown.
45 changes: 36 additions & 9 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,27 +5,40 @@
from WechatScraper import WechatScraper
from db import DB
import utils
import re

ws = WechatScraper()

for i in range(3):
gzh_list = ws.search_gzh_by_keyword('manwei', page=i + 1)
db.store_gzh_list(gzh_list)
time.sleep(3)
'''
To customize your database config, pass db_config as a named parameter like:
db_config = {
'host': '127.0.0.1', # host
'port': 3306, # port
'user': 'root', # database username
'password': '123456', # database password
'db': 'DATABASE', # database name
}
'''

db = DB()

from ImgHandler import ImageHandler

ih = ImageHandler()

# scrap 10 page articles related to one keyword
# scrap 10 page articles related to one keyword.
# images will be put in the imgs folder

keyword_list = [{
'keyword': 'manwei',
'keyword': '漫威',
'page': 10
}, {
'keyword': 'marvel',
'keyword': 'DC',
'page': 10
}]

def digest_article(msg, **kwargs):
col = kwargs.get('col', '过山车')
col = kwargs.get('col', 'mycol')
url = msg['url'].replace('amp;', '')
effect_rows = db.check_exist(msg['title'])
if(effect_rows>0):
Expand All @@ -34,8 +47,22 @@ def digest_article(msg, **kwargs):
article = ws.get_article_by_url(url)
article = utils.merge(article, msg)
article['col'] = col

content = article['content']
# replace images
img_re = re.compile(r'<img[^>]+data\-src\=\"http[^"]+\"')
images = img_re.findall(content)
for j in range(len(images)):
images[j] = re.sub('<img.+data\-src=', '', images[j]).replace('"', '')
newPath = ih.write_image(images[j])
content = content.replace(images[j], newPath, 2)
article['content'] = content
if(article['poster'] and len(article['poster']) > 0):
article['poster'] = ih.write_image(article['poster'][0])
print('article get')
db.store_article(article)
time.sleep(5)
print('article stored')
time.sleep(3)

for item in keyword_list:
for i in range(item['page']):
Expand Down
Loading

0 comments on commit c024748

Please sign in to comment.