Skip to content
This repository has been archived by the owner on Jun 13, 2023. It is now read-only.

Commit

Permalink
add image handler
Browse files Browse the repository at this point in the history
  • Loading branch information
INCHMAN1900 committed Aug 11, 2017
1 parent 5679d99 commit c024748
Show file tree
Hide file tree
Showing 8 changed files with 2,560 additions and 55 deletions.
48 changes: 48 additions & 0 deletions ImgHandler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# -*- coding: utf-8 -*-

import config
import re
import math
import json
import random
import requests
import time

class ImageHandler(object):
    """Download article images and store them under a configurable base path.

    The base directory defaults to ``config.img_base_path`` and can be
    overridden with the ``path`` keyword argument.
    """

    def __init__(self, **kwargs):
        super(ImageHandler, self).__init__()
        # Directory downloaded images are written into.
        self.path = kwargs.get('path', config.img_base_path)

    def write_image(self, image):
        """Download ``image`` (a WeChat image URL) and save it locally.

        Returns the web-facing path for the stored file, or '' when
        ``image`` is falsy.  The file extension is taken from the
        ``wx_fmt`` query parameter of the URL; a missing parameter or
        ``wx_fmt=1`` is treated as PNG.
        """
        if not image:
            return ''
        _id = self._generate_image_id()
        _types = re.findall(r'wx_fmt=\w+', image)
        print(image)
        if len(_types) == 0 or _types[0] == 'wx_fmt=1':
            end = '.png'
        else:
            end = '.' + _types[0].replace('wx_fmt=', '')
        path = self.path + str(_id) + end
        r = requests.get(image)
        # BUG FIX: the response body (r.content) is bytes, so the file must
        # be opened in binary mode; the original 'w' text mode raises
        # TypeError under Python 3.  'with' also guarantees the handle is
        # closed if the write fails.
        with open(path, 'wb') as f_img:
            f_img.write(r.content)
        # Throttle downloads so the image host does not rate-limit us.
        time.sleep(3)
        # Web path stored in the database; if the image directory moves,
        # change the config file and this prefix together.
        # NOTE(review): there is no '/' between 'imgs' and the id — kept
        # byte-identical to the original, but it looks like a missing
        # separator; confirm against the web server's routing.
        path = '/imgs' + str(_id) + end
        return path

    def _generate_image_id(self):
        """Return a random 15-character id from an unambiguous alphabet.

        The alphabet deliberately omits look-alike characters such as
        'I', 'O', 'o', 'i' and '0'.
        """
        chars = 'QWERTYUPLKJHGFDSAZXCVBNMqwertyuplkjhgfdsazxcvbnm123456789'
        # random.choice is clearer and unbiased compared with the original
        # math.trunc(random.random() * len(chars)) indexing.
        return ''.join(random.choice(chars) for _ in range(15))
10 changes: 2 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,4 @@
# WechatScraper
A Python WeChat official-account crawler that uses the Sogou search engine.

### Needed
Python + Selenium + PhantomJS + MySQL

### MySQL
sql folder is for mysql.

Connect to MySQL, type ``` source ```, then drag the file onto the command line; the table will be created automatically.
# A Python WeChat official-account crawler that uses the Sogou search engine.
# selenium needed
21 changes: 13 additions & 8 deletions WechatScraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,24 @@

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from pyvirtualdisplay import Display
import time
import re
import json

import config
import utils

browser = webdriver.PhantomJS()
display = Display(visible=0, size=(1366, 768))
display.start()
print('Display start')

browser = webdriver.Firefox()

class WechatScraper():

def __init__(self, **kwargs):
    """Create a scraper that reads its URL templates from the module config.

    Reconstructed from the interleaved diff: the old ws_config parameter
    and the utils.merge call were removed in this commit.
    """
    # Keep a reference so instance methods can use self.config.
    self.config = config

"""
query: keyword
Expand All @@ -24,7 +29,7 @@ def __init__(self, ws_config, **kwargs):
def get_article_list_by_keyword(self, query, page=1):
query = 'query=' + query
page = 'page=' + str(page)
built_url = self._build_url(config.article_search_url, ['query', 'page'], [query, page])
built_url = self._build_url(self.config.article_search_url, ['query', 'page'], [query, page])
article_list = []
browser.get(built_url)

Expand Down Expand Up @@ -63,7 +68,7 @@ def get_article_by_url(self, url):
if(raw_avatar):
avatar = re.sub(re.compile(r'[^"]+"'), '', raw_avatar[0], 1).replace('";', '')
page_content = browser.find_element_by_id('img-content')
ems = page_content.find_elements_by_css_selector('.rich_media_meta_list>em')
ems = page_content.find_elements_by_css_selector('.rich_media_meta_list>em')
author = ''
if(len(ems)>1):
author = ems[1].text
Expand All @@ -80,7 +85,7 @@ def search_gzh_by_keyword(self, query, **kwargs):
page = kwargs.get('page', 1)
query = 'query=' + query
page = 'page=' + str(page)
built_url = self._build_url(config.gzh_search_url, ['query', 'page'], [query, page])
built_url = self._build_url(self.config.gzh_search_url, ['query', 'page'], [query, page])
browser.get(built_url)
gzh_list = browser.find_elements_by_css_selector('.news-list2 li')
for i in range(len(gzh_list)):
Expand Down Expand Up @@ -109,7 +114,7 @@ def search_gzh_by_keyword(self, query, **kwargs):
def get_gzh_message(self, wechatid):
query = 'query=' + str(wechatid)
page = 'page=' + str(1)
built_url = self._build_url(config.gzh_search_url, ['query', 'page'], [query, page])
built_url = self._build_url(self.config.gzh_search_url, ['query', 'page'], [query, page])
browser.get(built_url)
gzh_list = browser.find_elements_by_css_selector('.news-list2 li')
gzh_url = gzh_list[0].find_element_by_css_selector('.img-box a').get_attribute('href')
Expand Down Expand Up @@ -154,7 +159,7 @@ def get_gzh_message(self, wechatid):


"""
below here are some private functions
below here are some common functions
"""

Expand Down
6 changes: 4 additions & 2 deletions config.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,20 @@
# -*- coding: utf-8 -*-
# Configuration for the scraper: Sogou search URL templates, the local
# image storage path, and the MySQL connection settings.

import pymysql.cursors

# Article-search URL template; the bare 'query' and 'page' parameters are
# filled in by the scraper's URL builder.
article_search_url = 'http://weixin.sogou.com/weixin?oq=&query&_sug_type_=1&sut=0&lkt=0%2C0%2C0&s_from=input&ri=17&_sug_=n&type=2&sst0=1499142762773&page&ie=utf8&p=40040108&dp=1&w=01015002&dr=1'

# Official-account search URL template (type=1).
gzh_search_url = 'http://weixin.sogou.com/weixin?type=1&s_from=input&query&ie=utf8&_sug_=n&_sug_type_=&w=01019900&sut=10144&sst0=1500528877030&lkt=1%2C1500528876895%2C1500528876895&page'

# Test path: local directory downloaded images are written into.
img_base_path = './imgs/'

# MySQL connection settings, passed straight to pymysql.connect(**db_config).
# BUG FIX: a stale duplicate 'db' entry ('HUANLEYE', left over from the diff)
# was removed; the later 'DATABASE' value was the one taking effect anyway.
db_config = {
    'host': '127.0.0.1',   # host
    'port': 3306,          # port
    'user': 'root',        # database username
    'password': '123456',  # database password
    'db': 'DATABASE',      # database name
    'charset': 'utf8mb4',  # database charset
    'cursorclass': pymysql.cursors.DictCursor
}
40 changes: 12 additions & 28 deletions db.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,66 +10,50 @@

import config
import utils
from ImgHandler import ImgHandler

img_handler = ImgHandler()

class DB(object):
"""docstring for DB"""
def __init__(self, **kwargs):
    """Create a DB wrapper; optional ``db_config`` kwarg overrides defaults.

    Reconstructed from the interleaved diff: db_config moved from a
    positional parameter to a keyword argument in this commit.
    """
    super(DB, self).__init__()
    db_config = kwargs.get('db_config')
    # Merge caller overrides on top of the defaults in config.py.
    # NOTE(review): assumes utils.merge tolerates db_config=None when the
    # caller passes nothing — confirm against utils.merge.
    self._db = utils.merge(db_config, config.db_config)
    print(self._db)

def store_gzh_list(self, gzhList, **kwargs):
    """Insert every official-account record in ``gzhList`` into ``table``.

    Returns False when ``gzhList`` is not a list; otherwise returns None.
    """
    # isinstance instead of the type()-comparison anti-pattern.
    if not isinstance(gzhList, list):
        return False
    table = kwargs.get('table', 'gzh')
    # Plain for-loop: the original used a list comprehension purely for
    # its side effects.
    for info in gzhList:
        self._store_gzh_info(info, table=table)

def _store_gzh_info(self, info, **kwargs):
    """Insert a single official-account record into ``table``.

    WARNING(review): the statement is built by string concatenation, so a
    double quote in any scraped field breaks it (SQL injection).  Move to
    parameterized queries when _execute supports passing args.
    """
    table = kwargs.get('table')
    query = 'insert into ' + table + ' (`title`, `wechatid`, `avatar`, `qrcode`, `introduction`, `verification`) values ("' + info.get('title', '') + '", "' + info.get('wechatid', '') + '", "' + info.get('avatar', '') + '", "' + info.get('qrcode', '') + '", "' + info.get('introduction', '') + '", "' + info.get('verification', '') + '")'
    self._execute(query)

def store_article(self, article, **kwargs):
    """Escape every field of ``article`` and insert it into ``table``.

    NOTE(review): this mutates the caller's ``article`` dict in place
    (fields are replaced by their escaped versions) — confirm callers do
    not reuse the raw values afterwards.  Values are still concatenated
    into the SQL text; _escape only covers quoting, so prefer
    parameterized queries once _execute supports them.
    """
    for key in article:
        article[key] = self._escape(article[key])
    # 'articles' is the post-rename default table (the stale
    # 'article_copy' diff line was dropped).
    table = kwargs.get('table', 'articles')
    query = 'insert into ' + table + ' (`title`, `poster`, `authorId`, `authorAvatar`, `authorName`, `col`, `description`, `content`, `updateTime`, `tag`, `likes`, `type`) values ("' + article.get('title', '') + '", "' + article.get('poster', '') + '", NULL, "' + article.get('authorAvatar', '') + '", "' + article.get('authorName', '') + '", "' + article.get('col', '') + '", "' + article.get('description', '') + '", "' + article.get('content', '').strip() + '", DATE_ADD(\'1970-01-01 00:00:00\', INTERVAL ' + str(article.get('updateTime', '')) + ' SECOND), 0, 0, 0)'
    self._execute(query)
    print(article['title'] + ' inserted')

# replace poster or author avatar or all imgs in content
def replace_imgs(self, imgs, position, **kwargs):
table = kwargs.get('table', 'articles')
multi = kwargs.get('multi', False)
if(multi or type(imgs) == 'list'):
imgs_copy = imgs[:]
paths = [img_handler.write_img(img) for img in imgs_copy]
for i in range(len(paths)):
self._replace_img(position, path[i], imgs[i], table)
else:
path = img_handler.write_img(imgs)
effect_rows = self._replace_img(position, path, imgs, table)
return effect_rows

def check_exist(self, title, **kwargs):
    """Return the number of rows in ``table`` whose title equals ``title``.

    A stale duplicate ``query`` assignment (left over from renaming the
    default table from 'article_copy' to 'articles') was removed; the
    second assignment already overwrote the first.
    """
    print('checking exist')
    query = 'select * from ' + kwargs.get('table', 'articles') + ' where title="' + self._escape(title) + '"'
    effect_rows = self._execute(query)
    return effect_rows


def _execute(self, query):
    """Open a connection, run ``query``, commit, and return affected rows.

    BUG FIX: the original leaked the cursor and the connection whenever
    execute()/commit() raised; try/finally guarantees both are closed on
    every path.
    """
    conn = pymysql.connect(**self._db)
    try:
        cursor = conn.cursor()
        try:
            effect_rows = cursor.execute(query)
            conn.commit()
        finally:
            cursor.close()
    finally:
        conn.close()
    return effect_rows

def _replace_img(self, position, newPath, oldPath, table, **kwargs):
    """UPDATE ``table``, setting column ``position`` from oldPath to newPath.

    BUG FIX: the path values are string columns and must be quoted in the
    SQL text; the original emitted e.g. ``set poster=/imgs/x.png``, a
    MySQL syntax error.  The stray ``print('connection closed')`` line
    (diff residue from the old _execute body) was dropped.
    WARNING(review): still built by concatenation — a double quote in a
    path would break the statement; parameterize when possible.
    """
    query = 'update ' + table + ' set ' + position + '="' + newPath + '" where ' + position + '="' + oldPath + '"'
    effect_rows = self._execute(query)
    return effect_rows

def _escape(self, string):
Expand All @@ -86,4 +70,4 @@ def _escape(self, string):





Binary file added geckodriver
Binary file not shown.
45 changes: 36 additions & 9 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,27 +5,40 @@
from WechatScraper import WechatScraper
from db import DB
import utils
import re

ws = WechatScraper()

for i in range(3):
gzh_list = ws.search_gzh_by_keyword('manwei', page=i + 1)
db.store_gzh_list(gzh_list)
time.sleep(3)
'''
To customize your database config, pass db_config as a named parameter like:
db_config = {
'host': '127.0.0.1', # host
'port': 3306, # port
'user': 'root', # database username
'password': '123456', # database password
'db': 'DATABASE', # database name
}
'''

db = DB()

from ImgHandler import ImageHandler

ih = ImageHandler()

# scrap 10 page articles related to one keyword
# scrap 10 page articles related to one keyword.
# images will be put in the imgs folder

keyword_list = [{
'keyword': 'manwei',
'keyword': '漫威',
'page': 10
}, {
'keyword': 'marvel',
'keyword': 'DC',
'page': 10
}]

def digest_article(msg, **kwargs):
col = kwargs.get('col', '过山车')
col = kwargs.get('col', 'mycol')
url = msg['url'].replace('amp;', '')
effect_rows = db.check_exist(msg['title'])
if(effect_rows>0):
Expand All @@ -34,8 +47,22 @@ def digest_article(msg, **kwargs):
article = ws.get_article_by_url(url)
article = utils.merge(article, msg)
article['col'] = col

content = article['content']
# replace images
img_re = re.compile(r'<img[^>]+data\-src\=\"http[^"]+\"')
images = img_re.findall(content)
for j in range(len(images)):
images[j] = re.sub('<img.+data\-src=', '', images[j]).replace('"', '')
newPath = ih.write_image(images[j])
content = content.replace(images[j], newPath, 2)
article['content'] = content
if(article['poster'] and len(article['poster']) > 0):
article['poster'] = ih.write_image(article['poster'][0])
print('article get')
db.store_article(article)
time.sleep(5)
print('article stored')
time.sleep(3)

for item in keyword_list:
for i in range(item['page']):
Expand Down
Loading

0 comments on commit c024748

Please sign in to comment.