From 59fcaf44fc7562e8525346cfed50c2f324e4e6c1 Mon Sep 17 00:00:00 2001 From: thekingofcity Date: Tue, 3 Mar 2020 13:33:31 +0800 Subject: [PATCH 1/2] fix InsecureRequestWarning --- page_get/basic.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/page_get/basic.py b/page_get/basic.py index 4265aeec..8505982f 100755 --- a/page_get/basic.py +++ b/page_get/basic.py @@ -3,7 +3,6 @@ import signal import requests -from requests.packages.urllib3.exceptions import InsecureRequestWarning from config import headers from logger import crawler @@ -26,9 +25,10 @@ EXCP_INTERAL = get_excp_interal() COOKIES = get_cookies() - -# Disable annoying InsecureRequestWarning -requests.packages.urllib3.disable_warnings(InsecureRequestWarning) +# Instead of disabling the warning, point requests at the system CA bundle, as the docs suggest +# https://stackoverflow.com/questions/42982143/python-requests-how-to-use-system-ca-certificates-debian-ubuntu +os.environ['REQUESTS_CA_BUNDLE'] = os.path.join(os.sep, '/etc/ssl/certs', + 'ca-certificates.crt') def is_banned(url): From 4d80e0b67454d430374ae8742394fe17831dc9be Mon Sep 17 00:00:00 2001 From: thekingofcity Date: Sat, 2 May 2020 19:05:02 +0800 Subject: [PATCH 2/2] :fire:repost create_time --- db/tables.py | 2 +- page_parse/basic.py | 2 -- page_parse/comment.py | 52 +++++++--------------------- page_parse/interact_time.py | 69 +++++++++++++++++++++++++++++++++++++ page_parse/repost.py | 19 +++++++--- 5 files changed, 97 insertions(+), 47 deletions(-) create mode 100644 page_parse/interact_time.py diff --git a/db/tables.py b/db/tables.py index a9d22829..28cc9c0a 100644 --- a/db/tables.py +++ b/db/tables.py @@ -105,7 +105,7 @@ Column("user_name", String(200)), Column("weibo_id", String(200), unique=True), Column("parent_user_id", String(20)), - Column("repost_time", String(200)), + Column("repost_time", DateTime), Column("repost_cont", Text), Column("weibo_url", String(200)), Column("parent_user_name", String(200)), diff --git a/page_parse/basic.py 
b/page_parse/basic.py index 92c81a05..a3202b4c 100755 --- a/page_parse/basic.py +++ b/page_parse/basic.py @@ -48,5 +48,3 @@ def is_403(html): def is_complete(html): return True if 'uid' in html else False - - diff --git a/page_parse/comment.py b/page_parse/comment.py index 4c280e86..3285dd0d 100644 --- a/page_parse/comment.py +++ b/page_parse/comment.py @@ -6,9 +6,10 @@ from db.models import WeiboComment from decorators import parse_decorator from utils import parse_emoji -from page_get import get_profile -import datetime -import re +from page_parse.interact_time import ( + get_create_time_from_text, get_create_time_from_text_default_error_handler) + + @parse_decorator('') def get_html_cont(html): cont = '' @@ -114,42 +115,15 @@ def get_comment_list(html, wb_id): wb_comment.comment_id = comment['comment_id'] # TODO 将wb_comment.user_id加入待爬队列(seed_ids) wb_comment.user_id = comment.find(attrs={'class': 'WB_text'}).find('a').get('usercard')[3:] - # 爬取新用户基本信息 - if wb_comment.user_id: - get_profile(wb_comment.user_id) - # 日期格式化 - create_time = comment.find(attrs={'class': 'WB_from S_txt2'}).text - if '分钟前' in create_time: - now = datetime.datetime.now() - reduce_minute = create_time.strip().split('分钟')[0] - delta = datetime.timedelta(minutes=int(reduce_minute)) - real_time = now - delta - wb_comment.create_time = str(real_time.strftime('%Y-%m-%d %H:%M')) - elif '今天' in create_time: - now = datetime.datetime.now().strftime('%Y-%m-%d') - real_time = now + create_time.strip().split('今天')[-1] - wb_comment.create_time = str(real_time) - elif '楼' in create_time: - wb_comment.create_time = str(re.sub('第\d*楼', '', create_time)) - else: - wb_comment.create_time = create_time - if not wb_comment.create_time.startswith('201'): - wb_comment.create_time = str(datetime.datetime.now().year) + wb_comment.create_time - # 中文时间戳转换成标准格式 "%Y-%m-%d %H:%M" - create_time_copy = wb_comment.create_time - if '月' in create_time_copy and '日' in create_time_copy: - month = 
create_time_copy.split("年")[-1].split("月")[0] - day = create_time_copy.split("年")[-1].split("月")[-1].split("日")[0] - # 补齐0 - if month and int(month) < 10: - wb_comment.create_time = wb_comment.create_time.replace(str(month) + "月", - "0" + str(month) + "月") - if day and int(day) < 10: - wb_comment.create_time = wb_comment.create_time.replace(str(day) + "日", "0" + str(day) + "日") - wb_comment.create_time = wb_comment.create_time.replace("月", "-") - wb_comment.create_time = wb_comment.create_time.replace("日", "") - if '年' in wb_comment.create_time: - wb_comment.create_time = wb_comment.create_time.replace("年", "-") + + create_time_str = comment.find(attrs={'class': 'WB_from S_txt2'}).text + try: + create_time = get_create_time_from_text(create_time_str) + except ValueError as e: + create_time = get_create_time_from_text_default_error_handler( + create_time_str, e) + create_time_str = create_time.strftime("%Y-%m-%d %H:%M:%S") + wb_comment.create_time = create_time_str wb_comment.weibo_id = wb_id except Exception as e: diff --git a/page_parse/interact_time.py b/page_parse/interact_time.py new file mode 100644 index 00000000..c32a8536 --- /dev/null +++ b/page_parse/interact_time.py @@ -0,0 +1,69 @@ +import datetime +import re + +from logger import parser + + +def get_create_time_from_text_default_error_handler( + create_time_str: str, e: Exception) -> datetime.datetime: + """[default error handler will return datetime of now] + + Arguments: + create_time_str {str} -- [origin str] + e {Exception} -- [Exception] + + Returns: + datetime -- [datetime of now] + """ + + parser.error('解析评论时间失败,原时间为"{}",具体信息是{}'.format(create_time_str, e)) + return datetime.datetime.now() + + +def get_create_time_from_text(create_time_str: str) -> datetime.datetime: + """[Get create time from text] + + Arguments: + create_time_str {str} -- [create time str] + + Returns: + datetime -- [create time] + """ + + # 第XX楼 + create_time_str = re.sub(r"\u7b2c[0-9]+\u697c", "", create_time_str) + 
create_time_str = create_time_str.strip() + if '秒' in create_time_str: + # 40秒前 + # Since the datetime accuracy is set to minute, + # we use now as create time + create_time = datetime.datetime.now() + elif '分钟前' in create_time_str: + # 2分钟前/12分钟前/55分钟前 + create_time_minute = re.sub(r"\D", "", create_time_str) # 10分钟前 -> 10 + create_time_minute = int(create_time_minute) + create_time = (datetime.datetime.now() + + datetime.timedelta(minutes=-create_time_minute)) + elif '今天' in create_time_str: + # 今天 22:11/今天 21:44/今天 05:11 + create_time = create_time_str.split() + if len(create_time) == 2: + create_time = datetime.datetime.now().strftime( + "%Y-%m-%d ") + create_time[1] + ":00" + create_time = datetime.datetime.strptime(create_time, + "%Y-%m-%d %H:%M:%S") + else: + raise ValueError + elif '月' in create_time_str: + # 9月21日 14:05/9月21日 03:07/9月20日 22:20/1月5日 08:39 + create_time = datetime.datetime.strptime(create_time_str, + "%m月%d日 %H:%M") + # the year of create_time will be 1900 (default value) + year = int(datetime.datetime.now().strftime("%Y")) + # https://stackoverflow.com/questions/12468823/python-datetime-setting-fixed-hour-and-minute-after-using-strptime-to-get-day#comment16772522_12468869 + create_time = create_time.replace(year=year) + else: + # 2017-12-29 10:48/2017-12-28 10:15 + create_time = datetime.datetime.strptime(create_time_str, + "%Y-%m-%d %H:%M") + return create_time diff --git a/page_parse/repost.py b/page_parse/repost.py index c08865a9..96aa2855 100644 --- a/page_parse/repost.py +++ b/page_parse/repost.py @@ -6,6 +6,8 @@ from db.models import WeiboRepost from db.redis_db import IdNames from decorators import parse_decorator +from page_parse.interact_time import ( + get_create_time_from_text, get_create_time_from_text_default_error_handler) REPOST_URL = 'http://weibo.com{}' @@ -56,11 +58,18 @@ def get_repost_list(html, mid): wb_repost.weibo_id = repost['mid'] # TODO 将wb_repost.user_id加入待爬队列(seed_ids) wb_repost.user_id = 
repost.find(attrs={'class': 'WB_face W_fl'}).find('a').get('usercard')[3:] - wb_repost.user_name = repost.find(attrs={'class': 'list_con'}).find(attrs={'class': 'WB_text'}).find('a').\ - text - wb_repost.repost_time = repost.find(attrs={'class': 'WB_from S_txt2'}).find('a').get('title') - wb_repost.weibo_url = REPOST_URL.format(repost.find(attrs={'class': 'WB_from S_txt2'}).find('a'). - get('href')) + wb_repost.user_name = repost.find(attrs={'class': 'list_con'}).find(attrs={'class': 'WB_text'}).find('a').text + + create_time_str = repost.find(attrs={'class': 'WB_from S_txt2'}).find('a').get('title') + try: + create_time = get_create_time_from_text(create_time_str) + except ValueError as e: + create_time = get_create_time_from_text_default_error_handler( + create_time_str, e) + create_time_str = create_time.strftime("%Y-%m-%d %H:%M:%S") + wb_repost.repost_time = create_time_str + + wb_repost.weibo_url = repost.find(attrs={'class': 'WB_from S_txt2'}).find('a').get('href') parents = repost.find(attrs={'class': 'WB_text'}).find(attrs={'node-type': 'text'}) wb_repost.root_weibo_id = mid