Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

datetime for repost_time #194

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion db/tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@
Column("user_name", String(200)),
Column("weibo_id", String(200), unique=True),
Column("parent_user_id", String(20)),
Column("repost_time", String(200)),
Column("repost_time", DateTime),
Column("repost_cont", Text),
Column("weibo_url", String(200)),
Column("parent_user_name", String(200)),
Expand Down
8 changes: 4 additions & 4 deletions page_get/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import signal

import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning

from config import headers
from logger import crawler
Expand All @@ -26,9 +25,10 @@
EXCP_INTERAL = get_excp_interal()
COOKIES = get_cookies()


# Disable annoying InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
# Rather than silencing InsecureRequestWarning, point requests at the system CA bundle so certificates are actually verified:
# https://stackoverflow.com/questions/42982143/python-requests-how-to-use-system-ca-certificates-debian-ubuntu
os.environ['REQUESTS_CA_BUNDLE'] = os.path.join(os.sep, '/etc/ssl/certs',
'ca-certificates.crt')


def is_banned(url):
Expand Down
2 changes: 0 additions & 2 deletions page_parse/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,5 +48,3 @@ def is_403(html):

def is_complete(html):
    """Report whether a fetched page contains the ``uid`` marker.

    Arguments:
        html -- page source to inspect; ``None`` or an empty value is
            treated as an incomplete page instead of raising.

    Returns:
        bool -- ``True`` when the substring ``'uid'`` occurs in ``html``,
            ``False`` otherwise.
    """
    # ``in`` already yields a bool, so no conditional expression is needed;
    # the truthiness guard keeps None/empty input from raising TypeError.
    return bool(html) and 'uid' in html


52 changes: 13 additions & 39 deletions page_parse/comment.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@
from db.models import WeiboComment
from decorators import parse_decorator
from utils import parse_emoji
from page_get import get_profile
import datetime
import re
from page_parse.interact_time import (
get_create_time_from_text, get_create_time_from_text_default_error_handler)


@parse_decorator('')
def get_html_cont(html):
cont = ''
Expand Down Expand Up @@ -114,42 +115,15 @@ def get_comment_list(html, wb_id):
wb_comment.comment_id = comment['comment_id']
# TODO 将wb_comment.user_id加入待爬队列(seed_ids)
wb_comment.user_id = comment.find(attrs={'class': 'WB_text'}).find('a').get('usercard')[3:]
# 爬取新用户基本信息
if wb_comment.user_id:
get_profile(wb_comment.user_id)
# 日期格式化
create_time = comment.find(attrs={'class': 'WB_from S_txt2'}).text
if '分钟前' in create_time:
now = datetime.datetime.now()
reduce_minute = create_time.strip().split('分钟')[0]
delta = datetime.timedelta(minutes=int(reduce_minute))
real_time = now - delta
wb_comment.create_time = str(real_time.strftime('%Y-%m-%d %H:%M'))
elif '今天' in create_time:
now = datetime.datetime.now().strftime('%Y-%m-%d')
real_time = now + create_time.strip().split('今天')[-1]
wb_comment.create_time = str(real_time)
elif '楼' in create_time:
wb_comment.create_time = str(re.sub('第\d*楼', '', create_time))
else:
wb_comment.create_time = create_time
if not wb_comment.create_time.startswith('201'):
wb_comment.create_time = str(datetime.datetime.now().year) + wb_comment.create_time
# 中文时间戳转换成标准格式 "%Y-%m-%d %H:%M"
create_time_copy = wb_comment.create_time
if '月' in create_time_copy and '日' in create_time_copy:
month = create_time_copy.split("年")[-1].split("月")[0]
day = create_time_copy.split("年")[-1].split("月")[-1].split("日")[0]
# 补齐0
if month and int(month) < 10:
wb_comment.create_time = wb_comment.create_time.replace(str(month) + "月",
"0" + str(month) + "月")
if day and int(day) < 10:
wb_comment.create_time = wb_comment.create_time.replace(str(day) + "日", "0" + str(day) + "日")
wb_comment.create_time = wb_comment.create_time.replace("月", "-")
wb_comment.create_time = wb_comment.create_time.replace("日", "")
if '年' in wb_comment.create_time:
wb_comment.create_time = wb_comment.create_time.replace("年", "-")

create_time_str = comment.find(attrs={'class': 'WB_from S_txt2'}).text
try:
create_time = get_create_time_from_text(create_time_str)
except ValueError as e:
create_time = get_create_time_from_text_default_error_handler(
create_time_str, e)
create_time_str = create_time.strftime("%Y-%m-%d %H:%M:%S")
wb_comment.create_time = create_time_str

wb_comment.weibo_id = wb_id
except Exception as e:
Expand Down
69 changes: 69 additions & 0 deletions page_parse/interact_time.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import datetime
import re

from logger import parser


def get_create_time_from_text_default_error_handler(
        create_time_str: str, e: Exception) -> datetime.datetime:
    """Log a failed timestamp parse and fall back to the current time.

    Arguments:
        create_time_str {str} -- the original text that could not be parsed
        e {Exception} -- the exception raised while parsing it

    Returns:
        datetime -- ``datetime.datetime.now()`` taken after the error is logged
    """
    # Build the log line first so the logging call itself stays short.
    message = '解析评论时间失败,原时间为"{}",具体信息是{}'.format(create_time_str, e)
    parser.error(message)

    fallback = datetime.datetime.now()
    return fallback


def get_create_time_from_text(create_time_str: str) -> datetime.datetime:
    """Convert a Weibo-style timestamp text into a ``datetime``.

    Recognised forms (an optional "第N楼" floor marker is stripped first):

    * ``40秒前``            -- seconds ago; resolved to the current time
    * ``12分钟前``          -- minutes ago
    * ``今天 22:11``        -- today at HH:MM
    * ``9月21日 14:05``     -- month/day of the current year
    * ``2017年9月21日 14:05`` -- explicit year, month and day
    * ``2017-12-29 10:48``  -- fully numeric date

    Arguments:
        create_time_str {str} -- raw timestamp text

    Returns:
        datetime -- the parsed (naive, local) creation time

    Raises:
        ValueError -- when the text matches none of the forms above
    """
    # Drop the floor marker (第XX楼) that comment timestamps may carry.
    create_time_str = re.sub(r"\u7b2c[0-9]+\u697c", "", create_time_str)
    create_time_str = create_time_str.strip()

    # Take "now" once so every relative form is resolved against one instant.
    now = datetime.datetime.now()

    if '秒' in create_time_str:
        # Stored precision is one minute, so "N seconds ago" is simply now.
        return now

    if '分钟前' in create_time_str:
        # "10分钟前" -> 10; int() raises ValueError when no digits are present.
        minutes_ago = int(re.sub(r"\D", "", create_time_str))
        return now - datetime.timedelta(minutes=minutes_ago)

    if '今天' in create_time_str:
        parts = create_time_str.split()
        if len(parts) != 2:
            # Give the error a message so whoever logs it sees the reason.
            raise ValueError(
                'expected "今天 HH:MM", got {!r}'.format(create_time_str))
        clock = datetime.datetime.strptime(parts[1], "%H:%M")
        return now.replace(hour=clock.hour, minute=clock.minute,
                           second=0, microsecond=0)

    if '月' in create_time_str:
        if '年' not in create_time_str:
            # Parse with the current year up front: strptime's default year
            # is 1900, which is not a leap year, so "2月29日" would be
            # rejected before the year could be patched in afterwards.
            create_time_str = "{}年{}".format(now.year, create_time_str)
        return datetime.datetime.strptime(create_time_str,
                                          "%Y年%m月%d日 %H:%M")

    # 2017-12-29 10:48
    return datetime.datetime.strptime(create_time_str, "%Y-%m-%d %H:%M")
19 changes: 14 additions & 5 deletions page_parse/repost.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
from db.models import WeiboRepost
from db.redis_db import IdNames
from decorators import parse_decorator
from page_parse.interact_time import (
get_create_time_from_text, get_create_time_from_text_default_error_handler)


REPOST_URL = 'http://weibo.com{}'
Expand Down Expand Up @@ -56,11 +58,18 @@ def get_repost_list(html, mid):
wb_repost.weibo_id = repost['mid']
# TODO 将wb_repost.user_id加入待爬队列(seed_ids)
wb_repost.user_id = repost.find(attrs={'class': 'WB_face W_fl'}).find('a').get('usercard')[3:]
wb_repost.user_name = repost.find(attrs={'class': 'list_con'}).find(attrs={'class': 'WB_text'}).find('a').\
text
wb_repost.repost_time = repost.find(attrs={'class': 'WB_from S_txt2'}).find('a').get('title')
wb_repost.weibo_url = REPOST_URL.format(repost.find(attrs={'class': 'WB_from S_txt2'}).find('a').
get('href'))
wb_repost.user_name = repost.find(attrs={'class': 'list_con'}).find(attrs={'class': 'WB_text'}).find('a').text

create_time_str = repost.find(attrs={'class': 'WB_from S_txt2'}).find('a').get('title')
try:
create_time = get_create_time_from_text(create_time_str)
except ValueError as e:
create_time = get_create_time_from_text_default_error_handler(
create_time_str, e)
create_time_str = create_time.strftime("%Y-%m-%d %H:%M:%S")
wb_repost.repost_time = create_time_str

wb_repost.weibo_url = repost.find(attrs={'class': 'WB_from S_txt2'}).find('a').get('href')
parents = repost.find(attrs={'class': 'WB_text'}).find(attrs={'node-type': 'text'})
wb_repost.root_weibo_id = mid

Expand Down