# -*- coding: utf-8 -*-
"""Scrape the daily security-news links from sec-wiki.com into a local
SQLite database; twitter/weixin/github links additionally get
source-specific detail records."""
import sys
# Python 2 only: force UTF-8 as the default string encoding so mixed
# str/unicode operations on scraped pages do not raise UnicodeDecodeError.
reload(sys)
sys.setdefaultencoding('utf8')

import logging
import re

from bs4 import BeautifulSoup

# Helper utilities shipped with this repository.
from mills import (get_request, get_special_date, strip_n, parse_url,
                   d2sql, get_title, SQLiteOper, get_github_info,
                   get_weixin_info, get_twitter_info)


def scraw(so, proxy=None, delta=3):
    """Scrape the sec-wiki daily link list and store new entries.

    :param so: SQLiteOper handle on the target SQLite database
    :param proxy: optional proxy passed through to the HTTP helpers
    :param delta: how many days back (including today) to collect
    """
    # Dates of the last `delta` days, matching the page's date format.
    # (The loop variable is renamed so it no longer shadows `delta`.)
    ts_list = [get_special_date(i, format="%Y-%m-%d")
               for i in range(0, 0 - delta, -1)]

    # NOTE: the date query string looks like a leftover test value; the
    # actual date filtering below is driven by ts_list.
    url = "https://www.sec-wiki.com/?2019-03-04"
    r = get_request(url)
    if not r:
        return
    try:
        soup = BeautifulSoup(r.content, 'lxml')
    except Exception as e:
        logging.error("GET %s failed : %s" % (url, repr(e)))
        return

    # Each daily entry renders as
    #   <span class="dropcap">YYYY-MM-DD</span><a href="...">title</a>
    rows = soup.find_all("span", class_='dropcap')
    for row in rows:
        cur_ts = row.get_text()
        if cur_ts not in ts_list:
            continue
        a = row.next_sibling
        if not a:
            continue

        url = a.get("href")
        o, ext = parse_url(url)
        domain = o.netloc
        cur_ts = re.sub("-", "", cur_ts)  # "2019-03-04" -> "20190304"
        title = strip_n(a.get_text())

        overview = {
            'ts': cur_ts,
            'url': url,
            'title': title,
            'domain': domain,
            'domain_name': str(get_title(domain, proxy=proxy)),
        }

        # Store the overview row; INSERT OR IGNORE skips already-seen links.
        sql = d2sql(overview, table="secwiki_today_detail",
                    action="INSERT OR IGNORE ")
        if sql:
            try:
                so.execute(sql)
            except Exception as e:
                logging.error("[secwiki_today_sql]: "
                              "sql(%s) error(%s)" % (sql, str(e)))

        st = "{ts}\t{url}\t{title}\t{domain}\t{domain_name}".format(
            ts=overview.get("ts"),
            url=overview.get("url"),
            title=overview.get("title"),
            domain=overview.get("domain"),
            domain_name=overview.get("domain_name"),
        )
        print(st)

        # Dispatch to a source-specific scraper for richer detail rows.
        ts = overview.get("ts")
        tag = overview.get("tag", "")
        sql = ""
        if url.find("://twitter.com") != -1:
            d = get_twitter_info(url, title, ts=ts, tag=tag, proxy=proxy)
            if d:
                sql = d2sql(d, table="twitter")
        elif url.find("weixin.qq.com") != -1:
            d = get_weixin_info(url, ts, tag)
            if d:
                sql = d2sql(d, table="weixin")
        elif url.find("//github.com") != -1:
            d = get_github_info(url, title, ts=ts, tag=tag)
            if d:
                sql = d2sql(d, table='github')

        if sql:
            try:
                print(sql)
                so.execute(sql)
            except Exception as e:
                logging.error("[sql]: %s %s" % (sql, str(e)))
if __name__ == "__main__":
    # Scrape the last 7 days of sec-wiki entries into data/scrap.db.
    proxy = None
    so = SQLiteOper("data/scrap.db")
    scraw(so, proxy=proxy, delta=7)
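
# Example run (assumes Python 2, the mills helpers on PYTHONPATH, and an
# existing data/ directory that SQLiteOper can create/open scrap.db in):
#   $ python secwiki_today.py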