-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathsec_crawler.py
113 lines (89 loc) · 4.75 KB
/
sec_crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
"""
sec.gov crawler that downloads exhibit 10 form filings of type 10-K, 10-Q, and 8-K (i.e. material contracts)
sec.gov allows 10 requests per second https://www.sec.gov/privacy.htm#security
"""
import re
import requests
import os
import time
import zipfile
# Local root directory where index archives and filings are stored.
OUTDIR = '/home/don/resources/sec_crawler/data/'
# Base of every EDGAR archive URL built below.
BASE_URL = 'https://www.sec.gov/Archives/'
# Crawl newest-first, from 2015 back through 1993.
YEARS = range(2015, 1992, -1)
# The four quarterly full-index directories published per year.
QS = ['QTR%d' % quarter for quarter in range(1, 5)]
# Form types whose filings we keep.
VALID_FORMS = ['10-K', '10-Q', '8-K']
# Pause between HTTP requests (seconds); keeps us under sec.gov's rate limit.
SLEEP_TIME = 0.15
def fetch_master_files():
    """Download the quarterly EDGAR full-index archives (master.zip).

    For every year in YEARS and quarter in QS, fetch
    BASE_URL + 'edgar/full-index/<year>/<QTRn>/master.zip' into
    OUTDIR/<year>/<QTRn>/master.zip.  Archives already on disk are
    skipped, so the crawl is resumable.  Sleeps SLEEP_TIME before each
    request to respect sec.gov's rate limit (see module docstring).
    """
    # sec.gov's fair-access policy rejects requests without a descriptive
    # User-Agent -- replace the placeholder contact address before running.
    headers = {'User-Agent': 'sec_crawler admin@example.com'}
    for year in YEARS:
        year = str(year)
        for q in QS:
            outdir_year_q = os.path.join(OUTDIR, year, q)
            # exist_ok avoids the check-then-create race of the
            # exists()/makedirs() pattern.
            os.makedirs(outdir_year_q, exist_ok=True)
            outdir_year_q_master = os.path.join(outdir_year_q, 'master.zip')
            if os.path.exists(outdir_year_q_master):
                continue  # already downloaded
            master_url = BASE_URL + 'edgar/full-index/' + year + '/' + q + '/master.zip'
            print('Downloading', master_url)
            time.sleep(SLEEP_TIME)
            response = requests.get(master_url, headers=headers)
            # Quarters that do not exist return an error page; do not save
            # that as a bogus master.zip.
            if not response.ok:
                print('Skipping (HTTP %d)' % response.status_code, master_url)
                continue
            with open(outdir_year_q_master, 'wb') as f:
                f.write(response.content)
def crawl_master_files():
    """Parse the downloaded master indexes and fetch exhibit-10 contracts.

    For every saved master.zip, read master.idx, keep rows whose form
    type is in VALID_FORMS, download each filing's index page, and from
    that page download every EX-10 (material contract) document in
    HTML form.  Files already on disk are skipped, so the crawl is
    resumable.  Sleeps SLEEP_TIME before each request to respect
    sec.gov's rate limit (see module docstring).
    """
    # sec.gov's fair-access policy rejects requests without a descriptive
    # User-Agent -- replace the placeholder contact address before running.
    headers = {'User-Agent': 'sec_crawler admin@example.com'}
    for year in YEARS:
        print(year)
        year = str(year)
        outdir_year = os.path.join(OUTDIR, year)
        for q in QS:
            print(q)
            outdir_year_q = os.path.join(outdir_year, q)
            outdir_year_q_master = os.path.join(outdir_year_q, 'master.zip')
            try:
                z = zipfile.ZipFile(outdir_year_q_master)
            except (FileNotFoundError, zipfile.BadZipFile):
                # Missing or corrupt archive, e.g. a quarter that does
                # not exist (2019 QTR3) -- skip it.
                continue
            # Close the archive deterministically (the original leaked it).
            with z, z.open('master.idx') as f:
                for raw in f:
                    line = raw.decode('utf8', errors='ignore')
                    # Data rows start with a CIK number; skip headers/separators.
                    if not line or not line[0].isdigit():
                        continue
                    # master.idx columns: CIK|Company Name|Form Type|Date Filed|Filename
                    fields = line.split('|')
                    if len(fields) < 5 or fields[2] not in VALID_FORMS:
                        continue
                    filing_path = fields[4].strip()  # e.g. edgar/data/<CIK>/<accession>.txt
                    filing_txt = filing_path.split('/')[-1]
                    filing_id = filing_txt.replace('-', '').replace('.txt', '')
                    filing_dir = os.path.join(outdir_year_q, filing_id)
                    os.makedirs(filing_dir, exist_ok=True)
                    filing_index = os.path.join(filing_dir, filing_txt.replace('.txt', '') + '-index.html')
                    if not os.path.exists(filing_index):  # resume: skip indexes already downloaded
                        # Build the URL with '/' only (os.path.join would use
                        # '\\' on Windows) and keep the CIK directory from the
                        # master-index path -- the original dropped it, which
                        # does not match EDGAR's documented URL layout.
                        index_url = BASE_URL + filing_path.replace('.txt', '') + '-index.html'
                        print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), year, q, 'Downloading index', index_url)
                        time.sleep(SLEEP_TIME)
                        index_html = requests.get(index_url, headers=headers)
                        with open(filing_index, 'w', encoding='utf8') as out:
                            out.write(index_html.text)
                    # Load the saved index page and scan its table rows.
                    with open(filing_index, encoding='utf8', errors='ignore') as idx:
                        index_html = idx.read()
                    for row in re.findall('<tr[^>]*>(.*?)</tr>', index_html, re.S):
                        if '<td' not in row:
                            continue
                        tds = re.split('</?td[^>]*>', row)
                        # tds[7] holds the Type column; tds[5] the Document link cell.
                        if len(tds) < 8 or not tds[7].startswith('EX-10'):
                            continue
                        match = re.search('"(.+)"', tds[5])
                        if match is None:
                            continue  # no quoted href in the document cell
                        file_name = match.group(1)
                        file_url = 'https://www.sec.gov' + file_name
                        if not (file_url.endswith('htm') or file_url.endswith('html')):
                            continue
                        filing_file = os.path.join(filing_dir, file_name.split('/')[-1])
                        if os.path.exists(filing_file):
                            continue  # already downloaded
                        print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), year, q, 'Downloading contract', file_url)
                        # The original skipped this sleep, violating the rate
                        # limit stated in the module docstring.
                        time.sleep(SLEEP_TIME)
                        filing_html = requests.get(file_url, headers=headers)
                        with open(filing_file, 'w', encoding='utf8') as out:
                            out.write(filing_html.text)
if __name__ == '__main__':
    # Two-stage crawl: first the quarterly index archives, then the
    # contracts they point at.
    for message, stage in (('Fetching master files', fetch_master_files),
                           ('Fetching contracts', crawl_master_files)):
        print(message)
        stage()