-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDownloadDavisFinancials_selenium.py
114 lines (104 loc) · 5.46 KB
/
DownloadDavisFinancials_selenium.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#Code for finding and printing webpages
#Need to have installed with webdriver in path. Instructions at http://selenium-python.readthedocs.io/installation.html
#Pre-load stuff
import os
import pandas as pd
import numpy as np
import yaml
from selenium import webdriver
from bs4 import BeautifulSoup
import camelot
#User Inputted Parameters in filename in same directory
inputFile = "DavisFinancials.yaml"
with open(inputFile, 'r') as stream:
params = yaml.load(stream, yaml.SafeLoader)
#Parse Inputs
url = params['Inputs']['website']
savedir = params['Outputs']['outputDir']
if os.path.isdir(savedir) == False:
os.mkdir(savedir)
#Start Driver
driver = webdriver.Firefox()
#Get to Starting URL
driver.get(url)
assert params['Inputs']['confirmTitle'] in driver.title
#Specific TabPanel Role Elements to Extract
elContent = driver.find_elements_by_css_selector("section[role='tabpanel']")
nestedLinks = []
for el in elContent:
nestedLinks.append(el.find_elements_by_tag_name('a'))
#Flatten all links to search through
elLinks = [el for sublist in nestedLinks for el in sublist]
#Find Links to only Financial Section Only PDFs
financialLinks = []
for idx, link in enumerate(elLinks):
if params['Inputs']['PDFFindString'] in link.get_attribute('innerHTML'):
financialLinks.append(link.get_attribute('href'))
#Loop through PDFs to extract tables
pdfDict = {}
for pdflink in financialLinks:
driver.get(pdflink)
pdfTables = camelot.read_pdf(driver.current_url, flavor='stream', pages='6-8', strip_text='$') #Read in Tables of Interest
tabLoop = 0
for table in pdfTables:
if table.shape[1] > params['Inputs']['tableSizeThreshold']: #Keep only tables with more than X colums (Real Tables)
for val in params['Inputs']['falseHeaderValues']:
if val in table.data[0]: #If missed top row of header
top, bottom = table.rows[0]
newy1 = int(top + (top-bottom))
y2 = int(table.rows[-1][-1])
x1 = int(table.cols[0][0])
x2 = int(table.cols[-1][-1])
newtable = camelot.read_pdf(driver.current_url, flavor='stream', pages=str(table.page), strip_text='$', table_areas=['{0},{1},{2},{3}'.format(x1,newy1,x2,y2)])
table = newtable[0]
for val in params['Inputs']['falseFooterValues']:
if val in table.data[-1]: #If added extra falseFooterValues
top, bottom = table.rows[-1]
newy2 = int(top)
y1 = int(table.rows[0][0])
x1 = int(table.cols[0][0])
x2 = int(table.cols[-1][-1])
newtable = camelot.read_pdf(driver.current_url, flavor='stream', pages=str(table.page), strip_text='$', table_areas=['{0},{1},{2},{3}'.format(x1,y1,x2,newy2)])
table = newtable[0]
#If captured too many additional rows in top header (Determined as only one value in 1 less than number of columns)
extraHeaderRows = []
for idx, row in enumerate(table.data[:params['Inputs']['maxHeaderRows']]):
emptySpots = [i for i, x in enumerate(row) if x == '']
if len(emptySpots) == len(row)-1:
extraHeaderRows.append(idx)
curTable = table.df
curTable = curTable.drop(index=extraHeaderRows)
nHeader = curTable[0].eq('').sum() #Get Number of Header Rows
if tabLoop == 0:
tabName = params['Inputs']['tableName0'] + curTable.iloc[nHeader-1,-1]
elif tabLoop == 1:
tabName = params['Inputs']['tableName1'] + curTable.iloc[nHeader-1,-1]
if tabName == 'ChangeNetPos2016-2015': #Handle one annoying exception
curTable.iloc[:2,4] = curTable.iloc[:2,5]
#Drop columns that are empty on more than 3 quarters of rows
emptyCols = table.df.columns[table.df.eq('').sum() > table.df.index.size*.75]
curTable = curTable.drop(columns=emptyCols)
curTable = curTable.rename(index=curTable[0]).drop(columns=0) #Rename first column to indexes
nCols = curTable.columns.size
for colidx in range(0,nCols-1,2): #Copy Subheaders for Blank Columns, skipping last column
if curTable.iloc[:nHeader-1,colidx].eq('').sum() > 1:
curTable.iloc[:nHeader-1,colidx] = curTable.iloc[:nHeader-1,colidx+1]
elif curTable.iloc[:nHeader-1,colidx+1].eq('').sum() > 1:
curTable.iloc[:nHeader-1,colidx+1] = curTable.iloc[:nHeader-1,colidx]
columns = [curTable.iloc[:nHeader,col].str.cat(sep=' ').lstrip() for col in range(nCols)] #Concatenate Subheaders for Column Names
curTable = curTable.iloc[nHeader:,:]
curTable.columns = columns
for col in curTable: #Clean up Strings to Convert to Floats
curTable[col] = curTable[col].str.lstrip('$ \n') #Remove $ signs
curTable[col] = curTable[col].str.replace('(','-').str.rstrip(')') #Make negatives negative
curTable[col] = curTable[col].str.replace(',','') #Remove Commas
curTable = curTable.astype('float64', errors='ignore') #Ignore Empty Fields and leave Empty
#Add to Master Dict
pdfDict[tabName] = curTable
tabLoop += 1
#Save dictionary DataFrames to CSVs
os.chdir(savedir)
for key in pdfDict:
pdfDict[key].to_csv(key+'.csv')
#Close Webdriver
driver.quit()