forked from mattpearson/etfanalytics
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathetf_models.py
254 lines (208 loc) · 8.51 KB
/
etf_models.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
from bs4 import BeautifulSoup as bs
import pandas as pd
import numpy as np
import urllib.request
import html5lib
import ast
import re
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
"""
Fetch expenses and holdings for a single ETF.
Three potential sources (etfdailynews, Zacks, Bloomberg) are used to scrape data.
"""
class ETFData:
def __init__(self, ticker):
self.ticker = ticker
self.holdings = None
self.num_holdings = None
self.expense_ratio = None
@staticmethod
def convert_percent(pct):
if type(pct)==float:
return pct/100
return float(pct.rstrip('%'))/100
def get_expenses(self, ticker):
"""
Fetch expense ratio from Bloomberg
"""
url = "http://bloomberg.com/quote/" + str(ticker) + ":US"
html = urllib.request.urlopen(url).read()
soup = bs(html, "lxml")
ratio_pattern = re.compile(r'Expense Ratio')
percent_pattern = re.compile(r'%$')
ratio = soup.find('div', text=ratio_pattern).find_next_sibling().text.rstrip('\n ').lstrip('\n ')
self.expense_ratio = self.convert_percent(ratio)
"""
Y Finance version no longer working due to changes to site
"""
# def get_expenses(self, ticker):
# """
# Fetch expense ratio from Y Finance
# """
# url = 'http://finance.yahoo.com/q/pr?s='+str(ticker)+'+Profile'
# html = urllib.request.urlopen(url).read()
# soup = bs(html, "lxml")
# percent_pattern = re.compile(r'%$')
# ratio = soup.find('td', text='Annual Report Expense Ratio (net)').find_next_sibling('td', text=percent_pattern).text
# self.expense_ratio = self.convert_percent(ratio)
def holdings_first_parse(self, ticker):
"""
Default API for fetching ETF holdings. Generates a pandas DataFrame with name,
ticker, allocation for each holding. Equity ETFs only.
"""
url='http://etfdailynews.com/tools/what-is-in-your-etf/?FundVariable=' + str(ticker)
# decode to unicode, then re-encode to utf-8 to avoid gzip
html = urllib.request.urlopen(url).read().decode('cp1252').encode('utf-8')
soup = bs(html, "lxml")
# Build Holdings Table - find the only tbody element on the page
holdings_table = "<table>" + str(soup.tbody).lstrip('<tbody>').rstrip('</tbody') + "</table>"
# Fetch expense ratio
ratio_pattern = re.compile(r'Expense Ratio')
percent_pattern = re.compile(r'%$')
td = soup.find('td', text=ratio_pattern)
if not td:
return False
# find_next_siblings returns a Result Set object - take first matching item and strip the tags
expense_ratio = str(td.find_next_siblings('td', text=percent_pattern)[0]).lstrip('<td>').rstrip('</td>')
self.expense_ratio = self.convert_percent(expense_ratio)
# convert to DataFrame
df = pd.read_html(holdings_table)[0]
df.columns = ['name', 'ticker', 'allocation']
df['allocation'] = df.allocation.map(lambda x: self.convert_percent(x))
self.holdings, self.num_holdings = df, len(df)
def holdings_second_parse(self, ticker):
"""
Backup source for holdings (zacks.com). Slower (data is parsed from string,
not read into pandas from table element as in holdings_first_parse) and less reliable data. Output is in same format.
Zacks may not have data for some ETFs.
"""
def clean_name(str_input):
if "<span" in str_input:
soup = bs(str_input, "lxml")
return soup.find('span')['onmouseover'].lstrip("tooltip.show('").rstrip(".');")
return str_input
def clean_ticker(str_input):
soup = bs(str_input, "lxml")
return soup.find('a').text
def clean_allocation(str_input):
if str_input == "NA":
return 0
return float(str_input)/100
url = 'https://www.zacks.com/funds/etf/' + str(ticker) + '/holding'
html = urllib.request.urlopen(url).read().decode('cp1252')
str_start, str_end = html.find('data: [ [ '), html.find(' ] ]')
if str_start == -1 or str_end == -1:
# If Zacks does not have data for the given ETF
print("Could not fetch data for {}".format(ticker))
return
list_str = "[["+html[(str_start+12):str_end]+"]]"
holdings_list = ast.literal_eval(list_str)
df = pd.DataFrame(holdings_list).drop(2,1).drop(4,1).drop(5,1)
df.columns = ['name', 'ticker', 'allocation']
df['allocation'] = df.allocation.map(lambda x: clean_allocation(x))
df['name'] = df.name.map(lambda x: clean_name(x))
df['ticker'] = df.ticker.map(lambda x: clean_ticker(x))
self.holdings, self.num_holdings = df, len(df)
self.get_expenses(ticker)
# print(df['allocation'].sum())
@classmethod
def get(cls, ticker, parse_method):
ticker = ticker.upper()
data = cls(ticker)
result = data.holdings_second_parse(ticker) if parse_method=="second" else data.holdings_first_parse(ticker)
return data
"""
Build a portfolio of ETFs and calculate the weight of individual securities within the portfolio
and its overall expense ratio.
"""
class Portfolio:
def __init__(self, parse_method="first"):
self.parse_method = parse_method
self.port_etfs = pd.DataFrame({
'etf':[],
'allocation':[],
'true_weight':[],
'holdings':[],
'expenses':[],
'weighted_expenses':[]
})
self.num_etfs = 0
self.num_holdings = 0
self.port_holdings = None
self.port_expenses = None
def display_portfolio(self):
return self.port_etfs[['etf', 'allocation', 'true_weight', 'expenses', 'weighted_expenses']]
def calculate_weight(self, allocation):
# current total user-entered allocation
total_allocation = self.port_etfs['allocation'].sum(axis=0) + allocation
# Calculate current real allocation
def true_weight(alloc_input):
weight_factor = 1/total_allocation
return weight_factor*alloc_input
# Update real allocation and weighted expenses for each row
self.port_etfs['true_weight'] = self.port_etfs.allocation.map(lambda x: true_weight(x))
self.port_etfs['weighted_expenses'] = self.port_etfs['expenses']*self.port_etfs['true_weight']
return true_weight(allocation)
def add(self, ticker, allocation):
allocation = float(allocation)/100
weight = self.calculate_weight(allocation)
etf = ETFData.get(ticker, self.parse_method)
# if information not found, break
if not etf.expense_ratio:
return
weighted_expenses = etf.expense_ratio*weight
# New ETF row
self.port_etfs = self.port_etfs.append({'etf':ticker,
'allocation':allocation,
'true_weight': weight,
'holdings':etf.holdings,
'expenses':etf.expense_ratio,
'weighted_expenses':weighted_expenses}, ignore_index=True).sort_values(by='etf')
self.num_etfs += 1
print("{} added".format(ticker))
def get_port_expenses(self):
self.port_expenses = self.port_etfs['weighted_expenses'].sum()
return self.port_expenses
def get_stock_allocation(self):
"""
Returns DataFrame with portfolio weight of all individual constituent stocks
"""
all_holdings = pd.DataFrame({'name':[], 'ticker':[], 'allocation':[], 'portfolio_weight':[]})
for each in zip(self.port_etfs['holdings'], self.port_etfs['true_weight']):
each[0]['portfolio_weight'] = each[0].allocation * each[1]
all_holdings = all_holdings.append(each[0], ignore_index=True)
# Convert holdings with NaN or Cash tickers to same symbol to be grouped together in next step
all_holdings.ix[all_holdings.ticker.isin(["CASH_USD", "USD", np.nan]), 'ticker'] = 'N/A'
all_holdings.ix[all_holdings.ticker=="N/A", 'name'] = 'Cash/Other'
# Drop rows with NaN portfolio weight by only saving finite values
all_holdings = all_holdings[np.isfinite(all_holdings['portfolio_weight'])]
# Group holdings by ticker and add respective weights, match with names
names = all_holdings.drop_duplicates(subset='ticker')[['name','ticker']]
grouped_holdings = pd.DataFrame(all_holdings.groupby('ticker')['portfolio_weight'].sum()).reset_index()
grouped_holdings = pd.merge(grouped_holdings, names, left_on='ticker', right_on='ticker', how='inner').sort_values(by='portfolio_weight', ascending=False).reset_index()
self.port_holdings, self.num_holdings = grouped_holdings, len(grouped_holdings)
return grouped_holdings
"""
Print all relevant information for the ETF portfolio
"""
def report(self):
print()
print("Number of ETFs added to portfolio: {}".format(self.num_etfs))
print("Overall portfolio expense ratio: {}".format(str(self.port_expenses*100)+'%'))
print("Number of total individual holdings: {}".format(self.num_holdings))
print("Top 50 holdings: ")
print(self.port_holdings[['name', 'portfolio_weight']].head(50))
if __name__ == "__main__":
# a = Portfolio(parse_method="second")
a = Portfolio()
a.add('VTI', 20)
a.add('SPY', 20)
a.add('XLK', 10)
a.add('IBB', 25)
a.add('HDV', 15)
a.add('MGK', 5)
a.add('IYH', 5)
a.get_stock_allocation()
a.get_port_expenses()
a.report()