# libraries for webscraping, parsing and getting stock data
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import time
import requests
# for plotting and data manipulation
import pandas as pd
import plotly.express as px
# NLTK VADER for sentiment analysis
import nltk
nltk.downloader.download('vader_lexicon')  # no-op if the lexicon is already downloaded
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# for getting current date and time to print 'last updated'
from datetime import datetime
# Get All Tickers from Dow Jones Index
# the constituents table is currently the second table on the page; the index may shift if Wikipedia's layout changes
df_dow_jones = pd.read_html("https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average")[1]
tickers = df_dow_jones['Symbol'].tolist()
# Scrape the Date, Time and News Headlines Data
finwiz_url = 'https://finviz.com/quote.ashx?t='
news_tables = {}
for ticker in tickers:
    url = finwiz_url + ticker
    req = Request(url=url, headers={"User-Agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'})
    try:
        response = urlopen(req)
    except Exception:
        time.sleep(10)  # if the request is blocked, wait 10 seconds before retrying once
        response = urlopen(req)
    # Read the contents of the response into 'html'
    html = BeautifulSoup(response, "html.parser")
    # Find 'news-table' in the Soup and load it into 'news_table'
    news_table = html.find(id='news-table')
    # Add the table to our dictionary
    news_tables[ticker] = news_table
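# news_tables now maps each ticker to the BeautifulSoup tag of its news table
# (note: if finviz returns a page without the table, the value is None and the parsing loop below will log an error for it)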
# Parse the Date, Time and News Headlines into a Python List
parsed_news = []
# Iterate through the news
for ticker, news_table in news_tables.items():
    print("Scraping news for", ticker)
    # Iterate through all tr tags in 'news_table'
    for x in news_table.findAll('tr'):
        # occasionally a row is missing its tags when the html table is poorly formatted,
        # so skip it in a try/except instead of throwing an error and exiting
        try:
            # read the headline text from the a tag only
            text = x.a.get_text()
            # split the text in the td tag into a list
            date_scrape = x.td.text.split()
            # if the length of 'date_scrape' is 1, the row only has a time, so load 'news_time'
            # and reuse the most recent 'news_date' (these names avoid shadowing the time module)
            if len(date_scrape) == 1:
                news_time = date_scrape[0]
            # else load 'news_date' as the 1st element and 'news_time' as the second
            else:
                news_date = date_scrape[0]
                news_time = date_scrape[1]
            # Append ticker, date, time and headline as a list to the 'parsed_news' list
            parsed_news.append([ticker, news_date, news_time, text])
        except Exception as e:
            print(e)
# Perform Sentiment Analysis with Vader
# Instantiate the sentiment intensity analyzer
vader = SentimentIntensityAnalyzer()
# Set column names
columns = ['ticker', 'date', 'time', 'headline']
# Convert the parsed_news list into a DataFrame called 'parsed_and_scored_news'
parsed_and_scored_news = pd.DataFrame(parsed_news, columns=columns)
# The date column contains the value "Today" instead of the actual date for today's news, so replace it
today = datetime.now().date()
parsed_and_scored_news = parsed_and_scored_news.replace("Today", today)
# Iterate through the headlines and get the polarity scores using vader
scores = parsed_and_scored_news['headline'].apply(vader.polarity_scores).tolist()
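# each polarity_scores call returns a dict of four scores, e.g. (illustrative values only):
#   vader.polarity_scores("Stocks rally on strong earnings")
#   -> {'neg': 0.0, 'neu': 0.6, 'pos': 0.4, 'compound': 0.6}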
# Convert the 'scores' list of dicts into a DataFrame
scores_df = pd.DataFrame(scores)
# Join the news DataFrame and the scores DataFrame column-wise
parsed_and_scored_news = parsed_and_scored_news.join(scores_df, rsuffix='_right')
# Convert the date column from string to datetime
parsed_and_scored_news['date'] = pd.to_datetime(parsed_and_scored_news.date).dt.date
# Group by each ticker and get the mean of all sentiment scores
# (numeric_only=True drops the non-numeric date, time and headline columns, which newer pandas would otherwise raise on)
mean_scores = parsed_and_scored_news.groupby(['ticker']).mean(numeric_only=True)
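# mean_scores is indexed by ticker, with one column per sentiment score (neg, neu, pos, compound)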
# Get Market Cap, Sector and Industry of each Ticker
# the yahooquery api previously used here no longer works, so this code instead
# scrapes the finviz website for the market cap, sector and industry of each ticker
def get_ticker_data(ticker):
    data = {}
    url_base = "https://finviz.com/quote.ashx?t="
    url = url_base + ticker
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:76.0) Gecko/20100101 Firefox/76.0'}
    soup = BeautifulSoup(requests.get(url, headers=headers).content, 'html.parser')
    # parse all values in the snapshot table into the dict: the cells alternate between keys and values
    # (the '.snapshot-td2' and '.quote-links' selectors depend on finviz's current markup)
    for i, row in enumerate(soup.select('.snapshot-td2')):
        if i % 2 == 0:
            key = row.text
        else:
            data[key] = row.text
    data['Sector'] = soup.select('.quote-links')[0].select('.tab-link')[0].text
    data['Industry'] = soup.select('.quote-links')[0].select('.tab-link')[1].text
    # convert market cap strings like '2.5B' or '800M' into floats (1B = 1e9, 1M = 1e6)
    if data['Market Cap'][-1] == 'B':
        data['Market Cap'] = float(data['Market Cap'][:-1]) * 1e9
    elif data['Market Cap'][-1] == 'M':
        data['Market Cap'] = float(data['Market Cap'][:-1]) * 1e6
    return data
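# illustrative call (field values are examples, not live data):
#   get_ticker_data('AAPL')
#   -> {'Market Cap': 2.5e12, 'Sector': 'Technology', 'Industry': 'Consumer Electronics', ...}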
sectors = []
industries = []
marketcap = []
for ticker in tickers:
    print("Getting market cap and sector/industry data for", ticker)
    data = get_ticker_data(ticker)
    marketcap.append(data['Market Cap'])
    try:
        sectors.append(data['Sector'])
    except Exception:
        sectors.append("Others")
    try:
        industries.append(data['Industry'])
    except Exception:
        industries.append("Others")
# Combine the Information Above and the Corresponding Tickers into a DataFrame
d = {'Symbol': tickers, 'Sector': sectors, 'Industry': industries, 'Market Cap': marketcap}
# create a dataframe from the dictionary
df_info = pd.DataFrame(data=d)
# Get Names of Companies from the Dow Jones DataFrame Obtained Earlier
df_info_name = df_info.merge(df_dow_jones[['Company', 'Symbol']], on='Symbol')
# Join Stock Information and Sentiment Information
# ('ticker' is the index of mean_scores, which pandas merge accepts as an index level name for left_on)
df = mean_scores.merge(df_info_name, left_on='ticker', right_on='Symbol')
df = df.rename(columns={"compound": "Sentiment Score", "neg": "Negative", "neu": "Neutral", "pos": "Positive"})
# Generate the Treemap Plot
# the 'path' parameter groups data into sectors at the highest level, then breaks it down into industry and then ticker
# the 'values' parameter uses the value of the column to determine the relative size of each box in the chart
# the color of each box follows the sentiment score
# when the mouse is hovered over a box, the negative, neutral, positive and overall sentiment scores are all shown
# the color scale runs from red (#FF0000) for negative sentiment scores through black (#000000) at 0 to green (#00FF00) for positive scores
fig = px.treemap(df, path=[px.Constant("Dow Jones"), 'Sector', 'Industry', 'Symbol'], values='Market Cap',
                 color='Sentiment Score', hover_data=['Company', 'Negative', 'Neutral', 'Positive', 'Sentiment Score'],
                 color_continuous_scale=['#FF0000', '#000000', '#00FF00'],
                 color_continuous_midpoint=0)
fig.data[0].customdata = df[['Company', 'Negative', 'Neutral', 'Positive', 'Sentiment Score']].round(3) # round to 3 decimal places
fig.data[0].texttemplate = "%{label}<br>%{customdata[4]}"
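# %{label} is the box name (sector/industry/ticker) and %{customdata[4]} is the rounded
# Sentiment Score, the 5th column of the customdata assigned in the line above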
fig.update_traces(textposition="middle center")
fig.update_layout(margin = dict(t=30, l=10, r=10, b=10), font_size=20)
# Get current date, time and timezone to print to the html page
now = datetime.now()
dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
timezone_string = now.astimezone().tzname()
# Generate HTML File with Updated Time and Treemap
with open('dow_jones_live_sentiment.html', 'w') as f:  # 'w' mode overwrites any previous contents
    title = "<h1>Dow Jones Stock Sentiment Dashboard</h1>"
    updated = "<h2>Last updated: " + dt_string + " (Timezone: " + timezone_string + ")</h2>"
    description = "This dashboard is updated every half an hour with sentiment analysis performed on the latest news headlines scraped from the FinViz website.<br><br>"
    code = """<a href="https://medium.com/datadriveninvestor/use-github-actions-to-create-a-live-stock-sentiment-dashboard-online-580a08457650">Explanatory Article</a> | <a href="https://github.com/damianboh/dow_jones_live_stock_sentiment_treemap">Source Code</a>"""
    author = """ | Created by Damian Boh, check out my <a href="https://damianboh.github.io/">GitHub Page</a>"""
    f.write(title + updated + description + code + author)
    f.write(fig.to_html(full_html=False, include_plotlyjs='cdn'))  # write the fig created above into the html file