-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy patharticle_retriever.py
142 lines (122 loc) · 5.13 KB
/
article_retriever.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
from datetime import date
import requests
from bs4 import BeautifulSoup
from models.article import Article
class ArticleRetriever:
    '''
    Data retrieval tool for stock news, scraping MarketWatch for articles
    related to a ticker. May be updated to include more financial data in
    the near future.
    1. Simple Helper Functions
    2. Scraping/Implementation Helpers
    3. Functions the Bot should use / access
    NOTE: Helper functions start with '__' to make them private to the class
    NOTE: Create an instance : AR = ArticleRetriever(), use this instance to
          call functions many times.
    '''

    # Seconds to wait on any HTTP request before giving up; without a
    # timeout, requests.get() can hang the bot indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self) -> None:
        # URL prefixes that identify MarketWatch news pages; links that do
        # not start with one of these are navigation/ads and are ignored.
        self.STORY = 'https://www.marketwatch.com/story/'
        self.ARTICLE = 'https://www.marketwatch.com/articles/'

    def __create_soup(self, url):
        '''
        REQUIRES: url formatted as string of website to scrape
        EFFECTS: returns BeautifulSoup object of site's html contents
        RAISES: requests.RequestException on network failure or timeout
        '''
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        return BeautifulSoup(response.text, 'html.parser')

    def __scrape_news_links(self, ticker):
        '''
        REQUIRES: a string representing a stock ticker
        EFFECTS: Accesses MarketWatch, searches the ticker's page for news
                 articles, filters and individualizes links to articles.
                 Returns a set of links to the ticker's stories.
        '''
        soup = self.__create_soup(
            f'https://www.marketwatch.com/investing/stock/{ticker}?mod=quote_search')
        link_set = set()
        matched_links = 0
        for a_tag in soup.find_all('a'):
            # Read the href attribute directly instead of slicing the raw
            # tag string, which broke whenever href was not the first
            # attribute of the <a> element.
            link = a_tag.get('href')
            if link is None:
                continue
            if link.startswith(self.STORY) or link.startswith(self.ARTICLE):
                matched_links += 1
                # The first ~10 story links on the page are site chrome
                # (trending/nav stories), not ticker-specific news — skip
                # them. TODO(review): confirm this offset still matches
                # MarketWatch's current page layout.
                if matched_links > 10:
                    link_set.add(link)
        return link_set

    def scrape_news_data(self, url):
        '''
        REQUIRES: string url of a MarketWatch article link
        EFFECTS: Parses the article webpage's text for the title and body,
                 returning an Article object, or the string "BAD ARTICLE"
                 when the page is not a usable article.
        '''
        soup = self.__create_soup(url)
        page_text = soup.get_text()

        # Extract the title text, dropping the trailing "- MarketWatch"
        # style suffix after the last dash. Fall back to the full title
        # when no dash exists (the original rindex() raised ValueError).
        title_tag = str(soup.find("title"))
        title_body = title_tag[title_tag.index(">") + 1:]
        dash = title_body.rfind("-")
        title = title_body[:dash] if dash != -1 else title_body

        article_content = ""
        advertisement_count = 0
        beginning_junk = True
        # Words that mark the start of footer/byline junk after the body.
        words_that_end = ["Editor's", "Read", "Read:", "read:", "Write", "Email"]
        for word in page_text.split():
            if word == 'Advertisement':
                advertisement_count += 1
                continue
            # The article body sits between the 4th and 5th "Advertisement"
            # markers on MarketWatch pages — collect only inside that window.
            if advertisement_count == 4:
                # Skip leading non-word junk until the first normal
                # (alphabetic, not all-caps) word appears.
                if beginning_junk and word.isalpha() and not word.isupper():
                    beginning_junk = False
                if not beginning_junk:
                    if word in words_that_end:
                        break
                    article_content += word + " "

        # Pages that only show community guidelines, or quote-performance
        # pages ("How XYZ performs..."), carry no article content.
        if "Community Guidelines" in article_content or "performs" in title:
            return "BAD ARTICLE"
        return Article(title, url, date.today(), article_content)

    # For website to display brief intro on mainpage
    def get_article_intro(self, article):
        '''
        REQUIRES: an Article object with a string `contents` attribute
        EFFECTS: returns a string of the first 40 words in the article
                 (each followed by a space), as a brief intro for the site
        '''
        words = article.contents.split()[:40]
        return "".join(word + " " for word in words)

    # main function bot will use to gather articles for stocks
    def get_stock_news(self, ticker):
        '''
        REQUIRES: a string representing a valid ticker / symbol
        EFFECTS: gathers news links related to a ticker, parses each link
                 for article content, and returns a list of Article objects
                 (unusable pages are silently dropped).
        NOTE: at some point the deletion of past stock news should be added
              before the new article contents are stored, as well as the
              updating of paths to reflect individual tickers for sorting
              and storage simplicity.
        '''
        links = self.__scrape_news_links(ticker)
        news_articles = []
        for link in links:
            article = self.scrape_news_data(link)
            if article != "BAD ARTICLE":
                news_articles.append(article)
        return news_articles
# Example usage (note: get_article_intro takes an Article object, not a path):
# from article_retriever import ArticleRetriever
# AR = ArticleRetriever()
# articles = AR.get_stock_news("TSLA")
# intro = AR.get_article_intro(articles[0])