forked from rithwikbabu/yf-scrape-workflow
extract_text.py
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import json
import time
#====================================
# To update scraped_text.json, all that needs to be done is run update(). It runs through
# every link in links.txt and scrapes any article that has not already been scraped.
#====================================
# Note that since this script never removes links, any dead links are re-checked on every
# run, so the script slowly takes longer and longer. It may be worth deleting or clearing
# links.txt after scraping (see the optional prune_links_file sketch near the end of the file).
#====================================
# Function to load existing links
def load_existing_links(filename):
    try:
        with open(filename, 'r') as f:
            return set(line.strip() for line in f.readlines())
    except FileNotFoundError:
        return set()

# Return the list containing elements that exist only in list2 and not in list1
def list_inverse(list1, list2):
    return [item2 for item2 in list2 if item2 not in list1]
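# Illustrative example (added here, not in the original): list_inverse([1, 2], [2, 3, 4]) == [3, 4]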
def load_existing_articles(filename):
    try:
        with open(filename, 'r') as f:
            arts = json.load(f)
            links = set(art['link'] for art in arts)
            return links
    except FileNotFoundError:
        return set()
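# For reference (annotation added here, not in the original file): scraped_text.json is a
# JSON list, and each entry is the dict built by scrape_content() below, roughly
# {"title": "...", "link": "...", "date": "<datetime attribute of the article's <time> tag>",
#  "text": "...", "stocks": ["...", ...]}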
def scrape_content(link, driver, save_path):
    # Load the initial page in the "classic" article layout (the ?.neo_opt=0 parameter)
    driver.get(link + '?.neo_opt=0')
    print('Loaded Site')
    # Click "Story continues" until we've loaded the full thing,
    # always clicking the most recent button that hasn't been clicked yet
    cont_buttons = driver.find_elements(By.XPATH, "//*[contains(text(), 'Story continues')]")
    clicked = []
    while len(list_inverse(clicked, cont_buttons)) > 0:
        button = list_inverse(clicked, cont_buttons)
        button[0].click()
        clicked.append(button[0])
        time.sleep(2)
        cont_buttons = driver.find_elements(By.XPATH, "//*[contains(text(), 'Story continues')]")
    print('Opened Full Article')
    time.sleep(1)
    # Get the main content wrapper
    contents = driver.find_elements(By.CLASS_NAME, 'caas-content-wrapper')
    if len(contents) > 0:
        content = contents[0]
    else:
        print('Article not found, skipping')
        return
    article_data = {}
    # Title
    title = driver.find_element(By.CLASS_NAME, 'caas-title-wrapper').find_element(By.XPATH, './/h1').text
    article_data['title'] = title
    # Link
    article_data['link'] = link
    # Publish time
    publish_date = content.find_element(By.CLASS_NAME, 'caas-attr-time-style').find_element(By.XPATH, './/time').get_attribute('datetime')
    article_data['date'] = publish_date
    # Body text (a newline is added between paragraphs so they don't run together)
    text = ''
    paragraphs = content.find_element(By.CLASS_NAME, 'caas-body').find_elements(By.XPATH, './/p')
    for paragraph in paragraphs:
        text = text + paragraph.text + '\n'
    article_data['text'] = text
    # Related stocks
    related_stocks = set()
    related_stock_elements = driver.find_elements(By.CLASS_NAME, 'xray-entity-title-link')
    for elem in related_stock_elements:
        if elem.text != '':
            related_stocks.add(elem.text)
    article_data['stocks'] = list(related_stocks)
    print('Got Data')
    # Read existing data from the save file
    try:
        with open(save_path, 'r') as file:
            data = json.load(file)
    except (OSError, json.JSONDecodeError):
        print('Not able to open {} to read'.format(save_path))
        return
    # Append the new article and write the updated data back to the file
    data.append(article_data)
    try:
        with open(save_path, 'w') as file:
            json.dump(data, file, indent=4)
    except OSError:
        print('Not able to open {} to save'.format(save_path))
        return
    print('Saved Data')
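# --- Optional alternative wait (sketch added here, not part of the original script). ---
# The loop above relies on fixed time.sleep() calls after each "Story continues" click.
# A hedged alternative, assuming the same XPath, is Selenium's explicit-wait API:
# WebDriverWait blocks until the condition holds or raises TimeoutException.
# `click_when_clickable` is a hypothetical helper; the original code does not use it.
def click_when_clickable(driver, xpath, timeout=10):
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    # Wait until the first element matching `xpath` is visible and enabled, then click it
    button = WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable((By.XPATH, xpath)))
    button.click()
    return button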
def update(driver, save_path, link_path):
    # Get the full list of links and the already-scraped links so we only scrape new ones
    links = load_existing_links(link_path)
    scraped_links = load_existing_articles(save_path)
    for link in links:
        if link not in scraped_links:
            print('Beginning to scrape {}'.format(link))
            scrape_content(link, driver, save_path)
            scraped_links.add(link)
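# --- Optional housekeeping (sketch added here, not part of the original workflow). ---
# The header note suggests clearing links.txt after scraping so old links are not
# re-checked forever. `prune_links_file` is a hypothetical helper along those lines:
# it rewrites links.txt keeping only links that are not yet in scraped_text.json.
# Dead links never reach the JSON file, so they would survive a prune; fully clearing
# the file, as the header suggests, remains the blunt alternative.
def prune_links_file(link_path, save_path):
    scraped = load_existing_articles(save_path)
    remaining = [link for link in load_existing_links(link_path) if link not in scraped]
    with open(link_path, 'w') as f:
        for link in remaining:
            f.write(link + '\n')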
# Set up the web driver itself to pass into the internal scraper
service = Service(executable_path='/usr/bin/chromedriver')
options = webdriver.ChromeOptions()
options.add_argument(
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36")
options.add_argument("--log-level=3")
options.add_argument("--headless")
driver = webdriver.Chrome(options=options, service=service)
try:
    update(driver, 'scraped_text.json', 'links.txt')
finally:
    # Always shut the browser down, even if a scrape raises
    driver.quit()