# 100lines_scrp.py
import time

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
web = "https://twitter.com/i/flow/login"
driver = webdriver.Firefox()
driver.get(web)
driver.maximize_window()
# wait of 6 seconds to let the page load the content
time.sleep(6) # this time might vary depending on your computer
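# Alternative (sketch): instead of a fixed sleep, wait explicitly for the username field, e.g.
# WebDriverWait(driver, 20).until(
#     EC.presence_of_element_located((By.XPATH, '//input[@autocomplete="username"]')))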

# Locate the username input and type the account email or handle.
username = driver.find_element("xpath", '//input[@autocomplete="username"]')
username.send_keys("USERNAME")  # write your email here
# username.send_keys(os.environ.get("TWITTER_USER"))  # alternative: read it from an environment variable (requires `import os`)

# Click the "Next" button.
next_button = driver.find_element("xpath", '//div[@role="button"]//span[text()="Next"]')
next_button.click()

# Wait 2 seconds after clicking "Next".
time.sleep(2)

# Locate the password input and type the password.
password = driver.find_element("xpath", '//input[@autocomplete="current-password"]')
password.send_keys("PASSWORD")  # write your password here
# password.send_keys(os.environ.get("TWITTER_PASS"))  # alternative: read it from an environment variable (requires `import os`)

# Locate the "Log in" button and click it.
login_button = driver.find_element("xpath", '//div[@role="button"]//span[text()="Log in"]')
login_button.click()
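
# Note: Twitter/X sometimes adds an extra verification step after login (for example, asking
# for a phone number or username); this script does not handle that case.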
time.sleep(10)  # give the login flow time to finish

# Navigate to the tweet whose replies will be scraped.
web = "https://twitter.com/nmrduino/status/1628351274394480640"
# web = "https://twitter.com/TwitterSupport"
driver.get(web)
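# The scraping loop below should also work on a profile timeline (such as the commented-out
# URL above), since it selects every article[@data-testid='tweet'] on the page.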


def get_tweet(element):
    """Return [user_handle, tweet_text] for one tweet <article> element."""
    try:
        user = element.find_element("xpath", ".//span[contains(text(), '@')]").text
        text = element.find_element("xpath", ".//div[@data-testid='tweetText']").text
        tweet_data = [user, text]
    except Exception:
        # Fall back to placeholder values if either sub-element is missing.
        tweet_data = ['user', 'text']
    return tweet_data
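
# Example (hypothetical values): for a loaded reply element `el`,
# get_tweet(el) might return something like ['@some_user', 'reply text'].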

user_data = []
text_data = []
tweet_ids = set()  # used to de-duplicate tweets across scroll passes

scrolling = True
while scrolling:
    # Collect every tweet <article> currently rendered on the page.
    tweets = WebDriverWait(driver, 5).until(
        EC.presence_of_all_elements_located((By.XPATH, "//article[@data-testid='tweet']")))
    # print(len(tweets))

    # NOTE: only tweets currently loaded in the DOM are found here; Twitter unloads tweets that
    # are scrolled far out of view, but anything already recorded stays in user_data/text_data.
    for tweet in tweets:
        tweet_list = get_tweet(tweet)
        tweet_id = ''.join(tweet_list)  # crude ID: handle + text
        if tweet_id not in tweet_ids:
            tweet_ids.add(tweet_id)
            print(tweet_list)
            user_data.append(tweet_list[0])
            text_data.append(" ".join(tweet_list[1].split()))  # collapse internal whitespace

    # Get the current scroll height.
    last_height = driver.execute_script("return document.documentElement.scrollHeight")
    while True:
        # Scroll down to the bottom of the page.
        driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
        # Wait for new content to load.
        time.sleep(3)
        # Calculate the new scroll height and compare it with the last one.
        new_height = driver.execute_script("return document.documentElement.scrollHeight")
        print(str(last_height))
        print(str(new_height))
        print(len(tweet_ids))
        if len(tweet_ids) > 100 or new_height == last_height:
            # Stop once more than 100 unique tweets are collected or the page stops growing.
            scrolling = False
            break
        else:
            last_height = new_height
            break  # scroll once, then go back and re-scan the newly loaded tweets

driver.quit()

# Save the scraped tweets to a CSV file.
df_tweets = pd.DataFrame({'user': user_data, 'text': text_data})
df_tweets.to_csv('tweets_pagination.csv', index=False)
print(df_tweets)
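
# To inspect the results later (sketch):
# df = pd.read_csv('tweets_pagination.csv')
# print(df.head())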