# step1.py
# from https://stackoverflow.com/a/72241008/1623645
import time
import selenium.webdriver
from bs4 import BeautifulSoup
# Walk the GitHub "dependents" listing of `repo` page by page in a
# Selenium-driven Chrome session and write one "owner/name" line per listed
# repository to numba-dependents.txt.
driver = selenium.webdriver.Chrome()
repo = "numba/numba"
# NOTE(review): the dependents_after value looks like a pagination cursor left
# over from resuming an earlier run -- confirm. The output file is opened with
# "w", so whatever a previous run collected is overwritten.
url = f'https://github.com/{repo}/network/dependents?dependents_after=MjgxMjEwMDkxMzU'
nextExists = True
total = 0
try:
    # The context manager closes the file even if a page load or parse fails.
    with open("numba-dependents.txt", "w", encoding="utf-8") as output:
        while nextExists:
            print(f"{time.strftime('%H:%M:%S')}: get {url}")
            driver.get(url)
            # Reload the same page once a minute for as long as GitHub
            # answers with a rate-limit or "unavailable" page.
            while (
                "exceeded a secondary rate limit" in driver.page_source
                or "dependents are currently unavailable" in driver.page_source
            ):
                print(f"{time.strftime('%H:%M:%S')}: wait 1 minute...")
                time.sleep(60)
                driver.get(url)
            soup = BeautifulSoup(driver.page_source, "html.parser")
            # Each "Box-row" div contributes one "<first link text>/<second
            # link text>" entry, taken from its two hovercard anchors.
            batch = [
                "{}/{}".format(
                    t.find('a', {"data-repository-hovercards-enabled": ""}).text,
                    t.find('a', {"data-hovercard-type": "repository"}).text,
                )
                for t in soup.find_all("div", {"class": "Box-row"})
            ]
            total += len(batch)
            print(f"{time.strftime('%H:%M:%S')}: found {len(batch)} (running total: {total})")
            # One line per entry; an empty batch writes nothing instead of a
            # stray blank line.
            output.writelines(f"{name}\n" for name in batch)
            # Flush after every page so partial results survive an abort.
            output.flush()
            nextExists = False
            # A page without a pagination container simply ends the crawl
            # rather than raising AttributeError on None.
            pager = soup.find("div", {"class": "paginate-container"})
            links = pager.find_all('a') if pager is not None else []
            for u in links:
                if u.text == "Next":
                    nextExists = True
                    url = u["href"]
            # Short pause between consecutive page requests.
            time.sleep(1)
finally:
    # Always shut the browser session down, including on errors.
    driver.quit()
print(f"{time.strftime('%H:%M:%S')}: DONE")