-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path: scrape_db.py
44 lines (37 loc) · 1.47 KB
/
scrape_db.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import requests
import pandas as pd
from bs4 import BeautifulSoup
from functools import lru_cache
@lru_cache(maxsize=50)
def getDataStation(url):
    """Fetch a VDL stop page and return its real-time station id.

    Parses the page at *url* and reads the ``data-station`` attribute from
    the ``<div data-component="Realtime">`` element.

    Args:
        url: Absolute URL of a single bus-stop page on vdl.lu.

    Returns:
        The station id string used by the real-time departures widget.

    Raises:
        requests.RequestException: on network failure or timeout.
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        TypeError: if the expected Realtime div is absent from the page.
    """
    # Timeout so a stalled server cannot hang the whole scrape; the
    # status check turns an HTML error page into a loud failure instead
    # of a confusing parse error further down.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")
    return soup.find('div', {"data-component": "Realtime"})["data-station"]
def main():
    """Scrape every VDL bus stop (index pages A-Z) into ``lux_bus.csv``.

    For each letter's index page, collects every stop/direction entry:
    its URL, a human-readable stop name derived from the URL slug, the
    line, the destination, and the real-time station id fetched via
    ``getDataStation``. Writes the result as a CSV with columns
    stop/line/destination/url/station_id.
    """
    base_url = ('https://www.vdl.lu/fr/se-deplacer/en-bus/'
                'horaires-et-depart-en-temps-reel/arrets/')
    urls = []
    lines = []
    destinations = []
    stops = []
    station_ids = []
    # 'a'..'z' — one index page per initial letter.
    for letter in map(chr, range(97, 123)):
        # Timeout so one dead index page cannot hang the whole run.
        response = requests.get(base_url + letter.capitalize(), timeout=30)
        soup = BeautifulSoup(response.text, "lxml")
        print(letter)  # progress indicator
        for direction in soup.find_all(class_='panel-list-item'):
            url = 'https://vdl.lu' + direction.find(
                'div', {'role': 'article'})["data-url"]
            # Entry text looks like "<line> Direction <destination>";
            # compute it once instead of re-parsing per field.
            text = direction.get_text().replace("\n", "")
            line_part, destination_part = text.split("Direction", 1)
            urls.append(url)
            # Last URL segment is the stop slug, e.g. "gare-centrale".
            stops.append(url.split("/")[-1].replace("-", " ").title())
            lines.append(line_part.strip())
            destinations.append(destination_part.strip())
            station_ids.append(getDataStation(url))
    # Plain dict literal; the original bound the builtin name `dict`.
    records = {
        "stop": stops,
        "line": lines,
        "destination": destinations,
        "url": urls,
        "station_id": station_ids,
    }
    df = pd.DataFrame(records)
    # NOTE(review): latin1 will raise UnicodeEncodeError on any character
    # outside Latin-1 (e.g. "œ", "–"); consider utf-8 if downstream
    # consumers allow it — kept as-is to preserve the output format.
    df.to_csv('lux_bus.csv', index=False, encoding='latin1')
# Entry point: run the scrape only when executed as a script, not on import.
if __name__ == '__main__':
    main()