-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape_phone_numbers.py
153 lines (120 loc) · 3.9 KB
/
scrape_phone_numbers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
import os
import os.path
import shutil
from pathlib import Path
try:
from tqdm import tqdm
except ImportError:
tqdm = lambda x: x
import requests
from bs4 import BeautifulSoup
from bs4 import NavigableString, Tag
import csv
def get_country_name(element):
    """Return the country name text found in *element*.

    The work is delegated to ``get_text_withouttag``, asking it to leave
    out ``span`` and ``sup`` children of the element. The caller is
    responsible for trimming surrounding whitespace.
    """
    return get_text_withouttag(element, 'span,sup')
def get_all_phonenumbers(element):
    """Collect the phone numbers written in bold inside *element*.

    Every ``<b>`` descendant is read with ``get_text`` and split into
    individual numbers on both separators that occur in the table:
    ``" or "`` (Poland ambulance is "999 or 112") and ``"/"`` (Angola
    ambulance is "112/116").

    Returns:
        A list of number strings in document order, with surrounding
        whitespace removed and empty fragments dropped.
    """
    numbers = []
    for bold in element.find_all('b'):
        # Normalise " or " to "/" so a single split covers both separators,
        # including a cell that mixes them.
        pieces = get_text(bold).replace(" or ", "/").split('/')
        # A <b> without direct text, or a trailing separator, would
        # otherwise contribute "" entries to the result.
        numbers.extend(piece.strip() for piece in pieces if piece.strip())
    return numbers
def get_text_withouttag(element, tag):
    """Return the text of *element*, leaving out some child tags.

    Args:
        element: Node whose ``children`` are walked.
        tag: Comma-separated tag names (e.g. ``'span,sup'``); direct
            children with one of these names are skipped entirely.

    Returns:
        The concatenation of the direct strings and the text of the
        remaining child elements, in document order.

    Note that nested elements are always processed with only ``sup``
    excluded, whatever *tag* the caller passed for the top level.
    """
    skipped_names = tag.split(',')
    fragments = []
    for child in element.children:
        if isinstance(child, NavigableString):
            fragments.append(child)
            continue
        if child.name not in skipped_names:
            # Descend into the child; below the top level only <sup> is dropped.
            fragments.append(get_text_withouttag(child, 'sup'))
    return "".join(fragments)
def get_text(element):
    """Return the text that sits directly inside *element*.

    Only the ``NavigableString`` entries of ``element.contents`` are
    joined; text nested inside child tags is not included.
    """
    direct_strings = []
    for node in element.contents:
        if isinstance(node, NavigableString):
            direct_strings.append(node)
    return "".join(direct_strings)
# Scrape the Wikipedia list of emergency telephone numbers and write one
# record per table row to a JSON file located next to this script.
URL = "https://en.wikipedia.org/wiki/List_of_emergency_telephone_numbers"

# The timeout keeps the script from blocking forever on a stalled connection;
# raise_for_status() stops an HTTP error page from being parsed as if it were
# the article (which would silently produce an empty result file).
r = requests.get(URL, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.content, 'html5lib')

countries = []
tables = soup.find_all('table', attrs={'class': 'wikitable'})
for table in tables:
    tbody = table.tbody
    for row in tbody.find_all('tr'):
        tds = row.find_all('td')
        print(tds)  # debug trace of the raw cells
        # Rows with fewer than three <td> cells are skipped.
        if len(tds) < 3:
            continue

        country_name = get_country_name(tds[0]).strip()
        police = []
        ambulance = []
        fire = []
        # The last cell of the row is read as the free-text notes.
        notes = get_text_withouttag(tds[-1], 'style,sup').strip()

        # The colspan of the cells after the country name tells which of
        # police / ambulance / fire share one number cell.
        td1 = tds[1]
        colspan = td1.get('colspan', '1')
        print(colspan)
        if colspan == '1':
            print(1)
            police = get_all_phonenumbers(td1)
            td2 = tds[2]
            colspan = td2.get('colspan', '1')
            if colspan == '1':
                # Three separate cells: police, ambulance, fire.
                ambulance = get_all_phonenumbers(td2)
                # NOTE(review): a row with exactly three cells has no tds[3]
                # and raises IndexError here - confirm such rows cannot occur.
                fire = get_all_phonenumbers(tds[3])
            elif colspan == '2':
                # Ambulance and fire share one cell.
                ambulance = fire = get_all_phonenumbers(td2)
        elif colspan == '2':
            print(2)
            # Police and ambulance share one cell; fire has its own.
            police = ambulance = get_all_phonenumbers(td1)
            fire = get_all_phonenumbers(tds[2])
        elif colspan == '3':
            print(3)
            # A single cell covers all three services.
            police = ambulance = fire = get_all_phonenumbers(td1)
        else:
            # Unhandled colspan value: the three lists stay empty.
            print(colspan)

        print(f"{country_name}, {police}, {ambulance}, {fire}, {notes}")
        country = {
            'country_name': country_name,
            # The police entry for Iran is pinned to 110 instead of the
            # scraped value.
            'police': ["110"] if country_name == "Iran" else police,
            'ambulance': ambulance,
            'fire': fire,
            'notes': notes,
        }
        countries.append(country)

dst_path = Path(__file__).parent
with open(dst_path / 'wiki_emergency_phone_numbers.json', 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps non-ASCII characters readable in the file.
    json.dump(countries, f, ensure_ascii=False, indent=4)
# filename = 'inspirational_quotes.csv'
# with open(filename, 'w', newline='') as f:
# w = csv.DictWriter(f,['theme','url','img','lines','author'])
# w.writeheader()
# for quote in quotes:
# w.writerow(quote)