-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawl_all_prefectures.py
31 lines (27 loc) · 1.24 KB
/
crawl_all_prefectures.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
# A script to crawl all prefectures in Japan using crawl_knowledges.py and prefecture_portal_url.csv
import csv
import json
import os
import subprocess
import sys
def crawl_all_prefectures():
    """Crawl every prefecture listed in prefecture_portal_url.csv.

    Each CSV row is expected to hold (prefecture name, portal URL,
    allow-backward-crawling flag). For each row this invokes
    crawl_knowledges.py as a subprocess, writing its output to
    crawled_knowledges/<name>.json. A prefecture whose output file
    already exists with more than 100 entries is skipped.
    """
    with open("prefecture_portal_url.csv", "r", encoding="utf-8") as f:
        reader = csv.reader(f)
        next(reader)  # Skip the header row.
        for row in reader:
            prefecture_name = row[0]
            prefecture_url = row[1]
            allow_backward_crawling = row[2].lower() == "true"
            output_file_name = f"crawled_knowledges/{prefecture_name}.json"
            if os.path.exists(output_file_name):
                # Use a context manager so the JSON file handle is closed
                # promptly instead of being leaked.
                with open(output_file_name, encoding="utf-8") as existing:
                    existing_entry_counts = len(json.load(existing))
                # Skip when the output already exists with enough content.
                if existing_entry_counts > 100:
                    print(f"Skipping {prefecture_name}...")
                    continue
            print(f"Crawling {prefecture_name}...")
            # Build an argument list and run without a shell so URLs or
            # names containing shell metacharacters cannot break the
            # command or inject into the shell; sys.executable guarantees
            # the same interpreter that runs this script is used.
            command = [
                sys.executable,
                "crawl_knowledges.py",
                prefecture_url,
                output_file_name,
                "--max-page-count",
                "1000",
            ]
            if allow_backward_crawling:
                command.append("--allow-backward-crawling")
            subprocess.run(command)
if __name__ == "__main__":
crawl_all_prefectures()