-
Notifications
You must be signed in to change notification settings - Fork 0
/
cni.py
118 lines (100 loc) · 5.37 KB
/
cni.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import pandas as pd
# pip install pypinyin
from pypinyin import lazy_pinyin
import requests
from bs4 import BeautifulSoup
import csv
def get_latest_inst_name_id(url):
output_id = ""
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
output_id = soup.find('input', {'name': 'c_inst_name_code'})['value']
return output_id
def get_latest_inst_id(url):
output_id = ""
response = requests.get(url)
last_page_url = response.json()['last_page_url']
response = requests.get(last_page_url)
output_id = response.json()['data'][-1]['c_inst_code']
return output_id
input_pd = pd.read_csv('input.txt', delimiter='\t', encoding='utf-8', dtype=str, header=None)
input_name_list = [x.strip() for x in input_pd[0].tolist()]
input_name_id_list = []
input_type_list = input_pd[1].tolist()
input_dynasty_list = input_pd[2].tolist()
input_addr_id_list = [x.strip() for x in input_pd[4].tolist()]
input_source_list = [x.strip() for x in input_pd[5].tolist()]
latest_inst_name_id = ""
call_latest_inst_name_url = "https://input.cbdb.fas.harvard.edu/socialinstitutioncodes/create"
latest_inst_name_id = get_latest_inst_name_id(call_latest_inst_name_url)
latest_inst_id = ""
call_latest_inst_url = "https://input.cbdb.fas.harvard.edu/api/select/search/socialinstcode"
latest_inst_id = get_latest_inst_id(call_latest_inst_url)
dy_df = pd.read_csv('DYNASTIES.csv', delimiter=',', encoding='utf-8', dtype=str)
dynasty_dict = dict(zip(dy_df['c_dynasty_chn'], dy_df['c_dy']))
inst_type_df = pd.read_csv('SOCIAL_INSTITUTION_TYPES.csv', delimiter=',', encoding='utf-8', dtype=str)
inst_type_dict = dict(zip(inst_type_df['c_inst_type_hz'], inst_type_df['c_inst_type_code']))
print(latest_inst_name_id)
print(latest_inst_id)
inst_name_sql_list = []
inst_code_sql_list = []
inst_addr_sql_list = []
inst_info_list = []
# remove the content in existing_inst_name_filename
existing_inst_name_filename = 'existing_inst_name.csv'
with open(existing_inst_name_filename, 'w', encoding='utf-8') as f:
f.write('')
# check whether inst_name is in the database
print("Existing inst_name check...")
for i in range(len(input_name_list)):
inst_name = input_name_list[i]
url = "https://input.cbdb.fas.harvard.edu/api/select/search/socialinstcode?q=" + inst_name
response = requests.get(url)
if response.json()['total'] != 0:
inst_name_id = response.json()['data'][0]['c_inst_name_code']
print(f"{inst_name},{inst_name_id}")
data_list = [response.json()['data']][0]
column_names = data_list[0].keys()
# print(column_names)
# with open(existing_inst_name_filename, mode='a+', newline='', encoding='utf-8') as csvfile:
# writer = csv.DictWriter(csvfile, fieldnames=column_names)
# writer.writeheader()
# for row in data_list:
# writer.writerow(row)
# raise Exception('Find existing inst_name, please check existing_inst_name.csv')
input_name_id_list.append(inst_name_id)
else:
input_name_id_list.append("")
for i in range(len(input_name_list)):
inst_name_sql = ""
inst_code_sql = ""
inst_addr_sql = ""
if input_name_id_list[i] != "":
new_inst_name_id = int(input_name_id_list[i])
else:
new_inst_name_id = int(latest_inst_name_id) + i + 1
new_inst_id = int(latest_inst_id) + i + 1
inst_name = input_name_list[i]
inst_type_id = inst_type_dict[input_type_list[i]]
inst_dynasty_id = dynasty_dict[input_dynasty_list[i]]
inst_addr_id = input_addr_id_list[i]
inst_source = input_source_list[i]
inst_name_pinyin = ' '.join(lazy_pinyin(inst_name))
# Don't create new inst_name if it is already in the database
if input_name_id_list[i] == "":
inst_name_sql = "INSERT INTO `SOCIAL_INSTITUTION_NAME_CODES` (`c_inst_name_code`, `c_inst_name_hz`, `c_inst_name_py`) VALUES ('" + str(new_inst_name_id) + "', '" + inst_name + "', '" + inst_name_pinyin + "');"
inst_name_sql_list.append(inst_name_sql)
inst_code_sql = "INSERT INTO `SOCIAL_INSTITUTION_CODES` (`c_inst_name_code`, `c_inst_code`, `c_inst_type_code`, `c_inst_begin_dy`, `c_source`, `c_inst_floruit_dy`) VALUES ('" + str(new_inst_name_id) + "', '" + str(new_inst_id) + "', '" + inst_type_id + "', '" + inst_dynasty_id + "', '" + inst_source + "', '" + inst_dynasty_id + "');"
inst_code_sql_list.append(inst_code_sql)
inst_addr_sql = "INSERT INTO `SOCIAL_INSTITUTION_ADDR` (`c_inst_name_code`, `c_inst_code`, `c_inst_addr_type_code`, `c_inst_addr_id`, `inst_xcoord`, `inst_ycoord`, `c_source`) VALUES ('" + str(new_inst_name_id) + "', '" + str(new_inst_id) + "', '1', '" + inst_addr_id + "', '0', '0', '" + inst_source + "');"
inst_addr_sql_list.append(inst_addr_sql)
inst_info_list.append([new_inst_id, inst_name, new_inst_name_id, inst_type_id, inst_dynasty_id, inst_addr_id, inst_source])
with open('output_sql.txt', 'w', encoding='utf-8') as f:
for i in range(len(inst_code_sql_list)):
f.write(inst_name_sql_list[i] + '\n')
f.write(inst_code_sql_list[i] + '\n')
f.write(inst_addr_sql_list[i] + '\n')
inst_info_df = pd.DataFrame(inst_info_list, columns=['c_inst_code', 'c_inst_name_hz', 'c_inst_name_code', 'c_inst_type_code', 'c_inst_begin_dy', 'c_inst_addr_id', 'c_source'])
inst_info_df.to_csv('output_inst_info.csv', index=False, encoding='utf-8')
inst_info_df.to_excel('output_inst_info.xlsx', index=False)
print('Finished!')