-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprocess_build_embeddings_indexes.py
90 lines (64 loc) · 2.45 KB
/
process_build_embeddings_indexes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import glob
import json
import os
import pickle

import util
def build_daybook_index(pattern='daybook-and-diaries-1856-1906-daybook-1*/*.json'):
    """Collect the embeddings stored in the daybook JSON files.

    Every file matched by *pattern* is expected to be named ``<integer>.json``.
    For each entry under ``data['gpt']['gpt3.5-daybook-json']`` that carries an
    ``'embedding'`` key, the value is stored under the key
    ``("daybooks", digital_id, "<file_id>_<n>")`` where ``n`` is the 1-based
    position of the entry in the file (entries without an embedding still
    consume a position).

    Args:
        pattern: glob pattern selecting the daybook JSON files.

    Returns:
        dict mapping the 3-tuple key described above to whatever value the
        entry holds under ``'embedding'``.

    Raises:
        ValueError: if a matched file name is not an integer before ``.json``.
        KeyError: if an entry has an embedding but the file lacks
            ``data['options']['digital_id']``.
    """
    # Glob once so the reported total always matches what is iterated.
    paths = glob.glob(pattern)
    total = len(paths)
    result = {}
    for done, path in enumerate(paths, start=1):
        print(done, '/', total)
        # os.path keeps this working with either path separator.
        file_id = int(os.path.splitext(os.path.basename(path))[0])
        # The context manager closes each input file instead of leaking it.
        with open(path, encoding='utf-8') as handle:
            data = json.load(handle)
        if 'gpt' not in data or 'gpt3.5-daybook-json' not in data['gpt']:
            continue
        for counter, entry in enumerate(data['gpt']['gpt3.5-daybook-json'], start=1):
            if 'embedding' not in entry:
                continue
            digital_id = data['options']['digital_id']
            entry_id = f"{file_id}_{counter}"
            print(digital_id, entry_id)
            result[("daybooks", digital_id, entry_id)] = entry['embedding']
    return result


# Build the daybook index and persist it next to the script's working directory.
index = build_daybook_index()
with open('daybook-and-diaries-1856-1906.pickle', 'wb') as pickle_file:
    pickle.dump(index, pickle_file)
def build_writings_index(pattern='anthony-speeches-and-other-writings-resources/*.json'):
    """Collect the embeddings stored in the speeches-and-writings JSON files.

    Each matched file is expected to contain a list of blocks. Every block
    contributes one entry keyed by ``("writings", digital_id, pages)`` where
    ``pages`` is the block's item ids joined with ``"_"`` and ``digital_id`` is
    taken from the last item of the block (``None`` when the block has no
    items).

    Args:
        pattern: glob pattern selecting the writings JSON files.

    Returns:
        dict mapping the 3-tuple key described above to the block's
        ``'embedding'`` value.

    Raises:
        KeyError: if a block lacks ``'items'`` or ``'embedding'``.
    """
    # Glob once so the reported total always matches what is iterated.
    paths = glob.glob(pattern)
    total = len(paths)
    result = {}
    for done, path in enumerate(paths, start=1):
        print(done, '/', total)
        print(path)
        # The context manager closes each input file instead of leaking it.
        with open(path, encoding='utf-8') as handle:
            blocks = json.load(handle)
        for block in blocks:
            pages = []
            digital_id = None
            for item in block['items']:
                digital_id = item['options']['digital_id']
                pages.append(str(item['id']))
            print(digital_id, pages)
            # NOTE(review): a block without 'embedding' raises KeyError here,
            # as in the original; confirm whether it should be skipped instead.
            result[("writings", digital_id, "_".join(pages))] = block['embedding']
    return result


# Build the writings index and persist it.
index = build_writings_index()
with open('anthony-speeches-and-other-writings-resources.pickle', 'wb') as pickle_file:
    pickle.dump(index, pickle_file)
def build_correspondence_index(pattern='anthony-correspondence-resources/*.json'):
    """Collect the embeddings stored in the correspondence JSON files.

    Each matched file holds a single object. Files without an ``'embedding'``
    key are skipped. Otherwise one entry is added, keyed by
    ``('correspondence', digital_id, pages)`` where ``pages`` is the object's
    item ids joined with ``"_"`` and ``digital_id`` is taken from the last item
    (``None`` when there are no items).

    Args:
        pattern: glob pattern selecting the correspondence JSON files.

    Returns:
        dict mapping the 3-tuple key described above to the file's
        ``'embedding'`` value.

    Raises:
        KeyError: if a file has an embedding but lacks ``'items'``.
    """
    # Glob once so the reported total always matches what is iterated.
    paths = glob.glob(pattern)
    total = len(paths)
    result = {}
    for done, path in enumerate(paths, start=1):
        print(done, '/', total)
        print(path)
        # The context manager closes each input file instead of leaking it.
        with open(path, encoding='utf-8') as handle:
            data = json.load(handle)
        if 'embedding' not in data:
            continue
        pages = []
        digital_id = None
        for item in data['items']:
            digital_id = item['options']['digital_id']
            pages.append(str(item['id']))
        page_key = "_".join(pages)
        # Printed as a tuple, matching the original progress output.
        print((digital_id, page_key))
        result[('correspondence', digital_id, page_key)] = data['embedding']
    return result


# Build the correspondence index and persist it.
index = build_correspondence_index()
with open('anthony-correspondence-resources.pickle', 'wb') as pickle_file:
    pickle.dump(index, pickle_file)