# get_candidates.py
import argparse
import json
import os
from multiprocessing import Pool

import requests
from tqdm import tqdm

import paths
import solrqueries as solrq
import sparqlqueries as sq
import utils

BATCH_SIZE = 100
WORKERS = 8

def process_candidate_batch(batch):
    """Fetch and cache a Wikidata summary for each candidate QID in the batch."""
    folder = paths.CANDIDATES_FOLDER_PATH
    with requests.Session() as session:
        for candidate_id in batch:
            candidate_path = folder / f"{candidate_id}.json"
            # Skip candidates that were already summarized in a previous run.
            if not candidate_path.exists():
                candidate_summary = sq.summarize_qid(candidate_id, session=session)
                with open(candidate_path, 'w') as fp:
                    json.dump(candidate_summary, fp)
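
# A minimal sketch of what summarize_qid is assumed to return (the exact
# schema lives in sparqlqueries; only the "images" key is relied on later,
# when collecting Commons URLs):
#   {"qid": "Q7617093", "label": "...", "description": "...", "images": [...]}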

def main(args):
    batch_size = args.batch_size
    workers = args.workers

    # Read the matched Artpedia file (artpedia2wiki_matched.json by default).
    with open(args.file_path) as file:
        data = json.load(file)

    # Collect the textual mentions and the QIDs of depicted named entities.
    mentions_set = set()
    depicted_nes_qids = set()

    # Mention extraction from the matched data is currently disabled:
    # for element in data.values():
    #     for matches_field in ['visual_el_matches', 'contextual_el_matches']:
    #         for match in element[matches_field]:
    #             for v in match:
    #                 # Add the mention text and its ground-truth QID.
    #                 mentions_set.add(v['text'])
    #                 match_qid_url = v['qid']
    #                 match_qid = match_qid_url.split('/')[-1]
    #                 ground_truth_qids.add(match_qid)

    with open(paths.ARTPEDIA2WIKI_DEPICTED_LABELS_PATH, 'r') as f:
        entities_dict = json.load(f)
    for qid, labels in entities_dict.items():
        # Keep only named entities (helper name as defined in utils).
        if utils.is_named_entiy(labels):
            depicted_nes_qids.add(qid)
    # Load previously found candidates so interrupted runs can resume.
    dict_candidates = {}
    file_path = paths.DICT_CANDIDATES_PATH
    if os.path.exists(file_path):
        print("Loading previous candidates from file, no need to search for them again")
        with open(file_path) as f:
            dict_candidates = json.load(f)

    for mention in tqdm(mentions_set, desc="Searching for candidates"):
        if mention in dict_candidates:
            continue
        # Query Solr twice and merge: once ranked by relevance, once by popularity.
        candidate_ids_relevance = set(solrq.solr_search(mention, rows=25))
        candidate_ids_popularity = set(solrq.solr_search(mention, rows=25, by_popularity=True))
        candidate_ids = list(candidate_ids_relevance.union(candidate_ids_popularity))
        dict_candidates[mention] = candidate_ids
        # Checkpoint after every mention so the search can be resumed.
        with open(file_path, 'w') as fp:
            json.dump(dict_candidates, fp)
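
    # Design note: merging a relevance-ranked and a popularity-ranked query
    # (25 rows each) trades precision for recall, so a mention can contribute
    # up to 50 distinct candidate QIDs after the union of the two rankings.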
    # Pool every candidate QID, plus the depicted named entities, so their
    # summaries get fetched even when Solr never returned them.
    candidates_set = set()
    for candidate_ids in dict_candidates.values():
        candidates_set.update(candidate_ids)
    candidates_set.update(depicted_nes_qids)

    # Report simple candidate statistics.
    lengths = [len(candidate_ids) for candidate_ids in dict_candidates.values()]
    if lengths:
        print(f"Average number of candidates per mention: {sum(lengths) / len(lengths):.2f}")
    only_one_candidate = sum(1 for candidate_ids in dict_candidates.values() if len(candidate_ids) == 1)
    print(f"Only one candidate for {only_one_candidate} mentions")
    folder = paths.CANDIDATES_FOLDER_PATH

    # The sequential version of the summary fetch is kept for reference:
    # for candidate_id in tqdm(candidates_set):
    #     # Request the candidate's data from Wikidata, e.g.
    #     # https://www.wikidata.org/w/rest.php/wikibase/v0/entities/items/Q7617093
    #     candidate_path = folder / f"{candidate_id}.json"
    #     if not candidate_path.exists():
    #         candidate_summary = sq.summarize_qid(candidate_id)
    #         with open(candidate_path, 'w') as fp:
    #             json.dump(candidate_summary, fp)

    # Process the candidates in parallel batches instead.
    candidates_list = list(candidates_set)
    batches = [candidates_list[i:i + batch_size] for i in range(0, len(candidates_list), batch_size)]
    with Pool(workers) as p:
        for _ in tqdm(p.imap_unordered(process_candidate_batch, batches),
                      total=len(batches), desc="Processing candidates in batches"):
            pass
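
    # Design note: each worker reuses a single requests.Session per batch, and
    # imap_unordered yields batches as they complete, which keeps the tqdm
    # progress bar moving even when some batches are slower than others.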
    # Iterate through the JSON files in the candidates folder, extract the
    # image URLs into a set, then save the set to a text file line by line.
    image_urls = set()
    for file in tqdm(os.listdir(folder), desc="Collecting image URLs"):
        path = folder / file
        if path.is_file() and path.suffix == ".json":
            with open(path) as f:
                data = json.load(f)
            for image in data.get("images", []):
                try:
                    commons_url = image
                    # Skip full URLs (they point to a different domain); keep the rest.
                    if not commons_url.startswith("http"):
                        image_urls.add(commons_url)
                except AttributeError:
                    # Skip non-string image values.
                    pass
    with open(paths.IMAGES_TXT_PATH, 'w') as f:
        for item in image_urls:
            f.write(f"{item}\n")

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Get the candidates from Wikidata and build a file with all the image URLs')
    parser.add_argument('--file_path', type=str, default=paths.ARTPEDIA2WIKI_MATCHED_PATH,
                        help='Path to the artpedia2wiki_matched.json file')
    parser.add_argument('--batch_size', type=int, default=BATCH_SIZE,
                        help='Batch size for processing candidates')
    parser.add_argument('--workers', type=int, default=WORKERS,
                        help='Number of worker processes to use')
    args = parser.parse_args()
    main(args)
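
# Example invocation (a sketch: assumes a local Solr index is reachable via
# solrqueries and that the paths module points at existing data files; the
# file path shown is hypothetical):
#   python get_candidates.py --file_path data/artpedia2wiki_matched.json \
#       --batch_size 100 --workers 8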