# preprocess_crosswalk_annotation.py
import os
import json
import random
import re
import sys
import pandas
import util.agent as agent
import util.smartparse as smartparse
import util.session_manager as session_manager
default_params=smartparse.obj()
default_params.openai_api_key='your_key'
params = smartparse.parse()
params = smartparse.merge(params, default_params)
params.argv=sys.argv
agent.set_openai_key(params.openai_api_key)
# Crosswalk deposit type annotations
# Use USGS deposit code to CMMI mapping provided by Graham
table_usgs_cmmi=pandas.read_csv('taxonomy/usgs_num_cmmi_crosswalk.csv',encoding='latin1')
table_usgs_cmmi={'Deposit type':table_usgs_cmmi['Deposit type'],'USGS_Model':table_usgs_cmmi['USGS_Model']}
table_usgs=pandas.read_csv('taxonomy/usgs_num.csv',encoding='latin1')
table_usgs={'model_name':table_usgs['model_name'],'usgs_num':table_usgs['usgs_num']}
table_usgs_cmmi=agent.markdown_table(table_usgs_cmmi)
table_examples={}
table_examples['28-APR-03, Massive sulfide, kuroko, 12-MAR-02, 28a']="'28a'"
table_examples['Carbonatite, 28-APR-03, 12-MAR-02']="'10'"
table_examples['Mississippi Valley, S.E. Missouri Pb-Zn, E12, 32a']="'32a'"
table_examples['Basaltic Cu (Cox and Singer, 1986; model 23).']="'23'"
table_examples['Volcanogenic Mn? (Cox and Singer, 1986; model 24c?).']="'24c'"
table_examples['Disseminated, gold-bearing sulfide mineralization in calcareous metasedimentary schist; simple Sb deposits; low sulfide, Au-quartz vein? (Cox and Singer, 1986; model 27d and 36a).']="['27d','36a']"
table_examples['Massive sulfide, Besshi (Japanese deposits), woodruff, 17-OCT-2003 07:58:41, 12-MAR-2002 00:00:00']="'24b'"
table_examples={'deposit_type':list(table_examples.keys()),'usgs_num':[table_examples[k] for k in table_examples]}
#Slice crosswalked labeled data for MRDS, MRDS-CA and other datasets
# MRDS
input_file='index/sites/mrds.csv'
fname_out='index/annotations/mrds.csv'
df_out=agent.run_df_agent("I have collected a data frame of mineral site records in {input_file}, where some sites have been annotated with a deposit_type field. Please follow this table to extract the USGS model number:\n {table_usgs}\nAs you will notice, there are two ways to get the deposit number. One is inferring the deposit number from the deposit type. The other is just reading the model number off the string. Here are some example inputs and outputs:\n{table_examples}\nPlease first filter for records with valid deposit_type annotations, and then add a new column `usgs_num` for the extracted USGS model number. Return a data frame with path, name, longitude, latitude, deposit_type and usgs_num columns. Notice that when inferring the deposit number from the deposit type name, the deposit names can also have commas in them, so try to process the whole string. Also the deposit names can have overlaps, for example 'Placer Au-PGE' deposits are not 'Placer Au', so you'll need to return the longer string match 'Placer Au-PGE' which is '39a' and not 'Placer Au' or '17a'. ".format(input_file=input_file,table_usgs=agent.markdown_table(table_usgs),table_examples=agent.markdown_table(table_examples)))
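# For intuition, a hypothetical deterministic sketch (not the agent's actual
# implementation) of the longest-match rule the prompt describes: among all
# known deposit-type names found in the string, keep the longest one, so
# 'Placer Au-PGE' wins over its substring 'Placer Au'.
def longest_match_usgs_num(deposit_type, model_names, model_nums):
    best, best_len = None, 0
    for name, num in zip(model_names, model_nums):
        if isinstance(name, str) and name in str(deposit_type) and len(name) > best_len:
            best, best_len = num, len(name)
    return best
# e.g. longest_match_usgs_num(s, list(table_usgs['model_name']), list(table_usgs['usgs_num']))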
# Validate: count records with an extracted usgs_num, then save
print(len([x for x in list(df_out['usgs_num']) if x is not None]))
df_out.to_csv(fname_out)
# MRDS-CA
# Custom string matchers
data=pandas.read_csv('index.csv')
rules=[]
rules.append({'dataset':'sir20105090z','name':'Porphyry copper','cmmi':['Porphyry copper ± gold']})
rules.append({'dataset':'sir20105090z','name':'Porphyry molybdenum-copper','cmmi':['Porphyry copper-molybdenum']})
rules.append({'dataset':'sir20105090z','name':'Porphyry gold','cmmi':['Porphyry gold ± copper']})
rules.append({'dataset':'porcu','name':'17','cmmi':['Porphyry copper ± gold']})
rules.append({'dataset':'porcu','name':'21a','cmmi':['Porphyry copper-molybdenum']})
rules.append({'dataset':'porcu','name':'20c','cmmi':['Porphyry gold ± copper']})
rules.append({'dataset':'nicrpge','name':None,'cmmi':['Komatiite nickel-copper-PGE','U-M layered intrusion chromium','U-M layered intrusion nickel-copper-PGE','U-M layered intrusion PGE','U-M conduit nickel-copper-PGE','Arc U-M intrusion nickel-copper-PGE','Ophiolite chromium']})
rules.append({'dataset':'sedexmvt','name':'MVT','cmmi':['MVT zinc-lead']})
rules.append({'dataset':'carbonatite','name':'10','cmmi':['Carbonatite REE']})
rules.append({'dataset':'ree','name':'alk-ig','cmmi':['Peralkaline igneous HFSE-REE']})
rules.append({'dataset':'ree','name':'carb','cmmi':['Carbonatite REE']})
rules.append({'dataset':'ofr20151121','name':'carbonatite','cmmi':['Carbonatite REE']})
rules.append({'dataset':'ofr20151121','name':'igneous','cmmi':['Peralkaline igneous HFSE-REE']})
rules.append({'dataset':'ofr20151121','name':'Carbonatite','cmmi':['Carbonatite REE']})
# ardf: map USGS model numbers referenced in free-text descriptions to CMMI types
def get_dataset(fname):
    return fname.split('/')[1]
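# e.g. get_dataset('mrdata_json/ardf/10405449.json') -> 'ardf'
# (hypothetical path following the `mrdata_json/<dataset>/<record>.json` layout used below)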
ardf_mapping={
    '17': 'Porphyry copper ± gold',
    '21a': 'Porphyry copper-molybdenum',
    '20c': 'Porphyry gold ± copper',
    '14a': 'Skarn tungsten ± Mo',
    '14b': 'Skarn tin ± copper ± Mo',
    '18f': 'Skarn gold ± copper ± tungsten',
    '6a': 'Komatiite nickel-copper-PGE',
    '2a': 'U-M layered intrusion chromium',
    '5a': 'U-M layered intrusion nickel-copper-PGE',
    '1': 'U-M layered intrusion PGE',
    '3': 'U-M intrusion nickel-copper-PGE', # Assuming this mapping takes precedence over the duplicate
    '5b': 'U-M conduit nickel-copper-PGE',
    '9': 'Arc U-M intrusion nickel-copper-PGE',
    '7b': 'Anorthosite massif titanium',
    '31b': 'Siliciclastic-mafic barite',
    '31a': 'Siliciclastic-carbonate zinc-lead',
    '32c': 'Kipushi-type sediment-hosted copper-zinc-lead',
    '32a': 'MVT zinc-lead',
    '32f': 'MVT barite',
    '32d': 'MVT fluorspar',
    '25ob': 'Lacustrine zeolite (± Li, B)',
    '25lc': 'Lacustrine clay lithium',
    '35bm': 'Lacustrine brine lithium',
    '10': 'Carbonatite REE',
    '11': 'Peralkaline igneous HFSE-REE'
}
def extract_model_numbers(text):
    # Match 'model'/'models' followed by a run of code characters
    # (digits, letters, '?', '.', ',' and spaces)
    pattern = r'\bmodel(?:s)?\s*([\d\w\?\.\, ]+)\b'
    # Find all matches for the pattern
    matches = re.findall(pattern, text)
    # If matches are found, process them to extract individual model numbers
    if matches:
        result = []
        for match in matches:
            # Split on commas and on the whole words 'and'/'or' (not on the
            # letters o/r individually, which would corrupt codes like '25ob')
            models = re.split(r'(?:,|\band\b|\bor\b|\s)+', match.strip())
            for model in models:
                model = model.strip(' ?.,')
                if model:  # Make sure the model number is not empty
                    result.append(model)
        return result if len(result) > 1 else result[0] if result else None
    return None
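# Quick sanity checks (inputs modeled on the ardf-style description strings above):
assert extract_model_numbers('Basaltic Cu (Cox and Singer, 1986; model 23).') == '23'
assert extract_model_numbers('low sulfide, Au-quartz vein? (Cox and Singer, 1986; model 27d and 36a).') == ['27d', '36a']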
ann=[]
for i in range(len(data)):
    ds=get_dataset(data['path'][i])
    deposit=data['deposit_type'][i]
    if ds=='ardf':
        if not isinstance(deposit,str):
            continue
        usgs_num=extract_model_numbers(deposit)
        if usgs_num is None:
            continue
        if isinstance(usgs_num,str):
            usgs_num=[usgs_num]
        dep_types=[ardf_mapping[x] for x in usgs_num if x in ardf_mapping]
        if len(dep_types)>0:
            print(i,dep_types)
            ann.append((i,dep_types))
    else:
        for r in rules:
            if r['dataset']==ds and (r['name']==deposit or r['name'] is None):
                ann.append((i,r['cmmi']))
data=pandas.read_csv('index.csv',low_memory=False)
deposit_type=list(data['deposit_type'])
for x in ann:
    deposit_type[x[0]]=json.dumps(x[1])
data['deposit_type']=deposit_type
data.to_csv('index2.csv')
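# Matched rows in index2.csv now hold JSON-encoded CMMI lists,
# e.g. '["MVT zinc-lead"]'; json.loads recovers the Python list downstream.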
# Generate run/train/eval splits
# run: all original records + the redacted copies of train/eval records
# train: has gt, lat%5>2.5, original record
# train2: redacted, otherwise same as train
# eval: has gt, lat%5<=2.5, original record
# eval2: redacted, otherwise same as eval
#Used in tasks
# Select LLM: test on eval
# Finetune LLM: train on train+train2, test on eval, eval2
# Score aggregation: train on train+train2, test on eval, eval2
# Area aggregation: train on train+train2 with run-train-train2-eval-eval2 as support, test on eval, eval2
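# e.g. latitude 38.1 -> 38.1 % 5 = 3.1 > 2.5 -> train band;
#      latitude 42.0 -> 42.0 % 5 = 2.0 <= 2.5 -> eval band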
# Concatenate site collections via the agent (the second call overwrites the first)
df_out_cat=agent.run_df_agent("Load dataframes from `index.csv` and all `collection/usmin*.csv` (* is a wildcard), and return the concatenated dataframe.")
df_out_cat=agent.run_df_agent("Load dataframes from `collection_v1.csv`, `collection/mrds.csv`, and return the concatenated dataframe.")
index=pandas.read_csv('index.csv',low_memory=False)
cmmi=pandas.read_csv('../science/dataset/taxonomy/cmmi_options_full_gpt4_number.csv',encoding='latin1')
cmmi=list(cmmi['Deposit type'])
def deposit_type_matcher(ann,cmmi):
    # Return the first annotation that matches a known CMMI deposit type, else None.
    # (The original guarded the list branch with `if ann in cmmi`, which is never
    # true for a list of strings, so the loop was dead code; the guard is removed.)
    if isinstance(ann,list):
        for x in ann:
            label=deposit_type_matcher(x,cmmi)
            if label is not None:
                return label
        return None
    elif isinstance(ann,str):
        if ann in cmmi:
            return ann
        else:
            return None
    else:
        return None
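# Sanity checks with a hypothetical CMMI option list:
assert deposit_type_matcher('MVT zinc-lead', ['MVT zinc-lead']) == 'MVT zinc-lead'
assert deposit_type_matcher(['bogus', 'MVT zinc-lead'], ['MVT zinc-lead']) == 'MVT zinc-lead'
assert deposit_type_matcher(float('nan'), ['MVT zinc-lead']) is None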
ann=[deposit_type_matcher(x,cmmi) for x in list(index['deposit_type'])]
ind_train=[i for i in range(len(ann)) if ann[i] is not None and index['latitude'][i]!=0 and index['latitude'][i]%5>2.5]
random.shuffle(ind_train)
ind_eval=[i for i in range(len(ann)) if ann[i] is not None and index['latitude'][i]!=0 and index['latitude'][i]%5<=2.5]
random.shuffle(ind_eval)
# Create redacted json records: remove deposit type information from data
# Used for split generation
root='dataset/mrdata_json'
root_out='dataset/mrdata_json_redacted'
df_out_redacted=agent.run_df_agent("I have provided a database of mineral deposit resources under directory {folder}. The database consists of folders of json files, where each json file is a database record as a nested dictionary-list hybrid. Please help me redact these records of mineral deposit type information, returning a record without them. Deposit type information is often found in fields `properties.deptype`, `properties.dep_type`, `properties.sub_type`, `properties.subtype`,`properties.mintype`, `deptype`, `dep_type`, `properties.deposit_model`,`min_type`, `mintype`, `depk10km`, `depk5km`, `dep10km`, `dep5km`, `properties.deposit_model.model_code`,`properties.dep10km`, `properties.dep5km`, `model_type`, `properties.model`, `properties.deptext`, `model`, `properties.model_code`, `properties.model_name`, `model_code`, `model_name`, `properties.model_type`, `deposits.dep_tp`, `properties.deposits.dep_tp`, `properties.type_detail`, `type_detail`, `properties.deptypea`, `properties.deptypeb`, `properties.deptypec`, `properties.comments`, `properties.prevtype`, `dep_subtype`, `properties.dep_model`,`properties.dpmd_nonm`. Because these json records may not have consistent fields, please check the record structure for each record. For nested fields, I've been writing them in the form of `parent.child.grandchild. ...`, etc. Some keys may point to list of dictionaries, so please go into each dictionary and try to find the keys in question. Please return a dataframe with 2 columns:\npath: path to the json file as a string, in the form of `mrdata_json/xxxx/yyyy.json`.\nredacted_json: redacted json record as a string, generated using json.dumps. ".format(folder=root))
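# For intuition, a minimal sketch (hypothetical helper, not the agent's actual
# implementation) of the traversal the prompt describes: walk the nested
# dict/list record and drop redacted keys at any depth. The real field list
# above is longer and uses dotted paths; this simplification drops the leaf
# key name wherever it occurs.
def redact_record(obj, redacted_keys=frozenset({'deptype','dep_type','model','model_code','model_name'})):
    if isinstance(obj, dict):
        return {k: redact_record(v, redacted_keys) for k, v in obj.items() if k not in redacted_keys}
    if isinstance(obj, list):
        return [redact_record(v, redacted_keys) for v in obj]
    return obj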
path=[x.replace('mrdata_json','mrdata_json_redacted') for x in df_out_redacted['path']]
for i in range(len(df_out_redacted)):
    if not os.path.exists(path[i]):
        os.makedirs(os.path.dirname(path[i]), exist_ok=True)
        # Use a context manager so the file handle is closed after writing
        with open(path[i],'w') as f:
            json.dump(json.loads(df_out_redacted['redacted_json'][i]),f)
#Generate splits
fnames_train=[index['path'][i] for i in ind_train]
labels_train=[ann[i] for i in ind_train]
fnames_train2=[index['path'][i].replace('mrdata_json','mrdata_json_redacted') for i in ind_train]
fnames_eval=[index['path'][i] for i in ind_eval]
labels_eval=[ann[i] for i in ind_eval]
fnames_eval2=[index['path'][i].replace('mrdata_json','mrdata_json_redacted') for i in ind_eval]
fnames_run=fnames_eval2+fnames_train2+list(index['path'])
#Save splits
def save_split(fnames,labels=None,fname_out=''):
    if labels is None:
        return pandas.DataFrame({'path':fnames}).to_csv(fname_out)
    else:
        return pandas.DataFrame({'path':fnames,'labels':labels}).to_csv(fname_out)
save_split(fnames_run,fname_out='splits/run.csv')
save_split(fnames_train,labels_train,fname_out='splits/train.csv')
save_split(fnames_train2,labels_train,fname_out='splits/train2.csv')
save_split(fnames_eval,labels_eval,fname_out='splits/eval.csv')
save_split(fnames_eval2,labels_eval,fname_out='splits/eval2.csv')