-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathCS-ValidateQueries-MASTER.pl
331 lines (294 loc) · 13.5 KB
/
CS-ValidateQueries-MASTER.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
#!/usr/bin/perl
use warnings;
use strict;
binmode(STDOUT, ":utf8");
### DO NOT INCLUDE
use ColdStartLib;
### DO INCLUDE
#####################################################################################
# This program checks for a number of possible problems with a set of
# queries. It then converts queries containing multiple entry points,
# to multiple queries that can be distributed to CSSF teams.
#
# Author: James Mayfield
# Please send questions or comments to jamesmayfield "at" gmail "dot" com
#
# For usage, run with no arguments
#####################################################################################
my $version = "2017.2.0";
# Filehandles for program and error output
my $program_output = *STDOUT{IO};
my $error_output = *STDERR{IO};
### DO NOT INCLUDE
#####################################################################################
# Library inclusions
#####################################################################################
### DO INCLUDE
### DO INCLUDE Utils ColdStartLib.pm
### DO INCLUDE Patterns ColdStartLib.pm
### DO INCLUDE Logger ColdStartLib.pm
### DO INCLUDE Provenance ColdStartLib.pm
### DO INCLUDE Predicates ColdStartLib.pm
### DO INCLUDE Query ColdStartLib.pm
### DO INCLUDE QuerySet ColdStartLib.pm
### DO INCLUDE Switches ColdStartLib.pm
### DO NOT INCLUDE
# Hush up perl worrywart module
my $pattern = $main::comment_pattern;
### DO INCLUDE
my $logger = Logger->new();
my %type_repairs = (
none => "Do not repair mismatched slots",
'delete-slot' => "Delete second (and all subsequent) slots",
'delete-query' => "Delete the entire query",
);
my $type_repair_string = "{" . join("; ", map {"$_: $type_repairs{$_}"} sort keys %type_repairs) . "}";
my %subtype_repairs = (
none => "Do not repair mismatched slots",
repair => "Modify second slot to match first",
'delete-slot' => "Delete second (and all subsequent) slots",
'delete-query' => "Delete the entire query",
);
my $subtype_repair_string = "{" . join("; ", map {"$_: $subtype_repairs{$_}"} sort keys %subtype_repairs) . "}";
my %dup_repairs = (
none => "Do not check for duplicates",
ignore => "Check for duplicates but do not repair",
'delete' => "Delete the later (orthographically) query",
);
my $dups_repair_string = "{" . join("; ", map {"$_: $dup_repairs{$_}"} sort keys %dup_repairs) . "}";
my %subtype_map = (
'city' => {NORMAL => 'city', stateorprovince => 'stateorprovince', country => 'country'},
'cities' => {NORMAL => 'city', stateorprovince => 'statesorprovinces', country => 'countries'},
'stateorprovince' => {NORMAL => 'stateorprovince', city => 'city', country => 'country'},
'statesorprovinces' => {NORMAL => 'stateorprovince', city => 'cities', country => 'countries'},
'country' => {NORMAL => 'country', city => 'city', stateorprovince => 'stateorprovince'},
'countries' => {NORMAL => 'country', city => 'cities', stateorprovince => 'statesorprovinces'},
);
sub fix_queries {
my ($queries, $fix_types, $fix_subtypes) = @_;
my $new_queries = QuerySet->new($queries->{LOGGER});
query:
foreach my $query ($queries->get_all_queries()) {
my @repairs;
my $max_slot = $#{$query->{SLOTS}};
if ($fix_types ne 'none') {
my $domains = {$query->get('ENTTYPE') => 'true'};
num:
foreach my $num (0..$max_slot) {
my $predicate = $query->{PREDICATES}[$num];
foreach my $domain (keys %{$domains}) {
if ($predicate->{DOMAIN}{$domain}) {
$domains = $predicate->{RANGE};
next num;
}
}
# None of the domains matched, so handle the error
$queries->{LOGGER}->record_problem('MISMATCHED_HOP_TYPES',
$query->{QUERY_ID},
join(":", sort keys %{$domains}),
$query->get("SLOT$num"),
'NO_SOURCE');
# delete-query requested, so skip to the next query
next query if $fix_types eq 'delete-query';
# alternative is delete-slot. Update max slot that will be output
$max_slot = $num - 1;
last num;
}
}
if ($fix_subtypes ne 'none') {
for (my $num = 0; $num < $max_slot; $num++) {
my $slot0_subtype;
my $predicate0 = $query->{PREDICATES}[$num]{NAME};
if ($predicate0 =~ /^(.*?)_of_.*$/) {
$slot0_subtype = $subtype_map{$1}{NORMAL};
next unless defined $slot0_subtype;
}
else {
next;
}
my $slot1_subtype;
my $slot1_prefix;
my $slot1 = $query->get('SLOT' . ($num + 1));
my $predicate1 = $query->{PREDICATES}[$num + 1]{NAME};
if ($slot1 =~ /^(.*_of_)(.*)$/ || $slot1 =~ /(.*_in_)(.*)$/) {
$slot1_subtype = $subtype_map{$2}{NORMAL};
next unless defined $slot1_subtype;
$slot1_prefix = $1;
}
else {
next;
}
unless ($slot0_subtype eq $slot1_subtype) {
$queries->{LOGGER}->record_problem('MISMATCHED_HOP_SUBTYPES',
$query->{QUERY_ID},
$predicate0,
$predicate1,
'NO_SOURCE');
next query if $fix_subtypes eq 'delete-query';
if ($fix_subtypes eq 'delete-slot') {
$max_slot = $num;
last num;
}
my $new_slot1 = "$slot1_prefix$subtype_map{$slot1_subtype}{$slot0_subtype}";
push(@repairs, {KEY => 'SLOT' . ($num + 1), VALUE => $new_slot1});
}
}
}
my $new_query = $query->duplicate();
foreach my $repair (@repairs) {
$new_query->put($repair->{KEY}, $repair->{VALUE});
}
$new_query->truncate_slots($max_slot);
$new_queries->add($new_query);
}
$new_queries;
}
sub check_for_duplication {
my ($queries, $fix_duplicates) = @_;
my $new_queries = QuerySet->new($queries->{LOGGER});
my %exact_matches;
my %candidate_matches;
my %dedup;
query:
foreach my $query (sort {$a->{QUERY_ID} cmp $b->{QUERY_ID}} $queries->get_all_queries()) {
my $entrypoints = $query->get("ENTRYPOINTS");
my $query_id = $query->get("QUERY_ID");
foreach my $entrypoint (@{$entrypoints}) {
my $provenance = "$entrypoint->{DOCID}:$entrypoint->{START}:$entrypoint->{END}";
my $hashstring = "$provenance:" . join(":", @{$query->{SLOTS}});
if (defined $exact_matches{$hashstring}) {
unless ($dedup{$exact_matches{$hashstring}}{$query_id}++) {
$new_queries->add($query) if $fix_duplicates eq 'ignore';
$queries->{LOGGER}->record_problem("DUPLICATE_QUERY", $exact_matches{$hashstring}, $query_id, 'NO_SOURCE');
}
next query;
}
$exact_matches{$hashstring} = $query_id;
}
foreach my $entrypoint (@{$entrypoints}) {
my $candidate_hashstring = "$entrypoint->{NAME}:" . join(":", @{$query->{SLOTS}});
if (defined $candidate_matches{$candidate_hashstring} && $candidate_matches{$candidate_hashstring} ne $query_id) {
unless ($dedup{$candidate_matches{$candidate_hashstring}}{$query_id}++) {
$queries->{LOGGER}->record_problem("POSSIBLE_DUPLICATE_QUERY", $candidate_matches{$candidate_hashstring}, $query_id, $entrypoint->{NAME}, "NO_SOURCE");
# Don't delete these, because they might be legitimate
$new_queries->add($query);
next query;
}
}
$candidate_matches{$candidate_hashstring} = $query_id;
}
$new_queries->add($query);
}
$new_queries;
}
sub generate_expanded_queries {
my ($queries, $query_base, $index_file, $languages, $retain_sf) = @_;
$retain_sf = "false" unless defined $retain_sf;
my $new_queries = QuerySet->new($queries->{LOGGER});
foreach my $query ($queries->get_all_queries()) {
$query->expand($query_base, $new_queries);
}
foreach my $query ($new_queries->get_all_queries()) {
if($languages) {
my @query_languages = @{$query->{LANGUAGES}};
my %selected_languages = map {$_=>1} split(":", $languages);
my $skip = 1;
foreach my $query_language(@query_languages){
if(exists $selected_languages{$query_language}){
$skip = 0;
last;
}
}
next if $skip;
}
# Move on to the next query if an event related slot appears in its hop-0
if($retain_sf eq "true"){
my $slot0 = $query->get("SLOT0");
# capture identifier for event slots
my ($eal_identifier) = $slot0 =~ /^.*?:(.*?)\_/;
# capture the slot name for potential match with sentiment slots
my ($sen_identifier) = $slot0 =~ /^.*?:(.*?)$/;
next if (defined $eal_identifier && exists $PredicateSet::legal_event_types{$eal_identifier}) ||
exists $PredicateSet::legal_sentiment_slots{$sen_identifier};
}
print $index_file $query->get("FULL_QUERY_ID"), "\t", $query->get("ORIGINAL_QUERY_ID"), "\n" if defined $index_file;
}
$new_queries;
}
#####################################################################################
# Runtime switches and main program
#####################################################################################
# Handle run-time switches
my $switches = SwitchProcessor->new($0, "Validates query files, correcting various problems. Converts queries containing multiple entry points to multiple queries with single entry points.",
"");
$switches->addHelpSwitch("help", "Show help");
$switches->addHelpSwitch("h", undef);
$switches->addVarSwitch('error_file', "Specify a file to which error output should be redirected");
$switches->put('error_file', "STDERR");
$switches->addVarSwitch('query_base', "Base name for generated queries");
$switches->put('query_base', 'TAC2015CS');
$switches->addVarSwitch('index_file', "Filename into which to place mapping from output query name to original LDC query name");
$switches->addVarSwitch('types', "Repair queries with type mismatches (choices are $type_repair_string)");
$switches->put('types', 'none');
$switches->addVarSwitch('subtypes', "Repair queries with subtype mismatches (choices are $subtype_repair_string)");
$switches->put('subtypes', 'none');
$switches->addVarSwitch('languages', "Select the languages to be considered for output.");
$switches->put('languages', 'ENGLISH:CHINESE:SPANISH');
$switches->addConstantSwitch('expand', 'true', "Expand single queries with multiple entry points into multiple queries with single entry points");
$switches->addConstantSwitch('retain_sf', 'true', "Retain queries with hop-0 SF slots; remove hop-1 non-SF slots");
$switches->addVarSwitch('dups', "Check whether different queries with the same slots share one or more entry points (choices are $dups_repair_string)");
$switches->put('dups', 'none');
$switches->addImmediateSwitch('version', sub { print "$0 version $version\n"; exit 0; }, "Print version number and exit");
$switches->addParam("queryfile", "required", "File containing queries used to generate the file being validated. Only the original query file needs to be specified here");
$switches->addParam("outputfile", "required", "File into which to place combined output");
$switches->process(@ARGV);
my $queryfile = $switches->get("queryfile");
my $outputfile = $switches->get("outputfile");
my $query_base = $switches->get('query_base');
my $languages = $switches->get('languages');
my $fix_types = lc $switches->get('types');
$logger->NIST_die("Unknown -types argument: $fix_types") unless $type_repairs{$fix_types};
my $fix_subtypes = lc $switches->get('subtypes');
$logger->NIST_die("Unknown -subtypes argument: $fix_subtypes") unless $subtype_repairs{$fix_subtypes};
my $fix_dups = lc $switches->get('dups');
$logger->NIST_die("Unknown -dups argument: $fix_dups") unless $dup_repairs{$fix_dups};
# Allow redirection of stderr
my $error_filename = $switches->get("error_file");
$logger->set_error_output($error_filename);
$error_output = $logger->get_error_output();
my $outputfilename = $switches->get("outputfile");
$logger->NIST_die("File $outputfilename already exists") if -e $outputfilename;
open($program_output, ">:utf8", $outputfilename) or $logger->NIST_die("Could not open $outputfilename: $!");
my $index_filename = $switches->get('index_file');
my $index_file;
if (defined $index_filename) {
$logger->NIST_die("File $index_filename already exists") if -e $index_filename;
open($index_file, ">:utf8", $index_filename) or $logger->NIST_die("Could not create $index_filename: $!");
}
my $queries = QuerySet->new($logger, $queryfile);
$queries = &fix_queries($queries, $fix_types, $fix_subtypes)
if $fix_types ne 'none' || $fix_subtypes ne 'none';
$queries = &check_for_duplication($queries, $fix_dups) if $fix_dups ne 'none';
$queries = &generate_expanded_queries($queries, $query_base, $index_file, $languages, $switches->get('retain_sf')) if $switches->get('expand');
print $program_output $queries->tostring("", undef, ['SLOT', 'NODEID'], $languages, $switches->get('retain_sf'));
close $program_output;
close $index_file if defined $index_file;
# Problems were identified while the KB was loaded; now report them
my ($num_errors, $num_warnings) = $logger->report_all_problems();
if ($num_errors) {
$logger->NIST_die("$num_errors error" . ($num_errors == 1 ? '' : 's') . " encountered");
}
print $error_output ($num_warnings || 'No'), " warning", ($num_warnings == 1 ? '' : 's'), " encountered\n";
exit 0;
################################################################################
# Revision History
################################################################################
# 1.0 - Initial version
# 1.1 - Added code to build index from expanded query_id to original LDC query_id
# 1.2 - Refactored generate_expanded_queries to move expansion into ColdStartLib
# 2.0 - Verion upped to make the code work with new ColdStartLib
# 2.1 - NODEID is removed from the CS-ValidateQueries output to make the SF queries file look the same as 2015.
# 2.2 - Added support for printing queries with entrypoint from selected languages
# 2.3 - Fixing the queries.index file to print queries with entrypoint from selected languages only
# 2017.2.0 - Code state at the release of scores
1;