#!/bin/bash
#
# Stream-oriented version of execute_solr_queries.sh
#
# Based on the query in a given file, a CSV file is generated with one line
# per domain & year: domain, year, hit count and total bytes
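#
# Example output lines (hypothetical values):
# "example.org",2019,1234,56789012
# "example.org",2020,2345,67890123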
#
# This script expects a Solr collection reasonably compatible with
# https://github.com/ukwa/webarchive-discovery
# The field names are configurable, but the index must contain fields for
# domain, year and content-size in bytes.
#
# This script piggybacks on the queries used by execute_solr_queries.sh,
# using only the 'query' part from the JSON structure.
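#
# A query file might look like this (hypothetical query); only the value of
# the "query" key is read, everything else is ignored:
# { "query": "content_type_norm:html AND status_code:200" }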
#
# Solr streaming expressions
# https://solr.apache.org/guide/8_11/streaming-expressions.html
# scale indefinitely with corpus size and should be used for large collections,
# where "large" means "millions of unique domains" and "10+ shards" or something
# to that effect.
#
# Important: Solr streaming requires a Solr running in cloud mode.
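# (A rough check: <solr>/solr/admin/collections?action=CLUSTERSTATUS describes
# the cluster when running in cloud mode and typically returns an error in
# standalone mode)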
#
# Created 2022 by Toke Eskildsen, [email protected]
# Updated 2023-02-08 OUTPUT_PREFIX
# Updated 2023-02-09 Check for output file existence
# Updated 2023-02-09 Keep result in case of errors
# License: CC0: https://creativecommons.org/share-your-work/public-domain/cc0/
#
###############################################################################
# CONFIG
###############################################################################
#
# The script can be used as-is by adjusting the relevant parameters below, e.g.
# PORT=8080 INDEX=netarchive ./solr_stream_queries.sh solrq_http.q solrq_https.q
# It can also be configured by copy-pasting relevant ": ${..." lines below to
# a file named "solr.conf" and adjusting to the local setup.
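# A solr.conf could look like this (hypothetical values):
# : ${SERVER:="solr.example.org"}
# : ${INDEX:="netarchive"}
# : ${OUTPUT_PREFIX:="netarchive_"}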
#
pushd ${BASH_SOURCE%/*} > /dev/null
if [[ -s "solr.conf" ]]; then
source "solr.conf" # Optional config
fi
: ${QFILES:="$@"}
: ${QFILES:=$(find . -maxdepth 1 -iname '*.q')}
# If defined, the filter will be merged with the query extracted from QFILE:
# myfilter AND (qfile_query)
# This can be used for bisection in case of timeouts. Consider filters such as
# hash:[* TO sha1:K}
# hash:[sha1:K TO *]
# for bisection. Note the curly bracket:
# https://solr.apache.org/guide/6_6/the-standard-query-parser.html#TheStandardQueryParser-RangeSearches
# The hash generated by webarchive-discovery is Base32 encoded with the characters
# ABCDEFGHIJKLMNOPQRSTUVWXYZ234567
# Note that 2-7 is lexically before A-Z.
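# Example bisection into two runs (hypothetical split point and prefixes):
# FILTER='hash:[* TO sha1:K}' OUTPUT_PREFIX='part1_' ./solr_stream_queries.sh solrq_http.q
# FILTER='hash:[sha1:K TO *]' OUTPUT_PREFIX='part2_' ./solr_stream_queries.sh solrq_http.q
# Note that a domain & year can occur in both partial results, so the counts and
# byte sums must be merged (summed) afterwards.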
: ${FILTER:=""}
: ${PROTOCOL:="http"}
: ${SERVER:="localhost"}
: ${PORT:="8983"}
: ${INDEX:="netarchivebuilder"}
# Solr fields
: ${DOMAIN:="domain"}
: ${YEAR:="crawl_year"}
: ${SIZE_BYTES:="content_length"}
: ${STREAM_URL="${PROTOCOL}://${SERVER}:${PORT}/solr/${INDEX}/stream"}
: ${JQEXP:=".\"result-set\"[][] | [.\"${DOMAIN}\", .\"${YEAR}\", .\"count(*)\", .\"sum(${SIZE_BYTES})\"] | @csv"}
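# For illustration, a stream response along the lines of (hypothetical values)
#   {"result-set":{"docs":[
#     {"domain":"example.org","crawl_year":2020,"count(*)":1234,"sum(content_length)":5678901},
#     {"EOF":true,"RESPONSE_TIME":42}]}}
# is turned into CSV lines such as "example.org",2020,1234,5678901 by JQEXP.
# The EOF terminator document yields an empty ",,," line, which is filtered out later.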
: ${CURL="curl -s"}
: ${OUTPUT_PREFIX:=""} # Prepended to the output file names
: ${FORCE_OUTPUT:="false"} # If true, already existing output files will be replaced
popd > /dev/null
usage() {
    echo "Usage: solr_stream_queries.sh [queryfile]"
    echo " Without arguments it runs all .q files in the current directory"
    echo " queryfile: one or more json formatted query files to run"
    exit $1
}
check_parameters() {
    if [[ "$QFILES" == "--help" ]]; then
        usage
    fi
    if [[ -z "$QFILES" ]]; then
        >&2 echo "Error: No query files given and none could be located"
        echo "Try running the script from the script folder, where the default query files are located"
        echo ""
        usage 2
    fi
}
################################################################################
# FUNCTIONS
################################################################################
# Generate stats for a given query
# Input: solr_query output_file
query_job() {
    # Escape embedded quotes
    local QUERY="$(sed 's/"/\\"/g' <<< "$1")"
    # Merge with FILTER if relevant
    if [[ ! -z "$FILTER" ]]; then
        QUERY="${FILTER} AND (${QUERY})"
    fi
    local OUTPUT="$2"
    local OUTPUT_INTERMEDIATE="${OUTPUT}.raw"
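    # Build a streaming expression: /export streams all matching documents sorted
    # by domain+year and rollup() aggregates count(*) and byte sum per domain+year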
    local STREAM="expr=\
rollup(\
      search(${INDEX}, q=\"$QUERY\", \
            sort=\"${DOMAIN} asc, ${YEAR} asc, ${SIZE_BYTES} asc\",\
            fl=\"${DOMAIN}, ${YEAR}, ${SIZE_BYTES}\",\
            qt=\"/export\"\
      ),\
      over=\"${DOMAIN}, ${YEAR}\",\
      count(*),\
      sum(${SIZE_BYTES})\
)"
echo "$(date -Is) - Generating ${OUTPUT} (this will probably take a while)"
# The grep at the end is a hack to avoid a non-value last line in the CVS due to Solr stream terminator
${CURL} "$STREAM_URL" -d "$STREAM" > "$OUTPUT_INTERMEDIATE"
if [[ ! -s "$OUTPUT_INTERMEDIATE" ]]; then
>&2 echo "Error: No output generated from calling ${CURL} \"$STREAM_URL\" -d \"$STREAM\""
return
fi
if [[ "." != ".$(grep "java.net.SocketTimeoutException" "$OUTPUT_INTERMEDIATE")" ]]; then
>&2 echo "Error: 'java.net.SocketTimeoutException' found in ${OUTPUT_INTERMEDIATE}, signalling premature EOS. Increase the Solr Cloud timeout (good luck extending beyond 10 minutes with Solr 7) or bisect the query in some way"
return
fi
jq -r "${JQEXP}" < "$OUTPUT_INTERMEDIATE" | grep -v ,,, > "$OUTPUT"
if [[ ! -s "$OUTPUT" ]]; then
>&2 echo "Error: No output generated from transforming ${OUTPUT_INTERMEDIATE}"
fi
# TODO: Add proper check for fully successful transformation (not just partial) and if so, delete the intermediate output
}
# Extract query from a given file and produce stats from the query
# Input: query_file
file_job() {
    local INPUT="$1"
    # ./solrq_json.q -> q_json
    local DESIGNATION="$(sed 's/.*solr\(q_[a-z0-9]\+\).q/\1/' <<< "$INPUT")"
    local OUTPUT="${OUTPUT_PREFIX}${DESIGNATION}-result.csv"
    if [[ "$DESIGNATION" == "$INPUT" ]]; then
        >&2 echo "Note: The input file '$INPUT' did not conform to the expected pattern \".../solrq_DESIGNATION.q\". The output will be based on the full filename of the input file instead of just the DESIGNATION part"
        local DESIGNATION="$(basename "$INPUT")"
        local OUTPUT="${OUTPUT_PREFIX}${DESIGNATION}-result.csv"
    fi
    if [[ ! -s "$INPUT" ]]; then
        >&2 echo "Error: Query file '$INPUT' could not be read"
        return
    fi
    if [[ -s "$OUTPUT" ]]; then
        if [[ "true" == "$FORCE_OUTPUT" ]]; then
            echo " - Replacing existing output as FORCE_OUTPUT==true: $OUTPUT"
        else
            echo " - Skipping generation as the output already exists: $OUTPUT"
            return
        fi
    fi
    # '+' signals a space and must be URL-encoded to %2B. TODO: Why does this not happen automatically with curl -d?
    local QUERY="$(jq -r .query < "$INPUT" | sed 's/+/%2B/g')"
    if [[ -z "$QUERY" ]]; then
        >&2 echo "Error: Unable to extract query from file '$INPUT'"
        return
    fi
    query_job "$QUERY" "$OUTPUT"
}
# Iterate all query_files and call file_job for each query_file
all_file_jobs() {
    for QFILE in $QFILES; do
        file_job "$QFILE"
    done
}
###############################################################################
# CODE
###############################################################################
check_parameters "$@"
all_file_jobs
echo "$(date -Is) - All done for INDEX=$INDEX $(tr '\n' ' ' <<< "$QFILES")"