forked from paracrawl/cirrus-scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
05.tokenise
executable file
·47 lines (36 loc) · 1.25 KB
/
05.tokenise
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
#!/bin/bash
#
# Usage: <this script> SLANG BATCH
#
#   SLANG  language code; when it equals TARGET_LANG (minus any "~suffix")
#          the input is BATCH/sentences.gz, otherwise
#          BATCH/sentences_<target>.gz
#   BATCH  directory containing the input file; the result is written next
#          to it as tokenised_<target>.gz
#
# Environment read by this script:
#   TARGET_LANG  required; a trailing "~..." part is stripped before use
#   TOKENISER    required; tokeniser command run by tokenise()
#   TMPDIR       required; directory for the intermediate output file
#   JOB_ID       optional; temp-file suffix (falls back to this shell's PID)
set -euo pipefail

# Raise the open-file-descriptor limit for this script and its children.
ulimit -n 16384

# Fail with a readable usage message instead of bash's generic
# "unbound variable" error when an argument is missing or empty.
SLANG="${1:?usage: ${0##*/} SLANG BATCH}"
BATCH="${2:?usage: ${0##*/} SLANG BATCH}"
: "${TARGET_LANG:?TARGET_LANG must be set in the environment}"

# Suffix that makes this run's temporary file names distinct.
TMPSFX=${JOB_ID:-$$}

# The right-hand side is quoted so it is compared as a literal string;
# unquoted, [[ == ]] would treat it as a glob pattern.
if [[ $SLANG == "${TARGET_LANG%~*}" ]]; then
  INPUT="sentences"
else
  INPUT="sentences_${TARGET_LANG%~*}"
fi

OUTPUT="tokenised_${TARGET_LANG%~*}"
#######################################
# Tokenise text read from stdin and write the result to stdout.
# Globals:
#   TOKENISER    (read) tokeniser command; invoked with -a -q -l <lang>
#   TARGET_LANG  (read) language; a trailing "~..." part is stripped
# Both globals must be visible in the environment of the child bash that
# runs this exported function.
# Returns: exit status of the sed | tokeniser pipeline.
#######################################
tokenise() {
  # Tokenizer gets really unhappy with all-dot lines, so any line made up
  # solely of three or more dots is collapsed to exactly "...".
  #
  # $TOKENISER stays unquoted so a value holding a command plus arguments
  # keeps word-splitting as before.
  # NOTE(review): quote it if it is always a single path.
  # The language argument is quoted so it always arrives as one word.
  sed 's/^\.\.\.\.*$/.../' | $TOKENISER -a -q -l "${TARGET_LANG%~*}"
}
export -f tokenise
# An empty TMPDIR would silently turn the scratch path into "/<name>.gz".
: "${TMPDIR:?TMPDIR must be set to a scratch directory}"

# All paths are built once and always used quoted, so directories that
# contain whitespace or glob characters are handled correctly.
input_file="${BATCH}/${INPUT}.gz"
tmp_output="${TMPDIR}/${OUTPUT}.${TMPSFX}.gz"
staged_output="${BATCH}/${OUTPUT}.${TMPSFX}.gz"
final_output="${BATCH}/${OUTPUT}.gz"

# Remove the intermediate files on every exit path. After a successful run
# both names no longer exist (moved, then renamed), so this is a no-op.
trap 'rm -f -- "$tmp_output" "$staged_output"' EXIT

echo "Processing (${SLANG}) ${BATCH}"
# Each input line is treated as one base64-encoded document (this is how
# the checks below count it).
# NOTE(review): b64filter is an external helper; presumably it decodes each
# line, pipes it through the given command and re-encodes the result.
# Confirm this, including the meaning of its `cache` argument.
gzip -dc < "$input_file" \
  | b64filter cache bash -c tokenise \
  | gzip -9c \
  > "$tmp_output"

echo "Checking output"

# Same number of documents (one per line) before and after tokenisation.
docs_st=$(gzip -cd "$input_file" | wc -l)
docs_tk=$(gzip -cd "$tmp_output" | wc -l)
echo "Expecting $docs_st documents, found $docs_tk"
(( docs_st == docs_tk )) || exit 1

# Same number of text lines once the base64 layer is decoded.
lines_st=$(gzip -cd "$input_file" | base64 -d | wc -l)
lines_tk=$(gzip -cd "$tmp_output" | base64 -d | wc -l)
echo "Expecting $lines_st lines, found $lines_tk"
(( lines_st == lines_tk )) || exit 1

# Two-step move because the first one might fail and leave an
# incomplete file behind, which is tricky to detect. The first step puts
# the file into the batch directory under its suffixed name; the final
# name only appears through the rename in the second step.
mv -- "$tmp_output" "$staged_output"
mv -- "$staged_output" "$final_output"
echo "Moved result (${SLANG}) ${BATCH}/${OUTPUT}.gz"