forked from paracrawl/cirrus-scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
07.fix
executable file
·67 lines (59 loc) · 1.99 KB
/
07.fix
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#!/bin/bash
set -euo pipefail
shopt -s extglob
# Positional arguments:
#   $1  collection (stored but not referenced again in this script)
#   $2  language code of the batch being processed
#   $3  root of the target-language data, laid out as <shard>/<batch>/url.gz
#   $4  path of the batch directory to process
collection=$1
lang=$2
target_lang_data=$3
batch=$4

# The shard is the name of the directory that contains the batch directory.
# Both expansions are quoted so that a path containing whitespace or glob
# characters reaches dirname/basename as a single operand.
shard=$(basename "$(dirname "$batch")")

# Language code handed to bifixer/bicleaner: "no" is mapped to "nb", every
# other code is passed through unchanged.
if [[ "$lang" == no ]]; then
  bicleaner_lang=nb
else
  bicleaner_lang=$lang
fi

# Suffix for the in-progress output files: $JOB_ID when it is set and
# non-empty, otherwise the PID of this shell.
TMPSFX=${JOB_ID:-$$}

# Final output paths; the script writes to "<path>.$TMPSFX" first and renames
# once the outputs have been verified.
FIXED=$batch/fixed.gz
HARDRULED=$batch/hardruled.gz
# Note: this repo and bifixer/bicleaner use opposite conventions for which
# language is the "source" and which is the "target". That is why the paste
# pipeline below emits the columns in flipped order (target before source).
# Filter stdin to stdout: discard every tab-separated record whose third or
# fourth column is empty (or missing) and pass all other records through
# unchanged.
remove_empty_lines() {
  awk 'BEGIN { FS = "\t" } $3 == "" || $4 == "" { next } { print }'
}
# Build one TSV stream from every aligned-<N>.gz file in the batch and run it
# through bifixer and bicleaner-hardrules. Columns of the stream:
#   1,2  target & source URL (output of docjoin over the two url.gz files)
#   3    target sentence      (column 4 of the aligned file)
#   4    source sentence      (column 3 of the aligned file)
#   5    target checksum      (column 7 of the aligned file)
#   6    source checksum      (column 6 of the aligned file)
#   7,8  bifixer hash & score (appended by bifixer)
# After filter-unicode.py and remove_empty_lines the stream is tee'd: one copy
# is compressed into "$FIXED.$TMPSFX", the other is scored by
# bicleaner-hardrules and compressed into "$HARDRULED.$TMPSFX".
#
# All expansions are quoted so that paths containing whitespace or glob
# characters stay single words. $BIFIXER_PARAMS is deliberately left unquoted:
# it carries extra bifixer options that must be word-split.
for match in "$batch"/aligned-+([0-9]).gz; do
  printf '%s\n' "$match" >&2
  # Extract <N> from ".../aligned-<N>.gz"; it names the target-language batch
  # directory whose url.gz is joined below.
  matched_batch=${match##*-}
  matched_batch=${matched_batch%.gz}
  paste <(gzip -cd "$match" \
      | awk -F '\t' '{ print 0.0 "\t" $1 "\t" $2}' `# bitextor's docjoin expects a score column, which it then ignores` \
      | docjoin \
        -r "${target_lang_data}/${shard}/${matched_batch}/url.gz" \
        -l "$(dirname "$match")/url.gz") `# 1,2: target & source url`\
    <(gzip -cd "$match" | cut -f4) `# 3: target sentence (e.g. en)` \
    <(gzip -cd "$match" | cut -f3) `# 4: source sentence (e.g. mt)`\
    <(gzip -cd "$match" | cut -f7) `# 5: target checksum (skipping col 5: bleualign score)`\
    <(gzip -cd "$match" | cut -f6) `# 6: source checksum`
done \
| parallel -j "$THREADS" --pipe --line-buffer --halt 2 \
  bifixer - - "${TARGET_LANG%~*}" "${bicleaner_lang}" --sdeferredcol 5 --tdeferredcol 6 $BIFIXER_PARAMS `# 7,8: bifixer hash & score`\
| "${SCRIPTS}/filter-unicode.py" \
| remove_empty_lines \
| tee \
  >(pigz -9c > "$FIXED.$TMPSFX") \
| bicleaner-hardrules \
  --score_only \
  --tmp_dir "$TMPDIR" \
  --processes "$THREADS" \
  --source_lang "${TARGET_LANG%~*}" \
  --target_lang "$bicleaner_lang" \
  --scol 3 \
  --tcol 4 \
  --metadata "$BICLEANER_MODEL" \
  /dev/stdin /dev/stdout \
| pigz -9c \
  >"$HARDRULED.$TMPSFX"
# Sanity check before publishing: the hardrules output and the fixed output
# are required to have the same number of lines.
LC_HARDRULED=$(gzip -cd < "$HARDRULED.$TMPSFX" | wc -l)
LC_FIXED=$(gzip -cd < "$FIXED.$TMPSFX" | wc -l)
echo "hardruled.gz has $LC_HARDRULED lines, fixed.gz has $LC_FIXED lines."
if [[ "$LC_HARDRULED" -ne "$LC_FIXED" ]]; then
  # Say why we are stopping; the temporary files are left in place.
  echo "Line count mismatch between $HARDRULED.$TMPSFX and $FIXED.$TMPSFX" >&2
  exit 1
fi

# Only now move the temporary files to their final names.
mv -- "$FIXED.$TMPSFX" "$FIXED"
mv -- "$HARDRULED.$TMPSFX" "$HARDRULED"