forked from paracrawl/cirrus-scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
12.reduce-tmx
executable file
·37 lines (32 loc) · 1.31 KB
/
12.reduce-tmx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
#!/bin/bash
#
# 12.reduce-tmx: decompress the filtered inputs, build one TMX document from
# them, and store it twice: gzip-compressed as-is, and converted to text by
# tmxt (also gzip-compressed). Finally write a small stats file next to the
# TMX output.
#
# Usage: 12.reduce-tmx LANG TMX_OUTPUT TXT_OUTPUT [FILTERED_INPUT ...]
#   LANG            second language of the pair (passed as --lang2)
#   TMX_OUTPUT      destination of the compressed TMX
#   TXT_OUTPUT      destination of the compressed tmxt output
#   FILTERED_INPUT  compressed input files; with none given, pigz is invoked
#                   without file arguments
#
# Environment:
#   TARGET_LANG  first language of the pair (passed as --lang1); everything
#                from the last '~' onwards is stripped before use
#   PREFIX       directory containing src/bitextor (placed on PYTHONPATH)
#   SCRIPTS      directory containing bitextor-buildTMX.py
#   JOB_ID       optional; suffix for temporary files (default: shell PID)
set -euo pipefail

if (( $# < 3 )); then
  printf 'Usage: %s LANG TMX_OUTPUT TXT_OUTPUT [FILTERED_INPUT ...]\n' \
    "${0##*/}" >&2
  exit 2
fi

# Fail before any work starts rather than in the middle of the pipeline.
: "${TARGET_LANG?TARGET_LANG must be set}"
: "${PREFIX?PREFIX must be set}"
: "${SCRIPTS?SCRIPTS must be set}"

TMPSFX=${JOB_ID:-$$}

lang=$1
tmx_output=$2
txt_output=$3
shift 3

target_lang=${TARGET_LANG%~*}

# Outputs are written under a temporary name and renamed once complete, so a
# partially written file never appears under the final name.
tmx_tmp=$tmx_output.$TMPSFX
txt_tmp=$txt_output.$TMPSFX

# "$@" keeps each input path as its own word (spaces and glob characters in
# file names are passed through untouched).
# The TMX branch must use the same $TMPSFX suffix as the rename further down;
# writing it to ".$$" breaks the rename whenever JOB_ID is set.
pigz -cd "$@" \
  | PYTHONPATH="$PREFIX/src/bitextor" python3 "${SCRIPTS}/bitextor-buildTMX.py" \
    --lang1 "$target_lang" --lang2 "$lang" \
    -c "url1,url2,seg1,seg2,checksum1,checksum2,bifixerhash,bifixerscore,bicleaner,collection,lengthratio,numTokensSL,numTokensTL" \
    --no-delete-seg \
    --dedup "bifixerhash" \
  | tee >(pigz > "$tmx_tmp") \
  | tmxt --codelist="${target_lang},${lang}" \
  | pigz \
  > "$txt_tmp"

echo "Moving $txt_output.$TMPSFX to $txt_output"
mv "$txt_tmp" "$txt_output"

# NOTE: the pigz inside >(...) runs asynchronously and may still be writing
# when the pipeline above returns. Renaming the file does not disturb its open
# file descriptor, but the TMX size reported in the stats below can be too
# small if that pigz has not finished yet.
echo "Moving $tmx_output.$TMPSFX to $tmx_output"
mv "$tmx_tmp" "$tmx_output"

stats_file=$(dirname "$tmx_output")/$(basename "$tmx_output" .tmx.gz).stats.tmx
stats_tmp=$stats_file.$TMPSFX

# wc always prints the line count before the word count; read splits on any
# run of whitespace, so the column padding wc adds does not matter.
wc_out=$(gzip -cd "$txt_output" | cut -f 1 | wc -wl)
read -r sentence_pairs first_col_words <<< "$wc_out"

# The word count covers only the first tab-separated column of the text file.
# NOTE(review): the "English words" label presumably assumes TARGET_LANG is
# English -- confirm against the callers.
{
  echo "tmx"
  echo "TXT File Size: $(du -h "$txt_output" | cut -f 1)"
  echo "TMX File Size: $(du -h "$tmx_output" | cut -f 1)"
  echo "Sentence Pairs: $sentence_pairs"
  echo "English words: $first_col_words"
} > "$stats_tmp"
mv "$stats_tmp" "$stats_file"