# 11.reduce-filtered.sh (forked from paracrawl/cirrus-scripts)
#!/bin/bash
#
# Schedule a single job that combines the batches listed by
# `make_batch_list 11` for one or more collections into one output file.
#
# Usage: 11.reduce-filtered.sh LANG COLLECTION [COLLECTION...]
#
# Variables read here but not set here (expected from the sourced files):
#   DATA_CLEANING, TARGET_LANG, BICLEANER_THRESHOLD, RETRY, SLURM_LOGS, SCRIPTS
# Helpers called here but not defined here (presumably from functions.sh):
#   join_by, bicleaner_model, make_batch_list, prompt, confirm, schedule
set -euo pipefail

. ./env/init.sh
. ./config.sh
. ./functions.sh

# Validate arguments only after sourcing: files sourced without arguments share
# this script's positional parameters, so they may consume leading options
# before we look at them (NOTE(review): confirm against functions.sh).
if (( $# < 2 )); then
  printf 'Usage: %s LANG COLLECTION [COLLECTION...]\n' "${0##*/}" >&2
  exit 2
fi

lang=$1
shift

# Keep the collections as an array so each argument stays a single word.
declare -a collections=("$@")

# Order-independent identifier for this set of collections: the sorted names
# are piped to join_by with "-" as its argument.
collection_hash=$(printf "%s\n" "${collections[@]}" | sort | join_by -)

# Load the bicleaner model for this language as we need the $BICLEANER_THRESHOLD
bicleaner_model "$lang"

declare -a batch_lists=()
batch_count=0

for collection in "${collections[@]}"; do
  # make_batch_list is expected to print the path of one file with one batch
  # per line (NOTE(review): assumed from how the result is used below).
  batch_list=$(make_batch_list 11 "$collection" "$lang")

  # Count in a separate assignment so an unreadable list aborts the script
  # (set -e) instead of silently contributing 0 batches.
  list_size=$(wc -l < "$batch_list")
  batch_count=$(( batch_count + list_size ))

  batch_lists+=( "$batch_list" )
done

# ${VAR%~*} strips a trailing "~..." suffix from the language codes;
# ${BICLEANER_THRESHOLD/./} drops the first "." from the threshold.
output_file="${DATA_CLEANING}/${TARGET_LANG}-${lang}/${TARGET_LANG%~*}-${lang%~*}.${collection_hash}.filtered${BICLEANER_THRESHOLD/./}.gz"

# Skip scheduling only when the output already exists AND we are retrying.
# $RETRY is executed as a command, so it is expected to hold `true` or `false`.
if [[ ! -f "$output_file" ]] || ! $RETRY; then
  prompt "Scheduling 1-1 for combining $batch_count batches across ${#batch_lists[@]} collections\n"
  if confirm; then
    schedule \
      -J "reduce-filtered-${lang%~*}" \
      --time 36:00:00 \
      --cpus-per-task 16 \
      -e "${SLURM_LOGS}/11.reduce-filtered-%A.err" \
      -o "${SLURM_LOGS}/11.reduce-filtered-%A.out" \
      "${SCRIPTS}/11.reduce-filtered" "${output_file}" "${batch_lists[@]}"
  fi
fi