From 48a83a2ca804da42b66d987d1baa02a712e46074 Mon Sep 17 00:00:00 2001
From: Ben Hearsum
Date: Fri, 12 May 2023 16:50:32 -0400
Subject: [PATCH] pipeline script

Let merge-corpus.sh accept complete file paths in addition to dataset
prefixes. When the first input already ends in .${ARTIFACT_EXT}, the
inputs are treated as full paths for both languages and each file is
routed to the source or target side by its .${SRC}.${ARTIFACT_EXT} /
.${TRG}.${ARTIFACT_EXT} suffix. Otherwise the inputs are treated as
prefixes and the language/extension suffix is appended as before.

---
 pipeline/clean/merge-corpus.sh | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/pipeline/clean/merge-corpus.sh b/pipeline/clean/merge-corpus.sh
index a7514a3e2..236c1d2d4 100755
--- a/pipeline/clean/merge-corpus.sh
+++ b/pipeline/clean/merge-corpus.sh
@@ -22,8 +22,28 @@ tmp="${output_prefix}/merge"
 mkdir -p "${tmp}"
 
 echo "### Merging"
-cat "${input_prefixes[@]/%/.${SRC}.${ARTIFACT_EXT}}" >"${tmp}/corpus.${SRC}.dup.${ARTIFACT_EXT}"
-cat "${input_prefixes[@]/%/.${TRG}.${ARTIFACT_EXT}}" >"${tmp}/corpus.${TRG}.dup.${ARTIFACT_EXT}"
+if [[ "${input_prefixes[0]}" == *".${ARTIFACT_EXT}" ]]; then
+  # Inputs are complete file paths covering both languages, so route each
+  # file to its own side; concatenating every input into both outputs would
+  # make the two corpora identical and break the line pairing done by paste.
+  src_files=()
+  trg_files=()
+  for input_file in "${input_prefixes[@]}"; do
+    case "${input_file}" in
+      *".${SRC}.${ARTIFACT_EXT}") src_files+=("${input_file}") ;;
+      *".${TRG}.${ARTIFACT_EXT}") trg_files+=("${input_file}") ;;
+    esac
+  done
+  if (( ${#src_files[@]} == 0 || ${#src_files[@]} != ${#trg_files[@]} )); then
+    echo "### Error: expected matching ${SRC} and ${TRG} input files" >&2
+    exit 1
+  fi
+  cat "${src_files[@]}" >"${tmp}/corpus.${SRC}.dup.${ARTIFACT_EXT}"
+  cat "${trg_files[@]}" >"${tmp}/corpus.${TRG}.dup.${ARTIFACT_EXT}"
+else
+  cat "${input_prefixes[@]/%/.${SRC}.${ARTIFACT_EXT}}" >"${tmp}/corpus.${SRC}.dup.${ARTIFACT_EXT}"
+  cat "${input_prefixes[@]/%/.${TRG}.${ARTIFACT_EXT}}" >"${tmp}/corpus.${TRG}.dup.${ARTIFACT_EXT}"
+fi
 
 echo "### Deduplication"
 paste <(${COMPRESSION_CMD} -dc "${tmp}/corpus.${SRC}.dup.${ARTIFACT_EXT}") <(${COMPRESSION_CMD} -dc "${tmp}/corpus.${TRG}.dup.${ARTIFACT_EXT}") |