From 582338f470422e52cad7253fa5377b1ccc776196 Mon Sep 17 00:00:00 2001 From: Kirill Tsukanov Date: Sat, 11 Aug 2018 22:29:43 +0300 Subject: [PATCH 1/3] Suggestion: parallelize repeat masking I was trying to build the `nt` database and discovered that it would take roughly 12 hours to mask all of it in one thread. This example modification uses GNU Parallel to speed up repeat masking in proportion to the number of CPUs. Perhaps this code will need to be changed before being merged (e.g. checking that GNU Parallel is available in PATH, or taking into account the user setting of whether to use multiple threads), so this is only a draft and a suggestion. --- scripts/mask_low_complexity.sh | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/scripts/mask_low_complexity.sh b/scripts/mask_low_complexity.sh index e35151c..af5de4c 100755 --- a/scripts/mask_low_complexity.sh +++ b/scripts/mask_low_complexity.sh @@ -23,17 +23,25 @@ if ! which $MASKER > /dev/null; then exit 1 fi +function mask_data_chunk () { + # Removes empty records and performs masking, all in pipes + awk -v RS=">" -v FS="\n" -v ORS="" ' { if ($2) print ">"$0 } ' |\ + $MASKER -in - -outfmt fasta |\ + sed -e '/^>/!s/[a-z]/x/g' +} +export -f mask_data_chunk + if [ -d $target ]; then for file in $(find $target '(' -name '*.fna' -o -name '*.faa' ')'); do if [ ! -e "$file.masked" ]; then - $MASKER -in $file -outfmt fasta | sed -e '/^>/!s/[a-z]/x/g' > "$file.tmp" + cat $file | parallel --keep-order --pipe --recstart '>' --blocksize 100M mask_data_chunk > "$file.tmp" mv "$file.tmp" $file touch "$file.masked" fi done elif [ -f $target ]; then if [ ! 
-e "$target.masked" ]; then - $MASKER -in $target -outfmt fasta | sed -e '/^>/!s/[a-z]/x/g' > "$target.tmp" + cat $target | parallel --keep-order --pipe --recstart '>' --blocksize 100M mask_data_chunk > "$target.tmp" mv "$target.tmp" $target touch "$target.masked" fi From c621b4c7e2c9ad8bc06a978318380762ea092a48 Mon Sep 17 00:00:00 2001 From: Kirill Tsukanov Date: Sat, 11 Aug 2018 22:55:51 +0300 Subject: [PATCH 2/3] Do not keep order during parallelization Since keeping order of records (when comparing before/after low complexity masking) is not necessary, it is better to disable this option, since it degrades parallelism somewhat. --- scripts/mask_low_complexity.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/mask_low_complexity.sh b/scripts/mask_low_complexity.sh index af5de4c..611e182 100755 --- a/scripts/mask_low_complexity.sh +++ b/scripts/mask_low_complexity.sh @@ -34,14 +34,14 @@ export -f mask_data_chunk if [ -d $target ]; then for file in $(find $target '(' -name '*.fna' -o -name '*.faa' ')'); do if [ ! -e "$file.masked" ]; then - cat $file | parallel --keep-order --pipe --recstart '>' --blocksize 100M mask_data_chunk > "$file.tmp" + cat $file | parallel --pipe --recstart '>' --blocksize 100M mask_data_chunk > "$file.tmp" mv "$file.tmp" $file touch "$file.masked" fi done elif [ -f $target ]; then if [ ! 
-e "$target.masked" ]; then - cat $target | parallel --keep-order --pipe --recstart '>' --blocksize 100M mask_data_chunk > "$target.tmp" + cat $target | parallel --pipe --recstart '>' --blocksize 100M mask_data_chunk > "$target.tmp" mv "$target.tmp" $target touch "$target.masked" fi From 62f2eb28a8ad4d644a39ffed6f3f5697c4906152 Mon Sep 17 00:00:00 2001 From: Kirill Tsukanov Date: Sun, 12 Aug 2018 21:55:08 +0300 Subject: [PATCH 3/3] Fix error when passing $MASKER variable --- scripts/mask_low_complexity.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/mask_low_complexity.sh b/scripts/mask_low_complexity.sh index 611e182..b631cb3 100755 --- a/scripts/mask_low_complexity.sh +++ b/scripts/mask_low_complexity.sh @@ -25,6 +25,7 @@ fi function mask_data_chunk () { # Removes empty records and performs masking, all in pipes + MASKER=$1 awk -v RS=">" -v FS="\n" -v ORS="" ' { if ($2) print ">"$0 } ' |\ $MASKER -in - -outfmt fasta |\ sed -e '/^>/!s/[a-z]/x/g' @@ -34,14 +35,14 @@ export -f mask_data_chunk if [ -d $target ]; then for file in $(find $target '(' -name '*.fna' -o -name '*.faa' ')'); do if [ ! -e "$file.masked" ]; then - cat $file | parallel --pipe --recstart '>' --blocksize 100M mask_data_chunk > "$file.tmp" + cat $file | parallel --pipe --recstart '>' --blocksize 100M mask_data_chunk $MASKER > "$file.tmp" mv "$file.tmp" $file touch "$file.masked" fi done elif [ -f $target ]; then if [ ! -e "$target.masked" ]; then - cat $target | parallel --pipe --recstart '>' --blocksize 100M mask_data_chunk > "$target.tmp" + cat $target | parallel --pipe --recstart '>' --blocksize 100M mask_data_chunk $MASKER > "$target.tmp" mv "$target.tmp" $target touch "$target.masked" fi