diff --git a/.github/workflows/workflow_modules.yml b/.github/workflows/workflow_modules.yml index 74c9423d..3564777e 100644 --- a/.github/workflows/workflow_modules.yml +++ b/.github/workflows/workflow_modules.yml @@ -91,7 +91,9 @@ jobs: " --databases=/vol/scratch/databases/ " \ ./example_params/fullPipelineONT.yml "${WORK_DIR}" \ ${PROFILE} ${VERSION} || exit 1 - + - name: Cleanup previous checkm results for correct aggregation run + run: | + rm -rf output - name: Test Full Pipeline ONT and Illumina Run run: | VERSION=$(sort -r VERSIONS.txt | tail -n 1) @@ -120,6 +122,16 @@ jobs: runs-on: [ self-hosted, slurm] steps: - uses: actions/checkout@v2 + - name: Set Secrets + run: | + make set_secrets SECRET_NAME=S3_ACCESS SECRET_VALUE=${{ secrets.S3_ACCESS }} + make set_secrets SECRET_NAME=S3_SECRET SECRET_VALUE=${{ secrets.S3_VALUE }} + make set_secrets SECRET_NAME=S3_checkm_ACCESS SECRET_VALUE=${{ secrets.S3_ACCESS }} + make set_secrets SECRET_NAME=S3_checkm_SECRET SECRET_VALUE=${{ secrets.S3_VALUE }} + make set_secrets SECRET_NAME=S3_checkm2_ACCESS SECRET_VALUE=${{ secrets.S3_ACCESS }} + make set_secrets SECRET_NAME=S3_checkm2_SECRET SECRET_VALUE=${{ secrets.S3_VALUE }} + make set_secrets SECRET_NAME=S3_gtdb_ACCESS SECRET_VALUE=${{ secrets.S3_ACCESS }} + make set_secrets SECRET_NAME=S3_gtdb_SECRET SECRET_VALUE=${{ secrets.S3_VALUE }} - name: Test MAG Attributes run: | bash ./scripts/test_magAttributes.sh \ @@ -131,17 +143,7 @@ jobs: bash ./scripts/test_magAttributes.sh \ " --databases=/vol/scratch/databases/ " \ example_params/magAttributes_fraction/magAttributes.yml "${WORK_DIR}" ${PROFILE} || exit 1 - - name: Set Secrets - run: | - make set_secrets SECRET_NAME=S3_ACCESS SECRET_VALUE=${{ secrets.S3_ACCESS }} - make set_secrets SECRET_NAME=S3_SECRET SECRET_VALUE=${{ secrets.S3_VALUE }} - make set_secrets SECRET_NAME=S3_checkm_ACCESS SECRET_VALUE=${{ secrets.S3_ACCESS }} - make set_secrets SECRET_NAME=S3_checkm_SECRET SECRET_VALUE=${{ secrets.S3_VALUE }} - make 
set_secrets SECRET_NAME=S3_gtdb_ACCESS SECRET_VALUE=${{ secrets.S3_ACCESS }} - make set_secrets SECRET_NAME=S3_gtdb_SECRET SECRET_VALUE=${{ secrets.S3_VALUE }} - - - - name: Test Checkm runs against different database parameters + - name: Test Checkm1 runs against different database parameters run: | make runDatabaseTest MODULE_DB_TEST_EXTRACTED="/vol/spool/toolkit/checkm/" \ MODULE_DB_TEST_MD5SUM="0963b301dfe9345ea4be1246e32f6728" \ @@ -149,11 +151,26 @@ jobs: MODULE_DB_TEST_PATH="/vol/spool/toolkit/checkm_data_2015_01_16.tar.gz" MODULE_DB_TEST_S3PATH="s3://databases/checkm_data_2015_01_16.tar.gz" \ MODULE_DB_TEST_S3_DIRECTORY_PATH="s3://databases/checkm/*" \ MODULE_DB_TEST_S5CMD_COMMAND='" --retry-count 30 --no-verify-ssl --endpoint-url https://openstack.cebitec.uni-bielefeld.de:8080 "' \ - MODULE_DB_TEST_GENERATED_YML=/vol/spool/generated_yamls/checkm \ + MODULE_DB_TEST_CREDENTIALS=/vol/spool/credentials MODULE_DB_TEST_GENERATED_YML=${WORK_DIR}/generated_yamls/checkm \ MODULE_DB_TEST_YML=example_params/magAttributes.yml \ - MODULE_DB_TEST_YML_PATH="'.steps.magAttributes.checkm.database=env(database),del(.steps.magAttributes.gtdb,.steps.magAttributes.prokka)'" \ + MODULE_DB_TEST_YML_PATH="'.steps.magAttributes.checkm.database=env(database),del(.steps.magAttributes.gtdb,.steps.magAttributes.checkm2,.steps.magAttributes.prokka)'" \ MODULE_DB_TEST_YML_SCRIPT="./scripts/test_magAttributes.sh" MODULE_DB_TEST_GENERATED_YML_DIR="${WORK_DIR}/checkm_yaml_database_tests" \ MODULE_DB_TEST_REMOVE_DB="yes" + + - name: Test Checkm2 runs against different database parameters + run: | + make runDatabaseTest MODULE_DB_TEST_EXTRACTED="/vol/spool/toolkit/checkm2/" \ + MODULE_DB_TEST_MD5SUM="a634cb3d31a1f56f2912b74005f25f09" \ + MODULE_DB_TEST_HTTPS="https://openstack.cebitec.uni-bielefeld.de:8080/databases/checkm2_v2.tar.gz" \ + MODULE_DB_TEST_PATH="/vol/spool/toolkit/checkm2_v2.tar.gz" MODULE_DB_TEST_S3PATH="s3://databases/checkm2_v2.tar.gz" \ + 
MODULE_DB_TEST_S3_DIRECTORY_PATH="s3://databases/checkm2/*" \ + MODULE_DB_TEST_S5CMD_COMMAND='" --retry-count 30 --no-verify-ssl --endpoint-url https://openstack.cebitec.uni-bielefeld.de:8080 "' \ + MODULE_DB_TEST_CREDENTIALS=/vol/spool/credentials MODULE_DB_TEST_GENERATED_YML=${WORK_DIR}/generated_yamls/checkm2 \ + MODULE_DB_TEST_YML=example_params/magAttributes.yml \ + MODULE_DB_TEST_YML_PATH="'.steps.magAttributes.checkm2.database=env(database),del(.steps.magAttributes.gtdb,.steps.magAttributes.checkm,.steps.magAttributes.prokka)'" \ + MODULE_DB_TEST_YML_SCRIPT="./scripts/test_magAttributes.sh" MODULE_DB_TEST_GENERATED_YML_DIR="${WORK_DIR}/checkm2_yaml_database_tests" \ + MODULE_DB_TEST_REMOVE_DB="yes" + - name: Test GTDB runs against different database parameters run: | @@ -163,9 +180,10 @@ jobs: MODULE_DB_TEST_PATH="/vol/spool/toolkit/gtdbtk_r214_data.tar.gz" MODULE_DB_TEST_S3PATH="s3://databases/gtdbtk_r214_data.tar.gz" \ MODULE_DB_TEST_S3_DIRECTORY_PATH="s3://databases/gtdbtk_r214_data/*" \ MODULE_DB_TEST_S5CMD_COMMAND='" --retry-count 30 --no-verify-ssl --endpoint-url https://openstack.cebitec.uni-bielefeld.de:8080 "' \ - MODULE_DB_TEST_GENERATED_YML=/vol/spool/generated_yamls/gtdb \ + MODULE_DB_TEST_CREDENTIALS=/vol/spool/credentials \ + MODULE_DB_TEST_GENERATED_YML=${WORK_DIR}/generated_yamls/gtdb \ MODULE_DB_TEST_YML=example_params/magAttributes.yml \ - MODULE_DB_TEST_YML_PATH="'.steps.magAttributes.gtdb.database=env(database),del(.steps.magAttributes.checkm,.steps.magAttributes.prokka)'" \ + MODULE_DB_TEST_YML_PATH="'.steps.magAttributes.gtdb.database=env(database),del(.steps.magAttributes.checkm,.steps.magAttributes.checkm2,.steps.magAttributes.prokka)'" \ MODULE_DB_TEST_YML_SCRIPT="./scripts/test_magAttributes.sh" MODULE_DB_TEST_GENERATED_YML_DIR="${WORK_DIR}/gtdb_yaml_database_tests" \ MODULE_DB_TEST_REMOVE_DB="yes" @@ -296,7 +314,8 @@ jobs: MODULE_DB_TEST_PATH="/vol/spool/toolkit/card_20221209.tar.bz2" 
MODULE_DB_TEST_S3PATH="s3://databases/card_20221209.tar.bz2" \ MODULE_DB_TEST_S3_DIRECTORY_PATH="s3://databases/card_20221209/*" \ MODULE_DB_TEST_S5CMD_COMMAND='" --retry-count 30 --no-verify-ssl --endpoint-url https://openstack.cebitec.uni-bielefeld.de:8080 "' \ - MODULE_DB_TEST_GENERATED_YML=/vol/spool/generated_yamls/rgi \ + MODULE_DB_TEST_CREDENTIALS=/vol/spool/credentials \ + MODULE_DB_TEST_GENERATED_YML=${WORK_DIR}/generated_yamls/rgi \ MODULE_DB_TEST_YML=example_params/annotation.yml \ MODULE_DB_TEST_YML_PATH="'.steps.annotation.rgi.database=env(database),del(.steps.annotation.mmseqs2,.steps.annotation.keggFromBlast)'" \ MODULE_DB_TEST_YML_SCRIPT="./scripts/test_annotation.sh" MODULE_DB_TEST_GENERATED_YML_DIR="${WORK_DIR}/rgi_yaml_database_tests" \ @@ -311,7 +330,8 @@ jobs: MODULE_DB_TEST_PATH="/vol/spool/toolkit/kegg.tar.gz" MODULE_DB_TEST_S3PATH="s3://databases_internal/kegg-links-mirror-2021-01.tar.gz" \ MODULE_DB_TEST_S3_DIRECTORY_PATH="s3://databases_internal/kegg-links-mirror-2021-01/*" \ MODULE_DB_TEST_S5CMD_COMMAND='" --retry-count 30 --no-verify-ssl --endpoint-url https://openstack.cebitec.uni-bielefeld.de:8080 "' \ - MODULE_DB_TEST_GENERATED_YML=/vol/spool/generated_yamls/kegg \ + MODULE_DB_TEST_CREDENTIALS=/vol/spool/credentials \ + MODULE_DB_TEST_GENERATED_YML=${WORK_DIR}/generated_yamls/kegg \ MODULE_DB_TEST_YML=example_params/annotation.yml \ MODULE_DB_TEST_YML_PATH="'.steps.annotation.keggFromBlast.database=env(database),del(.steps.annotation.rgi)'" \ MODULE_DB_TEST_YML_SCRIPT="./scripts/test_annotation.sh" MODULE_DB_TEST_GENERATED_YML_DIR="${WORK_DIR}/kegg_yaml_database_tests" \ @@ -339,7 +359,8 @@ jobs: MODULE_DB_TEST_PATH="/vol/spool/toolkit/kegg-mirror-2021-01_mmseqs.tar.zst" MODULE_DB_TEST_S3PATH="s3://databases_internal/kegg-mirror-2021-01_mmseqs.tar.zst" \ MODULE_DB_TEST_S3_DIRECTORY_PATH="s3://databases_internal/kegg-mirror-2021-01_mmseqs/*" \ MODULE_DB_TEST_S5CMD_COMMAND='" --retry-count 30 --no-verify-ssl --endpoint-url 
https://openstack.cebitec.uni-bielefeld.de:8080 "' \ - MODULE_DB_TEST_GENERATED_YML=/vol/spool/generated_yamls/mmseqs2 \ + MODULE_DB_TEST_CREDENTIALS=/vol/spool/credentials \ + MODULE_DB_TEST_GENERATED_YML=${WORK_DIR}/generated_yamls/mmseqs2 \ MODULE_DB_TEST_YML=example_params/annotation.yml \ MODULE_DB_TEST_YML_PATH="'.steps.annotation.mmseqs2.kegg.database=env(database),del(.steps.annotation.rgi,.steps.annotation.keggFromBlast)'" \ MODULE_DB_TEST_YML_SCRIPT="./scripts/test_annotation.sh" \ @@ -387,7 +408,8 @@ jobs: MODULE_DB_TEST_S3PATH="s3://databases/plasmids_plsdb_20220929.tar.bz2" \ MODULE_DB_TEST_S3_DIRECTORY_PATH="s3://databases/plasmids_plsdb_20220929/*" \ MODULE_DB_TEST_S5CMD_COMMAND='" --retry-count 30 --no-verify-ssl --endpoint-url https://openstack.cebitec.uni-bielefeld.de:8080 "' \ - MODULE_DB_TEST_GENERATED_YML=/vol/spool/generated_yamls/plasmids \ + MODULE_DB_TEST_CREDENTIALS=/vol/spool/credentials \ + MODULE_DB_TEST_GENERATED_YML=${WORK_DIR}/generated_yamls/plasmids \ MODULE_DB_TEST_YML=example_params/plasmid.yml MODULE_DB_TEST_YML_PATH="'.steps.plasmid.PLSDB.database=env(database),del(.steps.plasmid.Platon,.steps.plasmid.ViralVerifyPlasmid,.steps.plasmid.MobTyper)'" \ MODULE_DB_TEST_YML_SCRIPT="./scripts/test_plasmids.sh" MODULE_DB_TEST_GENERATED_YML_DIR="${WORK_DIR}/plasmid_yaml_database_tests" \ MODULE_DB_TEST_REMOVE_DB="yes" @@ -399,7 +421,7 @@ jobs: MODULE_DB_TEST_PATH="/vol/spool/toolkit/pfam-A_35.0.hmm.gz" MODULE_DB_TEST_S3PATH="s3://databases/pfam-A_35.0.hmm.gz" \ MODULE_DB_TEST_S3_DIRECTORY_PATH="s3://databases/pfam-A_35.*.hmm" \ MODULE_DB_TEST_S5CMD_COMMAND='" --retry-count 30 --no-verify-ssl --no-sign-request --endpoint-url https://openstack.cebitec.uni-bielefeld.de:8080 "' \ - MODULE_DB_TEST_GENERATED_YML=/vol/spool/generated_yamls/plasmids MODULE_DB_TEST_YML=example_params/plasmid.yml \ + MODULE_DB_TEST_CREDENTIALS=/vol/spool/credentials MODULE_DB_TEST_GENERATED_YML=${WORK_DIR}/generated_yamls/plasmids 
MODULE_DB_TEST_YML=example_params/plasmid.yml \ MODULE_DB_TEST_YML_PATH="'.steps.plasmid.ViralVerifyPlasmid.database=env(database),del(.steps.plasmid.Platon,.steps.plasmid.PlasClass,.steps.plasmid.MobTyper,.steps.plasmid.PLSDB)'" \ MODULE_DB_TEST_YML_SCRIPT="./scripts/test_plasmids.sh" \ MODULE_DB_TEST_GENERATED_YML_DIR="${WORK_DIR}/plasmid_yaml_database_tests" \ @@ -413,7 +435,7 @@ jobs: MODULE_DB_TEST_PATH="/vol/spool/toolkit/platon_20220929.tar.gz" MODULE_DB_TEST_S3PATH="s3://databases/platon_20220929.tar.gz" \ MODULE_DB_TEST_S3_DIRECTORY_PATH="s3://databases/platon/*" \ MODULE_DB_TEST_S5CMD_COMMAND='" --retry-count 30 --no-verify-ssl --no-sign-request --endpoint-url https://openstack.cebitec.uni-bielefeld.de:8080 "' \ - MODULE_DB_TEST_GENERATED_YML=/vol/spool/generated_yamls/plasmids MODULE_DB_TEST_YML=example_params/plasmid.yml \ + MODULE_DB_TEST_CREDENTIALS=/vol/spool/credentials MODULE_DB_TEST_GENERATED_YML=${WORK_DIR}/generated_yamls/plasmids MODULE_DB_TEST_YML=example_params/plasmid.yml \ MODULE_DB_TEST_YML_PATH="'.steps.plasmid.Platon.database=env(database),del(.steps.plasmid.ViralVerifyPlasmid,.steps.plasmid.PlasClass,.steps.plasmid.MobTyper,.steps.plasmid.PLSDB)'" \ MODULE_DB_TEST_YML_SCRIPT="./scripts/test_plasmids.sh" MODULE_DB_TEST_GENERATED_YML_DIR="${WORK_DIR}/plasmid_yaml_database_tests" \ MODULE_DB_TEST_REMOVE_DB="yes" @@ -426,7 +448,7 @@ jobs: MODULE_DB_TEST_PATH="/vol/spool/toolkit/mob_20220929.gz" MODULE_DB_TEST_S3PATH="s3://databases/mob_20220929.gz" \ MODULE_DB_TEST_S3_DIRECTORY_PATH="s3://databases/mob_20220929/*" \ MODULE_DB_TEST_S5CMD_COMMAND='" --retry-count 30 --no-verify-ssl --no-sign-request --endpoint-url https://openstack.cebitec.uni-bielefeld.de:8080 "' \ - MODULE_DB_TEST_GENERATED_YML=/vol/spool/generated_yamls/plasmids MODULE_DB_TEST_YML=example_params/plasmid.yml \ + MODULE_DB_TEST_CREDENTIALS=/vol/spool/credentials MODULE_DB_TEST_GENERATED_YML=${WORK_DIR}/generated_yamls/plasmids MODULE_DB_TEST_YML=example_params/plasmid.yml \ 
MODULE_DB_TEST_YML_PATH="'.steps.plasmid.MobTyper.database=env(database),del(.steps.plasmid.ViralVerifyPlasmid,.steps.plasmid.PlasClass,.steps.plasmid.Platon,.steps.plasmid.PLSDB)'" \ MODULE_DB_TEST_YML_SCRIPT="./scripts/test_plasmids.sh" MODULE_DB_TEST_GENERATED_YML_DIR="${WORK_DIR}/plasmid_yaml_database_tests" \ MODULE_DB_TEST_REMOVE_DB="yes" diff --git a/bin/emgb.sh b/bin/emgb.sh index 685627de..f78806cc 100755 --- a/bin/emgb.sh +++ b/bin/emgb.sh @@ -79,7 +79,7 @@ function getContigs { function getBins { - checkm=$(find $OUTPUT_PATH/$RUN_ID/magAttributes/*/checkm/ -name "*_checkm_*.tsv" -exec readlink -f {} \; | sed 's/^/ -checkm-tsv /g') + checkm=$(find $OUTPUT_PATH/$RUN_ID/magAttributes/*/checkm*/ -name "*_checkm*_*.tsv" -exec readlink -f {} \; | sed 's/^/ -checkm-tsv /g') gtdbtk=$(find $OUTPUT_PATH/$RUN_ID/magAttributes/*/gtdb/ -name "*.summary.tsv" -exec readlink -f {} \; | sed 's/^/ -gtdbtk-tsvs /g') bins=$(find $BINS_DIR -name "*_bin.*.fa" -exec readlink -f {} \; | tail -n 1 | rev | cut -f 1 -d '/' | rev | cut -d '.' -f 1 | sed 's/^/ -bin-id-prefix /g') json=" -json-gz $(pwd)/${NAME}.bins.json.gz " diff --git a/default/fullPipeline_illumina_nanpore.yml b/default/fullPipeline_illumina_nanpore.yml index 241dc490..b2add82a 100644 --- a/default/fullPipeline_illumina_nanpore.yml +++ b/default/fullPipeline_illumina_nanpore.yml @@ -101,18 +101,12 @@ steps: source: https://openstack.cebitec.uni-bielefeld.de:8080/databases/gtdbtk_r214_data.tar.gz md5sum: 390e16b3f7b0c4463eb7a3b2149261d9 additionalParams: " --min_af 0.65 --scratch_dir . 
" - # checkm lineage_wf - checkm: + checkm2: database: download: - # Use Version number in databse names - source: "https://openstack.cebitec.uni-bielefeld.de:8080/databases/checkm_data_2015_01_16.tar.gz" - md5sum: 0963b301dfe9345ea4be1246e32f6728 - buffer: 200 - additionalParams: - tree: " --reduced_tree " - lineage_set: " " - qa: " " + source: "https://openstack.cebitec.uni-bielefeld.de:8080/databases/checkm2_v2.tar.gz" + md5sum: a634cb3d31a1f56f2912b74005f25f09 + additionalParams: " " fragmentRecruitment: mashScreen: genomes: test_data/fragmentRecruitment/mags.tsv diff --git a/default/fullPipeline_illumina_nanpore_without_aggregate.yml b/default/fullPipeline_illumina_nanpore_without_aggregate.yml index d2213731..caafd586 100644 --- a/default/fullPipeline_illumina_nanpore_without_aggregate.yml +++ b/default/fullPipeline_illumina_nanpore_without_aggregate.yml @@ -101,18 +101,12 @@ steps: source: https://openstack.cebitec.uni-bielefeld.de:8080/databases/gtdbtk_r214_data.tar.gz md5sum: 390e16b3f7b0c4463eb7a3b2149261d9 additionalParams: " --min_af 0.65 --scratch_dir . 
" - # checkm lineage_wf - checkm: + checkm2: database: download: - # Use Version number in databse names - source: "https://openstack.cebitec.uni-bielefeld.de:8080/databases/checkm_data_2015_01_16.tar.gz" - md5sum: 0963b301dfe9345ea4be1246e32f6728 - buffer: 200 - additionalParams: - tree: " --reduced_tree " - lineage_set: " " - qa: " " + source: "https://openstack.cebitec.uni-bielefeld.de:8080/databases/checkm2_v2.tar.gz" + md5sum: a634cb3d31a1f56f2912b74005f25f09 + additionalParams: " " fragmentRecruitment: mashScreen: genomes: test_data/fragmentRecruitment/mags.tsv diff --git a/docker/toolkit-checkm2/Dockerfile b/docker/toolkit-checkm2/Dockerfile new file mode 100644 index 00000000..76ace6ea --- /dev/null +++ b/docker/toolkit-checkm2/Dockerfile @@ -0,0 +1,14 @@ +FROM ubuntu:focal +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update \ + && apt install -y git python3-pip prodigal diamond-aligner wget \ + && pip3 install setuptools wheel requests packaging tqdm \ + && pip3 install protobuf==3.20.* scikit-learn==0.23.2 h5py==2.10.0 numpy==1.19.2 tensorflow==2.5.0 lightgbm==3.2.1 pandas==1.4.0 scipy==1.8.0 + +ENV CHECKM2_VERSION=1.0.2 + +RUN git clone -b $CHECKM2_VERSION --recursive https://github.com/chklovski/checkm2.git && cd checkm2 \ + && python3 setup.py install + +ENV PATH="$HOME/.local/bin:$PATH" diff --git a/docker/toolkit-checkm2/VERSION b/docker/toolkit-checkm2/VERSION new file mode 100644 index 00000000..446dec6f --- /dev/null +++ b/docker/toolkit-checkm2/VERSION @@ -0,0 +1 @@ +1.0.2-1 diff --git a/docs/modules/magAttributes.md b/docs/modules/magAttributes.md index 272929c2..9c4a1eec 100644 --- a/docs/modules/magAttributes.md +++ b/docs/modules/magAttributes.md @@ -49,7 +49,8 @@ nextflow secrets set S3_checkm_SECRET XXXXXXX All GTDB files include the GTDB specific columns in addition to a `SAMPLE` column (`SAMPLE_gtdbtk.bac120.summary.tsv`, `SAMPLE_gtdbtk.ar122.summary.tsv`). 
In addition, this module produces a file `SAMPLE_gtdbtk_CHUNK.tsv` that combines both files and adds a `BIN_ID` column that adheres to the magAttributes specification -### Checkm +### Checkm and Checkm2 -The Checkm output adheres to the magAttributes specification and adds a `BIN_ID` and `SAMPLE` column to the output file. +The Checkm and Checkm2 outputs adhere to the magAttributes specification and add a `BIN_ID` and `SAMPLE` column to the output file. +If Checkm2 and Checkm are both specified in the config file, then only the Checkm2 results are used for downstream pipeline steps. diff --git a/example_params/fullPipeline.yml b/example_params/fullPipeline.yml index 285b83eb..82ca090d 100644 --- a/example_params/fullPipeline.yml +++ b/example_params/fullPipeline.yml @@ -53,16 +53,12 @@ steps: source: https://openstack.cebitec.uni-bielefeld.de:8080/databases/gtdbtk_r214_data.tar.gz md5sum: 390e16b3f7b0c4463eb7a3b2149261d9 additionalParams: " --min_af 0.65 --scratch_dir . " - checkm: + checkm2: database: download: - source: "https://openstack.cebitec.uni-bielefeld.de:8080/databases/checkm_data_2015_01_16.tar.gz" - md5sum: 0963b301dfe9345ea4be1246e32f6728 - buffer: 200 - additionalParams: - tree: " --reduced_tree " - lineage_set: " " - qa: " " + source: "https://openstack.cebitec.uni-bielefeld.de:8080/databases/checkm2_v2.tar.gz" + md5sum: a634cb3d31a1f56f2912b74005f25f09 + additionalParams: " " fragmentRecruitment: mashScreen: genomes: test_data/fragmentRecruitment/mags.tsv diff --git a/example_params/fullPipelineIlluminaOrONT.yml b/example_params/fullPipelineIlluminaOrONT.yml index 99ec6618..5e44a4b8 100644 --- a/example_params/fullPipelineIlluminaOrONT.yml +++ b/example_params/fullPipelineIlluminaOrONT.yml @@ -92,16 +92,12 @@ steps: source: https://openstack.cebitec.uni-bielefeld.de:8080/databases/gtdbtk_r214_data.tar.gz md5sum: 390e16b3f7b0c4463eb7a3b2149261d9 additionalParams: " --min_af 0.65 --scratch_dir . 
" - checkm: + checkm2: database: download: - source: "https://openstack.cebitec.uni-bielefeld.de:8080/databases/checkm_data_2015_01_16.tar.gz" - md5sum: 0963b301dfe9345ea4be1246e32f6728 - buffer: 200 - additionalParams: - tree: " --reduced_tree " - lineage_set: " " - qa: " " + source: "https://openstack.cebitec.uni-bielefeld.de:8080/databases/checkm2_v2.tar.gz" + md5sum: a634cb3d31a1f56f2912b74005f25f09 + additionalParams: " " dereplication: bottomUpClustering: minimumCompleteness: 0 diff --git a/example_params/magAttributes.yml b/example_params/magAttributes.yml index 430c3dbf..9aca6b1b 100644 --- a/example_params/magAttributes.yml +++ b/example_params/magAttributes.yml @@ -17,6 +17,12 @@ steps: source: https://openstack.cebitec.uni-bielefeld.de:8080/databases/gtdbtk_r214_data.tar.gz md5sum: 390e16b3f7b0c4463eb7a3b2149261d9 additionalParams: " --min_af 0.65 --scratch_dir . " + checkm2: + database: + download: + source: "https://openstack.cebitec.uni-bielefeld.de:8080/databases/checkm2_v2.tar.gz" + md5sum: a634cb3d31a1f56f2912b74005f25f09 + additionalParams: " " checkm: database: download: diff --git a/main.nf b/main.nf index 923b1c85..43381ab3 100644 --- a/main.nf +++ b/main.nf @@ -328,6 +328,18 @@ workflow wAggregatePipeline { | map { sra, bins -> bins} \ | set { checkm } + // get Checkm2 results + Pattern checkm2Pattern = Pattern.compile('.*/magAttributes/' + params.modules.magAttributes.version.major + '..*/.*/.*_checkm2_.*.tsv$') + selectedSRAMagAttributes | filter({ sra, path -> checkm2Pattern.matcher(path.toString()).matches()}) \ + | splitCsv(header: true, sep: '\t') \ + | map { sra, bins -> bins} \ + | set { checkm2 } + + // Only one of checkm or checkm2 is ever executed, so mixing the channels selects whichever result exists + checkm \ + | mix(checkm2) \ + | set {checkm} + + // get gtdbtk summary files Pattern gtdbPattern = Pattern.compile('.*/magAttributes/' + params.modules.magAttributes.version.major + '..*/.*/.*_gtdbtk_combined.tsv$' ) selectedSRAMagAttributes | filter({ sra, path -> 
gtdbPattern.matcher(path.toString()).matches()}) \ diff --git a/modules/magAttributes/module.nf b/modules/magAttributes/module.nf index 616e532a..3ef880e1 100644 --- a/modules/magAttributes/module.nf +++ b/modules/magAttributes/module.nf @@ -46,7 +46,8 @@ process pCheckM { publishDir params.output, mode: "${params.publishDirMode}", saveAs: { filename -> getOutput("${sample}", params.runid, "checkm", filename) }, \ pattern: "{**.tsv}" - when params.steps.containsKey("magAttributes") && params.steps.magAttributes.containsKey("checkm") + when params.steps.containsKey("magAttributes") && params.steps.magAttributes.containsKey("checkm") \ + && !params.steps.magAttributes.containsKey("checkm2") containerOptions Utils.getDockerMount(params.steps?.magAttributes?.checkm?.database, params) @@ -74,6 +75,44 @@ process pCheckM { } +process pCheckM2 { + + container "${params.checkm2_image}" + + tag "Sample: $sample" + + secret { "${S3_checkm2_ACCESS}"!="" ? ["S3_checkm2_ACCESS", "S3_checkm2_SECRET"] : [] } + + publishDir params.output, mode: "${params.publishDirMode}", saveAs: { filename -> getOutput("${sample}", params.runid, "checkm2", filename) }, \ + pattern: "{**.tsv}" + + when params.steps.containsKey("magAttributes") && params.steps.magAttributes.containsKey("checkm2") + + containerOptions Utils.getDockerMount(params.steps?.magAttributes?.checkm2?.database, params) + + beforeScript "mkdir -p ${params.polished.databases}" + + label 'medium' + + input: + tuple val(sample), val(ending), path(bins) + + output: + tuple path("${sample}_checkm2_*.tsv", type: "file"), val("${sample}"), emit: checkm + tuple env(FILE_ID), val("${output}"), val(params.LOG_LEVELS.INFO), file(".command.sh"), \ + file(".command.out"), file(".command.err"), file(".command.log"), emit: logs + + shell: + output = getOutput("${sample}", params.runid, "checkm2", "") + S5CMD_PARAMS=params?.steps?.magAttributes?.checkm2?.database?.download?.s5cmd?.params ?: "" + 
DOWNLOAD_LINK=params?.steps?.magAttributes?.checkm2?.database?.download?.source ?: "" + MD5SUM=params.steps?.magAttributes?.checkm2?.database?.download?.md5sum ?: "" + EXTRACTED_DB=params.steps?.magAttributes?.checkm2?.database?.extractedDBPath ?: "" + S3_checkm2_ACCESS=params?.steps?.magAttributes?.checkm2?.database?.download?.s5cmd && S5CMD_PARAMS.indexOf("--no-sign-request") == -1 ? "\$S3_checkm2_ACCESS" : "" + S3_checkm2_SECRET=params?.steps?.magAttributes?.checkm2?.database?.download?.s5cmd && S5CMD_PARAMS.indexOf("--no-sign-request") == -1 ? "\$S3_checkm2_SECRET" : "" + template 'checkm2.sh' +} + process pGtdbtk { container "${params.gtdbtk_image}" @@ -249,6 +288,7 @@ workflow _wMagAttributes { main: GTDB_DEFAULT_BUFFER = 500 CHECKM_DEFAULT_BUFFER = 30 + CHECKM2_DEFAULT_BUFFER = 20000 BIN_FILES_INPUT_IDX = 1 DATASET_IDX = 0 @@ -261,18 +301,22 @@ workflow _wMagAttributes { // get file ending of bin files (.fa, .fasta, ...) and group by file ending and dataset bins | flatMap({n -> groupBins(n, params?.steps?.magAttributes?.checkm?.buffer ?: CHECKM_DEFAULT_BUFFER)}) \ | pCheckM | set {checkm} + bins | flatMap({n -> groupBins(n, params?.steps?.magAttributes?.checkm2?.buffer ?: CHECKM2_DEFAULT_BUFFER)}) \ + | pCheckM2 | set { checkm2 } bins | flatMap({n -> groupBins(n, params?.steps?.magAttributes?.gtdb?.buffer ?: GTDB_DEFAULT_BUFFER )}) \ | pGtdbtk | set {gtdb} - // Prepare checkm output file - checkm.checkm | groupTuple(by: DATASET_OUTPUT_IDX, remainder: true) | map { it -> it[BIN_FILES_OUTPUT_GROUP_IDX] } | flatten | map { bin -> file(bin) } \ + checkm2.checkm | mix(checkm.checkm) | set {checkmSelected} + + // Prepare checkm2 output file + checkmSelected | groupTuple(by: DATASET_OUTPUT_IDX, remainder: true) | map { it -> it[BIN_FILES_OUTPUT_GROUP_IDX] } | flatten | map { bin -> file(bin) } \ | collectFile(keepHeader: true, newLine: false ){ item -> [ "bin_attributes.tsv", item.text ] } \ | splitCsv(sep: '\t', header: true) \ - | set{ checkm_list } + | set{ 
checkmSelectedList } if(params.summary){ - // collect checkm files for checkm results across multiple datasets - checkm.checkm \ + // collect checkm files for checkm2 results across multiple datasets + checkmSelected \ | collectFile(newLine: false, keepHeader: true, storeDir: params.output + "/summary/"){ item -> [ "checkm.tsv", item[BIN_FILES_OUTPUT_IDX].text ] } @@ -290,6 +334,6 @@ workflow _wMagAttributes { pGtdbtk.out.logs | mix(pCheckM.out.logs) | pDumpLogs emit: - checkm = checkm_list + checkm = checkmSelectedList gtdb = gtdb.combined } diff --git a/nextflow.config b/nextflow.config index c85ffe5c..a49ab00f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -282,6 +282,7 @@ params { maxbin_image = "quay.io/biocontainers/maxbin2:2.2.7--he1b5a44_2" gtdbtk_image = "quay.io/metagenomics/toolkit-gtdbtk:2.1.1-2" checkm_image = "quay.io/metagenomics/toolkit-checkm:0.1.0" + checkm2_image = "quay.io/metagenomics/toolkit-checkm2:1.0.2-1" prokka_image = "quay.io/metagenomics/toolkit-prokka:1.14.6-1" fastp_image = "quay.io/biocontainers/fastp:0.23.2--h79da9fb_0" nanoplot_image = "quay.io/biocontainers/nanoplot:1.40.0--pyhdfd78af_0" diff --git a/templates/checkm2.sh b/templates/checkm2.sh new file mode 100644 index 00000000..689686a4 --- /dev/null +++ b/templates/checkm2.sh @@ -0,0 +1,53 @@ + +# Prepare checkm patch, output directory and output file name +mkdir out +FILE_ID=$(mktemp XXXXXXXX) +OUTPUT=!{sample}_checkm2_${FILE_ID}.tsv + +# Check developer documentation +if [ -z "!{EXTRACTED_DB}" ] +then + DATABASE=!{params.polished.databases}/checkm2 + LOCK_FILE=${DATABASE}/lock.txt + + if [ ! 
-z "!{S3_checkm2_ACCESS}" ] + then + export AWS_ACCESS_KEY_ID=!{S3_checkm2_ACCESS} + export AWS_SECRET_ACCESS_KEY=!{S3_checkm2_SECRET} + fi + + # Download the checkm2 database if necessary + mkdir -p ${DATABASE} + flock ${LOCK_FILE} concurrentDownload.sh --output=${DATABASE} \ + --link=!{DOWNLOAD_LINK} \ + --httpsCommand="wget -O checkm2.tar.gz !{DOWNLOAD_LINK} && tar -xzvf checkm2.tar.gz && rm checkm2.tar.gz" \ + --s3FileCommand="s5cmd !{S5CMD_PARAMS} cp --concurrency !{task.cpus} !{DOWNLOAD_LINK} checkm2.tar.gz && tar -xzvf checkm2.tar.gz && rm checkm2.tar.gz" \ + --s3DirectoryCommand="s5cmd !{S5CMD_PARAMS} cp --concurrency !{task.cpus} !{DOWNLOAD_LINK} . " \ + --s5cmdAdditionalParams="!{S5CMD_PARAMS}" \ + --localCommand="tar -xzvf !{DOWNLOAD_LINK}" \ + --expectedMD5SUM=!{MD5SUM} + + export CHECKM2DB=$(find $DATABASE -name "*.dmnd") +else + export CHECKM2DB=$(find !{EXTRACTED_DB} -name "*.dmnd") +fi + +checkm2 predict !{params.steps.magAttributes.checkm2.additionalParams} --threads !{task.cpus} --input !{bins} -o out +mv out/quality_report.tsv ${OUTPUT} + +# Prepare output + +# Rename column headers to the upper-case column names +sed -i -e " 1,1 s/Name/BIN_ID/" \ + -e " 1,1 s/Completeness/COMPLETENESS/" \ + -e " 1,1 s/Contamination/CONTAMINATION/" ${OUTPUT} + +# Append the missing file ending to the bin id (first column) +sed -i " 2,$ s/\t/!{ending}\t/" ${OUTPUT} + +# Add sample id +sed -i -e "1s/^/SAMPLE\t/" -e "2,$ s/^/!{sample}\t/" ${OUTPUT} + +# Add Heterogeneity column to make it compatible with older checkm result files +sed -i -e " 1,1 s/$/\tHETEROGENEITY/" \ + -e " 2,$ s/$/\t0/g" ${OUTPUT}