From fdf4fc6c3de50c676a0a81d8bb400950dbd530a9 Mon Sep 17 00:00:00 2001 From: Peter Belmann Date: Tue, 10 Dec 2024 12:17:25 +0000 Subject: [PATCH] feat(export): initial emgb export functionality --- .github/workflows/workflow_modules.yml | 8 + example_params/export.yml | 47 ++++ example_params/fullPipeline.yml | 14 ++ main.nf | 309 ++++++++++++++++++++++--- modules/annotation/module.nf | 7 +- modules/export/emgb.nf | 165 +++++++++++++ modules/magAttributes/module.nf | 32 ++- nextflow.config | 11 + scripts/test_fullPipelineExport.sh | 15 ++ templates/emgbAnnotatedBins.sh | 13 ++ templates/emgbAnnotatedContigs.sh | 5 + templates/emgbAnnotatedGenes.sh | 91 ++++++++ 12 files changed, 669 insertions(+), 48 deletions(-) create mode 100644 example_params/export.yml create mode 100644 modules/export/emgb.nf create mode 100644 scripts/test_fullPipelineExport.sh create mode 100644 templates/emgbAnnotatedBins.sh create mode 100644 templates/emgbAnnotatedContigs.sh create mode 100644 templates/emgbAnnotatedGenes.sh diff --git a/.github/workflows/workflow_modules.yml b/.github/workflows/workflow_modules.yml index 755622d1..bd74363a 100644 --- a/.github/workflows/workflow_modules.yml +++ b/.github/workflows/workflow_modules.yml @@ -96,6 +96,14 @@ jobs: " --output=output --smetana_image=pbelmann/metabolomics:0.1.0 \ --steps.cooccurrence.beforeProcessScript=/vol/spool/dockerPull.sh " ./example_params/fullPipelineAggregate.yml \ "${WORK_DIR}" ${PROFILE} || exit 1 + + - name: Test export workflow based on previous test + run: | + bash ./scripts/test_fullPipelineExport.sh \ + " --output=output --input=output --smetana_image=pbelmann/metabolomics:0.1.0 \ + "" " ./example_params/export.yml \ + "${WORK_DIR}" ${PROFILE} || exit 1 + - name: Test if SRA S3 Input module works with ONT data run: | bash ./scripts/test_fullPipeline.sh " -c test_data/assets/aws.config " \ diff --git a/example_params/export.yml b/example_params/export.yml new file mode 100644 index 00000000..531dc2b6 --- /dev/null +++ b/example_params/export.yml @@ -0,0 +1,47 @@ +tempdir: "tmp" +summary: false +s3SignIn: false +input: "output" +output: "output" +logDir: log +runid: 1 +databases: "/mnt/databases" +logLevel: 1 +scratch: "/vol/scratch" +publishDirMode: "symlink" +steps: + export: + emgb: + titles: + database: + download: + source: "https://openstack.cebitec.uni-bielefeld.de:8080/databases/uniref90.titles.tsv.gz" + md5sum: aaf1dd9021243def8e6c4e438b4b3669 + kegg: + database: + download: + source: s3://databases_internal/annotatedgenes2json_db_kegg-mirror-2022-12.tar.zst + md5sum: 262dab8ca564fbc1f27500c22b5bc47b + s5cmd: + params: '--retry-count 30 --no-verify-ssl --endpoint-url https://openstack.cebitec.uni-bielefeld.de:8080' +resources: + highmemLarge: + cpus: 28 + memory: 230 + highmemMedium: + cpus: 14 + memory: 113 + large: + cpus: 28 + memory: 58 + medium: + cpus: 14 + memory: 29 + small: + cpus: 7 + memory: 14 + tiny: + cpus: 1 + memory: 1 + + diff --git a/example_params/fullPipeline.yml b/example_params/fullPipeline.yml index 9fab59af..04cb152d 100644 --- a/example_params/fullPipeline.yml +++ b/example_params/fullPipeline.yml @@ -246,6 +246,20 @@ steps: additionalParams: run: "" report: "" + export: + emgb: + titles: + database: + download: + source: "https://openstack.cebitec.uni-bielefeld.de:8080/databases/uniref90.titles.tsv.gz" + md5sum: aaf1dd9021243def8e6c4e438b4b3669 + kegg: + database: + download: + source: s3://databases_internal/annotatedgenes2json_db_kegg-mirror-2022-12.tar.zst + md5sum: 262dab8ca564fbc1f27500c22b5bc47b + 
s5cmd: + params: '--retry-count 30 --no-verify-ssl --endpoint-url https://openstack.cebitec.uni-bielefeld.de:8080' resources: highmemLarge: cpus: 28 diff --git a/main.nf b/main.nf index bf9724da..816a6ede 100644 --- a/main.nf +++ b/main.nf @@ -9,6 +9,7 @@ include { wShortReadAssemblyFile; wShortReadAssemblyList; wTestMemSelection } fr include { wOntAssemblyFile; wOntAssemblyList } from './modules/assembly/ontAssembler' include { wShortReadBinningList } from './modules/binning/shortReadBinning' include { wLongReadBinningList } from './modules/binning/ontBinning' +include { wEMGBList } from './modules/export/emgb' include { wMagAttributesFile; wMagAttributesList; wCMSeqWorkflowFile; } from './modules/magAttributes/module.nf' include { wDereplicateFile; wDereplicateList} from './modules/dereplication/bottomUpClustering/module' include { wAnalyseMetabolitesList; wAnalyseMetabolitesFile } from './modules/metabolomics/module' @@ -177,7 +178,7 @@ def getPath(f){ return params.input.startsWith("s3://")? "s3:/" + f: f } -workflow _wAggregateONT { +workflow _wFindSamplesONT { take: sraFiles main: @@ -195,7 +196,7 @@ workflow _wAggregateONT { | map{ sra,f -> [sra, getPath(f)] } | splitCsv(sep: '\t', header: true) \ | map { sample -> [sample[SAMPLE_NAME], sample[SAMPLE_STATS].median_qual]} |set { ontMedianQuality } - Pattern binsONTPattern = Pattern.compile('.*/binningONT/' + params.modules.binning.version.major + '..*/.*/.*_bin.*.fa$') + Pattern binsONTPattern = Pattern.compile('.*/binningONT/' + params.modules.binningONT.version.major + '..*/.*/.*_bin.*.fa$') sraFiles | filter({ sra, path -> binsONTPattern.matcher(path.toString()).matches()}) \ | map{ sra,f -> [SAMPLE:sra, PATH: getPath(f), BIN_ID:file(f).name] } \ | set{ ontBins } @@ -212,7 +213,26 @@ workflow _wAggregateONT { ontBinStats = ontBinStats } -workflow _wAggregateIllumina { +workflow _wGetAssemblyFiles { + take: + assemblyFiles + main: + // get Illumina assembly files + Pattern assemblyIlluminaPattern = Pattern.compile('.*/assembly/' + params.modules.assembly.version.major + '..*/.*/.*_contigs.fa.gz$') + assemblyFiles | filter({ sra, path -> assemblyIlluminaPattern.matcher(path.toString()).matches()}) \ + | map{ sra,f -> [sra, getPath(f)] } | set { illuminaAssembly } + + // get ONT assembly files + Pattern assemblyONTPattern = Pattern.compile('.*/assemblyONT/' + params.modules.assemblyONT.version.major + '..*/.*/.*_contigs.fa.gz$') + assemblyFiles | filter({ sra, path -> assemblyONTPattern.matcher(path.toString()).matches()}) \ + | map{ sra,f -> [sra, getPath(f)] } | set { ontAssembly } + emit: + illuminaAssembly + ontAssembly +} + + +workflow _wFindSamplesIllumina { take: binningFiles qcFiles @@ -288,17 +308,10 @@ workflow _wFragmentRecruitment { recruitedGenomesStats = recruitedGenomesStats } -/* -* This workflow entry point allows to aggregate information of different samples. -* It will perform analysis steps such as dereplication, read mapping and co-occurrence. -* The input files are automatically fetched as long as they adhere to the pipeline specification document (see documentation). 
-*/ -workflow wAggregatePipeline { - def input = params.input +workflow _wGetSamples() { + main: def runID = params.runid - - // Save config File - wSaveSettingsList(Channel.value("AGGREGATED")) + def input = params.input // List all available SRAIDs Channel.from(file(input).list()) | filter({ path -> !(path ==~ /.*summary$/) && !(path ==~ /null$/) }) \ @@ -307,32 +320,32 @@ workflow wAggregatePipeline { sraDatasets | map { sra -> [sra, input + "/" + sra + "/" + runID + "/" ]} \ | set {sraIDs} + emit: + sraIDs + sraDatasets +} - // List all files in sample directories - sraIDs | flatMap { sraID, path -> collectModuleFiles(path, sraID, [params.modules.qc])} | set { qcFiles } - sraIDs | flatMap { sraID, path -> collectModuleFiles(path, sraID, [params.modules.binning]) } | set { binningFiles } - - _wAggregateIllumina(binningFiles, qcFiles) - - sraIDs | flatMap { sraID, path -> collectModuleFiles(path, sraID, [params.modules.binningONT])} | set { binningONTFiles } - sraIDs | flatMap { sraID, path -> collectModuleFiles(path, sraID, [params.modules.qcONT])} | mix(binningONTFiles) | _wAggregateONT - - sraIDs | flatMap { sraID, path -> collectModuleFiles(path, sraID, [params.modules.fragmentRecruitment])} | _wFragmentRecruitment - - sraIDs | flatMap { sraID, path -> collectModuleFiles(path, sraID, [params.modules.magAttributes])} | set { selectedSRAMagAttributes} +workflow _wGetCheckm { + take: + selectedSRAMagAttributes + main: // get Checkm results Pattern checkmPattern = Pattern.compile('.*/magAttributes/' + params.modules.magAttributes.version.major + '..*/.*/.*_checkm_.*.tsv$') - selectedSRAMagAttributes | filter({ sra, path -> checkmPattern.matcher(path.toString()).matches()}) \ - | splitCsv(header: ["SAMPLE", "BIN_ID", "Marker lineage", "# genomes", "# markers", \ + selectedSRAMagAttributes | filter({ sra, path -> checkmPattern.matcher(path.toString()).matches()}) \ + | set { checkmFiles } + + checkmFiles | splitCsv(header: ["SAMPLE", "BIN_ID", "Marker lineage", "# genomes", "# markers", \ "# marker sets", "0", "1", "2", "3", "4", "5+", "COMPLETENESS", "CONTAMINATION", "HETEROGENEITY"], sep: '\t') \ | map { sra, bins -> bins} \ | set { checkm } // get Checkm2 results Pattern checkm2Pattern = Pattern.compile('.*/magAttributes/' + params.modules.magAttributes.version.major + '..*/.*/.*_checkm2_.*.tsv$') - selectedSRAMagAttributes | filter({ sra, path -> checkm2Pattern.matcher(path.toString()).matches()}) \ - | splitCsv(header: true, sep: '\t') \ + selectedSRAMagAttributes | filter({ sra, path -> checkm2Pattern.matcher(path.toString()).matches()}) \ + | set { checkm2Files } + + checkm2Files | splitCsv(header: true, sep: '\t') \ | map { sra, bins -> bins} \ | set { checkm2 } @@ -341,6 +354,52 @@ workflow wAggregatePipeline { | mix(checkm2) \ | set {checkm} + checkmFiles \ + | mix(checkm2Files) \ + | set {checkmFiles} + + emit: + checkm + checkmFiles +} + +/* +* This workflow entry point allows to aggregate information of different samples. +* It will perform analysis steps such as dereplication, read mapping and co-occurrence. +* The input files are automatically fetched as long as they adhere to the pipeline specification document (see documentation). 
+*/
+workflow wAggregatePipeline {
+    def input = params.input
+    def runID = params.runid
+
+    // Save config File
+    wSaveSettingsList(Channel.value("AGGREGATED"))
+
+    _wGetSamples()
+    _wGetSamples.out.sraDatasets | set { sraDatasets }
+    _wGetSamples.out.sraIDs | set { sraIDs }
+
+    // List all files in sample directories
+    sraIDs | flatMap { sraID, path -> collectModuleFiles(path, sraID, [params.modules.qc]) } | set { qcFiles }
+    sraIDs | flatMap { sraID, path -> collectModuleFiles(path, sraID, [params.modules.binning]) } | set { binningFiles }
+
+    _wFindSamplesIllumina(binningFiles, qcFiles)
+
+    sraIDs | flatMap { sraID, path -> collectModuleFiles(path, sraID, [params.modules.binningONT])} | set { binningONTFiles }
+    sraIDs | flatMap { sraID, path -> collectModuleFiles(path, sraID, [params.modules.qcONT])} \
+       | mix(binningONTFiles) | set { binningONT }
+
+    _wFindSamplesONT(binningONT)
+
+    sraIDs | flatMap { sraID, path -> collectModuleFiles(path, sraID, [params.modules.fragmentRecruitment])} | _wFragmentRecruitment
+
+    sraIDs | flatMap { sraID, path -> collectModuleFiles(path, sraID, [params.modules.magAttributes])} | set { selectedSRAMagAttributes}
+
+    sraIDs | flatMap { sraID, path -> collectModuleFiles(path, sraID, [params.modules.annotation])} | set { selectedAnnotation}
+
+    selectedSRAMagAttributes | _wGetCheckm
+    _wGetCheckm.out.checkm | set { checkm }
+
     // get gtdbtk summary files
     Pattern gtdbPattern = Pattern.compile('.*/magAttributes/' + params.modules.magAttributes.version.major + '..*/.*/.*_gtdbtk_combined.tsv$' )
     selectedSRAMagAttributes | filter({ sra, path -> gtdbPattern.matcher(path.toString()).matches()}) \
@@ -360,18 +419,161 @@ workflow wAggregatePipeline {
 
     recruitedGenomesStats = _wFragmentRecruitment.out.recruitedGenomesStats
 
-    mapJoin(_wAggregateIllumina.out.illuminaBinStats | mix(_wAggregateONT.out.ontBinStats) \
+    mapJoin(_wFindSamplesIllumina.out.illuminaBinStats | mix(_wFindSamplesONT.out.ontBinStats) \
 	| mix(recruitedGenomesStats), checkm, "BIN_ID", "BIN_ID") \
 	| set {checkmBinStats}
 
-    mapJoin(checkmBinStats, _wAggregateIllumina.out.illuminaBins | mix(_wAggregateONT.out.ontBins) \
+    mapJoin(checkmBinStats, _wFindSamplesIllumina.out.illuminaBins | mix(_wFindSamplesONT.out.ontBins) \
 	| mix(recruitedGenomes), "BIN_ID", "BIN_ID") \
 	| set {binsStatsComplete}
 
-    _wAggregateIllumina.out.illuminaSamples | mix(_wAggregateONT.out.ontSamples) | view { sra, path -> "Files detected of SRA ID $sra" }
+    _wFindSamplesIllumina.out.illuminaSamples | mix(_wFindSamplesONT.out.ontSamples) | view { sra, path -> "Files detected for SRA ID $sra" }
+
+    _wAggregate(_wFindSamplesONT.out.ontSamples, _wFindSamplesONT.out.ontMedianQuality, _wFindSamplesIllumina.out.illuminaSamples, \
+	_wFindSamplesIllumina.out.unpairedIlluminaSamples, binsStatsComplete, gtdb, models)
+}
+
+
+/*
+* This workflow entry point exports the results of previously processed samples to EMGB-compatible JSON files.
+* It collects per-sample assemblies, read mappings, bins, gene annotations, taxonomic assignments and quality estimates.
+* The input files are automatically fetched as long as they adhere to the pipeline specification document (see documentation).
+*/ +workflow wExportPipeline { + def input = params.input + def runID = params.runid + + // List all available SRAIDs + Channel.from(file(input).list()) | filter({ path -> !(path ==~ /.*summary$/) && !(path ==~ /null$/) }) \ + | filter({ path -> !(path ==~ /.*AGGREGATED$/)}) \ + | set { sraDatasets } + + // Save config File + wSaveSettingsList(sraDatasets) - _wAggregate(_wAggregateONT.out.ontSamples, _wAggregateONT.out.ontMedianQuality, _wAggregateIllumina.out.illuminaSamples, \ - _wAggregateIllumina.out.unpairedIlluminaSamples, binsStatsComplete, gtdb, models) + sraDatasets | map { sra -> [sra, input + "/" + sra + "/" + runID + "/" ]} \ + | set {sraIDs} + + // List all files in sample directories + sraIDs | flatMap { sraID, path -> collectModuleFiles(path, sraID, [params.modules.qc])} | set { qcFiles } + sraIDs | flatMap { sraID, path -> collectModuleFiles(path, sraID, [params.modules.binning]) } \ + | set { binningFiles } + + sraIDs | flatMap { sraID, path -> collectModuleFiles(path, sraID, [params.modules.magAttributes])} \ + | set { selectedSRAMagAttributes} + + // Checkm files + selectedSRAMagAttributes | _wGetCheckm + _wGetCheckm.out.checkmFiles | set { checkmFiles } + + sraIDs | flatMap { sraID, path -> collectModuleFiles(path, sraID, [params.modules.assemblyONT]) } | set { assemblyONTFiles } + sraIDs | flatMap { sraID, path -> collectModuleFiles(path, sraID, [params.modules.assembly]) } | set { assemblyIlluminaFiles } + sraIDs | flatMap { sraID, path -> collectModuleFiles(path, sraID, [params.modules.annotation])} | set { selectedAnnotation} + + assemblyONTFiles | mix(assemblyIlluminaFiles) | _wGetAssemblyFiles + _wGetAssemblyFiles.out.illuminaAssembly | mix(_wGetAssemblyFiles.out.ontAssembly) | set { assembly } + + // get Bins + Pattern binsIlluminaPattern = Pattern.compile('.*/binning/' + params.modules.binning.version.major + '..*/.*/.*_bin.*.fa$') + binningFiles | filter({ sra, path -> binsIlluminaPattern.matcher(path.toString()).matches()}) \ + | set{ illuminaBins } + + Pattern binsONTPattern = Pattern.compile('.*/binningONT/' + params.modules.binningONT.version.major + '..*/.*/.*_bin.*.fa$') + binningFiles | filter({ sra, path -> binsONTPattern.matcher(path.toString()).matches()}) \ + | mix(illuminaBins) | groupTuple(by: 0) | set{ binFiles } + + // get not Binned gff files + Pattern annotationNotBinnedGffPattern = Pattern.compile('.*/annotation/' + params.modules.annotation.version.major + '..*/prokka/.*_notBinned.gff.gz$' ) + selectedAnnotation | filter({ sra, path -> annotationNotBinnedGffPattern.matcher(path.toString()).matches()}) \ + | set { notBinnedGff } + + // get Bin gff files + Pattern annotationGffPattern = Pattern.compile('.*/annotation/' + params.modules.annotation.version.major + '..*/prokka/.*_bin..*.gff.gz$' ) + selectedAnnotation | filter({ sra, path -> annotationGffPattern.matcher(path.toString()).matches()}) \ + | mix(notBinnedGff) | map { sra, path -> [sra, file(path).baseName, path] } | set { gff } + + // get not binned ffn files + Pattern annotationNotBinnedFfnPattern = Pattern.compile('.*/annotation/' + params.modules.annotation.version.major + '..*/prokka/.*_notBinned.ffn.gz$' ) + selectedAnnotation | filter({ sra, path -> annotationNotBinnedFfnPattern.matcher(path.toString()).matches()}) \ + | set { notBinnedFfn } + + // get ffn files + Pattern annotationFfnPattern = Pattern.compile('.*/annotation/' + params.modules.annotation.version.major + '..*/prokka/.*_bin..*.ffn.gz$' ) + selectedAnnotation | filter({ sra, path -> 
annotationFfnPattern.matcher(path.toString()).matches()}) \ + | mix(notBinnedFfn) | map { sra, path -> [sra, file(path).baseName, path] } | set { ffn } + + // not Binned faa files + Pattern annotationNotBinnedPattern = Pattern.compile('.*/annotation/' + params.modules.annotation.version.major + '..*/prokka/.*_notBinned.faa.gz$' ) + selectedAnnotation | filter({ sra, path -> annotationNotBinnedPattern.matcher(path.toString()).matches()}) \ + | set { notBinnedFaa } + + // get Bin faa files + Pattern annotationFnaPattern = Pattern.compile('.*/annotation/' + params.modules.annotation.version.major + '..*/prokka/.*_bin..*.faa.gz$' ) + selectedAnnotation | filter({ sra, path -> annotationFnaPattern.matcher(path.toString()).matches()}) \ + | mix(notBinnedFaa) | map { sra, path -> [sra, file(path).baseName, path] } | set { faa } + + // get MMseqs unbinned taxonomy files + Pattern annotationMmseqsUnbinnedTaxonomyPattern = Pattern.compile('.*/annotation/' + params.modules.annotation.version.major + '..*/mmseqs2_taxonomy/.*/.*_unbinned.ncbi_nr.taxonomy.tsv$' ) + selectedAnnotation | filter({ sra, path -> annotationMmseqsUnbinnedTaxonomyPattern.matcher(path.toString()).matches()}) \ + | map { sra, path -> ["ncbi_nr", sra, path] } | set { mmseqsUnbinnedTaxonomy } + + // get MMseqs binned taxonomy files output/test2/1/annotation/1.0.0/mmseqs2/ncbi_nr/ test2_unbinned.ncbi_nr.105001.112000.blast.tsv + Pattern annotationMmseqsBinnedTaxonomyPattern = Pattern.compile('.*/annotation/' + params.modules.annotation.version.major + '..*/mmseqs2_taxonomy/.*/.*_binned.ncbi_nr.taxonomy.tsv$' ) + selectedAnnotation | filter({ sra, path -> annotationMmseqsBinnedTaxonomyPattern.matcher(path.toString()).matches()}) \ + | map { sra, path -> ["ncbi_nr", sra, path] } | mix(mmseqsUnbinnedTaxonomy) | set { mmseqsTaxonomy } + + // get MMseqs unbinned blast files + Pattern annotationMmseqsUnirefUnbinnedPattern = Pattern.compile('.*/annotation/' + params.modules.annotation.version.major + '..*/mmseqs2/.*/.*_unbinned.uniref90.blast.tsv$' ) + selectedAnnotation | filter({ sra, path -> annotationMmseqsUnirefUnbinnedPattern.matcher(path.toString()).matches()}) \ + | map { sra, path -> ["uniref90", sra, "unbinned", 1, 1, 1, path] } | set { mmseqsUnirefUnbinnedBlast } + + // get MMseqs binned blast files output/test2/1/annotation/1.0.0/mmseqs2/ncbi_nr/ test2_unbinned.ncbi_nr.105001.112000.blast.tsv + Pattern annotationMmseqsUnirefBinnedPattern = Pattern.compile('.*/annotation/' + params.modules.annotation.version.major + '..*/mmseqs2/.*/.*_binned.uniref90.blast.tsv$' ) + selectedAnnotation | filter({ sra, path -> annotationMmseqsUnirefBinnedPattern.matcher(path.toString()).matches()}) \ + | map { sra, path -> ["uniref90", sra, "binned", 1, 1, 1, path] } \ + | mix(mmseqsUnirefUnbinnedBlast) | set { mmseqsUnirefBlast } + + // get MMseqs unbinned kegg blast files + Pattern annotationMmseqsKeggUnbinnedPattern = Pattern.compile('.*/annotation/' + params.modules.annotation.version.major + '..*/mmseqs2/.*/.*_unbinned.kegg.blast.tsv$' ) + selectedAnnotation | filter({ sra, path -> annotationMmseqsKeggUnbinnedPattern.matcher(path.toString()).matches()}) \ + | map { sra, path -> ["kegg", sra, "unbinned", 1, 1, 1, path] } \ + | set { mmseqsKeggUnbinnedBlast } + + // get MMseqs binned blast files output/test2/1/annotation/1.0.0/mmseqs2/ncbi_nr/ test2_unbinned.ncbi_nr.105001.112000.blast.tsv + Pattern annotationMmseqsKeggBinnedPattern = Pattern.compile('.*/annotation/' + params.modules.annotation.version.major + 
'..*/mmseqs2/.*/.*_binned.kegg.blast.tsv$' ) + selectedAnnotation | filter({ sra, path -> annotationMmseqsKeggBinnedPattern.matcher(path.toString()).matches()}) \ + | map { sra, path -> ["kegg", sra, "binned", 1, 1, 1, path] } \ + | mix(mmseqsKeggUnbinnedBlast) \ + | set { mmseqsKeggBlast } + + // get gtdbtk summary files + Pattern gtdbSummaryPattern = Pattern.compile('.*/magAttributes/' + params.modules.magAttributes.version.major + '..*/.*/.*.summary.tsv$' ) + selectedSRAMagAttributes | filter({ sra, path -> gtdbSummaryPattern.matcher(path.toString()).matches()}) \ + | groupTuple(by: 0) | set { gtdbSummaryFiles } + + // get Mapping + Pattern mappingIlluminaPattern = Pattern.compile('.*/binning/' + params.modules.binning.version.major + '..*/.*/.*.bam$') + binningFiles | filter({ sra, path -> mappingIlluminaPattern.matcher(path.toString()).matches()}) \ + | set{ illuminaMapping } + + // get ONT mapping files + Pattern mappingONTPattern = Pattern.compile('.*/binningONT/' + params.modules.binningONT.version.major + '..*/.*/.*.bam$') + binningFiles | filter({ sra, path -> mappingONTPattern.matcher(path.toString()).matches()}) \ + | set{ ontMapping } + + illuminaMapping | mix(ontMapping) | set { mapping } + + _wExportEMGB(assembly, \ + mapping, \ + binFiles, \ + gtdbSummaryFiles, \ + checkmFiles, \ + gff, \ + ffn, \ + faa, \ + mmseqsTaxonomy, \ + mmseqsUnirefBlast | mix(mmseqsKeggBlast) \ + ) } @@ -448,7 +650,6 @@ workflow wSaveSettings { } - def flattenBins(binning){ def chunkList = []; def SAMPLE_IDX = 0; @@ -470,6 +671,7 @@ workflow _wProcessIllumina { wShortReadAssemblyList(qcReads, wShortReadQualityControlList.out.nonpareil, wShortReadQualityControlList.out.kmerFrequencies) wShortReadBinningList(wShortReadAssemblyList.out.contigs, qcReads) emit: + contigs = wShortReadAssemblyList.out.contigs notBinnedContigs = wShortReadBinningList.out.notBinnedContigs bins = wShortReadBinningList.out.bins binsStats = wShortReadBinningList.out.binsStats @@ -493,6 +695,7 @@ workflow _wProcessOnt { wLongReadBinningList(wOntAssemblyList.out.contigs, ontQCReads, wOntAssemblyList.out.graph, \ wOntAssemblyList.out.headerMapping, wOntAssemblyList.out.info, medianQuality) emit: + contigs = wOntAssemblyList.out.contigs notBinnedContigs = wLongReadBinningList.out.notBinnedContigs bins = wLongReadBinningList.out.bins binsStats = wLongReadBinningList.out.binsStats @@ -504,6 +707,22 @@ workflow _wProcessOnt { medianQuality = medianQuality } +workflow _wExportEMGB { + take: + contigs + mapping + bins + gtdbtk + checkm + gff + ffn + faa + mmseqsTaxonomy + mmseqsBlast + main: + wEMGBList(contigs, mapping, bins, gtdbtk, checkm, gff, ffn, faa, mmseqsTaxonomy, mmseqsBlast) +} + /* * * Main workflow entrypoint. 
Takes list of files containing reads as input and produces assembly, binning, dereplication and metabolomics @@ -536,6 +755,8 @@ workflow wFullPipeline { | map{ bin -> [bin.SAMPLE, bin.BIN_ID, bin.PATH]} \ | set { bins } + ont.bins | mix(illumina.bins) | set {binsFiles} + illumina.fastg | set { fastg } ont.gfa | set { gfa } @@ -581,6 +802,22 @@ workflow wFullPipeline { wAnalyseMetabolitesList(binsStats, mapJoin(wMagAttributesList.out.checkm, proteins, "BIN_ID", "BIN_ID")) - _wAggregate(ont.reads, ont.medianQuality, illumina.readsPair, illumina.readsSingle, binsStats, \ + wMagAttributesList.out.gtdbArcheaFiles \ + | mix(wMagAttributesList.out.gtdbBacteriaFiles) \ + | groupTuple(by: 0) | set { gtdbSummaryFiles } + + _wExportEMGB(illumina.contigs | mix(ont.contigs),\ + mapping, \ + binsFiles, \ + gtdbSummaryFiles, \ + wMagAttributesList.out.checkmFiles, \ + wAnnotateBinsList.out.gff, \ + wAnnotateBinsList.out.ffn, \ + wAnnotateBinsList.out.faa, \ + wAnnotateBinsList.out.mmseqs2_taxonomy, \ + wAnnotateBinsList.out.mmseqs2_blast \ + ) + + _wAggregate(ont.reads, ont.medianQuality, illumina.readsPair, illumina.readsSingle, binsStats, \ wMagAttributesList.out.gtdb, wAnalyseMetabolitesList.out.models) } diff --git a/modules/annotation/module.nf b/modules/annotation/module.nf index cfe20f82..9438d389 100644 --- a/modules/annotation/module.nf +++ b/modules/annotation/module.nf @@ -53,6 +53,7 @@ process pMMseqs2 { container "${params.mmseqs2_image}" + // Databases will be downloaded to a fixed place so that they can be used by future processes. // This fixed place has to be outside of the working-directory to be easy to find for every process. // Therefore this place has to be mounted to the docker container to be accessible during run time. @@ -610,6 +611,11 @@ workflow wAnnotateList { emit: keggAnnotation = _wAnnotation.out.keggAnnotation proteins = _wAnnotation.out.prokka_faa + ffn = _wAnnotation.out.prokka_ffn + gff = _wAnnotation.out.prokka_gff + faa = _wAnnotation.out.prokka_faa + mmseqs2_taxonomy = _wAnnotation.out.mmseqs2_taxonomy + mmseqs2_blast = _wAnnotation.out.mmseqs2_blast } /** @@ -702,7 +708,6 @@ workflow _wSplit { | pCount | combine(chunkSize) | flatMap { sample -> \ Utils.splitFilesIndex(Integer.parseInt(sample[COUNT_IDX]), sample[CHUNK_SIZE_IDX], [sample[SAMPLE_IDX], sample[FILE_2_IDX], sample[FILE_3_IDX]]) } \ | set { chunks } - emit: chunks } diff --git a/modules/export/emgb.nf b/modules/export/emgb.nf new file mode 100644 index 00000000..16dc8716 --- /dev/null +++ b/modules/export/emgb.nf @@ -0,0 +1,165 @@ +include { pDumpLogs } from '../utils/processes' + + + +def getOutput(SAMPLE, RUNID, TOOL, filename){ + return SAMPLE + '/' + RUNID + '/' + params.modules.export.name + '/' + + params.modules.export.version.major + "." + + params.modules.export.version.minor + "." 
+ + params.modules.export.version.patch + + '/' + TOOL + '/' + filename +} + +process pEMGBAnnotatedContigs { + + container "${params.emgbAnnotatedContigs_image}" + + tag "Sample: $sample" + + publishDir params.output, mode: "${params.publishDirMode}", saveAs: { filename -> getOutput("${sample}", params.runid, "emgb", filename) } + + when params.steps.containsKey("export") && params.steps.export.containsKey("emgb") + + containerOptions " --entrypoint='' " + + label 'tiny' + + input: + tuple val(sample), path(contigs), path(mapping), path("bins/*") + + output: + + tuple val("${sample}"), path("${sample}.contigs.json.gz"), emit: json + tuple env(FILE_ID), val("${output}"), val(params.LOG_LEVELS.INFO), file(".command.sh"), \ + file(".command.out"), file(".command.err"), file(".command.log"), emit: logs + + shell: + template 'emgbAnnotatedContigs.sh' +} + + +process pEMGBAnnotatedBins { + + container "${params.emgbAnnotatedBins_image}" + + tag "Sample: $sample" + + publishDir params.output, mode: "${params.publishDirMode}", saveAs: { filename -> getOutput("${sample}", params.runid, "emgb", filename) } + + when params.steps.containsKey("export") && params.steps.export.containsKey("emgb") + + containerOptions " --entrypoint='' " + + label 'tiny' + + input: + tuple val(sample), path(checkm), path(gtdbtk), path("bins/*") + + output: + tuple val("${sample}"), path("${sample}.bins.json.gz"), emit: json + tuple env(FILE_ID), val("${output}"), val(params.LOG_LEVELS.INFO), file(".command.sh"), \ + file(".command.out"), file(".command.err"), file(".command.log"), emit: logs + + shell: + template 'emgbAnnotatedBins.sh' +} + + +process pEMGBAnnotatedGenes { + + container "${params.emgbAnnotatedGenes_image}" + + tag "Sample: $sample" + + publishDir params.output, mode: "${params.publishDirMode}", saveAs: { filename -> getOutput("${sample}", params.runid, "emgb", filename) } + + secret { "${S3_EMGB_TITLES_ACCESS}"!="" ? ["S3_EMGB_TITLES_ACCESS", "S3_EMGB_TITLES_SECRET"] : [] } + + secret { "${S3_EMGB_KEGG_ACCESS}"!="" ? 
["S3_EMGB_KEGG_ACCESS", "S3_EMGB_KEGG_SECRET"] : [] } + + containerOptions Utils.getDockerMount(params.steps?.export?.emgb?.titles?.database, params) + Utils.getDockerMount(params.steps?.export?.emgb?.kegg?.database, params) + Utils.getDockerNetwork() + " --entrypoint='' " + + when params.steps.containsKey("export") && params.steps.export.containsKey("emgb") + + memory { Utils.getMemoryResources(params.resources.medium, "${sample}", task.attempt, params.resources) } + + cpus { Utils.getCPUsResources(params.resources.medium, "${sample}", task.attempt, params.resources) } + + beforeScript Utils.getCreateDatabaseDirCommand("${params.polished.databases}") + + input: + tuple val(sample), path("gff/*"), path("ffn/*"), path("faa/*"), path("taxonomy/*"), path("blastResult/*"), path("blastKeggResult/*"), path("bins/*") + + output: + tuple val("${sample}"), path("${sample}.genes.json.gz"), emit: json + tuple env(FILE_ID), val("${output}"), val(params.LOG_LEVELS.INFO), file(".command.sh"), \ + file(".command.out"), file(".command.err"), file(".command.log"), emit: logs + + shell: + TITLES_S5CMD_PARAMS=params?.steps?.export?.emgb?.titles?.database?.download?.s5cmd?.params ?: "" + TITLES_DOWNLOAD_LINK=params?.steps?.export?.emgb?.titles?.database?.download?.source ?: "" + TITLES_MD5SUM=params.steps?.export?.emgb?.titles?.database?.download?.md5sum ?: "" + TITLES_EXTRACTED_DB=params.steps?.export?.emgb?.titles?.database?.extractedDBPath ?: "" + KEGG_S5CMD_PARAMS=params?.steps?.export?.emgb?.kegg?.database?.download?.s5cmd?.params ?: "" + KEGG_DOWNLOAD_LINK=params?.steps?.export?.emgb?.kegg?.database?.download?.source ?: "" + KEGG_MD5SUM=params.steps?.export?.emgb?.kegg?.database?.download?.md5sum ?: "" + KEGG_EXTRACTED_DB=params.steps?.export?.emgb?.kegg?.database?.extractedDBPath ?: "" + + TITLES_S3_EMGB_ACCESS=params?.steps?.export?.emgb?.titles?.database?.download?.s5cmd && TITLES_S5CMD_PARAMS.indexOf("--no-sign-request") == -1 ? "\$S3_EMGB_TITLES_ACCESS" : "" + TITLES_S3_EMGB_SECRET=params?.steps?.export?.emgb?.titles?.database?.download?.s5cmd && TITLES_S5CMD_PARAMS.indexOf("--no-sign-request") == -1 ? "\$S3_EMGB_TITLES_SECRET" : "" + + KEGG_S3_EMGB_ACCESS=params?.steps?.export?.emgb?.kegg?.database?.download?.s5cmd && KEGG_S5CMD_PARAMS.indexOf("--no-sign-request") == -1 ? "\$S3_EMGB_KEGG_ACCESS" : "" + KEGG_S3_EMGB_SECRET=params?.steps?.export?.emgb?.kegg?.database?.download?.s5cmd && KEGG_S5CMD_PARAMS.indexOf("--no-sign-request") == -1 ? 
"\$S3_EMGB_KEGG_SECRET" : "" + template 'emgbAnnotatedGenes.sh' +} + + + +// CONTIGS: [test1, /vol/spool/metagenomics-tk/work_wFullPipeline/cd/7c943c0808e6d36c72a64834e7a88e/test1_contigs.fa.gz] +// [test2, /vol/spool/metagenomics-tk/work_wFullPipeline/00/71fd0e590f4f03fda4cb87903a3b38/test2_contigs.fa.gz] +// mapping: [test2, /vol/spool/metagenomics-tk/work_wFullPipeline/7c/43a3513d49143a76a29c4404417997/test2.bam] +// bins: [test1, [/vol/spool/metagenomics-tk/work_wFullPipeline/95/085ffbe17a6f2ebfb60709d3f33cf3/test1_bin.1.fa, /vol/spool/metagenomics-tk/work_wFullPipeline/95/085ffbe17a6f2ebfb60709d3f33cf3/test1_bin.2.fa]] +// gtdbtk: [test2, [/vol/spool/metagenomics-tk/work_wFullPipeline/d2/5b5937c7a950d87cd796e2bf80cfcd/chunk_00_test2_gtdbtk.bac120.summary.tsv]] +// checkm: [test2, /vol/spool/metagenomics-tk/work_wFullPipeline/ff/1292ed3399aada6fa1dffc18cfbc12/test2_checkm2_EfY2cPIh.tsv] +// gff,ffn,faa: [test2, test2_bin.2.fa, /vol/spool/metagenomics-tk/work_wFullPipeline/78/1224783c7ae2d9e0a8d4264893e47a/test2_bin.2.gff.gz] +// mmseqsTaxonomy: [ncbi_nr, test1, /vol/spool/metagenomics-tk/work_wFullPipeline/93/56bf5c34d7524930d73a6ef45ae23d/test1_binned.ncbi_nr.taxonomy.tsv] +//mmseqsBlast: +// [bacmet20_predicted, test2, binned, 1, 2922, 1, /vol/spool/metagenomics-tk/work_wFullPipeline/3c/e5221be7fd305dc056356201de7d61/test2_binned.1.2922.bacmet20_predicted.blast.tsv] +// [uniref90, test1, binned, 1, 2923, 1, /vol/spool/metagenomics-tk/work_wFullPipeline/a0/8e7a2a7b928731dbcfd62cbf546d4f/test1_binned.1.2923.uniref90.blast.tsv] +// [bacmet20_predicted, test1, binned, 1, 2923, 1, /vol/spool/metagenomics-tk/work_wFullPipeline/2f/dffc66ee85b2a1baf296323fcc894c/test1_binned.1.2923.bacmet20_predicted.blast.tsv] +// [kegg, test2, binned, 1, 2922, 1, /vol/spool/metagenomics-tk/work_wFullPipeline/5a/74adca85c1b4005aa490440f8b05ba/test2_binned.1.2922.kegg.blast.tsv] +// [uniref90, test2, binned, 1, 2922, 1, /vol/spool/metagenomics-tk/work_wFullPipeline/57/5f4e7f5ae75937613d7c813be6c8fd/test2_binned.1.2922.uniref90.blast.tsv] +// [kegg, test1, binned, 1, 2923, 1, /vol/spool/metagenomics-tk/work_wFullPipeline/6b/41a5ce820f9fb5e897b6a9109e5678/test1_binned.1.2923.kegg.blast.tsv] +workflow wEMGBList { + take: + contigs + mapping + bins + gtdbtk + checkm + gff + ffn + faa + mmseqsTaxonomy + mmseqsBlast + main: + SAMPLE_IDX = 0 + contigs | combine(mapping, by: SAMPLE_IDX) | combine(bins, by: SAMPLE_IDX) | pEMGBAnnotatedContigs + checkm | combine(gtdbtk, by: SAMPLE_IDX) | combine(bins, by: SAMPLE_IDX) | pEMGBAnnotatedBins + + mmseqsBlast | filter { db, sample, type, start, end, chunkNumber, blastResult -> db == "uniref90" } \ + | map { db, sample, type, start, end, chunkNumber, blastResult -> [sample, blastResult] } \ + | groupTuple(by: SAMPLE_IDX) | set { selectedDBBlastResults } + + mmseqsBlast | filter { db, sample, type, start, end, chunkNumber, blastResult -> db == "kegg" } \ + | map { db, sample, type, start, end, chunkNumber, blastResult -> [sample, blastResult] } \ + | groupTuple(by: SAMPLE_IDX) | set { selectedKeggBlastResults } + + gff | map { sample, bin, gff -> [sample, gff] } | groupTuple(by: SAMPLE_IDX) \ + | combine(ffn | map { sample, bin, ffn -> [sample, ffn] } | groupTuple(by: SAMPLE_IDX), by: SAMPLE_IDX) \ + | combine(faa | map { sample, bin, faa -> [sample, faa] } | groupTuple(by: SAMPLE_IDX), by: SAMPLE_IDX) \ + | combine(mmseqsTaxonomy | map { db, sample, blastResult -> [sample, blastResult] } | groupTuple(by: SAMPLE_IDX), by: SAMPLE_IDX) \ + | combine(selectedDBBlastResults, by: 
SAMPLE_IDX) \ + | combine(selectedKeggBlastResults, by: SAMPLE_IDX) \ + | combine(bins, by: SAMPLE_IDX) | pEMGBAnnotatedGenes +} diff --git a/modules/magAttributes/module.nf b/modules/magAttributes/module.nf index 12c1a2e3..7eba09d3 100644 --- a/modules/magAttributes/module.nf +++ b/modules/magAttributes/module.nf @@ -59,7 +59,7 @@ process pCheckM { tuple val(sample), val(ending), path(bins), val(chunkId) output: - tuple path("${sample}_checkm_*.tsv", type: "file"), val("${sample}"), emit: checkm + tuple val("${sample}"), path("${sample}_checkm_*.tsv", type: "file"), emit: checkm tuple env(FILE_ID), val("${output}"), val(params.LOG_LEVELS.INFO), file(".command.sh"), \ file(".command.out"), file(".command.err"), file(".command.log"), emit: logs @@ -98,7 +98,7 @@ process pCheckM2 { tuple val(sample), val(ending), path(bins), val(chunkId) output: - tuple path("${sample}_checkm2_*.tsv", type: "file"), val("${sample}"), emit: checkm + tuple val("${sample}"), path("${sample}_checkm2_*.tsv", type: "file"), emit: checkm tuple env(FILE_ID), val("${output}"), val(params.LOG_LEVELS.INFO), file(".command.sh"), \ file(".command.out"), file(".command.err"), file(".command.log"), emit: logs @@ -136,12 +136,12 @@ process pGtdbtk { tuple val(sample), val(ending), path(bins), val(chunkId) output: - tuple path("chunk_*_${sample}_gtdbtk.bac120.summary.tsv"), val("${sample}"), optional: true, emit: bacteria - tuple path("chunk_*_${sample}_gtdbtk.ar122.summary.tsv"), val("${sample}"), optional: true, emit: archea - tuple path("chunk_*_${sample}_gtdbtk_unclassified.tsv"), val("${sample}"), optional: true, emit: unclassified - tuple path("*.tree"), val("${sample}"), optional: true, emit: tree - tuple path("chunk_*_${sample}_gtdbtk_combined.tsv"), val("${sample}"), optional: true, emit: combined - tuple path("chunk_*_${sample}_missing_bins.tsv"), val("${sample}"), optional: true, emit: missing + tuple val("${sample}"), path("chunk_*_${sample}_gtdbtk.bac120.summary.tsv"), optional: true, emit: bacteria + tuple val("${sample}"), path("chunk_*_${sample}_gtdbtk.ar122.summary.tsv"), optional: true, emit: archea + tuple val("${sample}"), path("chunk_*_${sample}_gtdbtk_unclassified.tsv"), optional: true, emit: unclassified + tuple val("${sample}"), path("*.tree"), val("${sample}"), optional: true, emit: tree + tuple val("${sample}"), path("chunk_*_${sample}_gtdbtk_combined.tsv"), optional: true, emit: combined + tuple val("${sample}"), path("chunk_*_${sample}_missing_bins.tsv"), optional: true, emit: missing tuple env(FILE_ID), val("${output}"), val(params.LOG_LEVELS.INFO), file(".command.sh"), \ file(".command.out"), file(".command.err"), file(".command.log"), emit: logs @@ -259,6 +259,9 @@ workflow wMagAttributesList { checkm = _wMagAttributes.out.checkm gtdb = _wMagAttributes.out.gtdb gtdbMissing = _wMagAttributes.out.gtdbMissing + gtdbArcheaFiles = _wMagAttributes.out.gtdbArcheaFiles + gtdbBacteriaFiles = _wMagAttributes.out.gtdbBacteriaFiles + checkmFiles = _wMagAttributes.out.checkmFiles } @@ -311,19 +314,23 @@ workflow _wMagAttributes { // Prepare checkm output file checkmSelected | splitCsv(sep: '\t', header: true) \ - | map { checkmDict, sample -> checkmDict } \ + | map { sample, checkmDict -> checkmDict } \ | set { checkmList } // Prepare gtdb output file gtdb.combined | splitCsv(sep: '\t', header: true) \ - | map { gtdb, sample -> gtdb } \ + | map { sample, gtdb -> gtdb } \ | set { gtdbCombinedList } // Prepare missing gtdb output file gtdb.missing | splitCsv(sep: '\t', header: true) \ - | map { gtdb, sample 
-> gtdb } \ + | map { sample, gtdb -> gtdb } \ | set { gtdbMissingList } + gtdb.bacteria | set { gtdbBacteriaFiles } + + gtdb.archea | set { gtdbArcheaFiles } + if(params.summary){ // collect checkm files for checkm2 results across multiple datasets checkmSelected \ @@ -347,4 +354,7 @@ workflow _wMagAttributes { checkm = checkmList gtdb = gtdbCombinedList gtdbMissing = gtdbMissingList + gtdbArcheaFiles = gtdbArcheaFiles + gtdbBacteriaFiles = gtdbBacteriaFiles + checkmFiles = checkmSelected } diff --git a/nextflow.config b/nextflow.config index 4895c10a..cca591db 100644 --- a/nextflow.config +++ b/nextflow.config @@ -292,6 +292,9 @@ params { skipVersionCheck = false + emgbAnnotatedGenes_image = "quay.io/emgb/annotatedgenes2json:2.5.2z" + emgbAnnotatedContigs_image = "quay.io/emgb/annotatedcontigs2json:2.2.4z" + emgbAnnotatedBins_image = "quay.io/emgb/annotatedbins2json:2.4.0z" pysradb_image = "quay.io/biocontainers/pysradb:1.4.1--pyhdfd78af_0" minimap2_image= "quay.io/biocontainers/minimap2:2.24--h7132678_1" metaflye_image = "quay.io/biocontainers/flye:2.9--py36h7281c5b_1" @@ -498,6 +501,14 @@ params { patch = 0 } } + export { + name = "export" + version { + major = 0 + minor = 1 + patch = 0 + } + } readMapping { name = "readMapping" version { diff --git a/scripts/test_fullPipelineExport.sh b/scripts/test_fullPipelineExport.sh new file mode 100644 index 00000000..b1c59cc6 --- /dev/null +++ b/scripts/test_fullPipelineExport.sh @@ -0,0 +1,15 @@ +set -e + +ENTRY="wExportPipeline" +OPTIONS=$1 +YAML="${2:-example_params/export.yml}" +WORK="${3:-work}_${ENTRY}" +PROFILE="${4:-standard}" +LOG_DIR="${WORK}/logs" +make run_small_full_test WORK_DIR=${WORK} \ + PARAMS_FILE=$YAML \ + PROFILE=" $PROFILE " \ + LOG_DIR=${LOG_DIR} \ + OPTIONS=" $OPTIONS " \ + ENTRY="${ENTRY}" +make check LOG_DIR=${LOG_DIR} diff --git a/templates/emgbAnnotatedBins.sh b/templates/emgbAnnotatedBins.sh new file mode 100644 index 00000000..de37d3be --- /dev/null +++ b/templates/emgbAnnotatedBins.sh @@ -0,0 +1,13 @@ + +BINS_DIR=bins + +binPrefix=$(find $BINS_DIR -name "*_bin.*.fa" -exec readlink -f {} \; \ + | tail -n 1 \ + | rev | cut -f 1 -d '/' \ + | rev | cut -d '.' -f 1 \ + | sed 's/$/./g') + +annotatedbins2json -checkm-tsv !{checkm} \ + -gtdbtk-tsvs !{gtdbtk} \ + -json-gz !{sample}.bins.json.gz \ + -bin-id-prefix ${binPrefix} diff --git a/templates/emgbAnnotatedContigs.sh b/templates/emgbAnnotatedContigs.sh new file mode 100644 index 00000000..6f688fd5 --- /dev/null +++ b/templates/emgbAnnotatedContigs.sh @@ -0,0 +1,5 @@ +annotatedcontigs2json -bins-dir bins \ + -sample-bam-files !{mapping} \ + -fasta !{contigs} \ + -json-gz !{sample}.contigs.json.gz \ + -sample-names !{sample} diff --git a/templates/emgbAnnotatedGenes.sh b/templates/emgbAnnotatedGenes.sh new file mode 100644 index 00000000..d0b5c550 --- /dev/null +++ b/templates/emgbAnnotatedGenes.sh @@ -0,0 +1,91 @@ +# Check developer documentation +# Download titles +if [ -z "!{TITLES_EXTRACTED_DB}" ] +then + TITLES_DATABASE=!{params.databases}/emgb_titles + TITLES_LOCK_FILE=${TITLES_DATABASE}/lock.txt + + if [ ! 
-z "!{TITLES_S3_EMGB_ACCESS}" ] + then + export AWS_ACCESS_KEY_ID=!{TITLES_S3_EMGB_ACCESS} + export AWS_SECRET_ACCESS_KEY=!{TITLES_S3_EMGB_SECRET} + fi + + echo "${TITLES_DATABASE}" + # Download checkm database if necessary + mkdir -p ${TITLES_DATABASE} + flock ${TITLES_LOCK_FILE} concurrentDownload.sh --output=${TITLES_DATABASE} \ + --link="!{TITLES_DOWNLOAD_LINK}" \ + --httpsCommand="wgetStatic --no-check-certificate -qO- !{TITLES_DOWNLOAD_LINK} | gunzip > titles.tsv" \ + --s3FileCommand="s5cmd !{TITLES_S5CMD_PARAMS} cat --concurrency !{task.cpus} !{TITLES_DOWNLOAD_LINK} | gunzip > titles.tsv " \ + --s3DirectoryCommand="s5cmd !{TITLES_S5CMD_PARAMS} cp --concurrency !{task.cpus} !{TITLES_DOWNLOAD_LINK} . && mv * titles.tsv " \ + --s5cmdAdditionalParams="!{TITLES_S5CMD_PARAMS}" \ + --localCommand="gunzip -c !{TITLES_DOWNLOAD_LINK} > ./titles.tsv" \ + --expectedMD5SUM=!{TITLES_MD5SUM} + TITLES_FILE="${TITLES_DATABASE}/out/titles.tsv" +else + TITLES_FILE="!{TITLES_EXTRACTED_DB}" +fi + + + +# Check developer documentation +# Download titles +if [ -z "!{KEGG_EXTRACTED_DB}" ] +then + KEGG_DATABASE=!{params.databases}/emgb_kegg + KEGG_LOCK_FILE=${KEGG_DATABASE}/lock.txt + + if [ ! -z "!{KEGG_S3_EMGB_ACCESS}" ] + then + export AWS_ACCESS_KEY_ID=!{KEGG_S3_EMGB_ACCESS} + export AWS_SECRET_ACCESS_KEY=!{KEGG_S3_EMGB_SECRET} + fi + + # Download checkm database if necessary + mkdir -p ${KEGG_DATABASE} + flock ${KEGG_LOCK_FILE} concurrentDownload.sh --output=${KEGG_DATABASE} \ + --link=!{KEGG_DOWNLOAD_LINK} \ + --httpsCommand="wgetStatic --no-check-certificate -qO- !{KEGG_DOWNLOAD_LINK} | zstd -T!{task.cpus} -d -c | tar --strip-components=1 -xv " \ + --s3FileCommand="s5cmd !{KEGG_S5CMD_PARAMS} cat --concurrency !{task.cpus} !{KEGG_DOWNLOAD_LINK} | zstd -T!{task.cpus} -d -c | tar --strip-components=1 -xv " \ + --s3DirectoryCommand="s5cmd !{KEGG_S5CMD_PARAMS} cp --concurrency !{task.cpus} !{KEGG_DOWNLOAD_LINK} . " \ + --s5cmdAdditionalParams="!{KEGG_S5CMD_PARAMS}" \ + --localCommand="zstd -T!{task.cpus} -c -d !{KEGG_DOWNLOAD_LINK} | tar --strip-components=1 -xv " \ + --expectedMD5SUM=!{KEGG_MD5SUM} + KEGG_DB=$(readlink -f ${KEGG_DATABASE}/out/) +else + KEGG_DB="!{KEGG_EXTRACTED_DB}" +fi + +BINS_DIR=bins + +nr=$(find blastResult/ -name "*.tsv" -exec readlink -f {} \; | sed 's/^/ --blast-tab /g') + +tax=$(find taxonomy/ -name "*.tsv" -exec readlink -f {} \; | sed 's/^/ -mmseqs-lineage /g') + + +ffn=$(find ffn/ -name "*.gz" -exec readlink -f {} \; | sed 's/^/ -ffn /g') + +faa=$(find faa/ -name "*.gz" -exec readlink -f {} \; | sed 's/^/ -faa /g') + +gff=$(find gff/ -name "*.gz" -exec readlink -f {} \; | sed 's/^/ -gff /g') + +kegg=$(find blastKeggResult/ -name "*.tsv" -exec readlink -f {} \; | sed 's/^/ -kegg-blast-tab /g') + +titles=" -title-tsv $TITLES_FILE " + +name=" -dataset-name !{sample} " + +db=" -db ${KEGG_DB} " + +annotatedgenes2json -bins-dir bins \ + ${nr} \ + ${tax} \ + ${ffn} \ + ${faa} \ + ${gff} \ + ${titles} \ + ${name} \ + ${db} \ + ${kegg} \ + -json-gz !{sample}.genes.json.gz