From fdf4fc6c3de50c676a0a81d8bb400950dbd530a9 Mon Sep 17 00:00:00 2001 From: Peter Belmann Date: Tue, 10 Dec 2024 12:17:25 +0000 Subject: [PATCH] feat(export): initial emgb export functionality --- .github/workflows/workflow_modules.yml | 8 + example_params/export.yml | 47 ++++ example_params/fullPipeline.yml | 14 ++ main.nf | 309 ++++++++++++++++++++++--- modules/annotation/module.nf | 7 +- modules/export/emgb.nf | 165 +++++++++++++ modules/magAttributes/module.nf | 32 ++- nextflow.config | 11 + scripts/test_fullPipelineExport.sh | 15 ++ templates/emgbAnnotatedBins.sh | 13 ++ templates/emgbAnnotatedContigs.sh | 5 + templates/emgbAnnotatedGenes.sh | 91 ++++++++ 12 files changed, 669 insertions(+), 48 deletions(-) create mode 100644 example_params/export.yml create mode 100644 modules/export/emgb.nf create mode 100644 scripts/test_fullPipelineExport.sh create mode 100644 templates/emgbAnnotatedBins.sh create mode 100644 templates/emgbAnnotatedContigs.sh create mode 100644 templates/emgbAnnotatedGenes.sh diff --git a/.github/workflows/workflow_modules.yml b/.github/workflows/workflow_modules.yml index 755622d1..bd74363a 100644 --- a/.github/workflows/workflow_modules.yml +++ b/.github/workflows/workflow_modules.yml @@ -96,6 +96,14 @@ jobs: " --output=output --smetana_image=pbelmann/metabolomics:0.1.0 \ --steps.cooccurrence.beforeProcessScript=/vol/spool/dockerPull.sh " ./example_params/fullPipelineAggregate.yml \ "${WORK_DIR}" ${PROFILE} || exit 1 + + - name: Test export workflow based on previous test + run: | + bash ./scripts/test_fullPipelineExport.sh \ + " --output=output --input=output --smetana_image=pbelmann/metabolomics:0.1.0 \ + "" " ./example_params/export.yml \ + "${WORK_DIR}" ${PROFILE} || exit 1 + - name: Test if SRA S3 Input module works with ONT data run: | bash ./scripts/test_fullPipeline.sh " -c test_data/assets/aws.config " \ diff --git a/example_params/export.yml b/example_params/export.yml new file mode 100644 index 00000000..531dc2b6 --- /dev/null +++ b/example_params/export.yml @@ -0,0 +1,47 @@ +tempdir: "tmp" +summary: false +s3SignIn: false +input: "output" +output: "output" +logDir: log +runid: 1 +databases: "/mnt/databases" +logLevel: 1 +scratch: "/vol/scratch" +publishDirMode: "symlink" +steps: + export: + emgb: + titles: + database: + download: + source: "https://openstack.cebitec.uni-bielefeld.de:8080/databases/uniref90.titles.tsv.gz" + md5sum: aaf1dd9021243def8e6c4e438b4b3669 + kegg: + database: + download: + source: s3://databases_internal/annotatedgenes2json_db_kegg-mirror-2022-12.tar.zst + md5sum: 262dab8ca564fbc1f27500c22b5bc47b + s5cmd: + params: '--retry-count 30 --no-verify-ssl --endpoint-url https://openstack.cebitec.uni-bielefeld.de:8080' +resources: + highmemLarge: + cpus: 28 + memory: 230 + highmemMedium: + cpus: 14 + memory: 113 + large: + cpus: 28 + memory: 58 + medium: + cpus: 14 + memory: 29 + small: + cpus: 7 + memory: 14 + tiny: + cpus: 1 + memory: 1 + + diff --git a/example_params/fullPipeline.yml b/example_params/fullPipeline.yml index 9fab59af..04cb152d 100644 --- a/example_params/fullPipeline.yml +++ b/example_params/fullPipeline.yml @@ -246,6 +246,20 @@ steps: additionalParams: run: "" report: "" + export: + emgb: + titles: + database: + download: + source: "https://openstack.cebitec.uni-bielefeld.de:8080/databases/uniref90.titles.tsv.gz" + md5sum: aaf1dd9021243def8e6c4e438b4b3669 + kegg: + database: + download: + source: s3://databases_internal/annotatedgenes2json_db_kegg-mirror-2022-12.tar.zst + md5sum: 262dab8ca564fbc1f27500c22b5bc47b + 
s5cmd: + params: '--retry-count 30 --no-verify-ssl --endpoint-url https://openstack.cebitec.uni-bielefeld.de:8080' resources: highmemLarge: cpus: 28 diff --git a/main.nf b/main.nf index bf9724da..816a6ede 100644 --- a/main.nf +++ b/main.nf @@ -9,6 +9,7 @@ include { wShortReadAssemblyFile; wShortReadAssemblyList; wTestMemSelection } fr include { wOntAssemblyFile; wOntAssemblyList } from './modules/assembly/ontAssembler' include { wShortReadBinningList } from './modules/binning/shortReadBinning' include { wLongReadBinningList } from './modules/binning/ontBinning' +include { wEMGBList } from './modules/export/emgb' include { wMagAttributesFile; wMagAttributesList; wCMSeqWorkflowFile; } from './modules/magAttributes/module.nf' include { wDereplicateFile; wDereplicateList} from './modules/dereplication/bottomUpClustering/module' include { wAnalyseMetabolitesList; wAnalyseMetabolitesFile } from './modules/metabolomics/module' @@ -177,7 +178,7 @@ def getPath(f){ return params.input.startsWith("s3://")? "s3:/" + f: f } -workflow _wAggregateONT { +workflow _wFindSamplesONT { take: sraFiles main: @@ -195,7 +196,7 @@ workflow _wAggregateONT { | map{ sra,f -> [sra, getPath(f)] } | splitCsv(sep: '\t', header: true) \ | map { sample -> [sample[SAMPLE_NAME], sample[SAMPLE_STATS].median_qual]} |set { ontMedianQuality } - Pattern binsONTPattern = Pattern.compile('.*/binningONT/' + params.modules.binning.version.major + '..*/.*/.*_bin.*.fa$') + Pattern binsONTPattern = Pattern.compile('.*/binningONT/' + params.modules.binningONT.version.major + '..*/.*/.*_bin.*.fa$') sraFiles | filter({ sra, path -> binsONTPattern.matcher(path.toString()).matches()}) \ | map{ sra,f -> [SAMPLE:sra, PATH: getPath(f), BIN_ID:file(f).name] } \ | set{ ontBins } @@ -212,7 +213,26 @@ workflow _wAggregateONT { ontBinStats = ontBinStats } -workflow _wAggregateIllumina { +workflow _wGetAssemblyFiles { + take: + assemblyFiles + main: + // get Illumina assembly files + Pattern assemblyIlluminaPattern = Pattern.compile('.*/assembly/' + params.modules.assembly.version.major + '..*/.*/.*_contigs.fa.gz$') + assemblyFiles | filter({ sra, path -> assemblyIlluminaPattern.matcher(path.toString()).matches()}) \ + | map{ sra,f -> [sra, getPath(f)] } | set { illuminaAssembly } + + // get ONT assembly files + Pattern assemblyONTPattern = Pattern.compile('.*/assemblyONT/' + params.modules.assemblyONT.version.major + '..*/.*/.*_contigs.fa.gz$') + assemblyFiles | filter({ sra, path -> assemblyONTPattern.matcher(path.toString()).matches()}) \ + | map{ sra,f -> [sra, getPath(f)] } | set { ontAssembly } + emit: + illuminaAssembly + ontAssembly +} + + +workflow _wFindSamplesIllumina { take: binningFiles qcFiles @@ -288,17 +308,10 @@ workflow _wFragmentRecruitment { recruitedGenomesStats = recruitedGenomesStats } -/* -* This workflow entry point allows to aggregate information of different samples. -* It will perform analysis steps such as dereplication, read mapping and co-occurrence. -* The input files are automatically fetched as long as they adhere to the pipeline specification document (see documentation). 
-*/ -workflow wAggregatePipeline { - def input = params.input +workflow _wGetSamples() { + main: def runID = params.runid - - // Save config File - wSaveSettingsList(Channel.value("AGGREGATED")) + def input = params.input // List all available SRAIDs Channel.from(file(input).list()) | filter({ path -> !(path ==~ /.*summary$/) && !(path ==~ /null$/) }) \ @@ -307,32 +320,32 @@ workflow wAggregatePipeline { sraDatasets | map { sra -> [sra, input + "/" + sra + "/" + runID + "/" ]} \ | set {sraIDs} + emit: + sraIDs + sraDatasets +} - // List all files in sample directories - sraIDs | flatMap { sraID, path -> collectModuleFiles(path, sraID, [params.modules.qc])} | set { qcFiles } - sraIDs | flatMap { sraID, path -> collectModuleFiles(path, sraID, [params.modules.binning]) } | set { binningFiles } - - _wAggregateIllumina(binningFiles, qcFiles) - - sraIDs | flatMap { sraID, path -> collectModuleFiles(path, sraID, [params.modules.binningONT])} | set { binningONTFiles } - sraIDs | flatMap { sraID, path -> collectModuleFiles(path, sraID, [params.modules.qcONT])} | mix(binningONTFiles) | _wAggregateONT - - sraIDs | flatMap { sraID, path -> collectModuleFiles(path, sraID, [params.modules.fragmentRecruitment])} | _wFragmentRecruitment - - sraIDs | flatMap { sraID, path -> collectModuleFiles(path, sraID, [params.modules.magAttributes])} | set { selectedSRAMagAttributes} +workflow _wGetCheckm { + take: + selectedSRAMagAttributes + main: // get Checkm results Pattern checkmPattern = Pattern.compile('.*/magAttributes/' + params.modules.magAttributes.version.major + '..*/.*/.*_checkm_.*.tsv$') - selectedSRAMagAttributes | filter({ sra, path -> checkmPattern.matcher(path.toString()).matches()}) \ - | splitCsv(header: ["SAMPLE", "BIN_ID", "Marker lineage", "# genomes", "# markers", \ + selectedSRAMagAttributes | filter({ sra, path -> checkmPattern.matcher(path.toString()).matches()}) \ + | set { checkmFiles } + + checkmFiles | splitCsv(header: ["SAMPLE", "BIN_ID", "Marker lineage", "# genomes", "# markers", \ "# marker sets", "0", "1", "2", "3", "4", "5+", "COMPLETENESS", "CONTAMINATION", "HETEROGENEITY"], sep: '\t') \ | map { sra, bins -> bins} \ | set { checkm } // get Checkm2 results Pattern checkm2Pattern = Pattern.compile('.*/magAttributes/' + params.modules.magAttributes.version.major + '..*/.*/.*_checkm2_.*.tsv$') - selectedSRAMagAttributes | filter({ sra, path -> checkm2Pattern.matcher(path.toString()).matches()}) \ - | splitCsv(header: true, sep: '\t') \ + selectedSRAMagAttributes | filter({ sra, path -> checkm2Pattern.matcher(path.toString()).matches()}) \ + | set { checkm2Files } + + checkm2Files | splitCsv(header: true, sep: '\t') \ | map { sra, bins -> bins} \ | set { checkm2 } @@ -341,6 +354,52 @@ workflow wAggregatePipeline { | mix(checkm2) \ | set {checkm} + checkmFiles \ + | mix(checkm2Files) \ + | set {checkmFiles} + + emit: + checkm + checkmFiles +} + +/* +* This workflow entry point allows to aggregate information of different samples. +* It will perform analysis steps such as dereplication, read mapping and co-occurrence. +* The input files are automatically fetched as long as they adhere to the pipeline specification document (see documentation). 
+*/
+workflow wAggregatePipeline {
+    def input = params.input
+    def runID = params.runid
+
+    // Save config File
+    wSaveSettingsList(Channel.value("AGGREGATED"))
+
+    _wGetSamples()
+    _wGetSamples.out.sraDatasets | set { sraDatasets }
+    _wGetSamples.out.sraIDs | set { sraIDs }
+
+    // List all files in sample directories
+    sraIDs | flatMap { sraID, path -> collectModuleFiles(path, sraID, [params.modules.qc]) } | set { qcFiles }
+    sraIDs | flatMap { sraID, path -> collectModuleFiles(path, sraID, [params.modules.binning]) } | set { binningFiles }
+
+    _wFindSamplesIllumina(binningFiles, qcFiles)
+
+    sraIDs | flatMap { sraID, path -> collectModuleFiles(path, sraID, [params.modules.binningONT])} | set { binningONTFiles }
+    sraIDs | flatMap { sraID, path -> collectModuleFiles(path, sraID, [params.modules.qcONT])} \
+       | mix(binningONTFiles) | set { binningONT }
+
+    _wFindSamplesONT(binningONT)
+
+    sraIDs | flatMap { sraID, path -> collectModuleFiles(path, sraID, [params.modules.fragmentRecruitment])} | _wFragmentRecruitment
+
+    sraIDs | flatMap { sraID, path -> collectModuleFiles(path, sraID, [params.modules.magAttributes])} | set { selectedSRAMagAttributes}
+
+    sraIDs | flatMap { sraID, path -> collectModuleFiles(path, sraID, [params.modules.annotation])} | set { selectedAnnotation}
+
+    selectedSRAMagAttributes | _wGetCheckm
+    _wGetCheckm.out.checkm | set { checkm }
+
     // get gtdbtk summary files
     Pattern gtdbPattern = Pattern.compile('.*/magAttributes/' + params.modules.magAttributes.version.major + '..*/.*/.*_gtdbtk_combined.tsv$' )
     selectedSRAMagAttributes | filter({ sra, path -> gtdbPattern.matcher(path.toString()).matches()}) \
@@ -360,18 +419,161 @@ workflow wAggregatePipeline {
 
     recruitedGenomesStats = _wFragmentRecruitment.out.recruitedGenomesStats
 
-    mapJoin(_wAggregateIllumina.out.illuminaBinStats | mix(_wAggregateONT.out.ontBinStats) \
+    mapJoin(_wFindSamplesIllumina.out.illuminaBinStats | mix(_wFindSamplesONT.out.ontBinStats) \
 	| mix(recruitedGenomesStats), checkm, "BIN_ID", "BIN_ID") \
 	| set {checkmBinStats}
 
-    mapJoin(checkmBinStats, _wAggregateIllumina.out.illuminaBins | mix(_wAggregateONT.out.ontBins) \
+    mapJoin(checkmBinStats, _wFindSamplesIllumina.out.illuminaBins | mix(_wFindSamplesONT.out.ontBins) \
 	| mix(recruitedGenomes), "BIN_ID", "BIN_ID") \
 	| set {binsStatsComplete}
 
-    _wAggregateIllumina.out.illuminaSamples | mix(_wAggregateONT.out.ontSamples) | view { sra, path -> "Files detected of SRA ID $sra" }
+    _wFindSamplesIllumina.out.illuminaSamples | mix(_wFindSamplesONT.out.ontSamples) | view { sra, path -> "Files detected for SRA ID $sra" }
+
+    _wAggregate(_wFindSamplesONT.out.ontSamples, _wFindSamplesONT.out.ontMedianQuality, _wFindSamplesIllumina.out.illuminaSamples, \
+	_wFindSamplesIllumina.out.unpairedIlluminaSamples, binsStatsComplete, gtdb, models)
+}
+
+
+/*
+* This workflow entry point exports the results of previously processed samples to EMGB-compatible JSON files.
+* It collects per-sample assemblies, read mappings, bins, gene annotations, taxonomic assignments and quality estimates.
+* The input files are automatically fetched as long as they adhere to the pipeline specification document (see documentation).
+*/ +workflow wExportPipeline { + def input = params.input + def runID = params.runid + + // List all available SRAIDs + Channel.from(file(input).list()) | filter({ path -> !(path ==~ /.*summary$/) && !(path ==~ /null$/) }) \ + | filter({ path -> !(path ==~ /.*AGGREGATED$/)}) \ + | set { sraDatasets } + + // Save config File + wSaveSettingsList(sraDatasets) - _wAggregate(_wAggregateONT.out.ontSamples, _wAggregateONT.out.ontMedianQuality, _wAggregateIllumina.out.illuminaSamples, \ - _wAggregateIllumina.out.unpairedIlluminaSamples, binsStatsComplete, gtdb, models) + sraDatasets | map { sra -> [sra, input + "/" + sra + "/" + runID + "/" ]} \ + | set {sraIDs} + + // List all files in sample directories + sraIDs | flatMap { sraID, path -> collectModuleFiles(path, sraID, [params.modules.qc])} | set { qcFiles } + sraIDs | flatMap { sraID, path -> collectModuleFiles(path, sraID, [params.modules.binning]) } \ + | set { binningFiles } + + sraIDs | flatMap { sraID, path -> collectModuleFiles(path, sraID, [params.modules.magAttributes])} \ + | set { selectedSRAMagAttributes} + + // Checkm files + selectedSRAMagAttributes | _wGetCheckm + _wGetCheckm.out.checkmFiles | set { checkmFiles } + + sraIDs | flatMap { sraID, path -> collectModuleFiles(path, sraID, [params.modules.assemblyONT]) } | set { assemblyONTFiles } + sraIDs | flatMap { sraID, path -> collectModuleFiles(path, sraID, [params.modules.assembly]) } | set { assemblyIlluminaFiles } + sraIDs | flatMap { sraID, path -> collectModuleFiles(path, sraID, [params.modules.annotation])} | set { selectedAnnotation} + + assemblyONTFiles | mix(assemblyIlluminaFiles) | _wGetAssemblyFiles + _wGetAssemblyFiles.out.illuminaAssembly | mix(_wGetAssemblyFiles.out.ontAssembly) | set { assembly } + + // get Bins + Pattern binsIlluminaPattern = Pattern.compile('.*/binning/' + params.modules.binning.version.major + '..*/.*/.*_bin.*.fa$') + binningFiles | filter({ sra, path -> binsIlluminaPattern.matcher(path.toString()).matches()}) \ + | set{ illuminaBins } + + Pattern binsONTPattern = Pattern.compile('.*/binningONT/' + params.modules.binningONT.version.major + '..*/.*/.*_bin.*.fa$') + binningFiles | filter({ sra, path -> binsONTPattern.matcher(path.toString()).matches()}) \ + | mix(illuminaBins) | groupTuple(by: 0) | set{ binFiles } + + // get not Binned gff files + Pattern annotationNotBinnedGffPattern = Pattern.compile('.*/annotation/' + params.modules.annotation.version.major + '..*/prokka/.*_notBinned.gff.gz$' ) + selectedAnnotation | filter({ sra, path -> annotationNotBinnedGffPattern.matcher(path.toString()).matches()}) \ + | set { notBinnedGff } + + // get Bin gff files + Pattern annotationGffPattern = Pattern.compile('.*/annotation/' + params.modules.annotation.version.major + '..*/prokka/.*_bin..*.gff.gz$' ) + selectedAnnotation | filter({ sra, path -> annotationGffPattern.matcher(path.toString()).matches()}) \ + | mix(notBinnedGff) | map { sra, path -> [sra, file(path).baseName, path] } | set { gff } + + // get not binned ffn files + Pattern annotationNotBinnedFfnPattern = Pattern.compile('.*/annotation/' + params.modules.annotation.version.major + '..*/prokka/.*_notBinned.ffn.gz$' ) + selectedAnnotation | filter({ sra, path -> annotationNotBinnedFfnPattern.matcher(path.toString()).matches()}) \ + | set { notBinnedFfn } + + // get ffn files + Pattern annotationFfnPattern = Pattern.compile('.*/annotation/' + params.modules.annotation.version.major + '..*/prokka/.*_bin..*.ffn.gz$' ) + selectedAnnotation | filter({ sra, path -> 
annotationFfnPattern.matcher(path.toString()).matches()}) \ + | mix(notBinnedFfn) | map { sra, path -> [sra, file(path).baseName, path] } | set { ffn } + + // not Binned faa files + Pattern annotationNotBinnedPattern = Pattern.compile('.*/annotation/' + params.modules.annotation.version.major + '..*/prokka/.*_notBinned.faa.gz$' ) + selectedAnnotation | filter({ sra, path -> annotationNotBinnedPattern.matcher(path.toString()).matches()}) \ + | set { notBinnedFaa } + + // get Bin faa files + Pattern annotationFnaPattern = Pattern.compile('.*/annotation/' + params.modules.annotation.version.major + '..*/prokka/.*_bin..*.faa.gz$' ) + selectedAnnotation | filter({ sra, path -> annotationFnaPattern.matcher(path.toString()).matches()}) \ + | mix(notBinnedFaa) | map { sra, path -> [sra, file(path).baseName, path] } | set { faa } + + // get MMseqs unbinned taxonomy files + Pattern annotationMmseqsUnbinnedTaxonomyPattern = Pattern.compile('.*/annotation/' + params.modules.annotation.version.major + '..*/mmseqs2_taxonomy/.*/.*_unbinned.ncbi_nr.taxonomy.tsv$' ) + selectedAnnotation | filter({ sra, path -> annotationMmseqsUnbinnedTaxonomyPattern.matcher(path.toString()).matches()}) \ + | map { sra, path -> ["ncbi_nr", sra, path] } | set { mmseqsUnbinnedTaxonomy } + + // get MMseqs binned taxonomy files output/test2/1/annotation/1.0.0/mmseqs2/ncbi_nr/ test2_unbinned.ncbi_nr.105001.112000.blast.tsv + Pattern annotationMmseqsBinnedTaxonomyPattern = Pattern.compile('.*/annotation/' + params.modules.annotation.version.major + '..*/mmseqs2_taxonomy/.*/.*_binned.ncbi_nr.taxonomy.tsv$' ) + selectedAnnotation | filter({ sra, path -> annotationMmseqsBinnedTaxonomyPattern.matcher(path.toString()).matches()}) \ + | map { sra, path -> ["ncbi_nr", sra, path] } | mix(mmseqsUnbinnedTaxonomy) | set { mmseqsTaxonomy } + + // get MMseqs unbinned blast files + Pattern annotationMmseqsUnirefUnbinnedPattern = Pattern.compile('.*/annotation/' + params.modules.annotation.version.major + '..*/mmseqs2/.*/.*_unbinned.uniref90.blast.tsv$' ) + selectedAnnotation | filter({ sra, path -> annotationMmseqsUnirefUnbinnedPattern.matcher(path.toString()).matches()}) \ + | map { sra, path -> ["uniref90", sra, "unbinned", 1, 1, 1, path] } | set { mmseqsUnirefUnbinnedBlast } + + // get MMseqs binned blast files output/test2/1/annotation/1.0.0/mmseqs2/ncbi_nr/ test2_unbinned.ncbi_nr.105001.112000.blast.tsv + Pattern annotationMmseqsUnirefBinnedPattern = Pattern.compile('.*/annotation/' + params.modules.annotation.version.major + '..*/mmseqs2/.*/.*_binned.uniref90.blast.tsv$' ) + selectedAnnotation | filter({ sra, path -> annotationMmseqsUnirefBinnedPattern.matcher(path.toString()).matches()}) \ + | map { sra, path -> ["uniref90", sra, "binned", 1, 1, 1, path] } \ + | mix(mmseqsUnirefUnbinnedBlast) | set { mmseqsUnirefBlast } + + // get MMseqs unbinned kegg blast files + Pattern annotationMmseqsKeggUnbinnedPattern = Pattern.compile('.*/annotation/' + params.modules.annotation.version.major + '..*/mmseqs2/.*/.*_unbinned.kegg.blast.tsv$' ) + selectedAnnotation | filter({ sra, path -> annotationMmseqsKeggUnbinnedPattern.matcher(path.toString()).matches()}) \ + | map { sra, path -> ["kegg", sra, "unbinned", 1, 1, 1, path] } \ + | set { mmseqsKeggUnbinnedBlast } + + // get MMseqs binned blast files output/test2/1/annotation/1.0.0/mmseqs2/ncbi_nr/ test2_unbinned.ncbi_nr.105001.112000.blast.tsv + Pattern annotationMmseqsKeggBinnedPattern = Pattern.compile('.*/annotation/' + params.modules.annotation.version.major + 
'..*/mmseqs2/.*/.*_binned.kegg.blast.tsv$' ) + selectedAnnotation | filter({ sra, path -> annotationMmseqsKeggBinnedPattern.matcher(path.toString()).matches()}) \ + | map { sra, path -> ["kegg", sra, "binned", 1, 1, 1, path] } \ + | mix(mmseqsKeggUnbinnedBlast) \ + | set { mmseqsKeggBlast } + + // get gtdbtk summary files + Pattern gtdbSummaryPattern = Pattern.compile('.*/magAttributes/' + params.modules.magAttributes.version.major + '..*/.*/.*.summary.tsv$' ) + selectedSRAMagAttributes | filter({ sra, path -> gtdbSummaryPattern.matcher(path.toString()).matches()}) \ + | groupTuple(by: 0) | set { gtdbSummaryFiles } + + // get Mapping + Pattern mappingIlluminaPattern = Pattern.compile('.*/binning/' + params.modules.binning.version.major + '..*/.*/.*.bam$') + binningFiles | filter({ sra, path -> mappingIlluminaPattern.matcher(path.toString()).matches()}) \ + | set{ illuminaMapping } + + // get ONT mapping files + Pattern mappingONTPattern = Pattern.compile('.*/binningONT/' + params.modules.binningONT.version.major + '..*/.*/.*.bam$') + binningFiles | filter({ sra, path -> mappingONTPattern.matcher(path.toString()).matches()}) \ + | set{ ontMapping } + + illuminaMapping | mix(ontMapping) | set { mapping } + + _wExportEMGB(assembly, \ + mapping, \ + binFiles, \ + gtdbSummaryFiles, \ + checkmFiles, \ + gff, \ + ffn, \ + faa, \ + mmseqsTaxonomy, \ + mmseqsUnirefBlast | mix(mmseqsKeggBlast) \ + ) } @@ -448,7 +650,6 @@ workflow wSaveSettings { } - def flattenBins(binning){ def chunkList = []; def SAMPLE_IDX = 0; @@ -470,6 +671,7 @@ workflow _wProcessIllumina { wShortReadAssemblyList(qcReads, wShortReadQualityControlList.out.nonpareil, wShortReadQualityControlList.out.kmerFrequencies) wShortReadBinningList(wShortReadAssemblyList.out.contigs, qcReads) emit: + contigs = wShortReadAssemblyList.out.contigs notBinnedContigs = wShortReadBinningList.out.notBinnedContigs bins = wShortReadBinningList.out.bins binsStats = wShortReadBinningList.out.binsStats @@ -493,6 +695,7 @@ workflow _wProcessOnt { wLongReadBinningList(wOntAssemblyList.out.contigs, ontQCReads, wOntAssemblyList.out.graph, \ wOntAssemblyList.out.headerMapping, wOntAssemblyList.out.info, medianQuality) emit: + contigs = wOntAssemblyList.out.contigs notBinnedContigs = wLongReadBinningList.out.notBinnedContigs bins = wLongReadBinningList.out.bins binsStats = wLongReadBinningList.out.binsStats @@ -504,6 +707,22 @@ workflow _wProcessOnt { medianQuality = medianQuality } +workflow _wExportEMGB { + take: + contigs + mapping + bins + gtdbtk + checkm + gff + ffn + faa + mmseqsTaxonomy + mmseqsBlast + main: + wEMGBList(contigs, mapping, bins, gtdbtk, checkm, gff, ffn, faa, mmseqsTaxonomy, mmseqsBlast) +} + /* * * Main workflow entrypoint. 
Takes list of files containing reads as input and produces assembly, binning, dereplication and metabolomics @@ -536,6 +755,8 @@ workflow wFullPipeline { | map{ bin -> [bin.SAMPLE, bin.BIN_ID, bin.PATH]} \ | set { bins } + ont.bins | mix(illumina.bins) | set {binsFiles} + illumina.fastg | set { fastg } ont.gfa | set { gfa } @@ -581,6 +802,22 @@ workflow wFullPipeline { wAnalyseMetabolitesList(binsStats, mapJoin(wMagAttributesList.out.checkm, proteins, "BIN_ID", "BIN_ID")) - _wAggregate(ont.reads, ont.medianQuality, illumina.readsPair, illumina.readsSingle, binsStats, \ + wMagAttributesList.out.gtdbArcheaFiles \ + | mix(wMagAttributesList.out.gtdbBacteriaFiles) \ + | groupTuple(by: 0) | set { gtdbSummaryFiles } + + _wExportEMGB(illumina.contigs | mix(ont.contigs),\ + mapping, \ + binsFiles, \ + gtdbSummaryFiles, \ + wMagAttributesList.out.checkmFiles, \ + wAnnotateBinsList.out.gff, \ + wAnnotateBinsList.out.ffn, \ + wAnnotateBinsList.out.faa, \ + wAnnotateBinsList.out.mmseqs2_taxonomy, \ + wAnnotateBinsList.out.mmseqs2_blast \ + ) + + _wAggregate(ont.reads, ont.medianQuality, illumina.readsPair, illumina.readsSingle, binsStats, \ wMagAttributesList.out.gtdb, wAnalyseMetabolitesList.out.models) } diff --git a/modules/annotation/module.nf b/modules/annotation/module.nf index cfe20f82..9438d389 100644 --- a/modules/annotation/module.nf +++ b/modules/annotation/module.nf @@ -53,6 +53,7 @@ process pMMseqs2 { container "${params.mmseqs2_image}" + // Databases will be downloaded to a fixed place so that they can be used by future processes. // This fixed place has to be outside of the working-directory to be easy to find for every process. // Therefore this place has to be mounted to the docker container to be accessible during run time. @@ -610,6 +611,11 @@ workflow wAnnotateList { emit: keggAnnotation = _wAnnotation.out.keggAnnotation proteins = _wAnnotation.out.prokka_faa + ffn = _wAnnotation.out.prokka_ffn + gff = _wAnnotation.out.prokka_gff + faa = _wAnnotation.out.prokka_faa + mmseqs2_taxonomy = _wAnnotation.out.mmseqs2_taxonomy + mmseqs2_blast = _wAnnotation.out.mmseqs2_blast } /** @@ -702,7 +708,6 @@ workflow _wSplit { | pCount | combine(chunkSize) | flatMap { sample -> \ Utils.splitFilesIndex(Integer.parseInt(sample[COUNT_IDX]), sample[CHUNK_SIZE_IDX], [sample[SAMPLE_IDX], sample[FILE_2_IDX], sample[FILE_3_IDX]]) } \ | set { chunks } - emit: chunks } diff --git a/modules/export/emgb.nf b/modules/export/emgb.nf new file mode 100644 index 00000000..16dc8716 --- /dev/null +++ b/modules/export/emgb.nf @@ -0,0 +1,165 @@ +include { pDumpLogs } from '../utils/processes' + + + +def getOutput(SAMPLE, RUNID, TOOL, filename){ + return SAMPLE + '/' + RUNID + '/' + params.modules.export.name + '/' + + params.modules.export.version.major + "." + + params.modules.export.version.minor + "." 
+ + params.modules.export.version.patch + + '/' + TOOL + '/' + filename +} + +process pEMGBAnnotatedContigs { + + container "${params.emgbAnnotatedContigs_image}" + + tag "Sample: $sample" + + publishDir params.output, mode: "${params.publishDirMode}", saveAs: { filename -> getOutput("${sample}", params.runid, "emgb", filename) } + + when params.steps.containsKey("export") && params.steps.export.containsKey("emgb") + + containerOptions " --entrypoint='' " + + label 'tiny' + + input: + tuple val(sample), path(contigs), path(mapping), path("bins/*") + + output: + + tuple val("${sample}"), path("${sample}.contigs.json.gz"), emit: json + tuple env(FILE_ID), val("${output}"), val(params.LOG_LEVELS.INFO), file(".command.sh"), \ + file(".command.out"), file(".command.err"), file(".command.log"), emit: logs + + shell: + template 'emgbAnnotatedContigs.sh' +} + + +process pEMGBAnnotatedBins { + + container "${params.emgbAnnotatedBins_image}" + + tag "Sample: $sample" + + publishDir params.output, mode: "${params.publishDirMode}", saveAs: { filename -> getOutput("${sample}", params.runid, "emgb", filename) } + + when params.steps.containsKey("export") && params.steps.export.containsKey("emgb") + + containerOptions " --entrypoint='' " + + label 'tiny' + + input: + tuple val(sample), path(checkm), path(gtdbtk), path("bins/*") + + output: + tuple val("${sample}"), path("${sample}.bins.json.gz"), emit: json + tuple env(FILE_ID), val("${output}"), val(params.LOG_LEVELS.INFO), file(".command.sh"), \ + file(".command.out"), file(".command.err"), file(".command.log"), emit: logs + + shell: + template 'emgbAnnotatedBins.sh' +} + + +process pEMGBAnnotatedGenes { + + container "${params.emgbAnnotatedGenes_image}" + + tag "Sample: $sample" + + publishDir params.output, mode: "${params.publishDirMode}", saveAs: { filename -> getOutput("${sample}", params.runid, "emgb", filename) } + + secret { "${S3_EMGB_TITLES_ACCESS}"!="" ? ["S3_EMGB_TITLES_ACCESS", "S3_EMGB_TITLES_SECRET"] : [] } + + secret { "${S3_EMGB_KEGG_ACCESS}"!="" ? 
["S3_EMGB_KEGG_ACCESS", "S3_EMGB_KEGG_SECRET"] : [] } + + containerOptions Utils.getDockerMount(params.steps?.export?.emgb?.titles?.database, params) + Utils.getDockerMount(params.steps?.export?.emgb?.kegg?.database, params) + Utils.getDockerNetwork() + " --entrypoint='' " + + when params.steps.containsKey("export") && params.steps.export.containsKey("emgb") + + memory { Utils.getMemoryResources(params.resources.medium, "${sample}", task.attempt, params.resources) } + + cpus { Utils.getCPUsResources(params.resources.medium, "${sample}", task.attempt, params.resources) } + + beforeScript Utils.getCreateDatabaseDirCommand("${params.polished.databases}") + + input: + tuple val(sample), path("gff/*"), path("ffn/*"), path("faa/*"), path("taxonomy/*"), path("blastResult/*"), path("blastKeggResult/*"), path("bins/*") + + output: + tuple val("${sample}"), path("${sample}.genes.json.gz"), emit: json + tuple env(FILE_ID), val("${output}"), val(params.LOG_LEVELS.INFO), file(".command.sh"), \ + file(".command.out"), file(".command.err"), file(".command.log"), emit: logs + + shell: + TITLES_S5CMD_PARAMS=params?.steps?.export?.emgb?.titles?.database?.download?.s5cmd?.params ?: "" + TITLES_DOWNLOAD_LINK=params?.steps?.export?.emgb?.titles?.database?.download?.source ?: "" + TITLES_MD5SUM=params.steps?.export?.emgb?.titles?.database?.download?.md5sum ?: "" + TITLES_EXTRACTED_DB=params.steps?.export?.emgb?.titles?.database?.extractedDBPath ?: "" + KEGG_S5CMD_PARAMS=params?.steps?.export?.emgb?.kegg?.database?.download?.s5cmd?.params ?: "" + KEGG_DOWNLOAD_LINK=params?.steps?.export?.emgb?.kegg?.database?.download?.source ?: "" + KEGG_MD5SUM=params.steps?.export?.emgb?.kegg?.database?.download?.md5sum ?: "" + KEGG_EXTRACTED_DB=params.steps?.export?.emgb?.kegg?.database?.extractedDBPath ?: "" + + TITLES_S3_EMGB_ACCESS=params?.steps?.export?.emgb?.titles?.database?.download?.s5cmd && TITLES_S5CMD_PARAMS.indexOf("--no-sign-request") == -1 ? "\$S3_EMGB_TITLES_ACCESS" : "" + TITLES_S3_EMGB_SECRET=params?.steps?.export?.emgb?.titles?.database?.download?.s5cmd && TITLES_S5CMD_PARAMS.indexOf("--no-sign-request") == -1 ? "\$S3_EMGB_TITLES_SECRET" : "" + + KEGG_S3_EMGB_ACCESS=params?.steps?.export?.emgb?.kegg?.database?.download?.s5cmd && KEGG_S5CMD_PARAMS.indexOf("--no-sign-request") == -1 ? "\$S3_EMGB_KEGG_ACCESS" : "" + KEGG_S3_EMGB_SECRET=params?.steps?.export?.emgb?.kegg?.database?.download?.s5cmd && KEGG_S5CMD_PARAMS.indexOf("--no-sign-request") == -1 ? 
"\$S3_EMGB_KEGG_SECRET" : "" + template 'emgbAnnotatedGenes.sh' +} + + + +// CONTIGS: [test1, /vol/spool/metagenomics-tk/work_wFullPipeline/cd/7c943c0808e6d36c72a64834e7a88e/test1_contigs.fa.gz] +// [test2, /vol/spool/metagenomics-tk/work_wFullPipeline/00/71fd0e590f4f03fda4cb87903a3b38/test2_contigs.fa.gz] +// mapping: [test2, /vol/spool/metagenomics-tk/work_wFullPipeline/7c/43a3513d49143a76a29c4404417997/test2.bam] +// bins: [test1, [/vol/spool/metagenomics-tk/work_wFullPipeline/95/085ffbe17a6f2ebfb60709d3f33cf3/test1_bin.1.fa, /vol/spool/metagenomics-tk/work_wFullPipeline/95/085ffbe17a6f2ebfb60709d3f33cf3/test1_bin.2.fa]] +// gtdbtk: [test2, [/vol/spool/metagenomics-tk/work_wFullPipeline/d2/5b5937c7a950d87cd796e2bf80cfcd/chunk_00_test2_gtdbtk.bac120.summary.tsv]] +// checkm: [test2, /vol/spool/metagenomics-tk/work_wFullPipeline/ff/1292ed3399aada6fa1dffc18cfbc12/test2_checkm2_EfY2cPIh.tsv] +// gff,ffn,faa: [test2, test2_bin.2.fa, /vol/spool/metagenomics-tk/work_wFullPipeline/78/1224783c7ae2d9e0a8d4264893e47a/test2_bin.2.gff.gz] +// mmseqsTaxonomy: [ncbi_nr, test1, /vol/spool/metagenomics-tk/work_wFullPipeline/93/56bf5c34d7524930d73a6ef45ae23d/test1_binned.ncbi_nr.taxonomy.tsv] +//mmseqsBlast: +// [bacmet20_predicted, test2, binned, 1, 2922, 1, /vol/spool/metagenomics-tk/work_wFullPipeline/3c/e5221be7fd305dc056356201de7d61/test2_binned.1.2922.bacmet20_predicted.blast.tsv] +// [uniref90, test1, binned, 1, 2923, 1, /vol/spool/metagenomics-tk/work_wFullPipeline/a0/8e7a2a7b928731dbcfd62cbf546d4f/test1_binned.1.2923.uniref90.blast.tsv] +// [bacmet20_predicted, test1, binned, 1, 2923, 1, /vol/spool/metagenomics-tk/work_wFullPipeline/2f/dffc66ee85b2a1baf296323fcc894c/test1_binned.1.2923.bacmet20_predicted.blast.tsv] +// [kegg, test2, binned, 1, 2922, 1, /vol/spool/metagenomics-tk/work_wFullPipeline/5a/74adca85c1b4005aa490440f8b05ba/test2_binned.1.2922.kegg.blast.tsv] +// [uniref90, test2, binned, 1, 2922, 1, /vol/spool/metagenomics-tk/work_wFullPipeline/57/5f4e7f5ae75937613d7c813be6c8fd/test2_binned.1.2922.uniref90.blast.tsv] +// [kegg, test1, binned, 1, 2923, 1, /vol/spool/metagenomics-tk/work_wFullPipeline/6b/41a5ce820f9fb5e897b6a9109e5678/test1_binned.1.2923.kegg.blast.tsv] +workflow wEMGBList { + take: + contigs + mapping + bins + gtdbtk + checkm + gff + ffn + faa + mmseqsTaxonomy + mmseqsBlast + main: + SAMPLE_IDX = 0 + contigs | combine(mapping, by: SAMPLE_IDX) | combine(bins, by: SAMPLE_IDX) | pEMGBAnnotatedContigs + checkm | combine(gtdbtk, by: SAMPLE_IDX) | combine(bins, by: SAMPLE_IDX) | pEMGBAnnotatedBins + + mmseqsBlast | filter { db, sample, type, start, end, chunkNumber, blastResult -> db == "uniref90" } \ + | map { db, sample, type, start, end, chunkNumber, blastResult -> [sample, blastResult] } \ + | groupTuple(by: SAMPLE_IDX) | set { selectedDBBlastResults } + + mmseqsBlast | filter { db, sample, type, start, end, chunkNumber, blastResult -> db == "kegg" } \ + | map { db, sample, type, start, end, chunkNumber, blastResult -> [sample, blastResult] } \ + | groupTuple(by: SAMPLE_IDX) | set { selectedKeggBlastResults } + + gff | map { sample, bin, gff -> [sample, gff] } | groupTuple(by: SAMPLE_IDX) \ + | combine(ffn | map { sample, bin, ffn -> [sample, ffn] } | groupTuple(by: SAMPLE_IDX), by: SAMPLE_IDX) \ + | combine(faa | map { sample, bin, faa -> [sample, faa] } | groupTuple(by: SAMPLE_IDX), by: SAMPLE_IDX) \ + | combine(mmseqsTaxonomy | map { db, sample, blastResult -> [sample, blastResult] } | groupTuple(by: SAMPLE_IDX), by: SAMPLE_IDX) \ + | combine(selectedDBBlastResults, by: 
SAMPLE_IDX) \ + | combine(selectedKeggBlastResults, by: SAMPLE_IDX) \ + | combine(bins, by: SAMPLE_IDX) | pEMGBAnnotatedGenes +} diff --git a/modules/magAttributes/module.nf b/modules/magAttributes/module.nf index 12c1a2e3..7eba09d3 100644 --- a/modules/magAttributes/module.nf +++ b/modules/magAttributes/module.nf @@ -59,7 +59,7 @@ process pCheckM { tuple val(sample), val(ending), path(bins), val(chunkId) output: - tuple path("${sample}_checkm_*.tsv", type: "file"), val("${sample}"), emit: checkm + tuple val("${sample}"), path("${sample}_checkm_*.tsv", type: "file"), emit: checkm tuple env(FILE_ID), val("${output}"), val(params.LOG_LEVELS.INFO), file(".command.sh"), \ file(".command.out"), file(".command.err"), file(".command.log"), emit: logs @@ -98,7 +98,7 @@ process pCheckM2 { tuple val(sample), val(ending), path(bins), val(chunkId) output: - tuple path("${sample}_checkm2_*.tsv", type: "file"), val("${sample}"), emit: checkm + tuple val("${sample}"), path("${sample}_checkm2_*.tsv", type: "file"), emit: checkm tuple env(FILE_ID), val("${output}"), val(params.LOG_LEVELS.INFO), file(".command.sh"), \ file(".command.out"), file(".command.err"), file(".command.log"), emit: logs @@ -136,12 +136,12 @@ process pGtdbtk { tuple val(sample), val(ending), path(bins), val(chunkId) output: - tuple path("chunk_*_${sample}_gtdbtk.bac120.summary.tsv"), val("${sample}"), optional: true, emit: bacteria - tuple path("chunk_*_${sample}_gtdbtk.ar122.summary.tsv"), val("${sample}"), optional: true, emit: archea - tuple path("chunk_*_${sample}_gtdbtk_unclassified.tsv"), val("${sample}"), optional: true, emit: unclassified - tuple path("*.tree"), val("${sample}"), optional: true, emit: tree - tuple path("chunk_*_${sample}_gtdbtk_combined.tsv"), val("${sample}"), optional: true, emit: combined - tuple path("chunk_*_${sample}_missing_bins.tsv"), val("${sample}"), optional: true, emit: missing + tuple val("${sample}"), path("chunk_*_${sample}_gtdbtk.bac120.summary.tsv"), optional: true, emit: bacteria + tuple val("${sample}"), path("chunk_*_${sample}_gtdbtk.ar122.summary.tsv"), optional: true, emit: archea + tuple val("${sample}"), path("chunk_*_${sample}_gtdbtk_unclassified.tsv"), optional: true, emit: unclassified + tuple val("${sample}"), path("*.tree"), val("${sample}"), optional: true, emit: tree + tuple val("${sample}"), path("chunk_*_${sample}_gtdbtk_combined.tsv"), optional: true, emit: combined + tuple val("${sample}"), path("chunk_*_${sample}_missing_bins.tsv"), optional: true, emit: missing tuple env(FILE_ID), val("${output}"), val(params.LOG_LEVELS.INFO), file(".command.sh"), \ file(".command.out"), file(".command.err"), file(".command.log"), emit: logs @@ -259,6 +259,9 @@ workflow wMagAttributesList { checkm = _wMagAttributes.out.checkm gtdb = _wMagAttributes.out.gtdb gtdbMissing = _wMagAttributes.out.gtdbMissing + gtdbArcheaFiles = _wMagAttributes.out.gtdbArcheaFiles + gtdbBacteriaFiles = _wMagAttributes.out.gtdbBacteriaFiles + checkmFiles = _wMagAttributes.out.checkmFiles } @@ -311,19 +314,23 @@ workflow _wMagAttributes { // Prepare checkm output file checkmSelected | splitCsv(sep: '\t', header: true) \ - | map { checkmDict, sample -> checkmDict } \ + | map { sample, checkmDict -> checkmDict } \ | set { checkmList } // Prepare gtdb output file gtdb.combined | splitCsv(sep: '\t', header: true) \ - | map { gtdb, sample -> gtdb } \ + | map { sample, gtdb -> gtdb } \ | set { gtdbCombinedList } // Prepare missing gtdb output file gtdb.missing | splitCsv(sep: '\t', header: true) \ - | map { gtdb, sample 
-> gtdb } \ + | map { sample, gtdb -> gtdb } \ | set { gtdbMissingList } + gtdb.bacteria | set { gtdbBacteriaFiles } + + gtdb.archea | set { gtdbArcheaFiles } + if(params.summary){ // collect checkm files for checkm2 results across multiple datasets checkmSelected \ @@ -347,4 +354,7 @@ workflow _wMagAttributes { checkm = checkmList gtdb = gtdbCombinedList gtdbMissing = gtdbMissingList + gtdbArcheaFiles = gtdbArcheaFiles + gtdbBacteriaFiles = gtdbBacteriaFiles + checkmFiles = checkmSelected } diff --git a/nextflow.config b/nextflow.config index 4895c10a..cca591db 100644 --- a/nextflow.config +++ b/nextflow.config @@ -292,6 +292,9 @@ params { skipVersionCheck = false + emgbAnnotatedGenes_image = "quay.io/emgb/annotatedgenes2json:2.5.2z" + emgbAnnotatedContigs_image = "quay.io/emgb/annotatedcontigs2json:2.2.4z" + emgbAnnotatedBins_image = "quay.io/emgb/annotatedbins2json:2.4.0z" pysradb_image = "quay.io/biocontainers/pysradb:1.4.1--pyhdfd78af_0" minimap2_image= "quay.io/biocontainers/minimap2:2.24--h7132678_1" metaflye_image = "quay.io/biocontainers/flye:2.9--py36h7281c5b_1" @@ -498,6 +501,14 @@ params { patch = 0 } } + export { + name = "export" + version { + major = 0 + minor = 1 + patch = 0 + } + } readMapping { name = "readMapping" version { diff --git a/scripts/test_fullPipelineExport.sh b/scripts/test_fullPipelineExport.sh new file mode 100644 index 00000000..b1c59cc6 --- /dev/null +++ b/scripts/test_fullPipelineExport.sh @@ -0,0 +1,15 @@ +set -e + +ENTRY="wExportPipeline" +OPTIONS=$1 +YAML="${2:-example_params/export.yml}" +WORK="${3:-work}_${ENTRY}" +PROFILE="${4:-standard}" +LOG_DIR="${WORK}/logs" +make run_small_full_test WORK_DIR=${WORK} \ + PARAMS_FILE=$YAML \ + PROFILE=" $PROFILE " \ + LOG_DIR=${LOG_DIR} \ + OPTIONS=" $OPTIONS " \ + ENTRY="${ENTRY}" +make check LOG_DIR=${LOG_DIR} diff --git a/templates/emgbAnnotatedBins.sh b/templates/emgbAnnotatedBins.sh new file mode 100644 index 00000000..de37d3be --- /dev/null +++ b/templates/emgbAnnotatedBins.sh @@ -0,0 +1,13 @@ + +BINS_DIR=bins + +binPrefix=$(find $BINS_DIR -name "*_bin.*.fa" -exec readlink -f {} \; \ + | tail -n 1 \ + | rev | cut -f 1 -d '/' \ + | rev | cut -d '.' -f 1 \ + | sed 's/$/./g') + +annotatedbins2json -checkm-tsv !{checkm} \ + -gtdbtk-tsvs !{gtdbtk} \ + -json-gz !{sample}.bins.json.gz \ + -bin-id-prefix ${binPrefix} diff --git a/templates/emgbAnnotatedContigs.sh b/templates/emgbAnnotatedContigs.sh new file mode 100644 index 00000000..6f688fd5 --- /dev/null +++ b/templates/emgbAnnotatedContigs.sh @@ -0,0 +1,5 @@ +annotatedcontigs2json -bins-dir bins \ + -sample-bam-files !{mapping} \ + -fasta !{contigs} \ + -json-gz !{sample}.contigs.json.gz \ + -sample-names !{sample} diff --git a/templates/emgbAnnotatedGenes.sh b/templates/emgbAnnotatedGenes.sh new file mode 100644 index 00000000..d0b5c550 --- /dev/null +++ b/templates/emgbAnnotatedGenes.sh @@ -0,0 +1,91 @@ +# Check developer documentation +# Download titles +if [ -z "!{TITLES_EXTRACTED_DB}" ] +then + TITLES_DATABASE=!{params.databases}/emgb_titles + TITLES_LOCK_FILE=${TITLES_DATABASE}/lock.txt + + if [ ! 
-z "!{TITLES_S3_EMGB_ACCESS}" ] + then + export AWS_ACCESS_KEY_ID=!{TITLES_S3_EMGB_ACCESS} + export AWS_SECRET_ACCESS_KEY=!{TITLES_S3_EMGB_SECRET} + fi + + echo "${TITLES_DATABASE}" + # Download checkm database if necessary + mkdir -p ${TITLES_DATABASE} + flock ${TITLES_LOCK_FILE} concurrentDownload.sh --output=${TITLES_DATABASE} \ + --link="!{TITLES_DOWNLOAD_LINK}" \ + --httpsCommand="wgetStatic --no-check-certificate -qO- !{TITLES_DOWNLOAD_LINK} | gunzip > titles.tsv" \ + --s3FileCommand="s5cmd !{TITLES_S5CMD_PARAMS} cat --concurrency !{task.cpus} !{TITLES_DOWNLOAD_LINK} | gunzip > titles.tsv " \ + --s3DirectoryCommand="s5cmd !{TITLES_S5CMD_PARAMS} cp --concurrency !{task.cpus} !{TITLES_DOWNLOAD_LINK} . && mv * titles.tsv " \ + --s5cmdAdditionalParams="!{TITLES_S5CMD_PARAMS}" \ + --localCommand="gunzip -c !{TITLES_DOWNLOAD_LINK} > ./titles.tsv" \ + --expectedMD5SUM=!{TITLES_MD5SUM} + TITLES_FILE="${TITLES_DATABASE}/out/titles.tsv" +else + TITLES_FILE="!{TITLES_EXTRACTED_DB}" +fi + + + +# Check developer documentation +# Download titles +if [ -z "!{KEGG_EXTRACTED_DB}" ] +then + KEGG_DATABASE=!{params.databases}/emgb_kegg + KEGG_LOCK_FILE=${KEGG_DATABASE}/lock.txt + + if [ ! -z "!{KEGG_S3_EMGB_ACCESS}" ] + then + export AWS_ACCESS_KEY_ID=!{KEGG_S3_EMGB_ACCESS} + export AWS_SECRET_ACCESS_KEY=!{KEGG_S3_EMGB_SECRET} + fi + + # Download checkm database if necessary + mkdir -p ${KEGG_DATABASE} + flock ${KEGG_LOCK_FILE} concurrentDownload.sh --output=${KEGG_DATABASE} \ + --link=!{KEGG_DOWNLOAD_LINK} \ + --httpsCommand="wgetStatic --no-check-certificate -qO- !{KEGG_DOWNLOAD_LINK} | zstd -T!{task.cpus} -d -c | tar --strip-components=1 -xv " \ + --s3FileCommand="s5cmd !{KEGG_S5CMD_PARAMS} cat --concurrency !{task.cpus} !{KEGG_DOWNLOAD_LINK} | zstd -T!{task.cpus} -d -c | tar --strip-components=1 -xv " \ + --s3DirectoryCommand="s5cmd !{KEGG_S5CMD_PARAMS} cp --concurrency !{task.cpus} !{KEGG_DOWNLOAD_LINK} . " \ + --s5cmdAdditionalParams="!{KEGG_S5CMD_PARAMS}" \ + --localCommand="zstd -T!{task.cpus} -c -d !{KEGG_DOWNLOAD_LINK} | tar --strip-components=1 -xv " \ + --expectedMD5SUM=!{KEGG_MD5SUM} + KEGG_DB=$(readlink -f ${KEGG_DATABASE}/out/) +else + KEGG_DB="!{KEGG_EXTRACTED_DB}" +fi + +BINS_DIR=bins + +nr=$(find blastResult/ -name "*.tsv" -exec readlink -f {} \; | sed 's/^/ --blast-tab /g') + +tax=$(find taxonomy/ -name "*.tsv" -exec readlink -f {} \; | sed 's/^/ -mmseqs-lineage /g') + + +ffn=$(find ffn/ -name "*.gz" -exec readlink -f {} \; | sed 's/^/ -ffn /g') + +faa=$(find faa/ -name "*.gz" -exec readlink -f {} \; | sed 's/^/ -faa /g') + +gff=$(find gff/ -name "*.gz" -exec readlink -f {} \; | sed 's/^/ -gff /g') + +kegg=$(find blastKeggResult/ -name "*.tsv" -exec readlink -f {} \; | sed 's/^/ -kegg-blast-tab /g') + +titles=" -title-tsv $TITLES_FILE " + +name=" -dataset-name !{sample} " + +db=" -db ${KEGG_DB} " + +annotatedgenes2json -bins-dir bins \ + ${nr} \ + ${tax} \ + ${ffn} \ + ${faa} \ + ${gff} \ + ${titles} \ + ${name} \ + ${db} \ + ${kegg} \ + -json-gz !{sample}.genes.json.gz