nf-core · christopher-mohr · Aug 25, 2022 · Aug 25, 2022 · Aug 25, 2022 · Aug 25, 2022
diff --git a/assets/schema_input.json b/assets/schema_input.json
@@ -30,6 +30,11 @@
                 "pattern": "^(I|II|H-2)$",
                 "errorMessage": "The MHC class must be provided. Valid values: "
             },
+            "rnaseq": {
+                "type": "string",
+                "pattern": "^\\S+\\.(tsv)$",
+                "errorMessage": "RNAseq analysis results must have one of the following extensions:  ''.tsv''"
+            },
             "filename": {
                 "type": "string",
                 "pattern": "^\\S+\\.(vcf|tsv|fasta|fa|txt)$",

diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
@@ -229,10 +229,17 @@ def check_samplesheet(file_in, file_out):
     GBM_1,gbm_1_alleles.txt,I,gbm_1_anno.vcf|gbm_1_peps.tsv|gbm_1_prot.fasta
     GBM_2,gbm_2_alleles.txt,I,gbm_2_anno.vcf|gbm_2_peps.tsv|gbm_2_prot.fasta
 
+    or with optional column(s)
+
+    sample,alleles,mhc_class,expression,filename
+    GBM_1,gbm_1_alleles.txt,I,expression_values_gbm1.tsv,gbm_1_anno.vcf|gbm_1_peps.tsv|gbm_1_prot.fasta
+    GBM_2,gbm_2_alleles.txt,I,expression_values_gbm2.tsv,gbm_2_anno.vcf|gbm_2_peps.tsv|gbm_2_prot.fasta
+
 
     where the FileName column contains EITHER a vcf/tsv file with genomic variants, a tsv file (peptides), or a fasta file (proteins)
     and the Alleles column contains EITHER a string of alleles separated by semicolon or the path to a text file
-    containing one allele per line (no header)
+    containing one allele per line (no header). The optional expression column contains a tsv file with expression values (on transcript or
+    gene level) as generated by the rnaseq pipeline.
 
     Further examples:
     - Class2 allele format => https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/alleles/alleles.DRB1_01_01.txt
@@ -246,12 +253,16 @@ def check_samplesheet(file_in, file_out):
     with open(file_in, "r") as fin:
 
         ## Check header
-        COL_NUM = 4
+        MIN_COL = 4 # expression optional
         HEADER = ["sample", "alleles", "mhc_class", "filename"]
         header = [x.strip('"') for x in fin.readline().strip().split(",")]
+        expression_available = "expression" in header
         valid_classes = "I,II,H-2"
         valid_class1_loci = ['A*','B*','C*','E*','G*']
         valid_class2_loci = ['DR','DP','DQ']
+
+        if expression_available:
+            HEADER.insert(3, "expression")
         if header[: len(HEADER)] != HEADER:
             print("ERROR: Please check samplesheet header -> {} != {}".format("\t".join(header), "\t".join(HEADER)))
             sys.exit(1)
@@ -267,15 +278,20 @@ def check_samplesheet(file_in, file_out):
                     line,
                 )
             num_cols = len([x for x in lspl if x])
-            if num_cols != COL_NUM:
+            if num_cols < MIN_COL:
                 print_error(
-                    "Invalid number of populated columns (valid = {})!".format(COL_NUM),
+                    "Invalid number of populated columns (valid >= {})!".format(MIN_COL),
                     "Line",
                     line,
                 )
 
             ## Check sample name entries
-            sample, alleles, mhcclass, filename = lspl[: len(HEADER)]
+            if expression_available:
+                sample, alleles, mhcclass, expression, filename = lspl[: len(HEADER)]
+                sample_info = [sample, alleles, mhcclass, expression, filename]
+            else:
+                sample, alleles, mhcclass, filename = lspl[: len(HEADER)]
+                sample_info = [sample, alleles, mhcclass, "", filename]
 
             ## Check given file types
             if not filename.lower().endswith((".vcf", ".vcf.gz", ".tsv", ".GSvar", ".fasta", ".txt")):
@@ -289,7 +305,6 @@ def check_samplesheet(file_in, file_out):
             if  not os.path.isfile(alleles) and mhcclass == 'I' and any(substring in alleles for substring in valid_class2_loci) or mhcclass == 'II' and any(substring in alleles for substring in valid_class1_loci):
                 print_error("Samplesheet contains invalid mhc class and allele combination!", "Line", line)
 
-            sample_info = [sample, alleles, mhcclass, filename]
             ## Create sample mapping dictionary
             if sample not in sample_run_dict:
                 sample_run_dict[sample] = [sample_info]
@@ -304,7 +319,7 @@ def check_samplesheet(file_in, file_out):
         out_dir = os.path.dirname(file_out)
         make_dir(out_dir)
         with open(file_out, "w") as fout:
-            fout.write(",".join(["sample", "alleles","mhc_class","filename"]) + "\n")
+            fout.write(",".join(["sample", "alleles","mhc_class", "expression", "filename"]) + "\n")
 
             for sample in sorted(sample_run_dict.keys()):
                 for val in sample_run_dict[sample]:

diff --git a/bin/epaa.py b/bin/epaa.py
@@ -419,6 +419,9 @@ def read_protein_quant(filename):
                         intensities[p.split('|')[1]] = valuedict
     return intensities
 
+# parse rnaseq analysis results
+# data frame: gene/transcript -> count/TPM
+#def read_diff_expression_values(filename):
 
 # parse different expression analysis results (DESeq2), link log2fold changes to transcripts/genes
 def read_diff_expression_values(filename):
@@ -486,12 +489,6 @@ def create_mutationsyntax_genome_column_value(pep):
     return ','.join(set([y.cdsMutationSyntax for y in syntaxes]))
 
 
-def create_variationfilelinenumber_column_value(pep):
-    v = [x.vars.values() for x in pep.get_all_transcripts()]
-    vf = list(itertools.chain.from_iterable(v))
-    return ','.join([str(int(y.id.replace('line', ''))+1) for y in vf])
-
-
 def create_gene_column_value(pep):
     transcript_ids = [x.transcript_id for x in set(pep.get_all_transcripts())]
     variants = []
@@ -877,7 +874,7 @@ def make_predictions_from_variants(variants_all, methods, tool_thresholds, use_a
         df['length'] = df['sequence'].map(len)
         df['chr'] = df['sequence'].map(create_variant_chr_column_value)
         df['pos'] = df['sequence'].map(create_variant_pos_column_value)
-        df['gene'] = df['sequence'].map(create_gene_column_value)
+        df['gene_id'] = df['sequence'].map(create_gene_column_value)
         df['transcripts'] = df['sequence'].map(create_transcript_column_value)
         df['proteins'] = df['sequence'].map(create_protein_column_value)
         df['variant type'] = df['sequence'].map(
@@ -1049,8 +1046,8 @@ def __main__():
                         help="List of gene IDs for ID mapping.", required=False)
     parser.add_argument('-pq', "--protein_quantification",
                         help="File with protein quantification values")
-    parser.add_argument('-ge', "--gene_expression",
-                        help="File with expression analysis results")
+    parser.add_argument('-ge', "--expression",
+                        help="File with rnaseq analysis results", required=False)
     parser.add_argument('-de', "--diff_gene_expression",
                         help="File with differential expression analysis results (DESeq2)")
     parser.add_argument('-li', "--ligandomics_id",
@@ -1154,7 +1151,6 @@ def __main__():
     try:
         complete_df = pd.concat(pred_dataframes, sort=True)
         # replace method names with method names with version
-        # complete_df.replace({'method': methods}, inplace=True)
         complete_df['method'] = complete_df['method'].apply(
             lambda x: x.lower() + '-' + methods[x.lower()])
         predictions_available = True
@@ -1163,16 +1159,25 @@ def __main__():
         predictions_available = False
         logger.error("No predictions available.")
 
+    complete_df.replace("gene", "gene_id")
+
+    # get gene names from Ensembl and add them to the data frame
+    # we want to add gene names to our data frame in order to make the mapping easier
+    # we will use this when the next epytope release is ready where we already implemented the functionality
+    #mapping_gene_names_ids = ma.get_gene_name_from_id(complete_df['gene_id'].unique.to_list())
+    #mapping_gene_names_ids.columns = ["gene_name", "gene_id"]
+    #complete_df = complete_df.merge(mapping_gene_names_ids,on='gene_id',how="left")
+
     # include wild type sequences to dataframe if specified
     if args.wild_type:
         wt_sequences = generate_wt_seqs(all_peptides_filtered)
         complete_df['wt sequence'] = complete_df.apply(
             lambda row: create_wt_seq_column_value(row, wt_sequences), axis=1)
         columns_tiles = ['sequence', 'wt sequence', 'length', 'chr', 'pos',
-                         'gene', 'transcripts', 'proteins', 'variant type', 'method']
+                         'gene_id', 'transcripts', 'proteins', 'variant type', 'method']
     # Change the order (the index) of the columns
     else:
-        columns_tiles = ['sequence', 'length', 'chr', 'pos', 'gene',
+        columns_tiles = ['sequence', 'length', 'chr', 'pos', 'gene_id',
                          'transcripts', 'proteins', 'variant type', 'method']
     for c in complete_df.columns:
         if c not in columns_tiles:
@@ -1206,24 +1211,26 @@ def __main__():
         for k in first_entry.keys():
             complete_df['{} log2 protein LFQ intensity'.format(k)] = complete_df.apply(
                 lambda row: create_quant_column_value_for_result(row, protein_quant, transcriptSwissProtMap, k), axis=1)
-    # parse (differential) expression analysis results, annotate features (genes/transcripts)
-    if args.gene_expression is not None:
-        fold_changes = read_diff_expression_values(args.gene_expression)
-        gene_id_lengths = {}
-        col_name = 'RNA expression (RPKM)'
-
-        with open(args.gene_reference, 'r') as gene_list:
-            for l in gene_list:
-                ids = l.split('\t')
-                gene_id_in_df = complete_df.iloc[1]['gene']
-                if 'ENSG' in gene_id_in_df:
-                    gene_id_lengths[ids[0]] = float(ids[2].strip())
-                else:
-                    gene_id_lengths[ids[1]] = float(ids[2].strip())
-        deseq = False
-        # add column to result dataframe
-        complete_df[col_name] = complete_df.apply(lambda row: create_expression_column_value_for_result(
-            row, fold_changes, deseq, gene_id_lengths), axis=1)
+    # parse expression (nf-core/rnaseq) analysis results, annotate features (genes/transcripts)
+    if args.expression is not None:
+        rnaseq_results = pd.read_csv(args.expression, sep='\t', header=0)
+
+        measure = "count" if "count" in args.expression else "TPM"
+        transcript_features = "tx" in rnaseq_results.columns
+        #merge_on = "gene_name"
+        merge_on = "gene_id"
+
+        # we expect columns: tx gene_id samples
+        if transcript_features:
+            rnaseq_results.columns = ["{}{}".format(c, "" if c in ["tx", "gene_id"] else f"_{'transcript'}_{measure}") for c in rnaseq_results.columns]
+            merge_on = "tx"
+        # we expect columns: gene_id gene_name samples
+        else:
+            rnaseq_results.columns = ["{}{}".format(c, "" if c in ["gene_name", "gene_id"] else f"_{'gene'}_{measure}") for c in rnaseq_results.columns]
+
+        # add sample-specific expression values to data frame
+        complete_df = complete_df.merge(rnaseq_results,on=merge_on,how="left")
+
     if args.diff_gene_expression is not None:
         gene_id_lengths = {}
         fold_changes = read_diff_expression_values(args.diff_gene_expression)

diff --git a/modules/local/epytope_peptide_prediction.nf b/modules/local/epytope_peptide_prediction.nf
@@ -9,6 +9,7 @@ process EPYTOPE_PEPTIDE_PREDICTION {
     input:
     tuple val(meta), path(splitted), path(software_versions)
     val netmhc_paths
+    path(expression)
 
     output:
     tuple val(meta), path("*.json"), emit: json
@@ -41,6 +42,10 @@ process EPYTOPE_PEPTIDE_PREDICTION {
         argument = "--use_affinity_thresholds " + argument
     }
 
+    if (expression) {
+        argument = "--expression ${expression} " + argument
+    }
+
     def netmhc_paths_string = netmhc_paths.join(",")
     def tools_split = params.tools.split(',')
     def class1_tools = tools_split.findAll { ! it.matches('.*(?i)(class-2|ii).*') }

diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
@@ -30,12 +30,14 @@ def get_samplesheet_paths(LinkedHashMap row) {
     meta.alleles        = allele_string
     meta.mhcclass       = row.mhc_class
     meta.inputtype      = type
+    expression = row.expression ? file(row.expression, checkIfExists: true) : []
 
     def array = []
     if (!file(row.filename).exists()) {
         exit 1, "ERROR: Please check input samplesheet -> file does not exist!\n${row.Filename}"
-    } else {
-        array = [ meta, file(row.filename) ]
+    }
+    else {
+        array = [meta, expression, file(row.filename)]
     }
     return array
 }

diff --git a/workflows/epitopeprediction.nf b/workflows/epitopeprediction.nf
@@ -112,7 +112,7 @@ workflow EPITOPEPREDICTION {
 
     INPUT_CHECK.out.reads
                 .branch {
-                    meta_data, input_file ->
+                    meta_data, expression, input_file ->
                         variant_compressed : meta_data.inputtype == 'variant_compressed'
                             return [ meta_data, input_file ]
                         variant_uncompressed :  meta_data.inputtype == 'variant'
@@ -124,6 +124,9 @@ workflow EPITOPEPREDICTION {
                     }
                 .set { ch_samples_from_sheet }
 
+    ch_expression = INPUT_CHECK.out.reads
+                        .map { meta_data, expression, input_file -> expression }
+
     // gunzip variant files
     GUNZIP_VCF (
         ch_samples_from_sheet.variant_compressed
@@ -342,7 +345,8 @@ workflow EPITOPEPREDICTION {
             .splitted
             .combine( ch_prediction_tool_versions )
             .transpose(),
-            EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([])
+            EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([]),
+            ch_expression
     )
 
     // Run epitope prediction for peptides
@@ -352,7 +356,8 @@ workflow EPITOPEPREDICTION {
             .splitted
             .combine( ch_prediction_tool_versions )
             .transpose(),
-            EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([])
+            EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([]),
+            ch_expression
     )
 
     // Run epitope prediction for variants
@@ -363,7 +368,8 @@ workflow EPITOPEPREDICTION {
             .mix( ch_split_variants.splitted )
             .combine( ch_prediction_tool_versions )
             .transpose(),
-            EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([])
+            EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([]),
+            ch_expression
     )
 
     // collect prediction script versions