Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add rnaseq input #174

Open
wants to merge 8 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions assets/schema_input.json
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@
"pattern": "^(I|II|H-2)$",
"errorMessage": "The MHC class must be provided. Valid values: "
},
"rnaseq": {
"type": "string",
"pattern": "^\\S+\\.(tsv)$",
"errorMessage": "RNAseq analysis results must have one of the following extensions: ''.tsv''"
},
"filename": {
"type": "string",
"pattern": "^\\S+\\.(vcf|tsv|fasta|fa|txt)$",
Expand Down
29 changes: 22 additions & 7 deletions bin/check_samplesheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,10 +229,17 @@ def check_samplesheet(file_in, file_out):
GBM_1,gbm_1_alleles.txt,I,gbm_1_anno.vcf|gbm_1_peps.tsv|gbm_1_prot.fasta
GBM_2,gbm_2_alleles.txt,I,gbm_2_anno.vcf|gbm_2_peps.tsv|gbm_2_prot.fasta

or with optional column(s)

sample,alleles,mhc_class,expression,filename
GBM_1,gbm_1_alleles.txt,I,expression_values_gbm1.tsv,gbm_1_anno.vcf|gbm_1_peps.tsv|gbm_1_prot.fasta
GBM_2,gbm_2_alleles.txt,I,expression_values_gbm2.tsv,gbm_2_anno.vcf|gbm_2_peps.tsv|gbm_2_prot.fasta


where the FileName column contains EITHER a vcf/tsv file with genomic variants, a tsv file (peptides), or a fasta file (proteins)
and the Alleles column contains EITHER a string of alleles separated by semicolon or the path to a text file
containing one allele per line (no header)
containing one allele per line (no header). The optional expression column contains a tsv file with expression values (on transcript or
gene level) as generated by the rnaseq pipeline.

Further examples:
- Class2 allele format => https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/alleles/alleles.DRB1_01_01.txt
Expand All @@ -246,12 +253,16 @@ def check_samplesheet(file_in, file_out):
with open(file_in, "r") as fin:

## Check header
COL_NUM = 4
MIN_COL = 4 # expression optional
HEADER = ["sample", "alleles", "mhc_class", "filename"]
header = [x.strip('"') for x in fin.readline().strip().split(",")]
expression_available = "expression" in header
valid_classes = "I,II,H-2"
valid_class1_loci = ['A*','B*','C*','E*','G*']
valid_class2_loci = ['DR','DP','DQ']

if expression_available:
HEADER.insert(3, "expression")
if header[: len(HEADER)] != HEADER:
print("ERROR: Please check samplesheet header -> {} != {}".format("\t".join(header), "\t".join(HEADER)))
sys.exit(1)
Expand All @@ -267,15 +278,20 @@ def check_samplesheet(file_in, file_out):
line,
)
num_cols = len([x for x in lspl if x])
if num_cols != COL_NUM:
if num_cols < MIN_COL:
print_error(
"Invalid number of populated columns (valid = {})!".format(COL_NUM),
"Invalid number of populated columns (valid >= {})!".format(MIN_COL),
"Line",
line,
)

## Check sample name entries
sample, alleles, mhcclass, filename = lspl[: len(HEADER)]
if expression_available:
sample, alleles, mhcclass, expression, filename = lspl[: len(HEADER)]
sample_info = [sample, alleles, mhcclass, expression, filename]
else:
sample, alleles, mhcclass, filename = lspl[: len(HEADER)]
sample_info = [sample, alleles, mhcclass, "", filename]
jonasscheid marked this conversation as resolved.
Show resolved Hide resolved

## Check given file types
if not filename.lower().endswith((".vcf", ".vcf.gz", ".tsv", ".GSvar", ".fasta", ".txt")):
Expand All @@ -289,7 +305,6 @@ def check_samplesheet(file_in, file_out):
if not os.path.isfile(alleles) and mhcclass == 'I' and any(substring in alleles for substring in valid_class2_loci) or mhcclass == 'II' and any(substring in alleles for substring in valid_class1_loci):
print_error("Samplesheet contains invalid mhc class and allele combination!", "Line", line)

sample_info = [sample, alleles, mhcclass, filename]
## Create sample mapping dictionary
if sample not in sample_run_dict:
sample_run_dict[sample] = [sample_info]
Expand All @@ -304,7 +319,7 @@ def check_samplesheet(file_in, file_out):
out_dir = os.path.dirname(file_out)
make_dir(out_dir)
with open(file_out, "w") as fout:
fout.write(",".join(["sample", "alleles","mhc_class","filename"]) + "\n")
fout.write(",".join(["sample", "alleles","mhc_class", "expression", "filename"]) + "\n")

for sample in sorted(sample_run_dict.keys()):
for val in sample_run_dict[sample]:
Expand Down
67 changes: 37 additions & 30 deletions bin/epaa.py
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,9 @@ def read_protein_quant(filename):
intensities[p.split('|')[1]] = valuedict
return intensities

# parse rnaseq analysis results
# data frame: gene/transcript -> count/TPM
#def read_diff_expression_values(filename):

# parse different expression analysis results (DESeq2), link log2fold changes to transcripts/genes
def read_diff_expression_values(filename):
Expand Down Expand Up @@ -486,12 +489,6 @@ def create_mutationsyntax_genome_column_value(pep):
return ','.join(set([y.cdsMutationSyntax for y in syntaxes]))


def create_variationfilelinenumber_column_value(pep):
v = [x.vars.values() for x in pep.get_all_transcripts()]
vf = list(itertools.chain.from_iterable(v))
return ','.join([str(int(y.id.replace('line', ''))+1) for y in vf])


def create_gene_column_value(pep):
transcript_ids = [x.transcript_id for x in set(pep.get_all_transcripts())]
variants = []
Expand Down Expand Up @@ -877,7 +874,7 @@ def make_predictions_from_variants(variants_all, methods, tool_thresholds, use_a
df['length'] = df['sequence'].map(len)
df['chr'] = df['sequence'].map(create_variant_chr_column_value)
df['pos'] = df['sequence'].map(create_variant_pos_column_value)
df['gene'] = df['sequence'].map(create_gene_column_value)
df['gene_id'] = df['sequence'].map(create_gene_column_value)
df['transcripts'] = df['sequence'].map(create_transcript_column_value)
df['proteins'] = df['sequence'].map(create_protein_column_value)
df['variant type'] = df['sequence'].map(
Expand Down Expand Up @@ -1049,8 +1046,8 @@ def __main__():
help="List of gene IDs for ID mapping.", required=False)
parser.add_argument('-pq', "--protein_quantification",
help="File with protein quantification values")
parser.add_argument('-ge', "--gene_expression",
help="File with expression analysis results")
parser.add_argument('-ge', "--expression",
help="File with rnaseq analysis results", required=False)
parser.add_argument('-de', "--diff_gene_expression",
help="File with differential expression analysis results (DESeq2)")
parser.add_argument('-li', "--ligandomics_id",
Expand Down Expand Up @@ -1154,7 +1151,6 @@ def __main__():
try:
complete_df = pd.concat(pred_dataframes, sort=True)
# replace method names with method names with version
# complete_df.replace({'method': methods}, inplace=True)
complete_df['method'] = complete_df['method'].apply(
lambda x: x.lower() + '-' + methods[x.lower()])
predictions_available = True
Expand All @@ -1163,16 +1159,25 @@ def __main__():
predictions_available = False
logger.error("No predictions available.")

complete_df.replace("gene", "gene_id")

# get gene names from Ensembl and add them to the data frame
# we want to add gene names to our data frame in order to make the mapping easier
# we will use this when the next epytope release is ready where we already implemented the functionality
#mapping_gene_names_ids = ma.get_gene_name_from_id(complete_df['gene_id'].unique.to_list())
#mapping_gene_names_ids.columns = ["gene_name", "gene_id"]
#complete_df = complete_df.merge(mapping_gene_names_ids,on='gene_id',how="left")

# include wild type sequences to dataframe if specified
if args.wild_type:
wt_sequences = generate_wt_seqs(all_peptides_filtered)
complete_df['wt sequence'] = complete_df.apply(
lambda row: create_wt_seq_column_value(row, wt_sequences), axis=1)
columns_tiles = ['sequence', 'wt sequence', 'length', 'chr', 'pos',
'gene', 'transcripts', 'proteins', 'variant type', 'method']
'gene_id', 'transcripts', 'proteins', 'variant type', 'method']
# Change the order (the index) of the columns
else:
columns_tiles = ['sequence', 'length', 'chr', 'pos', 'gene',
columns_tiles = ['sequence', 'length', 'chr', 'pos', 'gene_id',
'transcripts', 'proteins', 'variant type', 'method']
for c in complete_df.columns:
if c not in columns_tiles:
Expand Down Expand Up @@ -1206,24 +1211,26 @@ def __main__():
for k in first_entry.keys():
complete_df['{} log2 protein LFQ intensity'.format(k)] = complete_df.apply(
lambda row: create_quant_column_value_for_result(row, protein_quant, transcriptSwissProtMap, k), axis=1)
# parse (differential) expression analysis results, annotate features (genes/transcripts)
if args.gene_expression is not None:
fold_changes = read_diff_expression_values(args.gene_expression)
gene_id_lengths = {}
col_name = 'RNA expression (RPKM)'

with open(args.gene_reference, 'r') as gene_list:
for l in gene_list:
ids = l.split('\t')
gene_id_in_df = complete_df.iloc[1]['gene']
if 'ENSG' in gene_id_in_df:
gene_id_lengths[ids[0]] = float(ids[2].strip())
else:
gene_id_lengths[ids[1]] = float(ids[2].strip())
deseq = False
# add column to result dataframe
complete_df[col_name] = complete_df.apply(lambda row: create_expression_column_value_for_result(
row, fold_changes, deseq, gene_id_lengths), axis=1)
# parse expression (nf-core/rnaseq) analysis results, annotate features (genes/transcripts)
if args.expression is not None:
rnaseq_results = pd.read_csv(args.expression, sep='\t', header=0)

measure = "count" if "count" in args.expression else "TPM"
jonasscheid marked this conversation as resolved.
Show resolved Hide resolved
transcript_features = "tx" in rnaseq_results.columns
#merge_on = "gene_name"
merge_on = "gene_id"

# we expect columns: tx gene_id samples
if transcript_features:
rnaseq_results.columns = ["{}{}".format(c, "" if c in ["tx", "gene_id"] else f"_{'transcript'}_{measure}") for c in rnaseq_results.columns]
merge_on = "tx"
# we expect columns: gene_id gene_name samples
else:
rnaseq_results.columns = ["{}{}".format(c, "" if c in ["gene_name", "gene_id"] else f"_{'gene'}_{measure}") for c in rnaseq_results.columns]

# add sample-specific expression values to data frame
complete_df = complete_df.merge(rnaseq_results,on=merge_on,how="left")

if args.diff_gene_expression is not None:
gene_id_lengths = {}
fold_changes = read_diff_expression_values(args.diff_gene_expression)
Expand Down
5 changes: 5 additions & 0 deletions modules/local/epytope_peptide_prediction.nf
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ process EPYTOPE_PEPTIDE_PREDICTION {
input:
tuple val(meta), path(splitted), path(software_versions)
val netmhc_paths
path(expression)
jonasscheid marked this conversation as resolved.
Show resolved Hide resolved

output:
tuple val(meta), path("*.json"), emit: json
Expand Down Expand Up @@ -41,6 +42,10 @@ process EPYTOPE_PEPTIDE_PREDICTION {
argument = "--use_affinity_thresholds " + argument
}

if (expression) {
argument = "--expression ${expression} " + argument
}

def netmhc_paths_string = netmhc_paths.join(",")
def tools_split = params.tools.split(',')
def class1_tools = tools_split.findAll { ! it.matches('.*(?i)(class-2|ii).*') }
Expand Down
6 changes: 4 additions & 2 deletions subworkflows/local/input_check.nf
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,14 @@ def get_samplesheet_paths(LinkedHashMap row) {
meta.alleles = allele_string
meta.mhcclass = row.mhc_class
meta.inputtype = type
expression = row.expression ? file(row.expression, checkIfExists: true) : []

def array = []
if (!file(row.filename).exists()) {
exit 1, "ERROR: Please check input samplesheet -> file does not exist!\n${row.Filename}"
} else {
array = [ meta, file(row.filename) ]
}
else {
array = [meta, expression, file(row.filename)]
}
return array
}
Expand Down
14 changes: 10 additions & 4 deletions workflows/epitopeprediction.nf
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ workflow EPITOPEPREDICTION {

INPUT_CHECK.out.reads
.branch {
meta_data, input_file ->
meta_data, expression, input_file ->
variant_compressed : meta_data.inputtype == 'variant_compressed'
return [ meta_data, input_file ]
variant_uncompressed : meta_data.inputtype == 'variant'
Expand All @@ -124,6 +124,9 @@ workflow EPITOPEPREDICTION {
}
.set { ch_samples_from_sheet }

ch_expression = INPUT_CHECK.out.reads
.map { meta_data, expression, input_file -> expression }

// gunzip variant files
GUNZIP_VCF (
ch_samples_from_sheet.variant_compressed
Expand Down Expand Up @@ -342,7 +345,8 @@ workflow EPITOPEPREDICTION {
.splitted
.combine( ch_prediction_tool_versions )
.transpose(),
EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([])
EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([]),
ch_expression
)

// Run epitope prediction for peptides
Expand All @@ -352,7 +356,8 @@ workflow EPITOPEPREDICTION {
.splitted
.combine( ch_prediction_tool_versions )
.transpose(),
EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([])
EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([]),
ch_expression
)

// Run epitope prediction for variants
Expand All @@ -363,7 +368,8 @@ workflow EPITOPEPREDICTION {
.mix( ch_split_variants.splitted )
.combine( ch_prediction_tool_versions )
.transpose(),
EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([])
EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([]),
ch_expression
)

// collect prediction script versions
Expand Down