Skip to content

Commit

Permalink
Merge pull request #33 from RIVM-bioinformatics/add_salm_col
Browse files Browse the repository at this point in the history
Add context to salmonella serotypes
  • Loading branch information
boasvdp authored Feb 28, 2024
2 parents 3a7cca8 + 2c255da commit ac04fde
Show file tree
Hide file tree
Showing 8 changed files with 183 additions and 4 deletions.
100 changes: 100 additions & 0 deletions bin/add_context_seqsero.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
#!/usr/bin/env python3

import logging
from pathlib import Path

import pandas as pd


def df_to_dict(df, column_name):
    """
    Build a lookup dictionary from the rows of a context table

    Parameters
    ----------
    df : pd.DataFrame
        Context table; must contain the columns "Column" and "Value"
    column_name : str
        Only rows whose "Column" field equals this name are kept

    Returns
    -------
    dict
        Maps each entry of the "Value" column (for the selected rows) to a
        dictionary holding the remaining columns of that row

    Notes
    -----
    pandas refuses ``to_dict(orient="index")`` on a non-unique index, so
    duplicate "Value" entries within one "Column" group raise a ValueError.
    """
    selected_rows = df.loc[df["Column"] == column_name]
    return selected_rows.set_index("Value").to_dict(orient="index")


def add_context(df_context, value, col_name):
    """
    Look up the context note belonging to one reported value

    Parameters
    ----------
    df_context : pd.DataFrame
        Context table, e.g. listing confirmation steps for specific serotypes
    value : str
        Reported value to look up, e.g. a serotype name
    col_name : str
        Report column the value came from, e.g. "Predicted serotype"

    Returns
    -------
    str or None
        "<col_name>=<value>: <context>" when the context table has an entry
        for this column/value combination, otherwise None
    """
    logging.info(f"Checking context for {col_name}={value}")
    lookup = df_to_dict(df_context, col_name)

    # Guard clause: nothing is known about this value
    if value not in lookup:
        return None

    logging.info(f"Found context for {col_name}={value}")
    detail = lookup[value]["Context"]
    return f"{col_name}={value}: {detail}"


def main(args):
    """
    Add RIVM-specific notes to a single-sample SeqSero report

    Parameters
    ----------
    args : argparse.Namespace
        Must provide ``input`` (tab-separated SeqSero report), ``context``
        (tab-separated context table) and ``output`` (path the annotated
        report is written to)

    Raises
    ------
    ValueError
        If the input report does not contain exactly one row
    """
    logging.info(f"Reading {args.input} and {args.context}")
    df = pd.read_csv(args.input, sep="\t")
    df_context = pd.read_csv(args.context, sep="\t")

    logging.info("Check if this is a single sample report")
    # Exactly one row is required: an empty report would otherwise crash
    # further down with an opaque IndexError on `.values[0]`.
    if df.shape[0] != 1:
        raise ValueError("This script only works for single sample reports")

    # Collect context for the O antigen first, then for the serotype
    notes = [
        add_context(df_context, df[column].values[0], column)
        for column in ("O antigen prediction", "Predicted serotype")
    ]

    # Combine all notes, skipping columns for which no context was found
    note_str = "|".join(note for note in notes if note is not None)
    df["RIVM-specific notes"] = note_str

    # Write to output
    logging.info(f"Writing to {args.output}")
    df.to_csv(args.output, sep="\t", index=False)


if __name__ == "__main__":
    import argparse

    # ArgumentParser's first positional parameter is `prog`; the sentence is a
    # description of the tool, so pass it by keyword to keep the usage line sane.
    parser = argparse.ArgumentParser(description="Add context to SeqSero report")

    parser.add_argument("-i", "--input", required=True, type=Path)
    parser.add_argument("-o", "--output", required=True, type=Path)
    parser.add_argument("-c", "--context", required=True, type=Path)
    parser.add_argument("--verbose", action="store_true")

    args = parser.parse_args()

    # Logging is only configured at INFO level when explicitly requested
    if args.verbose:
        logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s")

    main(args)
4 changes: 3 additions & 1 deletion bin/download_dbs.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,9 @@ def get_downloads_juno_typing(
):
if self.update_dbs:
if str(self.db_dir) == "/mnt/db/juno/typing_db":
raise ValueError(f"The databases on /mnt/db/juno/typing_db cannot be updated using the --update option. If you require an update, please contact the IDS-bioinformatics team.")
raise ValueError(
f"The databases on /mnt/db/juno/typing_db cannot be updated using the --update option. If you require an update, please contact the IDS-bioinformatics team."
)
else:
try:
rm_dir = subprocess.run(
Expand Down
28 changes: 27 additions & 1 deletion bin/rules/serotype.smk
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

def choose_serotyper(wildcards):
if SAMPLES[wildcards.sample]["genus"] == "salmonella":
return [OUT + "/serotype/{sample}/SeqSero_result.tsv"]
return [OUT + "/serotype/{sample}/SeqSero_result_with_context.tsv"]
elif (
SAMPLES[wildcards.sample]["genus"] == "escherichia"
or SAMPLES[wildcards.sample]["genus"] == "shigella"
Expand Down Expand Up @@ -75,6 +75,32 @@ rule salmonella_serotyper:
"""


# Annotate the per-sample SeqSero report with lab-specific confirmation notes
# taken from the context table configured under config["seqsero_context"].
rule add_context_salmonella_serotyper:
    input:
        seqsero=OUT + "/serotype/{sample}/SeqSero_result.tsv",
    output:
        seqsero=OUT + "/serotype/{sample}/SeqSero_result_with_context.tsv",
    message:
        "Adding context to salmonella serotype report for {wildcards.sample}"
    log:
        OUT + "/log/add_context_salmonella_serotyper/{sample}.log",
    params:
        seqsero_context=config["seqsero_context"],
    threads: config["threads"]["other"]
    resources:
        mem_gb=config["mem_gb"]["other"],
    conda:
        "../../envs/python.yaml"
    # Redirect stdout to the log first and only then duplicate stderr onto it:
    # the script logs via Python's logging module (stderr), which would bypass
    # the log file if `2>&1` came before the stdout redirection.
    shell:
        """
python bin/add_context_seqsero.py \
    --input {input.seqsero} \
    --output {output.seqsero} \
    --context {params.seqsero_context} \
    --verbose > {log} 2>&1
        """


# -----------------------------------------------------------------------------#
### E. coli serotyper ###

Expand Down
2 changes: 1 addition & 1 deletion bin/rules/serotype_multireports.smk
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ rule serotype_multireports:
sample_subfolder="{params.output_dir}/${{subfolder}}"
result_sample=$(find "${{sample_subfolder}}" \
-type f \
-name "SeqSero_result.tsv" \
-name "SeqSero_result_with_context.tsv" \
-o -name "result_serotype.csv" \
-o -name "command.txt" \
-o -name "shigatyper.csv" \
Expand Down
2 changes: 1 addition & 1 deletion bin/serotyper_multireport.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ def __classify_serotyper_result_files(self):
"neisseriatyper": [],
}
for file_ in self.serotyper_result_files:
if file_.endswith("SeqSero_result.tsv"):
if file_.endswith("SeqSero_result_with_context.tsv"):
input_files["seqsero2"].append(file_)
elif file_.endswith("result_serotype.csv"):
input_files["serotypefinder"].append(file_)
Expand Down
4 changes: 4 additions & 0 deletions envs/python.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
channels:
- conda-forge
dependencies:
- pandas=2.1.*
38 changes: 38 additions & 0 deletions files/SeqSero2_context.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
Column Value Context Source
Predicted serotype Virginia Phenotypic confirmation of O6 antigen required
Predicted serotype Muenchen Phenotypic confirmation of O6 antigen required
Predicted serotype Yovokome Phenotypic confirmation of O6 antigen required
Predicted serotype Manhattan Phenotypic confirmation of O6 antigen required
Predicted serotype Bardo Phenotypic confirmation of O6 antigen required
Predicted serotype Newport Phenotypic confirmation of O6 antigen required
Predicted serotype Ferruch Phenotypic confirmation of O6 antigen required
Predicted serotype Kottbus Phenotypic confirmation of O6 antigen required
Predicted serotype Bargny Phenotypic confirmation of O6 antigen required
Predicted serotype Takoradi Phenotypic confirmation of O6 antigen required
Predicted serotype Haardt Phenotypic confirmation of O6 antigen required
Predicted serotype Blockley Phenotypic confirmation of O6 antigen required
Predicted serotype Pakistan Phenotypic confirmation of O6 antigen required
Predicted serotype Litchfield Phenotypic confirmation of O6 antigen required
Predicted serotype Hindmarsh Phenotypic confirmation of O6 antigen required
Predicted serotype Bovismorbificans Phenotypic confirmation of O6 antigen required
Predicted serotype Brikama Phenotypic confirmation of O6 antigen required
Predicted serotype Goldcoast Phenotypic confirmation of O6 antigen required
Predicted serotype Albany Phenotypic confirmation of O6 antigen required
Predicted serotype Duesseldorf Phenotypic confirmation of O6 antigen required
Predicted serotype Paris Phenotypic confirmation of O6 antigen required
Predicted serotype Mapo Phenotypic confirmation of O6 antigen required
Predicted serotype Istanbul Phenotypic confirmation of O6 antigen required
Predicted serotype Hadar Phenotypic confirmation of O6 antigen required
Predicted serotype Chomedey Phenotypic confirmation of O6 antigen required
Predicted serotype Glostrup Phenotypic confirmation of O6 antigen required
Predicted serotype 4,[5],12:i:- Confirm with PCR according to SOP IDS_BAC_M321 Validation report
Predicted serotype Miami Confirm with growth on Simmons citrate (should be +) Validation report
Predicted serotype Sendai Confirm with growth on Simmons citrate (should be -) Validation report
Predicted serotype Typhimurium Confirm with PCR according to SOP IDS_BAC_M321 Validation report
Predicted serotype Senftenberg Phenotypic confirmation required https://doi.org/10.1128%2FAEM.02265-19
Predicted serotype Dessau Phenotypic confirmation required https://doi.org/10.1128%2FAEM.02265-19
Predicted serotype Indiana Phenotypic confirmation required https://doi.org/10.1128%2FAEM.02265-19
Predicted serotype 4,12:z:1,7 Phenotypic confirmation required https://doi.org/10.1128%2FAEM.02265-19
O antigen prediction 6,14 Confirm with O24/O25 antisera Validation report
O antigen prediction 3,10 Phenotypic confirmation required
O antigen prediction 1,3,19 Phenotypic confirmation required
9 changes: 9 additions & 0 deletions juno_typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,13 @@ def _add_args_to_parser(self) -> None:
action="store_true",
help="Force database update even if they are present.",
)
self.add_argument(
"--seqsero_context",
type=Path,
metavar="FILE",
default="files/SeqSero2_context.tsv",
help="SeqSero context file which lists additional confirmation steps for O-antigen genes.",
)

def _parse_args(self) -> argparse.Namespace:
# Remove this if containers can be used with juno-typing
Expand All @@ -140,6 +147,7 @@ def _parse_args(self) -> argparse.Namespace:
args.bordetella_vaccine_antigen_scheme_name
)
self.update_dbs: bool = args.update
self.seqsero_context: Path = args.seqsero_context
return args

def setup(self) -> None:
Expand Down Expand Up @@ -177,6 +185,7 @@ def setup(self) -> None:
"bordetella.fa",
)
),
"seqsero_context": str(self.seqsero_context),
}

with open(
Expand Down

0 comments on commit ac04fde

Please sign in to comment.