Merge pull request #33 from RIVM-bioinformatics/add_salm_col

Add context to salmonella serotypes
RIVM-bioinformatics · Feb 28, 2024 · ac04fde · ac04fde
2 parents 3a7cca8 + 2c255da
commit ac04fde
Show file tree

Hide file tree

Showing 8 changed files with 183 additions and 4 deletions.
diff --git a/bin/add_context_seqsero.py b/bin/add_context_seqsero.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python3
+
+import logging
+from pathlib import Path
+
+import pandas as pd
+
+
+def df_to_dict(df, column_name):
+    """
+    Build a lookup of context entries for one report column
+
+    Rows of the context table are first filtered on their "Column" field,
+    after which the "Value" field becomes the dictionary key.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Context table; must contain the columns "Column" and "Value"
+    column_name : str
+        Entry to select in the "Column" field of the context table,
+        e.g. "Predicted serotype"
+
+    Returns
+    -------
+    dict
+        Mapping of each "Value" entry to a dictionary with the other fields
+        of that row ("Column" included); empty if no row matches column_name
+
+    Raises
+    ------
+    ValueError
+        If the same "Value" is listed more than once for column_name
+    """
+    df = df[df["Column"] == column_name]
+    duplicated = df.loc[df["Value"].duplicated(), "Value"].unique().tolist()
+    if duplicated:
+        # to_dict(orient="index") rejects a non-unique index with a generic
+        # message; name the offending entries so the context file can be fixed
+        raise ValueError(
+            f"Duplicate context entries for {column_name}: {duplicated}"
+        )
+    dict_ = df.set_index("Value").to_dict(orient="index")
+    return dict_
+
+
+def add_context(df_context, value, col_name):
+    """
+    Look up the context note for a single report value
+
+    Parameters
+    ----------
+    df_context : pd.DataFrame
+        Dataframe with context for e.g. specific serotypes
+    value : str
+        Value to check, e.g. serotype name. Non-string scalars (such as a
+        number parsed by pandas) are compared on their text form.
+    col_name : str
+        Column name to check, e.g. "Predicted serotype"
+
+    Returns
+    -------
+    str or None
+        "<col_name>=<value>: <context>" if the context table has an entry
+        for this value, otherwise None
+    """
+    logging.info(f"Checking context for {col_name}={value}")
+    if pd.isna(value):
+        # An empty field in the report cannot carry any context
+        return None
+    # pandas type inference turns a purely numeric report value (e.g. a lone
+    # O antigen "4") into a number, which would never equal the text keys
+    # read from the context table; compare both sides as text instead
+    dict_context = {
+        str(key): row for key, row in df_to_dict(df_context, col_name).items()
+    }
+    context = None
+    lookup_key = str(value)
+    if lookup_key in dict_context:
+        logging.info(f"Found context for {col_name}={value}")
+        context_partial = dict_context[lookup_key]["Context"]
+        context = f"{col_name}={value}: {context_partial}"
+    return context
+
+
+def main(args):
+    """
+    Annotate a single-sample SeqSero report with context notes
+
+    Reads the tab-separated report and context table, looks up context for
+    the "O antigen prediction" and "Predicted serotype" columns, and writes
+    the report back out with an extra "RIVM-specific notes" column in which
+    the notes that were found are joined with "|".
+
+    Parameters
+    ----------
+    args : argparse.Namespace
+        Needs the attributes input, context and output (file paths)
+
+    Raises
+    ------
+    ValueError
+        If the report does not contain exactly one sample row
+    """
+    logging.info(f"Reading {args.input} and {args.context}")
+    df = pd.read_csv(args.input, sep="\t")
+    df_context = pd.read_csv(args.context, sep="\t")
+
+    logging.info("Check if this is a single sample report")
+    # An empty report must be rejected too: reading the first row below
+    # would otherwise fail with an unhelpful IndexError
+    if df.shape[0] != 1:
+        raise ValueError(
+            "This script only works for single sample reports, "
+            f"but {args.input} has {df.shape[0]} rows"
+        )
+
+    # Add context to the O antigen first, then to the serotype
+    notes = [
+        add_context(df_context, df[col_name].values[0], col_name)
+        for col_name in ("O antigen prediction", "Predicted serotype")
+    ]
+
+    # Combine all notes, leaving out columns without context
+    note_str = "|".join([note for note in notes if note is not None])
+    df["RIVM-specific notes"] = note_str
+
+    # Write to output
+    logging.info(f"Writing to {args.output}")
+    df.to_csv(args.output, sep="\t", index=False)
+
+
+if __name__ == "__main__":
+    import argparse
+
+    # The first positional parameter of ArgumentParser is `prog`, so the text
+    # has to be passed as `description` to keep the script name in the usage
+    # line and show the sentence in the help output instead
+    parser = argparse.ArgumentParser(description="Add context to SeqSero report")
+
+    parser.add_argument("-i", "--input", required=True, type=Path)
+    parser.add_argument("-o", "--output", required=True, type=Path)
+    parser.add_argument("-c", "--context", required=True, type=Path)
+    parser.add_argument("--verbose", action="store_true")
+
+    args = parser.parse_args()
+
+    # Logging is only configured on request; without --verbose the INFO
+    # messages emitted by this script are not shown
+    if args.verbose:
+        logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s")
+
+    main(args)
diff --git a/bin/download_dbs.py b/bin/download_dbs.py
@@ -225,7 +225,9 @@ def get_downloads_juno_typing(
     ):
         if self.update_dbs:
             if str(self.db_dir) == "/mnt/db/juno/typing_db":
-                raise ValueError(f"The databases on /mnt/db/juno/typing_db cannot be updated using the --update option. If you require an update, please contact the IDS-bioinformatics team.")
+                raise ValueError(
+                    f"The databases on /mnt/db/juno/typing_db cannot be updated using the --update option. If you require an update, please contact the IDS-bioinformatics team."
+                )
             else:
                 try:
                     rm_dir = subprocess.run(

diff --git a/bin/rules/serotype.smk b/bin/rules/serotype.smk
@@ -5,7 +5,7 @@
 
 def choose_serotyper(wildcards):
     if SAMPLES[wildcards.sample]["genus"] == "salmonella":
-        return [OUT + "/serotype/{sample}/SeqSero_result.tsv"]
+        return [OUT + "/serotype/{sample}/SeqSero_result_with_context.tsv"]
     elif (
         SAMPLES[wildcards.sample]["genus"] == "escherichia"
         or SAMPLES[wildcards.sample]["genus"] == "shigella"
@@ -75,6 +75,32 @@ rule salmonella_serotyper:
         """
 
 
+# Annotate the per-sample SeqSero report with notes from the context file
+rule add_context_salmonella_serotyper:
+    input:
+        seqsero=OUT + "/serotype/{sample}/SeqSero_result.tsv",
+    output:
+        seqsero=OUT + "/serotype/{sample}/SeqSero_result_with_context.tsv",
+    message:
+        "Adding context to salmonella serotype report for {wildcards.sample}"
+    log:
+        OUT + "/log/add_context_salmonella_serotyper/{sample}.log",
+    params:
+        seqsero_context=config["seqsero_context"],
+    threads: config["threads"]["other"]
+    resources:
+        mem_gb=config["mem_gb"]["other"],
+    conda:
+        "../../envs/python.yaml"
+    # Redirections are applied left to right: stdout has to point at the log
+    # before stderr is duplicated onto it, otherwise the messages that the
+    # script logs to stderr never reach the log file
+    shell:
+        """
+        python bin/add_context_seqsero.py \
+            --input {input.seqsero} \
+            --output {output.seqsero} \
+            --context {params.seqsero_context} \
+            --verbose > {log} 2>&1
+        """
+
+
 # -----------------------------------------------------------------------------#
 ### E. coli serotyper ###
 

diff --git a/bin/rules/serotype_multireports.smk b/bin/rules/serotype_multireports.smk
@@ -23,7 +23,7 @@ rule serotype_multireports:
             sample_subfolder="{params.output_dir}/${{subfolder}}"
             result_sample=$(find "${{sample_subfolder}}" \
                     -type f \
-                    -name "SeqSero_result.tsv" \
+                    -name "SeqSero_result_with_context.tsv" \
                     -o -name "result_serotype.csv" \
                     -o -name "command.txt" \
                     -o -name "shigatyper.csv" \

diff --git a/bin/serotyper_multireport.py b/bin/serotyper_multireport.py
@@ -205,7 +205,7 @@ def __classify_serotyper_result_files(self):
             "neisseriatyper": [],
         }
         for file_ in self.serotyper_result_files:
-            if file_.endswith("SeqSero_result.tsv"):
+            if file_.endswith("SeqSero_result_with_context.tsv"):
                 input_files["seqsero2"].append(file_)
             elif file_.endswith("result_serotype.csv"):
                 input_files["serotypefinder"].append(file_)

diff --git a/envs/python.yaml b/envs/python.yaml
@@ -0,0 +1,4 @@
+channels:
+- conda-forge
+dependencies:
+- pandas=2.1.*
diff --git a/files/SeqSero2_context.tsv b/files/SeqSero2_context.tsv
@@ -0,0 +1,38 @@
+Column	Value	Context	Source
+Predicted serotype	Virginia	Phenotypic confirmation of O6 antigen required	
+Predicted serotype	Muenchen	Phenotypic confirmation of O6 antigen required	
+Predicted serotype	Yovokome	Phenotypic confirmation of O6 antigen required	
+Predicted serotype	Manhattan	Phenotypic confirmation of O6 antigen required	
+Predicted serotype	Bardo	Phenotypic confirmation of O6 antigen required	
+Predicted serotype	Newport	Phenotypic confirmation of O6 antigen required	
+Predicted serotype	Ferruch	Phenotypic confirmation of O6 antigen required	
+Predicted serotype	Kottbus	Phenotypic confirmation of O6 antigen required	
+Predicted serotype	Bargny	Phenotypic confirmation of O6 antigen required	
+Predicted serotype	Takoradi	Phenotypic confirmation of O6 antigen required	
+Predicted serotype	Haardt	Phenotypic confirmation of O6 antigen required	
+Predicted serotype	Blockley	Phenotypic confirmation of O6 antigen required	
+Predicted serotype	Pakistan	Phenotypic confirmation of O6 antigen required	
+Predicted serotype	Litchfield	Phenotypic confirmation of O6 antigen required	
+Predicted serotype	Hindmarsh	Phenotypic confirmation of O6 antigen required	
+Predicted serotype	Bovismorbificans	Phenotypic confirmation of O6 antigen required	
+Predicted serotype	Brikama	Phenotypic confirmation of O6 antigen required	
+Predicted serotype	Goldcoast	Phenotypic confirmation of O6 antigen required	
+Predicted serotype	Albany	Phenotypic confirmation of O6 antigen required	
+Predicted serotype	Duesseldorf	Phenotypic confirmation of O6 antigen required	
+Predicted serotype	Paris	Phenotypic confirmation of O6 antigen required	
+Predicted serotype	Mapo	Phenotypic confirmation of O6 antigen required	
+Predicted serotype	Istanbul	Phenotypic confirmation of O6 antigen required	
+Predicted serotype	Hadar	Phenotypic confirmation of O6 antigen required	
+Predicted serotype	Chomedey	Phenotypic confirmation of O6 antigen required	
+Predicted serotype	Glostrup	Phenotypic confirmation of O6 antigen required	
+Predicted serotype	4,[5],12:i:-	Confirm with PCR according to SOP IDS_BAC_M321	Validation report
+Predicted serotype	Miami	Confirm with growth on Simmons citrate (should be +)	Validation report
+Predicted serotype	Sendai	Confirm with growth on Simmons citrate (should be -)	Validation report
+Predicted serotype	Typhimurium	Confirm with PCR according to SOP IDS_BAC_M321	Validation report
+Predicted serotype	Senftenberg	Phenotypic confirmation required	https://doi.org/10.1128%2FAEM.02265-19
+Predicted serotype	Dessau	Phenotypic confirmation required	https://doi.org/10.1128%2FAEM.02265-19
+Predicted serotype	Indiana	Phenotypic confirmation required	https://doi.org/10.1128%2FAEM.02265-19
+Predicted serotype	4,12:z:1,7	Phenotypic confirmation required	https://doi.org/10.1128%2FAEM.02265-19
+O antigen prediction	6,14	Confirm with O24/O25 antisera	Validation report
+O antigen prediction	3,10	Phenotypic confirmation required	
+O antigen prediction	1,3,19	Phenotypic confirmation required	
diff --git a/juno_typing.py b/juno_typing.py
@@ -119,6 +119,13 @@ def _add_args_to_parser(self) -> None:
             action="store_true",
             help="Force database update even if they are present.",
         )
+        self.add_argument(
+            "--seqsero_context",
+            type=Path,
+            metavar="FILE",
+            default="files/SeqSero2_context.tsv",
+            help="SeqSero context file which lists additional confirmation steps for O-antigen genes.",
+        )
 
     def _parse_args(self) -> argparse.Namespace:
         # Remove this if containers can be used with juno-typing
@@ -140,6 +147,7 @@ def _parse_args(self) -> argparse.Namespace:
             args.bordetella_vaccine_antigen_scheme_name
         )
         self.update_dbs: bool = args.update
+        self.seqsero_context: Path = args.seqsero_context
         return args
 
     def setup(self) -> None:
@@ -177,6 +185,7 @@ def setup(self) -> None:
                     "bordetella.fa",
                 )
             ),
+            "seqsero_context": str(self.seqsero_context),
         }
 
         with open(