Merge pull request #13 from RIVM-bioinformatics/fix_localrule

Fix localrule
RIVM-bioinformatics · Jun 3, 2024 · def3c64
2 parents 79b041e + 7872ba3
commit def3c64
Show file tree

Hide file tree

Showing 6 changed files with 18 additions and 20 deletions.
diff --git a/Snakefile b/Snakefile
@@ -31,7 +31,7 @@ localrules:
     copy_ref_gff,
     aggregate_species,
     no_typing,
-    cauris_extract_amr_mutations,
+    cauris_extract_aa_mutations,
     combine_auriclas,
 
 

diff --git a/tests/test_amr_mutation_parsing.py b/tests/test_amr_mutation_parsing.py
@@ -160,14 +160,14 @@ def test_combine_exact_matches_and_possible_cnvs(self):
 
 class TestAaComparison(unittest.TestCase):
     df_resistance_genes_correct = pd.read_csv(
-        "tests/test_files/df_resistance_genes_correct.tsv", sep="\t"
+        "tests/test_files/df_resistance_genes_correct.tsv", sep="\t", dtype={"AF": str}
     )
 
     def test_read_input_file(self):
         df_mutations_test_read_input_correct = pd.read_csv(
             "tests/test_files/df_mutations_test_read_input_correct.tsv",
             sep="\t",
-            dtype={"AF": float},
+            dtype={"AF": str},
         )
         df_mutations_test_read_input = read_input_file(
             Path("tests/test_files/df_mutations_test_read_input.tsv")
@@ -180,12 +180,6 @@ def test_read_input_file(self):
         df_mutations_test_read_input_correct[
             "type"
         ] = df_mutations_test_read_input_correct["type"].fillna("NA")
-        df_mutations_test_read_input.to_csv(
-            "tests/test_files/inspect1.tsv", sep="\t", index=False
-        )
-        df_mutations_test_read_input_correct.to_csv(
-            "tests/test_files/inspect2.tsv", sep="\t", index=False
-        )
         self.assertTrue(
             df_mutations_test_read_input.equals(df_mutations_test_read_input_correct)
         )
@@ -201,13 +195,11 @@ def test_create_locus_tag_gene_dict(self):
         )
 
     def test_filter_for_resistance_genes(self):
-        # df_resistance_genes_correct = pd.read_csv(
-        #     "tests/test_files/df_resistance_genes_correct.tsv", sep="\t"
-        # )
         df_mutations_parsed = read_input_file(
             Path("tests/test_files/df_mutations_test_read_input.tsv")
         )
         df_resistance_genes_correct_copy = self.df_resistance_genes_correct.copy()
+
         df_resistance_genes = filter_for_resistance_genes(
             df_mutations=df_mutations_parsed,
             dict_locus_tag_gene={"b0001": "gene A"},
@@ -225,12 +217,15 @@ def test_merge_resistance_genes_with_ref(self):
             resistance_variants_csv=df_aa_resistance_variants,
         )
         df_resistance_with_impact_correct = pd.read_csv(
-            "tests/test_files/df_resistance_with_impact_correct.tsv", sep="\t"
+            "tests/test_files/df_resistance_with_impact_correct.tsv",
+            sep="\t",
+            dtype={"AF": str},
         )
         self.assertEqual(df_resistance_with_impact.shape[0], 2)
         self.assertEqual(df_resistance_with_impact.shape[1], 14)
         df_resistance_with_impact.reset_index(drop=True, inplace=True)
         df_resistance_with_impact_correct.reset_index(drop=True, inplace=True)
+
         self.assertTrue(
             df_resistance_with_impact.equals(df_resistance_with_impact_correct)
         )

diff --git a/tests/test_files/df_resistance_genes_correct.tsv b/tests/test_files/df_resistance_genes_correct.tsv
@@ -1,3 +1,3 @@
 CHROM	POS	TYPE	REF	ALT	DP	AF	type	locus_tag	mutation_name	ref_aa	alt_aa	genetic_element
-NC_000913.3	100	SNP	A	T	100	1.0	missense	b0001	10E>10K	10E	10K	gene A
-NC_000913.3	200	SNP	A	T	100	1.0	synonymous	b0001	20S	20S		gene A
+NC_000913.3	100	SNP	A	T	100	1	missense	b0001	10E>10K	10E	10K	gene A
+NC_000913.3	200	SNP	A	T	100	1	synonymous	b0001	20S	20S		gene A
diff --git a/tests/test_files/df_resistance_with_impact_correct.tsv b/tests/test_files/df_resistance_with_impact_correct.tsv
@@ -1,3 +1,3 @@
 CHROM	POS	TYPE	REF	ALT	DP	AF	type	locus_tag	mutation_name	ref_aa	alt_aa	genetic_element	impact
-NC_000913.3	100	SNP	A	T	100	1.0	missense	b0001	10E>10K	10E	10K	gene A	resistance
-NC_000913.3	200	SNP	A	T	100	1.0	synonymous	b0001	20S	20S		gene A	
+NC_000913.3	100	SNP	A	T	100	1	missense	b0001	10E>10K	10E	10K	gene A	resistance
+NC_000913.3	200	SNP	A	T	100	1	synonymous	b0001	20S	20S		gene A	
diff --git a/workflow/rules/cauris_typing.smk b/workflow/rules/cauris_typing.smk
@@ -72,10 +72,10 @@ rule cauris_extract_aa_mutations:
     message:
         "Extract AMR mutations for {wildcards.sample}"
     log:
-        OUT + "/log/cauris_extract_amr_mutations/{sample}.log",
+        OUT + "/log/cauris_compare_aa_mutations/{sample}.log",
     shell:
         """
-python workflow/scripts/extract_amr_mutations.py \
+python workflow/scripts/compare_aa_mutations.py \
     --input {input.tsv} \
     --output {output.tsv} \
     --full-output {output.full} \

diff --git a/workflow/scripts/compare_aa_mutations.py b/workflow/scripts/compare_aa_mutations.py
@@ -48,8 +48,11 @@ def read_input_file(input_file: Path) -> pd.DataFrame:
     # Read lines into pandas dataframe
     df_input = pd.DataFrame([line.split("\t") for line in lines[1:]])
     df_input.columns = lines[0].rstrip("\n").split("\t")
+    # If AF holds comma-separated values such as "0.5,0.5", the (currently disabled) line below would split the record into one row per AF value.
+    # df_input = df_input.assign(AF=df_input["AF"].str.split(",")).explode("AF")
     # Set dtypes
-    df_input = df_input.astype({"POS": int, "DP": int, "AF": float})
+    # df_input = df_input.astype({"POS": int, "DP": int, "AF": float})
+    df_input = df_input.astype({"POS": int, "DP": int, "AF": str})
     df_input[["type", "locus_tag", "mutation_name"]] = df_input["BCSQ"].str.split(
         "|", expand=True
     )[[0, 1, 5]]