oicr-gsi · slazicoicr · Nov 13, 2024 · Nov 12, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ## [Unreleased]
+### Changed
+* The `batches` field changed from a list to a single string in which individual batches are joined by a separator (default `;`).
+WDL and Shesmu require every value within a `dict` to have the same type, so all values in the metadata JSON are now strings.
 
 ## [0.1.0] - 2024-11-08
 ### Added

diff --git a/crosscheck_fingerprint_caller/main.py b/crosscheck_fingerprint_caller/main.py
@@ -38,8 +38,8 @@ def main(args=None):
     parser.add_argument(
         "-s",
         "--seperator",
-        default=",",
-        help="The seperator to use for turning lists into strings (default `,`)",
+        default=";",
+        help="The seperator for splitting batch string into individual batches (default `;`)",
     )
 
     if args is None:
@@ -60,7 +60,7 @@ def main(args=None):
         cols_match = [x + "_match" for x in cols]
         cols_match.append("LOD_SCORE")
         match = mark_match(df, ambg)
-        btch_ovlp = batch_overlap(df)
+        btch_ovlp = batch_overlap(df, args.seperator)
         generate_detailed_calls(
             df[cols + cols_match],
             match,
@@ -266,21 +266,24 @@ def mark_match(df: DataFrame, ambg: pandas.Series) -> pandas.Series:
     return keep
 
 
-def batch_overlap(df: DataFrame) -> pandas.Series:
+def batch_overlap(df: DataFrame, separator: str) -> pandas.Series:
     """
     The batches that the query and match library share.
 
     If they overlap, then the swap could be internal.
 
     Args:
         df: The DataFrame must contain the `batches` and `batches_match` columns.
+        separator: The string that delimits the individual batches within a batch string
 
     Returns: A Series of sets of shared batches. Empty list means no overlap.
 
     """
 
     def intrs(x):
-        return set(x["batches"]).intersection(x["batches_match"])
+        return set(x["batches"].split(separator)).intersection(
+            x["batches_match"].split(separator)
+        )
 
     return df.apply(intrs, axis=1)
 

diff --git a/doc/metadata_example.json b/doc/metadata_example.json
@@ -5,28 +5,22 @@
     "library_design": "WG",
     "lims_id": "6479_1_LDI98381",
     "merge_key": "REVOLVE_0001_01_LB02-01_230406_M00753_0560_000000000-DKWTV_1_CATACCAC-CGGTTGTT",
-    "batches": [
-      "2023-04-05_u144_s30_k154-140205"
-    ]
+    "batches": "2023-04-05_u144_s30_k154-140205"
   },
   {
     "donor": "REVOLVE_0001",
     "library_name": "REVOLVE_0001_16_LB01-01",
     "library_design": "WG",
     "lims_id": "6479_1_LDI98382",
     "merge_key": "REVOLVE_0001_16_LB01-01_230406_M00753_0560_000000000-DKWTV_1_GATGTGTG-CATGGCTA",
-    "batches": [
-      "2023-04-05_u144_s30_k154-140205"
-    ]
+    "batches": "2023-04-05_u144_s30_k154-140205;2023-04-11_u157_s30_k154-139702"
   },
   {
     "donor": "REVOLVE_0002",
     "library_name": "REVOLVE_0002_13_LB02-01",
     "library_design": "WG",
     "lims_id": "6486_1_LDI98487",
     "merge_key": "REVOLVE_0002_13_LB02-01_230412_M06816_0442_000000000-DKVYK_1_TGGTAGCT-CAACACCT",
-    "batches": [
-      "2023-04-11_u157_s30_k154-139702"
-    ]
+    "batches": "2023-04-11_u157_s30_k154-139702"
   }
 ]
diff --git a/test/files/load_REVWGTS.29181.crosscheck_metrics.csv b/test/files/load_REVWGTS.29181.crosscheck_metrics.csv