Skip to content

Commit

Permalink
Merge pull request #3 from oicr-gsi/batch_str
Browse files Browse the repository at this point in the history
`batches` field changed from list to a string with a separator.
  • Loading branch information
slazicoicr authored Nov 13, 2024
2 parents 1483fb9 + 62198db commit ecfa0a3
Show file tree
Hide file tree
Showing 9 changed files with 2,222 additions and 2,303 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]
### Changed
* `batches` field changed from list to a string with a separator.
WDL and Shesmu require every value within a `dict` to have the same type, so all values in the metadata JSON are now strings.

## [0.1.0] - 2024-11-08
### Added
Expand Down
13 changes: 8 additions & 5 deletions crosscheck_fingerprint_caller/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@ def main(args=None):
parser.add_argument(
"-s",
"--seperator",
default=",",
help="The seperator to use for turning lists into strings (default `,`)",
default=";",
help="The seperator for splitting batch string into individual batches (default `;`)",
)

if args is None:
Expand All @@ -60,7 +60,7 @@ def main(args=None):
cols_match = [x + "_match" for x in cols]
cols_match.append("LOD_SCORE")
match = mark_match(df, ambg)
btch_ovlp = batch_overlap(df)
btch_ovlp = batch_overlap(df, args.seperator)
generate_detailed_calls(
df[cols + cols_match],
match,
Expand Down Expand Up @@ -266,21 +266,24 @@ def mark_match(df: DataFrame, ambg: pandas.Series) -> pandas.Series:
return keep


def batch_overlap(df: DataFrame, separator: str) -> pandas.Series:
    """
    The batches that the query and match library share.
    If they overlap, then the swap could be internal.

    Args:
        df: The DataFrame must contain the `batches` and `batches_match` columns.
            Each cell is a string of batch names joined by `separator`.
            List-like cells (the previous format) are still accepted.
        separator: The string used to split a batch string into individual batches

    Returns: A Series of sets of shared batches. Empty set means no overlap.
    """

    def to_batches(value) -> set:
        # Convert one cell into the set of batch names it holds.
        if isinstance(value, str):
            # `"".split(sep)` yields `[""]`; dropping empty tokens keeps two
            # empty batch strings (or trailing separators) from being
            # reported as sharing the batch "".
            return {batch for batch in value.split(separator) if batch}
        if isinstance(value, (list, tuple, set, frozenset)):
            return set(value)
        # Missing values (None/NaN) carry no batches.
        return set()

    def intrs(x):
        return to_batches(x["batches"]).intersection(to_batches(x["batches_match"]))

    return df.apply(intrs, axis=1)

Expand Down
12 changes: 3 additions & 9 deletions doc/metadata_example.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,28 +5,22 @@
"library_design": "WG",
"lims_id": "6479_1_LDI98381",
"merge_key": "REVOLVE_0001_01_LB02-01_230406_M00753_0560_000000000-DKWTV_1_CATACCAC-CGGTTGTT",
"batches": [
"2023-04-05_u144_s30_k154-140205"
]
"batches": "2023-04-05_u144_s30_k154-140205"
},
{
"donor": "REVOLVE_0001",
"library_name": "REVOLVE_0001_16_LB01-01",
"library_design": "WG",
"lims_id": "6479_1_LDI98382",
"merge_key": "REVOLVE_0001_16_LB01-01_230406_M00753_0560_000000000-DKWTV_1_GATGTGTG-CATGGCTA",
"batches": [
"2023-04-05_u144_s30_k154-140205"
]
"batches": "2023-04-05_u144_s30_k154-140205;2023-04-11_u157_s30_k154-139702"
},
{
"donor": "REVOLVE_0002",
"library_name": "REVOLVE_0002_13_LB02-01",
"library_design": "WG",
"lims_id": "6486_1_LDI98487",
"merge_key": "REVOLVE_0002_13_LB02-01_230412_M06816_0442_000000000-DKVYK_1_TGGTAGCT-CAACACCT",
"batches": [
"2023-04-11_u157_s30_k154-139702"
]
"batches": "2023-04-11_u157_s30_k154-139702"
}
]
2,592 changes: 1,296 additions & 1,296 deletions test/files/load_REVWGTS.29181.crosscheck_metrics.csv

Large diffs are not rendered by default.

Loading

0 comments on commit ecfa0a3

Please sign in to comment.