Skip to content

Commit

Permalink
Do sample comparison in memory due to limits (#477)
Browse files (browse the repository at this point in the history)
* Do sample comparison in memory due to limits

* Create patch release

* Add comment

* Lint
  • Loading branch information
thomasyu888 authored Aug 12, 2022
1 parent 1a99552 commit 2473a6a
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 9 deletions.
2 changes: 1 addition & 1 deletion genie/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,6 @@

# create version in __init__.py
# https://packaging.python.org/en/latest/guides/single-sourcing-package-version/
__version__ = "14.1.1"
__version__ = "14.1.2"

__all__ = ["__version__"]
27 changes: 19 additions & 8 deletions genie_registry/clinical.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,17 +302,29 @@ def update_clinical(self, row):
x[i] = x[i].strip(" ")
return x

def uploadMissingData(self, df, col, dbSynId, stagingSynId, retractionSynId=None):
"""Uploads missing clinical samples / patients"""
samples = "','".join(df[col])
def uploadMissingData(
self, df: pd.DataFrame, col: str, dbSynId: str, stagingSynId: str
):
"""Uploads missing clinical samples / patients
Args:
df (pd.DataFrame): dataframe with clinical data
col (str): column in dataframe. Usually SAMPLE_ID or PATIENT_ID.
dbSynId (str): Synapse table Synapse id
stagingSynId (str): Center Synapse staging Id
"""
path = os.path.join(
process_functions.SCRIPT_DIR, f"{self._fileType}_missing_{col}.csv"
)
missing = self.syn.tableQuery(
f"select {col} from {dbSynId} where "
f"CENTER='{self.center}' and {col} not in ('{samples}')"
# PLFM-7428 - there are limits on a "not in" function on Synapse tables
center_samples = self.syn.tableQuery(
f"select {col} from {dbSynId} where " f"CENTER='{self.center}'"
)
missing.asDataFrame().to_csv(path, index=False)
center_samples_df = center_samples.asDataFrame()
# Get all the samples that are in the database but missing from
# the input file
missing_df = center_samples_df[col][~center_samples_df[col].isin(df[col])]
missing_df.to_csv(path, index=False)
self.syn.store(synapseclient.File(path, parent=stagingSynId))
os.remove(path)

Expand Down Expand Up @@ -449,7 +461,6 @@ def process_steps(
sampleClinical["ONCOTREE_CODE"].isin(oncotree_mapping["ONCOTREE_CODE"])
]
self.uploadMissingData(sampleClinical, "SAMPLE_ID", sample_synid, parentId)
# ,retractedSampleSynId)
process_functions.updateData(
self.syn,
sample_synid,
Expand Down

0 comments on commit 2473a6a

Please sign in to comment.