Skip to content

Commit

Permalink
Rc 13.2.0 (#462)
Browse files Browse the repository at this point in the history
* Revise dashboards (#433)

* Don't cache local install (#431)

* Don't put cache dir

* Remove -e

* Update codeowners to use genie reviewers

* Remove missing vital status and assay info sections

* hotfix dockerfile

* pip3 install no longer works in dockerfile for locally built packages

* case-insensitive comparison

* test case-insensitive comparison

* black on genie_registry/assay.py only

* black on genie/process_functions.py only

* rename new test function

* replace underscores with hyphens

* add test for underscore versus hyphen

* black on genie_registry/assay.py

* Modify tsa1, tsa2, ref maf error message (#438)

* Modify tsa1, tsa2, ref maf error message

* Fix tests

* Add sample class filter (#441)

* Add sample class filter

* lint

* Lint

* only filter for public release

* lint

* Make sure processing pipeline doesn't fail for older releases that don't have SAMPLE_CLASS

* Fix Docker build (#445)

* Change docker tag and add dependency

* comment in sdist

* Update pandas version (#446)

* Use iloc

* Use pd.concat

* Use pd.concat

* Use pd.concat

* Use pd.concat instead of append

* Update genie/database_to_staging.py

Co-authored-by: Haley Hunter-Zinck <[email protected]>

* use pd.concat

* Use pd.concat

* lint

* exclude tests

* Use pd.concat

* Use mask to replace values

* Lint

* append

Co-authored-by: Haley Hunter-Zinck <[email protected]>

* Add code of conduct (#448)

* year or int of death is not applicable for living patients (#450)

* year or int of death is not applicable for living patients

* update tests for dead variable

Co-authored-by: Thomas Yu <[email protected]>

* support scheduled job secrets (#453)

* Add documentation

* Uncommit

* Add in clinicalreported (#459)

* [ETL-156, ETL-157, ETL-158] Create configuration dictionary for GENIE code (#455)

* Add genie_config parameter

* Remove unused variables

* Remove unused parameters

* Remove parameters

* Edit doc

* Comment

* Add genie config to file format classes

* Comment

* Fix some tests

* Fix test

* Fix tests

* Add test configuration

* Fix tests

* Lint

* Update bin/input_to_database.py

Co-authored-by: Haley Hunter-Zinck <[email protected]>

* [ETL-159] Use GENIE config dict for processing (#460)

* Remove dataBaseSynIdmapping

* Remove the need for database mapping df

* Remove need for database mapping df

* Remove use of database mapping df

* Fill in docs

* Added quick value error check

* Update hack text

* Fix tests and lint

* use genie config and utilize existing function

* Remove unused function

* Remove todos

* Fix

* [ETL-156] Use genie config in validation cli (#461)

* Update validation cli code to use genie config

* Fix tests

* Add in genie config

* Update genie/validate.py

Co-authored-by: Haley Hunter-Zinck <[email protected]>

* Fix test

Co-authored-by: Haley Hunter-Zinck <[email protected]>

Co-authored-by: Haley Hunter-Zinck <[email protected]>

* Bump version

Co-authored-by: Haley Hunter-Zinck <[email protected]>
Co-authored-by: Haley Hunter-Zinck <[email protected]>
  • Loading branch information
3 people authored Apr 4, 2022
1 parent 87c3697 commit 9f61bb0
Show file tree
Hide file tree
Showing 22 changed files with 545 additions and 520 deletions.
140 changes: 75 additions & 65 deletions bin/input_to_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,20 @@
import argparse
from datetime import date
import logging
import os

from genie import input_to_database, write_invalid_reasons, process_functions, config
from genie import (
config,
input_to_database,
process_functions,
write_invalid_reasons,
)

logger = logging.getLogger(__name__)

# TODO: Remove oncotree_link
# TODO: Remove genie_annotation_pkg

def main(
process,
project_id,
process: str,
project_id: str,
center=None,
pemfile=None,
delete_old=False,
Expand All @@ -26,69 +29,76 @@ def main(
debug=False,
format_registry=None,
):
"""Invoke the GENIE ETL pipeline from data input files to synapse tables
Args:
process (str): main or mutation processing
project_id (str): Synapse project id that houses GENIE project
center (str, optional): GENIE center. Defaults to None.
pemfile (str, optional): Path to private key. Defaults to None.
delete_old (bool, optional): True to delete all old input/processed files.
Defaults to False.
only_validate (bool, optional): True if only validate files. Defaults to False.
oncotree_link (str, optional): Link to oncotree version. Defaults to None.
genie_annotation_pkg (str, optional): vcf/maf conversion tools.
Defaults to None.
create_new_maf_database (bool, optional): To create new maf table.
Defaults to False.
debug (bool, optional): Debug mode. Defaults to False.
format_registry (str, optional): File format registry python package.
Defaults to None.
Raises:
ValueError: If invalid center name is specified
Exception: If processing is already happening.
"""

syn = process_functions.synLogin(pemfile, debug=debug)

# Get the Synapse Project where data is stored
# Should have annotations to find the table lookup
project = syn.get(project_id)
database_to_synid_mapping_synid = project.annotations.get("dbMapping", "")

databaseToSynIdMapping = syn.tableQuery(
"SELECT * FROM {}".format(database_to_synid_mapping_synid[0])
)
databaseToSynIdMappingDf = databaseToSynIdMapping.asDataFrame()

center_mapping_id = process_functions.getDatabaseSynId(
syn, "centerMapping", databaseToSynIdMappingDf=databaseToSynIdMappingDf
)

center_mapping = syn.tableQuery("SELECT * FROM %s" % center_mapping_id)
center_mapping_df = center_mapping.asDataFrame()
# Get project GENIE configurations
genie_config = process_functions.get_genie_config(syn=syn, project_id=project_id)

# Filter for specific center
if center is not None:
assert (
center in center_mapping_df.center.tolist()
), "Must specify one of these centers: {}".format(
", ".join(center_mapping_df.center)
)
if center not in genie_config["center_config"].keys():
raise ValueError(
"Must specify one of these centers: {}".format(
", ".join(genie_config["center_config"].keys())
)
)
centers = [center]
else:
# exclude_sites = ['JHU', 'DFCI', 'GRCC', 'VICC', 'NKI', 'MSK',
# 'UHN', 'MDA', 'WAKE', 'YALE', 'UCSF', 'CRUK',
# 'CHOP', 'VHIO', 'SCI', 'PHS', 'COLU', 'UCHI']
center_mapping_df = center_mapping_df[~center_mapping_df["inputSynId"].isnull()]
# release is a bool column
center_mapping_df = center_mapping_df[center_mapping_df["release"]]
# center_mapping_df = center_mapping_df[
# ~center_mapping_df['center'].isin(exclude_sites)
# ]
centers = center_mapping_df.center
# TODO: add in logic to exclude sites from processing
centers = list(genie_config["center_config"].keys())

# HACK: Modify oncotree link config
if oncotree_link is None:
onco_link = databaseToSynIdMappingDf["Id"][
databaseToSynIdMappingDf["Database"] == "oncotreeLink"
].values[0]
onco_link_ent = syn.get(onco_link)
onco_link_ent = syn.get(genie_config["oncotreeLink"])
oncotree_link = onco_link_ent.externalURL
genie_config["oncotreeLink"] = oncotree_link
# Check if you can connect to oncotree link,
# if not then don't run validation / processing
process_functions.checkUrl(oncotree_link)
process_functions.checkUrl(genie_config["oncotreeLink"])

# HACK: Add genie annotation package to config
if process == "mutation" and genie_annotation_pkg is None:
raise ValueError("Must define genie annotation pkg if mutation processing")
genie_config["genie_annotation_pkg"] = genie_annotation_pkg

center_mapping_ent = syn.get(center_mapping_id)
# HACK: This is essential, because Synapse has concurrency update issues
center_mapping_ent = syn.get(genie_config["centerMapping"])
if center_mapping_ent.get("isProcessing", ["True"])[0] == "True":
raise Exception(
"Processing/validation is currently happening. "
"Please change/add the 'isProcessing' annotation on {} "
"to False to enable processing".format(center_mapping_id)
"Processing/validation is currently happening. Please change/add the "
f"'isProcessing' annotation on {genie_config['centerMapping']} "
"to False to enable processing"
)
else:
center_mapping_ent.isProcessing = "True"
center_mapping_ent = syn.store(center_mapping_ent)
# remove this query timeout and see what happens
# syn.table_query_timeout = 50000

# Create new maf database, should only happen once if its specified
# HACK: Create new maf database, should only happen once if its specified
# Will modify genie configuration
if create_new_maf_database:
today = date.today()
table_name = f"Narrow MAF Database - {today}"
Expand All @@ -98,37 +108,37 @@ def main(
syn, "vcf2maf", table_name, project_id, "syn7208886"
)
syn.setPermissions(new_tables["newdb_ent"].id, 3326313, [])
databaseToSynIdMappingDf = new_tables["newdb_mappingdf"]
genie_config["vcf2maf"] = new_tables["newdb_ent"].id

# Get file format classes
format_registry = config.collect_format_types(args.format_registry_packages)

# Start GENIE processing
for process_center in centers:
input_to_database.center_input_to_database(
syn,
project_id,
process_center,
process,
only_validate,
databaseToSynIdMappingDf,
center_mapping_df,
syn=syn,
project_id=project_id,
center=process_center,
process=process,
only_validate=only_validate,
delete_old=delete_old,
oncotree_link=oncotree_link,
format_registry=format_registry,
genie_annotation_pkg=genie_annotation_pkg,
genie_config=genie_config,
)

# To ensure that this is the new entity
center_mapping_ent = syn.get(center_mapping_id)
# HACK: To ensure that this is the new entity
center_mapping_ent = syn.get(genie_config["centerMapping"])
center_mapping_ent.isProcessing = "False"
center_mapping_ent = syn.store(center_mapping_ent)

error_tracker_synid = process_functions.getDatabaseSynId(
syn, "errorTracker", databaseToSynIdMappingDf=databaseToSynIdMappingDf
)
error_tracker_synid = genie_config["errorTracker"]
# Only write out invalid reasons if the center
# isnt specified and if only validate
if center is None and only_validate:
logger.info("WRITING INVALID REASONS TO CENTER STAGING DIRS")
write_invalid_reasons.write(syn, center_mapping_df, error_tracker_synid)
write_invalid_reasons.write(
syn, genie_config["centerMapping"], error_tracker_synid
)
logger.info("INPUT TO DATABASE COMPLETE")


Expand Down
2 changes: 1 addition & 1 deletion genie/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "13.1.1"
__version__ = "13.2.0"
2 changes: 1 addition & 1 deletion genie/database_to_staging.py
Original file line number Diff line number Diff line change
Expand Up @@ -1652,7 +1652,7 @@ def stagingToCbio(
bedDf = process_functions.get_syntabledf(
syn,
"SELECT Chromosome,Start_Position,End_Position,Hugo_Symbol,ID,"
"SEQ_ASSAY_ID,Feature_Type,includeInPanel FROM"
"SEQ_ASSAY_ID,Feature_Type,includeInPanel,clinicalReported FROM"
f" {bedSynId} where CENTER in ('{center_query_str}')",
)

Expand Down
3 changes: 2 additions & 1 deletion genie/example_filetype_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,10 @@ class FileTypeFormat(object):

_validation_kwargs = []

def __init__(self, syn, center, poolSize=1):
def __init__(self, syn, center, genie_config=None):
self.syn = syn
self.center = center
self.genie_config = genie_config

# self.pool = multiprocessing.Pool(poolSize)

Expand Down
Loading

0 comments on commit 9f61bb0

Please sign in to comment.