From 83c4ea59068c5da4195f54162d80741cb42f6a57 Mon Sep 17 00:00:00 2001 From: jcha Date: Mon, 29 Jun 2020 14:47:32 -0700 Subject: [PATCH 1/3] Consolidated sif parser and command line parser Added seg_input_type as an argument --- Cluster/Cluster.py | 65 ++++++++++++++++------------------------------ PhylogicNDT.py | 5 ++++ 2 files changed, 28 insertions(+), 42 deletions(-) diff --git a/Cluster/Cluster.py b/Cluster/Cluster.py index 8c4b5a9..4bb17eb 100644 --- a/Cluster/Cluster.py +++ b/Cluster/Cluster.py @@ -33,48 +33,29 @@ def run_tool(args): # Load sample data if args.sif: # if sif file is specified - sif_file = open(args.sif) - - for file_idx, file_line in enumerate(sif_file): - - ##for now, assume input file order is of the type sample_id\tmaf_fn\tseg_fn\tpurity\ttimepoint - if not file_idx: - continue - if file_line.strip('\n').strip == "": - continue # empty rows - smpl_spec = file_line.strip('\n').split('\t') - sample_id = smpl_spec[0] - maf_fn = smpl_spec[1] - seg_fn = smpl_spec[2] - purity = float(smpl_spec[3]) - timepoint = float(smpl_spec[4]) - print(timepoint) - patient_data.addSample(maf_fn, sample_id, timepoint_value=timepoint, grid_size=args.grid_size, - _additional_muts=None, seg_file=seg_fn, - purity=purity, input_type=args.maf_input_type) - - if len(patient_data.sample_list) == args.n_samples: # use only first N samples - break - - else: # if sample names/files are specified directly on cmdline - - # sort order on timepoint or order of entry on cmdline if not present - print(args.sample_data) - for idx, sample_entry in enumerate(args.sample_data): - ##for now, assume input order is of the type sample_id\tmaf_fn\tseg_fn\tpurity\ttimepoint - smpl_spec = sample_entry.strip('\n').split(':') - sample_id = smpl_spec[0] - maf_fn = smpl_spec[1] - seg_fn = smpl_spec[2] - purity = float(smpl_spec[3]) - timepoint = float(smpl_spec[4]) - - patient_data.addSample(maf_fn, sample_id, timepoint_value=timepoint, grid_size=args.grid_size, - _additional_muts=None, - seg_file=seg_fn, - purity=purity) - if len(patient_data.sample_list) == args.n_samples: # use only first N samples - break + with open(args.sif) as sif_file: + header = sif_file.readline().strip('\n').split('\t') + sample_data = sif_file.read().replace('\t', ':').split('\n') + else: + sample_data = args.sample_data + header = ['sample_id', 'maf_fn', 'seg_fn', 'purity', 'timepoint'] + print(sample_data) + for sample_entry in sample_data: + ##for now, assume input order is of the type sample_id\tmaf_fn\tseg_fn\tpurity\ttimepoint + if not sample_entry.strip(): + continue + smpl_spec = dict(zip(header, sample_entry.split(':'))) + sample_id = smpl_spec['sample_id'] + maf_fn = smpl_spec['maf_fn'] + seg_fn = smpl_spec['seg_fn'] + purity = float(smpl_spec['purity']) if smpl_spec['purity'] else 1. + timepoint = float(smpl_spec['timepoint']) if smpl_spec['timepoint'] else 0. + print(timepoint) + patient_data.addSample(maf_fn, sample_id, timepoint_value=timepoint, grid_size=args.grid_size, + _additional_muts=None, seg_file=seg_fn, purity=purity, input_type=args.maf_input_type, + seg_input_type=args.seg_input_type) + if len(patient_data.sample_list) == args.n_samples: # use only first N samples + break patient_data.get_arm_level_cn_events() patient_data.preprocess_samples() diff --git a/PhylogicNDT.py b/PhylogicNDT.py index 7f162ae..5f22219 100644 --- a/PhylogicNDT.py +++ b/PhylogicNDT.py @@ -237,6 +237,11 @@ def build_parser(): dest='maf_input_type', default='auto') + clustering.add_argument('--seg_input_type', '-st', + action='store', + dest='seg_input_type', + default='auto') + # BuildTree Tool buildtree = subparsers.add_parser("BuildTree", help="BuildTree module for constructing of phylogenetic trees.", From d358e3fa44e11a6461e20be35f6d19cc30abea77 Mon Sep 17 00:00:00 2001 From: jcha Date: Wed, 2 Sep 2020 13:41:10 -0700 Subject: [PATCH 2/3] Added support in GrowthKinetics for WBC records between samples Also added ordering based on time point --- GrowthKinetics/GrowthKinetics.py | 15 +++++++++++- GrowthKinetics/GrowthKineticsEngine.py | 32 +++++++++++++++----------- PhylogicNDT.py | 8 +++---- 3 files changed, 37 insertions(+), 18 deletions(-) diff --git a/GrowthKinetics/GrowthKinetics.py b/GrowthKinetics/GrowthKinetics.py index a09b202..fefe548 100644 --- a/GrowthKinetics/GrowthKinetics.py +++ b/GrowthKinetics/GrowthKinetics.py @@ -1,5 +1,6 @@ import logging from collections import defaultdict +import numpy as np def run_tool(args): @@ -9,8 +10,20 @@ def run_tool(args): patient_data = Patient.Patient(indiv_name=args.indiv_id) mcmc_trace_cell_abundance, num_itertaions = load_mcmc_trace_abundances(args.abundance_mcmc_trace) + with open(args.sif, 'r') as f: + header = f.readline().strip('\n').split('\t') + sample_time_points = {} + for line in f: + fields = dict(zip(header, line.strip('\n').split('\t'))) + try: + sample_time_points[fields['sample_id']] = float(fields['timepoint']) + except ValueError: + sample_time_points = None + break + wbc_time_points = np.array(args.time) if args.time is not None else None gk_engine = GrowthKineticsEngine(patient_data, args.wbc) - gk_engine.estimate_growth_rate(mcmc_trace_cell_abundance, n_iter=min(num_itertaions, args.n_iter)) + gk_engine.estimate_growth_rate(mcmc_trace_cell_abundance, wbc_time_points=wbc_time_points, + sample_time_points=sample_time_points, n_iter=min(num_itertaions, args.n_iter)) # Output and visualization import output.PhylogicOutput diff --git a/GrowthKinetics/GrowthKineticsEngine.py b/GrowthKinetics/GrowthKineticsEngine.py index d75cbcd..86a2dec 100644 --- a/GrowthKinetics/GrowthKineticsEngine.py +++ b/GrowthKinetics/GrowthKineticsEngine.py @@ -2,13 +2,14 @@ import bisect from collections import defaultdict from scipy.stats import linregress +from scipy.interpolate import interp1d class GrowthKineticsEngine: def __init__(self, patient, wbc): self._patient = patient - self._wbc = wbc + self._wbc = np.array(wbc) self._growth_rates = defaultdict(list) @property @@ -19,25 +20,30 @@ def growth_rates(self): def wbc(self): return self._wbc - def estimate_growth_rate(self, mcmc_trace_cell_abundance, times=None, n_iter=100, conv=1e-40): + def estimate_growth_rate(self, mcmc_trace_cell_abundance, sample_time_points, wbc_time_points=None, n_iter=100, + conv=1e-40): ''' - + ''' # Number of samples for this Patient - sample_list = list(mcmc_trace_cell_abundance.keys()) - time_points = len(sample_list) + sample_list = sorted(sample_time_points, key=sample_time_points.__getitem__) + sample_time_points_arr = np.array([sample_time_points[s] for s in sample_list]) n_clusters = len(list(mcmc_trace_cell_abundance[sample_list[0]])) - #cluster_rates = defaultdict(list) + # cluster_rates = defaultdict(list) # If times of samples are not provided, treat as an integers - if not times: - times = np.array(range(time_points)) + 1 + if wbc_time_points is None: + assert len(self.wbc) == len(sample_list), 'WBCs not corresponding to samples need time points' + wbc_time_points = sample_time_points_arr + cond = (wbc_time_points >= min(sample_time_points_arr)) & (wbc_time_points <= max(sample_time_points_arr)) + inner_wbc = self.wbc[cond] + inner_wbc_time_points = wbc_time_points[cond] for n in range(n_iter): - adj_wbc = self._wbc * (1 + np.array([(np.random.random() - 0.5) / 100. for x in range(len(self._wbc))])) + adj_wbc = inner_wbc * (1 + (np.random.random(len(inner_wbc)) - .5) / 100.) for cluster_id in range(1, n_clusters + 1): - cluster_abundances = [] - for sample_name, sample_abundances in mcmc_trace_cell_abundance.items(): - cluster_abundances.append(sample_abundances[cluster_id][n] + conv) - cluster_slope = linregress(times, cluster_abundances * adj_wbc).slope + cluster_abundances = [mcmc_trace_cell_abundance[sample_id][cluster_id][n] + conv for sample_id in + sample_list] + inner_cluster_abundances = interp1d(sample_time_points_arr, cluster_abundances)(inner_wbc_time_points) + cluster_slope = linregress(inner_wbc_time_points, inner_cluster_abundances * adj_wbc).slope self._growth_rates[cluster_id].append(cluster_slope) def line_fit(self, x, c_idx, fb_x_vals, len_pre_tp, adj_dens): diff --git a/PhylogicNDT.py b/PhylogicNDT.py index 5f22219..0c58051 100644 --- a/PhylogicNDT.py +++ b/PhylogicNDT.py @@ -301,12 +301,12 @@ def build_parser(): # GrowthKinetics Tool growthkinetics = subparsers.add_parser("GrowthKinetics", - help="Sample growth rates and fitness across ensemble of trees", parents=[base_parser]) + help="Sample growth rates and fitness across ensemble of trees", parents=[base_parser]) growthkinetics.add_argument('--abundance_mcmc_trace', '-ab', type=str, action='store', dest='abundance_mcmc_trace', - help='tsv file generated by CellPopulation') + help='tsv file generated by CellPopulation') growthkinetics.add_argument('--n_iter', '-ni', type=int, action='store', @@ -314,14 +314,14 @@ def build_parser(): default=250, help='number iterations') growthkinetics.add_argument('--wbc', '-w', - type=int, + type=float, nargs='+', default=[], action='store', dest='wbc', help='wbc') growthkinetics.add_argument('--time', '-t', - type=int, + type=float, nargs='+', default=[], action='store', From eef8212c4462b46b0e9436d6ab2db9dd5296bfb1 Mon Sep 17 00:00:00 2001 From: jcha Date: Wed, 2 Sep 2020 13:56:47 -0700 Subject: [PATCH 3/3] Added more helpful error message --- GrowthKinetics/GrowthKinetics.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/GrowthKinetics/GrowthKinetics.py b/GrowthKinetics/GrowthKinetics.py index fefe548..f646546 100644 --- a/GrowthKinetics/GrowthKinetics.py +++ b/GrowthKinetics/GrowthKinetics.py @@ -18,8 +18,7 @@ def run_tool(args): try: sample_time_points[fields['sample_id']] = float(fields['timepoint']) except ValueError: - sample_time_points = None - break + raise ValueError('Sample time points in sif file are required') wbc_time_points = np.array(args.time) if args.time is not None else None gk_engine = GrowthKineticsEngine(patient_data, args.wbc) gk_engine.estimate_growth_rate(mcmc_trace_cell_abundance, wbc_time_points=wbc_time_points,