Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added support in GrowthKinetics for WBC records between samples #48

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 23 additions & 42 deletions Cluster/Cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,48 +33,29 @@ def run_tool(args):
# Load sample data

if args.sif: # if sif file is specified
sif_file = open(args.sif)

for file_idx, file_line in enumerate(sif_file):

##for now, assume input file order is of the type sample_id\tmaf_fn\tseg_fn\tpurity\ttimepoint
if not file_idx:
continue
if file_line.strip('\n').strip == "":
continue # empty rows
smpl_spec = file_line.strip('\n').split('\t')
sample_id = smpl_spec[0]
maf_fn = smpl_spec[1]
seg_fn = smpl_spec[2]
purity = float(smpl_spec[3])
timepoint = float(smpl_spec[4])
print(timepoint)
patient_data.addSample(maf_fn, sample_id, timepoint_value=timepoint, grid_size=args.grid_size,
_additional_muts=None, seg_file=seg_fn,
purity=purity, input_type=args.maf_input_type)

if len(patient_data.sample_list) == args.n_samples: # use only first N samples
break

else: # if sample names/files are specified directly on cmdline

# sort order on timepoint or order of entry on cmdline if not present
print(args.sample_data)
for idx, sample_entry in enumerate(args.sample_data):
##for now, assume input order is of the type sample_id\tmaf_fn\tseg_fn\tpurity\ttimepoint
smpl_spec = sample_entry.strip('\n').split(':')
sample_id = smpl_spec[0]
maf_fn = smpl_spec[1]
seg_fn = smpl_spec[2]
purity = float(smpl_spec[3])
timepoint = float(smpl_spec[4])

patient_data.addSample(maf_fn, sample_id, timepoint_value=timepoint, grid_size=args.grid_size,
_additional_muts=None,
seg_file=seg_fn,
purity=purity)
if len(patient_data.sample_list) == args.n_samples: # use only first N samples
break
with open(args.sif) as sif_file:
header = sif_file.readline().strip('\n').split('\t')
sample_data = sif_file.read().replace('\t', ':').split('\n')
else:
sample_data = args.sample_data
header = ['sample_id', 'maf_fn', 'seg_fn', 'purity', 'timepoint']
print(sample_data)
for sample_entry in sample_data:
##for now, assume input order is of the type sample_id\tmaf_fn\tseg_fn\tpurity\ttimepoint
if not sample_entry.strip():
continue
smpl_spec = dict(zip(header, sample_entry.split(':')))
sample_id = smpl_spec['sample_id']
maf_fn = smpl_spec['maf_fn']
seg_fn = smpl_spec['seg_fn']
purity = float(smpl_spec['purity']) if smpl_spec['purity'] else 1.
timepoint = float(smpl_spec['timepoint']) if smpl_spec['timepoint'] else 0.
print(timepoint)
patient_data.addSample(maf_fn, sample_id, timepoint_value=timepoint, grid_size=args.grid_size,
_additional_muts=None, seg_file=seg_fn, purity=purity, input_type=args.maf_input_type,
seg_input_type=args.seg_input_type)
if len(patient_data.sample_list) == args.n_samples: # use only first N samples
break

patient_data.get_arm_level_cn_events()
patient_data.preprocess_samples()
Expand Down
14 changes: 13 additions & 1 deletion GrowthKinetics/GrowthKinetics.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import logging
from collections import defaultdict
import numpy as np


def run_tool(args):
Expand All @@ -9,8 +10,19 @@ def run_tool(args):

patient_data = Patient.Patient(indiv_name=args.indiv_id)
mcmc_trace_cell_abundance, num_itertaions = load_mcmc_trace_abundances(args.abundance_mcmc_trace)
with open(args.sif, 'r') as f:
header = f.readline().strip('\n').split('\t')
sample_time_points = {}
for line in f:
fields = dict(zip(header, line.strip('\n').split('\t')))
try:
sample_time_points[fields['sample_id']] = float(fields['timepoint'])
except ValueError:
raise ValueError('Sample time points in sif file are required')
wbc_time_points = np.array(args.time) if args.time is not None else None
gk_engine = GrowthKineticsEngine(patient_data, args.wbc)
gk_engine.estimate_growth_rate(mcmc_trace_cell_abundance, n_iter=min(num_itertaions, args.n_iter))
gk_engine.estimate_growth_rate(mcmc_trace_cell_abundance, wbc_time_points=wbc_time_points,
sample_time_points=sample_time_points, n_iter=min(num_itertaions, args.n_iter))

# Output and visualization
import output.PhylogicOutput
Expand Down
32 changes: 19 additions & 13 deletions GrowthKinetics/GrowthKineticsEngine.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,14 @@
import bisect
from collections import defaultdict
from scipy.stats import linregress
from scipy.interpolate import interp1d


class GrowthKineticsEngine:

def __init__(self, patient, wbc):
self._patient = patient
self._wbc = wbc
self._wbc = np.array(wbc)
self._growth_rates = defaultdict(list)

@property
Expand All @@ -19,25 +20,30 @@ def growth_rates(self):
def wbc(self):
return self._wbc

def estimate_growth_rate(self, mcmc_trace_cell_abundance, times=None, n_iter=100, conv=1e-40):
def estimate_growth_rate(self, mcmc_trace_cell_abundance, sample_time_points, wbc_time_points=None, n_iter=100,
conv=1e-40):
'''

'''
# Number of samples for this Patient
sample_list = list(mcmc_trace_cell_abundance.keys())
time_points = len(sample_list)
sample_list = sorted(sample_time_points, key=sample_time_points.__getitem__)
sample_time_points_arr = np.array([sample_time_points[s] for s in sample_list])
n_clusters = len(list(mcmc_trace_cell_abundance[sample_list[0]]))
#cluster_rates = defaultdict(list)
# cluster_rates = defaultdict(list)
# If times of samples are not provided, treat as an integers
if not times:
times = np.array(range(time_points)) + 1
if wbc_time_points is None:
assert len(self.wbc) == len(sample_list), 'WBCs not corresponding to samples need time points'
wbc_time_points = sample_time_points_arr
cond = (wbc_time_points >= min(sample_time_points_arr)) & (wbc_time_points <= max(sample_time_points_arr))
inner_wbc = self.wbc[cond]
inner_wbc_time_points = wbc_time_points[cond]
for n in range(n_iter):
adj_wbc = self._wbc * (1 + np.array([(np.random.random() - 0.5) / 100. for x in range(len(self._wbc))]))
adj_wbc = inner_wbc * (1 + (np.random.random(len(inner_wbc)) - .5) / 100.)
for cluster_id in range(1, n_clusters + 1):
cluster_abundances = []
for sample_name, sample_abundances in mcmc_trace_cell_abundance.items():
cluster_abundances.append(sample_abundances[cluster_id][n] + conv)
cluster_slope = linregress(times, cluster_abundances * adj_wbc).slope
cluster_abundances = [mcmc_trace_cell_abundance[sample_id][cluster_id][n] + conv for sample_id in
sample_list]
inner_cluster_abundances = interp1d(sample_time_points_arr, cluster_abundances)(inner_wbc_time_points)
cluster_slope = linregress(inner_wbc_time_points, inner_cluster_abundances * adj_wbc).slope
self._growth_rates[cluster_id].append(cluster_slope)

def line_fit(self, x, c_idx, fb_x_vals, len_pre_tp, adj_dens):
Expand Down
13 changes: 9 additions & 4 deletions PhylogicNDT.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,11 @@ def build_parser():
dest='maf_input_type',
default='auto')

clustering.add_argument('--seg_input_type', '-st',
action='store',
dest='seg_input_type',
default='auto')

# BuildTree Tool

buildtree = subparsers.add_parser("BuildTree", help="BuildTree module for constructing of phylogenetic trees.",
Expand Down Expand Up @@ -296,27 +301,27 @@ def build_parser():
# GrowthKinetics Tool

growthkinetics = subparsers.add_parser("GrowthKinetics",
help="Sample growth rates and fitness across ensemble of trees", parents=[base_parser])
help="Sample growth rates and fitness across ensemble of trees", parents=[base_parser])
growthkinetics.add_argument('--abundance_mcmc_trace', '-ab',
type=str,
action='store',
dest='abundance_mcmc_trace',
help='tsv file generated by CellPopulation')
help='tsv file generated by CellPopulation')
growthkinetics.add_argument('--n_iter', '-ni',
type=int,
action='store',
dest='n_iter',
default=250,
help='number iterations')
growthkinetics.add_argument('--wbc', '-w',
type=int,
type=float,
nargs='+',
default=[],
action='store',
dest='wbc',
help='wbc')
growthkinetics.add_argument('--time', '-t',
type=int,
type=float,
nargs='+',
default=[],
action='store',
Expand Down