From 83c4ea59068c5da4195f54162d80741cb42f6a57 Mon Sep 17 00:00:00 2001
From: jcha <jcha@broadinstitute.org>
Date: Mon, 29 Jun 2020 14:47:32 -0700
Subject: [PATCH 1/3] Consolidated sif parser and command line parser Added
 seg_input_type as an argument

---
 Cluster/Cluster.py | 65 ++++++++++++++++------------------------------
 PhylogicNDT.py     |  5 ++++
 2 files changed, 28 insertions(+), 42 deletions(-)

diff --git a/Cluster/Cluster.py b/Cluster/Cluster.py
index 8c4b5a9..4bb17eb 100644
--- a/Cluster/Cluster.py
+++ b/Cluster/Cluster.py
@@ -33,48 +33,29 @@ def run_tool(args):
     # Load sample data
 
     if args.sif:  # if sif file is specified
-        sif_file = open(args.sif)
-
-        for file_idx, file_line in enumerate(sif_file):
-
-            ##for now, assume input file order is of the type sample_id\tmaf_fn\tseg_fn\tpurity\ttimepoint
-            if not file_idx:
-                continue
-            if file_line.strip('\n').strip == "":
-                continue  # empty rows
-            smpl_spec = file_line.strip('\n').split('\t')
-            sample_id = smpl_spec[0]
-            maf_fn = smpl_spec[1]
-            seg_fn = smpl_spec[2]
-            purity = float(smpl_spec[3])
-            timepoint = float(smpl_spec[4])
-            print(timepoint)
-            patient_data.addSample(maf_fn, sample_id, timepoint_value=timepoint, grid_size=args.grid_size,
-                                   _additional_muts=None, seg_file=seg_fn,
-                                   purity=purity, input_type=args.maf_input_type)
-
-            if len(patient_data.sample_list) == args.n_samples:  # use only first N samples
-                break
-
-    else:  # if sample names/files are specified directly on cmdline
-
-        # sort order on timepoint or order of entry on cmdline if not present
-        print(args.sample_data)
-        for idx, sample_entry in enumerate(args.sample_data):
-            ##for now, assume input order is of the type sample_id\tmaf_fn\tseg_fn\tpurity\ttimepoint
-            smpl_spec = sample_entry.strip('\n').split(':')
-            sample_id = smpl_spec[0]
-            maf_fn = smpl_spec[1]
-            seg_fn = smpl_spec[2]
-            purity = float(smpl_spec[3])
-            timepoint = float(smpl_spec[4])
-
-            patient_data.addSample(maf_fn, sample_id, timepoint_value=timepoint, grid_size=args.grid_size,
-                                   _additional_muts=None,
-                                   seg_file=seg_fn,
-                                   purity=purity)
-            if len(patient_data.sample_list) == args.n_samples:  # use only first N samples
-                break
+        with open(args.sif) as sif_file:
+            header = sif_file.readline().strip('\n').split('\t')
+            sample_data = sif_file.read().replace('\t', ':').split('\n')
+    else:
+        sample_data = args.sample_data
+        header = ['sample_id', 'maf_fn', 'seg_fn', 'purity', 'timepoint']
+    print(sample_data)
+    for sample_entry in sample_data:
+        ##for now, assume input order is of the type sample_id\tmaf_fn\tseg_fn\tpurity\ttimepoint
+        if not sample_entry.strip():
+            continue
+        smpl_spec = dict(zip(header, sample_entry.split(':')))
+        sample_id = smpl_spec['sample_id']
+        maf_fn = smpl_spec['maf_fn']
+        seg_fn = smpl_spec['seg_fn']
+        purity = float(smpl_spec['purity']) if smpl_spec['purity'] else 1.
+        timepoint = float(smpl_spec['timepoint']) if smpl_spec['timepoint'] else 0.
+        print(timepoint)
+        patient_data.addSample(maf_fn, sample_id, timepoint_value=timepoint, grid_size=args.grid_size,
+                               _additional_muts=None, seg_file=seg_fn, purity=purity, input_type=args.maf_input_type,
+                               seg_input_type=args.seg_input_type)
+        if len(patient_data.sample_list) == args.n_samples:  # use only first N samples
+            break
 
     patient_data.get_arm_level_cn_events()
     patient_data.preprocess_samples()
diff --git a/PhylogicNDT.py b/PhylogicNDT.py
index 7f162ae..5f22219 100644
--- a/PhylogicNDT.py
+++ b/PhylogicNDT.py
@@ -237,6 +237,11 @@ def build_parser():
                             dest='maf_input_type',
                             default='auto')
 
+    clustering.add_argument('--seg_input_type', '-st',
+                            action='store',
+                            dest='seg_input_type',
+                            default='auto')
+
     # BuildTree  Tool
 
     buildtree = subparsers.add_parser("BuildTree", help="BuildTree module for constructing of phylogenetic trees.",

From d358e3fa44e11a6461e20be35f6d19cc30abea77 Mon Sep 17 00:00:00 2001
From: jcha <jcha@broadinstitute.org>
Date: Wed, 2 Sep 2020 13:41:10 -0700
Subject: [PATCH 2/3] Added support in GrowthKinetics for WBC records between
 samples Also added ordering based on time point

---
 GrowthKinetics/GrowthKinetics.py       | 15 +++++++++++-
 GrowthKinetics/GrowthKineticsEngine.py | 32 +++++++++++++++-----------
 PhylogicNDT.py                         |  8 +++----
 3 files changed, 37 insertions(+), 18 deletions(-)

diff --git a/GrowthKinetics/GrowthKinetics.py b/GrowthKinetics/GrowthKinetics.py
index a09b202..fefe548 100644
--- a/GrowthKinetics/GrowthKinetics.py
+++ b/GrowthKinetics/GrowthKinetics.py
@@ -1,5 +1,6 @@
 import logging
 from collections import defaultdict
+import numpy as np
 
 
 def run_tool(args):
@@ -9,8 +10,20 @@ def run_tool(args):
 
     patient_data = Patient.Patient(indiv_name=args.indiv_id)
     mcmc_trace_cell_abundance, num_itertaions = load_mcmc_trace_abundances(args.abundance_mcmc_trace)
+    with open(args.sif, 'r') as f:
+        header = f.readline().strip('\n').split('\t')
+        sample_time_points = {}
+        for line in f:
+            fields = dict(zip(header, line.strip('\n').split('\t')))
+            try:
+                sample_time_points[fields['sample_id']] = float(fields['timepoint'])
+            except ValueError:
+                sample_time_points = None
+                break
+    wbc_time_points = np.array(args.time) if args.time is not None else None
     gk_engine = GrowthKineticsEngine(patient_data, args.wbc)
-    gk_engine.estimate_growth_rate(mcmc_trace_cell_abundance, n_iter=min(num_itertaions, args.n_iter))
+    gk_engine.estimate_growth_rate(mcmc_trace_cell_abundance, wbc_time_points=wbc_time_points,
+                                   sample_time_points=sample_time_points, n_iter=min(num_itertaions, args.n_iter))
 
     # Output and visualization
     import output.PhylogicOutput
diff --git a/GrowthKinetics/GrowthKineticsEngine.py b/GrowthKinetics/GrowthKineticsEngine.py
index d75cbcd..86a2dec 100644
--- a/GrowthKinetics/GrowthKineticsEngine.py
+++ b/GrowthKinetics/GrowthKineticsEngine.py
@@ -2,13 +2,14 @@
 import bisect
 from collections import defaultdict
 from scipy.stats import linregress
+from scipy.interpolate import interp1d
 
 
 class GrowthKineticsEngine:
 
     def __init__(self, patient, wbc):
         self._patient = patient
-        self._wbc = wbc
+        self._wbc = np.array(wbc)
         self._growth_rates = defaultdict(list)
 
     @property
@@ -19,25 +20,30 @@ def growth_rates(self):
     def wbc(self):
         return self._wbc
 
-    def estimate_growth_rate(self, mcmc_trace_cell_abundance, times=None, n_iter=100, conv=1e-40):
+    def estimate_growth_rate(self, mcmc_trace_cell_abundance, sample_time_points, wbc_time_points=None, n_iter=100,
+                             conv=1e-40):
         '''
-        
+
         '''
         # Number of samples for this Patient
-        sample_list = list(mcmc_trace_cell_abundance.keys())
-        time_points = len(sample_list)
+        sample_list = sorted(sample_time_points, key=sample_time_points.__getitem__)
+        sample_time_points_arr = np.array([sample_time_points[s] for s in sample_list])
         n_clusters = len(list(mcmc_trace_cell_abundance[sample_list[0]]))
-        #cluster_rates = defaultdict(list)
+        # cluster_rates = defaultdict(list)
         # If times of samples are not provided, treat as an integers
-        if not times:
-            times = np.array(range(time_points)) + 1
+        if wbc_time_points is None:
+            assert len(self.wbc) == len(sample_list), 'WBCs not corresponding to samples need time points'
+            wbc_time_points = sample_time_points_arr
+        cond = (wbc_time_points >= min(sample_time_points_arr)) & (wbc_time_points <= max(sample_time_points_arr))
+        inner_wbc = self.wbc[cond]
+        inner_wbc_time_points = wbc_time_points[cond]
         for n in range(n_iter):
-            adj_wbc = self._wbc * (1 + np.array([(np.random.random() - 0.5) / 100. for x in range(len(self._wbc))]))
+            adj_wbc = inner_wbc * (1 + (np.random.random(len(inner_wbc)) - .5) / 100.)
             for cluster_id in range(1, n_clusters + 1):
-                cluster_abundances = []
-                for sample_name, sample_abundances in mcmc_trace_cell_abundance.items():
-                    cluster_abundances.append(sample_abundances[cluster_id][n] + conv)
-                cluster_slope = linregress(times, cluster_abundances * adj_wbc).slope
+                cluster_abundances = [mcmc_trace_cell_abundance[sample_id][cluster_id][n] + conv for sample_id in
+                                      sample_list]
+                inner_cluster_abundances = interp1d(sample_time_points_arr, cluster_abundances)(inner_wbc_time_points)
+                cluster_slope = linregress(inner_wbc_time_points, inner_cluster_abundances * adj_wbc).slope
                 self._growth_rates[cluster_id].append(cluster_slope)
 
     def line_fit(self, x, c_idx, fb_x_vals, len_pre_tp, adj_dens):
diff --git a/PhylogicNDT.py b/PhylogicNDT.py
index 5f22219..0c58051 100644
--- a/PhylogicNDT.py
+++ b/PhylogicNDT.py
@@ -301,12 +301,12 @@ def build_parser():
     # GrowthKinetics  Tool
 
     growthkinetics = subparsers.add_parser("GrowthKinetics",
-                                           help="Sample growth rates and fitness across ensemble of trees", parents=[base_parser])    
+                                           help="Sample growth rates and fitness across ensemble of trees", parents=[base_parser])
     growthkinetics.add_argument('--abundance_mcmc_trace', '-ab',
                                 type=str,
                                 action='store',
                                 dest='abundance_mcmc_trace',
-                                help='tsv file generated by CellPopulation')    
+                                help='tsv file generated by CellPopulation')
     growthkinetics.add_argument('--n_iter', '-ni',
                                 type=int,
                                 action='store',
@@ -314,14 +314,14 @@ def build_parser():
                                 default=250,
                                 help='number iterations')
     growthkinetics.add_argument('--wbc', '-w',
-                                type=int,
+                                type=float,
                                 nargs='+',
                                 default=[],
                                 action='store',
                                 dest='wbc',
                                 help='wbc')
     growthkinetics.add_argument('--time', '-t',
-                                type=int,
+                                type=float,
                                 nargs='+',
                                 default=[],
                                 action='store',

From eef8212c4462b46b0e9436d6ab2db9dd5296bfb1 Mon Sep 17 00:00:00 2001
From: jcha <jcha@broadinstitute.org>
Date: Wed, 2 Sep 2020 13:56:47 -0700
Subject: [PATCH 3/3] Added more helpful error message

---
 GrowthKinetics/GrowthKinetics.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/GrowthKinetics/GrowthKinetics.py b/GrowthKinetics/GrowthKinetics.py
index fefe548..f646546 100644
--- a/GrowthKinetics/GrowthKinetics.py
+++ b/GrowthKinetics/GrowthKinetics.py
@@ -18,8 +18,7 @@ def run_tool(args):
             try:
                 sample_time_points[fields['sample_id']] = float(fields['timepoint'])
             except ValueError:
-                sample_time_points = None
-                break
+                raise ValueError('Sample time points in sif file are required')
     wbc_time_points = np.array(args.time) if args.time is not None else None
     gk_engine = GrowthKineticsEngine(patient_data, args.wbc)
     gk_engine.estimate_growth_rate(mcmc_trace_cell_abundance, wbc_time_points=wbc_time_points,