broadinstitute · gwaybio · Mar 25, 2021 · Mar 25, 2021 · Mar 25, 2021 · May 12, 2021
diff --git a/0.preprocess-sites/1.process-spots.py b/0.preprocess-sites/1.process-spots.py
@@ -72,7 +72,8 @@
 control_barcodes = config["experiment"]["control_barcode_ids"]
 
 id_cols = config["options"]["core"]["cell_id_cols"]
-spot_parent_cols = config["options"]["core"]["cell_match_cols"]["spots"]
+parent_cols = config["options"]["core"]["cell_match_cols"]
+spot_parent_cols = parent_cols["spots"]
 ignore_files = config["options"]["core"]["ignore_files"]
 cell_filter = config["options"]["core"]["cell_quality"]["cell_filter"]
 quality_func = config["options"]["core"]["cell_quality"]["categorize_cell_quality"]
@@ -92,6 +93,7 @@
 foci_cols = spot_config["foci_cols"]
 force = spot_config["force_overwrite"]
 perform = spot_config["perform"]
+exact_match_reads_col = spot_config["exact_match_reads_col"]
 
 # check if this step should be performed
 if not perform:
@@ -195,6 +197,49 @@
     num_unassigned_spots = null_spot_df.shape[0]
     num_assigned_spots = cell_spot_df.shape[0]
 
+    # Table 1 - Number of cells with exact match reads
+    # Note, this includes cells with more than 1 perfect barcode
+    # First, select only spots with perfect match to barcode library
+    perfect_match_barcodes_df = complete_foci_df.loc[
+        (complete_foci_df.loc[:, spot_score_cols] == 1).squeeze(), :
+    ]
+
+    # Count perfect spots outside of cells (parent column equal to 0);
+    # they are reported below as the row with an exact match read count of 0
+    perfect_match_no_cell_df = perfect_match_barcodes_df.loc[
+        (perfect_match_barcodes_df.loc[:, parent_cols["spots"]] == 0).squeeze(), :
+    ]
+    perfect_match_no_cell_df = pd.Series(
+        [0, perfect_match_no_cell_df.shape[0]],
+        index=[exact_match_reads_col, "cell_count"],
+    )
+
+    # Next, drop spots outside of cells
+    perfect_match_barcodes_df = perfect_match_barcodes_df.loc[
+        (perfect_match_barcodes_df.loc[:, parent_cols["spots"]] != 0).squeeze(), :
+    ]
+
+    # Compile Table 1: count perfect reads per cell, then count the cells that
+    # share each read count. Column names are set explicitly because the labels
+    # produced by value_counts().reset_index() differ between pandas versions,
+    # and pd.concat is used because DataFrame.append was removed in pandas 2.0.
+    perfect_match_barcodes_df = (
+        perfect_match_barcodes_df.groupby(parent_cols["spots"])[id_cols[0]]
+        .count()
+        .value_counts()
+        .rename_axis(exact_match_reads_col)
+        .reset_index(name="cell_count")
+    )
+    perfect_match_barcodes_df = pd.concat(
+        [perfect_match_barcodes_df, perfect_match_no_cell_df.to_frame().T],
+        ignore_index=True,
+    )
+
+    # Output table
+    out_file = pathlib.Path(output_dir, "exact_match_barcode_reads_per_cell_counts.tsv")
+    if check_if_write(out_file, force):
+        perfect_match_barcodes_df.to_csv(out_file, sep="\t", index=False)
+
     # Figure 1 - histogram of barcode counts per cell
     fig_file = pathlib.Path(output_dir, "num_spots_per_cell_histogram.png")
     if check_if_write(fig_file, force):
@@ -242,7 +282,7 @@
     )
     num_unique_genes = len(crispr_barcode_gene_df.loc[:, gene_cols].squeeze().unique())
 
-    # Table 1 - Full cell and CRISPR guide quality with scores
+    # Table 2 - Full cell and CRISPR guide quality with scores
     out_file = pathlib.Path(
         output_dir, "cell_id_barcode_alignment_scores_by_guide.tsv.gz"
     )
@@ -251,7 +291,7 @@
             out_file, sep="\t", index=False, compression="gzip"
         )
 
-    # Table 2 - Cell Category Summary
+    # Table 3 - Cell Category Summary
     cell_quality_summary_df = cell_quality.summarize_cell_quality_counts(
         quality_df=crispr_barcode_gene_df, parent_cols=spot_parent_cols
     ).assign(
@@ -266,7 +306,7 @@
     if check_if_write(out_file, force):
         cell_quality_summary_df.to_csv(out_file, sep="\t", index=False)
 
-    # Table 3 - Counting gene and guide by cell category
+    # Table 4 - Counting gene and guide by cell category
     gene_category_count_df = cell_quality.summarize_perturbation_quality_counts(
         quality_df=crispr_barcode_gene_df,
         parent_cols=spot_parent_cols,

diff --git a/0.preprocess-sites/2.process-cells.py b/0.preprocess-sites/2.process-cells.py
@@ -189,7 +189,10 @@
     # Add the cell quality metadata to the df
     metadata_df = (
         metadata_df.merge(
-            cell_category_df, left_on=quality_idx, right_index=True, how="left",
+            cell_category_df,
+            left_on=quality_idx,
+            right_index=True,
+            how="left",
         )
         .sort_values(by=cell_sort_col)
         .drop_duplicates(subset=[cell_sort_col, quality_idx])
@@ -204,7 +207,12 @@
     cell_count_df = (
         pd.DataFrame(metadata_df.loc[:, quality_col].value_counts())
         .rename(columns={quality_col: "cell_count"})
-        .assign(site=site, plate=plate, well=well, site_location=site_location,)
+        .assign(
+            site=site,
+            plate=plate,
+            well=well,
+            site_location=site_location,
+        )
     )
 
     output_folder = pathlib.Path(output_paintdir, site)

diff --git a/0.preprocess-sites/3.visualize-cell-summary.py b/0.preprocess-sites/3.visualize-cell-summary.py
@@ -183,7 +183,9 @@
     + gg.ylab("Cell Count")
     + gg.ggtitle(f"{all_cells} Total Cells")
     + gg.scale_fill_manual(
-        name="Cell Quality", labels=cell_category_order, values=cell_category_colors,
+        name="Cell Quality",
+        labels=cell_category_order,
+        values=cell_category_colors,
     )
 )
 
@@ -206,7 +208,9 @@
     + gg.ylab("Cell Count")
     + gg.facet_wrap("~Cell_Quality")
     + gg.scale_fill_manual(
-        name="Cell Quality", labels=cell_category_order, values=cell_category_colors,
+        name="Cell Quality",
+        labels=cell_category_order,
+        values=cell_category_colors,
     )
     + gg.theme(strip_background=gg.element_rect(colour="black", fill="#fdfff4"))
 )

diff --git a/scripts/cell_quality_utils.py b/scripts/cell_quality_utils.py
@@ -53,6 +53,8 @@ def __init__(
             self.categorize = simple_categorize
         elif self.method == "simple_plus":
             self.categorize = simple_plus_categorize
+        elif self.method == "feldman":
+            self.categorize = feldman_categorize
 
         category_dict = self.define_cell_quality()
         self.category_df = (
@@ -65,12 +67,14 @@ def __init__(
     def define_cell_quality(self):
         return get_cell_quality_dict(method=self.method)
 
-    def assign_cell_quality(self, count_df, parent_cols, score_col):
+    def assign_cell_quality(self, count_df, parent_cols, score_col, barcode_col):
 
         quality_estimate_df = (
             pd.DataFrame(
                 count_df.groupby(parent_cols).apply(
-                    lambda x: self.categorize(x, score_col=score_col)
+                    lambda x: self.categorize(
+                        x, score_col=score_col, barcode_col=barcode_col
+                    )
                 ),
                 columns=[self.category_col_index],
             )
@@ -133,12 +137,20 @@ def get_cell_quality_dict(method):
             4: "Imperfect-Low",
             5: "Bad",
         },
+        "feldman": {
+            1: "Keep",
+            2: "Toss_No_Perfect",
+            3: "Toss_Multiple_Perfect",
+            4: "Toss_Minority_Perfect",
+        },
     }
 
     return cell_quality_dict[method]
 
 
-def simple_categorize(parent_cell, score_col, avg_col="mean", count_col="count"):
+def simple_categorize(
+    parent_cell, score_col, barcode_col=None, avg_col="mean", count_col="count"
+):
 
     score_col_avg = f"{score_col}_{avg_col}"
     count_col_avg = f"{score_col}_{count_col}"
@@ -148,8 +160,8 @@ def simple_categorize(parent_cell, score_col, avg_col="mean", count_col="count")
     )
 
     num_barcodes = parent_cell.shape[0]
-    max_score = max(parent_cell.Barcode_MatchedTo_Score_mean)
-    max_count = max(parent_cell.Barcode_MatchedTo_Score_count)
+    max_score = max(parent_cell.loc[:, score_col_avg])
+    max_count = max(parent_cell.loc[:, count_col_avg])
 
     if num_barcodes == 1:
         if max_score == 1:
@@ -177,7 +189,9 @@ def simple_categorize(parent_cell, score_col, avg_col="mean", count_col="count")
     return score
 
 
-def simple_plus_categorize(parent_cell, score_col, avg_col="mean", count_col="count"):
+def simple_plus_categorize(
+    parent_cell, score_col, barcode_col=None, avg_col="mean", count_col="count"
+):
 
     score_col_avg = f"{score_col}_{avg_col}"
     count_col_avg = f"{score_col}_{count_col}"
@@ -187,8 +201,8 @@ def simple_plus_categorize(parent_cell, score_col, avg_col="mean", count_col="co
     )
 
     num_barcodes = parent_cell.shape[0]
-    max_score = max(parent_cell.Barcode_MatchedTo_Score_mean)
-    max_count = max(parent_cell.Barcode_MatchedTo_Score_count)
+    max_score = max(parent_cell.loc[:, score_col_avg])
+    max_count = max(parent_cell.loc[:, count_col_avg])
 
     if num_barcodes == 1:
         if max_score == 1:
@@ -217,3 +231,57 @@ def simple_plus_categorize(parent_cell, score_col, avg_col="mean", count_col="co
                 else:
                     score = 5
     return score
+
+
+def feldman_categorize(
+    parent_cell,
+    score_col,
+    barcode_col="Barcode_MatchedTo_Barcode",
+    avg_col=None,
+    count_col=None,
+):
+    """
+    Assign a "feldman" quality category to the barcode reads of a single cell.
+
+    Note that the Feldman categorize function works on non-averaged scores:
+    every row of parent_cell is counted as one read.
+
+    Arguments:
+    parent_cell - pandas DataFrame holding the rows of one group
+    score_col - name of the column with the alignment score of each row
+    barcode_col - name of the column with the barcode each row was matched to
+    avg_col, count_col - not used; present so that the signature lines up with
+        the other categorize functions
+
+    Output:
+    Integer category, matching the "feldman" entry of get_cell_quality_dict():
+    1 (Keep) - the top scoring rows reach a score of 1, share a single barcode,
+        and make up more than half of all rows
+    2 (Toss_No_Perfect) - no row reaches a score of 1; this also covers an
+        empty group or one where every score is missing
+    3 (Toss_Multiple_Perfect) - the top scoring rows name more than one barcode
+    4 (Toss_Minority_Perfect) - the top scoring rows share a single barcode but
+        make up half of all rows or fewer
+    """
+    num_barcodes = parent_cell.shape[0]
+    scores = parent_cell.loc[:, score_col]
+
+    # Series.max() skips missing scores. The builtin max() does not, so its
+    # result depended on where a NaN sat in the column, and it raised on an
+    # empty group.
+    max_score = scores.max()
+
+    # Written as a negated ">=" so that a NaN maximum also lands here
+    if not (max_score >= 1):
+        return 2
+
+    # Select the top scoring rows with a boolean mask
+    has_max_score = scores == max_score
+    perfect_matches = parent_cell.loc[has_max_score, :]
+
+    if len(perfect_matches.loc[:, barcode_col].unique()) != 1:
+        return 3
+
+    # Fraction of all rows in this cell that carry the top score
+    top_barcode_ratio = has_max_score.sum() / num_barcodes
+    return 1 if top_barcode_ratio > 0.5 else 4