Merge pull request #55 from brianhie/sketching
Add sketch-based acceleration to Scanorama API
brianhie authored Nov 18, 2019
2 parents 794d08d + e63fe9b commit 125e6b4
Showing 6 changed files with 182 additions and 85 deletions.
12 changes: 6 additions & 6 deletions README.md
@@ -4,7 +4,9 @@

Scanorama enables batch-correction and integration of heterogeneous scRNA-seq data sets, which is described in the paper ["Efficient integration of heterogeneous single-cell transcriptomes using Scanorama"](https://www.nature.com/articles/s41587-019-0113-3) by Brian Hie, Bryan Bryson, and Bonnie Berger. This repository contains the Scanorama source code as well as scripts necessary for reproducing the results in the paper.

**Scanorama is designed to be used in scRNA-seq pipelines downstream of noise-reduction methods, including those for imputation and highly-variable gene filtering. The results from Scanorama integration and batch correction can then be used as input to other tools for scRNA-seq clustering, visualization, and analysis.**
Scanorama is designed to be used in scRNA-seq pipelines downstream of noise-reduction methods, including those for imputation and highly-variable gene filtering. The results from Scanorama integration and batch correction can then be used as input to other tools for scRNA-seq clustering, visualization, and analysis.

Scanorama integration can also be greatly accelerated using tools for data sketching, as described in the paper ["Geometric sketching compactly summarizes the single-cell transcriptomic landscape", Cell Systems (2019)](https://www.cell.com/cell-systems/fulltext/S2405-4712\(19\)30152-8) and implemented [here](https://github.com/brianhie/geosketch).
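
A minimal usage sketch of the sketch-based acceleration added in this pull request, mirroring the call in `bin/mouse_brain_sketched.py` below; `datasets` and `genes_list` stand in for your cells-by-genes matrices and per-dataset gene lists, and the parameter values are illustrative:

```
import scanorama

# `datasets`: list of cells-by-genes matrices; `genes_list`: matching per-dataset
# gene lists (placeholders here; prepare them as in the API example usage section).
integrated, genes = scanorama.integrate(
    datasets, genes_list,
    sketch=True,                # enable sketch-based acceleration
    sketch_method='geosketch',  # geometric sketching, as in bin/mouse_brain_sketched.py
    sketch_max=2000,            # sketch size cap used by the example script
)
```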

## API example usage

@@ -88,7 +90,7 @@ cd scanorama/
python setup.py install --user
```

If you are running inside an anaconda environment, first install annoy by doing:
```
conda install -c conda-forge python-annoy
```
@@ -166,11 +168,9 @@ For those interested in the algorithm implementation, `scanorama/scanorama.py` i

- Make sure the input matrices are cells-by-genes, not the transpose.

- For the example scripts, be sure to run `bin/process.py` first, although this is not necessary if you are using Scanorama through the API.

- For large data set integration under memory constraints (e.g., if you run into a `MemoryError`), try lowering the `batch_size` parameter. And stay tuned for more improvements!
- For large data set integration under memory constraints (e.g., if you run into a `MemoryError`), try lowering the `batch_size` parameter to reduce memory usage, or use sketch-based acceleration via the `sketch` parameter to `integrate()` to reduce both memory usage and runtime (see the example after this list).

- Scanorama versions 0.2 through 0.6.1 had default parameters that resulted in non-optimal batch correction results (integration was unaffected). Upgrade to the latest version for a fix to this issue.
- For the example scripts, be sure to run `bin/process.py` first, although this is not necessary if you are using Scanorama through the API.
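
A minimal sketch of the memory tip above, assuming `datasets` and `genes_list` are prepared as in the API example usage section; the `batch_size` value is illustrative, not a recommended default:

```
import scanorama

# Smaller batches trade some runtime for lower peak memory; combine with the
# `sketch` option shown above to reduce both memory usage and runtime.
corrected, genes = scanorama.correct(
    datasets, genes_list,
    batch_size=1000,  # lower this further if a MemoryError persists
)
```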

## Questions

14 changes: 6 additions & 8 deletions bin/mouse_brain.py
@@ -28,25 +28,23 @@

if __name__ == '__main__':
    process(data_names, min_trans=100)

    datasets, genes_list, n_cells = load_names(data_names)

    datasets, genes = merge_datasets(datasets, genes_list, ds_names=data_names)

    datasets_dimred, genes = process_data(datasets, genes, verbose=True)

    t0 = time()
    datasets_dimred = assemble(
        datasets_dimred, batch_size=BATCH_SIZE,
        geosketch=True, geosketch_max=6900
    )
    print('Integrated panoramas in {:.3f}s'.format(time() - t0))

    t0 = time()
    datasets_dimred, datasets, genes = correct(
        datasets, genes_list, ds_names=data_names,
        return_dimred=True, batch_size=BATCH_SIZE,
        geosketch=True, geosketch_max=6900
    )
    print('Integrated and batch corrected panoramas in {:.3f}s'
          .format(time() - t0))
@@ -59,12 +57,12 @@
        names.append(data_names[i])
        curr_label += 1
    labels = np.array(labels, dtype=int)

    mouse_brain_genes = [
        'Gja1', 'Flt1', 'Gabra6', 'Syt1', 'Gabrb2', 'Gabra1',
        'Meg3', 'Mbp', 'Rgs5',
    ]

    # Downsample for visualization purposes
    datasets_dimred = []
    for i in range(len(data_names)):
@@ -83,7 +81,7 @@
                          image_suffix='.png')
    np.savetxt('data/{}_embedding.txt'.format(NAMESPACE),
               embedding, delimiter='\t')

    cell_labels = (
        open('data/cell_labels/mouse_brain_cluster.txt')
        .read().rstrip().split()
87 changes: 87 additions & 0 deletions bin/mouse_brain_sketched.py
@@ -0,0 +1,87 @@
import numpy as np
from scanorama import *
from scipy.sparse import vstack
from sklearn.preprocessing import normalize, LabelEncoder
import sys
from time import time

from benchmark import write_table
from process import load_names, process

np.random.seed(0)

NAMESPACE = 'mouse_brain_sketched'
BATCH_SIZE = 10000

data_names = [
    'data/mouse_brain/nuclei',
    'data/mouse_brain/dropviz/Cerebellum_ALT',
    'data/mouse_brain/dropviz/Cortex_noRep5_FRONTALonly',
    'data/mouse_brain/dropviz/Cortex_noRep5_POSTERIORonly',
    'data/mouse_brain/dropviz/EntoPeduncular',
    'data/mouse_brain/dropviz/GlobusPallidus',
    'data/mouse_brain/dropviz/Hippocampus',
    'data/mouse_brain/dropviz/Striatum',
    'data/mouse_brain/dropviz/SubstantiaNigra',
    'data/mouse_brain/dropviz/Thalamus',
]

if __name__ == '__main__':
    process(data_names, min_trans=100)

    datasets, genes_list, n_cells = load_names(data_names)

    datasets, genes = merge_datasets(datasets, genes_list, ds_names=data_names)

    datasets_dimred, genes = process_data(datasets, genes, verbose=True)

    t0 = time()
    datasets_dimred, genes = integrate(
        datasets, genes_list, ds_names=data_names,
        sketch=True, sketch_method='geosketch', sketch_max=2000,
    )
    print('Sketched and integrated panoramas in {:.3f}s'
          .format(time() - t0))

    labels = []
    names = []
    curr_label = 0
    for i, a in enumerate(datasets_dimred):
        labels += list(np.zeros(a.shape[0]) + curr_label)
        names.append(data_names[i])
        curr_label += 1
    labels = np.array(labels, dtype=int)

    mouse_brain_genes = [
        'Gja1', 'Flt1', 'Gabra6', 'Syt1', 'Gabrb2', 'Gabra1',
        'Meg3', 'Mbp', 'Rgs5',
    ]

    # Downsample for visualization purposes
    for i in range(len(data_names)):
        ds = datasets_dimred[i]
        rand_idx = np.random.choice(ds.shape[0], size=int(ds.shape[0]/10),
                                    replace=False)
        datasets_dimred[i] = ds[rand_idx, :]
        datasets[i] = datasets[i][rand_idx, :]

    embedding = visualize(datasets_dimred,
                          labels, NAMESPACE + '_ds', names,
                          gene_names=mouse_brain_genes, genes=genes,
                          gene_expr=vstack(datasets),
                          multicore_tsne=True,
                          image_suffix='.png')

    cell_labels = (
        open('data/cell_labels/mouse_brain_cluster.txt')
        .read().rstrip().split()
    )
    le = LabelEncoder().fit(cell_labels)
    cell_labels = le.transform(cell_labels)
    cell_types = le.classes_

    visualize(None,
              cell_labels, NAMESPACE + '_type', cell_types,
              embedding=embedding, image_suffix='.png')
2 changes: 2 additions & 0 deletions scanorama/__init__.py
@@ -1 +1,3 @@
from .scanorama import *

__version__ = '1.5'
