From 9a35e086a5e1ae56731a522bfe13204b535d541d Mon Sep 17 00:00:00 2001 From: Greg Way Date: Tue, 23 Oct 2018 11:22:02 -0400 Subject: [PATCH] Compression Feature Comparison Data Generation (#89) * add tybalt to conda env * git lfs all gz * add compression script * add all compressed data * rename nf1 feature comparison script * its 2018 now --- .gitattributes | 4 +- environment.yml | 2 + .../compressed_feature_comparison.sh | 0 feature_comparison/compression.py | 102 ++++++++++++++++++ .../data/dae_pancanatlas_z100.tsv.gz | 3 + .../data/ica_pancanatlas_z100.tsv.gz | 3 + .../data/nmf_pancanatlas_z100.tsv.gz | 3 + .../data/pca_pancanatlas_z100.tsv.gz | 3 + .../data/vae_pancanatlas_z100.tsv.gz | 3 + 9 files changed, 121 insertions(+), 2 deletions(-) rename nf1_feature_comparison.sh => feature_comparison/compressed_feature_comparison.sh (100%) create mode 100644 feature_comparison/compression.py create mode 100644 feature_comparison/data/dae_pancanatlas_z100.tsv.gz create mode 100644 feature_comparison/data/ica_pancanatlas_z100.tsv.gz create mode 100644 feature_comparison/data/nmf_pancanatlas_z100.tsv.gz create mode 100644 feature_comparison/data/pca_pancanatlas_z100.tsv.gz create mode 100644 feature_comparison/data/vae_pancanatlas_z100.tsv.gz diff --git a/.gitattributes b/.gitattributes index 6983435..3020705 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,2 +1,2 @@ -data/*.gz filter=lfs diff=lfs merge=lfs -text -scripts/snaptron/*.gz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text + diff --git a/environment.yml b/environment.yml index 2dc8936..564297d 100644 --- a/environment.yml +++ b/environment.yml @@ -27,3 +27,5 @@ dependencies: - conda-forge::r-cowplot=0.9.3 - conda-forge::r-hmisc=4.0_3 - bioconda::biopython=1.70 +- pip: + - git+https://github.com/greenelab/tybalt@1b4ad8f8032c66b60c2397307f8d396099946416 diff --git a/nf1_feature_comparison.sh b/feature_comparison/compressed_feature_comparison.sh similarity index 100% rename 
from nf1_feature_comparison.sh rename to feature_comparison/compressed_feature_comparison.sh diff --git a/feature_comparison/compression.py b/feature_comparison/compression.py new file mode 100644 index 0000000..e83e4ed --- /dev/null +++ b/feature_comparison/compression.py @@ -0,0 +1,102 @@ +""" +Gregory Way 2018 +PanCancer Classifier +feature_comparison/compression.py + +Perform a series of compression algorithms on the PanCanAtlas RNAseq data. The +compressed features (z) are input as features for gene alteration predictions. + +Usage: +Run in command line from within the `feature_comparison` directory + + python compression.py + +Output: +Compressed gene expression features in the `feature_comparison/data` folder +""" + +import os +import numpy as np +import pandas as pd +from statsmodels.robust.scale import mad + +from tybalt.data_models import DataModel + +np.random.seed(123) + +# Load constants +num_genes_kept = 8000 +num_components = 100 + +vae_epochs = 100 +vae_batch_size = 150 +vae_lr = 0.001 + +dae_epochs = 100 +dae_batch_size = 50 +dae_lr = 0.0005 +dae_noise = 0 +dae_sparsity = 0 + +algorithms = ['pca', 'ica', 'nmf', 'dae', 'vae'] + +# Load and Process Data +expr_file = os.path.join('..', 'data', 'pancan_rnaseq_freeze.tsv.gz') +rnaseq_df = pd.read_table(expr_file, index_col=0) + +# Subset x matrix to MAD genes +med_dev = pd.DataFrame(mad(rnaseq_df), index=rnaseq_df.columns) +mad_genes = ( + med_dev.sort_values(by=0, ascending=False) + .iloc[0:num_genes_kept] + .index + .tolist() +) + +rnaseq_df = rnaseq_df.loc[:, mad_genes] + +# Initialize DataModel class with PanCanAtlas RNAseq +dm = DataModel(df=rnaseq_df) + +# Transform the input matrix into a range between zero and one +dm.transform(how='zeroone') + +# Fit models +dm.pca(n_components=num_components) +dm.ica(n_components=num_components) +dm.nmf(n_components=num_components) + +dm.nn(n_components=num_components, + model='tybalt', + loss='binary_crossentropy', + epochs=int(vae_epochs), + batch_size=int(vae_batch_size), + learning_rate=float(vae_lr), + 
separate_loss=False, + verbose=False) + +dm.nn(n_components=num_components, + model='adage', + loss='binary_crossentropy', + epochs=int(dae_epochs), + batch_size=int(dae_batch_size), + learning_rate=float(dae_lr), + noise=float(dae_noise), + sparsity=float(dae_sparsity), + verbose=False) + +# Output compressed features to files +pca_file = os.path.join('data', 'pca_pancanatlas_z100.tsv.gz') +dm.pca_df.to_csv(pca_file, sep='\t', compression='gzip') + +ica_file = os.path.join('data', 'ica_pancanatlas_z100.tsv.gz') +dm.ica_df.to_csv(ica_file, sep='\t', compression='gzip') + +nmf_file = os.path.join('data', 'nmf_pancanatlas_z100.tsv.gz') +dm.nmf_df.to_csv(nmf_file, sep='\t', compression='gzip') + +dae_file = os.path.join('data', 'dae_pancanatlas_z100.tsv.gz') +dm.adage_df.to_csv(dae_file, sep='\t', compression='gzip') + +vae_file = os.path.join('data', 'vae_pancanatlas_z100.tsv.gz') +dm.tybalt_df.to_csv(vae_file, sep='\t', compression='gzip') diff --git a/feature_comparison/data/dae_pancanatlas_z100.tsv.gz b/feature_comparison/data/dae_pancanatlas_z100.tsv.gz new file mode 100644 index 0000000..9cc71cb --- /dev/null +++ b/feature_comparison/data/dae_pancanatlas_z100.tsv.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36887beeb247c114ceb4e691a819388ccf5449388759d07ca8c07df194ac7ddd +size 2104754 diff --git a/feature_comparison/data/ica_pancanatlas_z100.tsv.gz b/feature_comparison/data/ica_pancanatlas_z100.tsv.gz new file mode 100644 index 0000000..247cc55 --- /dev/null +++ b/feature_comparison/data/ica_pancanatlas_z100.tsv.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb02ab85282696d6a3610d17e540027b9a2516f59a9afeb013e1460b88312f02 +size 8482594 diff --git a/feature_comparison/data/nmf_pancanatlas_z100.tsv.gz b/feature_comparison/data/nmf_pancanatlas_z100.tsv.gz new file mode 100644 index 0000000..6bfc772 --- /dev/null +++ b/feature_comparison/data/nmf_pancanatlas_z100.tsv.gz @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:610e823221a7f642ca657cb46fa9a48ca99aa22cb715fb45e2f2879dec81f52e +size 4469365 diff --git a/feature_comparison/data/pca_pancanatlas_z100.tsv.gz b/feature_comparison/data/pca_pancanatlas_z100.tsv.gz new file mode 100644 index 0000000..346de26 --- /dev/null +++ b/feature_comparison/data/pca_pancanatlas_z100.tsv.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b8ec52a1a49623222b1ef2dbbb13f9674486d9121fefdb4f3d284a1365f16d2 +size 8359088 diff --git a/feature_comparison/data/vae_pancanatlas_z100.tsv.gz b/feature_comparison/data/vae_pancanatlas_z100.tsv.gz new file mode 100644 index 0000000..1f4b3d9 --- /dev/null +++ b/feature_comparison/data/vae_pancanatlas_z100.tsv.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e06693168c999dc055c0e0167fa1fbd09ffbf9611e95f1016c67b2e2c734d9f1 +size 2966577