From 0b38faeb19833acf31c3eb8451c2484723f86cb1 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Fri, 29 May 2020 20:36:05 -0400 Subject: [PATCH 01/10] Add GCT file, rerun notebook --- .gitattributes | 1 + ..._04_01_a549_48hr_batch1_consensus_modz.gct | 3 + consensus/build-consensus-signatures.ipynb | 70 +++++++++++++++++++ .../nbconverted/build-consensus-signatures.py | 25 +++++++ 4 files changed, 99 insertions(+) create mode 100644 consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz.gct diff --git a/.gitattributes b/.gitattributes index c9a6185..4f609e1 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1 +1,2 @@ *.gz filter=lfs diff=lfs merge=lfs -text +*.gct filter=lfs diff=lfs merge=lfs -text diff --git a/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz.gct b/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz.gct new file mode 100644 index 0000000..574a972 --- /dev/null +++ b/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz.gct @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:248315f0da9fc74f28dfaaa17f073d324d8198d638f4c195cbff3d7549e77093 +size 297655274 diff --git a/consensus/build-consensus-signatures.ipynb b/consensus/build-consensus-signatures.ipynb index 7baa3bb..079a4e4 100644 --- a/consensus/build-consensus-signatures.ipynb +++ b/consensus/build-consensus-signatures.ipynb @@ -552,6 +552,76 @@ " consensus_file, sep=\",\", compression=\"gzip\", float_format=\"%5g\", index=False\n", " )" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save whole plate MODZ consensus signature as GCT\n", + "\n", + "Whole-plate-normalized + MODZ aggregated consensus profiles will be made available on clue.io/morphology as a GCT file." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Now Writing: Consensus Operation: modz; Norm Strategy: whole_plate\n", + "File: 2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz.gct\n", + "(10752, 1788)\n" + ] + }, + { + "data": { + "application/javascript": [ + "\n", + " setTimeout(function() {\n", + " var nbb_cell_id = 10;\n", + " var nbb_unformatted_code = \"import pycytominer.write_gct\\n\\noperation = \\\"modz\\\"\\nnorm_strat = \\\"whole_plate\\\"\\nfile_suffix = \\\".gct\\\"\\nconsensus_file = f\\\"{batch}_consensus_{operation}{file_suffix}\\\"\\nconsensus_file = pathlib.Path(batch, consensus_file)\\n\\nconsensus_df = all_consensus_dfs[norm_strat][operation]\\n\\nprint(\\n f\\\"Now Writing: Consensus Operation: {operation}; Norm Strategy: {norm_strat}\\\\nFile: {consensus_file}\\\"\\n)\\nprint(consensus_df.shape)\\n\\npycytominer.write_gct(consensus_df, consensus_file)\";\n", + " var nbb_formatted_code = \"import pycytominer.write_gct\\n\\noperation = \\\"modz\\\"\\nnorm_strat = \\\"whole_plate\\\"\\nfile_suffix = \\\".gct\\\"\\nconsensus_file = f\\\"{batch}_consensus_{operation}{file_suffix}\\\"\\nconsensus_file = pathlib.Path(batch, consensus_file)\\n\\nconsensus_df = all_consensus_dfs[norm_strat][operation]\\n\\nprint(\\n f\\\"Now Writing: Consensus Operation: {operation}; Norm Strategy: {norm_strat}\\\\nFile: {consensus_file}\\\"\\n)\\nprint(consensus_df.shape)\\n\\npycytominer.write_gct(consensus_df, consensus_file)\";\n", + " var nbb_cells = Jupyter.notebook.get_cells();\n", + " for (var i = 0; i < nbb_cells.length; ++i) {\n", + " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", + " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", + " nbb_cells[i].set_text(nbb_formatted_code);\n", + " }\n", + " break;\n", + " }\n", + " }\n", + " }, 500);\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import pycytominer.write_gct\n", + "\n", + "operation = \"modz\"\n", + "norm_strat = \"whole_plate\"\n", + "file_suffix = \".gct\"\n", + "consensus_file = f\"{batch}_consensus_{operation}{file_suffix}\"\n", + "consensus_file = pathlib.Path(batch, consensus_file)\n", + "\n", + "consensus_df = all_consensus_dfs[norm_strat][operation]\n", + "\n", + "print(\n", + " f\"Now Writing: Consensus Operation: {operation}; Norm Strategy: {norm_strat}\\nFile: {consensus_file}\"\n", + ")\n", + "print(consensus_df.shape)\n", + "\n", + "pycytominer.write_gct(consensus_df, consensus_file)" + ] } ], "metadata": { diff --git a/consensus/scripts/nbconverted/build-consensus-signatures.py b/consensus/scripts/nbconverted/build-consensus-signatures.py index 1ea23ea..4a323f5 100644 --- a/consensus/scripts/nbconverted/build-consensus-signatures.py +++ b/consensus/scripts/nbconverted/build-consensus-signatures.py @@ -207,3 +207,28 @@ def consensus_apply(df, operation, cp_features, replicate_cols): consensus_file, sep=",", compression="gzip", float_format="%5g", index=False ) + +# ## Save whole plate MODZ consensus signature as GCT +# +# Whole-plate-normalized + MODZ aggregated consensus profiles will be made available on clue.io/morphology as a GCT file. 
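+# (For reference, a GCT file is a tab-delimited text matrix that begins with a short header,
+# a version line and a dimensions line, followed by the profile matrix and its metadata; the
+# exact header layout is assumed to be handled by pycytominer's write_gct helper below rather
+# than constructed by hand.)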
+ +# In[10]: + + +import pycytominer.write_gct + +operation = "modz" +norm_strat = "whole_plate" +file_suffix = ".gct" +consensus_file = f"{batch}_consensus_{operation}{file_suffix}" +consensus_file = pathlib.Path(batch, consensus_file) + +consensus_df = all_consensus_dfs[norm_strat][operation] + +print( + f"Now Writing: Consensus Operation: {operation}; Norm Strategy: {norm_strat}\nFile: {consensus_file}" +) +print(consensus_df.shape) + +pycytominer.write_gct(consensus_df, consensus_file) + From 943bd53fc475730671e7d03022c5bd5b21759d23 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Sat, 30 May 2020 00:07:25 -0400 Subject: [PATCH 02/10] Handle gzip diffs --- .gitattributes | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitattributes b/.gitattributes index 4f609e1..945b849 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,2 +1,2 @@ -*.gz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=gzip merge=lfs -text *.gct filter=lfs diff=lfs merge=lfs -text From 598e30ca824740fbd6ee42a0f649f21bf51d72e6 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Sat, 30 May 2020 13:58:21 -0400 Subject: [PATCH 03/10] Update .gitattributes Revert git diff because we no longer need that --- .gitattributes | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitattributes b/.gitattributes index 945b849..4f609e1 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,2 +1,2 @@ -*.gz filter=lfs diff=gzip merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text *.gct filter=lfs diff=lfs merge=lfs -text From 0ed2f59671a4d2ba0c879a32bcf94c8595931637 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Tue, 9 Mar 2021 14:55:50 -0500 Subject: [PATCH 04/10] Update README and include pip --- README.md | 10 +++++++--- consensus/README.md | 12 ++++++++++++ environment.yml | 1 + 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 892ac58..010fa1e 100644 --- a/README.md +++ b/README.md @@ -2,11 +2,11 @@ The repository stores data and data processing scripts for **a subset** of the [Broad Drug Repurposing Hub](https://clue.io/repurposing#home) collection of compounds. -In this project, the [Connectivity Map](https://clue.io/team) team perturbed A549 cells with ~1,500 compounds across 6 doses in 5 technical replicates. +In this project, the [Connectivity Map](https://clue.io/team) team perturbed A549 cells with 1,571 compounds across 6 doses in 5 technical replicates. We refer to this dataset as `LINCS Pilot 1`. -For a specific list of compounds tested, see [`metadata`](https://github.com/broadinstitute/lincs-cell-painting/tree/master/metadata). -Information about the compounds can be interactively explored in the [CLUE Repurposing app](https://clue.io/repurposing-app). +For a specific list of compounds tested, see [`metadata`](https://github.com/broadinstitute/lincs-cell-painting/tree/master/metadata). +Information about the compounds can be interactively explored in the [CLUE Repurposing app](https://clue.io/repurposing-app). The [Morphology Connectivity Hub](https://clue.io/morphology) is the primary source of this dataset. ## Image-Based Profiling @@ -23,6 +23,10 @@ For more details about image-based profiling in general, please refer to [Caiced We use [conda](https://docs.conda.io/en/latest/) to manage the computational environment. +To install conda see [instructions](https://docs.conda.io/en/latest/miniconda.html). + +We recommend installing conda by downloading and executing the `.sh` file and accepting defaults. 
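+
+As a minimal sketch, on Linux that download-and-run step might look like the following (the installer filename is an assumption; pick the installer matching your platform from the Miniconda download page):
+
+```bash
+# download the Miniconda installer and run it, accepting the default prompts
+wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
+bash Miniconda3-latest-Linux-x86_64.sh
+```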
+ After installing conda, execute the following to install and navigate to the environment: ```bash diff --git a/consensus/README.md b/consensus/README.md index a3e628f..1db32c8 100644 --- a/consensus/README.md +++ b/consensus/README.md @@ -39,3 +39,15 @@ We then recode the dose points into ascending numerical levels and add a new met Note we generated per-well DMSO consensus signatures and per compound-dose pair consensus signatures for compounds. The per-well DMSO profiles can help to assess plate-associated batch effects. + +## Reproduce Pipeline + +The pipeline can be reproduced by executing the following: + +```bash +# Make sure conda environment is activated +conda activate lincs + +# Reproduce thepipeline for producing bulk signatures +python scripts/build-consensus-signatures.py +``` diff --git a/environment.yml b/environment.yml index c093792..92695c1 100644 --- a/environment.yml +++ b/environment.yml @@ -2,6 +2,7 @@ name: lincs channels: - conda-forge dependencies: +- pip=21.0.1 - conda-forge::pandas=1.0.1 - conda-forge::tabulate=0.8.7 - conda-forge::jupyter=1.0.0 From 766160339a6aba4b8ee84a4a610a75d5b39c8a64 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Tue, 9 Mar 2021 15:15:16 -0500 Subject: [PATCH 05/10] typo --- consensus/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/consensus/README.md b/consensus/README.md index 1db32c8..112f95a 100644 --- a/consensus/README.md +++ b/consensus/README.md @@ -49,5 +49,5 @@ The pipeline can be reproduced by executing the following: conda activate lincs # Reproduce thepipeline for producing bulk signatures -python scripts/build-consensus-signatures.py +ipython scripts/nbconverted/build-consensus-signatures.py ``` From 8769c38b163c47506cd84d801db50c04aa8a27df Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Tue, 9 Mar 2021 22:15:21 -0500 Subject: [PATCH 06/10] docs --- consensus/README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/consensus/README.md b/consensus/README.md index 112f95a..24ea7d4 100644 --- a/consensus/README.md +++ b/consensus/README.md @@ -51,3 +51,9 @@ conda activate lincs # Reproduce thepipeline for producing bulk signatures ipython scripts/nbconverted/build-consensus-signatures.py ``` + +`scripts/nbconverted/*.py` were created from the Jupyter notebooks in this folder, like this: + +```sh +jupyter nbconvert --to=script --FilesWriter.build_directory=scripts/nbconverted *.ipynb +``` \ No newline at end of file From 2409017b82c88b0c41ca4a8f2b1dd0d42fa5fa3f Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Tue, 9 Mar 2021 22:16:55 -0500 Subject: [PATCH 07/10] add more columns, include feature selection --- consensus/build-consensus-signatures.ipynb | 176 +++++++++++++----- .../nbconverted/build-consensus-signatures.py | 103 ++++++++-- 2 files changed, 219 insertions(+), 60 deletions(-) diff --git a/consensus/build-consensus-signatures.ipynb b/consensus/build-consensus-signatures.ipynb index 079a4e4..444ceef 100644 --- a/consensus/build-consensus-signatures.ipynb +++ b/consensus/build-consensus-signatures.ipynb @@ -9,14 +9,18 @@ "Here, we generate consensus signatures for the LINCS Drug Repurposing Hub Cell Painting subset.\n", "See the project [README.md](README.md) for more details.\n", "\n", - "This notebook generates four files; one per plate normalization and consensus normalization strategy.\n", - "\n", - "| Plate Normalization | Consensus Normalization | Consensus Suffix |\n", - "| :------------------: | :------------------------: | -----------------: |\n", - "| DMSO | 
Median | `_consensus_median_dmso.csv.gz` |\n", - "| DMSO | MODZ | `_consensus_modz_dmso.csv.gz` |\n", - "| Whole Plate | Median | `_consensus_median.csv.gz` |\n", - "| Whole Plate | MODZ | `_consensus_modz.csv.gz` |" + "This notebook generates eight files; one per plate normalization and consensus normalization strategy, with and without feature selection.\n", + "\n", + "|Feature selection | Plate Normalization | Consensus Normalization | Consensus Suffix |\n", + "|:---------------- | :------------------: | :------------------------: | -----------------: |\n", + "| No | DMSO | Median | `_consensus_median_dmso.csv.gz` |\n", + "| No | DMSO | MODZ | `_consensus_modz_dmso.csv.gz` |\n", + "| No | Whole Plate | Median | `_consensus_median.csv.gz` |\n", + "| No | Whole Plate | MODZ | `_consensus_modz.csv.gz` |\n", + "| Yes | DMSO | Median | `_consensus_median_feature_select_dmso.csv.gz` |\n", + "| Yes | DMSO | MODZ | `_consensus_modz_feature_select_dmso.csv.gz` |\n", + "| Yes | Whole Plate | Median | `_consensus_median_feature_select.csv.gz` |\n", + "| Yes | Whole Plate | MODZ | `_consensus_modz_feature_select.csv.gz` |" ] }, { @@ -67,8 +71,8 @@ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 2;\n", - " var nbb_unformatted_code = \"import os\\nimport pathlib\\nimport numpy as np\\nimport pandas as pd\\n\\nfrom pycytominer.aggregate import aggregate\\nfrom pycytominer.consensus import modz_base\\n\\nfrom pycytominer.cyto_utils import infer_cp_features\";\n", - " var nbb_formatted_code = \"import os\\nimport pathlib\\nimport numpy as np\\nimport pandas as pd\\n\\nfrom pycytominer.aggregate import aggregate\\nfrom pycytominer.consensus import modz_base\\n\\nfrom pycytominer.cyto_utils import infer_cp_features\";\n", + " var nbb_unformatted_code = \"import os\\nimport pathlib\\nimport numpy as np\\nimport pandas as pd\\n\\nfrom pycytominer.aggregate import aggregate\\nfrom pycytominer.consensus import modz_base\\nfrom pycytominer.feature_select import feature_select\\n\\nfrom pycytominer.cyto_utils import infer_cp_features\";\n", + " var nbb_formatted_code = \"import os\\nimport pathlib\\nimport numpy as np\\nimport pandas as pd\\n\\nfrom pycytominer.aggregate import aggregate\\nfrom pycytominer.consensus import modz_base\\nfrom pycytominer.feature_select import feature_select\\n\\nfrom pycytominer.cyto_utils import infer_cp_features\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", @@ -97,6 +101,7 @@ "\n", "from pycytominer.aggregate import aggregate\n", "from pycytominer.consensus import modz_base\n", + "from pycytominer.feature_select import feature_select\n", "\n", "from pycytominer.cyto_utils import infer_cp_features" ] @@ -355,9 +360,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Create Consensus Profiles\n", + "## Create Consensus Profiles, with and without feature selection\n", "\n", - "We generate two different consensus profiles for each of the normalization strategies. This generates four different files." + "We generate two different consensus profiles for each of the normalization strategies, with and without feature selection. This generates eight different files." 
] }, { @@ -371,8 +376,8 @@ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 7;\n", - " var nbb_unformatted_code = \"# Aggregating columns\\nreplicate_cols = [\\n \\\"Metadata_Plate_Map_Name\\\",\\n \\\"Metadata_broad_sample\\\",\\n \\\"Metadata_pert_well\\\",\\n \\\"Metadata_mmoles_per_liter\\\",\\n \\\"Metadata_dose_recode\\\",\\n]\";\n", - " var nbb_formatted_code = \"# Aggregating columns\\nreplicate_cols = [\\n \\\"Metadata_Plate_Map_Name\\\",\\n \\\"Metadata_broad_sample\\\",\\n \\\"Metadata_pert_well\\\",\\n \\\"Metadata_mmoles_per_liter\\\",\\n \\\"Metadata_dose_recode\\\",\\n]\";\n", + " var nbb_unformatted_code = \"# Aggregating columns\\nreplicate_cols = [\\n \\\"Metadata_Plate_Map_Name\\\",\\n \\\"Metadata_broad_sample\\\",\\n \\\"Metadata_pert_well\\\",\\n \\\"Metadata_mmoles_per_liter\\\",\\n \\\"Metadata_dose_recode\\\",\\n \\\"Metadata_moa\\\",\\n \\\"Metadata_target\\\",\\n]\";\n", + " var nbb_formatted_code = \"# Aggregating columns\\nreplicate_cols = [\\n \\\"Metadata_Plate_Map_Name\\\",\\n \\\"Metadata_broad_sample\\\",\\n \\\"Metadata_pert_well\\\",\\n \\\"Metadata_mmoles_per_liter\\\",\\n \\\"Metadata_dose_recode\\\",\\n \\\"Metadata_moa\\\",\\n \\\"Metadata_target\\\",\\n]\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", @@ -401,6 +406,8 @@ " \"Metadata_pert_well\",\n", " \"Metadata_mmoles_per_liter\",\n", " \"Metadata_dose_recode\",\n", + " \"Metadata_moa\",\n", + " \"Metadata_target\",\n", "]" ] }, @@ -414,13 +421,21 @@ "output_type": "stream", "text": [ "Now calculating median consensus for whole_plate normalization\n", - "There are 10752 median consensus profiles for whole_plate normalization\n", + "There are 8340 median consensus profiles for whole_plate normalization\n", + "Now feature selecting on median consensus for whole_plate normalization\n", + "There are 510 features in median consensus profiles for whole_plate normalization\n", "Now calculating modz consensus for whole_plate normalization\n", - "There are 10752 modz consensus profiles for whole_plate normalization\n", + "There are 8340 modz consensus profiles for whole_plate normalization\n", + "Now feature selecting on modz consensus for whole_plate normalization\n", + "There are 447 features in modz consensus profiles for whole_plate normalization\n", "Now calculating median consensus for dmso normalization\n", - "There are 10752 median consensus profiles for dmso normalization\n", + "There are 8340 median consensus profiles for dmso normalization\n", + "Now feature selecting on median consensus for dmso normalization\n", + "There are 592 features in median consensus profiles for dmso normalization\n", "Now calculating modz consensus for dmso normalization\n", - "There are 10752 modz consensus profiles for dmso normalization\n" + "There are 8340 modz consensus profiles for dmso normalization\n", + "Now feature selecting on modz consensus for dmso normalization\n", + "There are 527 features in modz consensus profiles for dmso normalization\n" ] }, { @@ -429,8 +444,8 @@ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 8;\n", - " var nbb_unformatted_code = \"all_consensus_dfs = {}\\nfor norm_strat in file_bases:\\n all_profiles_df = all_profiles_dfs[norm_strat]\\n cp_norm_features = cp_features[norm_strat]\\n\\n consensus_profiles = {}\\n for operation in operations:\\n print(f\\\"Now calculating {operation} consensus for {norm_strat} normalization\\\")\\n\\n 
consensus_profiles[operation] = consensus_apply(\\n all_profiles_df,\\n operation=operation,\\n cp_features=cp_norm_features,\\n replicate_cols=replicate_cols,\\n )\\n\\n # How many DMSO profiles per well?\\n print(\\n f\\\"There are {consensus_profiles[operation].shape[0]} {operation} consensus profiles for {norm_strat} normalization\\\"\\n )\\n\\n all_consensus_dfs[norm_strat] = consensus_profiles\";\n", - " var nbb_formatted_code = \"all_consensus_dfs = {}\\nfor norm_strat in file_bases:\\n all_profiles_df = all_profiles_dfs[norm_strat]\\n cp_norm_features = cp_features[norm_strat]\\n\\n consensus_profiles = {}\\n for operation in operations:\\n print(f\\\"Now calculating {operation} consensus for {norm_strat} normalization\\\")\\n\\n consensus_profiles[operation] = consensus_apply(\\n all_profiles_df,\\n operation=operation,\\n cp_features=cp_norm_features,\\n replicate_cols=replicate_cols,\\n )\\n\\n # How many DMSO profiles per well?\\n print(\\n f\\\"There are {consensus_profiles[operation].shape[0]} {operation} consensus profiles for {norm_strat} normalization\\\"\\n )\\n\\n all_consensus_dfs[norm_strat] = consensus_profiles\";\n", + " var nbb_unformatted_code = \"# feature selection operations\\nfeature_select_ops = [\\n \\\"drop_na_columns\\\",\\n \\\"variance_threshold\\\",\\n \\\"correlation_threshold\\\",\\n \\\"blacklist\\\",\\n]\\n\\nall_consensus_dfs = {}\\nfor norm_strat in file_bases:\\n all_profiles_df = all_profiles_dfs[norm_strat]\\n cp_norm_features = cp_features[norm_strat]\\n\\n consensus_profiles = {}\\n for operation in operations:\\n print(f\\\"Now calculating {operation} consensus for {norm_strat} normalization\\\")\\n\\n consensus_profiles[operation] = {}\\n\\n consensus_profiles[operation][\\\"no_feat_select\\\"] = consensus_apply(\\n all_profiles_df,\\n operation=operation,\\n cp_features=cp_norm_features,\\n replicate_cols=replicate_cols,\\n )\\n\\n # How many DMSO profiles per well?\\n print(\\n f\\\"There are {consensus_profiles[operation]['no_feat_select'].shape[0]} {operation} consensus profiles for {norm_strat} normalization\\\"\\n )\\n\\n # feature selection\\n print(\\n f\\\"Now feature selecting on {operation} consensus for {norm_strat} normalization\\\"\\n )\\n\\n consensus_profiles[operation][\\\"feat_select\\\"] = feature_select(\\n profiles=consensus_profiles[operation][\\\"no_feat_select\\\"],\\n features=\\\"infer\\\",\\n operation=feature_select_ops,\\n )\\n\\n # How many features in feature selected profile?\\n print(\\n f\\\"There are {consensus_profiles[operation]['feat_select'].shape[1]} features in {operation} consensus profiles for {norm_strat} normalization\\\"\\n )\\n\\n all_consensus_dfs[norm_strat] = consensus_profiles\";\n", + " var nbb_formatted_code = \"# feature selection operations\\nfeature_select_ops = [\\n \\\"drop_na_columns\\\",\\n \\\"variance_threshold\\\",\\n \\\"correlation_threshold\\\",\\n \\\"blacklist\\\",\\n]\\n\\nall_consensus_dfs = {}\\nfor norm_strat in file_bases:\\n all_profiles_df = all_profiles_dfs[norm_strat]\\n cp_norm_features = cp_features[norm_strat]\\n\\n consensus_profiles = {}\\n for operation in operations:\\n print(f\\\"Now calculating {operation} consensus for {norm_strat} normalization\\\")\\n\\n consensus_profiles[operation] = {}\\n\\n consensus_profiles[operation][\\\"no_feat_select\\\"] = consensus_apply(\\n all_profiles_df,\\n operation=operation,\\n cp_features=cp_norm_features,\\n replicate_cols=replicate_cols,\\n )\\n\\n # How many DMSO profiles per well?\\n print(\\n f\\\"There are 
{consensus_profiles[operation]['no_feat_select'].shape[0]} {operation} consensus profiles for {norm_strat} normalization\\\"\\n )\\n\\n # feature selection\\n print(\\n f\\\"Now feature selecting on {operation} consensus for {norm_strat} normalization\\\"\\n )\\n\\n consensus_profiles[operation][\\\"feat_select\\\"] = feature_select(\\n profiles=consensus_profiles[operation][\\\"no_feat_select\\\"],\\n features=\\\"infer\\\",\\n operation=feature_select_ops,\\n )\\n\\n # How many features in feature selected profile?\\n print(\\n f\\\"There are {consensus_profiles[operation]['feat_select'].shape[1]} features in {operation} consensus profiles for {norm_strat} normalization\\\"\\n )\\n\\n all_consensus_dfs[norm_strat] = consensus_profiles\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", @@ -452,6 +467,14 @@ } ], "source": [ + "# feature selection operations\n", + "feature_select_ops = [\n", + " \"drop_na_columns\",\n", + " \"variance_threshold\",\n", + " \"correlation_threshold\",\n", + " \"blacklist\",\n", + "]\n", + "\n", "all_consensus_dfs = {}\n", "for norm_strat in file_bases:\n", " all_profiles_df = all_profiles_dfs[norm_strat]\n", @@ -461,7 +484,9 @@ " for operation in operations:\n", " print(f\"Now calculating {operation} consensus for {norm_strat} normalization\")\n", "\n", - " consensus_profiles[operation] = consensus_apply(\n", + " consensus_profiles[operation] = {}\n", + "\n", + " consensus_profiles[operation][\"no_feat_select\"] = consensus_apply(\n", " all_profiles_df,\n", " operation=operation,\n", " cp_features=cp_norm_features,\n", @@ -470,7 +495,23 @@ "\n", " # How many DMSO profiles per well?\n", " print(\n", - " f\"There are {consensus_profiles[operation].shape[0]} {operation} consensus profiles for {norm_strat} normalization\"\n", + " f\"There are {consensus_profiles[operation]['no_feat_select'].shape[0]} {operation} consensus profiles for {norm_strat} normalization\"\n", + " )\n", + "\n", + " # feature selection\n", + " print(\n", + " f\"Now feature selecting on {operation} consensus for {norm_strat} normalization\"\n", + " )\n", + "\n", + " consensus_profiles[operation][\"feat_select\"] = feature_select(\n", + " profiles=consensus_profiles[operation][\"no_feat_select\"],\n", + " features=\"infer\",\n", + " operation=feature_select_ops,\n", + " )\n", + "\n", + " # How many features in feature selected profile?\n", + " print(\n", + " f\"There are {consensus_profiles[operation]['feat_select'].shape[1]} features in {operation} consensus profiles for {norm_strat} normalization\"\n", " )\n", "\n", " all_consensus_dfs[norm_strat] = consensus_profiles" @@ -480,7 +521,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Merge and Output Consensus Signatures" + "## Merge and Output Consensus Signatures, with and without feature selection" ] }, { @@ -492,18 +533,30 @@ "name": "stdout", "output_type": "stream", "text": [ - "Now Writing: Consensus Operation: median; Norm Strategy: whole_plate\n", + "Now Writing: Feature selection: No; Consensus Operation: median; Norm Strategy: whole_plate\n", "File: 2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_median.csv.gz\n", - "(10752, 1788)\n", - "Now Writing: Consensus Operation: modz; Norm Strategy: whole_plate\n", + "(8340, 1790)\n", + "Now Writing: Feature selection: Yes; Consensus Operation: median; Norm Strategy: whole_plate\n", + "File: 
2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_median_feature_select.csv.gz\n", + "(8340, 510)\n", + "Now Writing: Feature selection: No; Consensus Operation: modz; Norm Strategy: whole_plate\n", "File: 2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz.csv.gz\n", - "(10752, 1788)\n", - "Now Writing: Consensus Operation: median; Norm Strategy: dmso\n", + "(8340, 1790)\n", + "Now Writing: Feature selection: Yes; Consensus Operation: modz; Norm Strategy: whole_plate\n", + "File: 2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz_feature_select.csv.gz\n", + "(8340, 447)\n", + "Now Writing: Feature selection: No; Consensus Operation: median; Norm Strategy: dmso\n", "File: 2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_median_dmso.csv.gz\n", - "(10752, 1788)\n", - "Now Writing: Consensus Operation: modz; Norm Strategy: dmso\n", + "(8340, 1790)\n", + "Now Writing: Feature selection: Yes; Consensus Operation: median; Norm Strategy: dmso\n", + "File: 2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_median_feature_select_dmso.csv.gz\n", + "(8340, 592)\n", + "Now Writing: Feature selection: No; Consensus Operation: modz; Norm Strategy: dmso\n", "File: 2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz_dmso.csv.gz\n", - "(10752, 1788)\n" + "(8340, 1790)\n", + "Now Writing: Feature selection: Yes; Consensus Operation: modz; Norm Strategy: dmso\n", + "File: 2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz_feature_select_dmso.csv.gz\n", + "(8340, 527)\n" ] }, { @@ -512,8 +565,8 @@ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 9;\n", - " var nbb_unformatted_code = \"for norm_strat in file_bases:\\n file_suffix = file_bases[norm_strat][\\\"output_file_suffix\\\"]\\n for operation in operations:\\n consensus_file = f\\\"{batch}_consensus_{operation}{file_suffix}\\\"\\n consensus_file = pathlib.Path(batch, consensus_file)\\n\\n consensus_df = all_consensus_dfs[norm_strat][operation]\\n\\n print(\\n f\\\"Now Writing: Consensus Operation: {operation}; Norm Strategy: {norm_strat}\\\\nFile: {consensus_file}\\\"\\n )\\n print(consensus_df.shape)\\n\\n consensus_df.to_csv(\\n consensus_file, sep=\\\",\\\", compression=\\\"gzip\\\", float_format=\\\"%5g\\\", index=False\\n )\";\n", - " var nbb_formatted_code = \"for norm_strat in file_bases:\\n file_suffix = file_bases[norm_strat][\\\"output_file_suffix\\\"]\\n for operation in operations:\\n consensus_file = f\\\"{batch}_consensus_{operation}{file_suffix}\\\"\\n consensus_file = pathlib.Path(batch, consensus_file)\\n\\n consensus_df = all_consensus_dfs[norm_strat][operation]\\n\\n print(\\n f\\\"Now Writing: Consensus Operation: {operation}; Norm Strategy: {norm_strat}\\\\nFile: {consensus_file}\\\"\\n )\\n print(consensus_df.shape)\\n\\n consensus_df.to_csv(\\n consensus_file, sep=\\\",\\\", compression=\\\"gzip\\\", float_format=\\\"%5g\\\", index=False\\n )\";\n", + " var nbb_unformatted_code = \"float_format = \\\"%5g\\\"\\ncompression = \\\"gzip\\\"\\n\\nfor norm_strat in file_bases:\\n file_suffix = file_bases[norm_strat][\\\"output_file_suffix\\\"]\\n for operation in operations:\\n\\n # No feature selection\\n consensus_file = f\\\"{batch}_consensus_{operation}{file_suffix}\\\"\\n consensus_file = pathlib.Path(batch, consensus_file)\\n\\n consensus_df = all_consensus_dfs[norm_strat][operation][\\\"no_feat_select\\\"]\\n\\n print(\\n f\\\"Now Writing: Feature selection: No; Consensus Operation: {operation}; Norm 
Strategy: {norm_strat}\\\\nFile: {consensus_file}\\\"\\n )\\n print(consensus_df.shape)\\n\\n consensus_df.to_csv(\\n consensus_file,\\n sep=\\\",\\\",\\n compression=compression,\\n float_format=float_format,\\n index=False,\\n )\\n\\n # With feature selection\\n consensus_feat_df = all_consensus_dfs[norm_strat][operation][\\\"feat_select\\\"]\\n\\n consensus_feat_file = (\\n f\\\"{batch}_consensus_{operation}_feature_select{file_suffix}\\\"\\n )\\n consensus_feat_file = pathlib.Path(batch, consensus_feat_file)\\n\\n print(\\n f\\\"Now Writing: Feature selection: Yes; Consensus Operation: {operation}; Norm Strategy: {norm_strat}\\\\nFile: {consensus_feat_file}\\\"\\n )\\n print(consensus_feat_df.shape)\\n\\n consensus_feat_df.to_csv(\\n consensus_feat_file,\\n sep=\\\",\\\",\\n compression=compression,\\n float_format=float_format,\\n index=False,\\n )\";\n", + " var nbb_formatted_code = \"float_format = \\\"%5g\\\"\\ncompression = \\\"gzip\\\"\\n\\nfor norm_strat in file_bases:\\n file_suffix = file_bases[norm_strat][\\\"output_file_suffix\\\"]\\n for operation in operations:\\n\\n # No feature selection\\n consensus_file = f\\\"{batch}_consensus_{operation}{file_suffix}\\\"\\n consensus_file = pathlib.Path(batch, consensus_file)\\n\\n consensus_df = all_consensus_dfs[norm_strat][operation][\\\"no_feat_select\\\"]\\n\\n print(\\n f\\\"Now Writing: Feature selection: No; Consensus Operation: {operation}; Norm Strategy: {norm_strat}\\\\nFile: {consensus_file}\\\"\\n )\\n print(consensus_df.shape)\\n\\n consensus_df.to_csv(\\n consensus_file,\\n sep=\\\",\\\",\\n compression=compression,\\n float_format=float_format,\\n index=False,\\n )\\n\\n # With feature selection\\n consensus_feat_df = all_consensus_dfs[norm_strat][operation][\\\"feat_select\\\"]\\n\\n consensus_feat_file = (\\n f\\\"{batch}_consensus_{operation}_feature_select{file_suffix}\\\"\\n )\\n consensus_feat_file = pathlib.Path(batch, consensus_feat_file)\\n\\n print(\\n f\\\"Now Writing: Feature selection: Yes; Consensus Operation: {operation}; Norm Strategy: {norm_strat}\\\\nFile: {consensus_feat_file}\\\"\\n )\\n print(consensus_feat_df.shape)\\n\\n consensus_feat_df.to_csv(\\n consensus_feat_file,\\n sep=\\\",\\\",\\n compression=compression,\\n float_format=float_format,\\n index=False,\\n )\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", @@ -535,21 +588,51 @@ } ], "source": [ + "float_format = \"%5g\"\n", + "compression = \"gzip\"\n", + "\n", "for norm_strat in file_bases:\n", " file_suffix = file_bases[norm_strat][\"output_file_suffix\"]\n", " for operation in operations:\n", + "\n", + " # No feature selection\n", " consensus_file = f\"{batch}_consensus_{operation}{file_suffix}\"\n", " consensus_file = pathlib.Path(batch, consensus_file)\n", "\n", - " consensus_df = all_consensus_dfs[norm_strat][operation]\n", + " consensus_df = all_consensus_dfs[norm_strat][operation][\"no_feat_select\"]\n", "\n", " print(\n", - " f\"Now Writing: Consensus Operation: {operation}; Norm Strategy: {norm_strat}\\nFile: {consensus_file}\"\n", + " f\"Now Writing: Feature selection: No; Consensus Operation: {operation}; Norm Strategy: {norm_strat}\\nFile: {consensus_file}\"\n", " )\n", " print(consensus_df.shape)\n", "\n", " consensus_df.to_csv(\n", - " consensus_file, sep=\",\", compression=\"gzip\", float_format=\"%5g\", index=False\n", + " consensus_file,\n", + " sep=\",\",\n", + " compression=compression,\n", + " 
float_format=float_format,\n", + " index=False,\n", + " )\n", + "\n", + " # With feature selection\n", + " consensus_feat_df = all_consensus_dfs[norm_strat][operation][\"feat_select\"]\n", + "\n", + " consensus_feat_file = (\n", + " f\"{batch}_consensus_{operation}_feature_select{file_suffix}\"\n", + " )\n", + " consensus_feat_file = pathlib.Path(batch, consensus_feat_file)\n", + "\n", + " print(\n", + " f\"Now Writing: Feature selection: Yes; Consensus Operation: {operation}; Norm Strategy: {norm_strat}\\nFile: {consensus_feat_file}\"\n", + " )\n", + " print(consensus_feat_df.shape)\n", + "\n", + " consensus_feat_df.to_csv(\n", + " consensus_feat_file,\n", + " sep=\",\",\n", + " compression=compression,\n", + " float_format=float_format,\n", + " index=False,\n", " )" ] }, @@ -564,7 +647,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -573,7 +656,7 @@ "text": [ "Now Writing: Consensus Operation: modz; Norm Strategy: whole_plate\n", "File: 2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz.gct\n", - "(10752, 1788)\n" + "(8340, 1790)\n" ] }, { @@ -581,9 +664,9 @@ "application/javascript": [ "\n", " setTimeout(function() {\n", - " var nbb_cell_id = 10;\n", - " var nbb_unformatted_code = \"import pycytominer.write_gct\\n\\noperation = \\\"modz\\\"\\nnorm_strat = \\\"whole_plate\\\"\\nfile_suffix = \\\".gct\\\"\\nconsensus_file = f\\\"{batch}_consensus_{operation}{file_suffix}\\\"\\nconsensus_file = pathlib.Path(batch, consensus_file)\\n\\nconsensus_df = all_consensus_dfs[norm_strat][operation]\\n\\nprint(\\n f\\\"Now Writing: Consensus Operation: {operation}; Norm Strategy: {norm_strat}\\\\nFile: {consensus_file}\\\"\\n)\\nprint(consensus_df.shape)\\n\\npycytominer.write_gct(consensus_df, consensus_file)\";\n", - " var nbb_formatted_code = \"import pycytominer.write_gct\\n\\noperation = \\\"modz\\\"\\nnorm_strat = \\\"whole_plate\\\"\\nfile_suffix = \\\".gct\\\"\\nconsensus_file = f\\\"{batch}_consensus_{operation}{file_suffix}\\\"\\nconsensus_file = pathlib.Path(batch, consensus_file)\\n\\nconsensus_df = all_consensus_dfs[norm_strat][operation]\\n\\nprint(\\n f\\\"Now Writing: Consensus Operation: {operation}; Norm Strategy: {norm_strat}\\\\nFile: {consensus_file}\\\"\\n)\\nprint(consensus_df.shape)\\n\\npycytominer.write_gct(consensus_df, consensus_file)\";\n", + " var nbb_cell_id = 11;\n", + " var nbb_unformatted_code = \"import pycytominer.write_gct\\n\\noperation = \\\"modz\\\"\\nnorm_strat = \\\"whole_plate\\\"\\nfile_suffix = \\\".gct\\\"\\nconsensus_file = f\\\"{batch}_consensus_{operation}{file_suffix}\\\"\\nconsensus_file = pathlib.Path(batch, consensus_file)\\n\\nconsensus_df = all_consensus_dfs[norm_strat][operation][\\\"no_feat_select\\\"]\\n\\nprint(\\n f\\\"Now Writing: Consensus Operation: {operation}; Norm Strategy: {norm_strat}\\\\nFile: {consensus_file}\\\"\\n)\\nprint(consensus_df.shape)\\n\\npycytominer.write_gct(consensus_df, consensus_file)\";\n", + " var nbb_formatted_code = \"import pycytominer.write_gct\\n\\noperation = \\\"modz\\\"\\nnorm_strat = \\\"whole_plate\\\"\\nfile_suffix = \\\".gct\\\"\\nconsensus_file = f\\\"{batch}_consensus_{operation}{file_suffix}\\\"\\nconsensus_file = pathlib.Path(batch, consensus_file)\\n\\nconsensus_df = all_consensus_dfs[norm_strat][operation][\\\"no_feat_select\\\"]\\n\\nprint(\\n f\\\"Now Writing: Consensus Operation: {operation}; Norm Strategy: {norm_strat}\\\\nFile: 
{consensus_file}\\\"\\n)\\nprint(consensus_df.shape)\\n\\npycytominer.write_gct(consensus_df, consensus_file)\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", @@ -613,7 +696,7 @@ "consensus_file = f\"{batch}_consensus_{operation}{file_suffix}\"\n", "consensus_file = pathlib.Path(batch, consensus_file)\n", "\n", - "consensus_df = all_consensus_dfs[norm_strat][operation]\n", + "consensus_df = all_consensus_dfs[norm_strat][operation][\"no_feat_select\"]\n", "\n", "print(\n", " f\"Now Writing: Consensus Operation: {operation}; Norm Strategy: {norm_strat}\\nFile: {consensus_file}\"\n", @@ -622,6 +705,13 @@ "\n", "pycytominer.write_gct(consensus_df, consensus_file)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -640,7 +730,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.6" + "version": "3.7.10" } }, "nbformat": 4, diff --git a/consensus/scripts/nbconverted/build-consensus-signatures.py b/consensus/scripts/nbconverted/build-consensus-signatures.py index 4a323f5..7e80785 100644 --- a/consensus/scripts/nbconverted/build-consensus-signatures.py +++ b/consensus/scripts/nbconverted/build-consensus-signatures.py @@ -6,14 +6,18 @@ # Here, we generate consensus signatures for the LINCS Drug Repurposing Hub Cell Painting subset. # See the project [README.md](README.md) for more details. # -# This notebook generates four files; one per plate normalization and consensus normalization strategy. +# This notebook generates eight files; one per plate normalization and consensus normalization strategy, with and without feature selection. # -# | Plate Normalization | Consensus Normalization | Consensus Suffix | -# | :------------------: | :------------------------: | -----------------: | -# | DMSO | Median | `_consensus_median_dmso.csv.gz` | -# | DMSO | MODZ | `_consensus_modz_dmso.csv.gz` | -# | Whole Plate | Median | `_consensus_median.csv.gz` | -# | Whole Plate | MODZ | `_consensus_modz.csv.gz` | +# |Feature selection | Plate Normalization | Consensus Normalization | Consensus Suffix | +# |:---------------- | :------------------: | :------------------------: | -----------------: | +# | No | DMSO | Median | `_consensus_median_dmso.csv.gz` | +# | No | DMSO | MODZ | `_consensus_modz_dmso.csv.gz` | +# | No | Whole Plate | Median | `_consensus_median.csv.gz` | +# | No | Whole Plate | MODZ | `_consensus_modz.csv.gz` | +# | Yes | DMSO | Median | `_consensus_median_feature_select_dmso.csv.gz` | +# | Yes | DMSO | MODZ | `_consensus_modz_feature_select_dmso.csv.gz` | +# | Yes | Whole Plate | Median | `_consensus_median_feature_select.csv.gz` | +# | Yes | Whole Plate | MODZ | `_consensus_modz_feature_select.csv.gz` | # In[1]: @@ -31,6 +35,7 @@ from pycytominer.aggregate import aggregate from pycytominer.consensus import modz_base +from pycytominer.feature_select import feature_select from pycytominer.cyto_utils import infer_cp_features @@ -141,9 +146,9 @@ def consensus_apply(df, operation, cp_features, replicate_cols): del all_profiles_df -# ## Create Consensus Profiles +# ## Create Consensus Profiles, with and without feature selection # -# We generate two different consensus profiles for each of the normalization strategies. This generates four different files. 
+# We generate two different consensus profiles for each of the normalization strategies, with and without feature selection. This generates eight different files. # In[7]: @@ -155,12 +160,22 @@ def consensus_apply(df, operation, cp_features, replicate_cols): "Metadata_pert_well", "Metadata_mmoles_per_liter", "Metadata_dose_recode", + "Metadata_moa", + "Metadata_target", ] # In[8]: +# feature selection operations +feature_select_ops = [ + "drop_na_columns", + "variance_threshold", + "correlation_threshold", + "blacklist", +] + all_consensus_dfs = {} for norm_strat in file_bases: all_profiles_df = all_profiles_dfs[norm_strat] @@ -170,7 +185,9 @@ def consensus_apply(df, operation, cp_features, replicate_cols): for operation in operations: print(f"Now calculating {operation} consensus for {norm_strat} normalization") - consensus_profiles[operation] = consensus_apply( + consensus_profiles[operation] = {} + + consensus_profiles[operation]["no_feat_select"] = consensus_apply( all_profiles_df, operation=operation, cp_features=cp_norm_features, @@ -179,32 +196,78 @@ def consensus_apply(df, operation, cp_features, replicate_cols): # How many DMSO profiles per well? print( - f"There are {consensus_profiles[operation].shape[0]} {operation} consensus profiles for {norm_strat} normalization" + f"There are {consensus_profiles[operation]['no_feat_select'].shape[0]} {operation} consensus profiles for {norm_strat} normalization" + ) + + # feature selection + print( + f"Now feature selecting on {operation} consensus for {norm_strat} normalization" + ) + + consensus_profiles[operation]["feat_select"] = feature_select( + profiles=consensus_profiles[operation]["no_feat_select"], + features="infer", + operation=feature_select_ops, + ) + + # How many features in feature selected profile? 
+ print( + f"There are {consensus_profiles[operation]['feat_select'].shape[1]} features in {operation} consensus profiles for {norm_strat} normalization" ) all_consensus_dfs[norm_strat] = consensus_profiles -# ## Merge and Output Consensus Signatures +# ## Merge and Output Consensus Signatures, with and without feature selection # In[9]: +float_format = "%5g" +compression = "gzip" + for norm_strat in file_bases: file_suffix = file_bases[norm_strat]["output_file_suffix"] for operation in operations: + + # No feature selection consensus_file = f"{batch}_consensus_{operation}{file_suffix}" consensus_file = pathlib.Path(batch, consensus_file) - consensus_df = all_consensus_dfs[norm_strat][operation] + consensus_df = all_consensus_dfs[norm_strat][operation]["no_feat_select"] print( - f"Now Writing: Consensus Operation: {operation}; Norm Strategy: {norm_strat}\nFile: {consensus_file}" + f"Now Writing: Feature selection: No; Consensus Operation: {operation}; Norm Strategy: {norm_strat}\nFile: {consensus_file}" ) print(consensus_df.shape) consensus_df.to_csv( - consensus_file, sep=",", compression="gzip", float_format="%5g", index=False + consensus_file, + sep=",", + compression=compression, + float_format=float_format, + index=False, + ) + + # With feature selection + consensus_feat_df = all_consensus_dfs[norm_strat][operation]["feat_select"] + + consensus_feat_file = ( + f"{batch}_consensus_{operation}_feature_select{file_suffix}" + ) + consensus_feat_file = pathlib.Path(batch, consensus_feat_file) + + print( + f"Now Writing: Feature selection: Yes; Consensus Operation: {operation}; Norm Strategy: {norm_strat}\nFile: {consensus_feat_file}" + ) + print(consensus_feat_df.shape) + + consensus_feat_df.to_csv( + consensus_feat_file, + sep=",", + compression=compression, + float_format=float_format, + index=False, ) @@ -212,7 +275,7 @@ def consensus_apply(df, operation, cp_features, replicate_cols): # # Whole-plate-normalized + MODZ aggregated consensus profiles will be made available on clue.io/morphology as a GCT file. 
-# In[10]: +# In[11]: import pycytominer.write_gct @@ -223,7 +286,7 @@ def consensus_apply(df, operation, cp_features, replicate_cols): consensus_file = f"{batch}_consensus_{operation}{file_suffix}" consensus_file = pathlib.Path(batch, consensus_file) -consensus_df = all_consensus_dfs[norm_strat][operation] +consensus_df = all_consensus_dfs[norm_strat][operation]["no_feat_select"] print( f"Now Writing: Consensus Operation: {operation}; Norm Strategy: {norm_strat}\nFile: {consensus_file}" @@ -232,3 +295,9 @@ def consensus_apply(df, operation, cp_features, replicate_cols): pycytominer.write_gct(consensus_df, consensus_file) + +# In[ ]: + + + + From 907d031dc5200992b8d8e25835b0d3710612c72b Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Tue, 9 Mar 2021 22:17:26 -0500 Subject: [PATCH 08/10] Add data files --- .../2016_04_01_a549_48hr_batch1_consensus_median.csv.gz | 4 ++-- .../2016_04_01_a549_48hr_batch1_consensus_median_dmso.csv.gz | 4 ++-- ...01_a549_48hr_batch1_consensus_median_feature_select.csv.gz | 3 +++ ...49_48hr_batch1_consensus_median_feature_select_dmso.csv.gz | 3 +++ .../2016_04_01_a549_48hr_batch1_consensus_modz.csv.gz | 4 ++-- .../2016_04_01_a549_48hr_batch1_consensus_modz.gct | 4 ++-- .../2016_04_01_a549_48hr_batch1_consensus_modz_dmso.csv.gz | 4 ++-- ...4_01_a549_48hr_batch1_consensus_modz_feature_select.csv.gz | 3 +++ ...a549_48hr_batch1_consensus_modz_feature_select_dmso.csv.gz | 3 +++ 9 files changed, 22 insertions(+), 10 deletions(-) create mode 100644 consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_median_feature_select.csv.gz create mode 100644 consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_median_feature_select_dmso.csv.gz create mode 100644 consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz_feature_select.csv.gz create mode 100644 consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz_feature_select_dmso.csv.gz diff --git a/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_median.csv.gz b/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_median.csv.gz index 7e80bff..54baa70 100644 --- a/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_median.csv.gz +++ b/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_median.csv.gz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:63c388269c53da12860f1c471420c825d04f5182b7b683b0c84616198fe0a7a7 -size 58815567 +oid sha256:d7c6934e25e22b4ac1e4f8cf9eb345c467f076152c172d3837e3e10a23e85094 +size 45776166 diff --git a/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_median_dmso.csv.gz b/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_median_dmso.csv.gz index 13f276f..72254cc 100644 --- a/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_median_dmso.csv.gz +++ b/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_median_dmso.csv.gz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:daf5a07496c24462623954c20af72e131e6743bef703a4560e73f6e363231582 -size 58706522 +oid sha256:6319a32be97eefd5e5f77fead0acd6cf3077899d315e3252653f5ad0d6d3a5ad +size 45678293 diff --git a/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_median_feature_select.csv.gz b/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_median_feature_select.csv.gz new file 
mode 100644 index 0000000..ab37e6c --- /dev/null +++ b/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_median_feature_select.csv.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51b16ab5b32fa0ef5f532b3b97b8925f4d4239c02d69f0b9dcc15f5066b3cf97 +size 13844674 diff --git a/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_median_feature_select_dmso.csv.gz b/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_median_feature_select_dmso.csv.gz new file mode 100644 index 0000000..4517dc1 --- /dev/null +++ b/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_median_feature_select_dmso.csv.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ab294b4a080e8753a19a240ee551cf64c7490d43c78d1e8aa8263b71062ec8f +size 15985121 diff --git a/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz.csv.gz b/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz.csv.gz index 56c3197..4174c48 100644 --- a/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz.csv.gz +++ b/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz.csv.gz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8b65f4825f9eefac30be2ccdf88c8c3dcbd3c4128144190267f300d9aa77906d -size 69381403 +oid sha256:31639af2fb4e5d0ff5e4b72a973c3271f03a88ff39e6c9129f8ded72a264bf7c +size 54008400 diff --git a/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz.gct b/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz.gct index 574a972..9b6c322 100644 --- a/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz.gct +++ b/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz.gct @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:248315f0da9fc74f28dfaaa17f073d324d8198d638f4c195cbff3d7549e77093 -size 297655274 +oid sha256:eb8012af5eae5a52db4f80232134c945b4575caae8dd53278fe45b2396617f22 +size 230789982 diff --git a/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz_dmso.csv.gz b/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz_dmso.csv.gz index d65b8f5..d021ee0 100644 --- a/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz_dmso.csv.gz +++ b/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz_dmso.csv.gz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:de0c7fbb70c282208e3817393234e6a9007dd8d020ed711503b02b76bb7c275c -size 69290644 +oid sha256:d76344c98f38a8f8808e0c20e22f0f8836f99f70104310c1e89030d3135aefce +size 53942607 diff --git a/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz_feature_select.csv.gz b/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz_feature_select.csv.gz new file mode 100644 index 0000000..2fe66a7 --- /dev/null +++ b/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz_feature_select.csv.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10d2e1208464dab2a226dc7eaf9dd79ae12cc21ec0d2cc80b2ec6df43224af41 +size 14189099 diff --git a/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz_feature_select_dmso.csv.gz 
b/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz_feature_select_dmso.csv.gz new file mode 100644 index 0000000..707687b --- /dev/null +++ b/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz_feature_select_dmso.csv.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55b02716d692851c742b7a353cb0e36efcea40b756321da8ef7ac336f802aac4 +size 16677030 From 4a0de4f71dd41e90db553df32d7c014b8c4b2046 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Sat, 20 Mar 2021 07:12:06 -0400 Subject: [PATCH 09/10] Fix typos, drop GCT --- ..._04_01_a549_48hr_batch1_consensus_modz.gct | 3 - consensus/README.md | 2 +- consensus/build-consensus-signatures.ipynb | 78 ------------------- .../nbconverted/build-consensus-signatures.py | 32 -------- 4 files changed, 1 insertion(+), 114 deletions(-) delete mode 100644 consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz.gct diff --git a/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz.gct b/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz.gct deleted file mode 100644 index 9b6c322..0000000 --- a/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz.gct +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eb8012af5eae5a52db4f80232134c945b4575caae8dd53278fe45b2396617f22 -size 230789982 diff --git a/consensus/README.md b/consensus/README.md index 24ea7d4..b816d5c 100644 --- a/consensus/README.md +++ b/consensus/README.md @@ -48,7 +48,7 @@ The pipeline can be reproduced by executing the following: # Make sure conda environment is activated conda activate lincs -# Reproduce thepipeline for producing bulk signatures +# Reproduce the pipeline for producing bulk signatures ipython scripts/nbconverted/build-consensus-signatures.py ``` diff --git a/consensus/build-consensus-signatures.ipynb b/consensus/build-consensus-signatures.ipynb index 444ceef..c327344 100644 --- a/consensus/build-consensus-signatures.ipynb +++ b/consensus/build-consensus-signatures.ipynb @@ -102,7 +102,6 @@ "from pycytominer.aggregate import aggregate\n", "from pycytominer.consensus import modz_base\n", "from pycytominer.feature_select import feature_select\n", - "\n", "from pycytominer.cyto_utils import infer_cp_features" ] }, @@ -635,83 +634,6 @@ " index=False,\n", " )" ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Save whole plate MODZ consensus signature as GCT\n", - "\n", - "Whole-plate-normalized + MODZ aggregated consensus profiles will be made available on clue.io/morphology as a GCT file." 
- ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Now Writing: Consensus Operation: modz; Norm Strategy: whole_plate\n", - "File: 2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz.gct\n", - "(8340, 1790)\n" - ] - }, - { - "data": { - "application/javascript": [ - "\n", - " setTimeout(function() {\n", - " var nbb_cell_id = 11;\n", - " var nbb_unformatted_code = \"import pycytominer.write_gct\\n\\noperation = \\\"modz\\\"\\nnorm_strat = \\\"whole_plate\\\"\\nfile_suffix = \\\".gct\\\"\\nconsensus_file = f\\\"{batch}_consensus_{operation}{file_suffix}\\\"\\nconsensus_file = pathlib.Path(batch, consensus_file)\\n\\nconsensus_df = all_consensus_dfs[norm_strat][operation][\\\"no_feat_select\\\"]\\n\\nprint(\\n f\\\"Now Writing: Consensus Operation: {operation}; Norm Strategy: {norm_strat}\\\\nFile: {consensus_file}\\\"\\n)\\nprint(consensus_df.shape)\\n\\npycytominer.write_gct(consensus_df, consensus_file)\";\n", - " var nbb_formatted_code = \"import pycytominer.write_gct\\n\\noperation = \\\"modz\\\"\\nnorm_strat = \\\"whole_plate\\\"\\nfile_suffix = \\\".gct\\\"\\nconsensus_file = f\\\"{batch}_consensus_{operation}{file_suffix}\\\"\\nconsensus_file = pathlib.Path(batch, consensus_file)\\n\\nconsensus_df = all_consensus_dfs[norm_strat][operation][\\\"no_feat_select\\\"]\\n\\nprint(\\n f\\\"Now Writing: Consensus Operation: {operation}; Norm Strategy: {norm_strat}\\\\nFile: {consensus_file}\\\"\\n)\\nprint(consensus_df.shape)\\n\\npycytominer.write_gct(consensus_df, consensus_file)\";\n", - " var nbb_cells = Jupyter.notebook.get_cells();\n", - " for (var i = 0; i < nbb_cells.length; ++i) {\n", - " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", - " if (nbb_cells[i].get_text() == nbb_unformatted_code) {\n", - " nbb_cells[i].set_text(nbb_formatted_code);\n", - " }\n", - " break;\n", - " }\n", - " }\n", - " }, 500);\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import pycytominer.write_gct\n", - "\n", - "operation = \"modz\"\n", - "norm_strat = \"whole_plate\"\n", - "file_suffix = \".gct\"\n", - "consensus_file = f\"{batch}_consensus_{operation}{file_suffix}\"\n", - "consensus_file = pathlib.Path(batch, consensus_file)\n", - "\n", - "consensus_df = all_consensus_dfs[norm_strat][operation][\"no_feat_select\"]\n", - "\n", - "print(\n", - " f\"Now Writing: Consensus Operation: {operation}; Norm Strategy: {norm_strat}\\nFile: {consensus_file}\"\n", - ")\n", - "print(consensus_df.shape)\n", - "\n", - "pycytominer.write_gct(consensus_df, consensus_file)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/consensus/scripts/nbconverted/build-consensus-signatures.py b/consensus/scripts/nbconverted/build-consensus-signatures.py index 7e80785..ad5dd55 100644 --- a/consensus/scripts/nbconverted/build-consensus-signatures.py +++ b/consensus/scripts/nbconverted/build-consensus-signatures.py @@ -36,7 +36,6 @@ from pycytominer.aggregate import aggregate from pycytominer.consensus import modz_base from pycytominer.feature_select import feature_select - from pycytominer.cyto_utils import infer_cp_features @@ -270,34 +269,3 @@ def consensus_apply(df, operation, cp_features, replicate_cols): index=False, ) - -# ## Save whole plate MODZ consensus signature as GCT -# -# Whole-plate-normalized + MODZ aggregated 
consensus profiles will be made available on clue.io/morphology as a GCT file. - -# In[11]: - - -import pycytominer.write_gct - -operation = "modz" -norm_strat = "whole_plate" -file_suffix = ".gct" -consensus_file = f"{batch}_consensus_{operation}{file_suffix}" -consensus_file = pathlib.Path(batch, consensus_file) - -consensus_df = all_consensus_dfs[norm_strat][operation]["no_feat_select"] - -print( - f"Now Writing: Consensus Operation: {operation}; Norm Strategy: {norm_strat}\nFile: {consensus_file}" -) -print(consensus_df.shape) - -pycytominer.write_gct(consensus_df, consensus_file) - - -# In[ ]: - - - - From 7836c4f2a25faa8a9c55fbc43ae558dfb6b760dc Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Sun, 21 Mar 2021 10:46:03 +0000 Subject: [PATCH 10/10] Update .gitattributes Co-authored-by: Greg Way --- .gitattributes | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitattributes b/.gitattributes index 4f609e1..c9a6185 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,2 +1 @@ *.gz filter=lfs diff=lfs merge=lfs -text -*.gct filter=lfs diff=lfs merge=lfs -text