diff --git a/README.md b/README.md index f784a33..6977408 100644 --- a/README.md +++ b/README.md @@ -3,17 +3,18 @@ The Library of Integrated Network-Based Cellular Signatures (LINCS) Project aims to create publicly available resources to characterize how cells respond to perturbation. This repository stores Cell Painting readouts and associated data-processing pipelines for the LINCS Cell Painting dataset. +In this project, the [Connectivity Map](https://clue.io/team) team perturbed A549 cells with 1,571 compounds across 6 doses in 5 technical replicates. The data represent **a subset** of the [Broad Drug Repurposing Hub](https://clue.io/repurposing#home) collection of compounds. -In this project, the [Connectivity Map](https://clue.io/team) team perturbed A549 cells with ~1,500 compounds across 6 doses in 5 technical replicates. We refer to this dataset as `LINCS Pilot 1`. We also include data for the second batch of LINCS Cell Painting data, which we refer to as `LKCP`. For a specific list of compounds tested, see [`metadata`](https://github.com/broadinstitute/lincs-cell-painting/tree/master/metadata). You can interactively explore information about the compounds in the [CLUE Repurposing app](https://clue.io/repurposing-app). + The [Morphology Connectivity Hub](https://clue.io/morphology) is the primary source of this dataset. -## Image-Based profiling +## Image-based profiling We apply a unified, image-based profiling pipeline to all 136 384-well plates from `LINCS Pilot 1`, and all 135 384-well plates from `LKCP`. We use [pycytominer](https://github.com/cytomining/pycytominer) as the primary tool for image-based profiling. @@ -27,6 +28,10 @@ For more details about image-based profiling in general, please refer to [Caiced We use [conda](https://docs.conda.io/en/latest/) to manage the computational environment. +To install conda see [instructions](https://docs.conda.io/en/latest/miniconda.html). 
+ +We recommend installing conda by downloading the Miniconda installer (the `.sh` file), executing it, and accepting the defaults. + After installing conda, execute the following to install and navigate to the environment: ```bash diff --git a/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_median.csv.gz b/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_median.csv.gz index 7e80bff..54baa70 100644 --- a/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_median.csv.gz +++ b/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_median.csv.gz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:63c388269c53da12860f1c471420c825d04f5182b7b683b0c84616198fe0a7a7 -size 58815567 +oid sha256:d7c6934e25e22b4ac1e4f8cf9eb345c467f076152c172d3837e3e10a23e85094 +size 45776166 diff --git a/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_median_dmso.csv.gz b/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_median_dmso.csv.gz index 13f276f..72254cc 100644 --- a/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_median_dmso.csv.gz +++ b/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_median_dmso.csv.gz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:daf5a07496c24462623954c20af72e131e6743bef703a4560e73f6e363231582 -size 58706522 +oid sha256:6319a32be97eefd5e5f77fead0acd6cf3077899d315e3252653f5ad0d6d3a5ad +size 45678293 diff --git a/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_median_feature_select.csv.gz b/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_median_feature_select.csv.gz new file mode 100644 index 0000000..ab37e6c --- /dev/null +++ b/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_median_feature_select.csv.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:51b16ab5b32fa0ef5f532b3b97b8925f4d4239c02d69f0b9dcc15f5066b3cf97 +size 13844674 diff --git a/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_median_feature_select_dmso.csv.gz b/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_median_feature_select_dmso.csv.gz new file mode 100644 index 0000000..4517dc1 --- /dev/null +++ b/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_median_feature_select_dmso.csv.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ab294b4a080e8753a19a240ee551cf64c7490d43c78d1e8aa8263b71062ec8f +size 15985121 diff --git a/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz.csv.gz b/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz.csv.gz index 56c3197..4174c48 100644 --- a/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz.csv.gz +++ b/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz.csv.gz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8b65f4825f9eefac30be2ccdf88c8c3dcbd3c4128144190267f300d9aa77906d -size 69381403 +oid sha256:31639af2fb4e5d0ff5e4b72a973c3271f03a88ff39e6c9129f8ded72a264bf7c +size 54008400 diff --git a/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz_dmso.csv.gz b/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz_dmso.csv.gz index d65b8f5..d021ee0 100644 --- a/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz_dmso.csv.gz +++ b/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz_dmso.csv.gz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:de0c7fbb70c282208e3817393234e6a9007dd8d020ed711503b02b76bb7c275c -size 69290644 +oid sha256:d76344c98f38a8f8808e0c20e22f0f8836f99f70104310c1e89030d3135aefce +size 53942607 diff --git 
a/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz_feature_select.csv.gz b/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz_feature_select.csv.gz new file mode 100644 index 0000000..2fe66a7 --- /dev/null +++ b/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz_feature_select.csv.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10d2e1208464dab2a226dc7eaf9dd79ae12cc21ec0d2cc80b2ec6df43224af41 +size 14189099 diff --git a/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz_feature_select_dmso.csv.gz b/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz_feature_select_dmso.csv.gz new file mode 100644 index 0000000..707687b --- /dev/null +++ b/consensus/2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz_feature_select_dmso.csv.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55b02716d692851c742b7a353cb0e36efcea40b756321da8ef7ac336f802aac4 +size 16677030 diff --git a/consensus/README.md b/consensus/README.md index a3e628f..b816d5c 100644 --- a/consensus/README.md +++ b/consensus/README.md @@ -39,3 +39,21 @@ We then recode the dose points into ascending numerical levels and add a new met Note we generated per-well DMSO consensus signatures and per compound-dose pair consensus signatures for compounds. The per-well DMSO profiles can help to assess plate-associated batch effects. 
+ +## Reproduce Pipeline + +The pipeline can be reproduced by executing the following: + +```bash +# Make sure conda environment is activated +conda activate lincs + +# Reproduce the pipeline for producing consensus signatures +ipython scripts/nbconverted/build-consensus-signatures.py +``` + +`scripts/nbconverted/*.py` were created from the Jupyter notebooks in this folder, like this: + +```sh +jupyter nbconvert --to=script --FilesWriter.build_directory=scripts/nbconverted *.ipynb +``` \ No newline at end of file diff --git a/consensus/build-consensus-signatures.ipynb b/consensus/build-consensus-signatures.ipynb index 7baa3bb..c327344 100644 --- a/consensus/build-consensus-signatures.ipynb +++ b/consensus/build-consensus-signatures.ipynb @@ -9,14 +9,18 @@ "Here, we generate consensus signatures for the LINCS Drug Repurposing Hub Cell Painting subset.\n", "See the project [README.md](README.md) for more details.\n", "\n", - "This notebook generates four files; one per plate normalization and consensus normalization strategy.\n", - "\n", - "| Plate Normalization | Consensus Normalization | Consensus Suffix |\n", - "| :------------------: | :------------------------: | -----------------: |\n", - "| DMSO | Median | `_consensus_median_dmso.csv.gz` |\n", - "| DMSO | MODZ | `_consensus_modz_dmso.csv.gz` |\n", - "| Whole Plate | Median | `_consensus_median.csv.gz` |\n", - "| Whole Plate | MODZ | `_consensus_modz.csv.gz` |" + "This notebook generates eight files; one per plate normalization and consensus normalization strategy, with and without feature selection.\n", + "\n", + "|Feature selection | Plate Normalization | Consensus Normalization | Consensus Suffix |\n", + "|:---------------- | :------------------: | :------------------------: | -----------------: |\n", + "| No | DMSO | Median | `_consensus_median_dmso.csv.gz` |\n", + "| No | DMSO | MODZ | `_consensus_modz_dmso.csv.gz` |\n", + "| No | Whole Plate | Median | `_consensus_median.csv.gz` |\n", + "| No | Whole Plate | 
MODZ | `_consensus_modz.csv.gz` |\n", + "| Yes | DMSO | Median | `_consensus_median_feature_select_dmso.csv.gz` |\n", + "| Yes | DMSO | MODZ | `_consensus_modz_feature_select_dmso.csv.gz` |\n", + "| Yes | Whole Plate | Median | `_consensus_median_feature_select.csv.gz` |\n", + "| Yes | Whole Plate | MODZ | `_consensus_modz_feature_select.csv.gz` |" ] }, { @@ -67,8 +71,8 @@ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 2;\n", - " var nbb_unformatted_code = \"import os\\nimport pathlib\\nimport numpy as np\\nimport pandas as pd\\n\\nfrom pycytominer.aggregate import aggregate\\nfrom pycytominer.consensus import modz_base\\n\\nfrom pycytominer.cyto_utils import infer_cp_features\";\n", - " var nbb_formatted_code = \"import os\\nimport pathlib\\nimport numpy as np\\nimport pandas as pd\\n\\nfrom pycytominer.aggregate import aggregate\\nfrom pycytominer.consensus import modz_base\\n\\nfrom pycytominer.cyto_utils import infer_cp_features\";\n", + " var nbb_unformatted_code = \"import os\\nimport pathlib\\nimport numpy as np\\nimport pandas as pd\\n\\nfrom pycytominer.aggregate import aggregate\\nfrom pycytominer.consensus import modz_base\\nfrom pycytominer.feature_select import feature_select\\n\\nfrom pycytominer.cyto_utils import infer_cp_features\";\n", + " var nbb_formatted_code = \"import os\\nimport pathlib\\nimport numpy as np\\nimport pandas as pd\\n\\nfrom pycytominer.aggregate import aggregate\\nfrom pycytominer.consensus import modz_base\\nfrom pycytominer.feature_select import feature_select\\n\\nfrom pycytominer.cyto_utils import infer_cp_features\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", @@ -97,7 +101,7 @@ "\n", "from pycytominer.aggregate import aggregate\n", "from pycytominer.consensus import modz_base\n", - "\n", + "from pycytominer.feature_select import feature_select\n", "from pycytominer.cyto_utils import 
infer_cp_features" ] }, @@ -355,9 +359,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Create Consensus Profiles\n", + "## Create Consensus Profiles, with and without feature selection\n", "\n", - "We generate two different consensus profiles for each of the normalization strategies. This generates four different files." + "We generate two different consensus profiles for each of the normalization strategies, with and without feature selection. This generates eight different files." ] }, { @@ -371,8 +375,8 @@ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 7;\n", - " var nbb_unformatted_code = \"# Aggregating columns\\nreplicate_cols = [\\n \\\"Metadata_Plate_Map_Name\\\",\\n \\\"Metadata_broad_sample\\\",\\n \\\"Metadata_pert_well\\\",\\n \\\"Metadata_mmoles_per_liter\\\",\\n \\\"Metadata_dose_recode\\\",\\n]\";\n", - " var nbb_formatted_code = \"# Aggregating columns\\nreplicate_cols = [\\n \\\"Metadata_Plate_Map_Name\\\",\\n \\\"Metadata_broad_sample\\\",\\n \\\"Metadata_pert_well\\\",\\n \\\"Metadata_mmoles_per_liter\\\",\\n \\\"Metadata_dose_recode\\\",\\n]\";\n", + " var nbb_unformatted_code = \"# Aggregating columns\\nreplicate_cols = [\\n \\\"Metadata_Plate_Map_Name\\\",\\n \\\"Metadata_broad_sample\\\",\\n \\\"Metadata_pert_well\\\",\\n \\\"Metadata_mmoles_per_liter\\\",\\n \\\"Metadata_dose_recode\\\",\\n \\\"Metadata_moa\\\",\\n \\\"Metadata_target\\\",\\n]\";\n", + " var nbb_formatted_code = \"# Aggregating columns\\nreplicate_cols = [\\n \\\"Metadata_Plate_Map_Name\\\",\\n \\\"Metadata_broad_sample\\\",\\n \\\"Metadata_pert_well\\\",\\n \\\"Metadata_mmoles_per_liter\\\",\\n \\\"Metadata_dose_recode\\\",\\n \\\"Metadata_moa\\\",\\n \\\"Metadata_target\\\",\\n]\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", @@ -401,6 +405,8 @@ " \"Metadata_pert_well\",\n", " \"Metadata_mmoles_per_liter\",\n", " 
\"Metadata_dose_recode\",\n", + " \"Metadata_moa\",\n", + " \"Metadata_target\",\n", "]" ] }, @@ -414,13 +420,21 @@ "output_type": "stream", "text": [ "Now calculating median consensus for whole_plate normalization\n", - "There are 10752 median consensus profiles for whole_plate normalization\n", + "There are 8340 median consensus profiles for whole_plate normalization\n", + "Now feature selecting on median consensus for whole_plate normalization\n", + "There are 510 features in median consensus profiles for whole_plate normalization\n", "Now calculating modz consensus for whole_plate normalization\n", - "There are 10752 modz consensus profiles for whole_plate normalization\n", + "There are 8340 modz consensus profiles for whole_plate normalization\n", + "Now feature selecting on modz consensus for whole_plate normalization\n", + "There are 447 features in modz consensus profiles for whole_plate normalization\n", "Now calculating median consensus for dmso normalization\n", - "There are 10752 median consensus profiles for dmso normalization\n", + "There are 8340 median consensus profiles for dmso normalization\n", + "Now feature selecting on median consensus for dmso normalization\n", + "There are 592 features in median consensus profiles for dmso normalization\n", "Now calculating modz consensus for dmso normalization\n", - "There are 10752 modz consensus profiles for dmso normalization\n" + "There are 8340 modz consensus profiles for dmso normalization\n", + "Now feature selecting on modz consensus for dmso normalization\n", + "There are 527 features in modz consensus profiles for dmso normalization\n" ] }, { @@ -429,8 +443,8 @@ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 8;\n", - " var nbb_unformatted_code = \"all_consensus_dfs = {}\\nfor norm_strat in file_bases:\\n all_profiles_df = all_profiles_dfs[norm_strat]\\n cp_norm_features = cp_features[norm_strat]\\n\\n consensus_profiles = {}\\n for operation in operations:\\n print(f\\\"Now calculating 
{operation} consensus for {norm_strat} normalization\\\")\\n\\n consensus_profiles[operation] = consensus_apply(\\n all_profiles_df,\\n operation=operation,\\n cp_features=cp_norm_features,\\n replicate_cols=replicate_cols,\\n )\\n\\n # How many DMSO profiles per well?\\n print(\\n f\\\"There are {consensus_profiles[operation].shape[0]} {operation} consensus profiles for {norm_strat} normalization\\\"\\n )\\n\\n all_consensus_dfs[norm_strat] = consensus_profiles\";\n", - " var nbb_formatted_code = \"all_consensus_dfs = {}\\nfor norm_strat in file_bases:\\n all_profiles_df = all_profiles_dfs[norm_strat]\\n cp_norm_features = cp_features[norm_strat]\\n\\n consensus_profiles = {}\\n for operation in operations:\\n print(f\\\"Now calculating {operation} consensus for {norm_strat} normalization\\\")\\n\\n consensus_profiles[operation] = consensus_apply(\\n all_profiles_df,\\n operation=operation,\\n cp_features=cp_norm_features,\\n replicate_cols=replicate_cols,\\n )\\n\\n # How many DMSO profiles per well?\\n print(\\n f\\\"There are {consensus_profiles[operation].shape[0]} {operation} consensus profiles for {norm_strat} normalization\\\"\\n )\\n\\n all_consensus_dfs[norm_strat] = consensus_profiles\";\n", + " var nbb_unformatted_code = \"# feature selection operations\\nfeature_select_ops = [\\n \\\"drop_na_columns\\\",\\n \\\"variance_threshold\\\",\\n \\\"correlation_threshold\\\",\\n \\\"blacklist\\\",\\n]\\n\\nall_consensus_dfs = {}\\nfor norm_strat in file_bases:\\n all_profiles_df = all_profiles_dfs[norm_strat]\\n cp_norm_features = cp_features[norm_strat]\\n\\n consensus_profiles = {}\\n for operation in operations:\\n print(f\\\"Now calculating {operation} consensus for {norm_strat} normalization\\\")\\n\\n consensus_profiles[operation] = {}\\n\\n consensus_profiles[operation][\\\"no_feat_select\\\"] = consensus_apply(\\n all_profiles_df,\\n operation=operation,\\n cp_features=cp_norm_features,\\n replicate_cols=replicate_cols,\\n )\\n\\n # How many DMSO 
profiles per well?\\n print(\\n f\\\"There are {consensus_profiles[operation]['no_feat_select'].shape[0]} {operation} consensus profiles for {norm_strat} normalization\\\"\\n )\\n\\n # feature selection\\n print(\\n f\\\"Now feature selecting on {operation} consensus for {norm_strat} normalization\\\"\\n )\\n\\n consensus_profiles[operation][\\\"feat_select\\\"] = feature_select(\\n profiles=consensus_profiles[operation][\\\"no_feat_select\\\"],\\n features=\\\"infer\\\",\\n operation=feature_select_ops,\\n )\\n\\n # How many features in feature selected profile?\\n print(\\n f\\\"There are {consensus_profiles[operation]['feat_select'].shape[1]} features in {operation} consensus profiles for {norm_strat} normalization\\\"\\n )\\n\\n all_consensus_dfs[norm_strat] = consensus_profiles\";\n", + " var nbb_formatted_code = \"# feature selection operations\\nfeature_select_ops = [\\n \\\"drop_na_columns\\\",\\n \\\"variance_threshold\\\",\\n \\\"correlation_threshold\\\",\\n \\\"blacklist\\\",\\n]\\n\\nall_consensus_dfs = {}\\nfor norm_strat in file_bases:\\n all_profiles_df = all_profiles_dfs[norm_strat]\\n cp_norm_features = cp_features[norm_strat]\\n\\n consensus_profiles = {}\\n for operation in operations:\\n print(f\\\"Now calculating {operation} consensus for {norm_strat} normalization\\\")\\n\\n consensus_profiles[operation] = {}\\n\\n consensus_profiles[operation][\\\"no_feat_select\\\"] = consensus_apply(\\n all_profiles_df,\\n operation=operation,\\n cp_features=cp_norm_features,\\n replicate_cols=replicate_cols,\\n )\\n\\n # How many DMSO profiles per well?\\n print(\\n f\\\"There are {consensus_profiles[operation]['no_feat_select'].shape[0]} {operation} consensus profiles for {norm_strat} normalization\\\"\\n )\\n\\n # feature selection\\n print(\\n f\\\"Now feature selecting on {operation} consensus for {norm_strat} normalization\\\"\\n )\\n\\n consensus_profiles[operation][\\\"feat_select\\\"] = feature_select(\\n 
profiles=consensus_profiles[operation][\\\"no_feat_select\\\"],\\n features=\\\"infer\\\",\\n operation=feature_select_ops,\\n )\\n\\n # How many features in feature selected profile?\\n print(\\n f\\\"There are {consensus_profiles[operation]['feat_select'].shape[1]} features in {operation} consensus profiles for {norm_strat} normalization\\\"\\n )\\n\\n all_consensus_dfs[norm_strat] = consensus_profiles\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", @@ -452,6 +466,14 @@ } ], "source": [ + "# feature selection operations\n", + "feature_select_ops = [\n", + " \"drop_na_columns\",\n", + " \"variance_threshold\",\n", + " \"correlation_threshold\",\n", + " \"blacklist\",\n", + "]\n", + "\n", "all_consensus_dfs = {}\n", "for norm_strat in file_bases:\n", " all_profiles_df = all_profiles_dfs[norm_strat]\n", @@ -461,7 +483,9 @@ " for operation in operations:\n", " print(f\"Now calculating {operation} consensus for {norm_strat} normalization\")\n", "\n", - " consensus_profiles[operation] = consensus_apply(\n", + " consensus_profiles[operation] = {}\n", + "\n", + " consensus_profiles[operation][\"no_feat_select\"] = consensus_apply(\n", " all_profiles_df,\n", " operation=operation,\n", " cp_features=cp_norm_features,\n", @@ -470,7 +494,23 @@ "\n", " # How many DMSO profiles per well?\n", " print(\n", - " f\"There are {consensus_profiles[operation].shape[0]} {operation} consensus profiles for {norm_strat} normalization\"\n", + " f\"There are {consensus_profiles[operation]['no_feat_select'].shape[0]} {operation} consensus profiles for {norm_strat} normalization\"\n", + " )\n", + "\n", + " # feature selection\n", + " print(\n", + " f\"Now feature selecting on {operation} consensus for {norm_strat} normalization\"\n", + " )\n", + "\n", + " consensus_profiles[operation][\"feat_select\"] = feature_select(\n", + " 
profiles=consensus_profiles[operation][\"no_feat_select\"],\n", + " features=\"infer\",\n", + " operation=feature_select_ops,\n", + " )\n", + "\n", + " # How many features in feature selected profile?\n", + " print(\n", + " f\"There are {consensus_profiles[operation]['feat_select'].shape[1]} features in {operation} consensus profiles for {norm_strat} normalization\"\n", " )\n", "\n", " all_consensus_dfs[norm_strat] = consensus_profiles" @@ -480,7 +520,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Merge and Output Consensus Signatures" + "## Merge and Output Consensus Signatures, with and without feature selection" ] }, { @@ -492,18 +532,30 @@ "name": "stdout", "output_type": "stream", "text": [ - "Now Writing: Consensus Operation: median; Norm Strategy: whole_plate\n", + "Now Writing: Feature selection: No; Consensus Operation: median; Norm Strategy: whole_plate\n", "File: 2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_median.csv.gz\n", - "(10752, 1788)\n", - "Now Writing: Consensus Operation: modz; Norm Strategy: whole_plate\n", + "(8340, 1790)\n", + "Now Writing: Feature selection: Yes; Consensus Operation: median; Norm Strategy: whole_plate\n", + "File: 2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_median_feature_select.csv.gz\n", + "(8340, 510)\n", + "Now Writing: Feature selection: No; Consensus Operation: modz; Norm Strategy: whole_plate\n", "File: 2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz.csv.gz\n", - "(10752, 1788)\n", - "Now Writing: Consensus Operation: median; Norm Strategy: dmso\n", + "(8340, 1790)\n", + "Now Writing: Feature selection: Yes; Consensus Operation: modz; Norm Strategy: whole_plate\n", + "File: 2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz_feature_select.csv.gz\n", + "(8340, 447)\n", + "Now Writing: Feature selection: No; Consensus Operation: median; Norm Strategy: dmso\n", "File: 
2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_median_dmso.csv.gz\n", - "(10752, 1788)\n", - "Now Writing: Consensus Operation: modz; Norm Strategy: dmso\n", + "(8340, 1790)\n", + "Now Writing: Feature selection: Yes; Consensus Operation: median; Norm Strategy: dmso\n", + "File: 2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_median_feature_select_dmso.csv.gz\n", + "(8340, 592)\n", + "Now Writing: Feature selection: No; Consensus Operation: modz; Norm Strategy: dmso\n", "File: 2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz_dmso.csv.gz\n", - "(10752, 1788)\n" + "(8340, 1790)\n", + "Now Writing: Feature selection: Yes; Consensus Operation: modz; Norm Strategy: dmso\n", + "File: 2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz_feature_select_dmso.csv.gz\n", + "(8340, 527)\n" ] }, { @@ -512,8 +564,8 @@ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 9;\n", - " var nbb_unformatted_code = \"for norm_strat in file_bases:\\n file_suffix = file_bases[norm_strat][\\\"output_file_suffix\\\"]\\n for operation in operations:\\n consensus_file = f\\\"{batch}_consensus_{operation}{file_suffix}\\\"\\n consensus_file = pathlib.Path(batch, consensus_file)\\n\\n consensus_df = all_consensus_dfs[norm_strat][operation]\\n\\n print(\\n f\\\"Now Writing: Consensus Operation: {operation}; Norm Strategy: {norm_strat}\\\\nFile: {consensus_file}\\\"\\n )\\n print(consensus_df.shape)\\n\\n consensus_df.to_csv(\\n consensus_file, sep=\\\",\\\", compression=\\\"gzip\\\", float_format=\\\"%5g\\\", index=False\\n )\";\n", - " var nbb_formatted_code = \"for norm_strat in file_bases:\\n file_suffix = file_bases[norm_strat][\\\"output_file_suffix\\\"]\\n for operation in operations:\\n consensus_file = f\\\"{batch}_consensus_{operation}{file_suffix}\\\"\\n consensus_file = pathlib.Path(batch, consensus_file)\\n\\n consensus_df = all_consensus_dfs[norm_strat][operation]\\n\\n print(\\n f\\\"Now Writing: 
Consensus Operation: {operation}; Norm Strategy: {norm_strat}\\\\nFile: {consensus_file}\\\"\\n )\\n print(consensus_df.shape)\\n\\n consensus_df.to_csv(\\n consensus_file, sep=\\\",\\\", compression=\\\"gzip\\\", float_format=\\\"%5g\\\", index=False\\n )\";\n", + " var nbb_unformatted_code = \"float_format = \\\"%5g\\\"\\ncompression = \\\"gzip\\\"\\n\\nfor norm_strat in file_bases:\\n file_suffix = file_bases[norm_strat][\\\"output_file_suffix\\\"]\\n for operation in operations:\\n\\n # No feature selection\\n consensus_file = f\\\"{batch}_consensus_{operation}{file_suffix}\\\"\\n consensus_file = pathlib.Path(batch, consensus_file)\\n\\n consensus_df = all_consensus_dfs[norm_strat][operation][\\\"no_feat_select\\\"]\\n\\n print(\\n f\\\"Now Writing: Feature selection: No; Consensus Operation: {operation}; Norm Strategy: {norm_strat}\\\\nFile: {consensus_file}\\\"\\n )\\n print(consensus_df.shape)\\n\\n consensus_df.to_csv(\\n consensus_file,\\n sep=\\\",\\\",\\n compression=compression,\\n float_format=float_format,\\n index=False,\\n )\\n\\n # With feature selection\\n consensus_feat_df = all_consensus_dfs[norm_strat][operation][\\\"feat_select\\\"]\\n\\n consensus_feat_file = (\\n f\\\"{batch}_consensus_{operation}_feature_select{file_suffix}\\\"\\n )\\n consensus_feat_file = pathlib.Path(batch, consensus_feat_file)\\n\\n print(\\n f\\\"Now Writing: Feature selection: Yes; Consensus Operation: {operation}; Norm Strategy: {norm_strat}\\\\nFile: {consensus_feat_file}\\\"\\n )\\n print(consensus_feat_df.shape)\\n\\n consensus_feat_df.to_csv(\\n consensus_feat_file,\\n sep=\\\",\\\",\\n compression=compression,\\n float_format=float_format,\\n index=False,\\n )\";\n", + " var nbb_formatted_code = \"float_format = \\\"%5g\\\"\\ncompression = \\\"gzip\\\"\\n\\nfor norm_strat in file_bases:\\n file_suffix = file_bases[norm_strat][\\\"output_file_suffix\\\"]\\n for operation in operations:\\n\\n # No feature selection\\n consensus_file = 
f\\\"{batch}_consensus_{operation}{file_suffix}\\\"\\n consensus_file = pathlib.Path(batch, consensus_file)\\n\\n consensus_df = all_consensus_dfs[norm_strat][operation][\\\"no_feat_select\\\"]\\n\\n print(\\n f\\\"Now Writing: Feature selection: No; Consensus Operation: {operation}; Norm Strategy: {norm_strat}\\\\nFile: {consensus_file}\\\"\\n )\\n print(consensus_df.shape)\\n\\n consensus_df.to_csv(\\n consensus_file,\\n sep=\\\",\\\",\\n compression=compression,\\n float_format=float_format,\\n index=False,\\n )\\n\\n # With feature selection\\n consensus_feat_df = all_consensus_dfs[norm_strat][operation][\\\"feat_select\\\"]\\n\\n consensus_feat_file = (\\n f\\\"{batch}_consensus_{operation}_feature_select{file_suffix}\\\"\\n )\\n consensus_feat_file = pathlib.Path(batch, consensus_feat_file)\\n\\n print(\\n f\\\"Now Writing: Feature selection: Yes; Consensus Operation: {operation}; Norm Strategy: {norm_strat}\\\\nFile: {consensus_feat_file}\\\"\\n )\\n print(consensus_feat_df.shape)\\n\\n consensus_feat_df.to_csv(\\n consensus_feat_file,\\n sep=\\\",\\\",\\n compression=compression,\\n float_format=float_format,\\n index=False,\\n )\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", @@ -535,21 +587,51 @@ } ], "source": [ + "float_format = \"%5g\"\n", + "compression = \"gzip\"\n", + "\n", "for norm_strat in file_bases:\n", " file_suffix = file_bases[norm_strat][\"output_file_suffix\"]\n", " for operation in operations:\n", + "\n", + " # No feature selection\n", " consensus_file = f\"{batch}_consensus_{operation}{file_suffix}\"\n", " consensus_file = pathlib.Path(batch, consensus_file)\n", "\n", - " consensus_df = all_consensus_dfs[norm_strat][operation]\n", + " consensus_df = all_consensus_dfs[norm_strat][operation][\"no_feat_select\"]\n", "\n", " print(\n", - " f\"Now Writing: Consensus Operation: {operation}; Norm Strategy: 
{norm_strat}\\nFile: {consensus_file}\"\n", + " f\"Now Writing: Feature selection: No; Consensus Operation: {operation}; Norm Strategy: {norm_strat}\\nFile: {consensus_file}\"\n", " )\n", " print(consensus_df.shape)\n", "\n", " consensus_df.to_csv(\n", - " consensus_file, sep=\",\", compression=\"gzip\", float_format=\"%5g\", index=False\n", + " consensus_file,\n", + " sep=\",\",\n", + " compression=compression,\n", + " float_format=float_format,\n", + " index=False,\n", + " )\n", + "\n", + " # With feature selection\n", + " consensus_feat_df = all_consensus_dfs[norm_strat][operation][\"feat_select\"]\n", + "\n", + " consensus_feat_file = (\n", + " f\"{batch}_consensus_{operation}_feature_select{file_suffix}\"\n", + " )\n", + " consensus_feat_file = pathlib.Path(batch, consensus_feat_file)\n", + "\n", + " print(\n", + " f\"Now Writing: Feature selection: Yes; Consensus Operation: {operation}; Norm Strategy: {norm_strat}\\nFile: {consensus_feat_file}\"\n", + " )\n", + " print(consensus_feat_df.shape)\n", + "\n", + " consensus_feat_df.to_csv(\n", + " consensus_feat_file,\n", + " sep=\",\",\n", + " compression=compression,\n", + " float_format=float_format,\n", + " index=False,\n", " )" ] } @@ -570,7 +652,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.6" + "version": "3.7.10" } }, "nbformat": 4, diff --git a/consensus/scripts/nbconverted/build-consensus-signatures.py b/consensus/scripts/nbconverted/build-consensus-signatures.py index 1ea23ea..ad5dd55 100644 --- a/consensus/scripts/nbconverted/build-consensus-signatures.py +++ b/consensus/scripts/nbconverted/build-consensus-signatures.py @@ -6,14 +6,18 @@ # Here, we generate consensus signatures for the LINCS Drug Repurposing Hub Cell Painting subset. # See the project [README.md](README.md) for more details. # -# This notebook generates four files; one per plate normalization and consensus normalization strategy. 
+# This notebook generates eight files; one per plate normalization and consensus normalization strategy, with and without feature selection. # -# | Plate Normalization | Consensus Normalization | Consensus Suffix | -# | :------------------: | :------------------------: | -----------------: | -# | DMSO | Median | `_consensus_median_dmso.csv.gz` | -# | DMSO | MODZ | `_consensus_modz_dmso.csv.gz` | -# | Whole Plate | Median | `_consensus_median.csv.gz` | -# | Whole Plate | MODZ | `_consensus_modz.csv.gz` | +# |Feature selection | Plate Normalization | Consensus Normalization | Consensus Suffix | +# |:---------------- | :------------------: | :------------------------: | -----------------: | +# | No | DMSO | Median | `_consensus_median_dmso.csv.gz` | +# | No | DMSO | MODZ | `_consensus_modz_dmso.csv.gz` | +# | No | Whole Plate | Median | `_consensus_median.csv.gz` | +# | No | Whole Plate | MODZ | `_consensus_modz.csv.gz` | +# | Yes | DMSO | Median | `_consensus_median_feature_select_dmso.csv.gz` | +# | Yes | DMSO | MODZ | `_consensus_modz_feature_select_dmso.csv.gz` | +# | Yes | Whole Plate | Median | `_consensus_median_feature_select.csv.gz` | +# | Yes | Whole Plate | MODZ | `_consensus_modz_feature_select.csv.gz` | # In[1]: @@ -31,7 +35,7 @@ from pycytominer.aggregate import aggregate from pycytominer.consensus import modz_base - +from pycytominer.feature_select import feature_select from pycytominer.cyto_utils import infer_cp_features @@ -141,9 +145,9 @@ def consensus_apply(df, operation, cp_features, replicate_cols): del all_profiles_df -# ## Create Consensus Profiles +# ## Create Consensus Profiles, with and without feature selection # -# We generate two different consensus profiles for each of the normalization strategies. This generates four different files. +# We generate two different consensus profiles for each of the normalization strategies, with and without feature selection. This generates eight different files. 
# In[7]: @@ -155,12 +159,22 @@ def consensus_apply(df, operation, cp_features, replicate_cols): "Metadata_pert_well", "Metadata_mmoles_per_liter", "Metadata_dose_recode", + "Metadata_moa", + "Metadata_target", ] # In[8]: +# feature selection operations +feature_select_ops = [ + "drop_na_columns", + "variance_threshold", + "correlation_threshold", + "blacklist", +] + all_consensus_dfs = {} for norm_strat in file_bases: all_profiles_df = all_profiles_dfs[norm_strat] @@ -170,7 +184,9 @@ def consensus_apply(df, operation, cp_features, replicate_cols): for operation in operations: print(f"Now calculating {operation} consensus for {norm_strat} normalization") - consensus_profiles[operation] = consensus_apply( + consensus_profiles[operation] = {} + + consensus_profiles[operation]["no_feat_select"] = consensus_apply( all_profiles_df, operation=operation, cp_features=cp_norm_features, @@ -179,31 +195,77 @@ def consensus_apply(df, operation, cp_features, replicate_cols): # How many DMSO profiles per well? print( - f"There are {consensus_profiles[operation].shape[0]} {operation} consensus profiles for {norm_strat} normalization" + f"There are {consensus_profiles[operation]['no_feat_select'].shape[0]} {operation} consensus profiles for {norm_strat} normalization" + ) + + # feature selection + print( + f"Now feature selecting on {operation} consensus for {norm_strat} normalization" + ) + + consensus_profiles[operation]["feat_select"] = feature_select( + profiles=consensus_profiles[operation]["no_feat_select"], + features="infer", + operation=feature_select_ops, + ) + + # How many features in feature selected profile? 
+ print( + f"There are {consensus_profiles[operation]['feat_select'].shape[1]} features in {operation} consensus profiles for {norm_strat} normalization" ) all_consensus_dfs[norm_strat] = consensus_profiles -# ## Merge and Output Consensus Signatures +# ## Merge and Output Consensus Signatures, with and without feature selection # In[9]: +float_format = "%5g" +compression = "gzip" + for norm_strat in file_bases: file_suffix = file_bases[norm_strat]["output_file_suffix"] for operation in operations: + + # No feature selection consensus_file = f"{batch}_consensus_{operation}{file_suffix}" consensus_file = pathlib.Path(batch, consensus_file) - consensus_df = all_consensus_dfs[norm_strat][operation] + consensus_df = all_consensus_dfs[norm_strat][operation]["no_feat_select"] print( - f"Now Writing: Consensus Operation: {operation}; Norm Strategy: {norm_strat}\nFile: {consensus_file}" + f"Now Writing: Feature selection: No; Consensus Operation: {operation}; Norm Strategy: {norm_strat}\nFile: {consensus_file}" ) print(consensus_df.shape) consensus_df.to_csv( - consensus_file, sep=",", compression="gzip", float_format="%5g", index=False + consensus_file, + sep=",", + compression=compression, + float_format=float_format, + index=False, + ) + + # With feature selection + consensus_feat_df = all_consensus_dfs[norm_strat][operation]["feat_select"] + + consensus_feat_file = ( + f"{batch}_consensus_{operation}_feature_select{file_suffix}" + ) + consensus_feat_file = pathlib.Path(batch, consensus_feat_file) + + print( + f"Now Writing: Feature selection: Yes; Consensus Operation: {operation}; Norm Strategy: {norm_strat}\nFile: {consensus_feat_file}" + ) + print(consensus_feat_df.shape) + + consensus_feat_df.to_csv( + consensus_feat_file, + sep=",", + compression=compression, + float_format=float_format, + index=False, ) diff --git a/environment.yml b/environment.yml index c093792..92695c1 100644 --- a/environment.yml +++ b/environment.yml @@ -2,6 +2,7 @@ name: lincs channels: - 
conda-forge dependencies: +- pip=21.0.1 - conda-forge::pandas=1.0.1 - conda-forge::tabulate=0.8.7 - conda-forge::jupyter=1.0.0