From 2409017b82c88b0c41ca4a8f2b1dd0d42fa5fa3f Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Tue, 9 Mar 2021 22:16:55 -0500 Subject: [PATCH] add more columns, include feature selection --- consensus/build-consensus-signatures.ipynb | 176 +++++++++++++----- .../nbconverted/build-consensus-signatures.py | 103 ++++++++-- 2 files changed, 219 insertions(+), 60 deletions(-) diff --git a/consensus/build-consensus-signatures.ipynb b/consensus/build-consensus-signatures.ipynb index 079a4e4..444ceef 100644 --- a/consensus/build-consensus-signatures.ipynb +++ b/consensus/build-consensus-signatures.ipynb @@ -9,14 +9,18 @@ "Here, we generate consensus signatures for the LINCS Drug Repurposing Hub Cell Painting subset.\n", "See the project [README.md](README.md) for more details.\n", "\n", - "This notebook generates four files; one per plate normalization and consensus normalization strategy.\n", - "\n", - "| Plate Normalization | Consensus Normalization | Consensus Suffix |\n", - "| :------------------: | :------------------------: | -----------------: |\n", - "| DMSO | Median | `_consensus_median_dmso.csv.gz` |\n", - "| DMSO | MODZ | `_consensus_modz_dmso.csv.gz` |\n", - "| Whole Plate | Median | `_consensus_median.csv.gz` |\n", - "| Whole Plate | MODZ | `_consensus_modz.csv.gz` |" + "This notebook generates eight files; one per plate normalization and consensus normalization strategy, with and without feature selection.\n", + "\n", + "|Feature selection | Plate Normalization | Consensus Normalization | Consensus Suffix |\n", + "|:---------------- | :------------------: | :------------------------: | -----------------: |\n", + "| No | DMSO | Median | `_consensus_median_dmso.csv.gz` |\n", + "| No | DMSO | MODZ | `_consensus_modz_dmso.csv.gz` |\n", + "| No | Whole Plate | Median | `_consensus_median.csv.gz` |\n", + "| No | Whole Plate | MODZ | `_consensus_modz.csv.gz` |\n", + "| Yes | DMSO | Median | `_consensus_median_feature_select_dmso.csv.gz` |\n", + "| Yes | DMSO | MODZ | `_consensus_modz_feature_select_dmso.csv.gz` |\n", + "| Yes | Whole Plate | Median | `_consensus_median_feature_select.csv.gz` |\n", + "| Yes | Whole Plate | MODZ | `_consensus_modz_feature_select.csv.gz` |" ] }, { @@ -67,8 +71,8 @@ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 2;\n", - " var nbb_unformatted_code = \"import os\\nimport pathlib\\nimport numpy as np\\nimport pandas as pd\\n\\nfrom pycytominer.aggregate import aggregate\\nfrom pycytominer.consensus import modz_base\\n\\nfrom pycytominer.cyto_utils import infer_cp_features\";\n", - " var nbb_formatted_code = \"import os\\nimport pathlib\\nimport numpy as np\\nimport pandas as pd\\n\\nfrom pycytominer.aggregate import aggregate\\nfrom pycytominer.consensus import modz_base\\n\\nfrom pycytominer.cyto_utils import infer_cp_features\";\n", + " var nbb_unformatted_code = \"import os\\nimport pathlib\\nimport numpy as np\\nimport pandas as pd\\n\\nfrom pycytominer.aggregate import aggregate\\nfrom pycytominer.consensus import modz_base\\nfrom pycytominer.feature_select import feature_select\\n\\nfrom pycytominer.cyto_utils import infer_cp_features\";\n", + " var nbb_formatted_code = \"import os\\nimport pathlib\\nimport numpy as np\\nimport pandas as pd\\n\\nfrom pycytominer.aggregate import aggregate\\nfrom pycytominer.consensus import modz_base\\nfrom pycytominer.feature_select import feature_select\\n\\nfrom pycytominer.cyto_utils import infer_cp_features\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", @@ -97,6 +101,7 @@ "\n", "from pycytominer.aggregate import aggregate\n", "from pycytominer.consensus import modz_base\n", + "from pycytominer.feature_select import feature_select\n", "\n", "from pycytominer.cyto_utils import infer_cp_features" ] @@ -355,9 +360,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Create Consensus Profiles\n", + "## Create Consensus Profiles, with and without feature selection\n", "\n", - "We generate two different consensus profiles for each of the normalization strategies. This generates four different files." + "We generate two different consensus profiles for each of the normalization strategies, with and without feature selection. This generates eight different files." ] }, { @@ -371,8 +376,8 @@ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 7;\n", - " var nbb_unformatted_code = \"# Aggregating columns\\nreplicate_cols = [\\n \\\"Metadata_Plate_Map_Name\\\",\\n \\\"Metadata_broad_sample\\\",\\n \\\"Metadata_pert_well\\\",\\n \\\"Metadata_mmoles_per_liter\\\",\\n \\\"Metadata_dose_recode\\\",\\n]\";\n", - " var nbb_formatted_code = \"# Aggregating columns\\nreplicate_cols = [\\n \\\"Metadata_Plate_Map_Name\\\",\\n \\\"Metadata_broad_sample\\\",\\n \\\"Metadata_pert_well\\\",\\n \\\"Metadata_mmoles_per_liter\\\",\\n \\\"Metadata_dose_recode\\\",\\n]\";\n", + " var nbb_unformatted_code = \"# Aggregating columns\\nreplicate_cols = [\\n \\\"Metadata_Plate_Map_Name\\\",\\n \\\"Metadata_broad_sample\\\",\\n \\\"Metadata_pert_well\\\",\\n \\\"Metadata_mmoles_per_liter\\\",\\n \\\"Metadata_dose_recode\\\",\\n \\\"Metadata_moa\\\",\\n \\\"Metadata_target\\\",\\n]\";\n", + " var nbb_formatted_code = \"# Aggregating columns\\nreplicate_cols = [\\n \\\"Metadata_Plate_Map_Name\\\",\\n \\\"Metadata_broad_sample\\\",\\n \\\"Metadata_pert_well\\\",\\n \\\"Metadata_mmoles_per_liter\\\",\\n \\\"Metadata_dose_recode\\\",\\n \\\"Metadata_moa\\\",\\n \\\"Metadata_target\\\",\\n]\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", @@ -401,6 +406,8 @@ " \"Metadata_pert_well\",\n", " \"Metadata_mmoles_per_liter\",\n", " \"Metadata_dose_recode\",\n", + " \"Metadata_moa\",\n", + " \"Metadata_target\",\n", "]" ] }, @@ -414,13 +421,21 @@ "output_type": "stream", "text": [ "Now calculating median consensus for whole_plate normalization\n", - "There are 10752 median consensus profiles for whole_plate normalization\n", + "There are 8340 median consensus profiles for whole_plate normalization\n", + "Now feature selecting on median consensus for whole_plate normalization\n", + "There are 510 features in median consensus profiles for whole_plate normalization\n", "Now calculating modz consensus for whole_plate normalization\n", - "There are 10752 modz consensus profiles for whole_plate normalization\n", + "There are 8340 modz consensus profiles for whole_plate normalization\n", + "Now feature selecting on modz consensus for whole_plate normalization\n", + "There are 447 features in modz consensus profiles for whole_plate normalization\n", "Now calculating median consensus for dmso normalization\n", - "There are 10752 median consensus profiles for dmso normalization\n", + "There are 8340 median consensus profiles for dmso normalization\n", + "Now feature selecting on median consensus for dmso normalization\n", + "There are 592 features in median consensus profiles for dmso normalization\n", "Now calculating modz consensus for dmso normalization\n", - "There are 10752 modz consensus profiles for dmso normalization\n" + "There are 8340 modz consensus profiles for dmso normalization\n", + "Now feature selecting on modz consensus for dmso normalization\n", + "There are 527 features in modz consensus profiles for dmso normalization\n" ] }, { @@ -429,8 +444,8 @@ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 8;\n", - " var nbb_unformatted_code = \"all_consensus_dfs = {}\\nfor norm_strat in file_bases:\\n all_profiles_df = all_profiles_dfs[norm_strat]\\n cp_norm_features = cp_features[norm_strat]\\n\\n consensus_profiles = {}\\n for operation in operations:\\n print(f\\\"Now calculating {operation} consensus for {norm_strat} normalization\\\")\\n\\n consensus_profiles[operation] = consensus_apply(\\n all_profiles_df,\\n operation=operation,\\n cp_features=cp_norm_features,\\n replicate_cols=replicate_cols,\\n )\\n\\n # How many DMSO profiles per well?\\n print(\\n f\\\"There are {consensus_profiles[operation].shape[0]} {operation} consensus profiles for {norm_strat} normalization\\\"\\n )\\n\\n all_consensus_dfs[norm_strat] = consensus_profiles\";\n", - " var nbb_formatted_code = \"all_consensus_dfs = {}\\nfor norm_strat in file_bases:\\n all_profiles_df = all_profiles_dfs[norm_strat]\\n cp_norm_features = cp_features[norm_strat]\\n\\n consensus_profiles = {}\\n for operation in operations:\\n print(f\\\"Now calculating {operation} consensus for {norm_strat} normalization\\\")\\n\\n consensus_profiles[operation] = consensus_apply(\\n all_profiles_df,\\n operation=operation,\\n cp_features=cp_norm_features,\\n replicate_cols=replicate_cols,\\n )\\n\\n # How many DMSO profiles per well?\\n print(\\n f\\\"There are {consensus_profiles[operation].shape[0]} {operation} consensus profiles for {norm_strat} normalization\\\"\\n )\\n\\n all_consensus_dfs[norm_strat] = consensus_profiles\";\n", + " var nbb_unformatted_code = \"# feature selection operations\\nfeature_select_ops = [\\n \\\"drop_na_columns\\\",\\n \\\"variance_threshold\\\",\\n \\\"correlation_threshold\\\",\\n \\\"blacklist\\\",\\n]\\n\\nall_consensus_dfs = {}\\nfor norm_strat in file_bases:\\n all_profiles_df = all_profiles_dfs[norm_strat]\\n cp_norm_features = cp_features[norm_strat]\\n\\n consensus_profiles = {}\\n for operation in operations:\\n print(f\\\"Now calculating {operation} consensus for {norm_strat} normalization\\\")\\n\\n consensus_profiles[operation] = {}\\n\\n consensus_profiles[operation][\\\"no_feat_select\\\"] = consensus_apply(\\n all_profiles_df,\\n operation=operation,\\n cp_features=cp_norm_features,\\n replicate_cols=replicate_cols,\\n )\\n\\n # How many DMSO profiles per well?\\n print(\\n f\\\"There are {consensus_profiles[operation]['no_feat_select'].shape[0]} {operation} consensus profiles for {norm_strat} normalization\\\"\\n )\\n\\n # feature selection\\n print(\\n f\\\"Now feature selecting on {operation} consensus for {norm_strat} normalization\\\"\\n )\\n\\n consensus_profiles[operation][\\\"feat_select\\\"] = feature_select(\\n profiles=consensus_profiles[operation][\\\"no_feat_select\\\"],\\n features=\\\"infer\\\",\\n operation=feature_select_ops,\\n )\\n\\n # How many features in feature selected profile?\\n print(\\n f\\\"There are {consensus_profiles[operation]['feat_select'].shape[1]} features in {operation} consensus profiles for {norm_strat} normalization\\\"\\n )\\n\\n all_consensus_dfs[norm_strat] = consensus_profiles\";\n", + " var nbb_formatted_code = \"# feature selection operations\\nfeature_select_ops = [\\n \\\"drop_na_columns\\\",\\n \\\"variance_threshold\\\",\\n \\\"correlation_threshold\\\",\\n \\\"blacklist\\\",\\n]\\n\\nall_consensus_dfs = {}\\nfor norm_strat in file_bases:\\n all_profiles_df = all_profiles_dfs[norm_strat]\\n cp_norm_features = cp_features[norm_strat]\\n\\n consensus_profiles = {}\\n for operation in operations:\\n print(f\\\"Now calculating {operation} consensus for {norm_strat} normalization\\\")\\n\\n consensus_profiles[operation] = {}\\n\\n consensus_profiles[operation][\\\"no_feat_select\\\"] = consensus_apply(\\n all_profiles_df,\\n operation=operation,\\n cp_features=cp_norm_features,\\n replicate_cols=replicate_cols,\\n )\\n\\n # How many DMSO profiles per well?\\n print(\\n f\\\"There are {consensus_profiles[operation]['no_feat_select'].shape[0]} {operation} consensus profiles for {norm_strat} normalization\\\"\\n )\\n\\n # feature selection\\n print(\\n f\\\"Now feature selecting on {operation} consensus for {norm_strat} normalization\\\"\\n )\\n\\n consensus_profiles[operation][\\\"feat_select\\\"] = feature_select(\\n profiles=consensus_profiles[operation][\\\"no_feat_select\\\"],\\n features=\\\"infer\\\",\\n operation=feature_select_ops,\\n )\\n\\n # How many features in feature selected profile?\\n print(\\n f\\\"There are {consensus_profiles[operation]['feat_select'].shape[1]} features in {operation} consensus profiles for {norm_strat} normalization\\\"\\n )\\n\\n all_consensus_dfs[norm_strat] = consensus_profiles\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", @@ -452,6 +467,14 @@ } ], "source": [ + "# feature selection operations\n", + "feature_select_ops = [\n", + " \"drop_na_columns\",\n", + " \"variance_threshold\",\n", + " \"correlation_threshold\",\n", + " \"blacklist\",\n", + "]\n", + "\n", "all_consensus_dfs = {}\n", "for norm_strat in file_bases:\n", " all_profiles_df = all_profiles_dfs[norm_strat]\n", @@ -461,7 +484,9 @@ " for operation in operations:\n", " print(f\"Now calculating {operation} consensus for {norm_strat} normalization\")\n", "\n", - " consensus_profiles[operation] = consensus_apply(\n", + " consensus_profiles[operation] = {}\n", + "\n", + " consensus_profiles[operation][\"no_feat_select\"] = consensus_apply(\n", " all_profiles_df,\n", " operation=operation,\n", " cp_features=cp_norm_features,\n", @@ -470,7 +495,23 @@ "\n", " # How many DMSO profiles per well?\n", " print(\n", - " f\"There are {consensus_profiles[operation].shape[0]} {operation} consensus profiles for {norm_strat} normalization\"\n", + " f\"There are {consensus_profiles[operation]['no_feat_select'].shape[0]} {operation} consensus profiles for {norm_strat} normalization\"\n", + " )\n", + "\n", + " # feature selection\n", + " print(\n", + " f\"Now feature selecting on {operation} consensus for {norm_strat} normalization\"\n", + " )\n", + "\n", + " consensus_profiles[operation][\"feat_select\"] = feature_select(\n", + " profiles=consensus_profiles[operation][\"no_feat_select\"],\n", + " features=\"infer\",\n", + " operation=feature_select_ops,\n", + " )\n", + "\n", + " # How many features in feature selected profile?\n", + " print(\n", + " f\"There are {consensus_profiles[operation]['feat_select'].shape[1]} features in {operation} consensus profiles for {norm_strat} normalization\"\n", " )\n", "\n", " all_consensus_dfs[norm_strat] = consensus_profiles" @@ -480,7 +521,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Merge and Output Consensus Signatures" + "## Merge and Output Consensus Signatures, with and without feature selection" ] }, { @@ -492,18 +533,30 @@ "name": "stdout", "output_type": "stream", "text": [ - "Now Writing: Consensus Operation: median; Norm Strategy: whole_plate\n", + "Now Writing: Feature selection: No; Consensus Operation: median; Norm Strategy: whole_plate\n", "File: 2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_median.csv.gz\n", - "(10752, 1788)\n", - "Now Writing: Consensus Operation: modz; Norm Strategy: whole_plate\n", + "(8340, 1790)\n", + "Now Writing: Feature selection: Yes; Consensus Operation: median; Norm Strategy: whole_plate\n", + "File: 2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_median_feature_select.csv.gz\n", + "(8340, 510)\n", + "Now Writing: Feature selection: No; Consensus Operation: modz; Norm Strategy: whole_plate\n", "File: 2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz.csv.gz\n", - "(10752, 1788)\n", - "Now Writing: Consensus Operation: median; Norm Strategy: dmso\n", + "(8340, 1790)\n", + "Now Writing: Feature selection: Yes; Consensus Operation: modz; Norm Strategy: whole_plate\n", + "File: 2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz_feature_select.csv.gz\n", + "(8340, 447)\n", + "Now Writing: Feature selection: No; Consensus Operation: median; Norm Strategy: dmso\n", "File: 2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_median_dmso.csv.gz\n", - "(10752, 1788)\n", - "Now Writing: Consensus Operation: modz; Norm Strategy: dmso\n", + "(8340, 1790)\n", + "Now Writing: Feature selection: Yes; Consensus Operation: median; Norm Strategy: dmso\n", + "File: 2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_median_feature_select_dmso.csv.gz\n", + "(8340, 592)\n", + "Now Writing: Feature selection: No; Consensus Operation: modz; Norm Strategy: dmso\n", "File: 2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz_dmso.csv.gz\n", - "(10752, 1788)\n" + "(8340, 1790)\n", + "Now Writing: Feature selection: Yes; Consensus Operation: modz; Norm Strategy: dmso\n", + "File: 2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz_feature_select_dmso.csv.gz\n", + "(8340, 527)\n" ] }, { @@ -512,8 +565,8 @@ "\n", " setTimeout(function() {\n", " var nbb_cell_id = 9;\n", - " var nbb_unformatted_code = \"for norm_strat in file_bases:\\n file_suffix = file_bases[norm_strat][\\\"output_file_suffix\\\"]\\n for operation in operations:\\n consensus_file = f\\\"{batch}_consensus_{operation}{file_suffix}\\\"\\n consensus_file = pathlib.Path(batch, consensus_file)\\n\\n consensus_df = all_consensus_dfs[norm_strat][operation]\\n\\n print(\\n f\\\"Now Writing: Consensus Operation: {operation}; Norm Strategy: {norm_strat}\\\\nFile: {consensus_file}\\\"\\n )\\n print(consensus_df.shape)\\n\\n consensus_df.to_csv(\\n consensus_file, sep=\\\",\\\", compression=\\\"gzip\\\", float_format=\\\"%5g\\\", index=False\\n )\";\n", - " var nbb_formatted_code = \"for norm_strat in file_bases:\\n file_suffix = file_bases[norm_strat][\\\"output_file_suffix\\\"]\\n for operation in operations:\\n consensus_file = f\\\"{batch}_consensus_{operation}{file_suffix}\\\"\\n consensus_file = pathlib.Path(batch, consensus_file)\\n\\n consensus_df = all_consensus_dfs[norm_strat][operation]\\n\\n print(\\n f\\\"Now Writing: Consensus Operation: {operation}; Norm Strategy: {norm_strat}\\\\nFile: {consensus_file}\\\"\\n )\\n print(consensus_df.shape)\\n\\n consensus_df.to_csv(\\n consensus_file, sep=\\\",\\\", compression=\\\"gzip\\\", float_format=\\\"%5g\\\", index=False\\n )\";\n", + " var nbb_unformatted_code = \"float_format = \\\"%5g\\\"\\ncompression = \\\"gzip\\\"\\n\\nfor norm_strat in file_bases:\\n file_suffix = file_bases[norm_strat][\\\"output_file_suffix\\\"]\\n for operation in operations:\\n\\n # No feature selection\\n consensus_file = f\\\"{batch}_consensus_{operation}{file_suffix}\\\"\\n consensus_file = pathlib.Path(batch, consensus_file)\\n\\n consensus_df = all_consensus_dfs[norm_strat][operation][\\\"no_feat_select\\\"]\\n\\n print(\\n f\\\"Now Writing: Feature selection: No; Consensus Operation: {operation}; Norm Strategy: {norm_strat}\\\\nFile: {consensus_file}\\\"\\n )\\n print(consensus_df.shape)\\n\\n consensus_df.to_csv(\\n consensus_file,\\n sep=\\\",\\\",\\n compression=compression,\\n float_format=float_format,\\n index=False,\\n )\\n\\n # With feature selection\\n consensus_feat_df = all_consensus_dfs[norm_strat][operation][\\\"feat_select\\\"]\\n\\n consensus_feat_file = (\\n f\\\"{batch}_consensus_{operation}_feature_select{file_suffix}\\\"\\n )\\n consensus_feat_file = pathlib.Path(batch, consensus_feat_file)\\n\\n print(\\n f\\\"Now Writing: Feature selection: Yes; Consensus Operation: {operation}; Norm Strategy: {norm_strat}\\\\nFile: {consensus_feat_file}\\\"\\n )\\n print(consensus_feat_df.shape)\\n\\n consensus_feat_df.to_csv(\\n consensus_feat_file,\\n sep=\\\",\\\",\\n compression=compression,\\n float_format=float_format,\\n index=False,\\n )\";\n", + " var nbb_formatted_code = \"float_format = \\\"%5g\\\"\\ncompression = \\\"gzip\\\"\\n\\nfor norm_strat in file_bases:\\n file_suffix = file_bases[norm_strat][\\\"output_file_suffix\\\"]\\n for operation in operations:\\n\\n # No feature selection\\n consensus_file = f\\\"{batch}_consensus_{operation}{file_suffix}\\\"\\n consensus_file = pathlib.Path(batch, consensus_file)\\n\\n consensus_df = all_consensus_dfs[norm_strat][operation][\\\"no_feat_select\\\"]\\n\\n print(\\n f\\\"Now Writing: Feature selection: No; Consensus Operation: {operation}; Norm Strategy: {norm_strat}\\\\nFile: {consensus_file}\\\"\\n )\\n print(consensus_df.shape)\\n\\n consensus_df.to_csv(\\n consensus_file,\\n sep=\\\",\\\",\\n compression=compression,\\n float_format=float_format,\\n index=False,\\n )\\n\\n # With feature selection\\n consensus_feat_df = all_consensus_dfs[norm_strat][operation][\\\"feat_select\\\"]\\n\\n consensus_feat_file = (\\n f\\\"{batch}_consensus_{operation}_feature_select{file_suffix}\\\"\\n )\\n consensus_feat_file = pathlib.Path(batch, consensus_feat_file)\\n\\n print(\\n f\\\"Now Writing: Feature selection: Yes; Consensus Operation: {operation}; Norm Strategy: {norm_strat}\\\\nFile: {consensus_feat_file}\\\"\\n )\\n print(consensus_feat_df.shape)\\n\\n consensus_feat_df.to_csv(\\n consensus_feat_file,\\n sep=\\\",\\\",\\n compression=compression,\\n float_format=float_format,\\n index=False,\\n )\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", @@ -535,21 +588,51 @@ } ], "source": [ + "float_format = \"%5g\"\n", + "compression = \"gzip\"\n", + "\n", "for norm_strat in file_bases:\n", " file_suffix = file_bases[norm_strat][\"output_file_suffix\"]\n", " for operation in operations:\n", + "\n", + " # No feature selection\n", " consensus_file = f\"{batch}_consensus_{operation}{file_suffix}\"\n", " consensus_file = pathlib.Path(batch, consensus_file)\n", "\n", - " consensus_df = all_consensus_dfs[norm_strat][operation]\n", + " consensus_df = all_consensus_dfs[norm_strat][operation][\"no_feat_select\"]\n", "\n", " print(\n", - " f\"Now Writing: Consensus Operation: {operation}; Norm Strategy: {norm_strat}\\nFile: {consensus_file}\"\n", + " f\"Now Writing: Feature selection: No; Consensus Operation: {operation}; Norm Strategy: {norm_strat}\\nFile: {consensus_file}\"\n", " )\n", " print(consensus_df.shape)\n", "\n", " consensus_df.to_csv(\n", - " consensus_file, sep=\",\", compression=\"gzip\", float_format=\"%5g\", index=False\n", + " consensus_file,\n", + " sep=\",\",\n", + " compression=compression,\n", + " float_format=float_format,\n", + " index=False,\n", + " )\n", + "\n", + " # With feature selection\n", + " consensus_feat_df = all_consensus_dfs[norm_strat][operation][\"feat_select\"]\n", + "\n", + " consensus_feat_file = (\n", + " f\"{batch}_consensus_{operation}_feature_select{file_suffix}\"\n", + " )\n", + " consensus_feat_file = pathlib.Path(batch, consensus_feat_file)\n", + "\n", + " print(\n", + " f\"Now Writing: Feature selection: Yes; Consensus Operation: {operation}; Norm Strategy: {norm_strat}\\nFile: {consensus_feat_file}\"\n", + " )\n", + " print(consensus_feat_df.shape)\n", + "\n", + " consensus_feat_df.to_csv(\n", + " consensus_feat_file,\n", + " sep=\",\",\n", + " compression=compression,\n", + " float_format=float_format,\n", + " index=False,\n", " )" ] }, @@ -564,7 +647,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -573,7 +656,7 @@ "text": [ "Now Writing: Consensus Operation: modz; Norm Strategy: whole_plate\n", "File: 2016_04_01_a549_48hr_batch1/2016_04_01_a549_48hr_batch1_consensus_modz.gct\n", - "(10752, 1788)\n" + "(8340, 1790)\n" ] }, { @@ -581,9 +664,9 @@ "application/javascript": [ "\n", " setTimeout(function() {\n", - " var nbb_cell_id = 10;\n", - " var nbb_unformatted_code = \"import pycytominer.write_gct\\n\\noperation = \\\"modz\\\"\\nnorm_strat = \\\"whole_plate\\\"\\nfile_suffix = \\\".gct\\\"\\nconsensus_file = f\\\"{batch}_consensus_{operation}{file_suffix}\\\"\\nconsensus_file = pathlib.Path(batch, consensus_file)\\n\\nconsensus_df = all_consensus_dfs[norm_strat][operation]\\n\\nprint(\\n f\\\"Now Writing: Consensus Operation: {operation}; Norm Strategy: {norm_strat}\\\\nFile: {consensus_file}\\\"\\n)\\nprint(consensus_df.shape)\\n\\npycytominer.write_gct(consensus_df, consensus_file)\";\n", - " var nbb_formatted_code = \"import pycytominer.write_gct\\n\\noperation = \\\"modz\\\"\\nnorm_strat = \\\"whole_plate\\\"\\nfile_suffix = \\\".gct\\\"\\nconsensus_file = f\\\"{batch}_consensus_{operation}{file_suffix}\\\"\\nconsensus_file = pathlib.Path(batch, consensus_file)\\n\\nconsensus_df = all_consensus_dfs[norm_strat][operation]\\n\\nprint(\\n f\\\"Now Writing: Consensus Operation: {operation}; Norm Strategy: {norm_strat}\\\\nFile: {consensus_file}\\\"\\n)\\nprint(consensus_df.shape)\\n\\npycytominer.write_gct(consensus_df, consensus_file)\";\n", + " var nbb_cell_id = 11;\n", + " var nbb_unformatted_code = \"import pycytominer.write_gct\\n\\noperation = \\\"modz\\\"\\nnorm_strat = \\\"whole_plate\\\"\\nfile_suffix = \\\".gct\\\"\\nconsensus_file = f\\\"{batch}_consensus_{operation}{file_suffix}\\\"\\nconsensus_file = pathlib.Path(batch, consensus_file)\\n\\nconsensus_df = all_consensus_dfs[norm_strat][operation][\\\"no_feat_select\\\"]\\n\\nprint(\\n f\\\"Now Writing: Consensus Operation: {operation}; Norm Strategy: {norm_strat}\\\\nFile: {consensus_file}\\\"\\n)\\nprint(consensus_df.shape)\\n\\npycytominer.write_gct(consensus_df, consensus_file)\";\n", + " var nbb_formatted_code = \"import pycytominer.write_gct\\n\\noperation = \\\"modz\\\"\\nnorm_strat = \\\"whole_plate\\\"\\nfile_suffix = \\\".gct\\\"\\nconsensus_file = f\\\"{batch}_consensus_{operation}{file_suffix}\\\"\\nconsensus_file = pathlib.Path(batch, consensus_file)\\n\\nconsensus_df = all_consensus_dfs[norm_strat][operation][\\\"no_feat_select\\\"]\\n\\nprint(\\n f\\\"Now Writing: Consensus Operation: {operation}; Norm Strategy: {norm_strat}\\\\nFile: {consensus_file}\\\"\\n)\\nprint(consensus_df.shape)\\n\\npycytominer.write_gct(consensus_df, consensus_file)\";\n", " var nbb_cells = Jupyter.notebook.get_cells();\n", " for (var i = 0; i < nbb_cells.length; ++i) {\n", " if (nbb_cells[i].input_prompt_number == nbb_cell_id) {\n", @@ -613,7 +696,7 @@ "consensus_file = f\"{batch}_consensus_{operation}{file_suffix}\"\n", "consensus_file = pathlib.Path(batch, consensus_file)\n", "\n", - "consensus_df = all_consensus_dfs[norm_strat][operation]\n", + "consensus_df = all_consensus_dfs[norm_strat][operation][\"no_feat_select\"]\n", "\n", "print(\n", " f\"Now Writing: Consensus Operation: {operation}; Norm Strategy: {norm_strat}\\nFile: {consensus_file}\"\n", @@ -622,6 +705,13 @@ "\n", "pycytominer.write_gct(consensus_df, consensus_file)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -640,7 +730,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.6" + "version": "3.7.10" } }, "nbformat": 4, diff --git a/consensus/scripts/nbconverted/build-consensus-signatures.py b/consensus/scripts/nbconverted/build-consensus-signatures.py index 4a323f5..7e80785 100644 --- a/consensus/scripts/nbconverted/build-consensus-signatures.py +++ b/consensus/scripts/nbconverted/build-consensus-signatures.py @@ -6,14 +6,18 @@ # Here, we generate consensus signatures for the LINCS Drug Repurposing Hub Cell Painting subset. # See the project [README.md](README.md) for more details. # -# This notebook generates four files; one per plate normalization and consensus normalization strategy. +# This notebook generates eight files; one per plate normalization and consensus normalization strategy, with and without feature selection. # -# | Plate Normalization | Consensus Normalization | Consensus Suffix | -# | :------------------: | :------------------------: | -----------------: | -# | DMSO | Median | `_consensus_median_dmso.csv.gz` | -# | DMSO | MODZ | `_consensus_modz_dmso.csv.gz` | -# | Whole Plate | Median | `_consensus_median.csv.gz` | -# | Whole Plate | MODZ | `_consensus_modz.csv.gz` | +# |Feature selection | Plate Normalization | Consensus Normalization | Consensus Suffix | +# |:---------------- | :------------------: | :------------------------: | -----------------: | +# | No | DMSO | Median | `_consensus_median_dmso.csv.gz` | +# | No | DMSO | MODZ | `_consensus_modz_dmso.csv.gz` | +# | No | Whole Plate | Median | `_consensus_median.csv.gz` | +# | No | Whole Plate | MODZ | `_consensus_modz.csv.gz` | +# | Yes | DMSO | Median | `_consensus_median_feature_select_dmso.csv.gz` | +# | Yes | DMSO | MODZ | `_consensus_modz_feature_select_dmso.csv.gz` | +# | Yes | Whole Plate | Median | `_consensus_median_feature_select.csv.gz` | +# | Yes | Whole Plate | MODZ | `_consensus_modz_feature_select.csv.gz` | # In[1]: @@ -31,6 +35,7 @@ from pycytominer.aggregate import aggregate from pycytominer.consensus import modz_base +from pycytominer.feature_select import feature_select from pycytominer.cyto_utils import infer_cp_features @@ -141,9 +146,9 @@ def consensus_apply(df, operation, cp_features, replicate_cols): del all_profiles_df -# ## Create Consensus Profiles +# ## Create Consensus Profiles, with and without feature selection # -# We generate two different consensus profiles for each of the normalization strategies. This generates four different files. +# We generate two different consensus profiles for each of the normalization strategies, with and without feature selection. This generates eight different files. # In[7]: @@ -155,12 +160,22 @@ def consensus_apply(df, operation, cp_features, replicate_cols): "Metadata_pert_well", "Metadata_mmoles_per_liter", "Metadata_dose_recode", + "Metadata_moa", + "Metadata_target", ] # In[8]: +# feature selection operations +feature_select_ops = [ + "drop_na_columns", + "variance_threshold", + "correlation_threshold", + "blacklist", +] + all_consensus_dfs = {} for norm_strat in file_bases: all_profiles_df = all_profiles_dfs[norm_strat] @@ -170,7 +185,9 @@ def consensus_apply(df, operation, cp_features, replicate_cols): for operation in operations: print(f"Now calculating {operation} consensus for {norm_strat} normalization") - consensus_profiles[operation] = consensus_apply( + consensus_profiles[operation] = {} + + consensus_profiles[operation]["no_feat_select"] = consensus_apply( all_profiles_df, operation=operation, cp_features=cp_norm_features, @@ -179,32 +196,78 @@ def consensus_apply(df, operation, cp_features, replicate_cols): # How many DMSO profiles per well? print( - f"There are {consensus_profiles[operation].shape[0]} {operation} consensus profiles for {norm_strat} normalization" + f"There are {consensus_profiles[operation]['no_feat_select'].shape[0]} {operation} consensus profiles for {norm_strat} normalization" + ) + + # feature selection + print( + f"Now feature selecting on {operation} consensus for {norm_strat} normalization" + ) + + consensus_profiles[operation]["feat_select"] = feature_select( + profiles=consensus_profiles[operation]["no_feat_select"], + features="infer", + operation=feature_select_ops, + ) + + # How many features in feature selected profile? + print( + f"There are {consensus_profiles[operation]['feat_select'].shape[1]} features in {operation} consensus profiles for {norm_strat} normalization" ) all_consensus_dfs[norm_strat] = consensus_profiles -# ## Merge and Output Consensus Signatures +# ## Merge and Output Consensus Signatures, with and without feature selection # In[9]: +float_format = "%5g" +compression = "gzip" + for norm_strat in file_bases: file_suffix = file_bases[norm_strat]["output_file_suffix"] for operation in operations: + + # No feature selection consensus_file = f"{batch}_consensus_{operation}{file_suffix}" consensus_file = pathlib.Path(batch, consensus_file) - consensus_df = all_consensus_dfs[norm_strat][operation] + consensus_df = all_consensus_dfs[norm_strat][operation]["no_feat_select"] print( - f"Now Writing: Consensus Operation: {operation}; Norm Strategy: {norm_strat}\nFile: {consensus_file}" + f"Now Writing: Feature selection: No; Consensus Operation: {operation}; Norm Strategy: {norm_strat}\nFile: {consensus_file}" ) print(consensus_df.shape) consensus_df.to_csv( - consensus_file, sep=",", compression="gzip", float_format="%5g", index=False + consensus_file, + sep=",", + compression=compression, + float_format=float_format, + index=False, + ) + + # With feature selection + consensus_feat_df = all_consensus_dfs[norm_strat][operation]["feat_select"] + + consensus_feat_file = ( + f"{batch}_consensus_{operation}_feature_select{file_suffix}" + ) + consensus_feat_file = pathlib.Path(batch, consensus_feat_file) + + print( + f"Now Writing: Feature selection: Yes; Consensus Operation: {operation}; Norm Strategy: {norm_strat}\nFile: {consensus_feat_file}" + ) + print(consensus_feat_df.shape) + + consensus_feat_df.to_csv( + consensus_feat_file, + sep=",", + compression=compression, + float_format=float_format, + index=False, ) @@ -212,7 +275,7 @@ def consensus_apply(df, operation, cp_features, replicate_cols): # # Whole-plate-normalized + MODZ aggregated consensus profiles will be made available on clue.io/morphology as a GCT file. -# In[10]: +# In[11]: import pycytominer.write_gct @@ -223,7 +286,7 @@ def consensus_apply(df, operation, cp_features, replicate_cols): consensus_file = f"{batch}_consensus_{operation}{file_suffix}" consensus_file = pathlib.Path(batch, consensus_file) -consensus_df = all_consensus_dfs[norm_strat][operation] +consensus_df = all_consensus_dfs[norm_strat][operation]["no_feat_select"] print( f"Now Writing: Consensus Operation: {operation}; Norm Strategy: {norm_strat}\nFile: {consensus_file}" @@ -232,3 +295,9 @@ def consensus_apply(df, operation, cp_features, replicate_cols): pycytominer.write_gct(consensus_df, consensus_file) + +# In[ ]: + + + +