From e84cd3c96e1b497374b256b8980a3cb52e476a99 Mon Sep 17 00:00:00 2001 From: vicpaton Date: Mon, 29 Jul 2024 11:47:50 +0200 Subject: [PATCH 01/33] initial guideline commit --- docs/src/guidelines/Guideline 1: Data.ipynb | 773 ++++++++++++++++++++ 1 file changed, 773 insertions(+) create mode 100644 docs/src/guidelines/Guideline 1: Data.ipynb diff --git a/docs/src/guidelines/Guideline 1: Data.ipynb b/docs/src/guidelines/Guideline 1: Data.ipynb new file mode 100644 index 0000000..020959e --- /dev/null +++ b/docs/src/guidelines/Guideline 1: Data.ipynb @@ -0,0 +1,773 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import networkcommons as nc" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namedescriptionpublication_linkdetailed_description
decryptmDecryptMDrug perturbation proteomics and phosphoproteomics datahttps://doi.org/10.1126/science.ade3925This dataset contains the profiling of 31 cancer drugs in 13 human cancer cell line models resulted in 1.8 million dose-response curves, including 47,502 regulated phosphopeptides, 7316 ubiquitinylated peptides, and 546 regulated acetylated peptides.
panaceaPanaceaPancancer Analysis of Chemical Entity Activity RNA-Seq datahttps://doi.org/10.1016/j.xcrm.2021.100492PANACEA contains dose-response and perturbational profiles for 32 kinase inhibitors in 11 cancer cell lines, in addition to a DMSO control. Originally, this resource served as the basis for a DREAM Challenge assessing the accuracy and sensitivity of computational algorithms for de novo drug polypharmacology predictions.
moonMOONDatabase files for running MOONhttps://example.com/moonThis dataset contains database files required for running the MOON software.
cosmosCOSMOSDatabase files for running COSMOS (MetaPKN)https://example.com/cosmosThis dataset includes database files for the COSMOS software (MetaPKN).
CPTACCPTACClinical Proteomic Tumor Analysis Consortium datahttps://example.com/CPTACThis dataset contains data from the Clinical Proteomic Tumor Analysis Consortium. It includes various cancer types and proteomic data.
\n", + "
" + ], + "text/plain": [ + " name \\\n", + "decryptm DecryptM \n", + "panacea Panacea \n", + "moon MOON \n", + "cosmos COSMOS \n", + "CPTAC CPTAC \n", + "\n", + " description \\\n", + "decryptm Drug perturbation proteomics and phosphoproteomics data \n", + "panacea Pancancer Analysis of Chemical Entity Activity RNA-Seq data \n", + "moon Database files for running MOON \n", + "cosmos Database files for running COSMOS (MetaPKN) \n", + "CPTAC Clinical Proteomic Tumor Analysis Consortium data \n", + "\n", + " publication_link \\\n", + "decryptm https://doi.org/10.1126/science.ade3925 \n", + "panacea https://doi.org/10.1016/j.xcrm.2021.100492 \n", + "moon https://example.com/moon \n", + "cosmos https://example.com/cosmos \n", + "CPTAC https://example.com/CPTAC \n", + "\n", + " detailed_description \n", + "decryptm This dataset contains the profiling of 31 cancer drugs in 13 human cancer cell line models resulted in 1.8 million dose-response curves, including 47,502 regulated phosphopeptides, 7316 ubiquitinylated peptides, and 546 regulated acetylated peptides. \n", + "panacea PANACEA contains dose-response and perturbational profiles for 32 kinase inhibitors in 11 cancer cell lines, in addition to a DMSO control. Originally, this resource served as the basis for a DREAM Challenge assessing the accuracy and sensitivity of computational algorithms for de novo drug polypharmacology predictions. \n", + "moon This dataset contains database files required for running the MOON software. \n", + "cosmos This dataset includes database files for the COSMOS software (MetaPKN). \n", + "CPTAC This dataset contains data from the Clinical Proteomic Tumor Analysis Consortium. It includes various cancer types and proteomic data. 
" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# nc.data.omics.datasets()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# sig = nc.data.omics.moon()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
metabvalueUnnamed: 2
0HMDB00117470.401991NaN
1HMDB0000755-0.142718NaN
2HMDB0000905-1.244183NaN
3HMDB0000012-0.967207NaN
4HMDB0001191-0.631035NaN
............
133HMDB0011720-0.716155NaN
134HMDB0000883-0.059923NaN
135HMDB0000292-0.772272NaN
136HMDB00002991.132812NaN
137HMDB0002917-1.728352NaN
\n", + "

138 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " metab value Unnamed: 2\n", + "0 HMDB0011747 0.401991 NaN\n", + "1 HMDB0000755 -0.142718 NaN\n", + "2 HMDB0000905 -1.244183 NaN\n", + "3 HMDB0000012 -0.967207 NaN\n", + "4 HMDB0001191 -0.631035 NaN\n", + ".. ... ... ...\n", + "133 HMDB0011720 -0.716155 NaN\n", + "134 HMDB0000883 -0.059923 NaN\n", + "135 HMDB0000292 -0.772272 NaN\n", + "136 HMDB0000299 1.132812 NaN\n", + "137 HMDB0002917 -1.728352 NaN\n", + "\n", + "[138 rows x 3 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# sig['metab']" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "df = nc.data.network.get_omnipath()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "graph = nc.utils.network_from_df(df)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "cn_graph = nc._utils.to_cornetograph(graph)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cn_graph._default_edge_type" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "edge_df = pd.DataFrame.from_dict(cn_graph.get_attr_edges())" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
__edge_typeinteraction__source_attr__target_attr
0directed-1{'CALM1': {'__value': {}}}{'TRPC1': {'__value': {}}}
1directed-1{'CALM1': {'__value': {}}}{'TRPC3': {'__value': {}}}
2directed1{'CALM1': {'__value': {}}}{'TRPC6': {'__value': {}}}
3directed1{'CALM1': {'__value': {}}}{'TRPV1': {'__value': {}}}
4directed1{'CALM1': {'__value': {}}}{'TRPV4': {'__value': {}}}
...............
13167directed1{'TASP1': {'__value': {}}}{'GTF2A1': {'__value': {}}}
13168directed1{'NAA10': {'__value': {}}}{'HIF1A': {'__value': {}}}
13169directed1{'ECE1': {'__value': {}}}{'EDN1': {'__value': {}}}
13170directed1{'CSNK1G3': {'__value': {}}}{'TP53': {'__value': {}}}
13171directed1{'CSNK1G3': {'__value': {}}}{'YWHAQ': {'__value': {}}}
\n", + "

13172 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " __edge_type interaction __source_attr \\\n", + "0 directed -1 {'CALM1': {'__value': {}}} \n", + "1 directed -1 {'CALM1': {'__value': {}}} \n", + "2 directed 1 {'CALM1': {'__value': {}}} \n", + "3 directed 1 {'CALM1': {'__value': {}}} \n", + "4 directed 1 {'CALM1': {'__value': {}}} \n", + "... ... ... ... \n", + "13167 directed 1 {'TASP1': {'__value': {}}} \n", + "13168 directed 1 {'NAA10': {'__value': {}}} \n", + "13169 directed 1 {'ECE1': {'__value': {}}} \n", + "13170 directed 1 {'CSNK1G3': {'__value': {}}} \n", + "13171 directed 1 {'CSNK1G3': {'__value': {}}} \n", + "\n", + " __target_attr \n", + "0 {'TRPC1': {'__value': {}}} \n", + "1 {'TRPC3': {'__value': {}}} \n", + "2 {'TRPC6': {'__value': {}}} \n", + "3 {'TRPV1': {'__value': {}}} \n", + "4 {'TRPV4': {'__value': {}}} \n", + "... ... \n", + "13167 {'GTF2A1': {'__value': {}}} \n", + "13168 {'HIF1A': {'__value': {}}} \n", + "13169 {'EDN1': {'__value': {}}} \n", + "13170 {'TP53': {'__value': {}}} \n", + "13171 {'YWHAQ': {'__value': {}}} \n", + "\n", + "[13172 rows x 4 columns]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "edge_df" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
__edge_typeinteraction__source_attr
0directed-1{'CALM1': {'__value': {}}}
1directed-1{'CALM1': {'__value': {}}}
2directed1{'CALM1': {'__value': {}}}
3directed1{'CALM1': {'__value': {}}}
4directed1{'CALM1': {'__value': {}}}
............
13167directed1{'TASP1': {'__value': {}}}
13168directed1{'NAA10': {'__value': {}}}
13169directed1{'ECE1': {'__value': {}}}
13170directed1{'CSNK1G3': {'__value': {}}}
13171directed1{'CSNK1G3': {'__value': {}}}
\n", + "

13172 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " __edge_type interaction __source_attr\n", + "0 directed -1 {'CALM1': {'__value': {}}}\n", + "1 directed -1 {'CALM1': {'__value': {}}}\n", + "2 directed 1 {'CALM1': {'__value': {}}}\n", + "3 directed 1 {'CALM1': {'__value': {}}}\n", + "4 directed 1 {'CALM1': {'__value': {}}}\n", + "... ... ... ...\n", + "13167 directed 1 {'TASP1': {'__value': {}}}\n", + "13168 directed 1 {'NAA10': {'__value': {}}}\n", + "13169 directed 1 {'ECE1': {'__value': {}}}\n", + "13170 directed 1 {'CSNK1G3': {'__value': {}}}\n", + "13171 directed 1 {'CSNK1G3': {'__value': {}}}\n", + "\n", + "[13172 rows x 3 columns]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "edge_df1 = edge_df[['__edge_type', 'interaction', '__source_attr']]\n", + "edge_df1" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "concat_df = pd.concat([edge_df['__source_attr'], edge_df['__target_attr']]).reset_index()\n", + "concat_df.rename(columns={0: 'node'}, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexnode
00{'CALM1': {'__value': {}}}
11{'CALM1': {'__value': {}}}
22{'CALM1': {'__value': {}}}
33{'CALM1': {'__value': {}}}
44{'CALM1': {'__value': {}}}
.........
2633913167{'GTF2A1': {'__value': {}}}
2634013168{'HIF1A': {'__value': {}}}
2634113169{'EDN1': {'__value': {}}}
2634213170{'TP53': {'__value': {}}}
2634313171{'YWHAQ': {'__value': {}}}
\n", + "

26344 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " index node\n", + "0 0 {'CALM1': {'__value': {}}}\n", + "1 1 {'CALM1': {'__value': {}}}\n", + "2 2 {'CALM1': {'__value': {}}}\n", + "3 3 {'CALM1': {'__value': {}}}\n", + "4 4 {'CALM1': {'__value': {}}}\n", + "... ... ...\n", + "26339 13167 {'GTF2A1': {'__value': {}}}\n", + "26340 13168 {'HIF1A': {'__value': {}}}\n", + "26341 13169 {'EDN1': {'__value': {}}}\n", + "26342 13170 {'TP53': {'__value': {}}}\n", + "26343 13171 {'YWHAQ': {'__value': {}}}\n", + "\n", + "[26344 rows x 2 columns]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "concat_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "networkcommons-DX9y6Uxu-py3.10", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 33e5500ecc895ccde65facc1084be0c23223e546 Mon Sep 17 00:00:00 2001 From: vicpaton Date: Mon, 29 Jul 2024 11:48:00 +0200 Subject: [PATCH 02/33] expanded data descriptions --- networkcommons/data/datasets.yaml | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/networkcommons/data/datasets.yaml b/networkcommons/data/datasets.yaml index bb2a23a..baec1d2 100644 --- a/networkcommons/data/datasets.yaml +++ b/networkcommons/data/datasets.yaml @@ -4,24 +4,51 @@ omics: decryptm: name: DecryptM description: Drug perturbation proteomics and phosphoproteomics data + publication_link: https://doi.org/10.1126/science.ade3925 + detailed_description: >- + This dataset contains the profiling of 31 cancer drugs in 13 human cancer + cell line models resulted in 1.8 million 
dose-response curves, including + 47,502 regulated phosphopeptides, 7316 ubiquitinylated peptides, + and 546 regulated acetylated peptides. path: decryptm/{experiment}/{data_type}/{fname} panacea: name: Panacea description: Pancancer Analysis of Chemical Entity Activity RNA-Seq data + publication_link: https://doi.org/10.1016/j.xcrm.2021.100492 + detailed_description: >- + PANACEA contains dose-response and perturbational profiles for 32 + kinase inhibitors in 11 cancer cell lines, in addition to a DMSO control. + Originally, this resource served as the basis for a DREAM Challenge + assessing the accuracy and sensitivity of computational algorithms for + de novo drug polypharmacology predictions. path: panacea/panacea__{table}data.tsv test: name: Test data description: Small RNA-Seq data for unit tests + publication_link: NA + detailed_description: >- + This is a test dataset containing small RNA-Seq data. + It is used for unit tests. path: unit_test/test__{table}data.tsv moon: name: MOON description: Database files for running MOON + publication_link: https://example.com/moon + detailed_description: >- + This dataset contains database files required for running the MOON software. path: moon/{table}_input.tsv cosmos: name: COSMOS description: Database files for running COSMOS (MetaPKN) + publication_link: https://example.com/cosmos + detailed_description: >- + This dataset includes database files for the COSMOS software (MetaPKN). path: moon/net/meta_network.sif CPTAC: name: CPTAC description: Clinical Proteomic Tumor Analysis Consortium data + publication_link: https://example.com/CPTAC + detailed_description: >- + This dataset contains data from the Clinical Proteomic Tumor Analysis Consortium. + It includes various cancer types and proteomic data. 
path: CPTAC/{data_type}/{cancer_type}_{fname}.txt From b11753def548f6ba1d5fad14e2aa2f518006dcc9 Mon Sep 17 00:00:00 2001 From: vicpaton Date: Mon, 29 Jul 2024 11:48:15 +0200 Subject: [PATCH 03/33] expanded data retrieval client --- networkcommons/data/omics/_common.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/networkcommons/data/omics/_common.py b/networkcommons/data/omics/_common.py index 93c9975..4f02078 100644 --- a/networkcommons/data/omics/_common.py +++ b/networkcommons/data/omics/_common.py @@ -45,18 +45,20 @@ def _datasets() -> dict[str, dict]: return _module_data('datasets').get('omics', {}) -def datasets() -> dict[str, str]: +def datasets() -> pd.DataFrame: """ Built-in omics datasets. Returns: - A dict with dataset labels as keys and descriptions as values. + A DataFrame with dataset details. """ + data = _datasets().get('datasets', {}) + df = pd.DataFrame.from_dict(data, orient='index') + pd.set_option('display.max_colwidth', None) - return { - k: v['description'] - for k, v in _datasets().get('datasets', {}).items() - } + df = df[df.index != 'test'] # Exclude the 'test' dataset + + return df[['name', 'description', 'publication_link', 'detailed_description']] def _baseurl() -> str: From 99d5cd4e0bf2e4646a6c73b30bfabdcca5657c8d Mon Sep 17 00:00:00 2001 From: vicpaton Date: Tue, 30 Jul 2024 09:57:37 +0200 Subject: [PATCH 04/33] added NCI60 to server --- networkcommons/data/datasets.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/networkcommons/data/datasets.yaml b/networkcommons/data/datasets.yaml index baec1d2..9a38285 100644 --- a/networkcommons/data/datasets.yaml +++ b/networkcommons/data/datasets.yaml @@ -52,3 +52,12 @@ omics: This dataset contains data from the Clinical Proteomic Tumor Analysis Consortium. It includes various cancer types and proteomic data. 
path: CPTAC/{data_type}/{cancer_type}_{fname}.txt + NCI60: + name: NCI60 + description: NCI-60 cell line data + publication_link: https://example.com/NCI60 + detailed_description: >- + This dataset contains data from the NCI-60 cell line panel. + It includes three files: TF activities from transcriptomics data, + metabolite abundances and gene reads. + path: NCI60/{cell_line}/{cell_line}__{table}.tsv From 4a3eb0571cfafc9aeeaa6022c1026b2b3ae5eaae Mon Sep 17 00:00:00 2001 From: vicpaton Date: Tue, 30 Jul 2024 10:21:48 +0200 Subject: [PATCH 05/33] added data handlers for nci60 --- networkcommons/data/datasets.yaml | 2 +- networkcommons/data/omics/_moon.py | 80 +++++++++++++++++++++++++++++- 2 files changed, 80 insertions(+), 2 deletions(-) diff --git a/networkcommons/data/datasets.yaml b/networkcommons/data/datasets.yaml index 9a38285..e015401 100644 --- a/networkcommons/data/datasets.yaml +++ b/networkcommons/data/datasets.yaml @@ -60,4 +60,4 @@ omics: This dataset contains data from the NCI-60 cell line panel. It includes three files: TF activities from transcriptomics data, metabolite abundances and gene reads. - path: NCI60/{cell_line}/{cell_line}__{table}.tsv + path: NCI60/{cell_line}/{cell_line}__{data_type}.tsv diff --git a/networkcommons/data/omics/_moon.py b/networkcommons/data/omics/_moon.py index e985525..28f5614 100644 --- a/networkcommons/data/omics/_moon.py +++ b/networkcommons/data/omics/_moon.py @@ -19,11 +19,14 @@ from __future__ import annotations -__all__ = ['moon'] +__all__ = ['moon', 'nci60_datasets', 'nci60_datatypes', 'nci60_table'] import pandas as pd +import os +import urllib.parse from . import _common +from networkcommons import _conf def moon() -> dict[str, pd.DataFrame]: @@ -42,3 +45,78 @@ def moon() -> dict[str, pd.DataFrame]: ) for table in ('sig', 'metab', 'rna') } + + +def nci60_datasets(update: bool = False) -> pd.DataFrame: + """ + Table of all NCI60 datasets (cell types). + + Args: + update: + Force download and update cache. 
+ + Returns: + Data frame of all NCI60 datasets, with columns "experiment", + "data_type" and "fname". + """ + + path = os.path.join(_conf.get('pickle_dir'), 'nci60_datasets.pickle') + + if update or not os.path.exists(path): + + baseurl = urllib.parse.urljoin(_common._baseurl(), 'NCI60') + + datasets = pd.DataFrame( + [ + ( + cell_line, + ) + for cell_line in _common._ls(baseurl) + ], + columns = ['cell_line'] + ) + datasets.to_pickle(path) + + else: + + datasets = pd.read_pickle(path) + + return datasets + + +def nci60_datatypes() -> pd.DataFrame: + """ + Table of all NCI60 data types. + + Returns: + Data frame of all NCI60 data types, with columns "data_type", + and "description". + """ + df = pd.DataFrame({ + 'data_type': ['TF_scores', 'RNA', 'metabolomic'], + 'description': ['TF scores', 'RNA expression', 'metabolomic data'] + } + ) + + return df + + +def nci60_table(cell_line: str, data_type: str) -> pd.DataFrame: + """ + One table of omics data from NCI60. + + Args: + cell_line: + Name of the cell line. For a complete list see + `nci60_datasets()`. + data_type: + Type of data. For a complete list see `nci60_datatypes()`. + + Returns: + The table as a pandas DataFrame. 
+ """ + + return _common._open( + _common._commons_url('NCI60', **locals()), + df = {'sep': '\t'}, + ) From d51aa5ac6f4f63d31957cb21f02698d884ca6cbe Mon Sep 17 00:00:00 2001 From: vicpaton Date: Tue, 30 Jul 2024 10:22:27 +0200 Subject: [PATCH 06/33] removed old nci60 data handler --- networkcommons/data/omics/_moon.py | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/networkcommons/data/omics/_moon.py b/networkcommons/data/omics/_moon.py index 28f5614..f9a9c0d 100644 --- a/networkcommons/data/omics/_moon.py +++ b/networkcommons/data/omics/_moon.py @@ -19,7 +19,7 @@ from __future__ import annotations -__all__ = ['moon', 'nci60_datasets', 'nci60_datatypes', 'nci60_table'] +__all__ = ['nci60_datasets', 'nci60_datatypes', 'nci60_table'] import pandas as pd import os @@ -29,23 +29,6 @@ from networkcommons import _conf -def moon() -> dict[str, pd.DataFrame]: - """ - Example data for Moon. - - Returns: - Three data frames: signaling, metabolite and gene activity - measurements. - """ - - return { - table: _common._open( - _common._commons_url('moon', table = table), - df = {'sep': '\t'}, - ) - for table in ('sig', 'metab', 'rna') - } - def nci60_datasets(update: bool = False) -> pd.DataFrame: """ From 5b85b6abfd0d02267a21da46b2b3fc5d7afb0b7a Mon Sep 17 00:00:00 2001 From: vicpaton Date: Tue, 30 Jul 2024 10:32:24 +0200 Subject: [PATCH 07/33] linted code --- networkcommons/data/omics/_moon.py | 1 - 1 file changed, 1 deletion(-) diff --git a/networkcommons/data/omics/_moon.py b/networkcommons/data/omics/_moon.py index f9a9c0d..acf8ff3 100644 --- a/networkcommons/data/omics/_moon.py +++ b/networkcommons/data/omics/_moon.py @@ -29,7 +29,6 @@ from networkcommons import _conf - def nci60_datasets(update: bool = False) -> pd.DataFrame: """ Table of all NCI60 datasets (cell types). 
From 4c9e7b3fd386bdbc37db7f58f6c3c1068816d0cd Mon Sep 17 00:00:00 2001 From: vicpaton Date: Tue, 30 Jul 2024 14:29:45 +0200 Subject: [PATCH 08/33] added info in contribution's guidelines --- docs/src/guidelines/Guideline 1: Data.ipynb | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/src/guidelines/Guideline 1: Data.ipynb b/docs/src/guidelines/Guideline 1: Data.ipynb index 020959e..7b9f391 100644 --- a/docs/src/guidelines/Guideline 1: Data.ipynb +++ b/docs/src/guidelines/Guideline 1: Data.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -11,7 +11,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -111,22 +111,22 @@ "CPTAC This dataset contains data from the Clinical Proteomic Tumor Analysis Consortium. It includes various cancer types and proteomic data. " ] }, - "execution_count": 2, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# nc.data.omics.datasets()" + "nc.data.omics.datasets()" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ - "# sig = nc.data.omics.moon()" + "sig = nc.data.omics.moon()" ] }, { @@ -255,7 +255,7 @@ } ], "source": [ - "# sig['metab']" + "sig['metab']" ] }, { From 6eb916bc372c9c88e8e0d1b521e14d40ebe58f93 Mon Sep 17 00:00:00 2001 From: vicpaton Date: Tue, 30 Jul 2024 14:29:51 +0200 Subject: [PATCH 09/33] updated dois --- networkcommons/data/datasets.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/networkcommons/data/datasets.yaml b/networkcommons/data/datasets.yaml index e015401..412c52d 100644 --- a/networkcommons/data/datasets.yaml +++ b/networkcommons/data/datasets.yaml @@ -33,21 +33,21 @@ omics: moon: name: MOON description: Database files for running MOON - publication_link: https://example.com/moon 
+ publication_link: NA detailed_description: >- This dataset contains database files required for running the MOON software. path: moon/{table}_input.tsv cosmos: name: COSMOS description: Database files for running COSMOS (MetaPKN) - publication_link: https://example.com/cosmos + publication_link: NA detailed_description: >- This dataset includes database files for the COSMOS software (MetaPKN). path: moon/net/meta_network.sif CPTAC: name: CPTAC description: Clinical Proteomic Tumor Analysis Consortium data - publication_link: https://example.com/CPTAC + publication_link: https://doi.org/10.1158/2159-8290.CD-13-0219 detailed_description: >- This dataset contains data from the Clinical Proteomic Tumor Analysis Consortium. It includes various cancer types and proteomic data. @@ -55,7 +55,7 @@ omics: NCI60: name: NCI60 description: NCI-60 cell line data - publication_link: https://example.com/NCI60 + publication_link: https://doi.org/10.1038/nrc1951 detailed_description: >- This dataset contains data from the NCI-60 cell line panel. It includes three files: TF activities from transcriptomics data, From eddfb840dacc8b2487b8ed35074902be717e21d5 Mon Sep 17 00:00:00 2001 From: vicpaton Date: Tue, 30 Jul 2024 14:31:18 +0200 Subject: [PATCH 10/33] added nci60 functions to docs --- docs/src/api.rst | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/src/api.rst b/docs/src/api.rst index 400ff0e..4bd1911 100644 --- a/docs/src/api.rst +++ b/docs/src/api.rst @@ -149,6 +149,19 @@ CPTAC data.omics.cptac_datatypes data.omics.cptac_extend_dataframe +NCI60 +~~~~~ +.. module::networkcommons.data.omics +.. currentmodule:: networkcommons + +.. 
autosummary:: + :toctree: api + :recursive: + + data.omics.nci60_datasets + data.omics.nci60_datatypes + data.omics.nci60_table + Other ~~~~~~~~ From a5539b36e45ecfc4874e8d58860d0eb7f883a826 Mon Sep 17 00:00:00 2001 From: vicpaton Date: Tue, 30 Jul 2024 14:33:21 +0200 Subject: [PATCH 11/33] renamed file --- docs/src/contents.rst | 7 +++++++ .../{Guideline 1: Data.ipynb => guide_1_data.ipynb} | 0 2 files changed, 7 insertions(+) rename docs/src/guidelines/{Guideline 1: Data.ipynb => guide_1_data.ipynb} (100%) diff --git a/docs/src/contents.rst b/docs/src/contents.rst index 09e1667..b3b5631 100644 --- a/docs/src/contents.rst +++ b/docs/src/contents.rst @@ -11,6 +11,13 @@ NetworkCommons: Table of Contents api +..toctree:: + :maxdepth: 2 + :caption: COntribution guidelienes + + guidelines/ + + .. toctree:: :maxdepth: 2 :caption: Vignettes diff --git a/docs/src/guidelines/Guideline 1: Data.ipynb b/docs/src/guidelines/guide_1_data.ipynb similarity index 100% rename from docs/src/guidelines/Guideline 1: Data.ipynb rename to docs/src/guidelines/guide_1_data.ipynb From f8bcda54f0381bb9aecd268d728fe6270ef491b2 Mon Sep 17 00:00:00 2001 From: vicpaton Date: Tue, 30 Jul 2024 14:39:04 +0200 Subject: [PATCH 12/33] added info --- docs/src/guidelines/guide_1_data.ipynb | 655 ++++--------------------- 1 file changed, 89 insertions(+), 566 deletions(-) diff --git a/docs/src/guidelines/guide_1_data.ipynb b/docs/src/guidelines/guide_1_data.ipynb index 7b9f391..a523268 100644 --- a/docs/src/guidelines/guide_1_data.ipynb +++ b/docs/src/guidelines/guide_1_data.ipynb @@ -1,17 +1,55 @@ { "cells": [ { - "cell_type": "code", - "execution_count": 2, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Contribution's guideline: Data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Thank you very much for considering contributing to the data collection of **NetworkCommons**! 
In order to make the resource as user-friendly as possible, we aim to be as transparent as possible, which means that all contributions should contain at least the following elements.\n", + "\n", + "## 1. Data information\n", + "* Experimental design: number of samples, number of experiments (if applicable), confounding factors\n", + "* Data production and processing: tools used, how the data processing was performed (if applicable).\n", + "* Files: number and type of files, with a small description of their contents.\n", + "* Link to the database from which the data was retrieved.\n", + "* Link to the dataset publication\n", + "* Path information explaining the structure of the data directories\n", + "This information should be appended to the existing YAML file in `networkcommons/data/datasets.yaml`\n", + "\n", + "An example of this can be found below:" + ] + }, + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "NCI60:\n", + " name: NCI60\n", + " description: NCI-60 cell line data\n", + " publication_link: https://doi.org/10.1038/nrc1951\n", + " detailed_description: >-\n", + " This dataset contains data from the NCI-60 cell line panel.\n", + " It includes three files: TF activities from transcriptomics data,\n", + " metabolite abundances and gene reads.\n", + " path: NCI60/{cell_line}/{cell_line}__{data_type}.tsv" + ] + }, + { + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "import networkcommons as nc" + "This information can then be accessed via `nc.data.omics.datasets()`" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -77,6 +115,13 @@ " https://example.com/CPTAC\n", " This dataset contains data from the Clinical Proteomic Tumor Analysis Consortium. 
It includes various cancer types and proteomic data.\n", " \n", + " \n", + " NCI60\n", + " NCI60\n", + " NCI-60 cell line data\n", + " https://example.com/NCI60\n", + " This dataset contains data from the NCI-60 cell line panel. It includes three files: TF activities from transcriptomics data, metabolite abundances and gene reads.\n", + " \n", " \n", "\n", "" @@ -88,6 +133,7 @@ "moon MOON \n", "cosmos COSMOS \n", "CPTAC CPTAC \n", + "NCI60 NCI60 \n", "\n", " description \\\n", "decryptm Drug perturbation proteomics and phosphoproteomics data \n", @@ -95,6 +141,7 @@ "moon Database files for running MOON \n", "cosmos Database files for running COSMOS (MetaPKN) \n", "CPTAC Clinical Proteomic Tumor Analysis Consortium data \n", + "NCI60 NCI-60 cell line data \n", "\n", " publication_link \\\n", "decryptm https://doi.org/10.1126/science.ade3925 \n", @@ -102,16 +149,18 @@ "moon https://example.com/moon \n", "cosmos https://example.com/cosmos \n", "CPTAC https://example.com/CPTAC \n", + "NCI60 https://example.com/NCI60 \n", "\n", " detailed_description \n", "decryptm This dataset contains the profiling of 31 cancer drugs in 13 human cancer cell line models resulted in 1.8 million dose-response curves, including 47,502 regulated phosphopeptides, 7316 ubiquitinylated peptides, and 546 regulated acetylated peptides. \n", "panacea PANACEA contains dose-response and perturbational profiles for 32 kinase inhibitors in 11 cancer cell lines, in addition to a DMSO control. Originally, this resource served as the basis for a DREAM Challenge assessing the accuracy and sensitivity of computational algorithms for de novo drug polypharmacology predictions. \n", "moon This dataset contains database files required for running the MOON software. \n", "cosmos This dataset includes database files for the COSMOS software (MetaPKN). \n", - "CPTAC This dataset contains data from the Clinical Proteomic Tumor Analysis Consortium. It includes various cancer types and proteomic data. 
" + "CPTAC This dataset contains data from the Clinical Proteomic Tumor Analysis Consortium. It includes various cancer types and proteomic data. \n", + "NCI60 This dataset contains data from the NCI-60 cell line panel. It includes three files: TF activities from transcriptomics data, metabolite abundances and gene reads. " ] }, - "execution_count": 3, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -121,513 +170,24 @@ ] }, { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "sig = nc.data.omics.moon()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
metabvalueUnnamed: 2
0HMDB00117470.401991NaN
1HMDB0000755-0.142718NaN
2HMDB0000905-1.244183NaN
3HMDB0000012-0.967207NaN
4HMDB0001191-0.631035NaN
............
133HMDB0011720-0.716155NaN
134HMDB0000883-0.059923NaN
135HMDB0000292-0.772272NaN
136HMDB00002991.132812NaN
137HMDB0002917-1.728352NaN
\n", - "

138 rows × 3 columns

\n", - "
" - ], - "text/plain": [ - " metab value Unnamed: 2\n", - "0 HMDB0011747 0.401991 NaN\n", - "1 HMDB0000755 -0.142718 NaN\n", - "2 HMDB0000905 -1.244183 NaN\n", - "3 HMDB0000012 -0.967207 NaN\n", - "4 HMDB0001191 -0.631035 NaN\n", - ".. ... ... ...\n", - "133 HMDB0011720 -0.716155 NaN\n", - "134 HMDB0000883 -0.059923 NaN\n", - "135 HMDB0000292 -0.772272 NaN\n", - "136 HMDB0000299 1.132812 NaN\n", - "137 HMDB0002917 -1.728352 NaN\n", - "\n", - "[138 rows x 3 columns]" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sig['metab']" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "df = nc.data.network.get_omnipath()" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "graph = nc.utils.network_from_df(df)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "cn_graph = nc._utils.to_cornetograph(graph)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cn_graph._default_edge_type" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "edge_df = pd.DataFrame.from_dict(cn_graph.get_attr_edges())" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
__edge_typeinteraction__source_attr__target_attr
0directed-1{'CALM1': {'__value': {}}}{'TRPC1': {'__value': {}}}
1directed-1{'CALM1': {'__value': {}}}{'TRPC3': {'__value': {}}}
2directed1{'CALM1': {'__value': {}}}{'TRPC6': {'__value': {}}}
3directed1{'CALM1': {'__value': {}}}{'TRPV1': {'__value': {}}}
4directed1{'CALM1': {'__value': {}}}{'TRPV4': {'__value': {}}}
...............
13167directed1{'TASP1': {'__value': {}}}{'GTF2A1': {'__value': {}}}
13168directed1{'NAA10': {'__value': {}}}{'HIF1A': {'__value': {}}}
13169directed1{'ECE1': {'__value': {}}}{'EDN1': {'__value': {}}}
13170directed1{'CSNK1G3': {'__value': {}}}{'TP53': {'__value': {}}}
13171directed1{'CSNK1G3': {'__value': {}}}{'YWHAQ': {'__value': {}}}
\n", - "

13172 rows × 4 columns

\n", - "
" - ], - "text/plain": [ - " __edge_type interaction __source_attr \\\n", - "0 directed -1 {'CALM1': {'__value': {}}} \n", - "1 directed -1 {'CALM1': {'__value': {}}} \n", - "2 directed 1 {'CALM1': {'__value': {}}} \n", - "3 directed 1 {'CALM1': {'__value': {}}} \n", - "4 directed 1 {'CALM1': {'__value': {}}} \n", - "... ... ... ... \n", - "13167 directed 1 {'TASP1': {'__value': {}}} \n", - "13168 directed 1 {'NAA10': {'__value': {}}} \n", - "13169 directed 1 {'ECE1': {'__value': {}}} \n", - "13170 directed 1 {'CSNK1G3': {'__value': {}}} \n", - "13171 directed 1 {'CSNK1G3': {'__value': {}}} \n", - "\n", - " __target_attr \n", - "0 {'TRPC1': {'__value': {}}} \n", - "1 {'TRPC3': {'__value': {}}} \n", - "2 {'TRPC6': {'__value': {}}} \n", - "3 {'TRPV1': {'__value': {}}} \n", - "4 {'TRPV4': {'__value': {}}} \n", - "... ... \n", - "13167 {'GTF2A1': {'__value': {}}} \n", - "13168 {'HIF1A': {'__value': {}}} \n", - "13169 {'EDN1': {'__value': {}}} \n", - "13170 {'TP53': {'__value': {}}} \n", - "13171 {'YWHAQ': {'__value': {}}} \n", - "\n", - "[13172 rows x 4 columns]" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "edge_df" - ] - }, - { - "cell_type": "code", - "execution_count": 15, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
__edge_typeinteraction__source_attr
0directed-1{'CALM1': {'__value': {}}}
1directed-1{'CALM1': {'__value': {}}}
2directed1{'CALM1': {'__value': {}}}
3directed1{'CALM1': {'__value': {}}}
4directed1{'CALM1': {'__value': {}}}
............
13167directed1{'TASP1': {'__value': {}}}
13168directed1{'NAA10': {'__value': {}}}
13169directed1{'ECE1': {'__value': {}}}
13170directed1{'CSNK1G3': {'__value': {}}}
13171directed1{'CSNK1G3': {'__value': {}}}
\n", - "

13172 rows × 3 columns

\n", - "
" - ], - "text/plain": [ - " __edge_type interaction __source_attr\n", - "0 directed -1 {'CALM1': {'__value': {}}}\n", - "1 directed -1 {'CALM1': {'__value': {}}}\n", - "2 directed 1 {'CALM1': {'__value': {}}}\n", - "3 directed 1 {'CALM1': {'__value': {}}}\n", - "4 directed 1 {'CALM1': {'__value': {}}}\n", - "... ... ... ...\n", - "13167 directed 1 {'TASP1': {'__value': {}}}\n", - "13168 directed 1 {'NAA10': {'__value': {}}}\n", - "13169 directed 1 {'ECE1': {'__value': {}}}\n", - "13170 directed 1 {'CSNK1G3': {'__value': {}}}\n", - "13171 directed 1 {'CSNK1G3': {'__value': {}}}\n", - "\n", - "[13172 rows x 3 columns]" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "edge_df1 = edge_df[['__edge_type', 'interaction', '__source_attr']]\n", - "edge_df1" + "## 2. Data handlers (API)\n", + "The data will either be deposited in the [NetworkCommons server](https://commons.omnipathdb.org/), or can be directly accessed from the original source. Regardless of this, the following functions are required\n", + "* A function providing an overview of the subsets (if applicable). For example, check `nc.data.omics.decryptm_experiments()`. \n", + "* In case the data contains different files (for example, different omics layers, metadata tables, etc.), a function should retrieve this information. For example, check `nc.data.omics.nci60_datatypes()`\n", + "* A function that retrieves the data. For example, check `nc.data.omics.nci60_table()`. Ideally, a `pd.DataFrame`, but we are planning to expand support for `AnnData` instances.\n", + "\n", + "These new functions can be implemented in a new file, `_{dataset}`, inside the `networkcommons/data/omics/` folder.\n", + "\n", + "For example, `nc.data.omics.nci60_table()` retrieves a single `pd.DataFrame` by providing a data type and a cell line." 
] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, - "outputs": [], - "source": [ - "concat_df = pd.concat([edge_df['__source_attr'], edge_df['__target_attr']]).reset_index()\n", - "concat_df.rename(columns={0: 'node'}, inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, "outputs": [ { "data": { @@ -650,95 +210,56 @@ " \n", " \n", " \n", - " index\n", - " node\n", + " ID\n", + " score\n", " \n", " \n", " \n", " \n", " 0\n", - " 0\n", - " {'CALM1': {'__value': {}}}\n", + " WASH7P\n", + " -2.109966\n", " \n", " \n", " 1\n", - " 1\n", - " {'CALM1': {'__value': {}}}\n", + " NOC2L\n", + " -1.480194\n", " \n", " \n", " 2\n", - " 2\n", - " {'CALM1': {'__value': {}}}\n", + " HES4\n", + " -0.781522\n", " \n", " \n", " 3\n", - " 3\n", - " {'CALM1': {'__value': {}}}\n", + " ISG15\n", + " 0.406806\n", " \n", " \n", " 4\n", - " 4\n", - " {'CALM1': {'__value': {}}}\n", - " \n", - " \n", - " ...\n", - " ...\n", - " ...\n", - " \n", - " \n", - " 26339\n", - " 13167\n", - " {'GTF2A1': {'__value': {}}}\n", - " \n", - " \n", - " 26340\n", - " 13168\n", - " {'HIF1A': {'__value': {}}}\n", - " \n", - " \n", - " 26341\n", - " 13169\n", - " {'EDN1': {'__value': {}}}\n", - " \n", - " \n", - " 26342\n", - " 13170\n", - " {'TP53': {'__value': {}}}\n", - " \n", - " \n", - " 26343\n", - " 13171\n", - " {'YWHAQ': {'__value': {}}}\n", + " AGRN\n", + " -0.324970\n", " \n", " \n", "\n", - "

26344 rows × 2 columns

\n", "" ], "text/plain": [ - " index node\n", - "0 0 {'CALM1': {'__value': {}}}\n", - "1 1 {'CALM1': {'__value': {}}}\n", - "2 2 {'CALM1': {'__value': {}}}\n", - "3 3 {'CALM1': {'__value': {}}}\n", - "4 4 {'CALM1': {'__value': {}}}\n", - "... ... ...\n", - "26339 13167 {'GTF2A1': {'__value': {}}}\n", - "26340 13168 {'HIF1A': {'__value': {}}}\n", - "26341 13169 {'EDN1': {'__value': {}}}\n", - "26342 13170 {'TP53': {'__value': {}}}\n", - "26343 13171 {'YWHAQ': {'__value': {}}}\n", - "\n", - "[26344 rows x 2 columns]" + " ID score\n", + "0 WASH7P -2.109966\n", + "1 NOC2L -1.480194\n", + "2 HES4 -0.781522\n", + "3 ISG15 0.406806\n", + "4 AGRN -0.324970" ] }, - "execution_count": 22, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "concat_df" + "nc.data.omics.nci60_table(cell_line='A498', data_type='RNA').head()" ] }, { @@ -746,7 +267,9 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "nc.data.omics.nci" + ] } ], "metadata": { From faa4257308813770299ab4e30987cb774385a3a0 Mon Sep 17 00:00:00 2001 From: vicpaton Date: Tue, 30 Jul 2024 14:39:14 +0200 Subject: [PATCH 13/33] added new menu: guidelines --- docs/src/contents.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/contents.rst b/docs/src/contents.rst index b3b5631..6381a2a 100644 --- a/docs/src/contents.rst +++ b/docs/src/contents.rst @@ -15,7 +15,7 @@ NetworkCommons: Table of Contents :maxdepth: 2 :caption: COntribution guidelienes - guidelines/ + guidelines/guide_1_data .. 
toctree:: From 319380f38977624f10b67312400cee5c1c78efb9 Mon Sep 17 00:00:00 2001 From: vicpaton Date: Tue, 30 Jul 2024 14:40:10 +0200 Subject: [PATCH 14/33] fixed typo --- docs/src/contents.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/contents.rst b/docs/src/contents.rst index 6381a2a..f30c541 100644 --- a/docs/src/contents.rst +++ b/docs/src/contents.rst @@ -11,7 +11,7 @@ NetworkCommons: Table of Contents api -..toctree:: +.. toctree:: :maxdepth: 2 :caption: COntribution guidelienes From b991511f64b723727563e7c77d212a0c9f18ecb4 Mon Sep 17 00:00:00 2001 From: vicpaton Date: Tue, 30 Jul 2024 14:40:32 +0200 Subject: [PATCH 15/33] fixed typo --- docs/src/contents.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/contents.rst b/docs/src/contents.rst index f30c541..c990379 100644 --- a/docs/src/contents.rst +++ b/docs/src/contents.rst @@ -13,7 +13,7 @@ NetworkCommons: Table of Contents .. toctree:: :maxdepth: 2 - :caption: COntribution guidelienes + :caption: Contribution guidelienes guidelines/guide_1_data From a29e137bad5b1e596d573d22534d6de8d05aac14 Mon Sep 17 00:00:00 2001 From: vicpaton Date: Tue, 30 Jul 2024 14:45:31 +0200 Subject: [PATCH 16/33] changed code to html --- docs/src/guidelines/guide_1_data.ipynb | 64 +++++++++++++++++++++----- 1 file changed, 53 insertions(+), 11 deletions(-) diff --git a/docs/src/guidelines/guide_1_data.ipynb b/docs/src/guidelines/guide_1_data.ipynb index a523268..80c5703 100644 --- a/docs/src/guidelines/guide_1_data.ipynb +++ b/docs/src/guidelines/guide_1_data.ipynb @@ -26,18 +26,60 @@ ] }, { - "cell_type": "raw", - "metadata": {}, + "cell_type": "code", + "execution_count": 1, + "metadata": { + "vscode": { + "languageId": "html" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n",
+       "        <!-- FILEPATH: /home/victo/networkcommons/docs/src/guidelines/guide_1_data.ipynb -->\n",
+       "        <pre>\n",
+       "            NCI60:\n",
+       "                name: NCI60\n",
+       "                description: NCI-60 cell line data\n",
+       "                publication_link: https://doi.org/10.1038/nrc1951\n",
+       "                detailed_description: >-\n",
+       "                    This dataset contains data from the NCI-60 cell line panel.\n",
+       "                    It includes three files: TF activities from transcriptomics data,\n",
+       "                    metabolite abundances and gene reads.\n",
+       "                path: NCI60/{cell_line}/{cell_line}__{data_type}.tsv\n",
+       "        </pre>\n",
+       "    
\n", + "
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "NCI60:\n", - " name: NCI60\n", - " description: NCI-60 cell line data\n", - " publication_link: https://doi.org/10.1038/nrc1951\n", - " detailed_description: >-\n", - " This dataset contains data from the NCI-60 cell line panel.\n", - " It includes three files: TF activities from transcriptomics data,\n", - " metabolite abundances and gene reads.\n", - " path: NCI60/{cell_line}/{cell_line}__{data_type}.tsv" + "%%html\n", + "\n", + "
\n",
+    "        <!-- FILEPATH: /home/victo/networkcommons/docs/src/guidelines/guide_1_data.ipynb -->\n",
+    "        <pre>\n",
+    "            NCI60:\n",
+    "                name: NCI60\n",
+    "                description: NCI-60 cell line data\n",
+    "                publication_link: https://doi.org/10.1038/nrc1951\n",
+    "                detailed_description: >-\n",
+    "                    This dataset contains data from the NCI-60 cell line panel.\n",
+    "                    It includes three files: TF activities from transcriptomics data,\n",
+    "                    metabolite abundances and gene reads.\n",
+    "                path: NCI60/{cell_line}/{cell_line}__{data_type}.tsv\n",
+    "        </pre>\n",
+    "    
\n", + "
" ] }, { From f3119e39fcd432e1ea0da0eaf1ad15ac40e1f688 Mon Sep 17 00:00:00 2001 From: vicpaton Date: Tue, 30 Jul 2024 14:53:10 +0200 Subject: [PATCH 17/33] changed code chunk to markdown --- docs/src/guidelines/guide_1_data.ipynb | 65 +++++--------------------- 1 file changed, 12 insertions(+), 53 deletions(-) diff --git a/docs/src/guidelines/guide_1_data.ipynb b/docs/src/guidelines/guide_1_data.ipynb index 80c5703..690cf4d 100644 --- a/docs/src/guidelines/guide_1_data.ipynb +++ b/docs/src/guidelines/guide_1_data.ipynb @@ -26,60 +26,19 @@ ] }, { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "vscode": { - "languageId": "html" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
\n",
-       "        <!-- FILEPATH: /home/victo/networkcommons/docs/src/guidelines/guide_1_data.ipynb -->\n",
-       "        <pre>\n",
-       "            NCI60:\n",
-       "                name: NCI60\n",
-       "                description: NCI-60 cell line data\n",
-       "                publication_link: https://doi.org/10.1038/nrc1951\n",
-       "                detailed_description: >-\n",
-       "                    This dataset contains data from the NCI-60 cell line panel.\n",
-       "                    It includes three files: TF activities from transcriptomics data,\n",
-       "                    metabolite abundances and gene reads.\n",
-       "                path: NCI60/{cell_line}/{cell_line}__{data_type}.tsv\n",
-       "        </pre>\n",
-       "    
\n", - "
\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "cell_type": "markdown", + "metadata": {}, "source": [ - "%%html\n", - "\n", - "
\n",
-    "        <!-- FILEPATH: /home/victo/networkcommons/docs/src/guidelines/guide_1_data.ipynb -->\n",
-    "        <pre>\n",
-    "            NCI60:\n",
-    "                name: NCI60\n",
-    "                description: NCI-60 cell line data\n",
-    "                publication_link: https://doi.org/10.1038/nrc1951\n",
-    "                detailed_description: >-\n",
-    "                    This dataset contains data from the NCI-60 cell line panel.\n",
-    "                    It includes three files: TF activities from transcriptomics data,\n",
-    "                    metabolite abundances and gene reads.\n",
-    "                path: NCI60/{cell_line}/{cell_line}__{data_type}.tsv\n",
-    "        </pre>\n",
-    "    
\n", - "
" + "```yaml\n", + "NCI60:\n", + " name: NCI60\n", + " description: NCI-60 cell line data\n", + " publication_link: https://doi.org/10.1038/nrc1951\n", + " detailed_description: >-\n", + " This dataset contains data from the NCI-60 cell line panel.\n", + " It includes three files: TF activities from transcriptomics data,\n", + " metabolite abundances and gene reads.\n", + " path: NCI60/{cell_line}/{cell_line}__{data_type}.tsv" ] }, { From 3bd999b0eb0b1fe1cc87fef83751a97e234f63a7 Mon Sep 17 00:00:00 2001 From: vicpaton Date: Tue, 30 Jul 2024 14:53:32 +0200 Subject: [PATCH 18/33] fixed typo --- docs/src/contents.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/contents.rst b/docs/src/contents.rst index c990379..4be9548 100644 --- a/docs/src/contents.rst +++ b/docs/src/contents.rst @@ -13,7 +13,7 @@ NetworkCommons: Table of Contents .. toctree:: :maxdepth: 2 - :caption: Contribution guidelienes + :caption: Contribution guidelines guidelines/guide_1_data From cdd22e09e60ae8f6adba414aa87946e590abc937 Mon Sep 17 00:00:00 2001 From: vicpaton Date: Tue, 30 Jul 2024 15:06:48 +0200 Subject: [PATCH 19/33] updated (again) code chunk --- docs/src/guidelines/guide_1_data.ipynb | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/docs/src/guidelines/guide_1_data.ipynb b/docs/src/guidelines/guide_1_data.ipynb index 690cf4d..2887bd7 100644 --- a/docs/src/guidelines/guide_1_data.ipynb +++ b/docs/src/guidelines/guide_1_data.ipynb @@ -29,16 +29,17 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "```yaml\n", - "NCI60:\n", - " name: NCI60\n", - " description: NCI-60 cell line data\n", - " publication_link: https://doi.org/10.1038/nrc1951\n", - " detailed_description: >-\n", - " This dataset contains data from the NCI-60 cell line panel.\n", - " It includes three files: TF activities from transcriptomics data,\n", - " metabolite abundances and gene reads.\n", - " path: 
NCI60/{cell_line}/{cell_line}__{data_type}.tsv" + ".. code-block:: yaml\n", + "\n", + " NCI60:\n", + " name: NCI60\n", + " description: NCI-60 cell line data\n", + " publication_link: https://doi.org/10.1038/nrc1951\n", + " detailed_description: >-\n", + " This dataset contains data from the NCI-60 cell line panel.\n", + " It includes three files: TF activities from transcriptomics data,\n", + " metabolite abundances and gene reads.\n", + " path: NCI60/{cell_line}/{cell_line}__{data_type}.tsv" ] }, { From 47399dcaf7a91481c0035abbdd2ed24a39209f06 Mon Sep 17 00:00:00 2001 From: vicpaton Date: Tue, 30 Jul 2024 15:15:51 +0200 Subject: [PATCH 20/33] updated code chunk again --- docs/src/guidelines/guide_1_data.ipynb | 30 +++++++++----------------- 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/docs/src/guidelines/guide_1_data.ipynb b/docs/src/guidelines/guide_1_data.ipynb index 2887bd7..0522321 100644 --- a/docs/src/guidelines/guide_1_data.ipynb +++ b/docs/src/guidelines/guide_1_data.ipynb @@ -29,17 +29,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - ".. 
code-block:: yaml\n", - "\n", - " NCI60:\n", - " name: NCI60\n", - " description: NCI-60 cell line data\n", - " publication_link: https://doi.org/10.1038/nrc1951\n", - " detailed_description: >-\n", - " This dataset contains data from the NCI-60 cell line panel.\n", - " It includes three files: TF activities from transcriptomics data,\n", - " metabolite abundances and gene reads.\n", - " path: NCI60/{cell_line}/{cell_line}__{data_type}.tsv" + "```yaml\n", + "NCI60:\n", + " name: NCI60\n", + " description: NCI-60 cell line data\n", + " publication_link: https://doi.org/10.1038/nrc1951\n", + " detailed_description: >-\n", + " This dataset contains data from the NCI-60 cell line panel.\n", + " It includes three files: TF activities from transcriptomics data,\n", + " metabolite abundances and gene reads.\n", + " path: NCI60/{cell_line}/{cell_line}__{data_type}.tsv" ] }, { @@ -263,15 +262,6 @@ "source": [ "nc.data.omics.nci60_table(cell_line='A498', data_type='RNA').head()" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "nc.data.omics.nci" - ] } ], "metadata": { From c925e0840ccb58a1adfd2ba85dc6b2733f073d81 Mon Sep 17 00:00:00 2001 From: vicpaton Date: Tue, 30 Jul 2024 15:20:45 +0200 Subject: [PATCH 21/33] Revert "updated code chunk again" This reverts commit 47399dcaf7a91481c0035abbdd2ed24a39209f06. 
--- docs/src/guidelines/guide_1_data.ipynb | 30 +++++++++++++++++--------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/docs/src/guidelines/guide_1_data.ipynb b/docs/src/guidelines/guide_1_data.ipynb index 0522321..2887bd7 100644 --- a/docs/src/guidelines/guide_1_data.ipynb +++ b/docs/src/guidelines/guide_1_data.ipynb @@ -29,16 +29,17 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "```yaml\n", - "NCI60:\n", - " name: NCI60\n", - " description: NCI-60 cell line data\n", - " publication_link: https://doi.org/10.1038/nrc1951\n", - " detailed_description: >-\n", - " This dataset contains data from the NCI-60 cell line panel.\n", - " It includes three files: TF activities from transcriptomics data,\n", - " metabolite abundances and gene reads.\n", - " path: NCI60/{cell_line}/{cell_line}__{data_type}.tsv" + ".. code-block:: yaml\n", + "\n", + " NCI60:\n", + " name: NCI60\n", + " description: NCI-60 cell line data\n", + " publication_link: https://doi.org/10.1038/nrc1951\n", + " detailed_description: >-\n", + " This dataset contains data from the NCI-60 cell line panel.\n", + " It includes three files: TF activities from transcriptomics data,\n", + " metabolite abundances and gene reads.\n", + " path: NCI60/{cell_line}/{cell_line}__{data_type}.tsv" ] }, { @@ -262,6 +263,15 @@ "source": [ "nc.data.omics.nci60_table(cell_line='A498', data_type='RNA').head()" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nc.data.omics.nci" + ] } ], "metadata": { From 4965cef683d63dd49ee404619cff3a938f83993e Mon Sep 17 00:00:00 2001 From: vicpaton Date: Tue, 30 Jul 2024 15:23:02 +0200 Subject: [PATCH 22/33] removed extra code chunk --- docs/src/guidelines/guide_1_data.ipynb | 9 --------- 1 file changed, 9 deletions(-) diff --git a/docs/src/guidelines/guide_1_data.ipynb b/docs/src/guidelines/guide_1_data.ipynb index 2887bd7..a90e1c3 100644 --- a/docs/src/guidelines/guide_1_data.ipynb +++ 
b/docs/src/guidelines/guide_1_data.ipynb @@ -263,15 +263,6 @@ "source": [ "nc.data.omics.nci60_table(cell_line='A498', data_type='RNA').head()" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "nc.data.omics.nci" - ] } ], "metadata": { From dc9eea3f00eea35bde03b8aaace583ac3e15ae39 Mon Sep 17 00:00:00 2001 From: vicpaton Date: Tue, 30 Jul 2024 15:27:11 +0200 Subject: [PATCH 23/33] fixed indent --- docs/src/guidelines/guide_1_data.ipynb | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/src/guidelines/guide_1_data.ipynb b/docs/src/guidelines/guide_1_data.ipynb index a90e1c3..56b16a4 100644 --- a/docs/src/guidelines/guide_1_data.ipynb +++ b/docs/src/guidelines/guide_1_data.ipynb @@ -177,6 +177,7 @@ "source": [ "## 2. Data handlers (API)\n", "The data will either be deposited in the [NetworkCommons server](https://commons.omnipathdb.org/), or can be directly accessed from the original source. Regardless of this, the following functions are required\n", + "\n", "* A function providing an overview of the subsets (if applicable). For example, check `nc.data.omics.decryptm_experiments()`. \n", "* In case the data contains different files (for example, different omics layers, metadata tables, etc.), a function should retrieve this information. For example, check `nc.data.omics.nci60_datatypes()`\n", "* A function that retrieves the data. For example, check `nc.data.omics.nci60_table()`. 
Ideally, a `pd.DataFrame`, but we are planning to expand support for `AnnData` instances.\n", From c8c6fff59ba6ce8f8d8e40ad165831d451f0b051 Mon Sep 17 00:00:00 2001 From: vicpaton Date: Tue, 30 Jul 2024 15:28:59 +0200 Subject: [PATCH 24/33] removed space --- docs/src/guidelines/guide_1_data.ipynb | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/src/guidelines/guide_1_data.ipynb b/docs/src/guidelines/guide_1_data.ipynb index 56b16a4..051bc40 100644 --- a/docs/src/guidelines/guide_1_data.ipynb +++ b/docs/src/guidelines/guide_1_data.ipynb @@ -30,7 +30,6 @@ "metadata": {}, "source": [ ".. code-block:: yaml\n", - "\n", " NCI60:\n", " name: NCI60\n", " description: NCI-60 cell line data\n", From 0e68950a12de80aaf52ba915e9cc480fab46f7b0 Mon Sep 17 00:00:00 2001 From: vicpaton Date: Tue, 30 Jul 2024 16:03:20 +0200 Subject: [PATCH 25/33] added spacing --- docs/src/guidelines/guide_1_data.ipynb | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/src/guidelines/guide_1_data.ipynb b/docs/src/guidelines/guide_1_data.ipynb index 051bc40..56b16a4 100644 --- a/docs/src/guidelines/guide_1_data.ipynb +++ b/docs/src/guidelines/guide_1_data.ipynb @@ -30,6 +30,7 @@ "metadata": {}, "source": [ ".. 
code-block:: yaml\n", + "\n", " NCI60:\n", " name: NCI60\n", " description: NCI-60 cell line data\n", From 452cd45a007d61a34c804396ed4a3e85836eeaa5 Mon Sep 17 00:00:00 2001 From: vicpaton Date: Tue, 30 Jul 2024 16:23:15 +0200 Subject: [PATCH 26/33] updated moon vignette with new data handlers --- docs/src/vignettes/2_moon.ipynb | 265 ++++++++++++++++++++------------ 1 file changed, 163 insertions(+), 102 deletions(-) diff --git a/docs/src/vignettes/2_moon.ipynb b/docs/src/vignettes/2_moon.ipynb index 2ed8bf7..9d489b7 100644 --- a/docs/src/vignettes/2_moon.ipynb +++ b/docs/src/vignettes/2_moon.ipynb @@ -48,26 +48,9 @@ "cell_type": "code", "execution_count": 2, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/home/victo/networkcommons/docs/src/vignettes\n" - ] - } - ], - "source": [ - "!pwd" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, "outputs": [], "source": [ - "meta_network_df = pd.read_csv('../../../data/moon/meta_network.sif', sep='\\t')" + "meta_network_df = nc.data.network.get_cosmos_pkn()" ] }, { @@ -109,120 +92,198 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We then download a dataset and read inputs:" + "In this notebook, we will use data from the NCI60 Human Tumor Cell Lines Screen. We will use the cell line 706-0. To have an overview of the cell lines, we can run `nc.data.omics.nci60_datasets()`." ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cell_line
0786-0
1A498
2A549_ATCC
3ACHN
4BT-549
\n", + "
" + ], + "text/plain": [ + " cell_line\n", + "0 786-0\n", + "1 A498\n", + "2 A549_ATCC\n", + "3 ACHN\n", + "4 BT-549" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nc.data.omics.nci60_datasets().head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This resource contains three different types of data: transcriptomics, TF activity estimates and metabolomics data." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
data_typedescription
0TF_scoresTF scores
1RNARNA expression
2metabolomicmetabolomic data
\n", + "
" + ], + "text/plain": [ + " data_type description\n", + "0 TF_scores TF scores\n", + "1 RNA RNA expression\n", + "2 metabolomic metabolomic data" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nc.data.omics.nci60_datatypes()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ - "moon_data = nc.data.omics.moon()" + "sig_df = nc.data.omics.nci60_table(cell_line='786-0', data_type='TF_scores')\n", + "rna_df = nc.data.omics.nci60_table(cell_line='786-0', data_type='RNA')\n", + "metab_df = nc.data.omics.nci60_table(cell_line='786-0', data_type='metabolomic')" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'sig': TF value\n", - " 0 AR 1.156582\n", - " 1 BACH1 2.399881\n", - " 2 CEBPA 3.687354\n", - " 3 CREB1 0.829149\n", - " 4 CTCF 2.914983\n", - " 5 E2F1 4.989779\n", - " 6 E2F4 3.972646\n", - " 7 EGR1 6.337803\n", - " 8 ELK1 0.444149\n", - " 9 EPAS1 4.268129\n", - " 10 ESR1 7.069928\n", - " 11 ETS1 5.957844\n", - " 12 FOS 5.009215\n", - " 13 FOXA1 2.338539\n", - " 14 FOXM1 1.206632\n", - " 15 FOXO3 -0.772054\n", - " 16 FOXP1 0.876896\n", - " 17 GATA2 1.052240\n", - " 18 GATA3 4.433932\n", - " 19 HIF1A 2.503899\n", - " 20 HNF4A 5.230794\n", - " 21 JUN 4.310749\n", - " 22 MITF 4.685015\n", - " 23 MYC 0.761681\n", - " 24 NFKB1 2.386302\n", - " 25 PRDM14 2.602170\n", - " 26 RARA 2.259669\n", - " 27 RELA 3.635926\n", - " 28 RUNX1 1.654963\n", - " 29 SOX2 0.903587\n", - " 30 SP1 2.073969\n", - " 31 SP3 0.190111\n", - " 32 SPI1 5.666462\n", - " 33 SREBF1 1.577459\n", - " 34 STAT1 2.219767\n", - " 35 STAT2 0.092127\n", - " 36 STAT3 1.241225\n", - " 37 TAL1 2.968578\n", - " 38 TFAP2A 0.182564\n", - " 39 TFAP2C 7.987909\n", - " 40 TP53 1.014723\n", - " 41 USF1 2.194528\n", - " 42 VDR 1.545408\n", - " 43 YY1 1.521236\n", - " 44 ZNF263 7.254531,\n", - " 'metab': metab value 
Unnamed: 2\n", - " 0 HMDB0011747 0.401991 NaN\n", - " 1 HMDB0000755 -0.142718 NaN\n", - " 2 HMDB0000905 -1.244183 NaN\n", - " 3 HMDB0000012 -0.967207 NaN\n", - " 4 HMDB0001191 -0.631035 NaN\n", - " .. ... ... ...\n", - " 133 HMDB0011720 -0.716155 NaN\n", - " 134 HMDB0000883 -0.059923 NaN\n", - " 135 HMDB0000292 -0.772272 NaN\n", - " 136 HMDB0000299 1.132812 NaN\n", - " 137 HMDB0002917 -1.728352 NaN\n", - " \n", - " [138 rows x 3 columns],\n", - " 'rna': gene value\n", - " 0 WASH7P -0.084246\n", - " 1 LINC01128 -0.246712\n", - " 2 NOC2L 0.508906\n", - " 3 KLHL17 -0.001460\n", - " 4 HES4 -0.281146\n", - " ... ... ...\n", - " 8252 CMC4 -0.029409\n", - " 8253 BRCC3 -0.058883\n", - " 8254 VBP1 -0.554804\n", - " 8255 TMLHE -1.095530\n", - " 8256 CD24 0.923448\n", - " \n", - " [8257 rows x 2 columns]}" + "{'AR': 1.1565824565146148,\n", + " 'BACH1': 2.3998807796742443,\n", + " 'CEBPA': 3.6873543923958847,\n", + " 'CREB1': 0.8291485083247008,\n", + " 'CTCF': 2.9149829587082383}" ] }, - "execution_count": 8, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "moon_data" + "sig_df.set_index('ID')['score'].to_dict()" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ - "sig_input = moon_data['sig'].set_index('TF')['value'].to_dict()\n", - "rna_input = moon_data['rna'].set_index('gene')['value'].to_dict()\n", - "metab_input = moon_data['metab'].set_index('metab')['value'].to_dict()" + "sig_input = sig_df.set_index('ID')['score'].to_dict()\n", + "rna_input = rna_df.set_index('ID')['score'].to_dict()\n", + "metab_input = metab_df.set_index('ID')['score'].to_dict()" ] }, { From a6ed1ab120d07f18dc1978ef9c143e0069c13520 Mon Sep 17 00:00:00 2001 From: vicpaton Date: Tue, 30 Jul 2024 16:23:29 +0200 Subject: [PATCH 27/33] added new handler for cosmos pkn --- networkcommons/data/network/_moon.py | 38 +++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) 
diff --git a/networkcommons/data/network/_moon.py b/networkcommons/data/network/_moon.py index 9b49a6f..7311b7c 100644 --- a/networkcommons/data/network/_moon.py +++ b/networkcommons/data/network/_moon.py @@ -17,18 +17,50 @@ Prior knowledge network used by MOON. """ -__all__ = ['build_moon_regulons'] +__all__ = ['build_moon_regulons', 'get_cosmos_pkn'] import lazy_import import numpy as np import pandas as pd -dc = lazy_import.lazy_module('decoupler') - from networkcommons import _utils from . import _omnipath from . import _liana +import os +import urllib +from networkcommons import _conf +from networkcommons.data.omics import _common + +dc = lazy_import.lazy_module('decoupler') + + +def get_cosmos_pkn(update: bool = False): + """ + Retrieves the metabolic network used in COSMOS from the server + + Returns: + network (pandas.DataFrame): metabolic network with + source, target, and sign columns. + """ + path = os.path.join(_conf.get('pickle_dir'), 'metapkn.pickle') + + if update or not os.path.exists(path): + + baseurl = urllib.parse.urljoin(_common._baseurl(), 'prior_knowledge') + + file_legend = pd.read_csv(baseurl + '/meta_network.sif', sep='\t') + + file_legend.to_pickle(path) + + else: + + file_legend = pd.read_pickle(path) + + return file_legend + + + def build_moon_regulons(include_liana=False): From 06a0c25c4793f86238fb412a3f8540dd932418c9 Mon Sep 17 00:00:00 2001 From: vicpaton Date: Tue, 30 Jul 2024 16:23:36 +0200 Subject: [PATCH 28/33] renamed folder --- networkcommons/data/network/_omnipath.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/networkcommons/data/network/_omnipath.py b/networkcommons/data/network/_omnipath.py index a76e14a..78533e4 100644 --- a/networkcommons/data/network/_omnipath.py +++ b/networkcommons/data/network/_omnipath.py @@ -79,7 +79,7 @@ def get_phosphositeplus(update: bool = False): if update or not os.path.exists(path): - baseurl = urllib.parse.urljoin(_common._baseurl(), 'phosphosite') + baseurl = 
urllib.parse.urljoin(_common._baseurl(), 'prior_knowledge') file_legend = pd.read_csv(baseurl + '/kinase-substrate.tsv', sep='\t') From 2bb8c781ca9d093dbeb518b6ad75b3af1c404031 Mon Sep 17 00:00:00 2001 From: vicpaton Date: Tue, 30 Jul 2024 16:25:16 +0200 Subject: [PATCH 29/33] added cosmos pkn to docs --- docs/src/api.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/src/api.rst b/docs/src/api.rst index 4bd1911..b4d6a22 100644 --- a/docs/src/api.rst +++ b/docs/src/api.rst @@ -78,6 +78,7 @@ Prior Knowledge data.network.get_omnipath data.network.get_lianaplus data.network.get_phosphositeplus + data.network.get_cosmos_pkn Datasets From 8c814e61ddba0fd9b1bb69ca7ceacdf8d355380b Mon Sep 17 00:00:00 2001 From: vicpaton Date: Tue, 30 Jul 2024 16:25:59 +0200 Subject: [PATCH 30/33] removed old moon/cosmos descriptions --- networkcommons/data/datasets.yaml | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/networkcommons/data/datasets.yaml b/networkcommons/data/datasets.yaml index 412c52d..232869d 100644 --- a/networkcommons/data/datasets.yaml +++ b/networkcommons/data/datasets.yaml @@ -30,20 +30,6 @@ omics: This is a test dataset containing small RNA-Seq data. It is used for unit tests. path: unit_test/test__{table}data.tsv - moon: - name: MOON - description: Database files for running MOON - publication_link: NA - detailed_description: >- - This dataset contains database files required for running the MOON software. - path: moon/{table}_input.tsv - cosmos: - name: COSMOS - description: Database files for running COSMOS (MetaPKN) - publication_link: NA - detailed_description: >- - This dataset includes database files for the COSMOS software (MetaPKN). 
- path: moon/net/meta_network.sif CPTAC: name: CPTAC description: Clinical Proteomic Tumor Analysis Consortium data From 6de50dcc72b0a4b5f80a2e98a970cc836d7b66e8 Mon Sep 17 00:00:00 2001 From: vicpaton Date: Tue, 30 Jul 2024 16:33:50 +0200 Subject: [PATCH 31/33] disabled lazy import for decoupler --- networkcommons/data/network/_moon.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/networkcommons/data/network/_moon.py b/networkcommons/data/network/_moon.py index 7311b7c..5134c87 100644 --- a/networkcommons/data/network/_moon.py +++ b/networkcommons/data/network/_moon.py @@ -32,7 +32,8 @@ from networkcommons import _conf from networkcommons.data.omics import _common -dc = lazy_import.lazy_module('decoupler') +# dc = lazy_import.lazy_module('decoupler') +import decoupler as dc def get_cosmos_pkn(update: bool = False): From e6490bb58f4768588f5dcffa785643169ceffb3a Mon Sep 17 00:00:00 2001 From: vicpaton Date: Tue, 30 Jul 2024 16:33:55 +0200 Subject: [PATCH 32/33] updated tests --- tests/test_omics.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test_omics.py b/tests/test_omics.py index 3d20e30..bcebf65 100644 --- a/tests/test_omics.py +++ b/tests/test_omics.py @@ -25,8 +25,10 @@ def test_datasets_2(): dsets = _common.datasets() - assert 'decryptm' in dsets - assert 'CPTAC' in dsets + assert isinstance(dsets, pd.DataFrame) + assert dsets.columns.tolist() == ['name', 'description', 'publication_link', 'detailed_description'] + assert 'decryptm' in dsets.index + assert 'CPTAC' in dsets.index def test_commons_url(): From 03e61cbd5609b10d14d0ce063c18337944826878 Mon Sep 17 00:00:00 2001 From: vicpaton Date: Tue, 30 Jul 2024 16:38:09 +0200 Subject: [PATCH 33/33] disabled lazy import --- networkcommons/methods/_moon.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/networkcommons/methods/_moon.py b/networkcommons/methods/_moon.py index b64e086..868820d 100644 --- a/networkcommons/methods/_moon.py +++ 
b/networkcommons/methods/_moon.py @@ -42,7 +42,8 @@ import lazy_import import networkx as nx import pandas as pd -dc = lazy_import.lazy_module('decoupler') +# dc = lazy_import.lazy_module('decoupler') +import decoupler as dc import numpy as np from . import _graph