diff --git a/docs/src/api.rst b/docs/src/api.rst index 400ff0e..b4d6a22 100644 --- a/docs/src/api.rst +++ b/docs/src/api.rst @@ -78,6 +78,7 @@ Prior Knowledge data.network.get_omnipath data.network.get_lianaplus data.network.get_phosphositeplus + data.network.get_cosmos_pkn Datasets @@ -149,6 +150,19 @@ CPTAC data.omics.cptac_datatypes data.omics.cptac_extend_dataframe +NCI60 +~~~~~ +.. module::networkcommons.data.omics +.. currentmodule:: networkcommons + +.. autosummary:: + :toctree: api + :recursive: + + data.omics.nci60_datasets + data.omics.nci60_datatypes + data.omics.nci60_table + Other ~~~~~~~~ diff --git a/docs/src/contents.rst b/docs/src/contents.rst index 09e1667..4be9548 100644 --- a/docs/src/contents.rst +++ b/docs/src/contents.rst @@ -11,6 +11,13 @@ NetworkCommons: Table of Contents api +.. toctree:: + :maxdepth: 2 + :caption: Contribution guidelines + + guidelines/guide_1_data + + .. toctree:: :maxdepth: 2 :caption: Vignettes diff --git a/docs/src/guidelines/guide_1_data.ipynb b/docs/src/guidelines/guide_1_data.ipynb new file mode 100644 index 0000000..56b16a4 --- /dev/null +++ b/docs/src/guidelines/guide_1_data.ipynb @@ -0,0 +1,290 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Contribution's guideline: Data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Thank you very much for considering contributing to the data collection of **NetworkCommons**! In order to make the resource as user-friendly as possible, we aim to be as transparent as possible, which means that all contributions should contain at least the following elements.\n", + "\n", + "## 1. 
Data information\n", + "* Experimental design: number of samples, number of experiments (if applicable), confounding factors\n", + "* Data production and processing: tools used, how the data processing was performed (if applicable).\n", + "* Files: number and type of files, with a small description of their contents.\n", + "* Link to the database from which the data was retrieved.\n", + "* Link to the dataset publication\n", + "* Path information explaining the structure of the data directories\n", + "This information should be appended to the existing YAML file in `networkcommons/data/datasets.yaml`\n", + "\n", + "An example of this can be found below:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + ".. code-block:: yaml\n", + "\n", + " NCI60:\n", + " name: NCI60\n", + " description: NCI-60 cell line data\n", + " publication_link: https://doi.org/10.1038/nrc1951\n", + " detailed_description: >-\n", + " This dataset contains data from the NCI-60 cell line panel.\n", + " It includes three files: TF activities from transcriptomics data,\n", + " metabolite abundances and gene reads.\n", + " path: NCI60/{cell_line}/{cell_line}__{data_type}.tsv" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This information can then be accessed via `nc.data.omics.datasets()`" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namedescriptionpublication_linkdetailed_description
decryptmDecryptMDrug perturbation proteomics and phosphoproteomics datahttps://doi.org/10.1126/science.ade3925This dataset contains the profiling of 31 cancer drugs in 13 human cancer cell line models resulted in 1.8 million dose-response curves, including 47,502 regulated phosphopeptides, 7316 ubiquitinylated peptides, and 546 regulated acetylated peptides.
panaceaPanaceaPancancer Analysis of Chemical Entity Activity RNA-Seq datahttps://doi.org/10.1016/j.xcrm.2021.100492PANACEA contains dose-response and perturbational profiles for 32 kinase inhibitors in 11 cancer cell lines, in addition to a DMSO control. Originally, this resource served as the basis for a DREAM Challenge assessing the accuracy and sensitivity of computational algorithms for de novo drug polypharmacology predictions.
moonMOONDatabase files for running MOONhttps://example.com/moonThis dataset contains database files required for running the MOON software.
cosmosCOSMOSDatabase files for running COSMOS (MetaPKN)https://example.com/cosmosThis dataset includes database files for the COSMOS software (MetaPKN).
CPTACCPTACClinical Proteomic Tumor Analysis Consortium datahttps://example.com/CPTACThis dataset contains data from the Clinical Proteomic Tumor Analysis Consortium. It includes various cancer types and proteomic data.
NCI60NCI60NCI-60 cell line datahttps://example.com/NCI60This dataset contains data from the NCI-60 cell line panel. It includes three files: TF activities from transcriptomics data, metabolite abundances and gene reads.
\n", + "
" + ], + "text/plain": [ + " name \\\n", + "decryptm DecryptM \n", + "panacea Panacea \n", + "moon MOON \n", + "cosmos COSMOS \n", + "CPTAC CPTAC \n", + "NCI60 NCI60 \n", + "\n", + " description \\\n", + "decryptm Drug perturbation proteomics and phosphoproteomics data \n", + "panacea Pancancer Analysis of Chemical Entity Activity RNA-Seq data \n", + "moon Database files for running MOON \n", + "cosmos Database files for running COSMOS (MetaPKN) \n", + "CPTAC Clinical Proteomic Tumor Analysis Consortium data \n", + "NCI60 NCI-60 cell line data \n", + "\n", + " publication_link \\\n", + "decryptm https://doi.org/10.1126/science.ade3925 \n", + "panacea https://doi.org/10.1016/j.xcrm.2021.100492 \n", + "moon https://example.com/moon \n", + "cosmos https://example.com/cosmos \n", + "CPTAC https://example.com/CPTAC \n", + "NCI60 https://example.com/NCI60 \n", + "\n", + " detailed_description \n", + "decryptm This dataset contains the profiling of 31 cancer drugs in 13 human cancer cell line models resulted in 1.8 million dose-response curves, including 47,502 regulated phosphopeptides, 7316 ubiquitinylated peptides, and 546 regulated acetylated peptides. \n", + "panacea PANACEA contains dose-response and perturbational profiles for 32 kinase inhibitors in 11 cancer cell lines, in addition to a DMSO control. Originally, this resource served as the basis for a DREAM Challenge assessing the accuracy and sensitivity of computational algorithms for de novo drug polypharmacology predictions. \n", + "moon This dataset contains database files required for running the MOON software. \n", + "cosmos This dataset includes database files for the COSMOS software (MetaPKN). \n", + "CPTAC This dataset contains data from the Clinical Proteomic Tumor Analysis Consortium. It includes various cancer types and proteomic data. \n", + "NCI60 This dataset contains data from the NCI-60 cell line panel. 
It includes three files: TF activities from transcriptomics data, metabolite abundances and gene reads. " + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nc.data.omics.datasets()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Data handlers (API)\n", + "The data will either be deposited in the [NetworkCommons server](https://commons.omnipathdb.org/) or be directly accessible from the original source. In either case, the following functions are required:\n", + "\n", + "* A function providing an overview of the subsets (if applicable). For example, check `nc.data.omics.decryptm_experiments()`. \n", + "* In case the data contains different files (for example, different omics layers, metadata tables, etc.), a function should retrieve this information. For example, check `nc.data.omics.nci60_datatypes()`.\n", + "* A function that retrieves the data. For example, check `nc.data.omics.nci60_table()`. Ideally, it should return a `pd.DataFrame`, but we are planning to expand support for `AnnData` instances.\n", + "\n", + "These new functions can be implemented in a new file, `_{dataset}.py`, inside the `networkcommons/data/omics/` folder.\n", + "\n", + "For example, `nc.data.omics.nci60_table()` retrieves a single `pd.DataFrame` by providing a data type and a cell line." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IDscore
0WASH7P-2.109966
1NOC2L-1.480194
2HES4-0.781522
3ISG150.406806
4AGRN-0.324970
\n", + "
" + ], + "text/plain": [ + " ID score\n", + "0 WASH7P -2.109966\n", + "1 NOC2L -1.480194\n", + "2 HES4 -0.781522\n", + "3 ISG15 0.406806\n", + "4 AGRN -0.324970" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nc.data.omics.nci60_table(cell_line='A498', data_type='RNA').head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "networkcommons-DX9y6Uxu-py3.10", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/src/vignettes/2_moon.ipynb b/docs/src/vignettes/2_moon.ipynb index 2ed8bf7..9d489b7 100644 --- a/docs/src/vignettes/2_moon.ipynb +++ b/docs/src/vignettes/2_moon.ipynb @@ -48,26 +48,9 @@ "cell_type": "code", "execution_count": 2, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/home/victo/networkcommons/docs/src/vignettes\n" - ] - } - ], - "source": [ - "!pwd" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, "outputs": [], "source": [ - "meta_network_df = pd.read_csv('../../../data/moon/meta_network.sif', sep='\\t')" + "meta_network_df = nc.data.network.get_cosmos_pkn()" ] }, { @@ -109,120 +92,198 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We then download a dataset and read inputs:" + "In this notebook, we will use data from the NCI60 Human Tumor Cell Lines Screen. We will use the cell line 786-0. To have an overview of the cell lines, we can run `nc.data.omics.nci60_datasets()`." ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cell_line
0786-0
1A498
2A549_ATCC
3ACHN
4BT-549
\n", + "
" + ], + "text/plain": [ + " cell_line\n", + "0 786-0\n", + "1 A498\n", + "2 A549_ATCC\n", + "3 ACHN\n", + "4 BT-549" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nc.data.omics.nci60_datasets().head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This resource contains three different types of data: transcriptomics, TF activity estimates and metabolic information." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
data_typedescription
0TF_scoresTF scores
1RNARNA expression
2metabolomicmetabolomic data
\n", + "
" + ], + "text/plain": [ + " data_type description\n", + "0 TF_scores TF scores\n", + "1 RNA RNA expression\n", + "2 metabolomic metabolomic data" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nc.data.omics.nci60_datatypes()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ - "moon_data = nc.data.omics.moon()" + "sig_df = nc.data.omics.nci60_table(cell_line='786-0', data_type='TF_scores')\n", + "rna_df = nc.data.omics.nci60_table(cell_line='786-0', data_type='RNA')\n", + "metab_df = nc.data.omics.nci60_table(cell_line='786-0', data_type='metabolomic')" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'sig': TF value\n", - " 0 AR 1.156582\n", - " 1 BACH1 2.399881\n", - " 2 CEBPA 3.687354\n", - " 3 CREB1 0.829149\n", - " 4 CTCF 2.914983\n", - " 5 E2F1 4.989779\n", - " 6 E2F4 3.972646\n", - " 7 EGR1 6.337803\n", - " 8 ELK1 0.444149\n", - " 9 EPAS1 4.268129\n", - " 10 ESR1 7.069928\n", - " 11 ETS1 5.957844\n", - " 12 FOS 5.009215\n", - " 13 FOXA1 2.338539\n", - " 14 FOXM1 1.206632\n", - " 15 FOXO3 -0.772054\n", - " 16 FOXP1 0.876896\n", - " 17 GATA2 1.052240\n", - " 18 GATA3 4.433932\n", - " 19 HIF1A 2.503899\n", - " 20 HNF4A 5.230794\n", - " 21 JUN 4.310749\n", - " 22 MITF 4.685015\n", - " 23 MYC 0.761681\n", - " 24 NFKB1 2.386302\n", - " 25 PRDM14 2.602170\n", - " 26 RARA 2.259669\n", - " 27 RELA 3.635926\n", - " 28 RUNX1 1.654963\n", - " 29 SOX2 0.903587\n", - " 30 SP1 2.073969\n", - " 31 SP3 0.190111\n", - " 32 SPI1 5.666462\n", - " 33 SREBF1 1.577459\n", - " 34 STAT1 2.219767\n", - " 35 STAT2 0.092127\n", - " 36 STAT3 1.241225\n", - " 37 TAL1 2.968578\n", - " 38 TFAP2A 0.182564\n", - " 39 TFAP2C 7.987909\n", - " 40 TP53 1.014723\n", - " 41 USF1 2.194528\n", - " 42 VDR 1.545408\n", - " 43 YY1 1.521236\n", - " 44 ZNF263 7.254531,\n", - " 'metab': metab value 
Unnamed: 2\n", - " 0 HMDB0011747 0.401991 NaN\n", - " 1 HMDB0000755 -0.142718 NaN\n", - " 2 HMDB0000905 -1.244183 NaN\n", - " 3 HMDB0000012 -0.967207 NaN\n", - " 4 HMDB0001191 -0.631035 NaN\n", - " .. ... ... ...\n", - " 133 HMDB0011720 -0.716155 NaN\n", - " 134 HMDB0000883 -0.059923 NaN\n", - " 135 HMDB0000292 -0.772272 NaN\n", - " 136 HMDB0000299 1.132812 NaN\n", - " 137 HMDB0002917 -1.728352 NaN\n", - " \n", - " [138 rows x 3 columns],\n", - " 'rna': gene value\n", - " 0 WASH7P -0.084246\n", - " 1 LINC01128 -0.246712\n", - " 2 NOC2L 0.508906\n", - " 3 KLHL17 -0.001460\n", - " 4 HES4 -0.281146\n", - " ... ... ...\n", - " 8252 CMC4 -0.029409\n", - " 8253 BRCC3 -0.058883\n", - " 8254 VBP1 -0.554804\n", - " 8255 TMLHE -1.095530\n", - " 8256 CD24 0.923448\n", - " \n", - " [8257 rows x 2 columns]}" + "{'AR': 1.1565824565146148,\n", + " 'BACH1': 2.3998807796742443,\n", + " 'CEBPA': 3.6873543923958847,\n", + " 'CREB1': 0.8291485083247008,\n", + " 'CTCF': 2.9149829587082383}" ] }, - "execution_count": 8, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "moon_data" + "sig_df.set_index('ID')['score'].to_dict()" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ - "sig_input = moon_data['sig'].set_index('TF')['value'].to_dict()\n", - "rna_input = moon_data['rna'].set_index('gene')['value'].to_dict()\n", - "metab_input = moon_data['metab'].set_index('metab')['value'].to_dict()" + "sig_input = sig_df.set_index('ID')['score'].to_dict()\n", + "rna_input = rna_df.set_index('ID')['score'].to_dict()\n", + "metab_input = metab_df.set_index('ID')['score'].to_dict()" ] }, { diff --git a/networkcommons/data/datasets.yaml b/networkcommons/data/datasets.yaml index bb2a23a..232869d 100644 --- a/networkcommons/data/datasets.yaml +++ b/networkcommons/data/datasets.yaml @@ -4,24 +4,46 @@ omics: decryptm: name: DecryptM description: Drug perturbation proteomics and 
phosphoproteomics data + publication_link: https://doi.org/10.1126/science.ade3925 + detailed_description: >- + This dataset contains the profiling of 31 cancer drugs in 13 human cancer + cell line models resulted in 1.8 million dose-response curves, including + 47,502 regulated phosphopeptides, 7316 ubiquitinylated peptides, + and 546 regulated acetylated peptides. path: decryptm/{experiment}/{data_type}/{fname} panacea: name: Panacea description: Pancancer Analysis of Chemical Entity Activity RNA-Seq data + publication_link: https://doi.org/10.1016/j.xcrm.2021.100492 + detailed_description: >- + PANACEA contains dose-response and perturbational profiles for 32 + kinase inhibitors in 11 cancer cell lines, in addition to a DMSO control. + Originally, this resource served as the basis for a DREAM Challenge + assessing the accuracy and sensitivity of computational algorithms for + de novo drug polypharmacology predictions. path: panacea/panacea__{table}data.tsv test: name: Test data description: Small RNA-Seq data for unit tests + publication_link: NA + detailed_description: >- + This is a test dataset containing small RNA-Seq data. + It is used for unit tests. path: unit_test/test__{table}data.tsv - moon: - name: MOON - description: Database files for running MOON - path: moon/{table}_input.tsv - cosmos: - name: COSMOS - description: Database files for running COSMOS (MetaPKN) - path: moon/net/meta_network.sif CPTAC: name: CPTAC description: Clinical Proteomic Tumor Analysis Consortium data + publication_link: https://doi.org/10.1158/2159-8290.CD-13-0219 + detailed_description: >- + This dataset contains data from the Clinical Proteomic Tumor Analysis Consortium. + It includes various cancer types and proteomic data. path: CPTAC/{data_type}/{cancer_type}_{fname}.txt + NCI60: + name: NCI60 + description: NCI-60 cell line data + publication_link: https://doi.org/10.1038/nrc1951 + detailed_description: >- + This dataset contains data from the NCI-60 cell line panel. 
+ It includes three files: TF activities from transcriptomics data, + metabolite abundances and gene reads. + path: NCI60/{cell_line}/{cell_line}__{data_type}.tsv diff --git a/networkcommons/data/network/_moon.py b/networkcommons/data/network/_moon.py index 9b49a6f..5134c87 100644 --- a/networkcommons/data/network/_moon.py +++ b/networkcommons/data/network/_moon.py @@ -17,18 +17,51 @@ Prior knowledge network used by MOON. """ -__all__ = ['build_moon_regulons'] +__all__ = ['build_moon_regulons', 'get_cosmos_pkn'] import lazy_import import numpy as np import pandas as pd -dc = lazy_import.lazy_module('decoupler') - from networkcommons import _utils from . import _omnipath from . import _liana +import os +import urllib +from networkcommons import _conf +from networkcommons.data.omics import _common + +# dc = lazy_import.lazy_module('decoupler') +import decoupler as dc + + +def get_cosmos_pkn(update: bool = False): + """ + Retrieves the metabolic network used in COSMOS from the server + + Returns: + network (pandas.DataFrame): metabolic network with + source, target, and sign columns. 
+ """ + path = os.path.join(_conf.get('pickle_dir'), 'metapkn.pickle') + + if update or not os.path.exists(path): + + baseurl = urllib.parse.urljoin(_common._baseurl(), 'prior_knowledge') + + file_legend = pd.read_csv(baseurl + '/meta_network.sif', sep='\t') + + file_legend.to_pickle(path) + + else: + + file_legend = pd.read_pickle(path) + + return file_legend + + + def build_moon_regulons(include_liana=False): diff --git a/networkcommons/data/network/_omnipath.py b/networkcommons/data/network/_omnipath.py index a76e14a..78533e4 100644 --- a/networkcommons/data/network/_omnipath.py +++ b/networkcommons/data/network/_omnipath.py @@ -79,7 +79,7 @@ def get_phosphositeplus(update: bool = False): if update or not os.path.exists(path): - baseurl = urllib.parse.urljoin(_common._baseurl(), 'phosphosite') + baseurl = urllib.parse.urljoin(_common._baseurl(), 'prior_knowledge') file_legend = pd.read_csv(baseurl + '/kinase-substrate.tsv', sep='\t') diff --git a/networkcommons/data/omics/_common.py b/networkcommons/data/omics/_common.py index 93c9975..4f02078 100644 --- a/networkcommons/data/omics/_common.py +++ b/networkcommons/data/omics/_common.py @@ -45,18 +45,20 @@ def _datasets() -> dict[str, dict]: return _module_data('datasets').get('omics', {}) -def datasets() -> dict[str, str]: +def datasets() -> pd.DataFrame: """ Built-in omics datasets. Returns: - A dict with dataset labels as keys and descriptions as values. + A DataFrame with dataset details. 
""" + data = _datasets().get('datasets', {}) + df = pd.DataFrame.from_dict(data, orient='index') + pd.set_option('display.max_colwidth', None) - return { - k: v['description'] - for k, v in _datasets().get('datasets', {}).items() - } + df = df[df.index != 'test'] # Exclude the 'test' dataset + + return df[['name', 'description', 'publication_link', 'detailed_description']] def _baseurl() -> str: diff --git a/networkcommons/data/omics/_moon.py b/networkcommons/data/omics/_moon.py index e985525..acf8ff3 100644 --- a/networkcommons/data/omics/_moon.py +++ b/networkcommons/data/omics/_moon.py @@ -19,26 +19,86 @@ from __future__ import annotations -__all__ = ['moon'] +__all__ = ['nci60_datasets', 'nci60_datatypes', 'nci60_table'] import pandas as pd +import os +import urllib.parse from . import _common +from networkcommons import _conf -def moon() -> dict[str, pd.DataFrame]: +def nci60_datasets(update: bool = False) -> pd.DataFrame: """ - Example data for Moon. + Table of all NCI60 datasets (cell lines). + + Args: + update: + Force download and update cache. Returns: - Three data frames: signaling, metabolite and gene activity - measurements. + Data frame of all NCI60 datasets, with a single column + "cell_line". """ - return { - table: _common._open( - _common._commons_url('moon', table = table), - df = {'sep': '\t'}, + path = os.path.join(_conf.get('pickle_dir'), 'nci60_datasets.pickle') + + if update or not os.path.exists(path): + + baseurl = urllib.parse.urljoin(_common._baseurl(), 'NCI60') + + datasets = pd.DataFrame( + [ + ( + cell_line, + ) + for cell_line in _common._ls(baseurl) + ], + columns = ['cell_line'] ) - for table in ('sig', 'metab', 'rna') - } + datasets.to_pickle(path) + + else: + + datasets = pd.read_pickle(path) + + return datasets + + +def nci60_datatypes() -> pd.DataFrame: + """ + Table of all NCI60 data types. + + Returns: + Data frame of all NCI60 data types, with columns "data_type" + and "description". 
+ """ + df = pd.DataFrame({ + 'data_type': ['TF_scores', 'RNA', 'metabolomic'], + 'description': ['TF scores', 'RNA expression', 'metabolomic data'] + } + ) + + return df + + +def nci60_table(cell_line: str, data_type: str) -> pd.DataFrame: + """ + One table of omics data from NCI60. + + Args: + cell_line: + Name of the cell line. For a complete list see + `nci60_datasets()`. + data_type: + Type of data. For a complete list see `nci60_datatypes()`. + + Returns: + The table as a pandas DataFrame. + """ + + return _common._open( + _common._commons_url('NCI60', **locals()), + df = {'sep': '\t'}, + ) diff --git a/networkcommons/methods/_moon.py b/networkcommons/methods/_moon.py index b64e086..868820d 100644 --- a/networkcommons/methods/_moon.py +++ b/networkcommons/methods/_moon.py @@ -42,7 +42,8 @@ import lazy_import import networkx as nx import pandas as pd -dc = lazy_import.lazy_module('decoupler') +# dc = lazy_import.lazy_module('decoupler') +import decoupler as dc import numpy as np from . import _graph diff --git a/tests/test_omics.py b/tests/test_omics.py index 3d20e30..bcebf65 100644 --- a/tests/test_omics.py +++ b/tests/test_omics.py @@ -25,8 +25,10 @@ def test_datasets_2(): dsets = _common.datasets() - assert 'decryptm' in dsets - assert 'CPTAC' in dsets + assert isinstance(dsets, pd.DataFrame) + assert dsets.columns.tolist() == ['name', 'description', 'publication_link', 'detailed_description'] + assert 'decryptm' in dsets.index + assert 'CPTAC' in dsets.index def test_commons_url():