Merge pull request #44 from saezlab/41-expand-data-utilities

41 expand data utilities
saezlab · Jul 30, 2024 · ac65afc · ac65afc
2 parents d91b9c4 + 03e61cb
commit ac65afc
Show file tree

Hide file tree

Showing 11 changed files with 626 additions and 134 deletions.
diff --git a/docs/src/api.rst b/docs/src/api.rst
@@ -78,6 +78,7 @@ Prior Knowledge
     data.network.get_omnipath
     data.network.get_lianaplus
     data.network.get_phosphositeplus
+    data.network.get_cosmos_pkn
 
 
 Datasets
@@ -149,6 +150,19 @@ CPTAC
     data.omics.cptac_datatypes
     data.omics.cptac_extend_dataframe
 
+NCI60
+~~~~~
+.. module::networkcommons.data.omics
+.. currentmodule:: networkcommons
+
+.. autosummary::
+    :toctree: api
+    :recursive:
+
+    data.omics.nci60_datasets
+    data.omics.nci60_datatypes
+    data.omics.nci60_table
+
 
 Other
 ~~~~~~~~

diff --git a/docs/src/contents.rst b/docs/src/contents.rst
@@ -11,6 +11,13 @@ NetworkCommons: Table of Contents
    api
 
 
+.. toctree::
+   :maxdepth: 2
+   :caption: Contribution guidelines
+
+   guidelines/guide_1_data
+
+
 .. toctree::
    :maxdepth: 2
    :caption: Vignettes

diff --git a/docs/src/guidelines/guide_1_data.ipynb b/docs/src/guidelines/guide_1_data.ipynb
@@ -0,0 +1,290 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Contribution guidelines: Data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Thank you very much for considering contributing to the data collection of **NetworkCommons**! In order to make the resource as user-friendly as possible, we aim to be as transparent as possible, which means that all contributions should contain at least the following elements.\n",
+    "\n",
+    "## 1. Data information\n",
+    "* Experimental design: number of samples, number of experiments (if applicable), confounding factors.\n",
+    "* Data production and processing: tools used, how the data processing was performed (if applicable).\n",
+    "* Files: number and type of files, with a small description of their contents.\n",
+    "* Link to the database from which the data was retrieved.\n",
+    "* Link to the dataset publication.\n",
+    "* Path information explaining the structure of the data directories.\n",
+    "\n",
+    "This information should be appended to the existing YAML file in `networkcommons/data/datasets.yaml`.\n",
+    "\n",
+    "An example of this can be found below:"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "```yaml\n",
+    "NCI60:\n",
+    "    name: NCI60\n",
+    "    description: NCI-60 cell line data\n",
+    "    publication_link: https://doi.org/10.1038/nrc1951\n",
+    "    detailed_description: >-\n",
+    "        This dataset contains data from the NCI-60 cell line panel.\n",
+    "        It includes three files: TF activities from transcriptomics data,\n",
+    "        metabolite abundances and gene reads.\n",
+    "    path: NCI60/{cell_line}/{cell_line}__{data_type}.tsv\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This information can then be accessed via `nc.data.omics.datasets()` (assuming the package has been imported with `import networkcommons as nc`):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>name</th>\n",
+       "      <th>description</th>\n",
+       "      <th>publication_link</th>\n",
+       "      <th>detailed_description</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>decryptm</th>\n",
+       "      <td>DecryptM</td>\n",
+       "      <td>Drug perturbation proteomics and phosphoproteomics data</td>\n",
+       "      <td>https://doi.org/10.1126/science.ade3925</td>\n",
+       "      <td>This dataset contains the profiling of 31 cancer drugs in 13 human cancer cell line models resulted in 1.8 million dose-response curves, including 47,502 regulated phosphopeptides, 7316 ubiquitinylated peptides, and 546 regulated acetylated peptides.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>panacea</th>\n",
+       "      <td>Panacea</td>\n",
+       "      <td>Pancancer Analysis of Chemical Entity Activity RNA-Seq data</td>\n",
+       "      <td>https://doi.org/10.1016/j.xcrm.2021.100492</td>\n",
+       "      <td>PANACEA contains dose-response and perturbational profiles for 32 kinase inhibitors in 11 cancer cell lines, in addition to a DMSO control. Originally, this resource served as the basis for a DREAM Challenge assessing the accuracy and sensitivity of computational algorithms for de novo drug polypharmacology predictions.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>moon</th>\n",
+       "      <td>MOON</td>\n",
+       "      <td>Database files for running MOON</td>\n",
+       "      <td>https://example.com/moon</td>\n",
+       "      <td>This dataset contains database files required for running the MOON software.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>cosmos</th>\n",
+       "      <td>COSMOS</td>\n",
+       "      <td>Database files for running COSMOS (MetaPKN)</td>\n",
+       "      <td>https://example.com/cosmos</td>\n",
+       "      <td>This dataset includes database files for the COSMOS software (MetaPKN).</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>CPTAC</th>\n",
+       "      <td>CPTAC</td>\n",
+       "      <td>Clinical Proteomic Tumor Analysis Consortium data</td>\n",
+       "      <td>https://example.com/CPTAC</td>\n",
+       "      <td>This dataset contains data from the Clinical Proteomic Tumor Analysis Consortium. It includes various cancer types and proteomic data.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>NCI60</th>\n",
+       "      <td>NCI60</td>\n",
+       "      <td>NCI-60 cell line data</td>\n",
+       "      <td>https://example.com/NCI60</td>\n",
+       "      <td>This dataset contains data from the NCI-60 cell line panel. It includes three files: TF activities from transcriptomics data, metabolite abundances and gene reads.</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "              name  \\\n",
+       "decryptm  DecryptM   \n",
+       "panacea    Panacea   \n",
+       "moon          MOON   \n",
+       "cosmos      COSMOS   \n",
+       "CPTAC        CPTAC   \n",
+       "NCI60        NCI60   \n",
+       "\n",
+       "                                                          description  \\\n",
+       "decryptm      Drug perturbation proteomics and phosphoproteomics data   \n",
+       "panacea   Pancancer Analysis of Chemical Entity Activity RNA-Seq data   \n",
+       "moon                                  Database files for running MOON   \n",
+       "cosmos                    Database files for running COSMOS (MetaPKN)   \n",
+       "CPTAC               Clinical Proteomic Tumor Analysis Consortium data   \n",
+       "NCI60                                           NCI-60 cell line data   \n",
+       "\n",
+       "                                    publication_link  \\\n",
+       "decryptm     https://doi.org/10.1126/science.ade3925   \n",
+       "panacea   https://doi.org/10.1016/j.xcrm.2021.100492   \n",
+       "moon                        https://example.com/moon   \n",
+       "cosmos                    https://example.com/cosmos   \n",
+       "CPTAC                      https://example.com/CPTAC   \n",
+       "NCI60                      https://example.com/NCI60   \n",
+       "\n",
+       "                                                                                                                                                                                                                                                                                                                       detailed_description  \n",
+       "decryptm                                                                         This dataset contains the profiling of 31 cancer drugs in 13 human cancer cell line models resulted in 1.8 million dose-response curves, including 47,502 regulated phosphopeptides, 7316 ubiquitinylated peptides, and 546 regulated acetylated peptides.  \n",
+       "panacea   PANACEA contains dose-response and perturbational profiles for 32 kinase inhibitors in 11 cancer cell lines, in addition to a DMSO control. Originally, this resource served as the basis for a DREAM Challenge assessing the accuracy and sensitivity of computational algorithms for de novo drug polypharmacology predictions.  \n",
+       "moon                                                                                                                                                                                                                                                           This dataset contains database files required for running the MOON software.  \n",
+       "cosmos                                                                                                                                                                                                                                                              This dataset includes database files for the COSMOS software (MetaPKN).  \n",
+       "CPTAC                                                                                                                                                                                                This dataset contains data from the Clinical Proteomic Tumor Analysis Consortium. It includes various cancer types and proteomic data.  \n",
+       "NCI60                                                                                                                                                                   This dataset contains data from the NCI-60 cell line panel. It includes three files: TF activities from transcriptomics data, metabolite abundances and gene reads.  "
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "nc.data.omics.datasets()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2. Data handlers (API)\n",
+    "The data will either be deposited in the [NetworkCommons server](https://commons.omnipathdb.org/), or can be directly accessed from the original source. Regardless of this, the following functions are required:\n",
+    "\n",
+    "* A function providing an overview of the subsets (if applicable). For example, check `nc.data.omics.decryptm_experiments()`. \n",
+    "* In case the data contains different files (for example, different omics layers, metadata tables, etc.), a function should retrieve this information. For example, check `nc.data.omics.nci60_datatypes()`.\n",
+    "* A function that retrieves the data. For example, check `nc.data.omics.nci60_table()`. Ideally, it should return a `pd.DataFrame`, but we are planning to expand support for `AnnData` instances.\n",
+    "\n",
+    "These new functions can be implemented in a new file, `_{dataset}`, inside the `networkcommons/data/omics/` folder.\n",
+    "\n",
+    "For example, `nc.data.omics.nci60_table()` returns a single `pd.DataFrame` for a given data type and cell line:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>ID</th>\n",
+       "      <th>score</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>WASH7P</td>\n",
+       "      <td>-2.109966</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>NOC2L</td>\n",
+       "      <td>-1.480194</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>HES4</td>\n",
+       "      <td>-0.781522</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>ISG15</td>\n",
+       "      <td>0.406806</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>AGRN</td>\n",
+       "      <td>-0.324970</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       ID     score\n",
+       "0  WASH7P -2.109966\n",
+       "1   NOC2L -1.480194\n",
+       "2    HES4 -0.781522\n",
+       "3   ISG15  0.406806\n",
+       "4    AGRN -0.324970"
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "nc.data.omics.nci60_table(cell_line='A498', data_type='RNA').head()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "networkcommons-DX9y6Uxu-py3.10",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}