SomeoneInParticular · SomeoneInParticular · Dec 18, 2024 · Dec 17, 2024 · Dec 17, 2024 · Dec 17, 2024
diff --git a/.gitignore b/.gitignore
@@ -8,14 +8,22 @@
 /sct_processing/softseg_data/*.tsv
 /sct_processing/softseg_data/*.csv
 /sct_processing/softseg_data/*.json
+/sct_processing/softseg_data/c2c7/*.tsv
+/sct_processing/softseg_data/c2c7/*.csv
+/sct_processing/softseg_data/c2c7/*.json
 /sct_processing/deepseg_data/*.tsv
 /sct_processing/deepseg_data/*.csv
 /sct_processing/deepseg_data/*.json
+/sct_processing/deepseg_data/c2c7/*.tsv
+/sct_processing/deepseg_data/c2c7/*.csv
+/sct_processing/deepseg_data/c2c7/*.json
 /sct_processing/results/figures/
 /sct_processing/participants.tsv
 /sct_processing/results.db
 
 # Python notebook checkpoints of questionable utility
 /sct_processing/results/.ipynb_checkpoints/
 /sct_processing/softseg_data/.ipynb_checkpoints/
+/sct_processing/softseg_data/c2c7/.ipynb_checkpoints/
 /sct_processing/deepseg_data/.ipynb_checkpoints/
+/sct_processing/deepseg_data/c2c7/.ipynb_checkpoints/
diff --git a/sct_processing/deepseg_data/c2c7/autogen_configs.ipynb b/sct_processing/deepseg_data/c2c7/autogen_configs.ipynb
@@ -0,0 +1,329 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "11c7abc8-3cf6-4971-9144-84c294f2a810",
+   "metadata": {},
+   "source": [
+    "# Data Configuration Autogenertation"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f47e2416-30d8-4af9-bc27-ae1ccbf4ee67",
+   "metadata": {},
+   "source": [
+    "## Setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8c315c14-4389-4382-8865-e535b4830dc0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from copy import deepcopy\n",
+    "from pathlib import Path\n",
+    "from json import dump"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "57367529-eb6f-4cb9-90f5-b5fbe16bc331",
+   "metadata": {},
+   "source": [
+    "## Template"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b80f0062-15ba-4ba6-801f-d299d9be5743",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "template = {\n",
+    "  \"label\": None,\n",
+    "  \"format\": \"tabular\",\n",
+    "  \"data_source\": None,\n",
+    "  \"separator\": \"\\t\",\n",
+    "  \"pre_split_hooks\": [],\n",
+    "  \"post_split_hooks\": []\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d4061ce5-40cf-4ca8-90e0-3307f7800f2b",
+   "metadata": {},
+   "source": [
+    "## Utility Functions "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ad621df0-61f7-4ed2-be01-c122fc611e16",
+   "metadata": {},
+   "source": [
+    "Build the baseline pre-split hooks"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "94e7ef0f-e429-4ad2-979d-911e5bf2b7f5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def build_presplit(file_name: str, json_content: dict):\n",
+    "    # Isolate the pre_split_hook list\n",
+    "    pre_split_hooks = json_content['pre_split_hooks']\n",
+    "\n",
+    "    feature_list = ['GRP']\n",
+    "    \n",
+    "    # Append an extended list of explicitly dropped features if the dataset contains clinical data\n",
+    "    if \"full_\" in file_name or \"clinical_\" in file_name:\n",
+    "        feature_list.extend([\n",
+    "            \"Site\",\n",
+    "            \"Surgical\",\n",
+    "            \"Number of Surgeries\",\n",
+    "            \"Treatment Plan\",\n",
+    "            \"Followup: 6-18 weeks\",\n",
+    "            \"Followup: 12 month\",\n",
+    "            \"Followup: 24 month\",\n",
+    "            \"Followup: 60 month\",\n",
+    "            \"Date of Assessment\",\n",
+    "            \"CSM Duration\",\n",
+    "            \"Work Status\",\n",
+    "            \"mJOA 12 months\",\n",
+    "            \"HRR\"\n",
+    "        ])\n",
+    "    # Otherwise just drop some basic metadata\n",
+    "    if \"full_\" in file_name or \"img_\" in file_name:\n",
+    "        feature_list.extend([\n",
+    "            \"acq\",\n",
+    "            \"weight\"\n",
+    "        ])\n",
+    "        \n",
+    "    # Append the resulting list of features to drop explicitly\n",
+    "    pre_split_hooks.append({\n",
+    "        \"type\": \"drop_features_explicit\",\n",
+    "        \"features\": feature_list\n",
+    "    })\n",
+    "        \n",
+    "    # Add some nullity checks as well\n",
+    "    pre_split_hooks.extend([{\n",
+    "        \"type\": \"feature_drop_null\",\n",
+    "        \"threshold\": 0.5\n",
+    "    }, {\n",
+    "        \"type\": \"sample_drop_null\",\n",
+    "        \"threshold\": 0.5\n",
+    "    }])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a3a0ea10-bdfd-43de-96b2-94c608c40d99",
+   "metadata": {},
+   "source": [
+    "Build the baseline post-split hooks"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c18bc6d1-531d-4b83-ac31-6f6b93da78cb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def build_postsplit(file_name: str, json_content: dict):\n",
+    "    # Append an extended list of explicitly categorical features if the dataset contains clinical data\n",
+    "    if \"full_\" in file_name or \"clinical_\" in file_name:\n",
+    "        json_content['post_split_hooks'].extend([{\n",
+    "            \"type\": \"imputation_simple\",\n",
+    "            \"strategy\": \"most_frequent\",\n",
+    "            \"features\": [\n",
+    "                \"EQ5D: Anxiety/Depression\",\n",
+    "                \"EQ5D: Mobility\",\n",
+    "                \"EQ5D: Pain/Discomfort\",\n",
+    "                \"EQ5D: Self-Care\",\n",
+    "                \"EQ5D: Total\",\n",
+    "                \"EQ5D: Usual Activities\",\n",
+    "                \"Sex\",\n",
+    "                \"Symptom Duration\",\n",
+    "                \"Work Status (Category)\",\n",
+    "                \"Comorbidities: Nicotine (Smoking)\",\n",
+    "                \"Comorbidities: Nicotine (Smokeless)\",\n",
+    "                \"Comorbidities: Nicotine (Patches)\",\n",
+    "                \"Comorbidities: Nicotine (Recent Quit)\"\n",
+    "            ]}, {\n",
+    "            \"type\": \"one_hot_encode\",\n",
+    "            \"features\": [\n",
+    "                \"EQ5D: Anxiety/Depression\",\n",
+    "                \"EQ5D: Mobility\",\n",
+    "                \"EQ5D: Pain/Discomfort\",\n",
+    "                \"EQ5D: Self-Care\",\n",
+    "                \"EQ5D: Usual Activities\",\n",
+    "                \"Sex\",\n",
+    "                \"Symptom Duration\",\n",
+    "                \"Work Status (Category)\",\n",
+    "                \"Comorbidities: Nicotine (Smoking)\",\n",
+    "                \"Comorbidities: Nicotine (Smokeless)\",\n",
+    "                \"Comorbidities: Nicotine (Patches)\",\n",
+    "                \"Comorbidities: Nicotine (Recent Quit)\"\n",
+    "            ],\n",
+    "            \"max_unique_vals\": 5,\n",
+    "            \"handle_unknown\": \"ignore\"\n",
+    "        }])\n",
+    "    # Add some common standardization and imputation for everything\n",
+    "    json_content['post_split_hooks'].extend([{\n",
+    "            \"type\": \"imputation_simple\",\n",
+    "            \"strategy\": \"mean\"\n",
+    "        }, {\n",
+    "            \"type\": \"standard_scaling\",\n",
+    "            \"run_per_cross\": True\n",
+    "        }])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b68e2738-d678-49d1-96b4-1081c64f8c02",
+   "metadata": {},
+   "source": [
+    "Feature Selection/Transformation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7bf1416c-6ca8-4451-b954-624168ec7720",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def add_rfe(json_content):\n",
+    "    json_content['label'] += '_rfe'\n",
+    "    json_content['post_split_hooks'].append({\n",
+    "        \"type\":  \"recursive_feature_elimination\",\n",
+    "        \"proportion\": {\n",
+    "            \"label\": \"rfe_feature_proportion\",\n",
+    "            \"type\": \"float\",\n",
+    "            \"low\": 0.1,\n",
+    "            \"high\": 0.9\n",
+    "        }\n",
+    "    })\n",
+    "\n",
+    "def add_pca(json_content):\n",
+    "    json_content['label'] += '_pca'\n",
+    "    json_content['post_split_hooks'].append({\n",
+    "        \"type\":  \"principal_component_analysis\",\n",
+    "        \"proportion\": {\n",
+    "            \"label\": \"pca_component_proportion\",\n",
+    "            \"type\": \"float\",\n",
+    "            \"low\": 0.1,\n",
+    "            \"high\": 0.9\n",
+    "        }\n",
+    "    })"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e6c10c3f-49a2-46f5-926f-fd63de623820",
+   "metadata": {},
+   "source": [
+    "## Configuration Generation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c838ba4f-4771-4429-bcaf-fa6ef758c55a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "root_path = Path(\"/home/kalum.ost/classic_ml_reloaded/sct_processing/deepseg_data/c2c7\")\n",
+    "\n",
+    "# NOTE; the full metrics are CSV, not TSV, so we don't need to check within the loop at all!\n",
+    "for p in Path('.').glob('*.tsv'):\n",
+    "    # Copy the template\n",
+    "    new_json = deepcopy(template)\n",
+    "\n",
+    "    # Extend it with initial pre- and post-split hooks\n",
+    "    build_presplit(p.name, new_json)\n",
+    "    build_postsplit(p.name, new_json)\n",
+    "    \n",
+    "    # Set the data path\n",
+    "    new_json['data_source'] = str(root_path / p.name)\n",
+    "\n",
+    "    # Initialize the config label by using the file's name\n",
+    "    new_label = str(p.name).split('.')[0]\n",
+    "    new_json['label'] = new_label\n",
+    "\n",
+    "    # Append the segmentation algorithm to the label if using image-derived data\n",
+    "    if \"full_\" in p.name or \"img_\" in p.name:\n",
+    "        new_json['label'] = 'deepseg_c2c7_' + new_label\n",
+    "\n",
+    "    # Generate 5 configs each: no RFE/PCA (basic)...\n",
+    "    final_json = deepcopy(new_json)\n",
+    "    final_json['label'] += '_noprep'\n",
+    "    with open(f\"{final_json['label']}.json\", 'w') as fp:\n",
+    "        dump(final_json, fp, indent=2)\n",
+    "\n",
+    "    # RFE only...\n",
+    "    final_json = deepcopy(new_json)\n",
+    "    add_rfe(final_json)\n",
+    "    with open(f\"{final_json['label']}.json\", 'w') as fp:\n",
+    "        dump(final_json, fp, indent=2)\n",
+    "\n",
+    "    # PCA only...\n",
+    "    final_json = deepcopy(new_json)\n",
+    "    add_pca(final_json)\n",
+    "    with open(f\"{final_json['label']}.json\", 'w') as fp:\n",
+    "        dump(final_json, fp, indent=2)\n",
+    "\n",
+    "    # RFE into PCA...\n",
+    "    final_json = deepcopy(new_json)\n",
+    "    add_rfe(final_json)\n",
+    "    add_pca(final_json)\n",
+    "    with open(f\"{final_json['label']}.json\", 'w') as fp:\n",
+    "        dump(final_json, fp, indent=2)\n",
+    "    \n",
+    "    # ... and PCA into RFE.\n",
+    "    final_json = deepcopy(new_json)\n",
+    "    add_pca(final_json)\n",
+    "    add_rfe(final_json)\n",
+    "    with open(f\"{final_json['label']}.json\", 'w') as fp:\n",
+    "        dump(final_json, fp, indent=2)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "39c1383e-f2a2-49ad-83f7-3428065d4425",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}