Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

C2-C7 extended #22

Merged
merged 3 commits into from
Dec 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,22 @@
/sct_processing/softseg_data/*.tsv
/sct_processing/softseg_data/*.csv
/sct_processing/softseg_data/*.json
/sct_processing/softseg_data/c2c7/*.tsv
/sct_processing/softseg_data/c2c7/*.csv
/sct_processing/softseg_data/c2c7/*.json
/sct_processing/deepseg_data/*.tsv
/sct_processing/deepseg_data/*.csv
/sct_processing/deepseg_data/*.json
/sct_processing/deepseg_data/c2c7/*.tsv
/sct_processing/deepseg_data/c2c7/*.csv
/sct_processing/deepseg_data/c2c7/*.json
/sct_processing/results/figures/
/sct_processing/participants.tsv
/sct_processing/results.db

# Python notebook checkpoints of questionable utility
/sct_processing/results/.ipynb_checkpoints/
/sct_processing/softseg_data/.ipynb_checkpoints/
/sct_processing/softseg_data/c2c7/.ipynb_checkpoints/
/sct_processing/deepseg_data/.ipynb_checkpoints/
/sct_processing/deepseg_data/c2c7/.ipynb_checkpoints/
329 changes: 329 additions & 0 deletions sct_processing/deepseg_data/c2c7/autogen_configs.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,329 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "11c7abc8-3cf6-4971-9144-84c294f2a810",
"metadata": {},
"source": [
"# Data Configuration Autogeneration"
]
},
{
"cell_type": "markdown",
"id": "f47e2416-30d8-4af9-bc27-ae1ccbf4ee67",
"metadata": {},
"source": [
"## Setup"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8c315c14-4389-4382-8865-e535b4830dc0",
"metadata": {},
"outputs": [],
"source": [
"from copy import deepcopy\n",
"from pathlib import Path\n",
"from json import dump"
]
},
{
"cell_type": "markdown",
"id": "57367529-eb6f-4cb9-90f5-b5fbe16bc331",
"metadata": {},
"source": [
"## Template"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b80f0062-15ba-4ba6-801f-d299d9be5743",
"metadata": {},
"outputs": [],
"source": [
"template = {\n",
" \"label\": None,\n",
" \"format\": \"tabular\",\n",
" \"data_source\": None,\n",
" \"separator\": \"\\t\",\n",
" \"pre_split_hooks\": [],\n",
" \"post_split_hooks\": []\n",
"}"
]
},
{
"cell_type": "markdown",
"id": "d4061ce5-40cf-4ca8-90e0-3307f7800f2b",
"metadata": {},
"source": [
"## Utility Functions "
]
},
{
"cell_type": "markdown",
"id": "ad621df0-61f7-4ed2-be01-c122fc611e16",
"metadata": {},
"source": [
"Build the baseline pre-split hooks"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "94e7ef0f-e429-4ad2-979d-911e5bf2b7f5",
"metadata": {},
"outputs": [],
"source": [
"def build_presplit(file_name: str, json_content: dict):\n",
" # Isolate the pre_split_hook list\n",
" pre_split_hooks = json_content['pre_split_hooks']\n",
"\n",
" feature_list = ['GRP']\n",
" \n",
" # Append an extended list of explicitly dropped features if the dataset contains clinical data\n",
" if \"full_\" in file_name or \"clinical_\" in file_name:\n",
" feature_list.extend([\n",
" \"Site\",\n",
" \"Surgical\",\n",
" \"Number of Surgeries\",\n",
" \"Treatment Plan\",\n",
" \"Followup: 6-18 weeks\",\n",
" \"Followup: 12 month\",\n",
" \"Followup: 24 month\",\n",
" \"Followup: 60 month\",\n",
" \"Date of Assessment\",\n",
" \"CSM Duration\",\n",
" \"Work Status\",\n",
" \"mJOA 12 months\",\n",
" \"HRR\"\n",
" ])\n",
" # Otherwise just drop some basic metadata\n",
" if \"full_\" in file_name or \"img_\" in file_name:\n",
" feature_list.extend([\n",
" \"acq\",\n",
" \"weight\"\n",
" ])\n",
" \n",
" # Append the resulting list of features to drop explicitly\n",
" pre_split_hooks.append({\n",
" \"type\": \"drop_features_explicit\",\n",
" \"features\": feature_list\n",
" })\n",
" \n",
" # Add some nullity checks as well\n",
" pre_split_hooks.extend([{\n",
" \"type\": \"feature_drop_null\",\n",
" \"threshold\": 0.5\n",
" }, {\n",
" \"type\": \"sample_drop_null\",\n",
" \"threshold\": 0.5\n",
" }])"
]
},
{
"cell_type": "markdown",
"id": "a3a0ea10-bdfd-43de-96b2-94c608c40d99",
"metadata": {},
"source": [
"Build the baseline post-split hooks"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c18bc6d1-531d-4b83-ac31-6f6b93da78cb",
"metadata": {},
"outputs": [],
"source": [
"def build_postsplit(file_name: str, json_content: dict):\n",
" # Append an extended list of explicitly categorical features if the dataset contains clinical data\n",
" if \"full_\" in file_name or \"clinical_\" in file_name:\n",
" json_content['post_split_hooks'].extend([{\n",
" \"type\": \"imputation_simple\",\n",
" \"strategy\": \"most_frequent\",\n",
" \"features\": [\n",
" \"EQ5D: Anxiety/Depression\",\n",
" \"EQ5D: Mobility\",\n",
" \"EQ5D: Pain/Discomfort\",\n",
" \"EQ5D: Self-Care\",\n",
" \"EQ5D: Total\",\n",
" \"EQ5D: Usual Activities\",\n",
" \"Sex\",\n",
" \"Symptom Duration\",\n",
" \"Work Status (Category)\",\n",
" \"Comorbidities: Nicotine (Smoking)\",\n",
" \"Comorbidities: Nicotine (Smokeless)\",\n",
" \"Comorbidities: Nicotine (Patches)\",\n",
" \"Comorbidities: Nicotine (Recent Quit)\"\n",
" ]}, {\n",
" \"type\": \"one_hot_encode\",\n",
" \"features\": [\n",
" \"EQ5D: Anxiety/Depression\",\n",
" \"EQ5D: Mobility\",\n",
" \"EQ5D: Pain/Discomfort\",\n",
" \"EQ5D: Self-Care\",\n",
" \"EQ5D: Usual Activities\",\n",
" \"Sex\",\n",
" \"Symptom Duration\",\n",
" \"Work Status (Category)\",\n",
" \"Comorbidities: Nicotine (Smoking)\",\n",
" \"Comorbidities: Nicotine (Smokeless)\",\n",
" \"Comorbidities: Nicotine (Patches)\",\n",
" \"Comorbidities: Nicotine (Recent Quit)\"\n",
" ],\n",
" \"max_unique_vals\": 5,\n",
" \"handle_unknown\": \"ignore\"\n",
" }])\n",
" # Add some common standardization and imputation for everything\n",
" json_content['post_split_hooks'].extend([{\n",
" \"type\": \"imputation_simple\",\n",
" \"strategy\": \"mean\"\n",
" }, {\n",
" \"type\": \"standard_scaling\",\n",
" \"run_per_cross\": True\n",
" }])"
]
},
{
"cell_type": "markdown",
"id": "b68e2738-d678-49d1-96b4-1081c64f8c02",
"metadata": {},
"source": [
"Feature Selection/Transformation"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7bf1416c-6ca8-4451-b954-624168ec7720",
"metadata": {},
"outputs": [],
"source": [
"def add_rfe(json_content):\n",
" json_content['label'] += '_rfe'\n",
" json_content['post_split_hooks'].append({\n",
" \"type\": \"recursive_feature_elimination\",\n",
" \"proportion\": {\n",
" \"label\": \"rfe_feature_proportion\",\n",
" \"type\": \"float\",\n",
" \"low\": 0.1,\n",
" \"high\": 0.9\n",
" }\n",
" })\n",
"\n",
"def add_pca(json_content):\n",
" json_content['label'] += '_pca'\n",
" json_content['post_split_hooks'].append({\n",
" \"type\": \"principal_component_analysis\",\n",
" \"proportion\": {\n",
" \"label\": \"pca_component_proportion\",\n",
" \"type\": \"float\",\n",
" \"low\": 0.1,\n",
" \"high\": 0.9\n",
" }\n",
" })"
]
},
{
"cell_type": "markdown",
"id": "e6c10c3f-49a2-46f5-926f-fd63de623820",
"metadata": {},
"source": [
"## Configuration Generation"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c838ba4f-4771-4429-bcaf-fa6ef758c55a",
"metadata": {},
"outputs": [],
"source": [
"root_path = Path(\"/home/kalum.ost/classic_ml_reloaded/sct_processing/deepseg_data/c2c7\")\n",
"\n",
"# NOTE; the full metrics are CSV, not TSV, so we don't need to check within the loop at all!\n",
"for p in Path('.').glob('*.tsv'):\n",
" # Copy the template\n",
" new_json = deepcopy(template)\n",
"\n",
" # Extend it with initial pre- and post-split hooks\n",
" build_presplit(p.name, new_json)\n",
" build_postsplit(p.name, new_json)\n",
" \n",
" # Set the data path\n",
" new_json['data_source'] = str(root_path / p.name)\n",
"\n",
" # Initialize the config label by using the file's name\n",
" new_label = str(p.name).split('.')[0]\n",
" new_json['label'] = new_label\n",
"\n",
" # Append the segmentation algorithm to the label if using image-derived data\n",
" if \"full_\" in p.name or \"img_\" in p.name:\n",
" new_json['label'] = 'deepseg_c2c7_' + new_label\n",
"\n",
" # Generate 5 configs each: no RFE/PCA (basic)...\n",
" final_json = deepcopy(new_json)\n",
" final_json['label'] += '_noprep'\n",
" with open(f\"{final_json['label']}.json\", 'w') as fp:\n",
" dump(final_json, fp, indent=2)\n",
"\n",
" # RFE only...\n",
" final_json = deepcopy(new_json)\n",
" add_rfe(final_json)\n",
" with open(f\"{final_json['label']}.json\", 'w') as fp:\n",
" dump(final_json, fp, indent=2)\n",
"\n",
" # PCA only...\n",
" final_json = deepcopy(new_json)\n",
" add_pca(final_json)\n",
" with open(f\"{final_json['label']}.json\", 'w') as fp:\n",
" dump(final_json, fp, indent=2)\n",
"\n",
" # RFE into PCA...\n",
" final_json = deepcopy(new_json)\n",
" add_rfe(final_json)\n",
" add_pca(final_json)\n",
" with open(f\"{final_json['label']}.json\", 'w') as fp:\n",
" dump(final_json, fp, indent=2)\n",
" \n",
" # ... and PCA into RFE.\n",
" final_json = deepcopy(new_json)\n",
" add_pca(final_json)\n",
" add_rfe(final_json)\n",
" with open(f\"{final_json['label']}.json\", 'w') as fp:\n",
" dump(final_json, fp, indent=2)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "39c1383e-f2a2-49ad-83f7-3428065d4425",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading