WayScience · MikeLippincott · Jul 31, 2024 · MattsonCam · Aug 15, 2024 · MattsonCam
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,39 +1,43 @@
----
 repos:
+-   repo: https://gitlab.com/vojko.pribudic.foss/pre-commit-update
+    rev: v0.3.3post1  # pinned release tag of the pre-commit-update hook
+    hooks:
+    -   id: pre-commit-update
+        args: [--exclude, black, --keep, isort]  # NOTE(review): presumably skips black and holds isort at its current rev - confirm against the hook docs
   # Formats import order
-  - repo: https://github.com/pycqa/isort
+-   repo: https://github.com/pycqa/isort
     rev: 5.13.2
     hooks:
-      - id: isort
+    -   id: isort
         name: isort (python)
         args: ["--profile", "black", "--filter-files"]
 
   # Code formatter for both python files and jupyter notebooks
-  - repo: https://github.com/psf/black
+-   repo: https://github.com/psf/black
     rev: 24.3.0
     hooks:
-      - id: black-jupyter
-      - id: black
+    -   id: black-jupyter
+    -   id: black
         language_version: python3.10
 
-  - repo: https://github.com/nbQA-dev/nbQA
+-   repo: https://github.com/nbQA-dev/nbQA
     rev: 1.8.5
     hooks:
-    - id: nbqa-isort
-      additional_dependencies: [isort==5.6.4]
-      args: [--profile=black]
+    -   id: nbqa-isort
+        additional_dependencies: [isort==5.6.4]
+        args: [--profile=black]
 
   # remove unused imports
-  - repo: https://github.com/hadialqattan/pycln.git
+-   repo: https://github.com/hadialqattan/pycln.git
     rev: v2.4.0
     hooks:
-      - id: pycln
+    -   id: pycln
 
   # additional hooks found within the pre-commit lib
-  - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.5.0
+-   repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.6.0
     hooks:
-      - id: trailing-whitespace # removes trailing white spaces
-      - id: mixed-line-ending # removes mixed end of line
+    -   id: trailing-whitespace # removes trailing white spaces
+    -   id: mixed-line-ending # removes mixed end of line
         args:
-          - --fix=lf
+        -   --fix=lf
diff --git a/0.EDA/figures/CP/umap_centroid_plot.png b/0.EDA/figures/CP/umap_centroid_plot.png
diff --git a/0.EDA/figures/CP/umap_centroid_plot_part of doses.png b/0.EDA/figures/CP/umap_centroid_plot_part of doses.png
diff --git a/0.EDA/figures/CP/umap_plot_time.png b/0.EDA/figures/CP/umap_plot_time.png
diff --git a/0.EDA/figures/CP/umap_plot_time_part_of_doses.png b/0.EDA/figures/CP/umap_plot_time_part_of_doses.png
diff --git a/0.EDA/figures/combined/umap_centroid_plot.png b/0.EDA/figures/combined/umap_centroid_plot.png
diff --git a/0.EDA/figures/combined/umap_centroid_plot_part of doses.png b/0.EDA/figures/combined/umap_centroid_plot_part of doses.png
diff --git a/0.EDA/figures/combined/umap_plot_time.png b/0.EDA/figures/combined/umap_plot_time.png
diff --git a/0.EDA/figures/combined/umap_plot_time_part_of_doses.png b/0.EDA/figures/combined/umap_plot_time_part_of_doses.png
diff --git a/0.EDA/figures/scDINO/umap_centroid_plot.png b/0.EDA/figures/scDINO/umap_centroid_plot.png
diff --git a/0.EDA/figures/scDINO/umap_centroid_plot_part of doses.png b/0.EDA/figures/scDINO/umap_centroid_plot_part of doses.png
diff --git a/...shiny_app/CLS_features_annotated_umap.csv → 0.EDA/figures/scDINO/umap_plot_time.png b/...shiny_app/CLS_features_annotated_umap.csv → 0.EDA/figures/scDINO/umap_plot_time.png
diff --git a/0.EDA/figures/scDINO/umap_plot_time_part_of_doses.png b/0.EDA/figures/scDINO/umap_plot_time_part_of_doses.png
diff --git a/0.EDA/notebooks/0.generate_umap_embeddings.ipynb b/0.EDA/notebooks/0.generate_umap_embeddings.ipynb
diff --git a/0.EDA/notebooks/1.visualize_umaps.ipynb b/0.EDA/notebooks/1.visualize_umaps.ipynb
diff --git a/0.EDA/run_eda.sh b/0.EDA/run_eda.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+# Runs the EDA process: exports the notebooks to scripts, then runs them per data mode.
+# NOTE(review): `mamba activate` presumably needs the mamba shell hook loaded here - confirm
+# activate the conda environment
+mamba activate timelapse_env
+# export every notebook in notebooks/ as a script into scripts/
+jupyter nbconvert --to=script --FilesWriter.build_directory=scripts notebooks/*.ipynb
+
+cd scripts || exit 1
+
+# run the EDA script
+python 0.generate_umap_embeddings.py --data_mode "CP"
+python 0.generate_umap_embeddings.py --data_mode "scDINO"
+python 0.generate_umap_embeddings.py --data_mode "combined"
+
+# deactivate the conda environment
+mamba deactivate
+# switch to the R environment for the visualization step
+mamba activate R_timelapse_env
+
+Rscript 1.visualize_umaps.r --data_mode "CP"
+Rscript 1.visualize_umaps.r --data_mode "scDINO"
+Rscript 1.visualize_umaps.r --data_mode "combined"
+
+mamba deactivate
+
+cd ../
+
+# end of script
+echo "EDA process completed"
diff --git a/0.EDA/scripts/0.generate_umap_embeddings.py b/0.EDA/scripts/0.generate_umap_embeddings.py
@@ -0,0 +1,121 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# This notebook generates the umap embeddings of the images in the dataset. The embeddings are saved in a parquet file.
+
+# In[1]:
+
+
+import argparse
+import pathlib
+
+import numpy as np
+import pandas as pd
+import umap
+
+# In[ ]:
+
+
+# build the command-line interface; --data_mode selects which profiles get embedded
+arg_parser = argparse.ArgumentParser(description="UMAP on a matrix")
+arg_parser.add_argument(
+    "--data_mode", type=str, default="CP", help="data mode to use"
+)
+
+# parse the command line once and keep the resulting namespace
+args = arg_parser.parse_args()
+# one of "CP", "scDINO" or "combined"; checked when the profiles are loaded
+data_mode = args.data_mode
+
+
+# In[2]:
+
+
+# input profiles, one parquet per data mode; strict=True raises right away if one is missing
+CP_fs_sc_profiles_path = pathlib.Path(
+    "../../data/feature_selected_data/run_20230920ChromaLiveTL_24hr4ch_MaxIP_norm_fs.parquet"
+).resolve(strict=True)
+scDINO_sc_profiles_path = pathlib.Path(
+    "../../data/outputdir/apoptosis_timelapse/CLS_features/CLS_features_annotated.parquet"
+).resolve(strict=True)
+combined_profiles_path = pathlib.Path(
+    "../../data/20231017ChromaLive_6hr_4ch_MaxIP_normalized_combined_data_feature_selected.parquet"
+).resolve(strict=True)
+# directory for the UMAP output; created (with parents) when it does not exist yet
+output_path = pathlib.Path("../../data/umap/").resolve()
+output_path.mkdir(parents=True, exist_ok=True)
+
+
+# In[3]:
+
+
+# map each supported data mode to the parquet file holding its profiles
+profile_sources = {
+    "CP": CP_fs_sc_profiles_path,
+    "scDINO": scDINO_sc_profiles_path,
+    "combined": combined_profiles_path,
+}
+if data_mode not in profile_sources:
+    # unknown mode: stop before any data is read
+    raise ValueError("data_mode must be either 'CP' or 'scDINO' or 'combined'")
+# read the data for the selected mode
+profiles_df = pd.read_parquet(profile_sources[data_mode])
+print(profiles_df.shape)
+# show all columns
+pd.set_option("display.max_columns", None)
+profiles_df.head()
+
+
+# In[4]:
+
+
+# drop rows that have NaN in any feature (non-metadata) column; feature columns are
+# the ones without the "Metadata_" marker, the same split used before fitting UMAP
+print(profiles_df.shape)
+is_metadata_col = profiles_df.columns.str.contains("Metadata_")
+profiles_df = profiles_df.dropna(subset=profiles_df.columns[~is_metadata_col])
+# shape after dropping, to show how many rows were removed
+print(profiles_df.shape)
+
+
+# In[5]:
+
+
+# split columns into metadata ("Metadata_" in the name) and features (everything else)
+metadata_cols = profiles_df.columns.str.contains("Metadata_")
+metadata_df = profiles_df.loc[:, metadata_cols]
+features_df = profiles_df.loc[:, ~metadata_cols]
+
+# configure the reducer under its own name so the imported umap module is not rebound
+umap_model = umap.UMAP(
+    n_neighbors=15,
+    n_components=2,
+    metric="euclidean",
+    random_state=42,
+    min_dist=0.1,
+    n_epochs=500,
+    learning_rate=1,
+)
+
+# fit the umap model on the feature columns
+umap_model.fit(features_df)
+
+# transform the same feature rows with the fitted model
+umap_transformed = umap_model.transform(features_df)
+
+# name the two components and reuse the feature index so rows align in the concat below
+umap_df = pd.DataFrame(
+    umap_transformed, columns=["UMAP0", "UMAP1"], index=features_df.index
+)
+
+# attach the metadata columns side by side with the embedding
+umap_df = pd.concat([metadata_df, umap_df], axis=1)
+print(umap_df.shape)
+umap_df.head()
+
+
+# In[ ]:
+
+
+# save the umap dataframe into the output directory set up earlier, one file per data mode
+umap_df.to_parquet(output_path / f"{data_mode}_umap_transformed.parquet")