From 23c790cf798dcbcd082d4856eeea974028773ac9 Mon Sep 17 00:00:00 2001 From: Stephen Beckstrom-Sternberg Date: Wed, 31 Jan 2024 15:23:09 -0700 Subject: [PATCH 01/12] Update heatcluster.py --- heatcluster.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/heatcluster.py b/heatcluster.py index e40a40b..8e16b5b 100755 --- a/heatcluster.py +++ b/heatcluster.py @@ -1,11 +1,12 @@ #!/usr/bin/python3 ########################################### -# heatcluster-1.0.2c # +# heatcluster-1.1.0.20240131 # # written by Stephen Beckstrom-Sternberg # # Creates SNP heatmaps # # from SNP matrices # # Outputs sorted csv SNP matrix # +# Uses Polars instead of Pandas # ########################################### import argparse @@ -16,6 +17,9 @@ import matplotlib matplotlib.use('agg') import matplotlib.pyplot as plt +import polars as pl +import seaborn_polars as snl + logging.basicConfig(format='%(asctime)s - %(message)s', datefmt='%y-%b-%d %H:%M:%S', level=logging.INFO) From 9e9dc9e592d0e4ac5ce9d7be10b5fee439b1a081 Mon Sep 17 00:00:00 2001 From: Stephen Beckstrom-Sternberg Date: Wed, 31 Jan 2024 15:28:37 -0700 Subject: [PATCH 02/12] Update Dockerfile Added seaborn_polars, polars, and pyarrow --- Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index f5cfa22..41b3e31 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,7 @@ FROM ubuntu:jammy as app # default version -ARG HEATCLUSTER_VER="1.0.2c" +ARG HEATCLUSTER_VER="1.1.0" # adding labels LABEL base.image="ubuntu:jammy" @@ -27,7 +27,7 @@ RUN apt-get update && apt-get upgrade -y && \ apt-get autoclean && rm -rf /var/lib/apt/lists/* # installing python dependencies -RUN pip3 install --no-cache argparse pandas numpy pathlib seaborn matplotlib scipy --upgrade-strategy=only-if-needed +RUN pip3 install --no-cache argparse pandas numpy pathlib seaborn seaborn_polars polars pyarrow matplotlib scipy --upgrade-strategy=only-if-needed # copying files to docker image COPY . /heatcluster From b1dbe2b2bbd496891a3a92d82f6a2ec2d3b8a28e Mon Sep 17 00:00:00 2001 From: Stephen Beckstrom-Sternberg Date: Wed, 31 Jan 2024 15:38:38 -0700 Subject: [PATCH 03/12] Update heatcluster.py Using Polars instead of Pandas --- heatcluster.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/heatcluster.py b/heatcluster.py index 8e16b5b..2d158f3 100755 --- a/heatcluster.py +++ b/heatcluster.py @@ -75,11 +75,12 @@ def read_snp_matrix(file): commas = pd.read_csv(file, nrows=1, sep=',').shape[1] if tabs > commas: logging.debug('The file is tab-delimited') - df = pd.read_csv(file, sep='\t', index_col=False) + #df = pd.read_csv(file, sep='\t', index_col=False) + df = pl.scan_csv(file, sep='\t', index_col=False) else: logging.debug('The file is comma-delimited') - df = pd.read_csv(file, sep=',', index_col=False) - + #df = pd.read_csv(file, sep=',', index_col=False) + df = pl.scan_csv(file, sep=',', index_col=False) return df def clean_and_read_df(df): @@ -186,7 +187,8 @@ def create_heatmap(df, fontSize, labelSize, figsize, labels): fig,ax = plt.subplots(figsize=figsize) logging.debug('Creating heatmap') - heatmap = sns.heatmap( + #heatmap = sns.heatmap( + heatmap = snl.heatmap( df, xticklabels=True, yticklabels=True, From bf44769a4c4e5cbc4cf96fdcf30d29e9c3fa8e4d Mon Sep 17 00:00:00 2001 From: Stephen Beckstrom-Sternberg Date: Wed, 31 Jan 2024 15:47:20 -0700 Subject: [PATCH 04/12] Update heatcluster.py --- heatcluster.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/heatcluster.py b/heatcluster.py index 2d158f3..16ae1a3 100755 --- a/heatcluster.py +++ b/heatcluster.py @@ -68,18 +68,16 @@ def read_snp_matrix(file): file (str): SNP dist output file that should be converted to pandas dataframe Returns: - df (DataFrame): Pandas dataframe of SNP matrix. + df (DataFrame): Polars dataframe of SNP matrix. """ logging.debug('Determining if file is comma or tab delimited') - tabs = pd.read_csv(file, nrows=1, sep='\t').shape[1] - commas = pd.read_csv(file, nrows=1, sep=',').shape[1] + tabs = pl.scan_csv(file, nrows=1, sep='\t').shape[1] + commas = pl.scan_csv(file, nrows=1, sep=',').shape[1] if tabs > commas: logging.debug('The file is tab-delimited') - #df = pd.read_csv(file, sep='\t', index_col=False) df = pl.scan_csv(file, sep='\t', index_col=False) else: logging.debug('The file is comma-delimited') - #df = pd.read_csv(file, sep=',', index_col=False) df = pl.scan_csv(file, sep=',', index_col=False) return df From ede6014d68a3f71c6b010ba1a9995de05719e162 Mon Sep 17 00:00:00 2001 From: Stephen Beckstrom-Sternberg Date: Wed, 31 Jan 2024 15:56:51 -0700 Subject: [PATCH 05/12] Update Dockerfile --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 41b3e31..d94a6a7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,7 @@ FROM ubuntu:jammy as app # default version -ARG HEATCLUSTER_VER="1.1.0" +ARG HEATCLUSTER_VER="1.1.0.20240131" # adding labels LABEL base.image="ubuntu:jammy" From 343daa251a6bb3a2407a967fc25a2487c54f220a Mon Sep 17 00:00:00 2001 From: Stephen Beckstrom-Sternberg Date: Wed, 31 Jan 2024 16:25:55 -0700 Subject: [PATCH 06/12] Update heatcluster.py Using Polars instead of Pandas for dataframe, and using seaborn-polars for the heatmap creation. Does this still need matplotlib? From ea353bf46c114fe67b748e38a6edd95a3689adf7 Mon Sep 17 00:00:00 2001 From: Stephen Beckstrom-Sternberg Date: Wed, 31 Jan 2024 16:34:21 -0700 Subject: [PATCH 07/12] Update heatcluster.yml Added seaborn_polars, polars, and pyarrow to heatcluster.yml --- .github/workflows/heatcluster.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/heatcluster.yml b/.github/workflows/heatcluster.yml index 209d457..5d461b2 100644 --- a/.github/workflows/heatcluster.yml +++ b/.github/workflows/heatcluster.yml @@ -8,7 +8,7 @@ jobs: uses: actions/checkout@v3 - name: install dependencies - run: pip3 install pandas numpy pathlib seaborn matplotlib scipy + run: pip3 install pandas numpy pathlib seaborn seaborn_polars polars pyarrow matplotlib scipy - name: test (tab-delimited) run: python3 heatcluster.py -i test/small_matrix.csv From 8a9a650f00d349d7cad605b3fa8f380667277fb8 Mon Sep 17 00:00:00 2001 From: Stephen Beckstrom-Sternberg Date: Wed, 31 Jan 2024 16:43:56 -0700 Subject: [PATCH 08/12] Update heatcluster.py Removed nrows call from scan_csv --- heatcluster.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/heatcluster.py b/heatcluster.py index 16ae1a3..639f6dc 100755 --- a/heatcluster.py +++ b/heatcluster.py @@ -71,8 +71,8 @@ def read_snp_matrix(file): df (DataFrame): Polars dataframe of SNP matrix. """ logging.debug('Determining if file is comma or tab delimited') - tabs = pl.scan_csv(file, nrows=1, sep='\t').shape[1] - commas = pl.scan_csv(file, nrows=1, sep=',').shape[1] + tabs = pl.scan_csv(file, sep='\t').shape[1] + commas = pl.scan_csv(file, sep=',').shape[1] if tabs > commas: logging.debug('The file is tab-delimited') df = pl.scan_csv(file, sep='\t', index_col=False) From 58d60ffd555dfe3a860e74be350949a00044244e Mon Sep 17 00:00:00 2001 From: Stephen Beckstrom-Sternberg Date: Wed, 31 Jan 2024 16:54:20 -0700 Subject: [PATCH 09/12] Update heatcluster.py Changed nrows=1 to n_rows=1. --- heatcluster.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/heatcluster.py b/heatcluster.py index 639f6dc..2a46d18 100755 --- a/heatcluster.py +++ b/heatcluster.py @@ -71,8 +71,8 @@ def read_snp_matrix(file): df (DataFrame): Polars dataframe of SNP matrix. """ logging.debug('Determining if file is comma or tab delimited') - tabs = pl.scan_csv(file, sep='\t').shape[1] - commas = pl.scan_csv(file, sep=',').shape[1] + tabs = pl.scan_csv(file, n_rows=1, sep='\t').shape[1] + commas = pl.scan_csv(file, n_rows=1, sep=',').shape[1] if tabs > commas: logging.debug('The file is tab-delimited') df = pl.scan_csv(file, sep='\t', index_col=False) From 09a94193f324b8da860746180428d82af958f1ba Mon Sep 17 00:00:00 2001 From: Stephen Beckstrom-Sternberg Date: Wed, 31 Jan 2024 17:10:21 -0700 Subject: [PATCH 10/12] Update heatcluster.yml Changed actions/checkout@v3 to actions/checkout@v4 --- .github/workflows/heatcluster.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/heatcluster.yml b/.github/workflows/heatcluster.yml index 5d461b2..ec8933d 100644 --- a/.github/workflows/heatcluster.yml +++ b/.github/workflows/heatcluster.yml @@ -5,7 +5,7 @@ jobs: runs-on: ubuntu-latest steps: - name: checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: install dependencies run: pip3 install pandas numpy pathlib seaborn seaborn_polars polars pyarrow matplotlib scipy From beb75b92e119c0ab8b47674951fbee23a955289d Mon Sep 17 00:00:00 2001 From: Stephen Beckstrom-Sternberg Date: Wed, 31 Jan 2024 17:21:49 -0700 Subject: [PATCH 11/12] Update heatcluster.py Imported pyarrow. --- heatcluster.py | 1 + 1 file changed, 1 insertion(+) diff --git a/heatcluster.py b/heatcluster.py index 2a46d18..a8043c5 100755 --- a/heatcluster.py +++ b/heatcluster.py @@ -19,6 +19,7 @@ import matplotlib.pyplot as plt import polars as pl import seaborn_polars as snl +import pyarrow logging.basicConfig(format='%(asctime)s - %(message)s', datefmt='%y-%b-%d %H:%M:%S', level=logging.INFO) From 9997bea3ec92cd1a23db92c773341629d5960352 Mon Sep 17 00:00:00 2001 From: Stephen Beckstrom-Sternberg Date: Wed, 31 Jan 2024 17:49:04 -0700 Subject: [PATCH 12/12] Update heatcluster.py Syntax change for Polars. Replaced 'sep' with 'separator' in polars.scan_csv. --- heatcluster.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/heatcluster.py b/heatcluster.py index a8043c5..18603b7 100755 --- a/heatcluster.py +++ b/heatcluster.py @@ -72,14 +72,14 @@ def read_snp_matrix(file): df (DataFrame): Polars dataframe of SNP matrix. """ logging.debug('Determining if file is comma or tab delimited') - tabs = pl.scan_csv(file, n_rows=1, sep='\t').shape[1] - commas = pl.scan_csv(file, n_rows=1, sep=',').shape[1] + tabs = pl.scan_csv(file, n_rows=1, separator='\t').shape[1] + commas = pl.scan_csv(file, n_rows=1, separator=',').shape[1] if tabs > commas: logging.debug('The file is tab-delimited') - df = pl.scan_csv(file, sep='\t', index_col=False) + df = pl.scan_csv(file, separator='\t', index_col=False) else: logging.debug('The file is comma-delimited') - df = pl.scan_csv(file, sep=',', index_col=False) + df = pl.scan_csv(file, separator=',', index_col=False) return df def clean_and_read_df(df):