From 1c29f4c1bafac772ca18743a6fda0f90185188c8 Mon Sep 17 00:00:00 2001 From: Stephen Beckstrom-Sternberg Date: Wed, 25 Oct 2023 12:55:51 -0700 Subject: [PATCH 1/5] Create HeatCluster-0.4.11.py "Fully" modularized and fixed input/output file argument parsing. --- HeatCluster-0.4.11.py | 168 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 168 insertions(+) create mode 100644 HeatCluster-0.4.11.py diff --git a/HeatCluster-0.4.11.py b/HeatCluster-0.4.11.py new file mode 100644 index 0000000..2cedfb1 --- /dev/null +++ b/HeatCluster-0.4.11.py @@ -0,0 +1,168 @@ +#!/usr/bin/python3 + +########################################### +# HeatCluster-0.4.11 # +# written by Stephen Beckstrom-Sternberg # +# Creates SNP heat/cluster maps # +# from SNP matrices # +# - modularized # +########################################### + +import argparse +import logging +import pandas as pd +import numpy as np +import seaborn as sns +import matplotlib.pyplot as plt +from pathlib import Path + +logging.basicConfig(format='%(asctime)s - %(message)s', datefmt='%y-%b-%d %H:%M:%S', level=logging.INFO) + +parser = argparse.ArgumentParser() +parser.add_argument('-i', '--input', type=str, help='input SNP matrix file name', default='snp-dists.txt') +parser.add_argument('-o', '--out', type=str, help='final file name', default='SNP_matrix') +parser.add_argument('-t', '--type', type=str, help='file extension for final image', default = 'pdf') +parser.add_argument('-v', '--version', help='print version and exit', action='version', version='%(prog)s ' + '0.4.11') +args = parser.parse_args() + +def main(): + """ + Creates image for SNP matrix. + """ + + SNPmatrix=args.input + logging.info('Creating figure for ' + SNPmatrix) + + df = read_snp_matrix(SNPmatrix) + logging.debug('The input SNP matrix:') + logging.debug(df) + + if len(df.index) > len(df.columns): + logging.fatal('This matrix has been melted. Sorry!') + exit(0) + + df = clean_and_read_df(df) + logging.debug('The clean SNP matrix:') + logging.debug(df) + + (df, fontSize) = determine_heatmap_size(df, SNPmatrix) + + create_heatmap(df, fontSize) +# + print("Done") +def read_snp_matrix(file): + """ + Reads the SNP matrix into a pandas dataframe. + + Args: + file (str): SNP dist output file that should be converted to pandas dataframe + + Returns: + df (DataFrame): Pandas dataframe of SNP matrix. + """ + logging.debug('Determining if file is comma or tab delimited') + tabs = pd.read_csv(file, nrows=1, sep='\t').shape[1] + commas = pd.read_csv(file, nrows=1, sep=',').shape[1] + if tabs > commas: + logging.debug('The file is probably tab-delimited') + df = pd.read_csv(file, sep='\t', index_col= False) + else: + logging.debug('The file is probably comma-delimited') + df = pd.read_csv(file, sep=',', index_col= False) + + return df + +def clean_and_read_df(df): + """ + Clean and read DataFrame from lines. + + Args: + lines (list): List of strings representing lines of data. + + Returns: + df (DataFrame): Cleaned DataFrame. + """ + logging.debug('Dropping the first column') + df = df.iloc[: , 1:] + + # Convert column names to strings + df.columns = df.columns.map(str) + + # Define consensus patterns + consensus_patterns = ['snp-dists 0.8.2', '.consensus_threshold_0.6_quality_20', 'Consensus_', 'Unnamed: 0'] + + # Replace consensus patterns in column names + df.columns = df.columns.str.replace('|'.join(consensus_patterns), '', regex=True) + + # Setting the index + df = df.set_index(df.columns) + + return df + +def determine_heatmap_size(df, SNPmatrix): + numSamples = len(df.columns) + logging.info('Found ' + str(numSamples) + ' samples in ' + SNPmatrix) + + if numSamples <= 3: + logging.fatal('This matrix must have 4+ samples. Sorry!') + exit(0) + + # Set output figure size tuple based on number of samples + if (numSamples) >= 140: + fontSize = 2 + elif (numSamples) >=100: + fontSize = 4 + elif (numSamples) >=60: + fontSize = 6 + else: + fontSize=8 + + logging.debug('The fontSize will be ' + str(fontSize)) + + logging.debug('Sorting dataframe and removing empty rows/columns') + df = df.loc[df.sum(axis=1).sort_values(ascending=True).index] + df.replace([np.inf, -np.inf], np.nan) + df.dropna() + + df = df.reindex(columns=df.index) + + return (df, fontSize) + +def create_heatmap(df, fontSize): + logging.debug('Creating heatmap') + heatmap = sns.clustermap( + df, + xticklabels=True, + yticklabels=True, + vmin=0, + vmax=80, + center=20, + annot=True, + annot_kws={'size': fontSize}, + cbar_kws={"orientation": "vertical", "pad": 0.5}, + cmap='Reds_r', + linecolor="white", + linewidths=.1, + fmt='d', + col_cluster=False, + row_cluster=False + ) + +# Set orientation of axes labels + plt.setp(heatmap.ax_heatmap.get_xticklabels(), rotation=45, ha='right',fontsize=fontSize) + plt.setp(heatmap.ax_heatmap.get_yticklabels(), rotation='horizontal', fontsize=fontSize) + + plt.title('SNP matrix visualized via HeatCluster') + + heatmap.ax_row_dendrogram.set_visible(False) + heatmap.ax_col_dendrogram.set_visible(False) + + SNP_matrix = args.out + outfile = (args.out + "." + args.type) + print("\tOutput file is ", outfile) + heatmap.savefig(outfile) + + plt.show() + +if __name__ == "__main__": + main() From 7da7d4c75a7b03b870deb9859bfc95ccbc977a79 Mon Sep 17 00:00:00 2001 From: Stephen Beckstrom-Sternberg Date: Wed, 25 Oct 2023 14:48:23 -0700 Subject: [PATCH 2/5] Update HeatCluster.py Modularized file and fixed argument parsing. --- HeatCluster.py | 106 +++++++++++++++++++++++++++++++++++-------------- 1 file changed, 76 insertions(+), 30 deletions(-) diff --git a/HeatCluster.py b/HeatCluster.py index 52c7deb..2cedfb1 100755 --- a/HeatCluster.py +++ b/HeatCluster.py @@ -1,34 +1,75 @@ #!/usr/bin/python3 ########################################### -# HeatCluster-0.4.10 # +# HeatCluster-0.4.11 # # written by Stephen Beckstrom-Sternberg # # Creates SNP heat/cluster maps # # from SNP matrices # +# - modularized # ########################################### import argparse +import logging import pandas as pd import numpy as np import seaborn as sns import matplotlib.pyplot as plt from pathlib import Path +logging.basicConfig(format='%(asctime)s - %(message)s', datefmt='%y-%b-%d %H:%M:%S', level=logging.INFO) + parser = argparse.ArgumentParser() parser.add_argument('-i', '--input', type=str, help='input SNP matrix file name', default='snp-dists.txt') parser.add_argument('-o', '--out', type=str, help='final file name', default='SNP_matrix') parser.add_argument('-t', '--type', type=str, help='file extension for final image', default = 'pdf') -parser.add_argument('-v', '--version', help='print version and exit', action='version', version='%(prog)s ' + '0.4.10') +parser.add_argument('-v', '--version', help='print version and exit', action='version', version='%(prog)s ' + '0.4.11') args = parser.parse_args() +def main(): + """ + Creates image for SNP matrix. + """ + + SNPmatrix=args.input + logging.info('Creating figure for ' + SNPmatrix) + + df = read_snp_matrix(SNPmatrix) + logging.debug('The input SNP matrix:') + logging.debug(df) + + if len(df.index) > len(df.columns): + logging.fatal('This matrix has been melted. Sorry!') + exit(0) + + df = clean_and_read_df(df) + logging.debug('The clean SNP matrix:') + logging.debug(df) + + (df, fontSize) = determine_heatmap_size(df, SNPmatrix) + + create_heatmap(df, fontSize) +# + print("Done") def read_snp_matrix(file): + """ + Reads the SNP matrix into a pandas dataframe. + + Args: + file (str): SNP dist output file that should be converted to pandas dataframe + + Returns: + df (DataFrame): Pandas dataframe of SNP matrix. + """ + logging.debug('Determining if file is comma or tab delimited') tabs = pd.read_csv(file, nrows=1, sep='\t').shape[1] commas = pd.read_csv(file, nrows=1, sep=',').shape[1] if tabs > commas: + logging.debug('The file is probably tab-delimited') df = pd.read_csv(file, sep='\t', index_col= False) else: + logging.debug('The file is probably comma-delimited') df = pd.read_csv(file, sep=',', index_col= False) - + return df def clean_and_read_df(df): @@ -41,36 +82,32 @@ def clean_and_read_df(df): Returns: df (DataFrame): Cleaned DataFrame. """ - # Define consensus patterns + logging.debug('Dropping the first column') + df = df.iloc[: , 1:] + + # Convert column names to strings + df.columns = df.columns.map(str) + + # Define consensus patterns consensus_patterns = ['snp-dists 0.8.2', '.consensus_threshold_0.6_quality_20', 'Consensus_', 'Unnamed: 0'] # Replace consensus patterns in column names df.columns = df.columns.str.replace('|'.join(consensus_patterns), '', regex=True) - # Replace consensus patterns in entire dataframe to change row names - df = df.replace(consensus_patterns, '', regex=True) - # Keep only numeric columns - df = df.set_index(df.columns[0]) - df.dropna(axis=0, inplace=True) - df.dropna(axis=1, inplace=True) - return df + # Setting the index + df = df.set_index(df.columns) -def main(): - if args.input: - path = args.input - else: - try: - path = Path('./snp-dists.txt') - path.resolve(strict=True) - except FileNotFoundError: - path = Path('./snp_matrix.txt') + return df - print("Using file path:", path) - lines = read_snp_matrix(path) - numSamples = len(lines) - 1 +def determine_heatmap_size(df, SNPmatrix): + numSamples = len(df.columns) + logging.info('Found ' + str(numSamples) + ' samples in ' + SNPmatrix) - df = clean_and_read_df(lines) + if numSamples <= 3: + logging.fatal('This matrix must have 4+ samples. Sorry!') + exit(0) + # Set output figure size tuple based on number of samples if (numSamples) >= 140: fontSize = 2 elif (numSamples) >=100: @@ -79,13 +116,20 @@ def main(): fontSize = 6 else: fontSize=8 - + + logging.debug('The fontSize will be ' + str(fontSize)) + + logging.debug('Sorting dataframe and removing empty rows/columns') df = df.loc[df.sum(axis=1).sort_values(ascending=True).index] df.replace([np.inf, -np.inf], np.nan) df.dropna() df = df.reindex(columns=df.index) - print("df after re-indexing columns:\n\n",df,"\n\n") + + return (df, fontSize) + +def create_heatmap(df, fontSize): + logging.debug('Creating heatmap') heatmap = sns.clustermap( df, xticklabels=True, @@ -107,16 +151,18 @@ def main(): # Set orientation of axes labels plt.setp(heatmap.ax_heatmap.get_xticklabels(), rotation=45, ha='right',fontsize=fontSize) plt.setp(heatmap.ax_heatmap.get_yticklabels(), rotation='horizontal', fontsize=fontSize) - + plt.title('SNP matrix visualized via HeatCluster') - + heatmap.ax_row_dendrogram.set_visible(False) heatmap.ax_col_dendrogram.set_visible(False) - heatmap.savefig('SNP_matrix.pdf') + SNP_matrix = args.out + outfile = (args.out + "." + args.type) + print("\tOutput file is ", outfile) + heatmap.savefig(outfile) plt.show() - print("Done") if __name__ == "__main__": main() From 4a96abe3a9a38dd63a169de36d037f37d8818089 Mon Sep 17 00:00:00 2001 From: Stephen Beckstrom-Sternberg Date: Wed, 25 Oct 2023 14:49:02 -0700 Subject: [PATCH 3/5] Delete HeatCluster-0.4.11.py --- HeatCluster-0.4.11.py | 168 ------------------------------------------ 1 file changed, 168 deletions(-) delete mode 100644 HeatCluster-0.4.11.py diff --git a/HeatCluster-0.4.11.py b/HeatCluster-0.4.11.py deleted file mode 100644 index 2cedfb1..0000000 --- a/HeatCluster-0.4.11.py +++ /dev/null @@ -1,168 +0,0 @@ -#!/usr/bin/python3 - -########################################### -# HeatCluster-0.4.11 # -# written by Stephen Beckstrom-Sternberg # -# Creates SNP heat/cluster maps # -# from SNP matrices # -# - modularized # -########################################### - -import argparse -import logging -import pandas as pd -import numpy as np -import seaborn as sns -import matplotlib.pyplot as plt -from pathlib import Path - -logging.basicConfig(format='%(asctime)s - %(message)s', datefmt='%y-%b-%d %H:%M:%S', level=logging.INFO) - -parser = argparse.ArgumentParser() -parser.add_argument('-i', '--input', type=str, help='input SNP matrix file name', default='snp-dists.txt') -parser.add_argument('-o', '--out', type=str, help='final file name', default='SNP_matrix') -parser.add_argument('-t', '--type', type=str, help='file extension for final image', default = 'pdf') -parser.add_argument('-v', '--version', help='print version and exit', action='version', version='%(prog)s ' + '0.4.11') -args = parser.parse_args() - -def main(): - """ - Creates image for SNP matrix. - """ - - SNPmatrix=args.input - logging.info('Creating figure for ' + SNPmatrix) - - df = read_snp_matrix(SNPmatrix) - logging.debug('The input SNP matrix:') - logging.debug(df) - - if len(df.index) > len(df.columns): - logging.fatal('This matrix has been melted. Sorry!') - exit(0) - - df = clean_and_read_df(df) - logging.debug('The clean SNP matrix:') - logging.debug(df) - - (df, fontSize) = determine_heatmap_size(df, SNPmatrix) - - create_heatmap(df, fontSize) -# - print("Done") -def read_snp_matrix(file): - """ - Reads the SNP matrix into a pandas dataframe. - - Args: - file (str): SNP dist output file that should be converted to pandas dataframe - - Returns: - df (DataFrame): Pandas dataframe of SNP matrix. - """ - logging.debug('Determining if file is comma or tab delimited') - tabs = pd.read_csv(file, nrows=1, sep='\t').shape[1] - commas = pd.read_csv(file, nrows=1, sep=',').shape[1] - if tabs > commas: - logging.debug('The file is probably tab-delimited') - df = pd.read_csv(file, sep='\t', index_col= False) - else: - logging.debug('The file is probably comma-delimited') - df = pd.read_csv(file, sep=',', index_col= False) - - return df - -def clean_and_read_df(df): - """ - Clean and read DataFrame from lines. - - Args: - lines (list): List of strings representing lines of data. - - Returns: - df (DataFrame): Cleaned DataFrame. - """ - logging.debug('Dropping the first column') - df = df.iloc[: , 1:] - - # Convert column names to strings - df.columns = df.columns.map(str) - - # Define consensus patterns - consensus_patterns = ['snp-dists 0.8.2', '.consensus_threshold_0.6_quality_20', 'Consensus_', 'Unnamed: 0'] - - # Replace consensus patterns in column names - df.columns = df.columns.str.replace('|'.join(consensus_patterns), '', regex=True) - - # Setting the index - df = df.set_index(df.columns) - - return df - -def determine_heatmap_size(df, SNPmatrix): - numSamples = len(df.columns) - logging.info('Found ' + str(numSamples) + ' samples in ' + SNPmatrix) - - if numSamples <= 3: - logging.fatal('This matrix must have 4+ samples. Sorry!') - exit(0) - - # Set output figure size tuple based on number of samples - if (numSamples) >= 140: - fontSize = 2 - elif (numSamples) >=100: - fontSize = 4 - elif (numSamples) >=60: - fontSize = 6 - else: - fontSize=8 - - logging.debug('The fontSize will be ' + str(fontSize)) - - logging.debug('Sorting dataframe and removing empty rows/columns') - df = df.loc[df.sum(axis=1).sort_values(ascending=True).index] - df.replace([np.inf, -np.inf], np.nan) - df.dropna() - - df = df.reindex(columns=df.index) - - return (df, fontSize) - -def create_heatmap(df, fontSize): - logging.debug('Creating heatmap') - heatmap = sns.clustermap( - df, - xticklabels=True, - yticklabels=True, - vmin=0, - vmax=80, - center=20, - annot=True, - annot_kws={'size': fontSize}, - cbar_kws={"orientation": "vertical", "pad": 0.5}, - cmap='Reds_r', - linecolor="white", - linewidths=.1, - fmt='d', - col_cluster=False, - row_cluster=False - ) - -# Set orientation of axes labels - plt.setp(heatmap.ax_heatmap.get_xticklabels(), rotation=45, ha='right',fontsize=fontSize) - plt.setp(heatmap.ax_heatmap.get_yticklabels(), rotation='horizontal', fontsize=fontSize) - - plt.title('SNP matrix visualized via HeatCluster') - - heatmap.ax_row_dendrogram.set_visible(False) - heatmap.ax_col_dendrogram.set_visible(False) - - SNP_matrix = args.out - outfile = (args.out + "." + args.type) - print("\tOutput file is ", outfile) - heatmap.savefig(outfile) - - plt.show() - -if __name__ == "__main__": - main() From 9fab747a8ecf8a9312a79eef8d505b1a13c973cb Mon Sep 17 00:00:00 2001 From: Stephen Beckstrom-Sternberg Date: Wed, 25 Oct 2023 14:55:23 -0700 Subject: [PATCH 4/5] Update HeatCluster.py Added import scipy to prevent failing test --- HeatCluster.py | 1 + 1 file changed, 1 insertion(+) diff --git a/HeatCluster.py b/HeatCluster.py index 2cedfb1..6ad9afb 100755 --- a/HeatCluster.py +++ b/HeatCluster.py @@ -14,6 +14,7 @@ import numpy as np import seaborn as sns import matplotlib.pyplot as plt +import scipy from pathlib import Path logging.basicConfig(format='%(asctime)s - %(message)s', datefmt='%y-%b-%d %H:%M:%S', level=logging.INFO) From ef71cb0adbbd8438d5ca627686c32351b65af740 Mon Sep 17 00:00:00 2001 From: Stephen Beckstrom-Sternberg Date: Wed, 25 Oct 2023 15:18:48 -0700 Subject: [PATCH 5/5] Update heatcluster.yml Added pip install scipy to prevent test from failing --- .github/workflows/heatcluster.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/heatcluster.yml b/.github/workflows/heatcluster.yml index d250c12..81299e1 100644 --- a/.github/workflows/heatcluster.yml +++ b/.github/workflows/heatcluster.yml @@ -8,7 +8,7 @@ jobs: uses: actions/checkout@v3 - name: install dependencies - run: pip install argparse pandas numpy pathlib seaborn matplotlib + run: pip install argparse pandas numpy pathlib seaborn matplotlib scipy - name: test (tab-delimited) run: ./HeatCluster.py -i test/small_matrix.csv