Skip to content

Commit

Permalink
Merge pull request #6 from DrB-S/update20231025
Browse files Browse the repository at this point in the history
Update20231025
  • Loading branch information
DrB-S authored Oct 27, 2023
2 parents 4c5d673 + ef71cb0 commit 7679c78
Show file tree
Hide file tree
Showing 2 changed files with 78 additions and 31 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/heatcluster.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ jobs:
uses: actions/checkout@v3

- name: install dependencies
run: pip install argparse pandas numpy pathlib seaborn matplotlib
run: pip install argparse pandas numpy pathlib seaborn matplotlib scipy

- name: test (tab-delimited)
run: ./HeatCluster.py -i test/small_matrix.csv
Expand Down
107 changes: 77 additions & 30 deletions HeatCluster.py
Original file line number Diff line number Diff line change
@@ -1,34 +1,76 @@
#!/usr/bin/python3

###########################################
# HeatCluster-0.4.10 #
# HeatCluster-0.4.11 #
# written by Stephen Beckstrom-Sternberg #
# Creates SNP heat/cluster maps #
# from SNP matrices #
# - modularized #
###########################################

import argparse
import logging
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy
from pathlib import Path

logging.basicConfig(format='%(asctime)s - %(message)s', datefmt='%y-%b-%d %H:%M:%S', level=logging.INFO)

parser = argparse.ArgumentParser()
parser.add_argument('-i', '--input', type=str, help='input SNP matrix file name', default='snp-dists.txt')
parser.add_argument('-o', '--out', type=str, help='final file name', default='SNP_matrix')
parser.add_argument('-t', '--type', type=str, help='file extension for final image', default = 'pdf')
parser.add_argument('-v', '--version', help='print version and exit', action='version', version='%(prog)s ' + '0.4.10')
parser.add_argument('-v', '--version', help='print version and exit', action='version', version='%(prog)s ' + '0.4.11')
args = parser.parse_args()

def main():
    """
    Creates image for SNP matrix.

    Reads the file named by ``args.input`` into a dataframe, rejects it if it
    has more rows than columns (reported as a "melted" matrix), cleans it,
    determines the heatmap font size, and draws the heatmap.

    Raises:
        SystemExit: with status 1 when the matrix has more rows than columns.
    """
    SNPmatrix = args.input
    logging.info('Creating figure for ' + SNPmatrix)

    df = read_snp_matrix(SNPmatrix)
    logging.debug('The input SNP matrix:')
    logging.debug(df)

    if len(df.index) > len(df.columns):
        logging.fatal('This matrix has been melted. Sorry!')
        # A rejected input is a failure: exit non-zero so that shells and CI
        # jobs do not mistake it for a successful run. SystemExit is raised
        # directly rather than relying on the site-provided exit() helper.
        raise SystemExit(1)

    df = clean_and_read_df(df)
    logging.debug('The clean SNP matrix:')
    logging.debug(df)

    (df, fontSize) = determine_heatmap_size(df, SNPmatrix)

    create_heatmap(df, fontSize)

    print("Done")
def read_snp_matrix(file):
    """
    Load a SNP distance matrix file as a pandas dataframe.

    The delimiter is guessed by parsing the first data row once with tabs and
    once with commas; whichever yields more columns wins (commas on a tie).

    Args:
        file (str): SNP dist output file that should be converted to pandas dataframe
    Returns:
        df (DataFrame): Pandas dataframe of SNP matrix.
    """
    logging.debug('Determining if file is comma or tab delimited')

    # Column count seen under each candidate separator (tab probed first).
    width_by_sep = {
        sep: pd.read_csv(file, nrows=1, sep=sep).shape[1]
        for sep in ('\t', ',')
    }

    if width_by_sep['\t'] > width_by_sep[',']:
        logging.debug('The file is probably tab-delimited')
        delimiter = '\t'
    else:
        logging.debug('The file is probably comma-delimited')
        delimiter = ','

    # index_col=False keeps the first column as data rather than as the index.
    return pd.read_csv(file, sep=delimiter, index_col=False)

def clean_and_read_df(df):
Expand All @@ -41,36 +83,32 @@ def clean_and_read_df(df):
Returns:
df (DataFrame): Cleaned DataFrame.
"""
# Define consensus patterns
logging.debug('Dropping the first column')
df = df.iloc[: , 1:]

# Convert column names to strings
df.columns = df.columns.map(str)

# Define consensus patterns
consensus_patterns = ['snp-dists 0.8.2', '.consensus_threshold_0.6_quality_20', 'Consensus_', 'Unnamed: 0']

# Replace consensus patterns in column names
df.columns = df.columns.str.replace('|'.join(consensus_patterns), '', regex=True)
# Replace consensus patterns in entire dataframe to change row names
df = df.replace(consensus_patterns, '', regex=True)

# Keep only numeric columns
df = df.set_index(df.columns[0])
df.dropna(axis=0, inplace=True)
df.dropna(axis=1, inplace=True)
return df
# Setting the index
df = df.set_index(df.columns)

def main():
if args.input:
path = args.input
else:
try:
path = Path('./snp-dists.txt')
path.resolve(strict=True)
except FileNotFoundError:
path = Path('./snp_matrix.txt')
return df

print("Using file path:", path)
lines = read_snp_matrix(path)
numSamples = len(lines) - 1
def determine_heatmap_size(df, SNPmatrix):
numSamples = len(df.columns)
logging.info('Found ' + str(numSamples) + ' samples in ' + SNPmatrix)

df = clean_and_read_df(lines)
if numSamples <= 3:
logging.fatal('This matrix must have 4+ samples. Sorry!')
exit(0)

# Set output figure size tuple based on number of samples
if (numSamples) >= 140:
fontSize = 2
elif (numSamples) >=100:
Expand All @@ -79,13 +117,20 @@ def main():
fontSize = 6
else:
fontSize=8


logging.debug('The fontSize will be ' + str(fontSize))

logging.debug('Sorting dataframe and removing empty rows/columns')
df = df.loc[df.sum(axis=1).sort_values(ascending=True).index]
df.replace([np.inf, -np.inf], np.nan)
df.dropna()

df = df.reindex(columns=df.index)
print("df after re-indexing columns:\n\n",df,"\n\n")

return (df, fontSize)

def create_heatmap(df, fontSize):
logging.debug('Creating heatmap')
heatmap = sns.clustermap(
df,
xticklabels=True,
Expand All @@ -107,16 +152,18 @@ def main():
# Set orientation of axes labels
plt.setp(heatmap.ax_heatmap.get_xticklabels(), rotation=45, ha='right',fontsize=fontSize)
plt.setp(heatmap.ax_heatmap.get_yticklabels(), rotation='horizontal', fontsize=fontSize)

plt.title('SNP matrix visualized via HeatCluster')

heatmap.ax_row_dendrogram.set_visible(False)
heatmap.ax_col_dendrogram.set_visible(False)

heatmap.savefig('SNP_matrix.pdf')
SNP_matrix = args.out
outfile = (args.out + "." + args.type)
print("\tOutput file is ", outfile)
heatmap.savefig(outfile)

plt.show()
print("Done")

if __name__ == "__main__":
main()

0 comments on commit 7679c78

Please sign in to comment.