Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update20231025 #6

Merged
merged 5 commits into from
Oct 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/heatcluster.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ jobs:
uses: actions/checkout@v3

- name: install dependencies
run: pip install argparse pandas numpy pathlib seaborn matplotlib
run: pip install argparse pandas numpy pathlib seaborn matplotlib scipy

- name: test (tab-delimited)
run: ./HeatCluster.py -i test/small_matrix.csv
Expand Down
107 changes: 77 additions & 30 deletions HeatCluster.py
Original file line number Diff line number Diff line change
@@ -1,34 +1,76 @@
#!/usr/bin/python3

###########################################
# HeatCluster-0.4.10 #
# HeatCluster-0.4.11 #
# written by Stephen Beckstrom-Sternberg #
# Creates SNP heat/cluster maps #
# from SNP matrices #
# - modularized #
###########################################

import argparse
import logging
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy
from pathlib import Path

# Timestamped log lines; INFO and above are emitted.
logging.basicConfig(format='%(asctime)s - %(message)s', datefmt='%y-%b-%d %H:%M:%S', level=logging.INFO)

# Command-line interface. Every option has a default, so the script can be
# run with no arguments.
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--input', type=str, help='input SNP matrix file name', default='snp-dists.txt')
parser.add_argument('-o', '--out', type=str, help='final file name', default='SNP_matrix')
parser.add_argument('-t', '--type', type=str, help='file extension for final image', default = 'pdf')
# Register -v/--version exactly once: argparse rejects a second registration
# of the same option strings as a conflict.
parser.add_argument('-v', '--version', help='print version and exit', action='version', version='%(prog)s ' + '0.4.11')
args = parser.parse_args()

def main():
    """
    Creates image for SNP matrix.

    Reads the matrix named by ``args.input``, stops early if it has more
    rows than columns (treated as a melted matrix), cleans it, determines
    the font size, and draws the heatmap. Prints "Done" when finished.
    """
    # Local import so this function does not depend on the site-injected
    # ``exit`` helper, which is not guaranteed to exist in every interpreter
    # start-up mode.
    import sys

    SNPmatrix = args.input
    logging.info('Creating figure for ' + SNPmatrix)

    df = read_snp_matrix(SNPmatrix)
    logging.debug('The input SNP matrix:')
    logging.debug(df)

    # A square matrix never has more rows than columns; if it does, the
    # input is assumed to be in melted (long) form and is not supported.
    if len(df.index) > len(df.columns):
        logging.fatal('This matrix has been melted. Sorry!')
        # Exit status is kept at 0, as in the original code.
        sys.exit(0)

    df = clean_and_read_df(df)
    logging.debug('The clean SNP matrix:')
    logging.debug(df)

    (df, fontSize) = determine_heatmap_size(df, SNPmatrix)

    create_heatmap(df, fontSize)

    print("Done")
def read_snp_matrix(file):
    """
    Reads the SNP matrix into a pandas dataframe.

    The delimiter is guessed by parsing the header plus one data row twice,
    once as tab-separated and once as comma-separated, and keeping whichever
    yields more columns. Ties are read as comma-delimited.

    Args:
        file (str): SNP dist output file that should be converted to pandas dataframe

    Returns:
        df (DataFrame): Pandas dataframe of SNP matrix.
    """
    logging.debug('Determining if file is comma or tab delimited')
    tabs = pd.read_csv(file, nrows=1, sep='\t').shape[1]
    commas = pd.read_csv(file, nrows=1, sep=',').shape[1]

    if tabs > commas:
        logging.debug('The file is probably tab-delimited')
        sep = '\t'
    else:
        logging.debug('The file is probably comma-delimited')
        sep = ','

    # index_col=False keeps the first column as ordinary data rather than
    # letting pandas use it as the index.
    df = pd.read_csv(file, sep=sep, index_col=False)

    return df

def clean_and_read_df(df):
Expand All @@ -41,36 +83,32 @@ def clean_and_read_df(df):
Returns:
df (DataFrame): Cleaned DataFrame.
"""
# Define consensus patterns
logging.debug('Dropping the first column')
df = df.iloc[: , 1:]

# Convert column names to strings
df.columns = df.columns.map(str)

# Define consensus patterns
consensus_patterns = ['snp-dists 0.8.2', '.consensus_threshold_0.6_quality_20', 'Consensus_', 'Unnamed: 0']

# Replace consensus patterns in column names
df.columns = df.columns.str.replace('|'.join(consensus_patterns), '', regex=True)
# Replace consensus patterns in entire dataframe to change row names
df = df.replace(consensus_patterns, '', regex=True)

# Keep only numeric columns
df = df.set_index(df.columns[0])
df.dropna(axis=0, inplace=True)
df.dropna(axis=1, inplace=True)
return df
# Setting the index
df = df.set_index(df.columns)

def main():
if args.input:
path = args.input
else:
try:
path = Path('./snp-dists.txt')
path.resolve(strict=True)
except FileNotFoundError:
path = Path('./snp_matrix.txt')
return df

print("Using file path:", path)
lines = read_snp_matrix(path)
numSamples = len(lines) - 1
def determine_heatmap_size(df, SNPmatrix):
numSamples = len(df.columns)
logging.info('Found ' + str(numSamples) + ' samples in ' + SNPmatrix)

df = clean_and_read_df(lines)
if numSamples <= 3:
logging.fatal('This matrix must have 4+ samples. Sorry!')
exit(0)

# Set output figure size tuple based on number of samples
if (numSamples) >= 140:
fontSize = 2
elif (numSamples) >=100:
Expand All @@ -79,13 +117,20 @@ def main():
fontSize = 6
else:
fontSize=8


logging.debug('The fontSize will be ' + str(fontSize))

logging.debug('Sorting dataframe and removing empty rows/columns')
df = df.loc[df.sum(axis=1).sort_values(ascending=True).index]
df.replace([np.inf, -np.inf], np.nan)
df.dropna()

df = df.reindex(columns=df.index)
print("df after re-indexing columns:\n\n",df,"\n\n")

return (df, fontSize)

def create_heatmap(df, fontSize):
logging.debug('Creating heatmap')
heatmap = sns.clustermap(
df,
xticklabels=True,
Expand All @@ -107,16 +152,18 @@ def main():
# Set orientation of axes labels
plt.setp(heatmap.ax_heatmap.get_xticklabels(), rotation=45, ha='right',fontsize=fontSize)
plt.setp(heatmap.ax_heatmap.get_yticklabels(), rotation='horizontal', fontsize=fontSize)

plt.title('SNP matrix visualized via HeatCluster')

heatmap.ax_row_dendrogram.set_visible(False)
heatmap.ax_col_dendrogram.set_visible(False)

heatmap.savefig('SNP_matrix.pdf')
SNP_matrix = args.out
outfile = (args.out + "." + args.type)
print("\tOutput file is ", outfile)
heatmap.savefig(outfile)

plt.show()
print("Done")

# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()