Skip to content

Commit

Permalink
Minor efficiency updates
Browse files Browse the repository at this point in the history
  • Loading branch information
ryanjameskennedy committed Jul 5, 2024
1 parent f9fc094 commit e80c7cb
Showing 1 changed file with 11 additions and 8 deletions.
19 changes: 11 additions & 8 deletions jasentool/matrix.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
"""Module for validating pipelines"""

import os
import sys
import json
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
Expand Down Expand Up @@ -38,19 +40,20 @@ def get_jasen_cgmlst_data(self, sample_id):

def compare_cgmlst_alleles(self, row_cgmlst_alleles, col_cgmlst_alleles):
    """Count the positions at which two cgMLST allele sequences agree.

    Alleles are compared as strings, so an allele stored as the integer
    ``1`` in one pipeline's output matches the string ``"1"`` in the other.

    Args:
        row_cgmlst_alleles: Sequence of allele calls for the row sample.
        col_cgmlst_alleles: Sequence of allele calls for the column sample.

    Returns:
        Number of positions whose string representations are equal.
    """
    # Coerce both sides to a string dtype: without this, an all-integer
    # list compared against a list containing strings never matches.
    row_alleles = np.asarray(row_cgmlst_alleles, dtype=str)
    col_alleles = np.asarray(col_cgmlst_alleles, dtype=str)
    # Compare only the positions present in both sequences; arrays of
    # different length cannot be compared element-wise by NumPy.
    # NOTE(review): a length mismatch probably means differing schemas --
    # confirm whether it should be reported instead of tolerated.
    shared_len = min(row_alleles.size, col_alleles.size)
    matches = row_alleles[:shared_len] == col_alleles[:shared_len]
    return int(np.count_nonzero(matches))

def generate_matrix(self, sample_ids, get_cgmlst_data):
    """Build a sample-by-sample matrix of matching cgMLST allele counts.

    Args:
        sample_ids: Sample identifiers used as both row and column labels.
        get_cgmlst_data: Callable taking a sample id and returning that
            sample's allele data (a falsy value means "no data").

    Returns:
        A DataFrame indexed and columned by ``sample_ids``. Each cell holds
        the result of ``compare_cgmlst_alleles`` for that pair; cells where
        either sample has no allele data are left unset (NaN).
    """
    matrix_df = pd.DataFrame(index=sample_ids, columns=sample_ids)
    # Load every sample's alleles exactly once; the pairwise loops below
    # read only from this cache instead of calling the loader again.
    id_allele_dict = {sample_id: get_cgmlst_data(sample_id) for sample_id in sample_ids}
    # sys.getsizeof is shallow: it measures the dict container only, not
    # the allele data it references.
    print(f"The sample id - alleles dict is approximately {sys.getsizeof(id_allele_dict)} bytes in size")
    for row_sample in sample_ids:
        row_sample_cgmlst = id_allele_dict[row_sample]
        if not row_sample_cgmlst:
            # No data for this row sample: every cell in the row stays unset.
            continue
        for col_sample in sample_ids:
            col_sample_cgmlst = id_allele_dict[col_sample]
            if col_sample_cgmlst:
                matrix_df.loc[row_sample, col_sample] = self.compare_cgmlst_alleles(
                    row_sample_cgmlst, col_sample_cgmlst
                )
    return matrix_df
Expand All @@ -69,7 +72,7 @@ def run(self, input_files, output_fpaths):
sample_ids = [os.path.basename(input_file).replace("_result.json", "") for input_file in input_files]
cgviz_matrix_df = self.generate_matrix(sample_ids, self.get_cgviz_cgmlst_data)
jasen_matrix_df = self.generate_matrix(sample_ids, self.get_jasen_cgmlst_data)
distance_df = abs(jasen_matrix_df - cgviz_matrix_df)
distance_df = jasen_matrix_df - cgviz_matrix_df
distance_df = distance_df.astype(float)
distance_df.to_csv(output_csv_fpath, index=True, header=True)
self.plot_heatmap(distance_df, output_plot_fpath)

0 comments on commit e80c7cb

Please sign in to comment.