Minor efficiency updates

SMD-Bioinformatics-Lund · Jul 5, 2024 · e80c7cb
1 parent f9fc094
commit e80c7cb
Showing 1 changed file with 11 additions and 8 deletions.
diff --git a/jasentool/matrix.py b/jasentool/matrix.py
@@ -1,7 +1,9 @@
 """Module for validating pipelines"""
 
 import os
+import sys
 import json
+import numpy as np
 import pandas as pd
 import seaborn as sns
 import matplotlib.pyplot as plt
@@ -38,19 +40,20 @@ def get_jasen_cgmlst_data(self, sample_id):
 
     def compare_cgmlst_alleles(self, row_cgmlst_alleles, col_cgmlst_alleles):
         """Parse through cgmlst alleles of old and new pipeline and compare results"""
-        match_count, total_count = 0, 0
-        for idx, row_allele in enumerate(row_cgmlst_alleles):
-            if str(row_allele) == str(col_cgmlst_alleles[idx]):
-                match_count += 1
-            total_count += 1
+        row_cgmlst_alleles = np.array(row_cgmlst_alleles, dtype=str)  # str dtype keeps the old str(a) == str(b) semantics
+        col_cgmlst_alleles = np.array(col_cgmlst_alleles, dtype=str)
+        matches = row_cgmlst_alleles == col_cgmlst_alleles
+        match_count = int(np.sum(matches))  # plain int, as the loop version returned
         return match_count
 
     def generate_matrix(self, sample_ids, get_cgmlst_data):
         matrix_df = pd.DataFrame(index=sample_ids, columns=sample_ids)
+        id_allele_dict = {sample_id: get_cgmlst_data(sample_id) for sample_id in sample_ids}  # fetch each sample once, not once per matrix cell
+        print(f"Cached cgMLST data for {len(id_allele_dict)} samples", file=sys.stderr)  # sys.getsizeof is shallow, so a byte count here would mislead
         for row_sample in sample_ids:
-            row_sample_cgmlst = get_cgmlst_data(row_sample)
+            row_sample_cgmlst = id_allele_dict[row_sample]
             for col_sample in sample_ids:
-                col_sample_cgmlst = get_cgmlst_data(col_sample)
+                col_sample_cgmlst = id_allele_dict[col_sample]
                 if row_sample_cgmlst and col_sample_cgmlst:
                     matrix_df.loc[row_sample, col_sample] = self.compare_cgmlst_alleles(row_sample_cgmlst, col_sample_cgmlst)
         return matrix_df
@@ -69,7 +72,7 @@ def run(self, input_files, output_fpaths):
         sample_ids = [os.path.basename(input_file).replace("_result.json", "") for input_file in input_files]
         cgviz_matrix_df = self.generate_matrix(sample_ids, self.get_cgviz_cgmlst_data)
         jasen_matrix_df = self.generate_matrix(sample_ids, self.get_jasen_cgmlst_data)
-        distance_df = abs(jasen_matrix_df - cgviz_matrix_df)
+        distance_df = jasen_matrix_df - cgviz_matrix_df
         distance_df = distance_df.astype(float)
         distance_df.to_csv(output_csv_fpath, index=True, header=True)
         self.plot_heatmap(distance_df, output_plot_fpath)