implement temp table for hashes #193

Closed · wants to merge 2 commits
82 changes: 71 additions & 11 deletions big_scape/genbank/gbk.py
@@ -6,6 +6,8 @@

# from enum import Enum
from pathlib import Path
import random
import string
from typing import Dict, Optional
import hashlib

@@ -14,6 +16,7 @@
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature
from sqlalchemy import Column, ForeignKey, Integer, String, Table, select

# from other modules
from big_scape.errors import InvalidGBKError
@@ -34,6 +37,63 @@
# MIBIG = "mibig"
# REFERENCE = "reference"

# TODO: generalize creating temp tables. this is copied from network.py


def create_temp_hash_table(gbks: list[GBK]) -> Table:
    """Create a temporary table containing the hashes of the given GBKs

    Args:
        gbks (list[GBK]): the GBKs whose hashes to insert into the table

    Returns:
        Table: the temporary table
    """

    if DB.metadata is None:
        raise ValueError("DB metadata is None")

    if DB.engine is None:
        raise RuntimeError("DB engine is None")

    # generate a short random string for a unique temp table name
    temp_table_name = "temp_" + "".join(random.choices(string.ascii_lowercase, k=10))

    temp_table = Table(
        temp_table_name,
        DB.metadata,
        Column(
            "hash",
            String,
            ForeignKey(DB.metadata.tables["gbk"].c.hash),
            primary_key=True,
            nullable=False,
        ),
        prefixes=["TEMPORARY"],
    )

    DB.metadata.create_all(DB.engine)

    cursor = DB.engine.raw_connection().driver_connection.cursor()

    insert_query = f"""
        INSERT INTO {temp_table_name} (hash) VALUES (?);
    """

    def batch_hash(gbks: list[GBK], n: int):
        # yield hashes in batches of n so each executemany call stays bounded
        total = len(gbks)
        for ndx in range(0, total, n):
            yield [gbk.hash for gbk in gbks[ndx : min(ndx + n, total)]]

    for hash_batch in batch_hash(gbks, 1000):
        cursor.executemany(insert_query, [(x,) for x in hash_batch])  # type: ignore

    cursor.close()

    DB.commit()

    return temp_table
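
A possible follow-up for the TODO above: both this function and its twin in network.py could delegate to a single helper parameterized over the referenced column. A minimal sketch, assuming this module's DB wrapper and SQLAlchemy imports (the name create_temp_value_table and its signature are hypothetical, not part of this PR):

def create_temp_value_table(fk_column: Column, values: list[str]) -> Table:
    """Hypothetical shared helper: create and populate a temporary
    single-column table whose column references fk_column."""
    temp_table_name = "temp_" + "".join(random.choices(string.ascii_lowercase, k=10))

    temp_table = Table(
        temp_table_name,
        DB.metadata,
        Column(
            fk_column.name,
            fk_column.type,
            ForeignKey(fk_column),
            primary_key=True,
            nullable=False,
        ),
        prefixes=["TEMPORARY"],
    )

    if DB.engine is None:
        raise RuntimeError("DB engine is None")

    DB.metadata.create_all(DB.engine)

    cursor = DB.engine.raw_connection().driver_connection.cursor()
    insert_query = f"INSERT INTO {temp_table_name} ({fk_column.name}) VALUES (?);"

    # insert in bounded batches, mirroring the batching in create_temp_hash_table
    for ndx in range(0, len(values), 1000):
        cursor.executemany(insert_query, [(v,) for v in values[ndx : ndx + 1000]])
    cursor.close()
    DB.commit()

    return temp_table

create_temp_hash_table(gbks) could then reduce to create_temp_value_table(DB.metadata.tables["gbk"].c.hash, [gbk.hash for gbk in gbks]).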


class GBK:
"""
@@ -261,7 +321,7 @@ def load_many(input_gbks: list[GBK]) -> list[GBK]:
            list[GBK]: loaded GBK objects
        """

-        input_gbk_hashes = [gbk.hash for gbk in input_gbks]
+        temp_hash_table = create_temp_hash_table(input_gbks)

        if not DB.metadata:
            raise RuntimeError("DB.metadata is None")
@@ -278,7 +338,7 @@ def load_many(input_gbks: list[GBK]) -> list[GBK]:
                gbk_table.c.taxonomy,
                gbk_table.c.description,
            )
-            .where(gbk_table.c.hash.in_(input_gbk_hashes))
+            .where(gbk_table.c.hash.in_(select(temp_hash_table.c.hash)))
            .compile()
        )
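
Why the subquery shape helps: SQLAlchemy compiles in_(input_gbk_hashes) into one bound parameter per hash, and SQLite caps the number of variables in a single statement (SQLITE_MAX_VARIABLE_NUMBER, only 999 in older builds), so very large inputs can fail or degrade; in_(select(...)) compiles to a constant-size statement no matter how many GBKs are loaded. A standalone sketch of the two query shapes, using plain sqlite3 purely for illustration:

import sqlite3

con = sqlite3.connect(":memory:")
con.execute("CREATE TABLE gbk (hash TEXT PRIMARY KEY)")
hashes = [f"h{i}" for i in range(100_000)]
con.executemany("INSERT INTO gbk (hash) VALUES (?)", [(h,) for h in hashes])

# literal IN list: one "?" per hash; may exceed the variable limit
# placeholders = ",".join("?" * len(hashes))
# con.execute(f"SELECT hash FROM gbk WHERE hash IN ({placeholders})", hashes)

# temp table + subquery: statement size stays constant
con.execute("CREATE TEMP TABLE temp_hashes (hash TEXT PRIMARY KEY)")
con.executemany("INSERT INTO temp_hashes (hash) VALUES (?)", [(h,) for h in hashes])
rows = con.execute(
    "SELECT hash FROM gbk WHERE hash IN (SELECT hash FROM temp_hashes)"
).fetchall()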

@@ -616,15 +676,15 @@ def collapse_hybrids_in_cand_clusters(
                    for number in cand_cluster.proto_clusters.keys()
                ]
                merged_protocluster = MergedProtoCluster.merge(protoclusters)
-                merged_tmp_proto_clusters[
-                    merged_protocluster.number
-                ] = merged_protocluster
+                merged_tmp_proto_clusters[merged_protocluster.number] = (
+                    merged_protocluster
+                )

                # update the protocluster old:new ids for the merged protoclusters of this cand_cluster
                for proto_cluster_num in cand_cluster.proto_clusters.keys():
-                    merged_protocluster_ids[
-                        proto_cluster_num
-                    ] = merged_protocluster.number
+                    merged_protocluster_ids[proto_cluster_num] = (
+                        merged_protocluster.number
+                    )

        # now we build a new version of the tmp_proto_clusters dict that contains the merged protoclusters
        # as well as protoclusters which did not need merging, with updated unique IDs/numbers
@@ -638,9 +698,9 @@
                # this protocluster has been merged, so we need to add it to
                # the dict with its new protocluster number
                new_proto_cluster_num = merged_protocluster_ids[proto_cluster_num]
-                updated_tmp_proto_clusters[
-                    new_proto_cluster_num
-                ] = merged_tmp_proto_clusters[new_proto_cluster_num]
+                updated_tmp_proto_clusters[new_proto_cluster_num] = (
+                    merged_tmp_proto_clusters[new_proto_cluster_num]
+                )
                updated_proto_cluster_dict[new_proto_cluster_num] = None
            else:
                # protoclusters which have not been merged are added to the dict as is