Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Release/2.0.0 beta.3 #241

Merged
merged 21 commits into from
Dec 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 0 additions & 28 deletions .github/workflows/badges.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,34 +5,6 @@ on:
branches:
- master

jobs:
run_pytest:
runs-on: ubuntu-latest
if: ${{ !contains(github.event.head_commit.message, 'docs') && !contains(github.event.head_commit.message, 'documentation') }}
timeout-minutes: 15
steps:
- name: Checkout code
uses: actions/checkout@v3

- name: Set up mamba environment
uses: mamba-org/setup-micromamba@v1
with:
micromamba-version: '1.3.1-0'
environment-file: environment.yml
environment-name: BiG-SCAPE
init-shell: bash
generate-run-shell: true

- name: Install dependencies
shell: micromamba-shell {0}
run: |
python -m pip install pytest

- name: Test with Pytest
shell: micromamba-shell {0}
run: |
pytest

generate_coverage:
runs-on: ubuntu-latest
if: ${{ !contains(github.event.head_commit.message, 'docs') && !contains(github.event.head_commit.message, 'documentation') }}
Expand Down
38 changes: 38 additions & 0 deletions .github/workflows/run-tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
name: Run tests

on:
push:
branches:
- master
- dev
- release/*
- feature/*
- hotfix/*

jobs:
run_pytest:
runs-on: ubuntu-latest
if: ${{ !contains(github.event.head_commit.message, 'docs') && !contains(github.event.head_commit.message, 'documentation') }}
timeout-minutes: 15
steps:
- name: Checkout code
uses: actions/checkout@v3

- name: Set up mamba environment
uses: mamba-org/setup-micromamba@v1
with:
micromamba-version: '1.3.1-0'
environment-file: environment.yml
environment-name: BiG-SCAPE
init-shell: bash
generate-run-shell: true

- name: Install dependencies
shell: micromamba-shell {0}
run: |
python -m pip install pytest

- name: Test with Pytest
shell: micromamba-shell {0}
run: |
pytest
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
![License](https://img.shields.io/github/license/medema-group/BiG-SCAPE)
![Github downloads](https://img.shields.io/github/downloads/medema-group/BiG-SCAPE/latest/total?label=Github%20downloads%20%28latest%29)
![Conda downloads](https://img.shields.io/conda/dn/bioconda/bigscape?label=Conda%20downloads)
![Test workflow](https://github.com/medema-group/BiG-SCAPE/actions/workflows/test.yml/badge.svg)
![Test workflow](https://github.com/medema-group/BiG-SCAPE/actions/workflows/run-tests.yml/badge.svg)
![Docker deploy workflow](https://github.com/medema-group/BiG-SCAPE/actions/workflows/deploy-docker.yml/badge.svg)
![Coverage](https://medema-group.github.io/BiG-SCAPE/badges/coverage.svg)
![Pylint](https://medema-group.github.io/BiG-SCAPE/badges/pylint.svg)

Expand Down
27 changes: 26 additions & 1 deletion big_scape/cli/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,6 @@ def parse_config(config_file_path: Path, log_path: Optional[Path] = None) -> Non
"""
with open(config_file_path, "rb") as f:
content = f.read()
BigscapeConfig.HASH = hashlib.sha256(content).hexdigest()
config = yaml.load(content, Loader=yaml.FullLoader)

# PROFILER
Expand Down Expand Up @@ -212,10 +211,36 @@ def parse_config(config_file_path: Path, log_path: Optional[Path] = None) -> Non
legacy_classes[group] = set(classes)
BigscapeConfig.LEGACY_ANTISMASH_CLASSES = legacy_classes

# store relevant hash
BigscapeConfig.generate_relevant_hash()

# write config log
if log_path is not None:
BigscapeConfig.write_config_log(log_path, config)

@staticmethod
def generate_relevant_hash() -> None:
    """Store a digest of the data-relevant config values in BigscapeConfig.HASH

    Only the settings collected below take part in the digest; these are the
    values that might/will invalidate existing data when they change. The
    digest is the SHA-256 hex string of the UTF-8 encoded str() of the tuple,
    so both the order and the repr of each value matter.
    """
    # NOTE(review): str() of a set is not guaranteed to be ordered the same way
    # in every interpreter run -- confirm none of these values is a set
    relevant_values = (
        BigscapeConfig.MERGED_CAND_CLUSTER_TYPE,
        BigscapeConfig.CDS_OVERLAP_CUTOFF,
        BigscapeConfig.DOMAIN_OVERLAP_CUTOFF,
        BigscapeConfig.REGION_MIN_LCS_LEN,
        BigscapeConfig.PROTO_MIN_LCS_LEN,
        BigscapeConfig.REGION_MIN_EXTEND_LEN,
        BigscapeConfig.REGION_MIN_EXTEND_LEN_BIO,
        BigscapeConfig.PROTO_MIN_EXTEND_LEN,
        BigscapeConfig.NO_MIN_CLASSES,
        BigscapeConfig.EXTEND_MATCH_SCORE,
        BigscapeConfig.EXTEND_MISMATCH_SCORE,
        BigscapeConfig.EXTEND_GAP_SCORE,
        BigscapeConfig.EXTEND_MAX_MATCH_PERC,
        BigscapeConfig.ANCHOR_DOMAINS,
    )

    serialised = str(relevant_values).encode("utf-8")
    digest = hashlib.sha256(serialised)
    BigscapeConfig.HASH = digest.hexdigest()

@staticmethod
def write_config_log(log_path: Path, config: dict) -> None:
"""writes config log file
Expand Down
4 changes: 2 additions & 2 deletions big_scape/data/sqlite.py
Original file line number Diff line number Diff line change
Expand Up @@ -481,8 +481,8 @@ def check_config_hash():

if latest_config and BigscapeConfig.HASH != latest_config:
raise RuntimeError(
"Config file values have changed from the previous run! "
"Existing data is not guarenteed to be reusable, please "
"Relevant config file values have changed (see config.log) from the "
"previous run! Existing data is not guarenteed to be reusable, please "
"run with a fresh output directory/database."
)

Expand Down
14 changes: 10 additions & 4 deletions big_scape/genbank/candidate_cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

# from dependencies
from Bio.SeqFeature import SeqFeature
from sqlalchemy import Table, select

# from other modules
from big_scape.data import DB
Expand Down Expand Up @@ -169,7 +170,7 @@ def __repr__(self) -> str:
return f"{self.parent_gbk} Candidate cluster {self.number} {self.nt_start}-{self.nt_stop} "

@staticmethod
def load_all(region_dict: dict[int, Region]):
def load_all(region_dict: dict[int, Region], temp_gbk_id_table: Table = None):
"""Load all CandidateCluster objects from the database

This function populates the CandidateCluster lists in the Regions provided in
Expand Down Expand Up @@ -198,10 +199,15 @@ def load_all(region_dict: dict[int, Region]):
record_table.c.product,
)
.where(record_table.c.record_type == "cand_cluster")
.where(record_table.c.parent_id.in_(region_dict.keys()))
.compile()
)

if temp_gbk_id_table is not None:
candidate_cluster_select_query = candidate_cluster_select_query.where(
record_table.c.gbk_id.in_(select(temp_gbk_id_table.c.gbk_id))
)

candidate_cluster_select_query = candidate_cluster_select_query.compile()

cursor_result = DB.execute(candidate_cluster_select_query)

candidate_cluster_dict = {}
Expand Down Expand Up @@ -230,4 +236,4 @@ def load_all(region_dict: dict[int, Region]):
# add to dictionary
candidate_cluster_dict[result.id] = new_candidate_cluster

ProtoCluster.load_all(candidate_cluster_dict)
ProtoCluster.load_all(candidate_cluster_dict, temp_gbk_id_table)
12 changes: 9 additions & 3 deletions big_scape/genbank/cds.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from Bio.SeqFeature import SeqFeature
from Bio.Seq import Seq
from Bio import BiopythonWarning
from sqlalchemy import Table, select

# from other modules
from big_scape.errors import InvalidGBKError
Expand Down Expand Up @@ -320,7 +321,7 @@ def len_nt_overlap(cds_a: CDS, cds_b: CDS) -> int:
return max(0, right - left)

@staticmethod
def load_all(gbk_dict: dict[int, GBK]) -> None:
def load_all(gbk_dict: dict[int, GBK], temp_gbk_id_table: Table = None) -> None:
"""Load all Region objects from the database

This function populates the region objects in the GBKs provided in the input
Expand Down Expand Up @@ -349,10 +350,15 @@ def load_all(gbk_dict: dict[int, GBK]) -> None:
cds_table.c.aa_seq,
)
.order_by(cds_table.c.orf_num)
.where(cds_table.c.gbk_id.in_(gbk_dict.keys()))
.compile()
)

if temp_gbk_id_table is not None:
region_select_query = region_select_query.where(
cds_table.c.gbk_id.in_(select(temp_gbk_id_table.c.gbk_id))
)

region_select_query = region_select_query.compile()

cursor_result = DB.execute(region_select_query)

for result in cursor_result.all():
Expand Down
125 changes: 121 additions & 4 deletions big_scape/genbank/gbk.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

# from enum import Enum
from pathlib import Path
import random
import string
from typing import Dict, Optional
import hashlib

Expand All @@ -14,6 +16,7 @@
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature
from sqlalchemy import Column, ForeignKey, Integer, String, Table, select

# from other modules
from big_scape.errors import InvalidGBKError
Expand All @@ -34,6 +37,118 @@
# MIBIG = "mibig"
# REFERENCE = "reference"

# TODO: generalize creating temp tables. this is copied from network.py


def _create_temp_gbk_key_table(
    column_name: str,
    column_type: type,
    gbk_column: str,
    values: list,
    batch_size: int = 1000,
) -> Table:
    """Create and fill a single-column TEMPORARY table that references the gbk table

    Shared implementation of create_temp_hash_table and create_temp_gbk_id_table.

    Args:
        column_name (str): name of the only column of the temporary table
        column_type (type): SQLAlchemy type of that column (e.g. String, Integer)
        gbk_column (str): name of the gbk table column used as foreign key target
        values (list): values to insert, one row per value
        batch_size (int): number of rows passed to each executemany call

    Raises:
        ValueError: if DB.metadata is None
        RuntimeError: if DB.engine is None

    Returns:
        Table: the temporary table
    """
    # validate the DB state up front: both objects are dereferenced below, so
    # checking them afterwards could never trigger
    if DB.metadata is None:
        raise ValueError("DB metadata is None")

    if DB.engine is None:
        raise RuntimeError("DB engine is None")

    # generate a short random string so repeated calls do not clash on the name
    temp_table_name = "temp_" + "".join(random.choices(string.ascii_lowercase, k=10))

    temp_table = Table(
        temp_table_name,
        DB.metadata,
        Column(
            column_name,
            column_type,
            ForeignKey(DB.metadata.tables["gbk"].c[gbk_column]),
            primary_key=True,
            nullable=False,
        ),
        prefixes=["TEMPORARY"],
    )

    # NOTE(review): the table stays registered in DB.metadata after this call;
    # nothing in this function drops it again
    DB.metadata.create_all(DB.engine)

    insert_query = f"INSERT INTO {temp_table_name} ({column_name}) VALUES (?);"

    cursor = DB.engine.raw_connection().driver_connection.cursor()
    try:
        # insert in batches to keep the parameter list of each call bounded
        for start in range(0, len(values), batch_size):
            batch = values[start : start + batch_size]
            cursor.executemany(insert_query, [(value,) for value in batch])  # type: ignore
    finally:
        # release the cursor even when an insert fails
        cursor.close()

    DB.commit()

    return temp_table


def create_temp_hash_table(gbks: list[GBK]) -> Table:
    """Create a temporary table with the hashes of given gbks

    Args:
        gbks (list[GBK]): the gbks whose hashes are stored in the table

    Raises:
        ValueError: if DB.metadata is None
        RuntimeError: if DB.engine is None

    Returns:
        Table: the temporary table, with a single "hash" column
    """
    return _create_temp_gbk_key_table(
        "hash", String, "hash", [gbk.hash for gbk in gbks]
    )


def create_temp_gbk_id_table(gbks: list[GBK]) -> Table:
    """Create a temporary table with ids of given gbks

    Args:
        gbks (list[GBK]): the gbks whose database ids are stored in the table

    Raises:
        ValueError: if DB.metadata is None
        RuntimeError: if DB.engine is None

    Returns:
        Table: the temporary table, with a single "gbk_id" column
    """
    # NOTE(review): a GBK whose _db_id is None would be bound as NULL here --
    # confirm callers only pass GBKs that already have a database id
    return _create_temp_gbk_key_table(
        "gbk_id", Integer, "id", [gbk._db_id for gbk in gbks]
    )


class GBK:
"""
Expand Down Expand Up @@ -261,7 +376,7 @@ def load_many(input_gbks: list[GBK]) -> list[GBK]:
list[GBK]: loaded GBK objects
"""

input_gbk_hashes = [gbk.hash for gbk in input_gbks]
temp_hash_table = create_temp_hash_table(input_gbks)

if not DB.metadata:
raise RuntimeError("DB.metadata is None")
Expand All @@ -278,7 +393,7 @@ def load_many(input_gbks: list[GBK]) -> list[GBK]:
gbk_table.c.taxonomy,
gbk_table.c.description,
)
.where(gbk_table.c.hash.in_(input_gbk_hashes))
.where(gbk_table.c.hash.in_(select(temp_hash_table.c.hash)))
.compile()
)

Expand All @@ -297,9 +412,11 @@ def load_many(input_gbks: list[GBK]) -> list[GBK]:
# load GBK regions. This will also populate all record levels below region
# e.g. candidate cluster, protocore if they exist

Region.load_all(gbk_dict)
temp_gbk_id_table = create_temp_gbk_id_table(input_gbks)

CDS.load_all(gbk_dict)
Region.load_all(gbk_dict, temp_gbk_id_table)

CDS.load_all(gbk_dict, temp_gbk_id_table)

return list(gbk_dict.values())

Expand Down
Loading
Loading