Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Chore/edge params #187

Merged
merged 4 commits into from
Oct 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions big_scape/cli/cli_common_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,8 +283,8 @@ def common_cluster_query(fn):
"list of domains of each BGC are compared; 'local': Longest Common "
"Subcluster mode. Redefine the subset of the domains used to "
"calculate distance by trying to find the longest slice of common "
"domain content per gene in both BGCs, then expand each slice. "
"'glocal': Similar to local, but expansion assumes full expansion "
"domain content per gene in both BGCs, then extend each slice. "
"'glocal': Similar to local, but extension assumes full extension "
"of the shortest upstream/downstream arms in a compared pair. "
"'auto': use glocal when at least one of the BGCs in each pair "
"has the 'contig_edge' annotation from antiSMASH v4+, otherwise "
Expand Down
69 changes: 36 additions & 33 deletions big_scape/cli/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,15 @@

# from python
import yaml
import hashlib
from pathlib import Path
from typing import Optional


# config class
class BigscapeConfig:
# static properties
# static default properties
HASH: str = ""

# PROFILER
PROFILER_UPDATE_INTERVAL: float = 0.5
Expand All @@ -22,18 +25,18 @@ class BigscapeConfig:
DOMAIN_OVERLAP_CUTOFF: float = 0.1

# LCS
REGION_MIN_LCS_LEN: int = 3
PROTO_MIN_LCS_LEN: int = 3

# EXPAND
REGION_MIN_EXPAND_LEN: int = 5
REGION_MIN_EXPAND_LEN_BIO: int = 5
PROTO_MIN_EXPAND_LEN: int = 3
NO_MIN_CLASSES: list[str] = ["Terpene"]
EXPAND_MATCH_SCORE: int = 5
EXPAND_MISMATCH_SCORE: int = -3
EXPAND_GAP_SCORE: int = -2
EXPAND_MAX_MATCH_PERC: float = 0.1
REGION_MIN_LCS_LEN: float = 0.1
PROTO_MIN_LCS_LEN: float = 0.0

# EXTEND
REGION_MIN_EXTEND_LEN: float = 0.3
REGION_MIN_EXTEND_LEN_BIO: float = 0.2
PROTO_MIN_EXTEND_LEN: float = 0.2
NO_MIN_CLASSES: list[str] = ["terpene"]
EXTEND_MATCH_SCORE: int = 5
EXTEND_MISMATCH_SCORE: int = -3
EXTEND_GAP_SCORE: int = -2
EXTEND_MAX_MATCH_PERC: float = 0.1

# CLUSTER
PREFERENCE: float = 0.0
Expand Down Expand Up @@ -151,17 +154,17 @@ class BigscapeConfig:
}

@staticmethod
def parse_config(run: dict) -> None:
"""parses config file
def parse_config(config_file_path: Path, log_path: Optional[Path] = None) -> None:
"""parses config file and writes a config.log if log_path is given

Args:
run (dict): run parameters
config_file_path (Path): path to passed config file
log_path (Optional[Path]): path to log file. Defaults to None.
"""

config_file_path = run["config_file_path"]

with open(config_file_path) as f:
config = yaml.load(f, Loader=yaml.FullLoader)
with open(config_file_path, "rb") as f:
content = f.read()
BigscapeConfig.HASH = hashlib.sha256(content).hexdigest()
config = yaml.load(content, Loader=yaml.FullLoader)

# PROFILER
BigscapeConfig.PROFILER_UPDATE_INTERVAL = config["PROFILER_UPDATE_INTERVAL"]
Expand All @@ -179,15 +182,15 @@ def parse_config(run: dict) -> None:
BigscapeConfig.REGION_MIN_LCS_LEN = config["REGION_MIN_LCS_LEN"]
BigscapeConfig.PROTO_MIN_LCS_LEN = config["PROTO_MIN_LCS_LEN"]

# EXPAND
BigscapeConfig.REGION_MIN_EXPAND_LEN = config["REGION_MIN_EXPAND_LEN"]
BigscapeConfig.REGION_MIN_EXPAND_LEN_BIO = config["REGION_MIN_EXPAND_LEN_BIO"]
BigscapeConfig.PROTO_MIN_EXPAND_LEN = config["PROTO_MIN_EXPAND_LEN"]
# EXTEND
BigscapeConfig.REGION_MIN_EXTEND_LEN = config["REGION_MIN_EXTEND_LEN"]
BigscapeConfig.REGION_MIN_EXTEND_LEN_BIO = config["REGION_MIN_EXTEND_LEN_BIO"]
BigscapeConfig.PROTO_MIN_EXTEND_LEN = config["PROTO_MIN_EXTEND_LEN"]
BigscapeConfig.NO_MIN_CLASSES = config["NO_MIN_CLASSES"]
BigscapeConfig.EXPAND_MATCH_SCORE = config["EXPAND_MATCH_SCORE"]
BigscapeConfig.EXPAND_MISMATCH_SCORE = config["EXPAND_MISMATCH_SCORE"]
BigscapeConfig.EXPAND_GAP_SCORE = config["EXPAND_GAP_SCORE"]
BigscapeConfig.EXPAND_MAX_MATCH_PERC = config["EXPAND_MAX_MATCH_PERC"]
BigscapeConfig.EXTEND_MATCH_SCORE = config["EXTEND_MATCH_SCORE"]
BigscapeConfig.EXTEND_MISMATCH_SCORE = config["EXTEND_MISMATCH_SCORE"]
BigscapeConfig.EXTEND_GAP_SCORE = config["EXTEND_GAP_SCORE"]
BigscapeConfig.EXTEND_MAX_MATCH_PERC = config["EXTEND_MAX_MATCH_PERC"]

# CLUSTER
BigscapeConfig.PREFERENCE = config["PREFERENCE"]
Expand All @@ -206,17 +209,17 @@ def parse_config(run: dict) -> None:
BigscapeConfig.LEGACY_ANTISMASH_CLASSES = legacy_classes

# write config log
BigscapeConfig.write_config_log(run, config)
if log_path is not None:
BigscapeConfig.write_config_log(log_path, config)

@staticmethod
def write_config_log(run: dict, config: dict) -> None:
def write_config_log(log_path: Path, config: dict) -> None:
"""writes config log file

Args:
run (dict): run parameters
log_path (Path): path to log file
config (dict): config settings
"""
log_path = run["log_path"]
config_log_path = Path(str(log_path).replace(".log", ".config.log"))

with open(config_log_path, "w") as config_log:
Expand Down
28 changes: 14 additions & 14 deletions big_scape/comparison/extend.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@


def reset(pair: RecordPair) -> None:
"""Resets the expansion of a pair's comparable region
"""Resets the extension of a pair's comparable region

Args:
pair: The record pair to reset
Expand All @@ -31,7 +31,7 @@ def reset(pair: RecordPair) -> None:
pair.comparable_region.reverse = False


def len_check(pair: RecordPair, min_len_perc: int) -> bool:
def len_check(pair: RecordPair, min_len_perc: float) -> bool:
"""Checks if a pair's comparable region is of sufficient length

Length is checked based on domains, the comparable region should be longer than
Expand Down Expand Up @@ -84,7 +84,7 @@ def biosynthetic_check(pair: RecordPair) -> bool:
return False


def expand_glocal(pair: RecordPair) -> None:
def extend_glocal(pair: RecordPair) -> None:
"""Includes extension of shortest upstream/downstream arms in comparable region

Args:
Expand Down Expand Up @@ -120,9 +120,9 @@ def extend(
gap: int,
max_match_dist_perc: float,
) -> None:
"""Expands a comparable region
"""Extends a comparable region

This will expand the included set of cds in a pair based on a scoring
This will extend the included set of cds in a pair based on a scoring
mechanism. If the pair in the comparable region consists of protoclusters,
this will not be limited to the bounds of those protoclusters

Expand Down Expand Up @@ -157,7 +157,7 @@ def extend(
# the shorter cds is the query
# the longer cds is the target
# generate a dictionary of domain positions in the target
# we try to expand the target by finding matching domains in the query
# we try to extend the target by finding matching domains in the query
# when we do not find a match, this counts as a mismatch and we add a penalty
# when we find a match, add gap penalties for the number of gaps we have to insert
# if we find a match before the current position, subtract a gap penalty
Expand Down Expand Up @@ -238,7 +238,7 @@ def extend(
max_match_dist,
)

# expand left
# extend left
if len(a_domains) > len(b_domains):
pair.comparable_region.b_start -= query_exp
pair.comparable_region.domain_b_start -= query_dom_exp
Expand Down Expand Up @@ -353,8 +353,8 @@ def score_extend(
considered a mismatch

Returns:
tuple: A tuple containing the query expansion index on cds and domain level,
target expansion index on cds and domain level, and the maximum score.
tuple: A tuple containing the query extension index on cds and domain level,
target extension index on cds and domain level, and the maximum score.
"""
score = 0
max_score = 0
Expand Down Expand Up @@ -451,7 +451,7 @@ def score_extend_rev(
considered a mismatch

Returns:
tuple: A tuple containing the query expansion index, target expansion index,
tuple: A tuple containing the query extension index, target extension index,
and the maximum score.
"""
score = 0
Expand Down Expand Up @@ -520,17 +520,17 @@ def score_extend_rev(


def extend_greedy(pair: RecordPair) -> None:
"""Expands a comparable region in a greedy fashion
"""Extends a comparable region in a greedy fashion

This will expand the included set of cds in a pair as much as it can,
This will extend the included set of cds in a pair as much as it can,
based on the common domains found in the pair

E.g. if we have the following two records:

A: XAXXBXXXXCX
B: XXXXXXXAXXXXXBCXXXXXXXXX

The comparable region should be expanded to the following:
The comparable region should be extended to the following:

A: XAXXBXXXXCX
[-------]
Expand Down Expand Up @@ -596,7 +596,7 @@ def extend_greedy(pair: RecordPair) -> None:

def extend_simple_match(pair: RecordPair, match, gap):
"""Performs extension by first creating a simple match matrix, then
performing a match/gap extentsion similar to legacy expansion
performing a match/gap extension similar to legacy extension

This method expects LCS to have been performed on the pair, and will
do all four directions at once
Expand Down
Loading
Loading