Merge pull request #211 from medema-group/hotfix/benchmark-label
cli args
CatarinaCarolina authored Nov 26, 2024
2 parents e3dc848 + 31cad6e commit 92daf1f
Showing 7 changed files with 253 additions and 187 deletions.
4 changes: 4 additions & 0 deletions big_scape/benchmarking/benchmark_data_loader.py
@@ -111,6 +111,10 @@ def load_computed_bs2_labels(self, data_path: Path) -> None:
run_times = [
p.stem.replace("_full", "") for p in data_path.glob("*_full.network")
]

# assume date-time is the last element, remove eventual labels
run_times = [rt[-19:] for rt in run_times]

if len(run_times) == 0:
raise FileNotFoundError("No BiG-SCAPE 2 output found")
elif len(run_times) == 1:
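For context, a minimal sketch of what the new slicing step does, assuming the BiG-SCAPE 2 run timestamp is a fixed-width 19-character suffix such as 2024-11-26_12-00-00 (the file stems below are hypothetical):

    # Hypothetical *_full.network stems: one with a user label, one without.
    stems = ["my_label_2024-11-26_12-00-00", "2024-11-25_09-30-15"]

    # Keep only the trailing 19 characters, i.e. the date-time part.
    run_times = [stem[-19:] for stem in stems]
    print(run_times)  # ['2024-11-26_12-00-00', '2024-11-25_09-30-15']
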
278 changes: 161 additions & 117 deletions big_scape/cli/cli_common_options.py

Large diffs are not rendered by default.

40 changes: 21 additions & 19 deletions big_scape/cli/cli_validations.py
@@ -244,9 +244,9 @@ def validate_includelist(ctx, param, domain_includelist_path):
return None

if not domain_includelist_path.exists():
logging.error("domain_includelist file does not exist!")
logging.error("domain-includelist file does not exist!")
raise InvalidArgumentError(
"--domain_includelist_all/any_path", domain_includelist_path
"--domain-includelist-all/any-path", domain_includelist_path
)

with domain_includelist_path.open(encoding="utf-8") as domain_includelist_file:
@@ -318,11 +318,11 @@ def validate_binning_cluster_workflow(ctx) -> None:

if ctx.obj["legacy_weights"] and not ctx.obj["classify"]:
logging.error(
"You have selected --legacy_weights but no classification method. "
"You have selected --legacy-weights but no classification method. "
"Please select any --classify method"
)
raise click.UsageError(
"You have selected --legacy_weights but no classification method. "
"You have selected --legacy-weights but no classification method. "
"Please select any --classify method"
)

@@ -350,11 +350,11 @@ def validate_binning_cluster_workflow(ctx) -> None:
if ctx.obj["hybrids_off"]:
if not (ctx.obj["classify"]):
logging.error(
"You have selected --hybrids_off but no classification method. "
"You have selected --hybrids-off but no classification method. "
"Please select any --classify method"
)
raise click.UsageError(
"You have selected --hybrids_off but no classification method. "
"You have selected --hybrids-off but no classification method. "
"Please select any --classify method"
)

@@ -366,11 +366,11 @@ def validate_binning_query_workflow(ctx) -> None:

if ctx.obj["legacy_weights"] and not ctx.obj["classify"]:
logging.error(
"You have selected --legacy_weights but no classification method. "
"You have selected --legacy-weights but no classification method. "
"Please select any --classify method, or remove this parameter"
)
raise click.UsageError(
"You have selected --legacy_weights but no classification method. "
"You have selected --legacy-weights but no classification method. "
"Please select any --classify method, or remove this parameter"
)

@@ -381,12 +381,12 @@ def validate_pfam_path(ctx) -> None:

if ctx.obj["pfam_path"] is None and ctx.obj["db_path"] is None:
logging.error(
"Missing option '-p/--pfam_path'."
"Missing option '-p/--pfam-path'."
"BiG-SCAPE database not provided, a pfam file is "
"required in order to detect domains."
)
raise click.UsageError(
"Missing option '-p/--pfam_path'."
"Missing option '-p/--pfam-path'."
"BiG-SCAPE database not provided, a pfam file is "
"required in order to detect domains."
)
@@ -400,22 +400,22 @@ def validate_domain_include_list(ctx) -> None:
and ctx.obj["domain_includelist_any_path"]
):
logging.error(
"You have selected both all and any domain_includelist options. "
"You have selected both all and any domain-includelist options. "
"Please select only one of the two at a time."
)
raise click.UsageError(
"You have selected both all and any domain_includelist options. "
"You have selected both all and any domain-includelist options. "
"Please select only one of the two at a time."
)


def validate_record_type(ctx, _, record_type) -> Optional[bs_enums.genbank.RECORD_TYPE]:
"""Validates whether a region_type is provided when running classify"""
"""Validates whether a record-type is provided when running classify"""
valid_types = {mode.value: mode for mode in bs_enums.genbank.RECORD_TYPE}

if record_type not in valid_types:
logging.error("Provided --record_type is invalid")
raise click.UsageError("Provided --record_type in invalid")
logging.error("Provided --record-type is invalid")
raise click.UsageError("Provided --record-type in invalid")

return valid_types[record_type]

@@ -429,12 +429,14 @@ def validate_query_record(ctx) -> None:
and ctx.obj["record_type"] != bs_enums.genbank.RECORD_TYPE.REGION
):
logging.error(
"Missing option '--query_record_number'."
"A query record number is required when running query mode with a given record type."
"Missing option '--query-record-number'."
"A query record number is required when running query mode with "
"a record type other than 'region'."
)
raise click.UsageError(
"Missing option '--query_record_number'."
"A query record number is required when running query mode with a given record type."
"Missing option '--query-record-number'."
"A query record number is required when running query mode with "
"a record type other than 'region'."
)

return None
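
Since this commit renames the underscored CLI flags to hyphenated ones, a short sketch may help illustrate why only the user-facing spelling changes (the option below is illustrative, not the exact repository code): click maps a hyphenated flag to an underscored parameter name, so internal keys such as ctx.obj["record_type"] stay underscored.

    import click

    @click.command()
    @click.option("--query-record-number", type=int, required=False)
    def query(query_record_number):
        # click exposes "--query-record-number" as the parameter
        # "query_record_number", so the rename does not touch Python code.
        click.echo(f"query_record_number = {query_record_number}")

    if __name__ == "__main__":
        query()
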
32 changes: 18 additions & 14 deletions big_scape/cli/cluster_cli.py
@@ -36,16 +36,17 @@
(e.g. T2PKS) or categories (e.g. PKS) to run analyses on class/category-based
bins, respectively.
'--classify legacy' which is based on BiG-SCAPE v1 predefined groups:
'--classify legacy' is based on BiG-SCAPE v1 predefined groups:
PKS1, PKSOther, NRPS, NRPS-PKS-hybrid, RiPP, Saccharide, Terpene, Others,
and will automatically use complementary '--legacy_weights'.
This feature is available for backwards compatibility with input .gbks
and will automatically use complementary '--legacy-weights'.
'--classify legacy' is available for backwards compatibility with input .gbks
generated with antiSMASH versions up to version 7. For higher antiSMASH
versions, use at your own risk, as BGC classes may have changed. All antiSMASH
classes that this legacy mode does not recognise will be grouped in 'others'.
To update the antiSMASH classes list yourself, see the config.yml file.
'--classify class' and '--classify category' can be used in combination with
--legacy_weights if input .gbks have been generated by antiSMASH version 6 or
--legacy-weights if input .gbks have been generated by antiSMASH version 6 or
higher. For older antiSMASH versions, either use --classify 'legacy' or do not
select --legacy_weights, which will perform the weighted distance calculations
based on the generic 'mix' weights. For more detail, see wiki.
@@ -57,25 +58,25 @@
is_flag=True,
help=(
"Calculate distances using a 'mix' bin, wherein no classification is applied. "
"This will do an all-vs-all comparison, and is likely going to take a long time. "
"This will do an all-vs-all comparison of all input BGC records. "
"This bin will use weights from the 'mix' weights distribution: "
"{JC: 0.2, AI: 0.05, DSS: 0.75, Anchor boost: 2.0}"
"{JC: 0.2, AI: 0.05, DSS: 0.75, Anchor boost: 2.0}. For more detail, see wiki."
),
)
# comparison parameters
@click.option(
"--hybrids_off",
"--hybrids-off",
is_flag=True,
help=(
"Toggle to add BGCs with hybrid predicted classes/categories to each "
"Toggle to add BGC records with hybrid predicted classes/categories to each "
"subclass instead of a hybrid class/network (e.g. a 'terpene-nrps' BGC "
"would be added to both the terpene and NRPS classes/networks instead of "
"the terpene.nrps network). "
"Only works if any --classify mode is selected."
),
)
@click.option(
"--exclude_categories",
"--exclude-categories",
callback=validate_class_category_filter,
help=(
"A comma separated list of categories. BGCs that have at least one of the product "
@@ -85,7 +86,7 @@
),
)
@click.option(
"--include_categories",
"--include-categories",
callback=validate_class_category_filter,
help=(
"A comma separated list of categories. Only BGCs that have at least one of the "
@@ -95,7 +96,7 @@
),
)
@click.option(
"--exclude_classes",
"--exclude-classes",
callback=validate_class_category_filter,
help=(
"A comma separated list of classes. BGCs that have at least one of the product "
@@ -104,7 +105,7 @@
),
)
@click.option(
"--include_classes",
"--include-classes",
callback=validate_class_category_filter,
help=(
"A comma separated list of classes. Only BGCs that have at least one of the "
@@ -114,9 +115,12 @@
)
# networking parameters
@click.option(
"--include_singletons",
"--include-singletons",
is_flag=True,
help=("Include singletons in the network."),
help=(
"Include singletons in the networ and all respective output."
" Reference singletons will not be included even if this is toggled."
),
)
@click.pass_context
def cluster(ctx, *args, **kwargs):
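To illustrate the --hybrids-off behaviour described in the help text above, a simplified sketch (not the repository's actual binning code) of assigning a hybrid record to each subclass bin:

    # Hypothetical record with the hybrid class from the help text above.
    record_id = "BGC_0001"
    hybrid_class = "terpene-nrps"

    # With --hybrids-off, the record is added to each subclass bin instead
    # of one combined "terpene.nrps" bin. (Simplified: real class strings
    # may need more careful parsing than a plain split.)
    bins: dict[str, list[str]] = {}
    for subclass in hybrid_class.split("-"):
        bins.setdefault(subclass, []).append(record_id)

    print(bins)  # {'terpene': ['BGC_0001'], 'nrps': ['BGC_0001']}
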
49 changes: 26 additions & 23 deletions big_scape/cli/query_cli.py
@@ -25,42 +25,29 @@
@click.command()
@common_all
@common_cluster_query
@click.option(
"--classify",
type=click.Choice(["none", "class", "category"]),
default="none",
callback=validate_classify,
help=(
"By default BiG-SCAPE will query against any other supplied BGCs regardless of "
"class/category. Instead, select 'class' or 'category' to run analyses on "
"class-based bins. Only gene clusters with the same class/category will be "
"compared. Can be used in combination with '--legacy_weights' for gbks "
"produced by antiSMASH version 6 or higher. For older antiSMASH versions, "
"deselect '--legacy_weights', leading to the use of a generic 'mix' weight: "
"{JC: 0.2, AI: 0.05, DSS: 0.75, Anchor boost: 2.0}. (default: none)"
),
)
@click.option(
"-q",
"--query_bgc_path",
"--query-bgc-path",
type=click.Path(exists=True, dir_okay=False, file_okay=True, path_type=Path),
required=True,
callback=validate_query_bgc,
help=(
"Path to query BGC file. BiG-SCAPE will compare "
"all BGCs in the input and reference folders to the query"
"Path to query BGC .gbk file. BiG-SCAPE will compare "
"all BGCs records in the input and reference folders to the query"
" in a one-vs-all mode."
),
)
@click.option(
"-n",
"--query_record_number",
"--query-record-number",
type=int,
required=False,
help=(
"Query BGC record number. Used to select the specific record "
"from the query BGC gbk. Warning: if interleaved or chemical hybrid proto "
"cluster/cores are merged (see config), the relevant number is that of the "
"from the query BGC .gbk, and is only relevant when running "
"--record-type cand_cluster, protocluster or proto_core."
" Warning: if interleaved or chemical hybrid proto cluster/cores "
"are merged (see config.yml), the relevant number is that of the "
"first record of the merged cluster (the one with the lowest number). "
"e.g. if records 1 and 2 get merged, the relevant number is 1. "
),
@@ -70,9 +57,25 @@
is_flag=True,
help=(
"By default, BiG-SCAPE will only generate edges between the query and reference"
" BGCs. With the propagate flag, BiG-SCAPE will go through multiple cycles of "
" BGC records. With the propagate flag, BiG-SCAPE will go through multiple cycles of "
"edge generation until no new reference BGCs are connected to the query "
"connected component."
"connected component. For more details, see the Wiki."
),
)
@click.option(
"--classify",
type=click.Choice(["none", "class", "category"]),
default="none",
callback=validate_classify,
help=(
"By default BiG-SCAPE will compare the query BGC record against any other "
"supplied reference BGC records regardless of antiSMASH product class/category. "
"Instead, select 'class' or 'category' to run analyses on one class-specific bin, "
"in which case only reference BGC records with the same class/category as the "
"query record will be compared. Can be used in combination with --legacy-weights "
"for .gbks produced by antiSMASH version 6 or higher. For older antiSMASH versions "
"or if --legacy-weights is not selected, BiG-SCAPE will use the generic 'mix' weights: "
"{JC: 0.2, AI: 0.05, DSS: 0.75, Anchor boost: 2.0}. (default: none)"
),
)
@click.pass_context
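The --propagate help text above describes repeated rounds of edge generation; a small sketch of that idea on toy data (not BiG-SCAPE's actual networking code):

    # Pull references into the query's connected component until no new
    # nodes are added in a full round.
    edges = {
        "query": {"ref1"},
        "ref1": {"ref2"},
        "ref2": set(),
        "ref3": set(),  # never linked to the query component
    }

    component = {"query"}
    while True:
        new_nodes = set()
        for node in component:
            new_nodes |= edges.get(node, set()) - component
        if not new_nodes:
            break
        component |= new_nodes

    print(component)  # {'query', 'ref1', 'ref2'} (set order may vary)
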
21 changes: 15 additions & 6 deletions big_scape/config.yml
@@ -1,9 +1,12 @@
# For more details on the config options, see the documentation at
# the github wiki (https://github.com/medema-group/BiG-SCAPE/wiki).

# PROFILER
# Update interval in seconds when profiler functionality is active.
PROFILER_UPDATE_INTERVAL: 0.5

# INPUT
# list of cand_cluster types where subrecords will be merged.
# List of cand_cluster types where subrecords will be merged.
MERGED_CAND_CLUSTER_TYPE:
- chemical_hybrid
- interleaved
@@ -12,7 +15,7 @@ MIN_BGC_LENGTH: 0
MAX_BGC_LENGTH: 500000

# CDS and DOMAIN
# Specify at which overlap percentage (as a decimal) two CDS in a gbk
# Specify at which overlap percentage (as a decimal) two CDS in a .gbk
# are considered to overlap. This preserves longest overlapping CDS.
CDS_OVERLAP_CUTOFF: 0.1
# Specify at which overlap percentage (as a decimal) two domains
@@ -38,7 +41,9 @@ REGION_MIN_EXTEND_LEN: 0.3
REGION_MIN_EXTEND_LEN_BIO: 0.2
# - Protoclusters or Proto_cores with at least one biosynthetic domain in the extended slice
PROTO_MIN_EXTEND_LEN: 0.2
# List of product classes that do not require a minimum length.
# List of product classes that do not require a minimum length. In practice, this
# means that an LCS and/or Extended slice of at least 1 domain will be accepted,
# so long as this is a core biosynthetic domain.
NO_MIN_CLASSES:
- terpene
# Integer scoring metrics used in the LCS extension algorithm for match, mismatch and gap.
@@ -54,8 +59,9 @@ EXTEND_MAX_MATCH_PERC: 0.1
# of families created. Higher preference will result in more families and vice versa.
PREFERENCE: 0.0

# TREE
# The number of common domains used to generate GCF trees in top frequencies of occurrence.
# GCF TREE
# The number of common domains (present in the exemplar BGC record) used to
# generate GCF trees in top frequencies of occurrence.
TOP_FREQS: 3

# ANCHOR DOMAINS
@@ -75,7 +81,10 @@ ANCHOR_DOMAINS:
- PF05147 # Lanthionine synthetase C-like protein

# LEGACY ANTISMASH CLASSES
# These are the classes that are used in the legacy classify modes
# List and groupings of the antiSMASH classes that are used in the
# --classify legacy mode and for which --legacy-weights have been
# optimized. These have been updated up to antiSMASH version 7.0,
# and will not be further maintained.
LEGACY_ANTISMASH_CLASSES:
pks1_products:
- t1pks
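Finally, a minimal sketch of reading settings such as the ones above from config.yml with PyYAML (the key names follow the file; the loading code is illustrative, not BiG-SCAPE's own config handling):

    import yaml

    with open("big_scape/config.yml", encoding="utf-8") as config_file:
        config = yaml.safe_load(config_file)

    print(config["MIN_BGC_LENGTH"])       # 0
    print(config["ANCHOR_DOMAINS"][:2])   # first two anchor Pfam accessions
    print(config["LEGACY_ANTISMASH_CLASSES"]["pks1_products"])  # ['t1pks', ...]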