diff --git a/404.html b/404.html index 2aa1be7..a4fd466 100644 --- a/404.html +++ b/404.html @@ -392,6 +392,27 @@ + + + + + + +
  • + + + + + Sketch + + + + +
  • + + + + diff --git a/ReferenceQC/index.html b/ReferenceQC/index.html index 53adee2..2bae99d 100644 --- a/ReferenceQC/index.html +++ b/ReferenceQC/index.html @@ -18,7 +18,7 @@ - + @@ -448,6 +448,39 @@ + + +
  • + + + PreparedQC + + + + +
  • @@ -503,6 +536,24 @@ +
  • + +
  • + + + load_genome_sig_to_dict + + + +
  • + +
  • + + + nonref_consume_from_vars + + +
  • @@ -537,6 +588,27 @@ + + + + + + +
  • + + + + + Sketch + + + + +
  • + + + + @@ -597,6 +669,39 @@ + + +
  • + + + PreparedQC + + + + +
  • @@ -652,6 +757,24 @@ +
  • + +
  • + + + load_genome_sig_to_dict + + + +
  • + +
  • + + + nonref_consume_from_vars + + +
  • @@ -718,6 +841,676 @@

    Python API Documentation + + + +

    + PreparedQC + + +

    + + +
    +

    + Bases: ReferenceQC

    + + +

    Class for quality control (QC) analysis of sample signature against prepared snipe profiles.

    + + + + + + +
    + Source code in src/snipe/api/reference_QC.py +
    class PreparedQC(ReferenceQC):
    +    r"""
    +    Class for quality control (QC) analysis of sample signature against prepared snipe profiles.
    +    """
    +
    +    def __init__(self, *, sample_sig: SnipeSig, snipe_db_path: str = '~/.snipe/dbs/', ref_id: Optional[str] = None, amplicon_id: Optional[str] = None, enable_logging: bool = False, **kwargs):
    +        """
    +        Initialize the PreparedQC instance.
    +
    +        **Parameters**
    +
    +        - `sample_sig` (`SnipeSig`): The sample k-mer signature.
    +        - `snipe_db_path` (`str`): Path to the local Snipe database directory.
    +        - `ref_id` (`Optional[str]`): Reference identifier for selecting specific profiles.
    +        - `enable_logging` (`bool`): Flag to enable detailed logging.
    +        - `**kwargs`: Additional keyword arguments.
    +        """
    +        self.snipe_db_path = os.path.expanduser(snipe_db_path)
    +        self.ref_id = ref_id
    +
    +        # Ensure the local database directory exists
    +        os.makedirs(self.snipe_db_path, exist_ok=True)
    +        if enable_logging:
    +            self.logger.debug(f"Local Snipe DB path set to: {self.snipe_db_path}")
    +        else:
    +            self.logger.debug("Logging is disabled for PreparedQC.")
    +
    +        # Initialize without a reference signature for now; it can be set after downloading
    +        super().__init__(
    +            sample_sig=sample_sig,
    +            reference_sig=None,  # To be set after downloading
    +            enable_logging=enable_logging,
    +            **kwargs
    +        )
    +
    +    def download_osf_db(self, url: str, save_path: str = '~/.snipe/dbs', force: bool = False) -> Optional[str]:
    +        """
    +        Download a file from OSF using the provided URL. The file is saved with its original name 
    +        as specified by the OSF server via the Content-Disposition header.
    +
    +        **Parameters**
    +
    +        - `url` (`str`): The OSF URL to download the file from.
    +        - `save_path` (`str`): The directory path where the file will be saved. Supports user (~) and environment variables.
    +                               Default is the local Snipe database directory.
    +        - `force` (`bool`): If True, overwrite the file if it already exists. Default is False.
    +
    +        **Returns**
    +
    +        - `Optional[str]`: The path to the downloaded file if successful, else None.
    +
    +        **Raises**
    +
    +        - `requests.exceptions.RequestException`: If an error occurs during the HTTP request.
    +        - `Exception`: For any other exceptions that may arise.
    +        """
    +        try:
    +            # Expand user (~) and environment variables in save_path
    +            expanded_save_path = os.path.expanduser(os.path.expandvars(save_path))
    +            self.logger.debug(f"Expanded save path: {expanded_save_path}")
    +
    +            # Ensure the download URL ends with '/download'
    +            parsed_url = urlparse(url)
    +            if not parsed_url.path.endswith('/download'):
    +                download_url = f"{url.rstrip('/')}/download"
    +            else:
    +                download_url = url
    +
    +            self.logger.debug(f"Download URL: {download_url}")
    +
    +            # Ensure the save directory exists
    +            os.makedirs(expanded_save_path, exist_ok=True)
    +            self.logger.debug(f"Save path verified/created: {expanded_save_path}")
    +
    +            # Initiate the GET request with streaming
    +            with requests.get(download_url, stream=True, allow_redirects=True) as response:
    +                response.raise_for_status()  # Raise an exception for HTTP errors
    +
    +                # Attempt to extract filename from Content-Disposition
    +                content_disposition = response.headers.get('Content-Disposition')
    +                filename = self._extract_filename(content_disposition, parsed_url.path)
    +                self.logger.debug(f"Filename determined: {filename}")
    +
    +                # Define the full save path
    +                full_save_path = os.path.join(expanded_save_path, filename)
    +                self.logger.debug(f"Full save path: {full_save_path}")
    +
    +                # Check if the file already exists
    +                if os.path.exists(full_save_path):
    +                    if force:
    +                        self.logger.info(f"Overwriting existing file: {full_save_path}")
    +                    else:
    +                        self.logger.info(f"File already exists: {full_save_path}. Skipping download.")
    +                        return full_save_path
    +
    +                # Get the total file size for the progress bar
    +                total_size = int(response.headers.get('Content-Length', 0))
    +
    +                # Initialize the progress bar
    +                with open(full_save_path, 'wb') as file, tqdm(
    +                    total=total_size, 
    +                    unit='B', 
    +                    unit_scale=True, 
    +                    unit_divisor=1024,
    +                    desc=filename,
    +                    ncols=100
    +                ) as bar:
    +                    for chunk in response.iter_content(chunk_size=1024):
    +                        if chunk:  # Filter out keep-alive chunks
    +                            file.write(chunk)
    +                            bar.update(len(chunk))
    +
    +                self.logger.info(f"File downloaded successfully: {full_save_path}")
    +                return full_save_path
    +
    +        except requests.exceptions.RequestException as req_err:
    +            self.logger.error(f"Request error occurred while downloading {url}: {req_err}")
    +            raise
    +        except Exception as e:
    +            self.logger.error(f"An unexpected error occurred while downloading {url}: {e}")
    +            raise
    +
    +    def _extract_filename(self, content_disposition: Optional[str], url_path: str) -> str:
    +        """
    +        Extract filename from Content-Disposition header or fallback to URL path.
    +
    +        **Parameters**
    +
    +        - `content_disposition` (`Optional[str]`): The Content-Disposition header value.
    +        - `url_path` (`str`): The path component of the URL.
    +
    +        **Returns**
    +
    +        - `str`: The extracted filename.
    +        """
    +        filename = None
    +        if content_disposition:
    +            self.logger.debug("Parsing Content-Disposition header for filename.")
    +            parts = content_disposition.split(';')
    +            for part in parts:
    +                part = part.strip()
    +                if part.lower().startswith('filename*='):
    +                    # Handle RFC 5987 encoding (e.g., filename*=UTF-8''example.txt)
    +                    encoded_filename = part.split('=', 1)[1].strip()
    +                    if "''" in encoded_filename:
    +                        filename = encoded_filename.split("''", 1)[1]
    +                    else:
    +                        filename = encoded_filename
    +                    self.logger.debug(f"Filename extracted from headers (RFC 5987): {filename}")
    +                    break
    +                elif part.lower().startswith('filename='):
    +                    # Remove 'filename=' and any surrounding quotes
    +                    filename = part.split('=', 1)[1].strip(' "')
    +                    self.logger.debug(f"Filename extracted from headers: {filename}")
    +                    break
    +
    +        if not filename:
    +            self.logger.debug("Falling back to filename derived from URL path.")
    +            filename = os.path.basename(url_path)
    +            if not filename:
    +                filename = 'downloaded_file'
    +            self.logger.debug(f"Filename derived from URL: {filename}")
    +
    +        return filename
    +
    +
    + + + +
    + + + + + + + + + +
    + + +

    + __init__(*, sample_sig, snipe_db_path='~/.snipe/dbs/', ref_id=None, amplicon_id=None, enable_logging=False, **kwargs) + +

    + + +
    + +

    Initialize the PreparedQC instance.

    +

    Parameters

    +
      +
    • sample_sig (SnipeSig): The sample k-mer signature.
    • +
    • snipe_db_path (str): Path to the local Snipe database directory.
    • +
    • ref_id (Optional[str]): Reference identifier for selecting specific profiles.
    • +
    • enable_logging (bool): Flag to enable detailed logging.
    • +
    • **kwargs: Additional keyword arguments.
    • +
    + +
    + Source code in src/snipe/api/reference_QC.py +
    def __init__(self, *, sample_sig: SnipeSig, snipe_db_path: str = '~/.snipe/dbs/', ref_id: Optional[str] = None, amplicon_id: Optional[str] = None, enable_logging: bool = False, **kwargs):
    +    """
    +    Initialize the PreparedQC instance.
    +
    +    **Parameters**
    +
    +    - `sample_sig` (`SnipeSig`): The sample k-mer signature.
    +    - `snipe_db_path` (`str`): Path to the local Snipe database directory.
    +    - `ref_id` (`Optional[str]`): Reference identifier for selecting specific profiles.
    +    - `enable_logging` (`bool`): Flag to enable detailed logging.
    +    - `**kwargs`: Additional keyword arguments.
    +    """
    +    self.snipe_db_path = os.path.expanduser(snipe_db_path)
    +    self.ref_id = ref_id
    +
    +    # Ensure the local database directory exists
    +    os.makedirs(self.snipe_db_path, exist_ok=True)
    +    if enable_logging:
    +        self.logger.debug(f"Local Snipe DB path set to: {self.snipe_db_path}")
    +    else:
    +        self.logger.debug("Logging is disabled for PreparedQC.")
    +
    +    # Initialize without a reference signature for now; it can be set after downloading
    +    super().__init__(
    +        sample_sig=sample_sig,
    +        reference_sig=None,  # To be set after downloading
    +        enable_logging=enable_logging,
    +        **kwargs
    +    )
    +
    +
    +
    + +
    + +
    + + +

    + download_osf_db(url, save_path='~/.snipe/dbs', force=False) + +

    + + +
    + +

    Download a file from OSF using the provided URL. The file is saved with its original name +as specified by the OSF server via the Content-Disposition header.

    +

    Parameters

    +
      +
    • url (str): The OSF URL to download the file from.
    • +
    • save_path (str): The directory path where the file will be saved. Supports user (~) and environment variables. + Default is the local Snipe database directory.
    • +
    • force (bool): If True, overwrite the file if it already exists. Default is False.
    • +
    +

    Returns

    +
      +
    • Optional[str]: The path to the downloaded file if successful, else None.
    • +
    +

    Raises

    +
      +
    • requests.exceptions.RequestException: If an error occurs during the HTTP request.
    • +
    • Exception: For any other exceptions that may arise.
    • +
    + +
    + Source code in src/snipe/api/reference_QC.py +
    def download_osf_db(self, url: str, save_path: str = '~/.snipe/dbs', force: bool = False) -> Optional[str]:
    +    """
    +    Download a file from OSF using the provided URL. The file is saved with its original name 
    +    as specified by the OSF server via the Content-Disposition header.
    +
    +    **Parameters**
    +
    +    - `url` (`str`): The OSF URL to download the file from.
    +    - `save_path` (`str`): The directory path where the file will be saved. Supports user (~) and environment variables.
    +                           Default is the local Snipe database directory.
    +    - `force` (`bool`): If True, overwrite the file if it already exists. Default is False.
    +
    +    **Returns**
    +
    +    - `Optional[str]`: The path to the downloaded file if successful, else None.
    +
    +    **Raises**
    +
    +    - `requests.exceptions.RequestException`: If an error occurs during the HTTP request.
    +    - `Exception`: For any other exceptions that may arise.
    +    """
    +    try:
    +        # Expand user (~) and environment variables in save_path
    +        expanded_save_path = os.path.expanduser(os.path.expandvars(save_path))
    +        self.logger.debug(f"Expanded save path: {expanded_save_path}")
    +
    +        # Ensure the download URL ends with '/download'
    +        parsed_url = urlparse(url)
    +        if not parsed_url.path.endswith('/download'):
    +            download_url = f"{url.rstrip('/')}/download"
    +        else:
    +            download_url = url
    +
    +        self.logger.debug(f"Download URL: {download_url}")
    +
    +        # Ensure the save directory exists
    +        os.makedirs(expanded_save_path, exist_ok=True)
    +        self.logger.debug(f"Save path verified/created: {expanded_save_path}")
    +
    +        # Initiate the GET request with streaming
    +        with requests.get(download_url, stream=True, allow_redirects=True) as response:
    +            response.raise_for_status()  # Raise an exception for HTTP errors
    +
    +            # Attempt to extract filename from Content-Disposition
    +            content_disposition = response.headers.get('Content-Disposition')
    +            filename = self._extract_filename(content_disposition, parsed_url.path)
    +            self.logger.debug(f"Filename determined: {filename}")
    +
    +            # Define the full save path
    +            full_save_path = os.path.join(expanded_save_path, filename)
    +            self.logger.debug(f"Full save path: {full_save_path}")
    +
    +            # Check if the file already exists
    +            if os.path.exists(full_save_path):
    +                if force:
    +                    self.logger.info(f"Overwriting existing file: {full_save_path}")
    +                else:
    +                    self.logger.info(f"File already exists: {full_save_path}. Skipping download.")
    +                    return full_save_path
    +
    +            # Get the total file size for the progress bar
    +            total_size = int(response.headers.get('Content-Length', 0))
    +
    +            # Initialize the progress bar
    +            with open(full_save_path, 'wb') as file, tqdm(
    +                total=total_size, 
    +                unit='B', 
    +                unit_scale=True, 
    +                unit_divisor=1024,
    +                desc=filename,
    +                ncols=100
    +            ) as bar:
    +                for chunk in response.iter_content(chunk_size=1024):
    +                    if chunk:  # Filter out keep-alive chunks
    +                        file.write(chunk)
    +                        bar.update(len(chunk))
    +
    +            self.logger.info(f"File downloaded successfully: {full_save_path}")
    +            return full_save_path
    +
    +    except requests.exceptions.RequestException as req_err:
    +        self.logger.error(f"Request error occurred while downloading {url}: {req_err}")
    +        raise
    +    except Exception as e:
    +        self.logger.error(f"An unexpected error occurred while downloading {url}: {e}")
    +        raise
    +
    +
    +
    + +
    + + + +
    + +
    + + +
    @@ -1043,16 +1836,7 @@

    Source code in src/snipe/api/reference_QC.py -
      12
    -  13
    -  14
    -  15
    -  16
    -  17
    -  18
    -  19
    -  20
    -  21
    +                
      21
       22
       23
       24
    @@ -2301,1265 +3085,1802 @@ 

    1267 1268 1269 -1270

    class ReferenceQC:
    -    r"""
    -    Class for performing quality control of sequencing data against a reference genome.
    -
    -    This class computes various metrics to assess the quality and characteristics of a sequencing sample, including coverage indices and abundance ratios, by comparing sample k-mer signatures with a reference genome and an optional amplicon signature.
    -
    -    **Parameters**
    -
    -    - `sample_sig` (`SnipeSig`): The sample k-mer signature (must be of type `SigType.SAMPLE`).
    -    - `reference_sig` (`SnipeSig`): The reference genome k-mer signature (must be of type `SigType.GENOME`).
    -    - `amplicon_sig` (`Optional[SnipeSig]`): The amplicon k-mer signature (must be of type `SigType.AMPLICON`), if applicable.
    -    - `enable_logging` (`bool`): Flag to enable detailed logging.
    +1270
    +1271
    +1272
    +1273
    +1274
    +1275
    +1276
    +1277
    +1278
    +1279
    +1280
    +1281
    +1282
    +1283
    +1284
    +1285
    +1286
    +1287
    +1288
    +1289
    +1290
    +1291
    +1292
    +1293
    +1294
    +1295
    +1296
    +1297
    +1298
    +1299
    +1300
    +1301
    +1302
    +1303
    +1304
    +1305
    +1306
    +1307
    +1308
    +1309
    +1310
    +1311
    +1312
    +1313
    +1314
    +1315
    +1316
    +1317
    +1318
    +1319
    +1320
    +1321
    +1322
    +1323
    +1324
    +1325
    +1326
    +1327
    +1328
    +1329
    +1330
    +1331
    +1332
    +1333
    +1334
    +1335
    +1336
    +1337
    +1338
    +1339
    +1340
    +1341
    +1342
    +1343
    +1344
    +1345
    +1346
    +1347
    +1348
    +1349
    +1350
    +1351
    +1352
    +1353
    +1354
    +1355
    +1356
    +1357
    +1358
    +1359
    +1360
    +1361
    +1362
    +1363
    +1364
    +1365
    +1366
    +1367
    +1368
    +1369
    +1370
    +1371
    +1372
    +1373
    +1374
    +1375
    +1376
    +1377
    +1378
    +1379
    +1380
    +1381
    +1382
    +1383
    +1384
    +1385
    +1386
    +1387
    +1388
    +1389
    +1390
    +1391
    +1392
    +1393
    +1394
    +1395
    +1396
    +1397
    +1398
    +1399
    +1400
    +1401
    +1402
    +1403
    +1404
    +1405
    +1406
    +1407
    +1408
    +1409
    +1410
    +1411
    +1412
    +1413
    +1414
    +1415
    +1416
    +1417
    +1418
    +1419
    +1420
    +1421
    +1422
    +1423
    +1424
    +1425
    +1426
    +1427
    +1428
    +1429
    +1430
    +1431
    +1432
    +1433
    +1434
    +1435
    +1436
    +1437
    +1438
    +1439
    +1440
    +1441
    +1442
    +1443
    +1444
    +1445
    +1446
    +1447
    +1448
    +1449
    +1450
    +1451
    +1452
    +1453
    +1454
    +1455
    +1456
    +1457
    +1458
    +1459
    +1460
    +1461
    +1462
    +1463
    +1464
    +1465
    +1466
    +1467
    +1468
    +1469
    +1470
    +1471
    +1472
    +1473
    +1474
    +1475
    +1476
    +1477
    +1478
    +1479
    +1480
    +1481
    +1482
    +1483
    +1484
    +1485
    +1486
    +1487
    +1488
    +1489
    +1490
    +1491
    +1492
    +1493
    +1494
    +1495
    +1496
    +1497
    +1498
    +1499
    +1500
    +1501
    +1502
    +1503
    +1504
    +1505
    +1506
    +1507
    +1508
    +1509
    +1510
    +1511
    +1512
    +1513
    +1514
    +1515
    +1516
    +1517
    +1518
    +1519
    +1520
    +1521
    +1522
    +1523
    +1524
    +1525
    +1526
    +1527
    +1528
    +1529
    +1530
    +1531
    +1532
    +1533
    +1534
    +1535
    +1536
    +1537
    +1538
    +1539
    +1540
    +1541
    +1542
    +1543
    class ReferenceQC:
    +    r"""
    +    Class for performing quality control of sequencing data against a reference genome.
     
    -    **Attributes**
    +    This class computes various metrics to assess the quality and characteristics of a sequencing sample, including coverage indices and abundance ratios, by comparing sample k-mer signatures with a reference genome and an optional amplicon signature.
     
    -    - `sample_sig` (`SnipeSig`): The sample signature.
    -    - `reference_sig` (`SnipeSig`): The reference genome signature.
    -    - `amplicon_sig` (`Optional[SnipeSig]`): The amplicon signature.
    -    - `sample_stats` (`Dict[str, Any]`): Statistics of the sample signature.
    -    - `genome_stats` (`Dict[str, Any]`): Calculated genome-related statistics.
    -    - `amplicon_stats` (`Dict[str, Any]`): Calculated amplicon-related statistics (if `amplicon_sig` is provided).
    -    - `advanced_stats` (`Dict[str, Any]`): Calculated advanced statistics (optional).
    -    - `predicted_assay_type` (`str`): Predicted assay type based on metrics.
    +    **Parameters**
    +
    +    - `sample_sig` (`SnipeSig`): The sample k-mer signature (must be of type `SigType.SAMPLE`).
    +    - `reference_sig` (`SnipeSig`): The reference genome k-mer signature (must be of type `SigType.GENOME`).
    +    - `amplicon_sig` (`Optional[SnipeSig]`): The amplicon k-mer signature (must be of type `SigType.AMPLICON`), if applicable.
    +    - `enable_logging` (`bool`): Flag to enable detailed logging.
    +
    +    **Attributes**
     
    -    **Calculated Metrics**
    -
    -    The class calculates the following metrics:
    -
    -    - **Total unique k-mers**
    -        - Description: Number of unique k-mers in the sample signature.
    -        - Calculation:
    -          $$
    -          \text{Total unique k-mers} = \left| \text{Sample k-mer set} \right|
    -          $$
    +    - `sample_sig` (`SnipeSig`): The sample signature.
    +    - `reference_sig` (`SnipeSig`): The reference genome signature.
    +    - `amplicon_sig` (`Optional[SnipeSig]`): The amplicon signature.
    +    - `sample_stats` (`Dict[str, Any]`): Statistics of the sample signature.
    +    - `genome_stats` (`Dict[str, Any]`): Calculated genome-related statistics.
    +    - `amplicon_stats` (`Dict[str, Any]`): Calculated amplicon-related statistics (if `amplicon_sig` is provided).
    +    - `advanced_stats` (`Dict[str, Any]`): Calculated advanced statistics (optional).
    +    - `predicted_assay_type` (`str`): Predicted assay type based on metrics.
    +
    +    **Calculated Metrics**
     
    -    - **k-mer total abundance**
    -        - Description: Sum of abundances of all k-mers in the sample signature.
    -        - Calculation:
    -          $$
    -          \text{k-mer total abundance} = \sum_{k \in \text{Sample k-mer set}} \text{abundance}(k)
    +    The class calculates the following metrics:
    +
    +    - **Total unique k-mers**
    +        - Description: Number of unique k-mers in the sample signature.
    +        - Calculation:
               $$
    -
    -    - **k-mer mean abundance**
    -        - Description: Average abundance of k-mers in the sample signature.
    -        - Calculation:
    -          $$
    -          \text{k-mer mean abundance} = \frac{\text{k-mer total abundance}}{\text{Total unique k-mers}}
    +          \text{Total unique k-mers} = \left| \text{Sample k-mer set} \right|
    +          $$
    +
    +    - **k-mer total abundance**
    +        - Description: Sum of abundances of all k-mers in the sample signature.
    +        - Calculation:
               $$
    -
    -    - **k-mer median abundance**
    -        - Description: Median abundance of k-mers in the sample signature.
    -        - Calculation: Median of abundances in the sample k-mers.
    -
    -    - **Number of singletons**
    -        - Description: Number of k-mers with an abundance of 1 in the sample signature.
    -        - Calculation:
    +          \text{k-mer total abundance} = \sum_{k \in \text{Sample k-mer set}} \text{abundance}(k)
    +          $$
    +
    +    - **k-mer mean abundance**
    +        - Description: Average abundance of k-mers in the sample signature.
    +        - Calculation:
    +          $$
    +          \text{k-mer mean abundance} = \frac{\text{k-mer total abundance}}{\text{Total unique k-mers}}
               $$
    -          \text{Number of singletons} = \left| \{ k \in \text{Sample k-mer set} \mid \text{abundance}(k) = 1 \} \right|
    -          $$
    -
    -    - **Genomic unique k-mers**
    -        - Description: Number of k-mers shared between the sample and the reference genome.
    -        - Calculation:
    -          $$
    -          \text{Genomic unique k-mers} = \left| \text{Sample k-mer set} \cap \text{Reference genome k-mer set} \right|
    +
    +    - **k-mer median abundance**
    +        - Description: Median abundance of k-mers in the sample signature.
    +        - Calculation: Median of abundances in the sample k-mers.
    +
    +    - **Number of singletons**
    +        - Description: Number of k-mers with an abundance of 1 in the sample signature.
    +        - Calculation:
               $$
    -
    -    - **Genome coverage index**
    -        - Description: Proportion of the reference genome's k-mers that are present in the sample.
    -        - Calculation:
    -          $$
    -          \text{Genome coverage index} = \frac{\text{Genomic unique k-mers}}{\left| \text{Reference genome k-mer set} \right|}
    +          \text{Number of singletons} = \left| \{ k \in \text{Sample k-mer set} \mid \text{abundance}(k) = 1 \} \right|
    +          $$
    +
    +    - **Genomic unique k-mers**
    +        - Description: Number of k-mers shared between the sample and the reference genome.
    +        - Calculation:
               $$
    -
    -    - **Genomic k-mers total abundance**
    -        - Description: Sum of abundances for k-mers shared with the reference genome.
    -        - Calculation:
    -          $$
    -          \text{Genomic k-mers total abundance} = \sum_{k \in \text{Sample k-mer set} \cap \text{Reference genome k-mer set}} \text{abundance}(k)
    +          \text{Genomic unique k-mers} = \left| \text{Sample k-mer set} \cap \text{Reference genome k-mer set} \right|
    +          $$
    +
    +    - **Genome coverage index**
    +        - Description: Proportion of the reference genome's k-mers that are present in the sample.
    +        - Calculation:
               $$
    -
    -    - **Genomic k-mers mean abundance**
    -        - Description: Average abundance of k-mers shared with the reference genome.
    -        - Calculation:
    -          $$
    -          \text{Genomic k-mers mean abundance} = \frac{\text{Genomic k-mers total abundance}}{\text{Genomic unique k-mers}}
    +          \text{Genome coverage index} = \frac{\text{Genomic unique k-mers}}{\left| \text{Reference genome k-mer set} \right|}
    +          $$
    +
    +    - **Genomic k-mers total abundance**
    +        - Description: Sum of abundances for k-mers shared with the reference genome.
    +        - Calculation:
               $$
    -
    -    - **Mapping index**
    -        - Description: Proportion of the sample's total k-mer abundance that maps to the reference genome.
    -        - Calculation:
    -          $$
    -          \text{Mapping index} = \frac{\text{Genomic k-mers total abundance}}{\text{k-mer total abundance}}
    +          \text{Genomic k-mers total abundance} = \sum_{k \in \text{Sample k-mer set} \cap \text{Reference genome k-mer set}} \text{abundance}(k)
    +          $$
    +
    +    - **Genomic k-mers mean abundance**
    +        - Description: Average abundance of k-mers shared with the reference genome.
    +        - Calculation:
               $$
    -
    -    If `amplicon_sig` is provided, additional metrics are calculated:
    +          \text{Genomic k-mers mean abundance} = \frac{\text{Genomic k-mers total abundance}}{\text{Genomic unique k-mers}}
    +          $$
     
    -    - **Amplicon unique k-mers**
    -        - Description: Number of k-mers shared between the sample and the amplicon.
    +    - **Mapping index**
    +        - Description: Proportion of the sample's total k-mer abundance that maps to the reference genome.
             - Calculation:
               $$
    -          \text{Amplicon unique k-mers} = \left| \text{Sample k-mer set} \cap \text{Amplicon k-mer set} \right|
    +          \text{Mapping index} = \frac{\text{Genomic k-mers total abundance}}{\text{k-mer total abundance}}
               $$
     
    -    - **Amplicon coverage index**
    -        - Description: Proportion of the amplicon's k-mers that are present in the sample.
    -        - Calculation:
    -          $$
    -          \text{Amplicon coverage index} = \frac{\text{Amplicon unique k-mers}}{\left| \text{Amplicon k-mer set} \right|}
    +    If `amplicon_sig` is provided, additional metrics are calculated:
    +
    +    - **Amplicon unique k-mers**
    +        - Description: Number of k-mers shared between the sample and the amplicon.
    +        - Calculation:
               $$
    -
    -    - **Amplicon k-mers total abundance**
    -        - Description: Sum of abundances for k-mers shared with the amplicon.
    -        - Calculation:
    -          $$
    -          \text{Amplicon k-mers total abundance} = \sum_{k \in \text{Sample k-mer set} \cap \text{Amplicon k-mer set}} \text{abundance}(k)
    +          \text{Amplicon unique k-mers} = \left| \text{Sample k-mer set} \cap \text{Amplicon k-mer set} \right|
    +          $$
    +
    +    - **Amplicon coverage index**
    +        - Description: Proportion of the amplicon's k-mers that are present in the sample.
    +        - Calculation:
               $$
    -
    -    - **Amplicon k-mers mean abundance**
    -        - Description: Average abundance of k-mers shared with the amplicon.
    -        - Calculation:
    -          $$
    -          \text{Amplicon k-mers mean abundance} = \frac{\text{Amplicon k-mers total abundance}}{\text{Amplicon unique k-mers}}
    +          \text{Amplicon coverage index} = \frac{\text{Amplicon unique k-mers}}{\left| \text{Amplicon k-mer set} \right|}
    +          $$
    +
    +    - **Amplicon k-mers total abundance**
    +        - Description: Sum of abundances for k-mers shared with the amplicon.
    +        - Calculation:
               $$
    -
    -    - **Relative total abundance**
    -        - Description: Ratio of the amplicon k-mers total abundance to the genomic k-mers total abundance.
    -        - Calculation:
    -          $$
    -          \text{Relative total abundance} = \frac{\text{Amplicon k-mers total abundance}}{\text{Genomic k-mers total abundance}}
    +          \text{Amplicon k-mers total abundance} = \sum_{k \in \text{Sample k-mer set} \cap \text{Amplicon k-mer set}} \text{abundance}(k)
    +          $$
    +
    +    - **Amplicon k-mers mean abundance**
    +        - Description: Average abundance of k-mers shared with the amplicon.
    +        - Calculation:
               $$
    -
    -    - **Relative coverage**
    -        - Description: Ratio of the amplicon coverage index to the genome coverage index.
    -        - Calculation:
    -          $$
    -          \text{Relative coverage} = \frac{\text{Amplicon coverage index}}{\text{Genome coverage index}}
    +          \text{Amplicon k-mers mean abundance} = \frac{\text{Amplicon k-mers total abundance}}{\text{Amplicon unique k-mers}}
    +          $$
    +
    +    - **Relative total abundance**
    +        - Description: Ratio of the amplicon k-mers total abundance to the genomic k-mers total abundance.
    +        - Calculation:
               $$
    -
    -    - **Predicted Assay Type**
    -        - Description: Predicted assay type based on the `Relative total abundance`.
    -        - Calculation:
    -          - If \(\text{Relative total abundance} \leq 0.0809\), then **WGS** (Whole Genome Sequencing).
    -          - If \(\text{Relative total abundance} \geq 0.1188\), then **WXS** (Whole Exome Sequencing).
    -          - If between these values, assign based on the closest threshold.
    -
    -    **Advanced Metrics** (optional, calculated if `include_advanced` is `True`):
    +          \text{Relative total abundance} = \frac{\text{Amplicon k-mers total abundance}}{\text{Genomic k-mers total abundance}}
    +          $$
    +
    +    - **Relative coverage**
    +        - Description: Ratio of the amplicon coverage index to the genome coverage index.
    +        - Calculation:
    +          $$
    +          \text{Relative coverage} = \frac{\text{Amplicon coverage index}}{\text{Genome coverage index}}
    +          $$
     
    -    - **Median-trimmed unique k-mers**
    -        - Description: Number of unique k-mers in the sample after removing k-mers with abundance below the median.
    +    - **Predicted Assay Type**
    +        - Description: Predicted assay type based on the `Relative total abundance`.
             - Calculation:
    -          - Remove k-mers where \(\text{abundance}(k) < \text{Median abundance}\).
    -          - Count the remaining k-mers.
    -
    -    - **Median-trimmed total abundance**
    -        - Description: Sum of abundances after median trimming.
    -        - Calculation:
    -          $$
    -          \text{Median-trimmed total abundance} = \sum_{k \in \text{Median-trimmed Sample k-mer set}} \text{abundance}(k)
    -          $$
    -
    -    - **Median-trimmed mean abundance**
    -        - Description: Average abundance after median trimming.
    -        - Calculation:
    -          $$
    -          \text{Median-trimmed mean abundance} = \frac{\text{Median-trimmed total abundance}}{\text{Median-trimmed unique k-mers}}
    +          - If \(\text{Relative total abundance} \leq 0.0809\), then **WGS** (Whole Genome Sequencing).
    +          - If \(\text{Relative total abundance} \geq 0.1188\), then **WXS** (Whole Exome Sequencing).
    +          - If between these values, assign based on the closest threshold.
    +
    +    **Advanced Metrics** (optional, calculated if `include_advanced` is `True`):
    +
    +    - **Median-trimmed unique k-mers**
    +        - Description: Number of unique k-mers in the sample after removing k-mers with abundance below the median.
    +        - Calculation:
    +          - Remove k-mers where \(\text{abundance}(k) < \text{Median abundance}\).
    +          - Count the remaining k-mers.
    +
    +    - **Median-trimmed total abundance**
    +        - Description: Sum of abundances after median trimming.
    +        - Calculation:
               $$
    -
    -    - **Median-trimmed median abundance**
    -        - Description: Median abundance after median trimming.
    -        - Calculation: Median of abundances in the median-trimmed sample.
    -
    -    - **Median-trimmed Genomic unique k-mers**
    -        - Description: Number of genomic k-mers in the median-trimmed sample.
    -        - Calculation:
    +          \text{Median-trimmed total abundance} = \sum_{k \in \text{Median-trimmed Sample k-mer set}} \text{abundance}(k)
    +          $$
    +
    +    - **Median-trimmed mean abundance**
    +        - Description: Average abundance after median trimming.
    +        - Calculation:
    +          $$
    +          \text{Median-trimmed mean abundance} = \frac{\text{Median-trimmed total abundance}}{\text{Median-trimmed unique k-mers}}
               $$
    -          \text{Median-trimmed Genomic unique k-mers} = \left| \text{Median-trimmed Sample k-mer set} \cap \text{Reference genome k-mer set} \right|
    -          $$
    -
    -    - **Median-trimmed Genome coverage index**
    -        - Description: Genome coverage index after median trimming.
    -        - Calculation:
    -          $$
    -          \text{Median-trimmed Genome coverage index} = \frac{\text{Median-trimmed Genomic unique k-mers}}{\left| \text{Reference genome k-mer set} \right|}
    +
    +    - **Median-trimmed median abundance**
    +        - Description: Median abundance after median trimming.
    +        - Calculation: Median of abundances in the median-trimmed sample.
    +
    +    - **Median-trimmed Genomic unique k-mers**
    +        - Description: Number of genomic k-mers in the median-trimmed sample.
    +        - Calculation:
               $$
    -
    -    - **Median-trimmed Amplicon unique k-mers** (if `amplicon_sig` is provided)
    -        - Description: Number of amplicon k-mers in the median-trimmed sample.
    -        - Calculation:
    -          $$
    -          \text{Median-trimmed Amplicon unique k-mers} = \left| \text{Median-trimmed Sample k-mer set} \cap \text{Amplicon k-mer set} \right|
    +          \text{Median-trimmed Genomic unique k-mers} = \left| \text{Median-trimmed Sample k-mer set} \cap \text{Reference genome k-mer set} \right|
    +          $$
    +
    +    - **Median-trimmed Genome coverage index**
    +        - Description: Genome coverage index after median trimming.
    +        - Calculation:
               $$
    -
    -    - **Median-trimmed Amplicon coverage index**
    -        - Description: Amplicon coverage index after median trimming.
    -        - Calculation:
    -          $$
    -          \text{Median-trimmed Amplicon coverage index} = \frac{\text{Median-trimmed Amplicon unique k-mers}}{\left| \text{Amplicon k-mer set} \right|}
    +          \text{Median-trimmed Genome coverage index} = \frac{\text{Median-trimmed Genomic unique k-mers}}{\left| \text{Reference genome k-mer set} \right|}
    +          $$
    +
    +    - **Median-trimmed Amplicon unique k-mers** (if `amplicon_sig` is provided)
    +        - Description: Number of amplicon k-mers in the median-trimmed sample.
    +        - Calculation:
               $$
    -
    -    - **Median-trimmed relative coverage**
    -        - Description: Relative coverage after median trimming.
    -        - Calculation:
    -          $$
    -          \text{Median-trimmed relative coverage} = \frac{\text{Median-trimmed Amplicon coverage index}}{\text{Median-trimmed Genome coverage index}}
    +          \text{Median-trimmed Amplicon unique k-mers} = \left| \text{Median-trimmed Sample k-mer set} \cap \text{Amplicon k-mer set} \right|
    +          $$
    +
    +    - **Median-trimmed Amplicon coverage index**
    +        - Description: Amplicon coverage index after median trimming.
    +        - Calculation:
               $$
    -
    -    - **Median-trimmed relative mean abundance**
    -        - Description: Ratio of median-trimmed amplicon mean abundance to median-trimmed genomic mean abundance.
    -        - Calculation:
    -          $$
    -          \text{Median-trimmed relative mean abundance} = \frac{\text{Median-trimmed Amplicon mean abundance}}{\text{Median-trimmed Genomic mean abundance}}
    +          \text{Median-trimmed Amplicon coverage index} = \frac{\text{Median-trimmed Amplicon unique k-mers}}{\left| \text{Amplicon k-mer set} \right|}
    +          $$
    +
    +    - **Median-trimmed relative coverage**
    +        - Description: Relative coverage after median trimming.
    +        - Calculation:
               $$
    -
    -    **Usage Example**
    +          \text{Median-trimmed relative coverage} = \frac{\text{Median-trimmed Amplicon coverage index}}{\text{Median-trimmed Genome coverage index}}
    +          $$
     
    -    ```python
    -    qc = ReferenceQC(
    -        sample_sig=sample_signature,
    -        reference_sig=reference_signature,
    -        amplicon_sig=amplicon_signature,
    -        enable_logging=True
    -    )
    -
    -    stats = qc.get_aggregated_stats(include_advanced=True)
    -    ```
    -    """
    -
    -    def __init__(self, *,
    -                 sample_sig: SnipeSig,
    -                 reference_sig: SnipeSig,
    -                 amplicon_sig: Optional[SnipeSig] = None,
    -                 enable_logging: bool = False,
    -                 **kwargs):
    -        # Initialize logger
    -        self.logger = logging.getLogger(self.__class__.__name__)
    +    - **Median-trimmed relative mean abundance**
    +        - Description: Ratio of median-trimmed amplicon mean abundance to median-trimmed genomic mean abundance.
    +        - Calculation:
    +          $$
    +          \text{Median-trimmed relative mean abundance} = \frac{\text{Median-trimmed Amplicon mean abundance}}{\text{Median-trimmed Genomic mean abundance}}
    +          $$
    +
    +    **Usage Example**
    +
    +    ```python
    +    qc = ReferenceQC(
    +        sample_sig=sample_signature,
    +        reference_sig=reference_signature,
    +        amplicon_sig=amplicon_signature,
    +        enable_logging=True
    +    )
    +
    +    stats = qc.get_aggregated_stats(include_advanced=True)
    +    ```
    +    """
     
    -        if enable_logging:
    -            self.logger.setLevel(logging.DEBUG)
    -            if not self.logger.hasHandlers():
    -                ch = logging.StreamHandler()
    -                ch.setLevel(logging.DEBUG)
    -                formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    -                ch.setFormatter(formatter)
    -                self.logger.addHandler(ch)
    -            self.logger.debug("Logging is enabled for ReferenceQC.")
    -        else:
    -            self.logger.setLevel(logging.CRITICAL)
    -
    -        # logging all passed parameters
    -        self.logger.debug("passed parameters:\n")
    -        for key, value in locals().items():
    -            self.logger.debug("\t%s: %s", key, value)
    -
    -
    -        # Validate that all signatures have the same ksize and scale
    -        self.logger.debug("Validating ksize and scale across signatures.")
    -        if sample_sig.ksize != reference_sig.ksize:
    -            self.logger.error("K-mer sizes do not match: sample_sig.ksize=%d vs reference_sig.ksize=%d",
    -                              sample_sig.ksize, reference_sig.ksize)
    -            raise ValueError(f"sample_sig kszie ({sample_sig.ksize}) does not match reference_sig ksize ({reference_sig.ksize}).")
    -        if sample_sig.scale != reference_sig.scale:
    -            self.logger.error("Scale values do not match: sample_sig.scale=%d vs reference_sig.scale=%d",
    -                              sample_sig.scale, reference_sig.scale)
    -            raise ValueError(f"sample_sig scale ({sample_sig.scale}) does not match reference_sig scale ({reference_sig.scale}).")
    -
    -        if amplicon_sig is not None:
    -            if amplicon_sig.ksize != sample_sig.ksize:
    -                self.logger.error("K-mer sizes do not match: amplicon_sig.ksize=%d vs sample_sig.ksize=%d",
    -                                  amplicon_sig.ksize, sample_sig.ksize)
    -                raise ValueError(f"amplicon_sig ksize ({amplicon_sig.ksize}) does not match sample_sig ksize ({sample_sig.ksize}).")
    -            if amplicon_sig.scale != sample_sig.scale:
    -                self.logger.error("Scale values do not match: amplicon_sig.scale=%d vs sample_sig.scale=%d",
    -                                  amplicon_sig.scale, sample_sig.scale)
    -                raise ValueError(f"amplicon_sig scale ({amplicon_sig.scale}) does not match sample_sig scale ({sample_sig.scale}).")
    -
    -        self.logger.debug("All signatures have matching ksize and scale.")
    -
    -
    -        # Verify signature types
    -        if sample_sig._type != SigType.SAMPLE:
    -            self.logger.error("Invalid signature type for sample_sig: %s | %s", sample_sig.sigtype, sample_sig._type)
    -            raise ValueError(f"sample_sig must be of type {SigType.SAMPLE}, got {sample_sig.sigtype}")
    -
    -        if reference_sig.sigtype != SigType.GENOME:
    -            self.logger.error("Invalid signature type for reference_sig: %s", reference_sig.sigtype)
    -            raise ValueError(f"reference_sig must be of type {SigType.GENOME}, got {reference_sig.sigtype}")
    -
    -        if amplicon_sig is not None and amplicon_sig.sigtype != SigType.AMPLICON:
    -            self.logger.error("Invalid signature type for amplicon_sig: %s", amplicon_sig.sigtype)
    -            raise ValueError(f"amplicon_sig must be of type {SigType.AMPLICON}, got {amplicon_sig.sigtype}")
    +    def __init__(self, *,
    +                 sample_sig: SnipeSig,
    +                 reference_sig: SnipeSig,
    +                 amplicon_sig: Optional[SnipeSig] = None,
    +                 enable_logging: bool = False,
    +                 **kwargs):
    +        # Initialize logger
    +        self.logger = logging.getLogger(self.__class__.__name__)
    +
    +        # Initialize split cache
    +        self._split_cache: Dict[int, List[SnipeSig]] = {}
    +        self.logger.debug("Initialized split cache.")
    +
    +
    +        if enable_logging:
    +            self.logger.setLevel(logging.DEBUG)
    +            if not self.logger.hasHandlers():
    +                ch = logging.StreamHandler()
    +                ch.setLevel(logging.DEBUG)
    +                formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    +                ch.setFormatter(formatter)
    +                self.logger.addHandler(ch)
    +            self.logger.debug("Logging is enabled for ReferenceQC.")
    +        else:
    +            self.logger.setLevel(logging.CRITICAL)
    +
    +        # logging all passed parameters
    +        self.logger.debug("passed parameters:\n")
    +        for key, value in locals().items():
    +            self.logger.debug("\t%s: %s", key, value)
    +
    +
    +        # Validate that all signatures have the same ksize and scale
    +        self.logger.debug("Validating ksize and scale across signatures.")
    +        if sample_sig.ksize != reference_sig.ksize:
    +            self.logger.error("K-mer sizes do not match: sample_sig.ksize=%d vs reference_sig.ksize=%d",
    +                              sample_sig.ksize, reference_sig.ksize)
    +            raise ValueError(f"sample_sig kszie ({sample_sig.ksize}) does not match reference_sig ksize ({reference_sig.ksize}).")
    +        if sample_sig.scale != reference_sig.scale:
    +            self.logger.error("Scale values do not match: sample_sig.scale=%d vs reference_sig.scale=%d",
    +                              sample_sig.scale, reference_sig.scale)
    +            raise ValueError(f"sample_sig scale ({sample_sig.scale}) does not match reference_sig scale ({reference_sig.scale}).")
    +
    +        if amplicon_sig is not None:
    +            if amplicon_sig.ksize != sample_sig.ksize:
    +                self.logger.error("K-mer sizes do not match: amplicon_sig.ksize=%d vs sample_sig.ksize=%d",
    +                                  amplicon_sig.ksize, sample_sig.ksize)
    +                raise ValueError(f"amplicon_sig ksize ({amplicon_sig.ksize}) does not match sample_sig ksize ({sample_sig.ksize}).")
    +            if amplicon_sig.scale != sample_sig.scale:
    +                self.logger.error("Scale values do not match: amplicon_sig.scale=%d vs sample_sig.scale=%d",
    +                                  amplicon_sig.scale, sample_sig.scale)
    +                raise ValueError(f"amplicon_sig scale ({amplicon_sig.scale}) does not match sample_sig scale ({sample_sig.scale}).")
    +
    +        self.logger.debug("All signatures have matching ksize and scale.")
     
     
    -        self.logger.debug("Chromosome specific signatures provided.")
    -        self.flag_activate_sex_metrics = True
    -
    -
    -        self.sample_sig = sample_sig
    -        self.reference_sig = reference_sig
    -        self.amplicon_sig = amplicon_sig
    -        self.enable_logging = enable_logging
    +        # Verify signature types
    +        if sample_sig._type != SigType.SAMPLE:
    +            self.logger.error("Invalid signature type for sample_sig: %s | %s", sample_sig.sigtype, sample_sig._type)
    +            raise ValueError(f"sample_sig must be of type {SigType.SAMPLE}, got {sample_sig.sigtype}")
    +
    +        if reference_sig.sigtype != SigType.GENOME:
    +            self.logger.error("Invalid signature type for reference_sig: %s", reference_sig.sigtype)
    +            raise ValueError(f"reference_sig must be of type {SigType.GENOME}, got {reference_sig.sigtype}")
     
    -        # Initialize attributes
    -        self.sample_stats: Dict[str, Any] = {}
    -        self.genome_stats: Dict[str, Any] = {}
    -        self.amplicon_stats: Dict[str, Any] = {}
    -        self.advanced_stats: Dict[str, Any] = {}
    -        self.chrs_stats: Dict[str, Dict[str, Any]] = {}
    -        self.sex_stats: Dict[str, Any] = {}
    -        self.predicted_assay_type: str = ""
    +        if amplicon_sig is not None and amplicon_sig.sigtype != SigType.AMPLICON:
    +            self.logger.error("Invalid signature type for amplicon_sig: %s", amplicon_sig.sigtype)
    +            raise ValueError(f"amplicon_sig must be of type {SigType.AMPLICON}, got {amplicon_sig.sigtype}")
    +
    +
    +        self.logger.debug("Chromosome specific signatures provided.")
    +        self.flag_activate_sex_metrics = True
    +
     
    -        # Set grey zone thresholds
    -        self.relative_total_abundance_grey_zone = [0.08092723407173719, 0.11884490500267662]
    -
    -        # Get sample statistics
    -        self.logger.debug("Getting sample statistics.")
    -        self.sample_stats_raw = self.sample_sig.get_sample_stats
    -
    -        # Get reference genome statistics
    -        self.logger.debug("Getting reference genome statistics.")
    -        self.genome_sig_stats = self.reference_sig.get_sample_stats
    -
    -        # If amplicon_sig is provided, get its stats
    -        if self.amplicon_sig is not None:
    -            self.logger.debug("Getting amplicon statistics.")
    -            self.amplicon_sig_stats = self.amplicon_sig.get_sample_stats
    +        self.sample_sig = sample_sig
    +        self.reference_sig = reference_sig
    +        self.amplicon_sig = amplicon_sig
    +        self.enable_logging = enable_logging
    +
    +        # Initialize attributes
    +        self.sample_stats: Dict[str, Any] = {}
    +        self.genome_stats: Dict[str, Any] = {}
    +        self.amplicon_stats: Dict[str, Any] = {}
    +        self.advanced_stats: Dict[str, Any] = {}
    +        self.chrs_stats: Dict[str, Dict[str, Any]] = {}
    +        self.sex_stats: Dict[str, Any] = {}
    +        self.predicted_error_contamination_index: Dict[str, Any] = {}
    +        self.vars_nonref_stats: Dict[str, Any] = {}
    +        self.predicted_assay_type: str = ""
     
    -        # Compute metrics
    -        self.logger.debug("Calculating statistics.")
    -        self._calculate_stats()
    -
    -
    -    def _calculate_stats(self):
    -        r"""
    -        Calculate the various metrics based on the sample, reference, and optional amplicon signatures.
    -        """
    -        # ============= SAMPLE STATS =============
    -        self.logger.debug("Processing sample statistics.")
    -        self.sample_stats = {
    -            "Total unique k-mers": self.sample_stats_raw["num_hashes"],
    -            "k-mer total abundance": self.sample_stats_raw["total_abundance"],
    -            "k-mer mean abundance": self.sample_stats_raw["mean_abundance"],
    -            "k-mer median abundance": self.sample_stats_raw["median_abundance"],
    -            "num_singletons": self.sample_stats_raw["num_singletons"],
    -            "ksize": self.sample_stats_raw["ksize"],
    -            "scale": self.sample_stats_raw["scale"],
    -            "name": self.sample_stats_raw["name"],
    -            "filename": self.sample_stats_raw["filename"],
    -        }
    -
    -        # ============= GENOME STATS =============
    -        self.logger.debug("Calculating genome statistics.")
    -        # Compute intersection of sample and reference genome
    -        self.logger.debug("Type of sample_sig: %s | Type of reference_sig: %s", self.sample_sig.sigtype, self.reference_sig.sigtype)
    -        sample_genome = self.sample_sig & self.reference_sig
    -        # Get stats (call get_sample_stats only once)
    -
    -        # Log hashes and abundances for both sample and reference
    -        self.logger.debug("Sample hashes: %s", self.sample_sig.hashes)
    -        self.logger.debug("Sample abundances: %s", self.sample_sig.abundances)
    -        self.logger.debug("Reference hashes: %s", self.reference_sig.hashes)
    -        self.logger.debug("Reference abundances: %s", self.reference_sig.abundances)
    -
    -        sample_genome_stats = sample_genome.get_sample_stats
    -
    -        self.genome_stats = {
    -            "Genomic unique k-mers": sample_genome_stats["num_hashes"],
    -            "Genomic k-mers total abundance": sample_genome_stats["total_abundance"],
    -            "Genomic k-mers mean abundance": sample_genome_stats["mean_abundance"],
    -            "Genomic k-mers median abundance": sample_genome_stats["median_abundance"],
    -            # Genome coverage index
    -            "Genome coverage index": (
    -                sample_genome_stats["num_hashes"] / self.genome_sig_stats["num_hashes"]
    -                if self.genome_sig_stats["num_hashes"] > 0 else 0
    -            ),
    -            # Mapping index
    -            "Mapping index": (
    -                sample_genome_stats["total_abundance"] / self.sample_stats["k-mer total abundance"]
    -                if self.sample_stats["k-mer total abundance"] > 0 else 0
    -            ),
    -        }
    -
    -        # ============= AMPLICON STATS =============
    -        if self.amplicon_sig is not None:
    -            self.logger.debug("Calculating amplicon statistics.")
    -            # Compute intersection of sample and amplicon
    -            sample_amplicon = self.sample_sig & self.amplicon_sig
    -            # Get stats (call get_sample_stats only once)
    -            sample_amplicon_stats = sample_amplicon.get_sample_stats
    -
    -            self.amplicon_stats = {
    -                "Amplicon unique k-mers": sample_amplicon_stats["num_hashes"],
    -                "Amplicon k-mers total abundance": sample_amplicon_stats["total_abundance"],
    -                "Amplicon k-mers mean abundance": sample_amplicon_stats["mean_abundance"],
    -                "Amplicon k-mers median abundance": sample_amplicon_stats["median_abundance"],
    -                # Amplicon coverage index
    -                "Amplicon coverage index": (
    -                    sample_amplicon_stats["num_hashes"] / self.amplicon_sig_stats["num_hashes"]
    -                    if self.amplicon_sig_stats["num_hashes"] > 0 else 0
    -                ),
    -            }
    -
    -            # Relative metrics
    -            self.amplicon_stats["Relative total abundance"] = (
    -                self.amplicon_stats["Amplicon k-mers total abundance"] / self.genome_stats["Genomic k-mers total abundance"]
    -                if self.genome_stats["Genomic k-mers total abundance"] > 0 else 0
    -            )
    -            self.amplicon_stats["Relative coverage"] = (
    -                self.amplicon_stats["Amplicon coverage index"] / self.genome_stats["Genome coverage index"]
    -                if self.genome_stats["Genome coverage index"] > 0 else 0
    -            )
    -
    -            # Predicted assay type
    -            relative_total_abundance = self.amplicon_stats["Relative total abundance"]
    -            if relative_total_abundance <= self.relative_total_abundance_grey_zone[0]:
    -                self.predicted_assay_type = "WGS"
    -            elif relative_total_abundance >= self.relative_total_abundance_grey_zone[1]:
    -                self.predicted_assay_type = "WXS"
    -            else:
    -                # Assign based on the closest threshold
    -                distance_to_wgs = abs(relative_total_abundance - self.relative_total_abundance_grey_zone[0])
    -                distance_to_wxs = abs(relative_total_abundance - self.relative_total_abundance_grey_zone[1])
    -                self.predicted_assay_type = "WGS" if distance_to_wgs < distance_to_wxs else "WXS"
    -            self.logger.debug("Predicted assay type: %s", self.predicted_assay_type)
    -
    -    def get_aggregated_stats(self, include_advanced: bool = False) -> Dict[str, Any]:
    -        r"""
    -        Retrieve aggregated statistics from the quality control analysis.
    -
    -        **Parameters**
    -
    -        - `include_advanced (bool)`:  
    -          If set to `True`, includes advanced metrics in the aggregated statistics.
    -
    -        **Returns**
    -
    -        - `Dict[str, Any]`:  
    -          A dictionary containing the aggregated statistics, which may include:
    -          - Sample statistics
    -          - Genome statistics
    -          - Amplicon statistics (if provided)
    -          - Predicted assay type
    -          - Advanced statistics (if `include_advanced` is `True`)
    -        """
    -        aggregated_stats: Dict[str, Any] = {}
    -        # Include sample_stats
    -        aggregated_stats.update(self.sample_stats)
    -        # Include genome_stats
    -        aggregated_stats.update(self.genome_stats)
    -        # Include amplicon_stats if available
    -        if self.amplicon_sig is not None:
    -            self.logger.debug("While aggregating stats; amplicon signature provided.")
    -            aggregated_stats.update(self.amplicon_stats)
    -            aggregated_stats["Predicted Assay Type"] = self.predicted_assay_type
    -
    -        if self.chrs_stats:
    -            aggregated_stats.update(self.chrs_stats)
    -
    -        if self.sex_stats:
    -            aggregated_stats.update(self.sex_stats)
    -
    -        # Include advanced_stats if requested
    -        if include_advanced:
    -            self._calculate_advanced_stats()
    -            aggregated_stats.update(self.advanced_stats)
    +        # Set grey zone thresholds
    +        self.relative_total_abundance_grey_zone = [0.08092723407173719, 0.11884490500267662]
    +
    +        # Get sample statistics
    +        self.logger.debug("Getting sample statistics.")
    +        self.sample_stats_raw = self.sample_sig.get_sample_stats
    +
    +        # Get reference genome statistics
    +        self.logger.debug("Getting reference genome statistics.")
    +        self.genome_sig_stats = self.reference_sig.get_sample_stats
    +
    +        # If amplicon_sig is provided, get its stats
    +        if self.amplicon_sig is not None:
    +            self.logger.debug("Getting amplicon statistics.")
    +            self.amplicon_sig_stats = self.amplicon_sig.get_sample_stats
    +
    +        # Compute metrics
    +        self.logger.debug("Calculating statistics.")
    +        self._calculate_stats()
    +
    +
    +    def _calculate_stats(self):
    +        r"""
    +        Calculate the various metrics based on the sample, reference, and optional amplicon signatures.
    +        """
    +        # ============= SAMPLE STATS =============
    +        self.logger.debug("Processing sample statistics.")
    +        self.sample_stats = {
    +            "Total unique k-mers": self.sample_stats_raw["num_hashes"],
    +            "k-mer total abundance": self.sample_stats_raw["total_abundance"],
    +            "k-mer mean abundance": self.sample_stats_raw["mean_abundance"],
    +            "k-mer median abundance": self.sample_stats_raw["median_abundance"],
    +            "num_singletons": self.sample_stats_raw["num_singletons"],
    +            "ksize": self.sample_stats_raw["ksize"],
    +            "scale": self.sample_stats_raw["scale"],
    +            "name": self.sample_stats_raw["name"],
    +            "filename": self.sample_stats_raw["filename"],
    +        }
    +
    +        # ============= GENOME STATS =============
    +        self.logger.debug("Calculating genome statistics.")
    +        # Compute intersection of sample and reference genome
    +        self.logger.debug("Type of sample_sig: %s | Type of reference_sig: %s", self.sample_sig.sigtype, self.reference_sig.sigtype)
    +        sample_genome = self.sample_sig & self.reference_sig
    +        # Get stats (call get_sample_stats only once)
    +
    +        # Log hashes and abundances for both sample and reference
    +        # self.logger.debug("Sample hashes: %s", self.sample_sig.hashes)
    +        # self.logger.debug("Sample abundances: %s", self.sample_sig.abundances)
    +        # self.logger.debug("Reference hashes: %s", self.reference_sig.hashes)
    +        # self.logger.debug("Reference abundances: %s", self.reference_sig.abundances)
    +
    +        sample_genome_stats = sample_genome.get_sample_stats
    +
    +        self.genome_stats = {
    +            "Genomic unique k-mers": sample_genome_stats["num_hashes"],
    +            "Genomic k-mers total abundance": sample_genome_stats["total_abundance"],
    +            "Genomic k-mers mean abundance": sample_genome_stats["mean_abundance"],
    +            "Genomic k-mers median abundance": sample_genome_stats["median_abundance"],
    +            # Genome coverage index
    +            "Genome coverage index": (
    +                sample_genome_stats["num_hashes"] / self.genome_sig_stats["num_hashes"]
    +                if self.genome_sig_stats["num_hashes"] > 0 else 0
    +            ),
    +            # Mapping index
    +            "Mapping index": (
    +                sample_genome_stats["total_abundance"] / self.sample_stats["k-mer total abundance"]
    +                if self.sample_stats["k-mer total abundance"] > 0 else 0
    +            ),
    +        }
    +
    +        # ============= AMPLICON STATS =============
    +        if self.amplicon_sig is not None:
    +            self.logger.debug("Calculating amplicon statistics.")
    +            # Compute intersection of sample and amplicon
    +            sample_amplicon = self.sample_sig & self.amplicon_sig
    +            # Get stats (call get_sample_stats only once)
    +            sample_amplicon_stats = sample_amplicon.get_sample_stats
    +
    +            self.amplicon_stats = {
    +                "Amplicon unique k-mers": sample_amplicon_stats["num_hashes"],
    +                "Amplicon k-mers total abundance": sample_amplicon_stats["total_abundance"],
    +                "Amplicon k-mers mean abundance": sample_amplicon_stats["mean_abundance"],
    +                "Amplicon k-mers median abundance": sample_amplicon_stats["median_abundance"],
    +                # Amplicon coverage index
    +                "Amplicon coverage index": (
    +                    sample_amplicon_stats["num_hashes"] / self.amplicon_sig_stats["num_hashes"]
    +                    if self.amplicon_sig_stats["num_hashes"] > 0 else 0
    +                ),
    +            }
    +
    +            # ============= RELATIVE STATS =============
    +            self.amplicon_stats["Relative total abundance"] = (
    +                self.amplicon_stats["Amplicon k-mers total abundance"] / self.genome_stats["Genomic k-mers total abundance"]
    +                if self.genome_stats["Genomic k-mers total abundance"] > 0 else 0
    +            )
    +            self.amplicon_stats["Relative coverage"] = (
    +                self.amplicon_stats["Amplicon coverage index"] / self.genome_stats["Genome coverage index"]
    +                if self.genome_stats["Genome coverage index"] > 0 else 0
    +            )
    +
    +            relative_total_abundance = self.amplicon_stats["Relative total abundance"]
    +            if relative_total_abundance <= self.relative_total_abundance_grey_zone[0]:
    +                self.predicted_assay_type = "WGS"
    +            elif relative_total_abundance >= self.relative_total_abundance_grey_zone[1]:
    +                self.predicted_assay_type = "WXS"
    +            else:
    +                # Assign based on the closest threshold
    +                distance_to_wgs = abs(relative_total_abundance - self.relative_total_abundance_grey_zone[0])
    +                distance_to_wxs = abs(relative_total_abundance - self.relative_total_abundance_grey_zone[1])
    +                self.predicted_assay_type = "WGS" if distance_to_wgs < distance_to_wxs else "WXS"
    +
    +
    +            self.logger.debug("Predicted assay type: %s", self.predicted_assay_type)
    +
    +        self.logger.debug("Calculuating error and contamination indices.")
    +        try:
    +            sample_nonref = self.sample_sig - self.reference_sig
    +            sample_nonref_singletons = sample_nonref.count_singletons()
    +            sample_nonref_non_singletons = sample_nonref.total_abundance - sample_nonref_singletons
    +            sample_total_abundance = self.sample_sig.total_abundance
    +
    +            predicted_error_index = sample_nonref_singletons / sample_total_abundance
    +            predicted_contamination_index = sample_nonref_non_singletons / sample_total_abundance
    +
    +            # predict error and contamination index
    +            self.predicted_error_contamination_index["Predicted contamination index"] = predicted_contamination_index
    +            self.predicted_error_contamination_index["Sequencing errors index"] = predicted_error_index
    +        # except zero division error
    +        except ZeroDivisionError:
    +            self.logger.error("Please check the sample signature, it seems to be empty.")
    +
    +
    +    def get_aggregated_stats(self, include_advanced: bool = False) -> Dict[str, Any]:
    +        r"""
    +        Retrieve aggregated statistics from the quality control analysis.
    +
    +        **Parameters**
     
    -        return aggregated_stats
    -
    -    def _calculate_advanced_stats(self):
    -        r"""
    -        Calculate advanced statistics, such as median-trimmed metrics.
    -        """
    -        self.logger.debug("Calculating advanced statistics.")
    -
    -        # Copy sample signature to avoid modifying the original
    -        median_trimmed_sample_sig = self.sample_sig.copy()
    -        # Trim below median
    -        median_trimmed_sample_sig.trim_below_median()
    -        # Get stats
    -        median_trimmed_sample_stats = median_trimmed_sample_sig.get_sample_stats
    -        self.advanced_stats.update({
    -            "Median-trimmed unique k-mers": median_trimmed_sample_stats["num_hashes"],
    -            "Median-trimmed total abundance": median_trimmed_sample_stats["total_abundance"],
    -            "Median-trimmed mean abundance": median_trimmed_sample_stats["mean_abundance"],
    -            "Median-trimmed median abundance": median_trimmed_sample_stats["median_abundance"],
    -        })
    -
    -        # Genome stats for median-trimmed sample
    -        median_trimmed_sample_genome = median_trimmed_sample_sig & self.reference_sig
    -        median_trimmed_sample_genome_stats = median_trimmed_sample_genome.get_sample_stats
    -        self.advanced_stats.update({
    -            "Median-trimmed Genomic unique k-mers": median_trimmed_sample_genome_stats["num_hashes"],
    -            "Median-trimmed Genomic total abundance": median_trimmed_sample_genome_stats["total_abundance"],
    -            "Median-trimmed Genomic mean abundance": median_trimmed_sample_genome_stats["mean_abundance"],
    -            "Median-trimmed Genomic median abundance": median_trimmed_sample_genome_stats["median_abundance"],
    -            "Median-trimmed Genome coverage index": (
    -                median_trimmed_sample_genome_stats["num_hashes"] / self.genome_sig_stats["num_hashes"]
    -                if self.genome_sig_stats["num_hashes"] > 0 else 0
    -            ),
    -        })
    -
    -        if self.amplicon_sig is not None:
    -            self.logger.debug("Calculating advanced amplicon statistics.")
    -            # Amplicon stats for median-trimmed sample
    -            median_trimmed_sample_amplicon = median_trimmed_sample_sig & self.amplicon_sig
    -            median_trimmed_sample_amplicon_stats = median_trimmed_sample_amplicon.get_sample_stats
    -            self.advanced_stats.update({
    -                "Median-trimmed Amplicon unique k-mers": median_trimmed_sample_amplicon_stats["num_hashes"],
    -                "Median-trimmed Amplicon total abundance": median_trimmed_sample_amplicon_stats["total_abundance"],
    -                "Median-trimmed Amplicon mean abundance": median_trimmed_sample_amplicon_stats["mean_abundance"],
    -                "Median-trimmed Amplicon median abundance": median_trimmed_sample_amplicon_stats["median_abundance"],
    -                "Median-trimmed Amplicon coverage index": (
    -                    median_trimmed_sample_amplicon_stats["num_hashes"] / self.amplicon_sig_stats["num_hashes"]
    -                    if self.amplicon_sig_stats["num_hashes"] > 0 else 0
    -                ),
    -            })
    -            # Additional advanced relative metrics
    -            self.logger.debug("Calculating advanced relative metrics.")
    -            self.amplicon_stats["Median-trimmed relative coverage"] = (
    -                self.advanced_stats["Median-trimmed Amplicon coverage index"] / self.advanced_stats["Median-trimmed Genome coverage index"]
    -                if self.advanced_stats["Median-trimmed Genome coverage index"] > 0 else 0
    -            )
    -            self.amplicon_stats["Median-trimmed relative mean abundance"] = (
    -                self.advanced_stats["Median-trimmed Amplicon mean abundance"] / self.advanced_stats["Median-trimmed Genomic mean abundance"]
    -                if self.advanced_stats["Median-trimmed Genomic mean abundance"] > 0 else 0
    -            )
    -            # Update amplicon_stats with advanced metrics
    -            self.amplicon_stats.update({
    -                "Median-trimmed relative coverage": self.amplicon_stats["Median-trimmed relative coverage"],
    -                "Median-trimmed relative mean abundance": self.amplicon_stats["Median-trimmed relative mean abundance"],
    -            })
    -
    -            self.advanced_stats.update(self.amplicon_stats)
    -
    -    def _calculate_advanced_stats(self):
    -        r"""
    -        Calculate advanced statistics, such as median-trimmed metrics.
    -        """
    -        self.logger.debug("Calculating advanced statistics.")
    -
    -        # Copy sample signature to avoid modifying the original
    -        median_trimmed_sample_sig = self.sample_sig.copy()
    -        # Trim below median
    -        median_trimmed_sample_sig.trim_below_median()
    -        # Get stats
    -        median_trimmed_sample_stats = median_trimmed_sample_sig.get_sample_stats
    -        self.advanced_stats.update({
    -            "Median-trimmed unique k-mers": median_trimmed_sample_stats["num_hashes"],
    -            "Median-trimmed total abundance": median_trimmed_sample_stats["total_abundance"],
    -            "Median-trimmed mean abundance": median_trimmed_sample_stats["mean_abundance"],
    -            "Median-trimmed median abundance": median_trimmed_sample_stats["median_abundance"],
    -        })
    -
    -        # Genome stats for median-trimmed sample
    -        median_trimmed_sample_genome = median_trimmed_sample_sig & self.reference_sig
    -        median_trimmed_sample_genome_stats = median_trimmed_sample_genome.get_sample_stats
    -        self.advanced_stats.update({
    -            "Median-trimmed Genomic unique k-mers": median_trimmed_sample_genome_stats["num_hashes"],
    -            "Median-trimmed Genomic total abundance": median_trimmed_sample_genome_stats["total_abundance"],
    -            "Median-trimmed Genomic mean abundance": median_trimmed_sample_genome_stats["mean_abundance"],
    -            "Median-trimmed Genomic median abundance": median_trimmed_sample_genome_stats["median_abundance"],
    -            "Median-trimmed Genome coverage index": (
    -                median_trimmed_sample_genome_stats["num_hashes"] / self.genome_sig_stats["num_hashes"]
    -                if self.genome_sig_stats["num_hashes"] > 0 else 0
    -            ),
    -        })
    -
    -        if self.amplicon_sig is not None:
    -            self.logger.debug("Calculating advanced amplicon statistics.")
    -            # Amplicon stats for median-trimmed sample
    -            median_trimmed_sample_amplicon = median_trimmed_sample_sig & self.amplicon_sig
    -            median_trimmed_sample_amplicon_stats = median_trimmed_sample_amplicon.get_sample_stats
    -            self.advanced_stats.update({
    -                "Median-trimmed Amplicon unique k-mers": median_trimmed_sample_amplicon_stats["num_hashes"],
    -                "Median-trimmed Amplicon total abundance": median_trimmed_sample_amplicon_stats["total_abundance"],
    -                "Median-trimmed Amplicon mean abundance": median_trimmed_sample_amplicon_stats["mean_abundance"],
    -                "Median-trimmed Amplicon median abundance": median_trimmed_sample_amplicon_stats["median_abundance"],
    -                "Median-trimmed Amplicon coverage index": (
    -                    median_trimmed_sample_amplicon_stats["num_hashes"] / self.amplicon_sig_stats["num_hashes"]
    -                    if self.amplicon_sig_stats["num_hashes"] > 0 else 0
    -                ),
    -            })
    -            # Additional advanced relative metrics
    -            self.logger.debug("Calculating advanced relative metrics.")
    -            self.amplicon_stats["Median-trimmed relative coverage"] = (
    -                self.advanced_stats["Median-trimmed Amplicon coverage index"] / self.advanced_stats["Median-trimmed Genome coverage index"]
    -                if self.advanced_stats["Median-trimmed Genome coverage index"] > 0 else 0
    -            )
    -            self.amplicon_stats["Median-trimmed relative mean abundance"] = (
    -                self.advanced_stats["Median-trimmed Amplicon mean abundance"] / self.advanced_stats["Median-trimmed Genomic mean abundance"]
    -                if self.advanced_stats["Median-trimmed Genomic mean abundance"] > 0 else 0
    -            )
    -            # Update amplicon_stats with advanced metrics
    -            self.amplicon_stats.update({
    -                "Median-trimmed relative coverage": self.amplicon_stats["Median-trimmed relative coverage"],
    -                "Median-trimmed relative mean abundance": self.amplicon_stats["Median-trimmed relative mean abundance"],
    -            })
    -
    -            self.advanced_stats.update(self.amplicon_stats)
    -
    -    def split_sig_randomly(self, n: int) -> List[SnipeSig]:
    -        r"""
    -        Split the sample signature into `n` random parts based on abundances.
    -
    -        This method distributes the k-mers of the sample signature into `n` parts using a multinomial distribution
    -        based on their abundances. Each k-mer's abundance is split across the `n` parts proportionally.
    -
    -        **Mathematical Explanation**:
    -
    -        For each k-mer with hash \( h \) and abundance \( a_h \), its abundance is distributed into \( n \) parts
    -        according to a multinomial distribution. Specifically, the abundance in each part \( i \) is given by:
    -
    -        $$
    -        a_{h,i} \sim \text{Multinomial}(a_h, \frac{1}{n}, \frac{1}{n}, \dots, \frac{1}{n})
    -        $$
    -
    -        Where:
    -        - \( a_{h,i} \) is the abundance of k-mer \( h \) in part \( i \).
    -        - Each \( a_{h,i} \) is a non-negative integer such that \( \sum_{i=1}^{n} a_{h,i} = a_h \).
    -
    -        **Parameters**:
    -
    -        - `n` (`int`): Number of parts to split into.
    -
    -        **Returns**:
    -
    -        - `List[SnipeSig]`:  
    -          List of `SnipeSig` instances representing the split parts.
    -
    -        **Usage Example**:
    -
    -        ```python
    -        split_sigs = qc.split_sig_randomly(n=3)
    -        for idx, sig in enumerate(split_sigs, 1):
    -            print(f"Signature part {idx}: {sig}")
    -        ```
    -        """
    -        self.logger.debug("Splitting sample signature into %d random parts.", n)
    -        # Get k-mers and abundances
    -        hash_to_abund = dict(zip(self.sample_sig.hashes, self.sample_sig.abundances))
    -        random_split_sigs = self.distribute_kmers_random(hash_to_abund, n)
    -        split_sigs = [
    -            SnipeSig.create_from_hashes_abundances(
    -                hashes=np.array(list(kmer_dict.keys()), dtype=np.uint64),
    -                abundances=np.array(list(kmer_dict.values()), dtype=np.uint32),
    -                ksize=self.sample_sig.ksize,
    -                scale=self.sample_sig.scale,
    -                name=f"{self.sample_sig.name}_part_{i+1}",
    -                filename=self.sample_sig.filename,
    -                enable_logging=self.enable_logging
    -            )
    -            for i, kmer_dict in enumerate(random_split_sigs)
    -        ]
    -        return split_sigs
    -
    -    @staticmethod
    -    def distribute_kmers_random(original_dict: Dict[int, int], n: int) -> List[Dict[int, int]]:
    -        r"""
    -        Distribute the k-mers randomly into `n` parts based on their abundances.
    -
    -        This helper method performs the actual distribution of k-mers using a multinomial distribution.
    -
    -        **Mathematical Explanation**:
    -
    -        Given a k-mer with hash \( h \) and abundance \( a_h \), the distribution of its abundance across \( n \)
    -        parts is modeled as:
    +        - `include_advanced (bool)`:  
    +          If set to `True`, includes advanced metrics in the aggregated statistics.
    +
    +        **Returns**
    +
    +        - `Dict[str, Any]`:  
    +          A dictionary containing the aggregated statistics, which may include:
    +          - Sample statistics
    +          - Genome statistics
    +          - Amplicon statistics (if provided)
    +          - Predicted assay type
    +          - Advanced statistics (if `include_advanced` is `True`)
    +        """
    +        aggregated_stats: Dict[str, Any] = {}
    +        # Include sample_stats
    +        aggregated_stats.update(self.sample_stats)
    +        # Include genome_stats
    +        aggregated_stats.update(self.genome_stats)
    +        # Include amplicon_stats if available
    +        if self.amplicon_sig is not None:
    +            self.logger.debug("While aggregating stats; amplicon signature provided.")
    +            aggregated_stats.update(self.amplicon_stats)
    +            aggregated_stats["Predicted Assay Type"] = self.predicted_assay_type
    +
    +        if self.chrs_stats:
    +            aggregated_stats.update(self.chrs_stats)
    +
    +        if self.sex_stats:
    +            aggregated_stats.update(self.sex_stats)
    +
    +        if self.vars_nonref_stats:
    +            aggregated_stats.update(self.vars_nonref_stats)
    +
    +        # Include advanced_stats if requested
    +        if include_advanced:
    +            self._calculate_advanced_stats()
    +            aggregated_stats.update(self.advanced_stats)
    +
    +        if self.predicted_error_contamination_index:
    +            aggregated_stats.update(self.predicted_error_contamination_index)
    +
    +        return aggregated_stats
    +
    +    def _calculate_advanced_stats(self):
    +        r"""
    +        Calculate advanced statistics, such as median-trimmed metrics.
    +        """
    +        self.logger.debug("Calculating advanced statistics.")
    +
    +        # Copy sample signature to avoid modifying the original
    +        median_trimmed_sample_sig = self.sample_sig.copy()
    +        # Trim below median
    +        median_trimmed_sample_sig.trim_below_median()
    +        # Get stats
    +        median_trimmed_sample_stats = median_trimmed_sample_sig.get_sample_stats
    +        self.advanced_stats.update({
    +            "Median-trimmed unique k-mers": median_trimmed_sample_stats["num_hashes"],
    +            "Median-trimmed total abundance": median_trimmed_sample_stats["total_abundance"],
    +            "Median-trimmed mean abundance": median_trimmed_sample_stats["mean_abundance"],
    +            "Median-trimmed median abundance": median_trimmed_sample_stats["median_abundance"],
    +        })
    +
    +        # Genome stats for median-trimmed sample
    +        median_trimmed_sample_genome = median_trimmed_sample_sig & self.reference_sig
    +        median_trimmed_sample_genome_stats = median_trimmed_sample_genome.get_sample_stats
    +        self.advanced_stats.update({
    +            "Median-trimmed Genomic unique k-mers": median_trimmed_sample_genome_stats["num_hashes"],
    +            "Median-trimmed Genomic total abundance": median_trimmed_sample_genome_stats["total_abundance"],
    +            "Median-trimmed Genomic mean abundance": median_trimmed_sample_genome_stats["mean_abundance"],
    +            "Median-trimmed Genomic median abundance": median_trimmed_sample_genome_stats["median_abundance"],
    +            "Median-trimmed Genome coverage index": (
    +                median_trimmed_sample_genome_stats["num_hashes"] / self.genome_sig_stats["num_hashes"]
    +                if self.genome_sig_stats["num_hashes"] > 0 else 0
    +            ),
    +        })
    +
    +        if self.amplicon_sig is not None:
    +            self.logger.debug("Calculating advanced amplicon statistics.")
    +            # Amplicon stats for median-trimmed sample
    +            median_trimmed_sample_amplicon = median_trimmed_sample_sig & self.amplicon_sig
    +            median_trimmed_sample_amplicon_stats = median_trimmed_sample_amplicon.get_sample_stats
    +            self.advanced_stats.update({
    +                "Median-trimmed Amplicon unique k-mers": median_trimmed_sample_amplicon_stats["num_hashes"],
    +                "Median-trimmed Amplicon total abundance": median_trimmed_sample_amplicon_stats["total_abundance"],
    +                "Median-trimmed Amplicon mean abundance": median_trimmed_sample_amplicon_stats["mean_abundance"],
    +                "Median-trimmed Amplicon median abundance": median_trimmed_sample_amplicon_stats["median_abundance"],
    +                "Median-trimmed Amplicon coverage index": (
    +                    median_trimmed_sample_amplicon_stats["num_hashes"] / self.amplicon_sig_stats["num_hashes"]
    +                    if self.amplicon_sig_stats["num_hashes"] > 0 else 0
    +                ),
    +            })
    +            # Additional advanced relative metrics
    +            self.logger.debug("Calculating advanced relative metrics.")
    +            self.amplicon_stats["Median-trimmed relative coverage"] = (
    +                self.advanced_stats["Median-trimmed Amplicon coverage index"] / self.advanced_stats["Median-trimmed Genome coverage index"]
    +                if self.advanced_stats["Median-trimmed Genome coverage index"] > 0 else 0
    +            )
    +            self.amplicon_stats["Median-trimmed relative mean abundance"] = (
    +                self.advanced_stats["Median-trimmed Amplicon mean abundance"] / self.advanced_stats["Median-trimmed Genomic mean abundance"]
    +                if self.advanced_stats["Median-trimmed Genomic mean abundance"] > 0 else 0
    +            )
    +            # Update amplicon_stats with advanced metrics
    +            self.amplicon_stats.update({
    +                "Median-trimmed relative coverage": self.amplicon_stats["Median-trimmed relative coverage"],
    +                "Median-trimmed relative mean abundance": self.amplicon_stats["Median-trimmed relative mean abundance"],
    +            })
    +
    +            self.advanced_stats.update(self.amplicon_stats)
    +
    +    def _calculate_advanced_stats(self):
    +        r"""
    +        Calculate advanced statistics, such as median-trimmed metrics.
    +        """
    +        self.logger.debug("Calculating advanced statistics.")
    +
    +        # Copy sample signature to avoid modifying the original
    +        median_trimmed_sample_sig = self.sample_sig.copy()
    +        # Trim below median
    +        median_trimmed_sample_sig.trim_below_median()
    +        # Get stats
    +        median_trimmed_sample_stats = median_trimmed_sample_sig.get_sample_stats
    +        self.advanced_stats.update({
    +            "Median-trimmed unique k-mers": median_trimmed_sample_stats["num_hashes"],
    +            "Median-trimmed total abundance": median_trimmed_sample_stats["total_abundance"],
    +            "Median-trimmed mean abundance": median_trimmed_sample_stats["mean_abundance"],
    +            "Median-trimmed median abundance": median_trimmed_sample_stats["median_abundance"],
    +        })
    +
    +        # Genome stats for median-trimmed sample
    +        median_trimmed_sample_genome = median_trimmed_sample_sig & self.reference_sig
    +        median_trimmed_sample_genome_stats = median_trimmed_sample_genome.get_sample_stats
    +        self.advanced_stats.update({
    +            "Median-trimmed Genomic unique k-mers": median_trimmed_sample_genome_stats["num_hashes"],
    +            "Median-trimmed Genomic total abundance": median_trimmed_sample_genome_stats["total_abundance"],
    +            "Median-trimmed Genomic mean abundance": median_trimmed_sample_genome_stats["mean_abundance"],
    +            "Median-trimmed Genomic median abundance": median_trimmed_sample_genome_stats["median_abundance"],
    +            "Median-trimmed Genome coverage index": (
    +                median_trimmed_sample_genome_stats["num_hashes"] / self.genome_sig_stats["num_hashes"]
    +                if self.genome_sig_stats["num_hashes"] > 0 else 0
    +            ),
    +        })
    +
    +        if self.amplicon_sig is not None:
    +            self.logger.debug("Calculating advanced amplicon statistics.")
    +            # Amplicon stats for median-trimmed sample
    +            median_trimmed_sample_amplicon = median_trimmed_sample_sig & self.amplicon_sig
    +            median_trimmed_sample_amplicon_stats = median_trimmed_sample_amplicon.get_sample_stats
    +            self.advanced_stats.update({
    +                "Median-trimmed Amplicon unique k-mers": median_trimmed_sample_amplicon_stats["num_hashes"],
    +                "Median-trimmed Amplicon total abundance": median_trimmed_sample_amplicon_stats["total_abundance"],
    +                "Median-trimmed Amplicon mean abundance": median_trimmed_sample_amplicon_stats["mean_abundance"],
    +                "Median-trimmed Amplicon median abundance": median_trimmed_sample_amplicon_stats["median_abundance"],
    +                "Median-trimmed Amplicon coverage index": (
    +                    median_trimmed_sample_amplicon_stats["num_hashes"] / self.amplicon_sig_stats["num_hashes"]
    +                    if self.amplicon_sig_stats["num_hashes"] > 0 else 0
    +                ),
    +            })
    +            # Additional advanced relative metrics
    +            self.logger.debug("Calculating advanced relative metrics.")
    +            self.amplicon_stats["Median-trimmed relative coverage"] = (
    +                self.advanced_stats["Median-trimmed Amplicon coverage index"] / self.advanced_stats["Median-trimmed Genome coverage index"]
    +                if self.advanced_stats["Median-trimmed Genome coverage index"] > 0 else 0
    +            )
    +            self.amplicon_stats["Median-trimmed relative mean abundance"] = (
    +                self.advanced_stats["Median-trimmed Amplicon mean abundance"] / self.advanced_stats["Median-trimmed Genomic mean abundance"]
    +                if self.advanced_stats["Median-trimmed Genomic mean abundance"] > 0 else 0
    +            )
    +            # Update amplicon_stats with advanced metrics
    +            self.amplicon_stats.update({
    +                "Median-trimmed relative coverage": self.amplicon_stats["Median-trimmed relative coverage"],
    +                "Median-trimmed relative mean abundance": self.amplicon_stats["Median-trimmed relative mean abundance"],
    +            })
    +
    +            self.advanced_stats.update(self.amplicon_stats)
    +
    +    def split_sig_randomly(self, n: int) -> List[SnipeSig]:
    +        r"""
    +        Split the sample signature into `n` random parts based on abundances.
    +
    +        This method distributes the k-mers of the sample signature into `n` parts using a multinomial distribution
    +        based on their abundances. Each k-mer's abundance is split across the `n` parts proportionally.
    +
    +        **Mathematical Explanation**:
    +
    +        For each k-mer with hash \( h \) and abundance \( a_h \), its abundance is distributed into \( n \) parts
    +        according to a multinomial distribution. Specifically, the abundance in each part \( i \) is given by:
    +
    +        $$
    +        a_{h,i} \sim \text{Multinomial}(a_h, \frac{1}{n}, \frac{1}{n}, \dots, \frac{1}{n})
    +        $$
    +
    +        Where:
    +        - \( a_{h,i} \) is the abundance of k-mer \( h \) in part \( i \).
    +        - Each \( a_{h,i} \) is a non-negative integer such that \( \sum_{i=1}^{n} a_{h,i} = a_h \).
    +
    +        **Parameters**:
    +
    +        - `n` (`int`): Number of parts to split into.
    +
    +        **Returns**:
     
    -        $$
    -        a_{h,1}, a_{h,2}, \dots, a_{h,n} \sim \text{Multinomial}(a_h, p_1, p_2, \dots, p_n)
    -        $$
    -
    -        Where \( p_i = \frac{1}{n} \) for all \( i \).
    -
    -        **Parameters**:
    -
    -        - `original_dict` (`Dict[int, int]`):  
    -          Dictionary mapping k-mer hashes to their abundances.
    -        - `n` (`int`): Number of parts to split into.
    -
    -        **Returns**:
    -
    -        - `List[Dict[int, int]]`:  
    -          List of dictionaries, each mapping k-mer hashes to their abundances in that part.
    -
    -        **Usage Example**:
    +        - `List[SnipeSig]`:  
    +          List of `SnipeSig` instances representing the split parts.
    +
    +        **Usage Example**:
    +
    +        ```python
    +        split_sigs = qc.split_sig_randomly(n=3)
    +        for idx, sig in enumerate(split_sigs, 1):
    +            print(f"Signature part {idx}: {sig}")
    +        ```
    +        """
    +        self.logger.debug("Attempting to split sample signature into %d random parts.", n)
    +
    +        # Check if the split for this n is already cached
    +        if n in self._split_cache:
    +            self.logger.debug("Using cached split signatures for n=%d.", n)
    +            # Return deep copies to prevent external modifications
    +            return [sig.copy() for sig in self._split_cache[n]]
     
    -        ```python
    -        distributed = ReferenceQC.distribute_kmers_random(hash_to_abund, n=3)
    -        ```
    -        """
    -        # Initialize the resulting dictionaries
    -        distributed_dicts = [{} for _ in range(n)]
    -
    -        # For each k-mer and its abundance
    -        for kmer_hash, abundance in original_dict.items():
    -            if abundance == 0:
    -                continue  # Skip zero abundances
    -            # Generate multinomial split of abundance
    -            counts = np.random.multinomial(abundance, [1.0 / n] * n)
    -            # Update each dictionary
    -            for i in range(n):
    -                if counts[i] > 0:
    -                    distributed_dicts[i][kmer_hash] = counts[i]
    -
    -        return distributed_dicts
    -
    -    def calculate_coverage_vs_depth(self, n: int = 30) -> List[Dict[str, Any]]:
    -        r"""
    -        Calculate cumulative coverage index vs cumulative sequencing depth.
    -
    -        This method simulates incremental sequencing by splitting the sample signature into `n` parts and
    -        calculating the cumulative coverage index at each step. It helps in understanding how coverage
    -        improves with increased sequencing depth.
    +        self.logger.debug("No cached splits found for n=%d. Proceeding to split.", n)
    +        # Get k-mers and abundances
    +        hash_to_abund = dict(zip(self.sample_sig.hashes, self.sample_sig.abundances))
    +        random_split_sigs = self.distribute_kmers_random(hash_to_abund, n)
    +        split_sigs = [
    +            SnipeSig.create_from_hashes_abundances(
    +                hashes=np.array(list(kmer_dict.keys()), dtype=np.uint64),
    +                abundances=np.array(list(kmer_dict.values()), dtype=np.uint32),
    +                ksize=self.sample_sig.ksize,
    +                scale=self.sample_sig.scale,
    +                name=f"{self.sample_sig.name}_part_{i+1}",
    +                filename=self.sample_sig.filename,
    +                enable_logging=self.enable_logging
    +            )
    +            for i, kmer_dict in enumerate(random_split_sigs)
    +        ]
    +
    +        # Cache the split signatures
    +        self._split_cache[n] = split_sigs
    +        self.logger.debug("Cached split signatures for n=%d.", n)
    +
    +        return split_sigs
    +
    +    @staticmethod
    +    def distribute_kmers_random(original_dict: Dict[int, int], n: int) -> List[Dict[int, int]]:
    +        r"""
    +        Distribute the k-mers randomly into `n` parts based on their abundances.
     
    -        **Mathematical Explanation**:
    +        This helper method performs the actual distribution of k-mers using a multinomial distribution.
     
    -        For each cumulative part \( i \) (where \( 1 \leq i \leq n \)):
    +        **Mathematical Explanation**:
     
    -        - **Cumulative Sequencing Depth** (\( D_i \)):
    -          $$
    -          D_i = \sum_{j=1}^{i} a_j
    -          $$
    -          Where \( a_j \) is the total abundance of the \( j^{th} \) part.
    -
    -        - **Cumulative Coverage Index** (\( C_i \)):
    -          $$
    -          C_i = \frac{\text{Number of genomic unique k-mers in first } i \text{ parts}}{\left| \text{Reference genome k-mer set} \right|}
    -          $$
    +        Given a k-mer with hash \( h \) and abundance \( a_h \), the distribution of its abundance across \( n \)
    +        parts is modeled as:
    +
    +        $$
    +        a_{h,1}, a_{h,2}, \dots, a_{h,n} \sim \text{Multinomial}(a_h, p_1, p_2, \dots, p_n)
    +        $$
    +
    +        Where \( p_i = \frac{1}{n} \) for all \( i \).
    +
    +        **Parameters**:
     
    -        **Parameters**:
    -
    -        - `n` (`int`): Number of parts to split the signature into.
    +        - `original_dict` (`Dict[int, int]`):  
    +          Dictionary mapping k-mer hashes to their abundances.
    +        - `n` (`int`): Number of parts to split into.
     
             **Returns**:
     
    -        - `List[Dict[str, Any]]`:  
    -          List of dictionaries containing:
    -            - `"cumulative_parts"` (`int`): Number of parts included.
    -            - `"cumulative_total_abundance"` (`int`): Total sequencing depth up to this part.
    -            - `"cumulative_coverage_index"` (`float`): Coverage index up to this part.
    -
    -        **Usage Example**:
    -
    -        ```python
    -        coverage_depth_data = qc.calculate_coverage_vs_depth(n=10)
    -        for data in coverage_depth_data:
    -            print(data)
    -        ```
    -        """
    -        self.logger.debug("Calculating coverage vs depth with %d parts.", n)
    -        # Determine the ROI reference signature
    -        if isinstance(self.amplicon_sig, SnipeSig):
    -            roi_reference_sig = self.amplicon_sig
    -            self.logger.debug("Using amplicon signature as ROI reference.")
    -        else:
    -            roi_reference_sig = self.reference_sig
    -            self.logger.debug("Using reference genome signature as ROI reference.")
    +        - `List[Dict[int, int]]`:  
    +          List of dictionaries, each mapping k-mer hashes to their abundances in that part.
    +
    +        **Usage Example**:
    +
    +        ```python
    +        distributed = ReferenceQC.distribute_kmers_random(hash_to_abund, n=3)
    +        ```
    +        """
    +        # Initialize the resulting dictionaries
    +        distributed_dicts = [{} for _ in range(n)]
    +
    +        # For each k-mer and its abundance
    +        for kmer_hash, abundance in original_dict.items():
    +            if abundance == 0:
    +                continue  # Skip zero abundances
    +            # Generate multinomial split of abundance
    +            counts = np.random.multinomial(abundance, [1.0 / n] * n)
    +            # Update each dictionary
    +            for i in range(n):
    +                if counts[i] > 0:
    +                    distributed_dicts[i][kmer_hash] = counts[i]
     
    -        # Split the sample signature into n random parts
    -        split_sigs = self.split_sig_randomly(n)
    -
    -        coverage_depth_data = []
    -
    -        cumulative_snipe_sig = split_sigs[0].copy()
    -        cumulative_total_abundance = cumulative_snipe_sig.total_abundance
    -
    -        #! force conversion to GENOME
    -        roi_reference_sig.sigtype = SigType.GENOME
    -
    -        # Compute initial coverage index
    -        cumulative_qc = ReferenceQC(
    -            sample_sig=cumulative_snipe_sig,
    -            reference_sig=roi_reference_sig,
    -            enable_logging=self.enable_logging
    -        )
    -        cumulative_stats = cumulative_qc.get_aggregated_stats()
    -        cumulative_coverage_index = cumulative_stats["Genome coverage index"]
    +        return distributed_dicts
    +
    +    def calculate_coverage_vs_depth(self, n: int = 30) -> List[Dict[str, Any]]:
    +        r"""
    +        Calculate cumulative coverage index vs cumulative sequencing depth.
    +
    +        This method simulates incremental sequencing by splitting the sample signature into `n` parts and
    +        calculating the cumulative coverage index at each step. It helps in understanding how coverage
    +        improves with increased sequencing depth.
    +
    +        **Mathematical Explanation**:
    +
    +        For each cumulative part \( i \) (where \( 1 \leq i \leq n \)):
    +
    +        - **Cumulative Sequencing Depth** (\( D_i \)):
    +          $$
    +          D_i = \sum_{j=1}^{i} a_j
    +          $$
    +          Where \( a_j \) is the total abundance of the \( j^{th} \) part.
     
    -        coverage_depth_data.append({
    -            "cumulative_parts": 1,
    -            "cumulative_total_abundance": cumulative_total_abundance,
    -            "cumulative_coverage_index": cumulative_coverage_index,
    -        })
    -
    -        # Iterate over the rest of the parts
    -        for i in range(1, n):
    -            current_part = split_sigs[i]
    -
    -            # Add current part to cumulative signature
    -            cumulative_snipe_sig += current_part
    -            cumulative_total_abundance += current_part.total_abundance
    -
    -            # Compute new coverage index
    -            cumulative_qc = ReferenceQC(
    -                sample_sig=cumulative_snipe_sig,
    -                reference_sig=roi_reference_sig,
    -                enable_logging=self.enable_logging
    -            )
    -            cumulative_stats = cumulative_qc.get_aggregated_stats()
    -            cumulative_coverage_index = cumulative_stats["Genome coverage index"]
    -
    -            coverage_depth_data.append({
    -                "cumulative_parts": i + 1,
    -                "cumulative_total_abundance": cumulative_total_abundance,
    -                "cumulative_coverage_index": cumulative_coverage_index,
    -            })
    -
    -        self.logger.debug("Coverage vs depth calculation completed.")
    -        return coverage_depth_data
    -
    -    def predict_coverage(self, extra_fold: float, n: int = 30) -> float:
    -        r"""
    -        Predict the coverage index if additional sequencing is performed.
    -
    -        This method estimates the potential increase in the genome coverage index when the sequencing depth
    -        is increased by a specified fold (extra sequencing). It does so by:
    +        - **Cumulative Coverage Index** (\( C_i \)):
    +          $$
    +          C_i = \frac{\text{Number of genomic unique k-mers in first } i \text{ parts}}{\left| \text{Reference genome k-mer set} \right|}
    +          $$
    +
    +        **Parameters**:
    +
    +        - `n` (`int`): Number of parts to split the signature into.
    +
    +        **Returns**:
    +
    +        - `List[Dict[str, Any]]`:  
    +          List of dictionaries containing:
    +            - `"cumulative_parts"` (`int`): Number of parts included.
    +            - `"cumulative_total_abundance"` (`int`): Total sequencing depth up to this part.
    +            - `"cumulative_coverage_index"` (`float`): Coverage index up to this part.
    +
    +        **Usage Example**:
    +
    +        ```python
    +        coverage_depth_data = qc.calculate_coverage_vs_depth(n=10)
    +        for data in coverage_depth_data:
    +            print(data)
    +        ```
    +        """
    +        self.logger.debug("Calculating coverage vs depth with %d parts.", n)
    +        # Determine the ROI reference signature
    +        if isinstance(self.amplicon_sig, SnipeSig):
    +            roi_reference_sig = self.amplicon_sig
    +            self.logger.debug("Using amplicon signature as ROI reference.")
    +        else:
    +            roi_reference_sig = self.reference_sig
    +            self.logger.debug("Using reference genome signature as ROI reference.")
    +
    +        # Split the sample signature into n random parts (cached if available)
    +        split_sigs = self.split_sig_randomly(n)
    +
    +        coverage_depth_data = []
     
    -        1. **Cumulative Coverage Calculation**:
    -        - Splitting the sample signature into `n` random parts to simulate incremental sequencing data.
    -        - Calculating the cumulative coverage index and cumulative sequencing depth at each incremental step.
    +        if not split_sigs:
    +            self.logger.error("No split signatures available. Cannot calculate coverage vs depth.")
    +            return coverage_depth_data
     
    -        2. **Saturation Curve Fitting**:
    -        - Modeling the relationship between cumulative coverage and cumulative sequencing depth using
    -            a hyperbolic saturation function.
    -        - The saturation model reflects how coverage approaches a maximum limit as sequencing depth increases.
    -
    -        3. **Coverage Prediction**:
    -        - Using the fitted model to predict the coverage index at an increased sequencing depth (current depth
    -            multiplied by `1 + extra_fold`).
    -
    -        **Mathematical Explanation**:
    -
    -        - **Saturation Model**:
    -        The coverage index \( C \) as a function of sequencing depth \( D \) is modeled using the function:
    -
    -        $$
    -        C(D) = \frac{a \cdot D}{b + D}
    -        $$
    -
    -        Where:
    -        - \( a \) and \( b \) are parameters estimated from the data.
    -        - \( D \) is the cumulative sequencing depth (total abundance).
    -        - \( C(D) \) is the cumulative coverage index at depth \( D \).
    +        cumulative_snipe_sig = split_sigs[0].copy()
    +        cumulative_total_abundance = cumulative_snipe_sig.total_abundance
    +
    +        # Force conversion to GENOME
    +        roi_reference_sig.sigtype = SigType.GENOME
    +
    +        # Compute initial coverage index
    +        cumulative_qc = ReferenceQC(
    +            sample_sig=cumulative_snipe_sig,
    +            reference_sig=roi_reference_sig,
    +            enable_logging=self.enable_logging
    +        )
    +        cumulative_stats = cumulative_qc.get_aggregated_stats()
    +        cumulative_coverage_index = cumulative_stats.get("Genome coverage index", 0.0)
    +
    +        coverage_depth_data.append({
    +            "cumulative_parts": 1,
    +            "cumulative_total_abundance": cumulative_total_abundance,
    +            "cumulative_coverage_index": cumulative_coverage_index,
    +        })
    +
    +        self.logger.debug("Added initial coverage depth data for part 1.")
     
    -        - **Parameter Estimation**:
    -        The parameters \( a \) and \( b \) are determined by fitting the model to the observed cumulative
    -        coverage and depth data using non-linear least squares optimization.
    +        # Iterate over the rest of the parts
    +        for i in range(1, n):
    +            current_part = split_sigs[i]
     
    -        - **Coverage Prediction**:
    -        The predicted coverage index \( C_{\text{pred}} \) at an increased sequencing depth \( D_{\text{pred}} \)
    -        is calculated as:
    +            # Add current part to cumulative signature
    +            cumulative_snipe_sig += current_part
    +            cumulative_total_abundance += current_part.total_abundance
     
    -        $$
    -        D_{\text{pred}} = D_{\text{current}} \times (1 + \text{extra\_fold})
    -        $$
    -
    -        $$
    -        C_{\text{pred}} = \frac{a \cdot D_{\text{pred}}}{b + D_{\text{pred}}}
    -        $$
    -
    -        **Parameters**:
    -
    -        - `extra_fold` (*float*):  
    -          The fold increase in sequencing depth to simulate. For example, extra_fold = 1.0 represents doubling
    -          the current sequencing depth.
    -
    -        - `n` (*int, optional*):  
    -          The number of parts to split the sample signature into for modeling the saturation curve.
    -          Default is 30.
    -
    -        **Returns**:
    -            - `float`:  
    -              The predicted genome coverage index at the increased sequencing depth.
    -
    -        **Raises**:
    -            - `RuntimeError`:  
    -              If the saturation model fails to converge during curve fitting.
    -
    -        **Usage Example**:
    -
    -        ```python
    -        # Create a ReferenceQC instance with sample and reference signatures
    -        qc = ReferenceQC(sample_sig=sample_sig, reference_sig=reference_sig)
    -
    -        # Predict coverage index after increasing sequencing depth by 50%
    -        predicted_coverage = qc.predict_coverage(extra_fold=0.5)
    -
    -        print(f"Predicted coverage index at 1.5x sequencing depth: {predicted_coverage:.4f}")
    -        ```
    -
    -        **Implementation Details**:
    +            # Compute new coverage index
    +            cumulative_qc = ReferenceQC(
    +                sample_sig=cumulative_snipe_sig,
    +                reference_sig=roi_reference_sig,
    +                enable_logging=self.enable_logging
    +            )
    +            cumulative_stats = cumulative_qc.get_aggregated_stats()
    +            cumulative_coverage_index = cumulative_stats.get("Genome coverage index", 0.0)
    +
    +            coverage_depth_data.append({
    +                "cumulative_parts": i + 1,
    +                "cumulative_total_abundance": cumulative_total_abundance,
    +                "cumulative_coverage_index": cumulative_coverage_index,
    +            })
    +
    +            self.logger.debug("Added coverage depth data for part %d.", i + 1)
    +
    +        self.logger.debug("Coverage vs depth calculation completed.")
    +        return coverage_depth_data
    +
    +    def predict_coverage(self, extra_fold: float, n: int = 30) -> float:
    +        r"""
    +        Predict the coverage index if additional sequencing is performed.
    +
    +        This method estimates the potential increase in the genome coverage index when the sequencing depth
    +        is increased by a specified fold (extra sequencing). It does so by:
    +
    +        1. **Cumulative Coverage Calculation**:
    +        - Splitting the sample signature into `n` random parts to simulate incremental sequencing data.
    +        - Calculating the cumulative coverage index and cumulative sequencing depth at each incremental step.
    +
    +        2. **Saturation Curve Fitting**:
    +        - Modeling the relationship between cumulative coverage and cumulative sequencing depth using
    +            a hyperbolic saturation function.
    +        - The saturation model reflects how coverage approaches a maximum limit as sequencing depth increases.
    +
    +        3. **Coverage Prediction**:
    +        - Using the fitted model to predict the coverage index at an increased sequencing depth (current depth
    +            multiplied by `1 + extra_fold`).
     
    -        - **Splitting the Sample Signature**:
    -            - The sample signature is split into `n` random parts using a multinomial distribution based on k-mer abundances.
    -            - Each part represents an incremental addition of sequencing data.
    -
    -        - **Cumulative Calculations**:
    -            - At each incremental step, the cumulative signature is updated, and the cumulative coverage index and sequencing depth are calculated.
    -
    -        - **Curve Fitting**:
    -            - The `scipy.optimize.curve_fit` function is used to fit the saturation model to the cumulative data.
    -            - Initial parameter guesses are based on the observed data to aid convergence.
    -        """
    -        if extra_fold < 1:
    -            raise ValueError("extra_fold must be >= 1.0.")
    +        **Mathematical Explanation**:
    +
    +        - **Saturation Model**:
    +        The coverage index \( C \) as a function of sequencing depth \( D \) is modeled using the function:
    +
    +        $$
    +        C(D) = \frac{a \cdot D}{b + D}
    +        $$
    +
    +        Where:
    +        - \( a \) and \( b \) are parameters estimated from the data.
    +        - \( D \) is the cumulative sequencing depth (total abundance).
    +        - \( C(D) \) is the cumulative coverage index at depth \( D \).
     
    -        if n < 1 or not isinstance(n, int):
    -            raise ValueError("n must be a positive integer.")
    -
    -        self.logger.debug("Predicting coverage with extra fold: %f", extra_fold)
    -        coverage_depth_data = self.calculate_coverage_vs_depth(n=n)
    -
    -        # Extract cumulative total abundance and coverage index
    -        x_data = np.array([d["cumulative_total_abundance"] for d in coverage_depth_data])
    -        y_data = np.array([d["cumulative_coverage_index"] for d in coverage_depth_data])
    -
    -        # Saturation model function
    -        def saturation_model(x, a, b):
    -            return a * x / (b + x)
    -
    -        # Initial parameter guesses
    -        initial_guess = [y_data[-1], x_data[int(len(x_data) / 2)]]
    -
    -        # Fit the model to the data
    -        try:
    -            with warnings.catch_warnings():
    -                warnings.simplefilter("error", OptimizeWarning)
    -                params, covariance = curve_fit(
    -                    saturation_model,
    -                    x_data,
    -                    y_data,
    -                    p0=initial_guess,
    -                    bounds=(0, np.inf),
    -                    maxfev=10000
    -                )
    -        except (RuntimeError, OptimizeWarning) as exc:
    -            self.logger.error("Curve fitting failed.")
    -            raise RuntimeError("Saturation model fitting failed. Cannot predict coverage.") from exc
    -
    -        # Check if covariance contains inf or nan
    -        if np.isinf(covariance).any() or np.isnan(covariance).any():
    -            self.logger.error("Covariance of parameters could not be estimated.")
    -            raise RuntimeError("Saturation model fitting failed. Cannot predict coverage.")
    -
    -        a, b = params
    +        - **Parameter Estimation**:
    +        The parameters \( a \) and \( b \) are determined by fitting the model to the observed cumulative
    +        coverage and depth data using non-linear least squares optimization.
    +
    +        - **Coverage Prediction**:
    +        The predicted coverage index \( C_{\text{pred}} \) at an increased sequencing depth \( D_{\text{pred}} \)
    +        is calculated as:
    +
    +        $$
    +        D_{\text{pred}} = D_{\text{current}} \times (1 + \text{extra\_fold})
    +        $$
    +
    +        $$
    +        C_{\text{pred}} = \frac{a \cdot D_{\text{pred}}}{b + D_{\text{pred}}}
    +        $$
    +
    +        **Parameters**:
    +
    +        - `extra_fold` (*float*):  
    +          The fold increase in sequencing depth to simulate. For example, extra_fold = 1.0 represents doubling
    +          the current sequencing depth.
    +
    +        - `n` (*int, optional*):  
    +          The number of parts to split the sample signature into for modeling the saturation curve.
    +          Default is 30.
    +
    +        **Returns**:
    +            - `float`:  
    +              The predicted genome coverage index at the increased sequencing depth.
    +
    +        **Raises**:
    +            - `RuntimeError`:  
    +              If the saturation model fails to converge during curve fitting.
    +
    +        **Usage Example**:
    +
    +        ```python
    +        # Create a ReferenceQC instance with sample and reference signatures
    +        qc = ReferenceQC(sample_sig=sample_sig, reference_sig=reference_sig)
     
    -        # Predict coverage at increased sequencing depth
    -        total_abundance = x_data[-1]
    -        predicted_total_abundance = total_abundance * (1 + extra_fold)
    -        predicted_coverage = saturation_model(predicted_total_abundance, a, b)
    -
    -        # Ensure the predicted coverage does not exceed maximum possible coverage
    -        max_coverage = 1.0  # Coverage index cannot exceed 1
    -        predicted_coverage = min(predicted_coverage, max_coverage)
    -
    -        self.logger.debug("Predicted coverage at %.2f-fold increase: %f", extra_fold, predicted_coverage)
    -        return predicted_coverage
    +        # Predict coverage index after increasing sequencing depth by 50%
    +        predicted_coverage = qc.predict_coverage(extra_fold=0.5)
    +
    +        print(f"Predicted coverage index at 1.5x sequencing depth: {predicted_coverage:.4f}")
    +        ```
    +
    +        **Implementation Details**:
    +
    +        - **Splitting the Sample Signature**:
    +            - The sample signature is split into `n` random parts using a multinomial distribution based on k-mer abundances.
    +            - Each part represents an incremental addition of sequencing data.
     
    -    def calculate_chromosome_metrics(self, chr_to_sig: Dict[str, SnipeSig]) -> Dict[str, Any]:
    -        r"""
    -        Calculate the coefficient of variation (CV) of mean abundances across autosomal chromosomes.
    -
    -        This method computes the CV to assess the variability of mean abundances among autosomal chromosomes,
    -        excluding any sex chromosomes.
    -
    -        **Mathematical Explanation**:
    -
    -        The Coefficient of Variation (CV) is defined as:
    -
    -        $$
    -        \text{CV} = \frac{\sigma}{\mu}
    -        $$
    -
    -        Where:
    -        - \( \sigma \) is the standard deviation of the mean abundances across autosomal chromosomes.
    -        - \( \mu \) is the mean of the mean abundances across autosomal chromosomes.
    -
    -        **Parameters**:
    -
    -        - `chr_to_sig` (`Dict[str, SnipeSig]`):  
    -          A dictionary mapping chromosome names (e.g., `'autosomal-1'`, `'autosomal-2'`, `'sex-x'`, `'sex-y'`) to their corresponding
    -          `SnipeSig` instances. Each `SnipeSig` should represent the k-mer signature of a specific chromosome.
    -
    -        **Returns**:
    +        - **Cumulative Calculations**:
    +            - At each incremental step, the cumulative signature is updated, and the cumulative coverage index and sequencing depth are calculated.
    +
    +        - **Curve Fitting**:
    +            - The `scipy.optimize.curve_fit` function is used to fit the saturation model to the cumulative data.
    +            - Initial parameter guesses are based on the observed data to aid convergence.
    +        """
    +        if extra_fold < 1:
    +            raise ValueError("extra_fold must be >= 1.0.")
    +
    +        if n < 1 or not isinstance(n, int):
    +            raise ValueError("n must be a positive integer.")
    +
    +        self.logger.debug("Predicting coverage with extra fold: %f", extra_fold)
    +        coverage_depth_data = self.calculate_coverage_vs_depth(n=n)
    +
    +        # Extract cumulative total abundance and coverage index
    +        x_data = np.array([d["cumulative_total_abundance"] for d in coverage_depth_data])
    +        y_data = np.array([d["cumulative_coverage_index"] for d in coverage_depth_data])
    +
    +        # Saturation model function
    +        def saturation_model(x, a, b):
    +            return a * x / (b + x)
    +
    +        # Initial parameter guesses
    +        initial_guess = [y_data[-1], x_data[int(len(x_data) / 2)]]
     
    -        - `Dict[str, Any]`:  
    -          A dictionary containing the computed metrics:
    -              - `"Autosomal_CV"` (`float`):  
    -                The coefficient of variation of mean abundances across autosomal chromosomes.
    -
    -        **Raises**:
    -
    -        - `ValueError`:  
    -          If `chr_to_sig` is empty or if there is an inconsistency in the signatures' parameters.
    -
    -        **Usage Example**:
    -
    -        ```python
    -        # Assume `chr_signatures` is a dictionary of chromosome-specific SnipeSig instances
    -        chr_signatures = {
    -            "1": sig_chr1,
    -            "2": sig_chr2,
    -            "X": sig_chrX,
    -            "Y": sig_chrY
    -        }
    +        # Fit the model to the data
    +        try:
    +            with warnings.catch_warnings():
    +                warnings.simplefilter("error", OptimizeWarning)
    +                params, covariance = curve_fit(
    +                    saturation_model,
    +                    x_data,
    +                    y_data,
    +                    p0=initial_guess,
    +                    bounds=(0, np.inf),
    +                    maxfev=10000
    +                )
    +        except (RuntimeError, OptimizeWarning) as exc:
    +            self.logger.error("Curve fitting failed.")
    +            raise RuntimeError("Saturation model fitting failed. Cannot predict coverage.") from exc
    +
    +        # Check if covariance contains inf or nan
    +        if np.isinf(covariance).any() or np.isnan(covariance).any():
    +            self.logger.error("Covariance of parameters could not be estimated.")
    +            raise RuntimeError("Saturation model fitting failed. Cannot predict coverage.")
     
    -        # Calculate chromosome metrics
    -        metrics = qc.calculate_chromosome_metrics(chr_to_sig=chr_signatures)
    -
    -        print(metrics)
    -        # Output:
    -        # {'Autosomal_CV': 0.15}
    -        ```
    -
    -        **Notes**:
    -
    -        - **Exclusion of Sex Chromosomes**:  
    -          Chromosomes with names containing the substring `"sex"` (e.g., `'sex-y'`, `'sex-x'`) are excluded from the CV calculation to focus solely on autosomal chromosomes.
    -
    -        - **Mean Abundance Calculation**:  
    -          The mean abundance for each chromosome is calculated by intersecting the sample signature with the chromosome-specific signature and averaging the abundances of the shared k-mers.
    -        """
    -
    -        # Implementation of the method
    -        # let's make sure all chromosome sigs are unique
    -        specific_chr_to_sig = SnipeSig.get_unique_signatures(chr_to_sig)
    +        a, b = params
    +
    +        # Predict coverage at increased sequencing depth
    +        total_abundance = x_data[-1]
    +        predicted_total_abundance = total_abundance * (1 + extra_fold)
    +        predicted_coverage = saturation_model(predicted_total_abundance, a, b)
    +
    +        # Ensure the predicted coverage does not exceed maximum possible coverage
    +        max_coverage = 1.0  # Coverage index cannot exceed 1
    +        predicted_coverage = min(predicted_coverage, max_coverage)
    +
    +        self.logger.debug("Predicted coverage at %.2f-fold increase: %f", extra_fold, predicted_coverage)
    +        return predicted_coverage
    +
    +    def calculate_chromosome_metrics(self, chr_to_sig: Dict[str, SnipeSig]) -> Dict[str, Any]:
    +        r"""
    +        Calculate the coefficient of variation (CV) of mean abundances across autosomal chromosomes.
    +
    +        This method computes the CV to assess the variability of mean abundances among autosomal chromosomes,
    +        excluding any sex chromosomes.
     
    -        # calculate mean abundance for each chromosome and loaded sample sig
    -        chr_to_mean_abundance = {}
    -        self.logger.debug("Calculating mean abundance for each chromosome.")
    -        for chr_name, chr_sig in specific_chr_to_sig.items():
    -            chr_sample_sig = self.sample_sig & chr_sig
    -            chr_stats = chr_sample_sig.get_sample_stats
    -            chr_to_mean_abundance[chr_name] = chr_stats["mean_abundance"]
    -            self.logger.debug("\t-Mean abundance for %s: %f", chr_name, chr_stats["mean_abundance"])
    -
    -
    -        # chr_to_mean_abundance but without any chr with partial name sex
    -        autosomal_chr_to_mean_abundance = {}
    -        for chr_name, mean_abundance in chr_to_mean_abundance.items():
    -            if "sex" in chr_name.lower():
    -                continue
    -            autosomal_chr_to_mean_abundance[chr_name] = mean_abundance
    -
    +        **Mathematical Explanation**:
    +
    +        The Coefficient of Variation (CV) is defined as:
    +
    +        $$
    +        \text{CV} = \frac{\sigma}{\mu}
    +        $$
    +
    +        Where:
    +        - \( \sigma \) is the standard deviation of the mean abundances across autosomal chromosomes.
    +        - \( \mu \) is the mean of the mean abundances across autosomal chromosomes.
    +
    +        **Parameters**:
    +
    +        - `chr_to_sig` (`Dict[str, SnipeSig]`):  
    +          A dictionary mapping chromosome names (e.g., `'autosomal-1'`, `'autosomal-2'`, `'sex-x'`, `'sex-y'`) to their corresponding
    +          `SnipeSig` instances. Each `SnipeSig` should represent the k-mer signature of a specific chromosome.
     
    -        # calculate the CV for the whole sample
    -        if autosomal_chr_to_mean_abundance:
    -            mean_abundances = np.array(list(autosomal_chr_to_mean_abundance.values()), dtype=float)
    -            cv = np.std(mean_abundances) / np.mean(mean_abundances) if np.mean(mean_abundances) != 0 else 0.0
    -            self.chrs_stats.update({"Autosomal_CV": cv})
    -            self.logger.debug("Calculated Autosomal CV: %f", cv)
    -        else:
    -            self.logger.warning("No autosomal chromosomes were processed. 'Autosomal_CV' set to None.")
    -            self.chrs_stats.update({"Autosomal_CV": None})
    -
    -        # optional return, not required
    -        return self.chrs_stats
    -
    +        **Returns**:
    +
    +        - `Dict[str, Any]`:  
    +          A dictionary containing the computed metrics:
    +              - `"Autosomal_CV"` (`float`):  
    +                The coefficient of variation of mean abundances across autosomal chromosomes.
    +
    +        **Raises**:
    +
    +        - `ValueError`:  
    +          If `chr_to_sig` is empty or if there is an inconsistency in the signatures' parameters.
    +
    +        **Usage Example**:
     
    -    def calculate_sex_chrs_metrics(self, genome_and_chr_to_sig: Dict[str, SnipeSig]) -> Dict[str, Any]:
    -        r"""
    -        Calculate sex chromosome-related metrics based on genome and chromosome-specific signatures.
    -
    -        This method processes a collection of genome and chromosome-specific `SnipeSig` instances to compute
    -        metrics such as the X-Ploidy score and Y-Coverage. It ensures that each chromosome signature contains
    -        only unique hashes that do not overlap with hashes from other chromosomes or the autosomal genome.
    -        The method excludes sex chromosomes (e.g., Y chromosome) from the autosomal genome signature to
    -        accurately assess sex chromosome metrics.
    -
    -        **Mathematical Explanation**:
    +        ```python
    +        # Assume `chr_signatures` is a dictionary of chromosome-specific SnipeSig instances
    +        chr_signatures = {
    +            "1": sig_chr1,
    +            "2": sig_chr2,
    +            "X": sig_chrX,
    +            "Y": sig_chrY
    +        }
    +
    +        # Calculate chromosome metrics
    +        metrics = qc.calculate_chromosome_metrics(chr_to_sig=chr_signatures)
     
    -        - **X-Ploidy Score**:
    -
    -          The X-Ploidy score is calculated using the formula:
    -
    -          $$
    -          \text{X-Ploidy} = \left(\frac{\mu_X}{\mu_{\text{autosomal}}}\right) \times \left(\frac{N_{\text{autosomal}}}{N_X}\right)
    -          $$
    -
    -          Where:
    -          - \( \mu_X \) is the mean abundance of X chromosome-specific k-mers in the sample.
    -          - \( \mu_{\text{autosomal}} \) is the mean abundance of autosomal k-mers in the sample.
    -          - \( N_{\text{autosomal}} \) is the number of autosomal k-mers in the reference genome.
    -          - \( N_X \) is the number of X chromosome-specific k-mers in the reference genome.
    +        print(metrics)
    +        # Output:
    +        # {'Autosomal_CV': 0.15}
    +        ```
    +
    +        **Notes**:
    +
    +        - **Exclusion of Sex Chromosomes**:  
    +          Chromosomes with names containing the substring `"sex"` (e.g., `'sex-y'`, `'sex-x'`) are excluded from the CV calculation to focus solely on autosomal chromosomes.
    +
    +        - **Mean Abundance Calculation**:  
    +          The mean abundance for each chromosome is calculated by intersecting the sample signature with the chromosome-specific signature and averaging the abundances of the shared k-mers.
    +        """
     
    -        - **Y-Coverage**:
    -
    -          The Y-Coverage is calculated using the formula:
    -
    -          $$
    -          \text{Y-Coverage} = \frac{\left(\frac{N_Y^{\text{sample}}}{N_Y}\right)}{\left(\frac{N_{\text{autosomal}}^{\text{sample}}}{N_{\text{autosomal}}}\right)}
    -          $$
    -
    -          Where:
    -          - \( N_Y^{\text{sample}} \) is the number of Y chromosome-specific k-mers in the sample.
    -          - \( N_Y \) is the number of Y chromosome-specific k-mers in the reference genome.
    -          - \( N_{\text{autosomal}}^{\text{sample}} \) is the number of autosomal k-mers in the sample.
    -          - \( N_{\text{autosomal}} \) is the number of autosomal k-mers in the reference genome.
    -
    -        **Parameters**:
    -
    -            - `genome_and_chr_to_sig` (`Dict[str, SnipeSig]`):  
    -              A dictionary mapping signature names to their corresponding `SnipeSig` instances. This should include
    -              the autosomal genome signature (with a name ending in `'-snipegenome'`) and chromosome-specific
    -              signatures (e.g., `'sex-x'`, `'sex-y'`, `'autosome-1'`, `'autosome-2'`, etc.).
    -
    -        **Returns**:
    -
    -            - `Dict[str, Any]`:  
    -              A dictionary containing the calculated sex-related metrics:
    -                  - `"X-Ploidy score"` (`float`):  
    -                    The ploidy score of the X chromosome, reflecting the ratio of X chromosome k-mer abundance
    -                    to autosomal k-mer abundance, adjusted by genome and X chromosome sizes.
    -                  - `"Y-Coverage"` (`float`, optional):  
    -                    The coverage of Y chromosome-specific k-mers in the sample relative to autosomal coverage.
    -                    This key is present only if a Y chromosome signature is provided.
    -
    -        **Raises**:
    -
    -            - `ValueError`:  
    -              - If the `'sex-x'` chromosome signature is not found in `genome_and_chr_to_sig`.
    -              - If the autosomal genome signature is not found or improperly labeled.
    -
    -        **Usage Example**:
    +        # Implementation of the method
    +        # let's make sure all chromosome sigs are unique
    +        self.logger.debug("Computing specific chromosome hashes for %s.", ','.join(chr_to_sig.keys()))
    +        self.logger.debug(f"\t-All hashes for chromosomes before getting unique sigs {len(SnipeSig.sum_signatures(list(chr_to_sig.values())))}")
    +        specific_chr_to_sig = SnipeSig.get_unique_signatures(chr_to_sig)
    +        self.logger.debug(f"\t-All hashes for chromosomes after getting unique sigs {len(SnipeSig.sum_signatures(list(specific_chr_to_sig.values())))}")
    +
    +        # calculate mean abundance for each chromosome and loaded sample sig
    +        chr_to_mean_abundance = {}
    +        self.logger.debug("Calculating mean abundance for each chromosome.")
    +        for chr_name, chr_sig in specific_chr_to_sig.items():
    +            self.logger.debug("Intersecting %s (%d) with %s (%d)", self.sample_sig.name, len(self.sample_sig), chr_name, len(chr_sig))
    +            chr_sample_sig = self.sample_sig & chr_sig
    +            chr_stats = chr_sample_sig.get_sample_stats
    +            chr_to_mean_abundance[chr_name] = chr_stats["mean_abundance"]
    +            self.logger.debug("\t-Mean abundance for %s: %f", chr_name, chr_stats["mean_abundance"])
    +
    +        self.chrs_stats.update(chr_to_mean_abundance)
    +
    +        # chr_to_mean_abundance but without any chr with partial name sex
    +        autosomal_chr_to_mean_abundance = {}
    +        for chr_name, mean_abundance in chr_to_mean_abundance.items():
    +            if "sex" in chr_name.lower():
    +                continue
    +            autosomal_chr_to_mean_abundance[chr_name] = mean_abundance
    +
    +
    +        # calculate the CV for the whole sample
    +        if autosomal_chr_to_mean_abundance:
    +            mean_abundances = np.array(list(autosomal_chr_to_mean_abundance.values()), dtype=float)
    +            cv = np.std(mean_abundances) / np.mean(mean_abundances) if np.mean(mean_abundances) != 0 else 0.0
    +            self.chrs_stats.update({"Autosomal_CV": cv})
    +            self.logger.debug("Calculated Autosomal CV: %f", cv)
    +        else:
    +            self.logger.warning("No autosomal chromosomes were processed. 'Autosomal_CV' set to None.")
    +            self.chrs_stats.update({"Autosomal_CV": None})
    +
    +        # optional return, not required
    +        return self.chrs_stats
     
    -        ```python
    -        # Assume `genome_and_chr_signatures` is a dictionary of genome and chromosome-specific SnipeSig instances
    -        genome_and_chr_signatures = {
    -            "autosomal-snipegenome": sig_autosomal_genome,
    -            "1": sig_chr1,
    -            "2": sig_chr2,
    -            "sex-x": sig_sex_x,
    -            "sex-y": sig_sex_y
    -        }
    -
    -        # Calculate sex chromosome metrics
    -        metrics = qc.calculate_sex_chrs_metrics(genome_and_chr_to_sig=genome_and_chr_signatures)
    +
    +    def calculate_sex_chrs_metrics(self, genome_and_chr_to_sig: Dict[str, SnipeSig]) -> Dict[str, Any]:
    +        r"""
    +        Calculate sex chromosome-related metrics based on genome and chromosome-specific signatures.
    +
    +        This method processes a collection of genome and chromosome-specific `SnipeSig` instances to compute
    +        metrics such as the X-Ploidy score and Y-Coverage. It ensures that each chromosome signature contains
    +        only unique hashes that do not overlap with hashes from other chromosomes or the autosomal genome.
    +        The method excludes sex chromosomes (e.g., Y chromosome) from the autosomal genome signature to
    +        accurately assess sex chromosome metrics.
    +
    +        **Mathematical Explanation**:
     
    -        print(metrics)
    -        # Output Example:
    -        # {
    -        #     "X-Ploidy score": 2.6667,
    -        #     "Y-Coverage": 0.0
    -        # }
    -        ```
    +        - **X-Ploidy Score**:
    +
    +          The X-Ploidy score is calculated using the formula:
    +
    +          $$
    +          \text{X-Ploidy} = \left(\frac{\mu_X}{\mu_{\text{autosomal}}}\right) \times \left(\frac{N_{\text{autosomal}}}{N_X}\right)
    +          $$
     
    -        **Notes**:
    -
    -            - **Signature Naming Convention**:  
    -              The autosomal genome signature must have a name ending with `'-snipegenome'`. Chromosome-specific
    -              signatures should be named accordingly (e.g., `'sex-x'`, `'sex-y'`, `'autosomal-1'`, `'autosomal-2'`, etc.).
    +          Where:
    +          - \( \mu_X \) is the mean abundance of X chromosome-specific k-mers in the sample.
    +          - \( \mu_{\text{autosomal}} \) is the mean abundance of autosomal k-mers in the sample.
    +          - \( N_{\text{autosomal}} \) is the number of autosomal k-mers in the reference genome.
    +          - \( N_X \) is the number of X chromosome-specific k-mers in the reference genome.
     
    -            - **Exclusion of Sex Chromosomes from Autosomal Genome**:  
    -              The Y chromosome signature (`'sex-y'`) is subtracted from the autosomal genome signature to ensure
    -              that Y chromosome k-mers are not counted towards autosomal metrics.
    +        - **Y-Coverage**:
    +
    +          The Y-Coverage is calculated using the formula:
     
    -            - **Robustness**:  
    -              The method includes comprehensive logging for debugging purposes, tracking each major step and
    -              any exclusions made during processing.
    -        """
    -
    -        # Ensure that the chromosome X signature exists
    -        if 'sex-x' not in genome_and_chr_to_sig:
    -            self.logger.warning("Chromosome X ('sex-x') not found in the provided signatures. X-Ploidy score will be set to zero.")
    -            # set sex-x to an empty signature
    -            genome_and_chr_to_sig['sex-x'] = SnipeSig.create_from_hashes_abundances(
    -                hashes=np.array([], dtype=np.uint64),
    -                abundances=np.array([], dtype=np.uint32),
    -                ksize=genome_and_chr_to_sig[list(genome_and_chr_to_sig.keys())[0]].ksize,
    -                scale=genome_and_chr_to_sig[list(genome_and_chr_to_sig.keys())[0]].scale,
    -            )
    -
    -        # Separate the autosomal genome signature from chromosome-specific signatures
    -        chr_to_sig: Dict[str, SnipeSig] = {}
    -        autosomals_genome_sig: Optional[SnipeSig] = None
    -        self.logger.debug("Separating autosomal genome signature from chromosome-specific signatures.")
    -
    -        for name, sig in genome_and_chr_to_sig.items():
    -            if name.endswith('-snipegenome'):
    -                self.logger.debug("\t- Identified autosomal genome signature: '%s'.", name)
    -                autosomals_genome_sig = sig
    -            else:
    -                chr_to_sig[name] = sig
    +          $$
    +          \text{Y-Coverage} = \frac{\left(\frac{N_Y^{\text{sample}}}{N_Y}\right)}{\left(\frac{N_{\text{autosomal}}^{\text{sample}}}{N_{\text{autosomal}}}\right)}
    +          $$
    +
    +          Where:
    +          - \( N_Y^{\text{sample}} \) is the number of Y chromosome-specific k-mers in the sample.
    +          - \( N_Y \) is the number of Y chromosome-specific k-mers in the reference genome.
    +          - \( N_{\text{autosomal}}^{\text{sample}} \) is the number of autosomal k-mers in the sample.
    +          - \( N_{\text{autosomal}} \) is the number of autosomal k-mers in the reference genome.
    +
    +        **Parameters**:
    +
    +            - `genome_and_chr_to_sig` (`Dict[str, SnipeSig]`):  
    +              A dictionary mapping signature names to their corresponding `SnipeSig` instances. This should include
    +              the autosomal genome signature (with a name ending in `'-snipegenome'`) and chromosome-specific
    +              signatures (e.g., `'sex-x'`, `'sex-y'`, `'autosome-1'`, `'autosome-2'`, etc.).
    +
    +        **Returns**:
    +
    +            - `Dict[str, Any]`:  
    +              A dictionary containing the calculated sex-related metrics:
    +                  - `"X-Ploidy score"` (`float`):  
    +                    The ploidy score of the X chromosome, reflecting the ratio of X chromosome k-mer abundance
    +                    to autosomal k-mer abundance, adjusted by genome and X chromosome sizes.
    +                  - `"Y-Coverage"` (`float`, optional):  
    +                    The coverage of Y chromosome-specific k-mers in the sample relative to autosomal coverage.
    +                    This key is present only if a Y chromosome signature is provided.
     
    -        if autosomals_genome_sig is None:
    -            self.logger.error("Autosomal genome signature (ending with '-snipegenome') not found.")
    -            raise ValueError("Autosomal genome signature (ending with '-snipegenome') not found.")
    -
    -        # Ensure all chromosome signatures have unique hashes
    -        specific_chr_to_sig = SnipeSig.get_unique_signatures(chr_to_sig)
    -
    -        # Exclude Y chromosome from the autosomal genome signature if present
    -        if 'sex-y' in chr_to_sig:
    -            self.logger.debug("Y chromosome ('sex-y') detected. Removing its hashes from the autosomal genome signature.")
    -            self.logger.debug("\t- Original autosomal genome size: %d hashes.", len(autosomals_genome_sig))
    -            autosomals_genome_sig = autosomals_genome_sig - chr_to_sig['sex-y']
    -            self.logger.debug("\t- Updated autosomal genome size after removing Y chromosome: %d hashes.", len(autosomals_genome_sig))
    -
    -        # Remove X chromosome hashes from the autosomal genome signature
    -        self.logger.debug("Removing X chromosome ('sex-x') hashes from the autosomal genome signature.")
    -        autosomals_genome_sig = autosomals_genome_sig - chr_to_sig['sex-x']
    -        self.logger.debug("\t- Updated autosomal genome size after removing X chromosome: %d hashes.", len(autosomals_genome_sig))
    -
    -        # Derive the X chromosome-specific signature by subtracting autosomal genome hashes
    -        specific_xchr_sig = specific_chr_to_sig["sex-x"] - autosomals_genome_sig
    -        self.logger.debug("\t-Derived X chromosome-specific signature size: %d hashes.", len(specific_xchr_sig))
    -
    -        # Intersect the sample signature with chromosome-specific signatures
    -        sample_specific_xchr_sig = self.sample_sig & specific_xchr_sig
    -        if len(sample_specific_xchr_sig) == 0:
    -            self.logger.warning("No X chromosome-specific k-mers found in the sample signature.")
    -        self.logger.debug("\t-Intersected sample signature with X chromosome-specific k-mers = %d hashes.", len(sample_specific_xchr_sig))
    -        sample_autosomal_sig = self.sample_sig & autosomals_genome_sig
    -        self.logger.debug("\t-Intersected sample signature with autosomal genome k-mers = %d hashes.", len(sample_autosomal_sig))
    +        **Raises**:
    +
    +            - `ValueError`:  
    +              - If the `'sex-x'` chromosome signature is not found in `genome_and_chr_to_sig`.
    +              - If the autosomal genome signature is not found or improperly labeled.
    +
    +        **Usage Example**:
    +
    +        ```python
    +        # Assume `genome_and_chr_signatures` is a dictionary of genome and chromosome-specific SnipeSig instances
    +        genome_and_chr_signatures = {
    +            "autosomal-snipegenome": sig_autosomal_genome,
    +            "1": sig_chr1,
    +            "2": sig_chr2,
    +            "sex-x": sig_sex_x,
    +            "sex-y": sig_sex_y
    +        }
    +
    +        # Calculate sex chromosome metrics
    +        metrics = qc.calculate_sex_chrs_metrics(genome_and_chr_to_sig=genome_and_chr_signatures)
    +
    +        print(metrics)
    +        # Output Example:
    +        # {
    +        #     "X-Ploidy score": 2.6667,
    +        #     "Y-Coverage": 0.0
    +        # }
    +        ```
    +
    +        **Notes**:
     
    -        # Retrieve mean abundances
    -        xchr_mean_abundance = sample_specific_xchr_sig.get_sample_stats.get("mean_abundance", 0.0)
    -        autosomal_mean_abundance = sample_autosomal_sig.get_sample_stats.get("mean_abundance", 0.0)
    +            - **Signature Naming Convention**:  
    +              The autosomal genome signature must have a name ending with `'-snipegenome'`. Chromosome-specific
    +              signatures should be named accordingly (e.g., `'sex-x'`, `'sex-y'`, `'autosomal-1'`, `'autosomal-2'`, etc.).
     
    -        # Calculate X-Ploidy score
    -        if autosomal_mean_abundance == 0:
    -            self.logger.warning("Autosomal mean abundance is zero. Setting X-Ploidy score to zero to avoid division by zero.")
    -            xploidy_score = 0.0
    -        else:
    -            xploidy_score = (xchr_mean_abundance / autosomal_mean_abundance) * \
    -                            (len(autosomals_genome_sig) / len(specific_xchr_sig) if len(specific_xchr_sig) > 0 else 0.0)
    -
    -        self.logger.debug("Calculated X-Ploidy score: %.4f", xploidy_score)
    -        self.sex_stats.update({"X-Ploidy score": xploidy_score})
    -
    -        # Calculate Y-Coverage if Y chromosome is present
    -        if 'sex-y' in specific_chr_to_sig:
    -            self.logger.debug("Calculating Y-Coverage based on Y chromosome-specific k-mers.")
    -
    -            # Derive Y chromosome-specific k-mers by excluding autosomal and X chromosome k-mers
    -            ychr_specific_kmers = chr_to_sig["sex-y"] - autosomals_genome_sig - specific_xchr_sig
    -            self.logger.debug("\t-Derived Y chromosome-specific signature size: %d hashes.", len(ychr_specific_kmers))
    -
    -            # Intersect Y chromosome-specific k-mers with the sample signature
    -            ychr_in_sample = self.sample_sig & ychr_specific_kmers
    -            self.logger.debug("\t-Intersected sample signature with Y chromosome-specific k-mers = %d hashes.", len(ychr_in_sample))
    -            if len(ychr_in_sample) == 0:
    -                self.logger.warning("No Y chromosome-specific k-mers found in the sample signature.")
    +            - **Exclusion of Sex Chromosomes from Autosomal Genome**:  
    +              The Y chromosome signature (`'sex-y'`) is subtracted from the autosomal genome signature to ensure
    +              that Y chromosome k-mers are not counted towards autosomal metrics.
    +
    +            - **Robustness**:  
    +              The method includes comprehensive logging for debugging purposes, tracking each major step and
    +              any exclusions made during processing.
    +        """
    +
    +        # Ensure that the chromosome X signature exists
    +        if 'sex-x' not in genome_and_chr_to_sig:
    +            self.logger.warning("Chromosome X ('sex-x') not found in the provided signatures. X-Ploidy score will be set to zero.")
    +            # set sex-x to an empty signature
    +            genome_and_chr_to_sig['sex-x'] = SnipeSig.create_from_hashes_abundances(
    +                hashes=np.array([], dtype=np.uint64),
    +                abundances=np.array([], dtype=np.uint32),
    +                ksize=genome_and_chr_to_sig[list(genome_and_chr_to_sig.keys())[0]].ksize,
    +                scale=genome_and_chr_to_sig[list(genome_and_chr_to_sig.keys())[0]].scale,
    +            )
    +
    +        # Separate the autosomal genome signature from chromosome-specific signatures
    +        chr_to_sig: Dict[str, SnipeSig] = {}
    +        autosomals_genome_sig: Optional[SnipeSig] = None
    +        self.logger.debug("Separating autosomal genome signature from chromosome-specific signatures.")
     
    -            # Derive autosomal-specific k-mers by excluding X and Y chromosome k-mers from the reference signature
    -            autosomals_specific_kmers = self.reference_sig - specific_chr_to_sig["sex-x"] - specific_chr_to_sig['sex-y']
    -
    -            # Calculate Y-Coverage metric
    -            if len(ychr_specific_kmers) == 0 or len(autosomals_specific_kmers) == 0:
    -                self.logger.warning("Insufficient k-mers for Y-Coverage calculation. Setting Y-Coverage to zero.")
    -                ycoverage = 0.0
    -            else:
    -                ycoverage = (len(ychr_in_sample) / len(ychr_specific_kmers)) / \
    -                        (len(sample_autosomal_sig) / len(autosomals_specific_kmers))
    +        for name, sig in genome_and_chr_to_sig.items():
    +            if name.endswith('-snipegenome'):
    +                self.logger.debug("\t- Identified autosomal genome signature: '%s'.", name)
    +                autosomals_genome_sig = sig
    +            else:
    +                chr_to_sig[name] = sig
    +
    +        if autosomals_genome_sig is None:
    +            self.logger.error("Autosomal genome signature (ending with '-snipegenome') not found.")
    +            raise ValueError("Autosomal genome signature (ending with '-snipegenome') not found.")
     
    -            self.logger.debug("Calculated Y-Coverage: %.4f", ycoverage)
    -            self.sex_stats.update({"Y-Coverage": ycoverage})
    +        # Ensure all chromosome signatures have unique hashes
    +        specific_chr_to_sig = SnipeSig.get_unique_signatures(chr_to_sig)
     
    -        return self.sex_stats
    +        # Exclude Y chromosome from the autosomal genome signature if present
    +        if 'sex-y' in chr_to_sig:
    +            self.logger.debug("Y chromosome ('sex-y') detected. Removing its hashes from the autosomal genome signature.")
    +            self.logger.debug("\t- Original autosomal genome size: %d hashes.", len(autosomals_genome_sig))
    +            autosomals_genome_sig = autosomals_genome_sig - chr_to_sig['sex-y']
    +            self.logger.debug("\t- Updated autosomal genome size after removing Y chromosome: %d hashes.", len(autosomals_genome_sig))
    +
    +        # Remove X chromosome hashes from the autosomal genome signature
    +        self.logger.debug("Removing X chromosome ('sex-x') hashes from the autosomal genome signature.")
    +        autosomals_genome_sig = autosomals_genome_sig - chr_to_sig['sex-x']
    +        self.logger.debug("\t- Updated autosomal genome size after removing X chromosome: %d hashes.", len(autosomals_genome_sig))
    +
    +        # Derive the X chromosome-specific signature by subtracting autosomal genome hashes
    +        specific_xchr_sig = specific_chr_to_sig["sex-x"] - autosomals_genome_sig
    +        self.logger.debug("\t-Derived X chromosome-specific signature size: %d hashes.", len(specific_xchr_sig))
    +
    +        # Intersect the sample signature with chromosome-specific signatures
    +        sample_specific_xchr_sig = self.sample_sig & specific_xchr_sig
    +        if len(sample_specific_xchr_sig) == 0:
    +            self.logger.warning("No X chromosome-specific k-mers found in the sample signature.")
    +        self.logger.debug("\t-Intersected sample signature with X chromosome-specific k-mers = %d hashes.", len(sample_specific_xchr_sig))
    +        sample_autosomal_sig = self.sample_sig & autosomals_genome_sig
    +        self.logger.debug("\t-Intersected sample signature with autosomal genome k-mers = %d hashes.", len(sample_autosomal_sig))
    +
    +        # Retrieve mean abundances
    +        xchr_mean_abundance = sample_specific_xchr_sig.get_sample_stats.get("mean_abundance", 0.0)
    +        autosomal_mean_abundance = sample_autosomal_sig.get_sample_stats.get("mean_abundance", 0.0)
    +
    +        # Calculate X-Ploidy score
    +        if autosomal_mean_abundance == 0:
    +            self.logger.warning("Autosomal mean abundance is zero. Setting X-Ploidy score to zero to avoid division by zero.")
    +            xploidy_score = 0.0
    +        else:
    +            xploidy_score = (xchr_mean_abundance / autosomal_mean_abundance) * \
    +                            (len(autosomals_genome_sig) / len(specific_xchr_sig) if len(specific_xchr_sig) > 0 else 0.0)
    +
    +        self.logger.debug("Calculated X-Ploidy score: %.4f", xploidy_score)
    +        self.sex_stats.update({"X-Ploidy score": xploidy_score})
    +
    +        # Calculate Y-Coverage if Y chromosome is present
    +        if 'sex-y' in specific_chr_to_sig:
    +            self.logger.debug("Calculating Y-Coverage based on Y chromosome-specific k-mers.")
    +
    +            # Derive Y chromosome-specific k-mers by excluding autosomal and X chromosome k-mers
    +            ychr_specific_kmers = chr_to_sig["sex-y"] - autosomals_genome_sig - specific_xchr_sig
    +            self.logger.debug("\t-Derived Y chromosome-specific signature size: %d hashes.", len(ychr_specific_kmers))
    +
    +            # Intersect Y chromosome-specific k-mers with the sample signature
    +            ychr_in_sample = self.sample_sig & ychr_specific_kmers
    +            self.logger.debug("\t-Intersected sample signature with Y chromosome-specific k-mers = %d hashes.", len(ychr_in_sample))
    +            if len(ychr_in_sample) == 0:
    +                self.logger.warning("No Y chromosome-specific k-mers found in the sample signature.")
    +
    +            # Derive autosomal-specific k-mers by excluding X and Y chromosome k-mers from the reference signature
    +            autosomals_specific_kmers = self.reference_sig - specific_chr_to_sig["sex-x"] - specific_chr_to_sig['sex-y']
    +
    +            # Calculate Y-Coverage metric
    +            if len(ychr_specific_kmers) == 0 or len(autosomals_specific_kmers) == 0:
    +                self.logger.warning("Insufficient k-mers for Y-Coverage calculation. Setting Y-Coverage to zero.")
    +                ycoverage = 0.0
    +            else:
    +                ycoverage = (len(ychr_in_sample) / len(ychr_specific_kmers)) / \
    +                        (len(sample_autosomal_sig) / len(autosomals_specific_kmers))
    +
    +            self.logger.debug("Calculated Y-Coverage: %.4f", ycoverage)
    +            self.sex_stats.update({"Y-Coverage": ycoverage})
    +
    +        return self.sex_stats
    +
    +
    +
    +    def nonref_consume_from_vars(self, *, vars: Dict[str, SnipeSig], vars_order: List[str], **kwargs) -> Dict[str, float]:
    +        r"""
    +        Consume and analyze non-reference k-mers from provided variable signatures.
    +
    +        This method processes non-reference k-mers in the sample signature by intersecting them with a set of
    +        variable-specific `SnipeSig` instances. It calculates coverage and total abundance metrics for each
    +        variable in a specified order, ensuring that each non-reference k-mer is accounted for without overlap
    +        between variables. The method updates internal statistics that reflect the distribution of non-reference
    +        k-mers across the provided variables.
    +
    +        **Process Overview**:
    +
    +        1. **Validation**:
    +        - Verifies that all variable names specified in `vars_order` are present in the `vars` dictionary.
    +        - Raises a `ValueError` if any variable in `vars_order` is missing from `vars`.
    +
    +        2. **Non-Reference K-mer Extraction**:
    +        - Computes the set of non-reference non-singleton k-mers by subtracting the reference signature from the sample signature.
    +        - If no non-reference k-mers are found, the method logs a warning and returns an empty dictionary.
    +
    +        3. **Variable-wise Consumption**:
    +        - Iterates over each variable name in `vars_order`.
    +        - For each variable:
    +            - Intersects the remaining non-reference k-mers with the variable-specific signature.
    +            - Calculates the total abundance and coverage index for the intersected k-mers.
    +            - Updates the `vars_nonref_stats` dictionary with the computed metrics.
    +            - Removes the consumed k-mers from the remaining non-reference set to prevent overlap.
    +
    +        4. **Final State Logging**:
    +        - Logs the final size and total abundance of the remaining non-reference k-mers after consumption.
    +
    +        **Parameters**:
    +
    +            - `vars` (`Dict[str, SnipeSig]`):  
    +            A dictionary mapping variable names to their corresponding `SnipeSig` instances. Each `SnipeSig` 
    +            represents a set of k-mers associated with a specific non-reference category or variable.
    +
    +            - `vars_order` (`List[str]`):  
    +            A list specifying the order in which variables should be processed. The order determines the priority 
    +            of consumption, ensuring that earlier variables in the list have their k-mers accounted for before 
    +            later ones.
    +
    +            - `**kwargs`:  
    +            Additional keyword arguments. Reserved for future extensions and should not be used in the current context.
    +
    +        **Returns**:
    +
    +            - `Dict[str, float]`:  
    +            A dictionary containing statistics for each variable name in `vars_order`, 
    +                - `"non-genomic total k-mer abundance"` (`float`):  
    +                    The sum of abundances of non-reference k-mers associated with the variable.
    +                - `"non-genomic coverage index"` (`float`):  
    +                    The ratio of unique non-reference k-mers associated with the variable to the total number 
    +                    of non-reference k-mers in the sample before consumption.
    +
    +            Example Output:
    +            ```python
    +            {
    +                "variable_A non-genomic total k-mer abundance": 1500.0,
    +                "variable_A non-genomic coverage index": 0.20
    +                "variable_B non-genomic total k-mer abundance": 3500.0,
    +                "variable_B non-genomic coverage index": 0.70
    +                "non-var non-genomic total k-mer abundance": 0.10,
    +                "non-var non-genomic coverage index": 218
    +            }
    +            ```
    +
    +        **Raises**:
    +
    +            - `ValueError`:  
    +            - If any variable specified in `vars_order` is not present in the `vars` dictionary.
    +            - This ensures that all variables intended for consumption are available for processing.
    +
    +        **Usage Example**:
    +
    +        ```python
    +        # Assume `variables_signatures` is a dictionary of variable-specific SnipeSig instances
    +        variables_signatures = {
    +            "GTDB": sig_GTDB,
    +            "VIRALDB": sig_VIRALDB,
    +            "contaminant_X": sig_contaminant_x
    +        }
    +
    +        # Define the order in which variables should be processed
    +        processing_order = ["GTDB", "VIRALDB", "contaminant_X"]
    +
    +        # Consume non-reference k-mers and retrieve statistics
    +        nonref_stats = qc.nonref_consume_from_vars(vars=variables_signatures, vars_order=processing_order)
    +
    +        print(nonref_stats)
    +        # Output Example:
    +        # {
    +        #     "GTDB non-genomic total k-mer abundance": 1500.0,
    +        #     "GTDB non-genomic coverage index": 0.2,
    +        #     "VIRALDB non-genomic total k-mer abundance": 3500.0,
    +        #     "VIRALDB non-genomic coverage index": 0.70,
    +        #     "contaminant_X non-genomic total k-mer abundance": 0.0,
    +        #     "contaminant_X non-genomic coverage index": 0.0,
    +        #     "non-var non-genomic total k-mer abundance": 100.0,
    +        #     "non-var non-genomic coverage index": 0.1
    +        # }
    +        ```
    +
    +        **Notes**:
    +
    +            - **Variable Processing Order**:  
    +            The `vars_order` list determines the sequence in which variables are processed. This order is crucial
    +            when there is potential overlap in k-mers between variables, as earlier variables in the list have 
    +            higher priority in consuming shared k-mers.
    +
    +            - **Non-Reference K-mers Definition**:  
    +            Non-reference k-mers are defined as those present in the sample signature but absent in the reference 
    +            signature. This method focuses on characterizing these unique k-mers relative to provided variables.
    +        """
    +
    +        # check the all vars in vars_order are in vars
    +        if not all([var in vars for var in vars_order]):
    +            # report dict keys, and the vars order
    +            self.logger.debug("Provided vars_order: %s, and vars keys: %s", vars_order, list(vars.keys()))
    +            self.logger.error("All variables in vars_order must be present in vars.")
    +            raise ValueError("All variables in vars_order must be present in vars.")
    +
    +        self.logger.debug("Consuming non-reference k-mers from provided variables.")
    +        self.logger.debug("\t-Current size of the sample signature: %d hashes.", len(self.sample_sig))
    +
    +        sample_nonref = self.sample_sig - self.reference_sig
    +
    +        sample_nonref.trim_singletons()
    +
    +        sample_nonref_unique_hashes = len(sample_nonref)
    +
    +        self.logger.debug("\t-Size of non-reference k-mers in the sample signature: %d hashes.", len(sample_nonref))
    +        if len(sample_nonref) == 0:
    +            self.logger.warning("No non-reference k-mers found in the sample signature.")
    +            return {}
    +
    +        # intersect and report coverage and depth, then subtract from sample_nonref so sum will be 100%
    +        for var_name in vars_order:
    +            sample_nonref_var: SnipeSig = sample_nonref & vars[var_name]
    +            sample_nonref_var_total_abundance = sample_nonref_var.total_abundance
    +            sample_nonref_var_unique_hashes = len(sample_nonref_var)
    +            sample_nonref_var_coverage_index = sample_nonref_var_unique_hashes / sample_nonref_unique_hashes
    +            self.vars_nonref_stats.update({
    +                f"{var_name} non-genomic total k-mer abundance": sample_nonref_var_total_abundance,
    +                f"{var_name} non-genomic coverage index": sample_nonref_var_coverage_index
    +            })
    +
    +            self.logger.debug("\t-Consuming non-reference k-mers from variable '%s'.", var_name)
    +            sample_nonref -= sample_nonref_var
    +            self.logger.debug("\t-Size of remaining non-reference k-mers in the sample signature: %d hashes.", len(sample_nonref))
    +
    +        self.vars_nonref_stats["non-var non-genomic total k-mer abundance"] = sample_nonref.total_abundance
    +        self.vars_nonref_stats["non-var non-genomic coverage index"] = len(sample_nonref) / sample_nonref_unique_hashes if sample_nonref_unique_hashes > 0 else 0.0
    +
    +        self.logger.debug(
    +            "After consuming all vars from the non reference k-mers, the size of the sample signature is: %d hashes, "
    +            "with total abundance of %s.", 
    +            len(sample_nonref), sample_nonref.total_abundance
    +        )
    +
    +        return self.vars_nonref_stats
    +
    +    def load_genome_sig_to_dict(self, *, zip_file_path: str, **kwargs) -> Dict[str, 'SnipeSig']:
    +        """
    +        Load a genome signature into a dictionary of SnipeSig instances.
    +
    +        Parameters:
    +            zip_file_path (str): Path to the zip file containing the genome signatures.
    +            **kwargs: Additional keyword arguments to pass to the SnipeSig constructor.
    +
    +        Returns:
    +            Dict[str, SnipeSig]: A dictionary mapping genome names to SnipeSig instances.
    +        """
    +
    +        genome_chr_name_to_sig = {}
    +
    +        sourmash_sigs: List[sourmash.signature.SourmashSignature] = sourmash.load_file_as_signatures(zip_file_path)
    +        sex_count = 0
    +        autosome_count = 0
    +        genome_count = 0
    +        for sig in sourmash_sigs:
    +            name = sig.name
    +            if name.endswith("-snipegenome"):
    +                self.logger.debug(f"Loading genome signature: {name}")
    +                restored_name = name.replace("-snipegenome", "")
    +                genome_chr_name_to_sig[restored_name] = SnipeSig(sourmash_sig=sig, sig_type=SigType.GENOME)
    +                genome_count += 1
    +            elif "sex" in name:
    +                sex_count += 1
    +                genome_chr_name_to_sig[name.replace('sex-','')] = SnipeSig(sourmash_sig=sig, sig_type=SigType.GENOME)
    +            elif "autosome" in name:
    +                autosome_count += 1
    +                genome_chr_name_to_sig[name.replace('autosome-','')] = SnipeSig(sourmash_sig=sig, sig_type=SigType.GENOME)
    +            else:
    +                logging.warning(f"Unknown genome signature name: {name}, are you sure you generated this with `snipe sketch --ref`?")
    +
    +        self.logger.debug("Loaded %d genome signatures and %d sex chrs and %d autosome chrs", genome_count, sex_count, autosome_count)
    +
    +        if genome_count != 1:
    +            logging.error(f"Expected 1 genome signature, found {genome_count}")
    +
    +
    +        return genome_chr_name_to_sig
     
    @@ -3647,69 +4968,7 @@

    Source code in src/snipe/api/reference_QC.py -
     970
    - 971
    - 972
    - 973
    - 974
    - 975
    - 976
    - 977
    - 978
    - 979
    - 980
    - 981
    - 982
    - 983
    - 984
    - 985
    - 986
    - 987
    - 988
    - 989
    - 990
    - 991
    - 992
    - 993
    - 994
    - 995
    - 996
    - 997
    - 998
    - 999
    -1000
    -1001
    -1002
    -1003
    -1004
    -1005
    -1006
    -1007
    -1008
    -1009
    -1010
    -1011
    -1012
    -1013
    -1014
    -1015
    -1016
    -1017
    -1018
    -1019
    -1020
    -1021
    -1022
    -1023
    -1024
    -1025
    -1026
    -1027
    -1028
    -1029
    -1030
    -1031
    -1032
    +              
    1032
     1033
     1034
     1035
    @@ -3745,108 +5004,180 @@ 

    1065 1066 1067 -1068

    def calculate_chromosome_metrics(self, chr_to_sig: Dict[str, SnipeSig]) -> Dict[str, Any]:
    -    r"""
    -    Calculate the coefficient of variation (CV) of mean abundances across autosomal chromosomes.
    -
    -    This method computes the CV to assess the variability of mean abundances among autosomal chromosomes,
    -    excluding any sex chromosomes.
    -
    -    **Mathematical Explanation**:
    -
    -    The Coefficient of Variation (CV) is defined as:
    -
    -    $$
    -    \text{CV} = \frac{\sigma}{\mu}
    -    $$
    -
    -    Where:
    -    - \( \sigma \) is the standard deviation of the mean abundances across autosomal chromosomes.
    -    - \( \mu \) is the mean of the mean abundances across autosomal chromosomes.
    -
    -    **Parameters**:
    -
    -    - `chr_to_sig` (`Dict[str, SnipeSig]`):  
    -      A dictionary mapping chromosome names (e.g., `'autosomal-1'`, `'autosomal-2'`, `'sex-x'`, `'sex-y'`) to their corresponding
    -      `SnipeSig` instances. Each `SnipeSig` should represent the k-mer signature of a specific chromosome.
    -
    -    **Returns**:
    -
    -    - `Dict[str, Any]`:  
    -      A dictionary containing the computed metrics:
    -          - `"Autosomal_CV"` (`float`):  
    -            The coefficient of variation of mean abundances across autosomal chromosomes.
    -
    -    **Raises**:
    -
    -    - `ValueError`:  
    -      If `chr_to_sig` is empty or if there is an inconsistency in the signatures' parameters.
    -
    -    **Usage Example**:
    -
    -    ```python
    -    # Assume `chr_signatures` is a dictionary of chromosome-specific SnipeSig instances
    -    chr_signatures = {
    -        "1": sig_chr1,
    -        "2": sig_chr2,
    -        "X": sig_chrX,
    -        "Y": sig_chrY
    -    }
    -
    -    # Calculate chromosome metrics
    -    metrics = qc.calculate_chromosome_metrics(chr_to_sig=chr_signatures)
    -
    -    print(metrics)
    -    # Output:
    -    # {'Autosomal_CV': 0.15}
    -    ```
    -
    -    **Notes**:
    -
    -    - **Exclusion of Sex Chromosomes**:  
    -      Chromosomes with names containing the substring `"sex"` (e.g., `'sex-y'`, `'sex-x'`) are excluded from the CV calculation to focus solely on autosomal chromosomes.
    -
    -    - **Mean Abundance Calculation**:  
    -      The mean abundance for each chromosome is calculated by intersecting the sample signature with the chromosome-specific signature and averaging the abundances of the shared k-mers.
    -    """
    -
    -    # Implementation of the method
    -    # let's make sure all chromosome sigs are unique
    -    specific_chr_to_sig = SnipeSig.get_unique_signatures(chr_to_sig)
    -
    -    # calculate mean abundance for each chromosome and loaded sample sig
    -    chr_to_mean_abundance = {}
    -    self.logger.debug("Calculating mean abundance for each chromosome.")
    -    for chr_name, chr_sig in specific_chr_to_sig.items():
    -        chr_sample_sig = self.sample_sig & chr_sig
    -        chr_stats = chr_sample_sig.get_sample_stats
    -        chr_to_mean_abundance[chr_name] = chr_stats["mean_abundance"]
    -        self.logger.debug("\t-Mean abundance for %s: %f", chr_name, chr_stats["mean_abundance"])
    -
    -
    -    # chr_to_mean_abundance but without any chr with partial name sex
    -    autosomal_chr_to_mean_abundance = {}
    -    for chr_name, mean_abundance in chr_to_mean_abundance.items():
    -        if "sex" in chr_name.lower():
    -            continue
    -        autosomal_chr_to_mean_abundance[chr_name] = mean_abundance
    -
    -
    -    # calculate the CV for the whole sample
    -    if autosomal_chr_to_mean_abundance:
    -        mean_abundances = np.array(list(autosomal_chr_to_mean_abundance.values()), dtype=float)
    -        cv = np.std(mean_abundances) / np.mean(mean_abundances) if np.mean(mean_abundances) != 0 else 0.0
    -        self.chrs_stats.update({"Autosomal_CV": cv})
    -        self.logger.debug("Calculated Autosomal CV: %f", cv)
    -    else:
    -        self.logger.warning("No autosomal chromosomes were processed. 'Autosomal_CV' set to None.")
    -        self.chrs_stats.update({"Autosomal_CV": None})
    -
    -    # optional return, not required
    -    return self.chrs_stats
    -
    - -
    +1068 +1069 +1070 +1071 +1072 +1073 +1074 +1075 +1076 +1077 +1078 +1079 +1080 +1081 +1082 +1083 +1084 +1085 +1086 +1087 +1088 +1089 +1090 +1091 +1092 +1093 +1094 +1095 +1096 +1097 +1098 +1099 +1100 +1101 +1102 +1103 +1104 +1105 +1106 +1107 +1108 +1109 +1110 +1111 +1112 +1113 +1114 +1115 +1116 +1117 +1118 +1119 +1120 +1121 +1122 +1123 +1124 +1125 +1126 +1127 +1128 +1129 +1130 +1131 +1132 +1133 +1134 +1135
    def calculate_chromosome_metrics(self, chr_to_sig: Dict[str, SnipeSig]) -> Dict[str, Any]:
    +    r"""
    +    Calculate the coefficient of variation (CV) of mean abundances across autosomal chromosomes.
    +
    +    This method computes the CV to assess the variability of mean abundances among autosomal chromosomes,
    +    excluding any sex chromosomes.
    +
    +    **Mathematical Explanation**:
    +
    +    The Coefficient of Variation (CV) is defined as:
    +
    +    $$
    +    \text{CV} = \frac{\sigma}{\mu}
    +    $$
    +
    +    Where:
    +    - \( \sigma \) is the standard deviation of the mean abundances across autosomal chromosomes.
    +    - \( \mu \) is the mean of the mean abundances across autosomal chromosomes.
    +
    +    **Parameters**:
    +
    +    - `chr_to_sig` (`Dict[str, SnipeSig]`):  
    +      A dictionary mapping chromosome names (e.g., `'autosomal-1'`, `'autosomal-2'`, `'sex-x'`, `'sex-y'`) to their corresponding
    +      `SnipeSig` instances. Each `SnipeSig` should represent the k-mer signature of a specific chromosome.
    +
    +    **Returns**:
    +
    +    - `Dict[str, Any]`:  
    +      A dictionary containing the computed metrics:
    +          - `"Autosomal_CV"` (`float`):  
    +            The coefficient of variation of mean abundances across autosomal chromosomes.
    +
    +    **Raises**:
    +
    +    - `ValueError`:  
    +      If `chr_to_sig` is empty or if there is an inconsistency in the signatures' parameters.
    +
    +    **Usage Example**:
    +
    +    ```python
    +    # Assume `chr_signatures` is a dictionary of chromosome-specific SnipeSig instances
    +    chr_signatures = {
    +        "1": sig_chr1,
    +        "2": sig_chr2,
    +        "X": sig_chrX,
    +        "Y": sig_chrY
    +    }
    +
    +    # Calculate chromosome metrics
    +    metrics = qc.calculate_chromosome_metrics(chr_to_sig=chr_signatures)
    +
    +    print(metrics)
    +    # Output:
    +    # {'Autosomal_CV': 0.15}
    +    ```
    +
    +    **Notes**:
    +
    +    - **Exclusion of Sex Chromosomes**:  
    +      Chromosomes with names containing the substring `"sex"` (e.g., `'sex-y'`, `'sex-x'`) are excluded from the CV calculation to focus solely on autosomal chromosomes.
    +
    +    - **Mean Abundance Calculation**:  
    +      The mean abundance for each chromosome is calculated by intersecting the sample signature with the chromosome-specific signature and averaging the abundances of the shared k-mers.
    +    """
    +
    +    # Implementation of the method
    +    # let's make sure all chromosome sigs are unique
    +    self.logger.debug("Computing specific chromosome hashes for %s.", ','.join(chr_to_sig.keys()))
    +    self.logger.debug(f"\t-All hashes for chromosomes before getting unique sigs {len(SnipeSig.sum_signatures(list(chr_to_sig.values())))}")
    +    specific_chr_to_sig = SnipeSig.get_unique_signatures(chr_to_sig)
    +    self.logger.debug(f"\t-All hashes for chromosomes after getting unique sigs {len(SnipeSig.sum_signatures(list(specific_chr_to_sig.values())))}")
    +
    +    # calculate mean abundance for each chromosome and loaded sample sig
    +    chr_to_mean_abundance = {}
    +    self.logger.debug("Calculating mean abundance for each chromosome.")
    +    for chr_name, chr_sig in specific_chr_to_sig.items():
    +        self.logger.debug("Intersecting %s (%d) with %s (%d)", self.sample_sig.name, len(self.sample_sig), chr_name, len(chr_sig))
    +        chr_sample_sig = self.sample_sig & chr_sig
    +        chr_stats = chr_sample_sig.get_sample_stats
    +        chr_to_mean_abundance[chr_name] = chr_stats["mean_abundance"]
    +        self.logger.debug("\t-Mean abundance for %s: %f", chr_name, chr_stats["mean_abundance"])
    +
    +    self.chrs_stats.update(chr_to_mean_abundance)
    +
    +    # chr_to_mean_abundance but without any chr with partial name sex
    +    autosomal_chr_to_mean_abundance = {}
    +    for chr_name, mean_abundance in chr_to_mean_abundance.items():
    +        if "sex" in chr_name.lower():
    +            continue
    +        autosomal_chr_to_mean_abundance[chr_name] = mean_abundance
    +
    +
    +    # calculate the CV for the whole sample
    +    if autosomal_chr_to_mean_abundance:
    +        mean_abundances = np.array(list(autosomal_chr_to_mean_abundance.values()), dtype=float)
    +        cv = np.std(mean_abundances) / np.mean(mean_abundances) if np.mean(mean_abundances) != 0 else 0.0
    +        self.chrs_stats.update({"Autosomal_CV": cv})
    +        self.logger.debug("Calculated Autosomal CV: %f", cv)
    +    else:
    +        self.logger.warning("No autosomal chromosomes were processed. 'Autosomal_CV' set to None.")
    +        self.chrs_stats.update({"Autosomal_CV": None})
    +
    +    # optional return, not required
    +    return self.chrs_stats
    +
    + +

    @@ -3904,61 +5235,7 @@

    Source code in src/snipe/api/reference_QC.py -
    718
    -719
    -720
    -721
    -722
    -723
    -724
    -725
    -726
    -727
    -728
    -729
    -730
    -731
    -732
    -733
    -734
    -735
    -736
    -737
    -738
    -739
    -740
    -741
    -742
    -743
    -744
    -745
    -746
    -747
    -748
    -749
    -750
    -751
    -752
    -753
    -754
    -755
    -756
    -757
    -758
    -759
    -760
    -761
    -762
    -763
    -764
    -765
    -766
    -767
    -768
    -769
    -770
    -771
    -772
    +              
    772
     773
     774
     775
    @@ -4006,109 +5283,179 @@ 

    817 818 819 -820

    def calculate_coverage_vs_depth(self, n: int = 30) -> List[Dict[str, Any]]:
    -    r"""
    -    Calculate cumulative coverage index vs cumulative sequencing depth.
    -
    -    This method simulates incremental sequencing by splitting the sample signature into `n` parts and
    -    calculating the cumulative coverage index at each step. It helps in understanding how coverage
    -    improves with increased sequencing depth.
    -
    -    **Mathematical Explanation**:
    -
    -    For each cumulative part \( i \) (where \( 1 \leq i \leq n \)):
    -
    -    - **Cumulative Sequencing Depth** (\( D_i \)):
    -      $$
    -      D_i = \sum_{j=1}^{i} a_j
    -      $$
    -      Where \( a_j \) is the total abundance of the \( j^{th} \) part.
    -
    -    - **Cumulative Coverage Index** (\( C_i \)):
    -      $$
    -      C_i = \frac{\text{Number of genomic unique k-mers in first } i \text{ parts}}{\left| \text{Reference genome k-mer set} \right|}
    -      $$
    -
    -    **Parameters**:
    -
    -    - `n` (`int`): Number of parts to split the signature into.
    -
    -    **Returns**:
    -
    -    - `List[Dict[str, Any]]`:  
    -      List of dictionaries containing:
    -        - `"cumulative_parts"` (`int`): Number of parts included.
    -        - `"cumulative_total_abundance"` (`int`): Total sequencing depth up to this part.
    -        - `"cumulative_coverage_index"` (`float`): Coverage index up to this part.
    -
    -    **Usage Example**:
    -
    -    ```python
    -    coverage_depth_data = qc.calculate_coverage_vs_depth(n=10)
    -    for data in coverage_depth_data:
    -        print(data)
    -    ```
    -    """
    -    self.logger.debug("Calculating coverage vs depth with %d parts.", n)
    -    # Determine the ROI reference signature
    -    if isinstance(self.amplicon_sig, SnipeSig):
    -        roi_reference_sig = self.amplicon_sig
    -        self.logger.debug("Using amplicon signature as ROI reference.")
    -    else:
    -        roi_reference_sig = self.reference_sig
    -        self.logger.debug("Using reference genome signature as ROI reference.")
    -
    -    # Split the sample signature into n random parts
    -    split_sigs = self.split_sig_randomly(n)
    -
    -    coverage_depth_data = []
    -
    -    cumulative_snipe_sig = split_sigs[0].copy()
    -    cumulative_total_abundance = cumulative_snipe_sig.total_abundance
    -
    -    #! force conversion to GENOME
    -    roi_reference_sig.sigtype = SigType.GENOME
    -
    -    # Compute initial coverage index
    -    cumulative_qc = ReferenceQC(
    -        sample_sig=cumulative_snipe_sig,
    -        reference_sig=roi_reference_sig,
    -        enable_logging=self.enable_logging
    -    )
    -    cumulative_stats = cumulative_qc.get_aggregated_stats()
    -    cumulative_coverage_index = cumulative_stats["Genome coverage index"]
    +820
    +821
    +822
    +823
    +824
    +825
    +826
    +827
    +828
    +829
    +830
    +831
    +832
    +833
    +834
    +835
    +836
    +837
    +838
    +839
    +840
    +841
    +842
    +843
    +844
    +845
    +846
    +847
    +848
    +849
    +850
    +851
    +852
    +853
    +854
    +855
    +856
    +857
    +858
    +859
    +860
    +861
    +862
    +863
    +864
    +865
    +866
    +867
    +868
    +869
    +870
    +871
    +872
    +873
    +874
    +875
    +876
    +877
    +878
    +879
    +880
    +881
    +882
    def calculate_coverage_vs_depth(self, n: int = 30) -> List[Dict[str, Any]]:
    +    r"""
    +    Calculate cumulative coverage index vs cumulative sequencing depth.
    +
    +    This method simulates incremental sequencing by splitting the sample signature into `n` parts and
    +    calculating the cumulative coverage index at each step. It helps in understanding how coverage
    +    improves with increased sequencing depth.
    +
    +    **Mathematical Explanation**:
    +
    +    For each cumulative part \( i \) (where \( 1 \leq i \leq n \)):
    +
    +    - **Cumulative Sequencing Depth** (\( D_i \)):
    +      $$
    +      D_i = \sum_{j=1}^{i} a_j
    +      $$
    +      Where \( a_j \) is the total abundance of the \( j^{th} \) part.
     
    -    coverage_depth_data.append({
    -        "cumulative_parts": 1,
    -        "cumulative_total_abundance": cumulative_total_abundance,
    -        "cumulative_coverage_index": cumulative_coverage_index,
    -    })
    -
    -    # Iterate over the rest of the parts
    -    for i in range(1, n):
    -        current_part = split_sigs[i]
    -
    -        # Add current part to cumulative signature
    -        cumulative_snipe_sig += current_part
    -        cumulative_total_abundance += current_part.total_abundance
    -
    -        # Compute new coverage index
    -        cumulative_qc = ReferenceQC(
    -            sample_sig=cumulative_snipe_sig,
    -            reference_sig=roi_reference_sig,
    -            enable_logging=self.enable_logging
    -        )
    -        cumulative_stats = cumulative_qc.get_aggregated_stats()
    -        cumulative_coverage_index = cumulative_stats["Genome coverage index"]
    -
    -        coverage_depth_data.append({
    -            "cumulative_parts": i + 1,
    -            "cumulative_total_abundance": cumulative_total_abundance,
    -            "cumulative_coverage_index": cumulative_coverage_index,
    -        })
    -
    -    self.logger.debug("Coverage vs depth calculation completed.")
    -    return coverage_depth_data
    +    - **Cumulative Coverage Index** (\( C_i \)):
    +      $$
    +      C_i = \frac{\text{Number of genomic unique k-mers in first } i \text{ parts}}{\left| \text{Reference genome k-mer set} \right|}
    +      $$
    +
    +    **Parameters**:
    +
    +    - `n` (`int`): Number of parts to split the signature into.
    +
    +    **Returns**:
    +
    +    - `List[Dict[str, Any]]`:  
    +      List of dictionaries containing:
    +        - `"cumulative_parts"` (`int`): Number of parts included.
    +        - `"cumulative_total_abundance"` (`int`): Total sequencing depth up to this part.
    +        - `"cumulative_coverage_index"` (`float`): Coverage index up to this part.
    +
    +    **Usage Example**:
    +
    +    ```python
    +    coverage_depth_data = qc.calculate_coverage_vs_depth(n=10)
    +    for data in coverage_depth_data:
    +        print(data)
    +    ```
    +    """
    +    self.logger.debug("Calculating coverage vs depth with %d parts.", n)
    +    # Determine the ROI reference signature
    +    if isinstance(self.amplicon_sig, SnipeSig):
    +        roi_reference_sig = self.amplicon_sig
    +        self.logger.debug("Using amplicon signature as ROI reference.")
    +    else:
    +        roi_reference_sig = self.reference_sig
    +        self.logger.debug("Using reference genome signature as ROI reference.")
    +
    +    # Split the sample signature into n random parts (cached if available)
    +    split_sigs = self.split_sig_randomly(n)
    +
    +    coverage_depth_data = []
    +
    +    if not split_sigs:
    +        self.logger.error("No split signatures available. Cannot calculate coverage vs depth.")
    +        return coverage_depth_data
    +
    +    cumulative_snipe_sig = split_sigs[0].copy()
    +    cumulative_total_abundance = cumulative_snipe_sig.total_abundance
    +
    +    # Force conversion to GENOME
    +    roi_reference_sig.sigtype = SigType.GENOME
    +
    +    # Compute initial coverage index
    +    cumulative_qc = ReferenceQC(
    +        sample_sig=cumulative_snipe_sig,
    +        reference_sig=roi_reference_sig,
    +        enable_logging=self.enable_logging
    +    )
    +    cumulative_stats = cumulative_qc.get_aggregated_stats()
    +    cumulative_coverage_index = cumulative_stats.get("Genome coverage index", 0.0)
    +
    +    coverage_depth_data.append({
    +        "cumulative_parts": 1,
    +        "cumulative_total_abundance": cumulative_total_abundance,
    +        "cumulative_coverage_index": cumulative_coverage_index,
    +    })
    +
    +    self.logger.debug("Added initial coverage depth data for part 1.")
    +
    +    # Iterate over the rest of the parts
    +    for i in range(1, n):
    +        current_part = split_sigs[i]
    +
    +        # Add current part to cumulative signature
    +        cumulative_snipe_sig += current_part
    +        cumulative_total_abundance += current_part.total_abundance
    +
    +        # Compute new coverage index
    +        cumulative_qc = ReferenceQC(
    +            sample_sig=cumulative_snipe_sig,
    +            reference_sig=roi_reference_sig,
    +            enable_logging=self.enable_logging
    +        )
    +        cumulative_stats = cumulative_qc.get_aggregated_stats()
    +        cumulative_coverage_index = cumulative_stats.get("Genome coverage index", 0.0)
    +
    +        coverage_depth_data.append({
    +            "cumulative_parts": i + 1,
    +            "cumulative_total_abundance": cumulative_total_abundance,
    +            "cumulative_coverage_index": cumulative_coverage_index,
    +        })
    +
    +        self.logger.debug("Added coverage depth data for part %d.", i + 1)
    +
    +    self.logger.debug("Coverage vs depth calculation completed.")
    +    return coverage_depth_data
     
    @@ -4214,74 +5561,7 @@

    Source code in src/snipe/api/reference_QC.py -
    1071
    -1072
    -1073
    -1074
    -1075
    -1076
    -1077
    -1078
    -1079
    -1080
    -1081
    -1082
    -1083
    -1084
    -1085
    -1086
    -1087
    -1088
    -1089
    -1090
    -1091
    -1092
    -1093
    -1094
    -1095
    -1096
    -1097
    -1098
    -1099
    -1100
    -1101
    -1102
    -1103
    -1104
    -1105
    -1106
    -1107
    -1108
    -1109
    -1110
    -1111
    -1112
    -1113
    -1114
    -1115
    -1116
    -1117
    -1118
    -1119
    -1120
    -1121
    -1122
    -1123
    -1124
    -1125
    -1126
    -1127
    -1128
    -1129
    -1130
    -1131
    -1132
    -1133
    -1134
    -1135
    -1136
    -1137
    -1138
    +              
    1138
     1139
     1140
     1141
    @@ -4413,206 +5693,273 @@ 

    1267 1268 1269 -1270

    def calculate_sex_chrs_metrics(self, genome_and_chr_to_sig: Dict[str, SnipeSig]) -> Dict[str, Any]:
    -    r"""
    -    Calculate sex chromosome-related metrics based on genome and chromosome-specific signatures.
    -
    -    This method processes a collection of genome and chromosome-specific `SnipeSig` instances to compute
    -    metrics such as the X-Ploidy score and Y-Coverage. It ensures that each chromosome signature contains
    -    only unique hashes that do not overlap with hashes from other chromosomes or the autosomal genome.
    -    The method excludes sex chromosomes (e.g., Y chromosome) from the autosomal genome signature to
    -    accurately assess sex chromosome metrics.
    -
    -    **Mathematical Explanation**:
    -
    -    - **X-Ploidy Score**:
    -
    -      The X-Ploidy score is calculated using the formula:
    -
    -      $$
    -      \text{X-Ploidy} = \left(\frac{\mu_X}{\mu_{\text{autosomal}}}\right) \times \left(\frac{N_{\text{autosomal}}}{N_X}\right)
    -      $$
    -
    -      Where:
    -      - \( \mu_X \) is the mean abundance of X chromosome-specific k-mers in the sample.
    -      - \( \mu_{\text{autosomal}} \) is the mean abundance of autosomal k-mers in the sample.
    -      - \( N_{\text{autosomal}} \) is the number of autosomal k-mers in the reference genome.
    -      - \( N_X \) is the number of X chromosome-specific k-mers in the reference genome.
    -
    -    - **Y-Coverage**:
    -
    -      The Y-Coverage is calculated using the formula:
    -
    -      $$
    -      \text{Y-Coverage} = \frac{\left(\frac{N_Y^{\text{sample}}}{N_Y}\right)}{\left(\frac{N_{\text{autosomal}}^{\text{sample}}}{N_{\text{autosomal}}}\right)}
    -      $$
    -
    -      Where:
    -      - \( N_Y^{\text{sample}} \) is the number of Y chromosome-specific k-mers in the sample.
    -      - \( N_Y \) is the number of Y chromosome-specific k-mers in the reference genome.
    -      - \( N_{\text{autosomal}}^{\text{sample}} \) is the number of autosomal k-mers in the sample.
    -      - \( N_{\text{autosomal}} \) is the number of autosomal k-mers in the reference genome.
    -
    -    **Parameters**:
    -
    -        - `genome_and_chr_to_sig` (`Dict[str, SnipeSig]`):  
    -          A dictionary mapping signature names to their corresponding `SnipeSig` instances. This should include
    -          the autosomal genome signature (with a name ending in `'-snipegenome'`) and chromosome-specific
    -          signatures (e.g., `'sex-x'`, `'sex-y'`, `'autosome-1'`, `'autosome-2'`, etc.).
    -
    -    **Returns**:
    -
    -        - `Dict[str, Any]`:  
    -          A dictionary containing the calculated sex-related metrics:
    -              - `"X-Ploidy score"` (`float`):  
    -                The ploidy score of the X chromosome, reflecting the ratio of X chromosome k-mer abundance
    -                to autosomal k-mer abundance, adjusted by genome and X chromosome sizes.
    -              - `"Y-Coverage"` (`float`, optional):  
    -                The coverage of Y chromosome-specific k-mers in the sample relative to autosomal coverage.
    -                This key is present only if a Y chromosome signature is provided.
    -
    -    **Raises**:
    -
    -        - `ValueError`:  
    -          - If the `'sex-x'` chromosome signature is not found in `genome_and_chr_to_sig`.
    -          - If the autosomal genome signature is not found or improperly labeled.
    -
    -    **Usage Example**:
    -
    -    ```python
    -    # Assume `genome_and_chr_signatures` is a dictionary of genome and chromosome-specific SnipeSig instances
    -    genome_and_chr_signatures = {
    -        "autosomal-snipegenome": sig_autosomal_genome,
    -        "1": sig_chr1,
    -        "2": sig_chr2,
    -        "sex-x": sig_sex_x,
    -        "sex-y": sig_sex_y
    -    }
    -
    -    # Calculate sex chromosome metrics
    -    metrics = qc.calculate_sex_chrs_metrics(genome_and_chr_to_sig=genome_and_chr_signatures)
    +1270
    +1271
    +1272
    +1273
    +1274
    +1275
    +1276
    +1277
    +1278
    +1279
    +1280
    +1281
    +1282
    +1283
    +1284
    +1285
    +1286
    +1287
    +1288
    +1289
    +1290
    +1291
    +1292
    +1293
    +1294
    +1295
    +1296
    +1297
    +1298
    +1299
    +1300
    +1301
    +1302
    +1303
    +1304
    +1305
    +1306
    +1307
    +1308
    +1309
    +1310
    +1311
    +1312
    +1313
    +1314
    +1315
    +1316
    +1317
    +1318
    +1319
    +1320
    +1321
    +1322
    +1323
    +1324
    +1325
    +1326
    +1327
    +1328
    +1329
    +1330
    +1331
    +1332
    +1333
    +1334
    +1335
    +1336
    +1337
    def calculate_sex_chrs_metrics(self, genome_and_chr_to_sig: Dict[str, SnipeSig]) -> Dict[str, Any]:
    +    r"""
    +    Calculate sex chromosome-related metrics based on genome and chromosome-specific signatures.
    +
    +    This method processes a collection of genome and chromosome-specific `SnipeSig` instances to compute
    +    metrics such as the X-Ploidy score and Y-Coverage. It ensures that each chromosome signature contains
    +    only unique hashes that do not overlap with hashes from other chromosomes or the autosomal genome.
    +    The method excludes sex chromosomes (e.g., Y chromosome) from the autosomal genome signature to
    +    accurately assess sex chromosome metrics.
    +
    +    **Mathematical Explanation**:
     
    -    print(metrics)
    -    # Output Example:
    -    # {
    -    #     "X-Ploidy score": 2.6667,
    -    #     "Y-Coverage": 0.0
    -    # }
    -    ```
    +    - **X-Ploidy Score**:
    +
    +      The X-Ploidy score is calculated using the formula:
    +
    +      $$
    +      \text{X-Ploidy} = \left(\frac{\mu_X}{\mu_{\text{autosomal}}}\right) \times \left(\frac{N_{\text{autosomal}}}{N_X}\right)
    +      $$
     
    -    **Notes**:
    -
    -        - **Signature Naming Convention**:  
    -          The autosomal genome signature must have a name ending with `'-snipegenome'`. Chromosome-specific
    -          signatures should be named accordingly (e.g., `'sex-x'`, `'sex-y'`, `'autosomal-1'`, `'autosomal-2'`, etc.).
    +      Where:
    +      - \( \mu_X \) is the mean abundance of X chromosome-specific k-mers in the sample.
    +      - \( \mu_{\text{autosomal}} \) is the mean abundance of autosomal k-mers in the sample.
    +      - \( N_{\text{autosomal}} \) is the number of autosomal k-mers in the reference genome.
    +      - \( N_X \) is the number of X chromosome-specific k-mers in the reference genome.
     
    -        - **Exclusion of Sex Chromosomes from Autosomal Genome**:  
    -          The Y chromosome signature (`'sex-y'`) is subtracted from the autosomal genome signature to ensure
    -          that Y chromosome k-mers are not counted towards autosomal metrics.
    +    - **Y-Coverage**:
    +
    +      The Y-Coverage is calculated using the formula:
     
    -        - **Robustness**:  
    -          The method includes comprehensive logging for debugging purposes, tracking each major step and
    -          any exclusions made during processing.
    -    """
    -
    -    # Ensure that the chromosome X signature exists
    -    if 'sex-x' not in genome_and_chr_to_sig:
    -        self.logger.warning("Chromosome X ('sex-x') not found in the provided signatures. X-Ploidy score will be set to zero.")
    -        # set sex-x to an empty signature
    -        genome_and_chr_to_sig['sex-x'] = SnipeSig.create_from_hashes_abundances(
    -            hashes=np.array([], dtype=np.uint64),
    -            abundances=np.array([], dtype=np.uint32),
    -            ksize=genome_and_chr_to_sig[list(genome_and_chr_to_sig.keys())[0]].ksize,
    -            scale=genome_and_chr_to_sig[list(genome_and_chr_to_sig.keys())[0]].scale,
    -        )
    -
    -    # Separate the autosomal genome signature from chromosome-specific signatures
    -    chr_to_sig: Dict[str, SnipeSig] = {}
    -    autosomals_genome_sig: Optional[SnipeSig] = None
    -    self.logger.debug("Separating autosomal genome signature from chromosome-specific signatures.")
    -
    -    for name, sig in genome_and_chr_to_sig.items():
    -        if name.endswith('-snipegenome'):
    -            self.logger.debug("\t- Identified autosomal genome signature: '%s'.", name)
    -            autosomals_genome_sig = sig
    -        else:
    -            chr_to_sig[name] = sig
    +      $$
    +      \text{Y-Coverage} = \frac{\left(\frac{N_Y^{\text{sample}}}{N_Y}\right)}{\left(\frac{N_{\text{autosomal}}^{\text{sample}}}{N_{\text{autosomal}}}\right)}
    +      $$
    +
    +      Where:
    +      - \( N_Y^{\text{sample}} \) is the number of Y chromosome-specific k-mers in the sample.
    +      - \( N_Y \) is the number of Y chromosome-specific k-mers in the reference genome.
    +      - \( N_{\text{autosomal}}^{\text{sample}} \) is the number of autosomal k-mers in the sample.
    +      - \( N_{\text{autosomal}} \) is the number of autosomal k-mers in the reference genome.
    +
    +    **Parameters**:
    +
    +        - `genome_and_chr_to_sig` (`Dict[str, SnipeSig]`):  
    +          A dictionary mapping signature names to their corresponding `SnipeSig` instances. This should include
    +          the autosomal genome signature (with a name ending in `'-snipegenome'`) and chromosome-specific
    +          signatures (e.g., `'sex-x'`, `'sex-y'`, `'autosome-1'`, `'autosome-2'`, etc.).
    +
    +    **Returns**:
    +
    +        - `Dict[str, Any]`:  
    +          A dictionary containing the calculated sex-related metrics:
    +              - `"X-Ploidy score"` (`float`):  
    +                The ploidy score of the X chromosome, reflecting the ratio of X chromosome k-mer abundance
    +                to autosomal k-mer abundance, adjusted by genome and X chromosome sizes.
    +              - `"Y-Coverage"` (`float`, optional):  
    +                The coverage of Y chromosome-specific k-mers in the sample relative to autosomal coverage.
    +                This key is present only if a Y chromosome signature is provided.
     
    -    if autosomals_genome_sig is None:
    -        self.logger.error("Autosomal genome signature (ending with '-snipegenome') not found.")
    -        raise ValueError("Autosomal genome signature (ending with '-snipegenome') not found.")
    -
    -    # Ensure all chromosome signatures have unique hashes
    -    specific_chr_to_sig = SnipeSig.get_unique_signatures(chr_to_sig)
    -
    -    # Exclude Y chromosome from the autosomal genome signature if present
    -    if 'sex-y' in chr_to_sig:
    -        self.logger.debug("Y chromosome ('sex-y') detected. Removing its hashes from the autosomal genome signature.")
    -        self.logger.debug("\t- Original autosomal genome size: %d hashes.", len(autosomals_genome_sig))
    -        autosomals_genome_sig = autosomals_genome_sig - chr_to_sig['sex-y']
    -        self.logger.debug("\t- Updated autosomal genome size after removing Y chromosome: %d hashes.", len(autosomals_genome_sig))
    -
    -    # Remove X chromosome hashes from the autosomal genome signature
    -    self.logger.debug("Removing X chromosome ('sex-x') hashes from the autosomal genome signature.")
    -    autosomals_genome_sig = autosomals_genome_sig - chr_to_sig['sex-x']
    -    self.logger.debug("\t- Updated autosomal genome size after removing X chromosome: %d hashes.", len(autosomals_genome_sig))
    -
    -    # Derive the X chromosome-specific signature by subtracting autosomal genome hashes
    -    specific_xchr_sig = specific_chr_to_sig["sex-x"] - autosomals_genome_sig
    -    self.logger.debug("\t-Derived X chromosome-specific signature size: %d hashes.", len(specific_xchr_sig))
    -
    -    # Intersect the sample signature with chromosome-specific signatures
    -    sample_specific_xchr_sig = self.sample_sig & specific_xchr_sig
    -    if len(sample_specific_xchr_sig) == 0:
    -        self.logger.warning("No X chromosome-specific k-mers found in the sample signature.")
    -    self.logger.debug("\t-Intersected sample signature with X chromosome-specific k-mers = %d hashes.", len(sample_specific_xchr_sig))
    -    sample_autosomal_sig = self.sample_sig & autosomals_genome_sig
    -    self.logger.debug("\t-Intersected sample signature with autosomal genome k-mers = %d hashes.", len(sample_autosomal_sig))
    +    **Raises**:
    +
    +        - `ValueError`:  
    +          - If the `'sex-x'` chromosome signature is not found in `genome_and_chr_to_sig`.
    +          - If the autosomal genome signature is not found or improperly labeled.
    +
    +    **Usage Example**:
    +
    +    ```python
    +    # Assume `genome_and_chr_signatures` is a dictionary of genome and chromosome-specific SnipeSig instances
    +    genome_and_chr_signatures = {
    +        "autosomal-snipegenome": sig_autosomal_genome,
    +        "1": sig_chr1,
    +        "2": sig_chr2,
    +        "sex-x": sig_sex_x,
    +        "sex-y": sig_sex_y
    +    }
    +
    +    # Calculate sex chromosome metrics
    +    metrics = qc.calculate_sex_chrs_metrics(genome_and_chr_to_sig=genome_and_chr_signatures)
    +
    +    print(metrics)
    +    # Output Example:
    +    # {
    +    #     "X-Ploidy score": 2.6667,
    +    #     "Y-Coverage": 0.0
    +    # }
    +    ```
    +
    +    **Notes**:
     
    -    # Retrieve mean abundances
    -    xchr_mean_abundance = sample_specific_xchr_sig.get_sample_stats.get("mean_abundance", 0.0)
    -    autosomal_mean_abundance = sample_autosomal_sig.get_sample_stats.get("mean_abundance", 0.0)
    +        - **Signature Naming Convention**:  
    +          The autosomal genome signature must have a name ending with `'-snipegenome'`. Chromosome-specific
    +          signatures should be named accordingly (e.g., `'sex-x'`, `'sex-y'`, `'autosomal-1'`, `'autosomal-2'`, etc.).
     
    -    # Calculate X-Ploidy score
    -    if autosomal_mean_abundance == 0:
    -        self.logger.warning("Autosomal mean abundance is zero. Setting X-Ploidy score to zero to avoid division by zero.")
    -        xploidy_score = 0.0
    -    else:
    -        xploidy_score = (xchr_mean_abundance / autosomal_mean_abundance) * \
    -                        (len(autosomals_genome_sig) / len(specific_xchr_sig) if len(specific_xchr_sig) > 0 else 0.0)
    -
    -    self.logger.debug("Calculated X-Ploidy score: %.4f", xploidy_score)
    -    self.sex_stats.update({"X-Ploidy score": xploidy_score})
    -
    -    # Calculate Y-Coverage if Y chromosome is present
    -    if 'sex-y' in specific_chr_to_sig:
    -        self.logger.debug("Calculating Y-Coverage based on Y chromosome-specific k-mers.")
    -
    -        # Derive Y chromosome-specific k-mers by excluding autosomal and X chromosome k-mers
    -        ychr_specific_kmers = chr_to_sig["sex-y"] - autosomals_genome_sig - specific_xchr_sig
    -        self.logger.debug("\t-Derived Y chromosome-specific signature size: %d hashes.", len(ychr_specific_kmers))
    -
    -        # Intersect Y chromosome-specific k-mers with the sample signature
    -        ychr_in_sample = self.sample_sig & ychr_specific_kmers
    -        self.logger.debug("\t-Intersected sample signature with Y chromosome-specific k-mers = %d hashes.", len(ychr_in_sample))
    -        if len(ychr_in_sample) == 0:
    -            self.logger.warning("No Y chromosome-specific k-mers found in the sample signature.")
    +        - **Exclusion of Sex Chromosomes from Autosomal Genome**:  
    +          The Y chromosome signature (`'sex-y'`) is subtracted from the autosomal genome signature to ensure
    +          that Y chromosome k-mers are not counted towards autosomal metrics.
    +
    +        - **Robustness**:  
    +          The method includes comprehensive logging for debugging purposes, tracking each major step and
    +          any exclusions made during processing.
    +    """
    +
    +    # Ensure that the chromosome X signature exists
    +    if 'sex-x' not in genome_and_chr_to_sig:
    +        self.logger.warning("Chromosome X ('sex-x') not found in the provided signatures. X-Ploidy score will be set to zero.")
    +        # set sex-x to an empty signature
    +        genome_and_chr_to_sig['sex-x'] = SnipeSig.create_from_hashes_abundances(
    +            hashes=np.array([], dtype=np.uint64),
    +            abundances=np.array([], dtype=np.uint32),
    +            ksize=genome_and_chr_to_sig[list(genome_and_chr_to_sig.keys())[0]].ksize,
    +            scale=genome_and_chr_to_sig[list(genome_and_chr_to_sig.keys())[0]].scale,
    +        )
    +
    +    # Separate the autosomal genome signature from chromosome-specific signatures
    +    chr_to_sig: Dict[str, SnipeSig] = {}
    +    autosomals_genome_sig: Optional[SnipeSig] = None
    +    self.logger.debug("Separating autosomal genome signature from chromosome-specific signatures.")
     
    -        # Derive autosomal-specific k-mers by excluding X and Y chromosome k-mers from the reference signature
    -        autosomals_specific_kmers = self.reference_sig - specific_chr_to_sig["sex-x"] - specific_chr_to_sig['sex-y']
    -
    -        # Calculate Y-Coverage metric
    -        if len(ychr_specific_kmers) == 0 or len(autosomals_specific_kmers) == 0:
    -            self.logger.warning("Insufficient k-mers for Y-Coverage calculation. Setting Y-Coverage to zero.")
    -            ycoverage = 0.0
    -        else:
    -            ycoverage = (len(ychr_in_sample) / len(ychr_specific_kmers)) / \
    -                    (len(sample_autosomal_sig) / len(autosomals_specific_kmers))
    +    for name, sig in genome_and_chr_to_sig.items():
    +        if name.endswith('-snipegenome'):
    +            self.logger.debug("\t- Identified autosomal genome signature: '%s'.", name)
    +            autosomals_genome_sig = sig
    +        else:
    +            chr_to_sig[name] = sig
    +
    +    if autosomals_genome_sig is None:
    +        self.logger.error("Autosomal genome signature (ending with '-snipegenome') not found.")
    +        raise ValueError("Autosomal genome signature (ending with '-snipegenome') not found.")
     
    -        self.logger.debug("Calculated Y-Coverage: %.4f", ycoverage)
    -        self.sex_stats.update({"Y-Coverage": ycoverage})
    +    # Ensure all chromosome signatures have unique hashes
    +    specific_chr_to_sig = SnipeSig.get_unique_signatures(chr_to_sig)
     
    -    return self.sex_stats
    +    # Exclude Y chromosome from the autosomal genome signature if present
    +    if 'sex-y' in chr_to_sig:
    +        self.logger.debug("Y chromosome ('sex-y') detected. Removing its hashes from the autosomal genome signature.")
    +        self.logger.debug("\t- Original autosomal genome size: %d hashes.", len(autosomals_genome_sig))
    +        autosomals_genome_sig = autosomals_genome_sig - chr_to_sig['sex-y']
    +        self.logger.debug("\t- Updated autosomal genome size after removing Y chromosome: %d hashes.", len(autosomals_genome_sig))
    +
    +    # Remove X chromosome hashes from the autosomal genome signature
    +    self.logger.debug("Removing X chromosome ('sex-x') hashes from the autosomal genome signature.")
    +    autosomals_genome_sig = autosomals_genome_sig - chr_to_sig['sex-x']
    +    self.logger.debug("\t- Updated autosomal genome size after removing X chromosome: %d hashes.", len(autosomals_genome_sig))
    +
    +    # Derive the X chromosome-specific signature by subtracting autosomal genome hashes
    +    specific_xchr_sig = specific_chr_to_sig["sex-x"] - autosomals_genome_sig
    +    self.logger.debug("\t-Derived X chromosome-specific signature size: %d hashes.", len(specific_xchr_sig))
    +
    +    # Intersect the sample signature with chromosome-specific signatures
    +    sample_specific_xchr_sig = self.sample_sig & specific_xchr_sig
    +    if len(sample_specific_xchr_sig) == 0:
    +        self.logger.warning("No X chromosome-specific k-mers found in the sample signature.")
    +    self.logger.debug("\t-Intersected sample signature with X chromosome-specific k-mers = %d hashes.", len(sample_specific_xchr_sig))
    +    sample_autosomal_sig = self.sample_sig & autosomals_genome_sig
    +    self.logger.debug("\t-Intersected sample signature with autosomal genome k-mers = %d hashes.", len(sample_autosomal_sig))
    +
    +    # Retrieve mean abundances
    +    xchr_mean_abundance = sample_specific_xchr_sig.get_sample_stats.get("mean_abundance", 0.0)
    +    autosomal_mean_abundance = sample_autosomal_sig.get_sample_stats.get("mean_abundance", 0.0)
    +
    +    # Calculate X-Ploidy score
    +    if autosomal_mean_abundance == 0:
    +        self.logger.warning("Autosomal mean abundance is zero. Setting X-Ploidy score to zero to avoid division by zero.")
    +        xploidy_score = 0.0
    +    else:
    +        xploidy_score = (xchr_mean_abundance / autosomal_mean_abundance) * \
    +                        (len(autosomals_genome_sig) / len(specific_xchr_sig) if len(specific_xchr_sig) > 0 else 0.0)
    +
    +    self.logger.debug("Calculated X-Ploidy score: %.4f", xploidy_score)
    +    self.sex_stats.update({"X-Ploidy score": xploidy_score})
    +
    +    # Calculate Y-Coverage if Y chromosome is present
    +    if 'sex-y' in specific_chr_to_sig:
    +        self.logger.debug("Calculating Y-Coverage based on Y chromosome-specific k-mers.")
    +
    +        # Derive Y chromosome-specific k-mers by excluding autosomal and X chromosome k-mers
    +        ychr_specific_kmers = chr_to_sig["sex-y"] - autosomals_genome_sig - specific_xchr_sig
    +        self.logger.debug("\t-Derived Y chromosome-specific signature size: %d hashes.", len(ychr_specific_kmers))
    +
    +        # Intersect Y chromosome-specific k-mers with the sample signature
    +        ychr_in_sample = self.sample_sig & ychr_specific_kmers
    +        self.logger.debug("\t-Intersected sample signature with Y chromosome-specific k-mers = %d hashes.", len(ychr_in_sample))
    +        if len(ychr_in_sample) == 0:
    +            self.logger.warning("No Y chromosome-specific k-mers found in the sample signature.")
    +
    +        # Derive autosomal-specific k-mers by excluding X and Y chromosome k-mers from the reference signature
    +        autosomals_specific_kmers = self.reference_sig - specific_chr_to_sig["sex-x"] - specific_chr_to_sig['sex-y']
    +
    +        # Calculate Y-Coverage metric
    +        if len(ychr_specific_kmers) == 0 or len(autosomals_specific_kmers) == 0:
    +            self.logger.warning("Insufficient k-mers for Y-Coverage calculation. Setting Y-Coverage to zero.")
    +            ycoverage = 0.0
    +        else:
    +            ycoverage = (len(ychr_in_sample) / len(ychr_specific_kmers)) / \
    +                    (len(sample_autosomal_sig) / len(autosomals_specific_kmers))
    +
    +        self.logger.debug("Calculated Y-Coverage: %.4f", ycoverage)
    +        self.sex_stats.update({"Y-Coverage": ycoverage})
    +
    +    return self.sex_stats
     
    @@ -4660,105 +6007,105 @@

    Source code in src/snipe/api/reference_QC.py -
    @staticmethod
    -def distribute_kmers_random(original_dict: Dict[int, int], n: int) -> List[Dict[int, int]]:
    -    r"""
    -    Distribute the k-mers randomly into `n` parts based on their abundances.
    -
    -    This helper method performs the actual distribution of k-mers using a multinomial distribution.
    -
    -    **Mathematical Explanation**:
    -
    -    Given a k-mer with hash \( h \) and abundance \( a_h \), the distribution of its abundance across \( n \)
    -    parts is modeled as:
    -
    -    $$
    -    a_{h,1}, a_{h,2}, \dots, a_{h,n} \sim \text{Multinomial}(a_h, p_1, p_2, \dots, p_n)
    -    $$
    -
    -    Where \( p_i = \frac{1}{n} \) for all \( i \).
    -
    -    **Parameters**:
    -
    -    - `original_dict` (`Dict[int, int]`):  
    -      Dictionary mapping k-mer hashes to their abundances.
    -    - `n` (`int`): Number of parts to split into.
    -
    -    **Returns**:
    -
    -    - `List[Dict[int, int]]`:  
    -      List of dictionaries, each mapping k-mer hashes to their abundances in that part.
    -
    -    **Usage Example**:
    -
    -    ```python
    -    distributed = ReferenceQC.distribute_kmers_random(hash_to_abund, n=3)
    -    ```
    -    """
    -    # Initialize the resulting dictionaries
    -    distributed_dicts = [{} for _ in range(n)]
    -
    -    # For each k-mer and its abundance
    -    for kmer_hash, abundance in original_dict.items():
    -        if abundance == 0:
    -            continue  # Skip zero abundances
    -        # Generate multinomial split of abundance
    -        counts = np.random.multinomial(abundance, [1.0 / n] * n)
    -        # Update each dictionary
    -        for i in range(n):
    -            if counts[i] > 0:
    -                distributed_dicts[i][kmer_hash] = counts[i]
    -
    -    return distributed_dicts
    +              
    @staticmethod
    +def distribute_kmers_random(original_dict: Dict[int, int], n: int) -> List[Dict[int, int]]:
    +    r"""
    +    Distribute the k-mers randomly into `n` parts based on their abundances.
    +
    +    This helper method performs the actual distribution of k-mers using a multinomial distribution.
    +
    +    **Mathematical Explanation**:
    +
    +    Given a k-mer with hash \( h \) and abundance \( a_h \), the distribution of its abundance across \( n \)
    +    parts is modeled as:
    +
    +    $$
    +    a_{h,1}, a_{h,2}, \dots, a_{h,n} \sim \text{Multinomial}(a_h, p_1, p_2, \dots, p_n)
    +    $$
    +
    +    Where \( p_i = \frac{1}{n} \) for all \( i \).
    +
    +    **Parameters**:
    +
    +    - `original_dict` (`Dict[int, int]`):  
    +      Dictionary mapping k-mer hashes to their abundances.
    +    - `n` (`int`): Number of parts to split into.
    +
    +    **Returns**:
    +
    +    - `List[Dict[int, int]]`:  
    +      List of dictionaries, each mapping k-mer hashes to their abundances in that part.
    +
    +    **Usage Example**:
    +
    +    ```python
    +    distributed = ReferenceQC.distribute_kmers_random(hash_to_abund, n=3)
    +    ```
    +    """
    +    # Initialize the resulting dictionaries
    +    distributed_dicts = [{} for _ in range(n)]
    +
    +    # For each k-mer and its abundance
    +    for kmer_hash, abundance in original_dict.items():
    +        if abundance == 0:
    +            continue  # Skip zero abundances
    +        # Generate multinomial split of abundance
    +        counts = np.random.multinomial(abundance, [1.0 / n] * n)
    +        # Update each dictionary
    +        for i in range(n):
    +            if counts[i] > 0:
    +                distributed_dicts[i][kmer_hash] = counts[i]
    +
    +    return distributed_dicts
     
    @@ -4795,89 +6142,727 @@

    Source code in src/snipe/api/reference_QC.py -
    437
    -438
    -439
    -440
    -441
    -442
    -443
    -444
    -445
    -446
    -447
    -448
    -449
    -450
    -451
    -452
    -453
    -454
    -455
    -456
    -457
    -458
    -459
    -460
    -461
    -462
    -463
    -464
    -465
    -466
    -467
    -468
    -469
    -470
    -471
    -472
    +              
    def get_aggregated_stats(self, include_advanced: bool = False) -> Dict[str, Any]:
    -    r"""
    -    Retrieve aggregated statistics from the quality control analysis.
    -
    -    **Parameters**
    -
    -    - `include_advanced (bool)`:  
    -      If set to `True`, includes advanced metrics in the aggregated statistics.
    -
    -    **Returns**
    -
    -    - `Dict[str, Any]`:  
    -      A dictionary containing the aggregated statistics, which may include:
    -      - Sample statistics
    -      - Genome statistics
    -      - Amplicon statistics (if provided)
    -      - Predicted assay type
    -      - Advanced statistics (if `include_advanced` is `True`)
    -    """
    -    aggregated_stats: Dict[str, Any] = {}
    -    # Include sample_stats
    -    aggregated_stats.update(self.sample_stats)
    -    # Include genome_stats
    -    aggregated_stats.update(self.genome_stats)
    -    # Include amplicon_stats if available
    -    if self.amplicon_sig is not None:
    -        self.logger.debug("While aggregating stats; amplicon signature provided.")
    -        aggregated_stats.update(self.amplicon_stats)
    -        aggregated_stats["Predicted Assay Type"] = self.predicted_assay_type
    -
    -    if self.chrs_stats:
    -        aggregated_stats.update(self.chrs_stats)
    -
    -    if self.sex_stats:
    -        aggregated_stats.update(self.sex_stats)
    -
    -    # Include advanced_stats if requested
    -    if include_advanced:
    -        self._calculate_advanced_stats()
    -        aggregated_stats.update(self.advanced_stats)
    +478
    +479
    +480
    +481
    +482
    +483
    +484
    +485
    +486
    +487
    +488
    +489
    +490
    +491
    +492
    +493
    +494
    +495
    +496
    +497
    +498
    +499
    +500
    +501
    +502
    +503
    +504
    +505
    +506
    +507
    +508
    +509
    +510
    +511
    +512
    +513
    +514
    +515
    +516
    +517
    +518
    +519
    def get_aggregated_stats(self, include_advanced: bool = False) -> Dict[str, Any]:
    +    r"""
    +    Retrieve aggregated statistics from the quality control analysis.
    +
    +    **Parameters**
     
    -    return aggregated_stats
    +    - `include_advanced (bool)`:  
    +      If set to `True`, includes advanced metrics in the aggregated statistics.
    +
    +    **Returns**
    +
    +    - `Dict[str, Any]`:  
    +      A dictionary containing the aggregated statistics, which may include:
    +      - Sample statistics
    +      - Genome statistics
    +      - Amplicon statistics (if provided)
    +      - Predicted assay type
    +      - Advanced statistics (if `include_advanced` is `True`)
    +    """
    +    aggregated_stats: Dict[str, Any] = {}
    +    # Include sample_stats
    +    aggregated_stats.update(self.sample_stats)
    +    # Include genome_stats
    +    aggregated_stats.update(self.genome_stats)
    +    # Include amplicon_stats if available
    +    if self.amplicon_sig is not None:
    +        self.logger.debug("While aggregating stats; amplicon signature provided.")
    +        aggregated_stats.update(self.amplicon_stats)
    +        aggregated_stats["Predicted Assay Type"] = self.predicted_assay_type
    +
    +    if self.chrs_stats:
    +        aggregated_stats.update(self.chrs_stats)
    +
    +    if self.sex_stats:
    +        aggregated_stats.update(self.sex_stats)
    +
    +    if self.vars_nonref_stats:
    +        aggregated_stats.update(self.vars_nonref_stats)
    +
    +    # Include advanced_stats if requested
    +    if include_advanced:
    +        self._calculate_advanced_stats()
    +        aggregated_stats.update(self.advanced_stats)
    +
    +    if self.predicted_error_contamination_index:
    +        aggregated_stats.update(self.predicted_error_contamination_index)
    +
    +    return aggregated_stats
    +
    + +
    + + + +
    + + +

    + load_genome_sig_to_dict(*, zip_file_path, **kwargs) + +

    + + +
    + +

    Load a genome signature into a dictionary of SnipeSig instances.

    + + +

    Parameters:

    + + + + + + + + + + + + + + + + + + + + + + + +
    NameTypeDescriptionDefault
    + zip_file_path + + str + +
    +

    Path to the zip file containing the genome signatures.

    +
    +
    + required +
    + **kwargs + + +
    +

    Additional keyword arguments to pass to the SnipeSig constructor.

    +
    +
    + {} +
    + + +

    Returns:

    + + + + + + + + + + + + + +
    TypeDescription
    + Dict[str, SnipeSig] + +
    +

    Dict[str, SnipeSig]: A dictionary mapping genome names to SnipeSig instances.

    +
    +
    + +
    + Source code in src/snipe/api/reference_QC.py +
    def load_genome_sig_to_dict(self, *, zip_file_path: str, **kwargs) -> Dict[str, 'SnipeSig']:
    +    """
    +    Load a genome signature into a dictionary of SnipeSig instances.
    +
    +    Parameters:
    +        zip_file_path (str): Path to the zip file containing the genome signatures.
    +        **kwargs: Additional keyword arguments to pass to the SnipeSig constructor.
    +
    +    Returns:
    +        Dict[str, SnipeSig]: A dictionary mapping genome names to SnipeSig instances.
    +    """
    +
    +    genome_chr_name_to_sig = {}
    +
    +    sourmash_sigs: List[sourmash.signature.SourmashSignature] = sourmash.load_file_as_signatures(zip_file_path)
    +    sex_count = 0
    +    autosome_count = 0
    +    genome_count = 0
    +    for sig in sourmash_sigs:
    +        name = sig.name
    +        if name.endswith("-snipegenome"):
    +            self.logger.debug(f"Loading genome signature: {name}")
    +            restored_name = name.replace("-snipegenome", "")
    +            genome_chr_name_to_sig[restored_name] = SnipeSig(sourmash_sig=sig, sig_type=SigType.GENOME)
    +            genome_count += 1
    +        elif "sex" in name:
    +            sex_count += 1
    +            genome_chr_name_to_sig[name.replace('sex-','')] = SnipeSig(sourmash_sig=sig, sig_type=SigType.GENOME)
    +        elif "autosome" in name:
    +            autosome_count += 1
    +            genome_chr_name_to_sig[name.replace('autosome-','')] = SnipeSig(sourmash_sig=sig, sig_type=SigType.GENOME)
    +        else:
    +            logging.warning(f"Unknown genome signature name: {name}, are you sure you generated this with `snipe sketch --ref`?")
    +
    +    self.logger.debug("Loaded %d genome signatures and %d sex chrs and %d autosome chrs", genome_count, sex_count, autosome_count)
    +
    +    if genome_count != 1:
    +        logging.error(f"Expected 1 genome signature, found {genome_count}")
    +
    +
    +    return genome_chr_name_to_sig
    +
    +
    +
    + +
    + +
    + + +

    + nonref_consume_from_vars(*, vars, vars_order, **kwargs) + +

    + + +
    + +

    Consume and analyze non-reference k-mers from provided variable signatures.

    +

    This method processes non-reference k-mers in the sample signature by intersecting them with a set of +variable-specific SnipeSig instances. It calculates coverage and total abundance metrics for each +variable in a specified order, ensuring that each non-reference k-mer is accounted for without overlap +between variables. The method updates internal statistics that reflect the distribution of non-reference +k-mers across the provided variables.

    +

    Process Overview:

    +
      +
    1. Validation:
    2. +
    3. Verifies that all variable names specified in vars_order are present in the vars dictionary.
    4. +
    5. +

      Raises a ValueError if any variable in vars_order is missing from vars.

      +
    6. +
    7. +

      Non-Reference K-mer Extraction:

      +
    8. +
    9. Computes the set of non-reference non-singleton k-mers by subtracting the reference signature from the sample signature.
    10. +
    11. +

      If no non-reference k-mers are found, the method logs a warning and returns an empty dictionary.

      +
    12. +
    13. +

      Variable-wise Consumption:

      +
    14. +
    15. Iterates over each variable name in vars_order.
    16. +
    17. +

      For each variable:

      +
        +
      • Intersects the remaining non-reference k-mers with the variable-specific signature.
      • +
      • Calculates the total abundance and coverage index for the intersected k-mers.
      • +
      • Updates the vars_nonref_stats dictionary with the computed metrics.
      • +
      • Removes the consumed k-mers from the remaining non-reference set to prevent overlap.
      • +
      +
    18. +
    19. +

      Final State Logging:

      +
    20. +
    21. Logs the final size and total abundance of the remaining non-reference k-mers after consumption.
    22. +
    +

    Parameters:

    +
    - `vars` (`Dict[str, SnipeSig]`):  
    +A dictionary mapping variable names to their corresponding `SnipeSig` instances. Each `SnipeSig` 
    +represents a set of k-mers associated with a specific non-reference category or variable.
    +
    +- `vars_order` (`List[str]`):  
    +A list specifying the order in which variables should be processed. The order determines the priority 
    +of consumption, ensuring that earlier variables in the list have their k-mers accounted for before 
    +later ones.
    +
    +- `**kwargs`:  
    +Additional keyword arguments. Reserved for future extensions and should not be used in the current context.
    +
    +

    Returns:

    +
    - `Dict[str, float]`:  
    +A dictionary containing statistics for each variable name in `vars_order`, 
    +    - `"non-genomic total k-mer abundance"` (`float`):  
    +        The sum of abundances of non-reference k-mers associated with the variable.
    +    - `"non-genomic coverage index"` (`float`):  
    +        The ratio of unique non-reference k-mers associated with the variable to the total number 
    +        of non-reference k-mers in the sample before consumption.
    +
    +Example Output:
    +```python
    +{
    +    "variable_A non-genomic total k-mer abundance": 1500.0,
    +    "variable_A non-genomic coverage index": 0.20
    +    "variable_B non-genomic total k-mer abundance": 3500.0,
    +    "variable_B non-genomic coverage index": 0.70
    +    "non-var non-genomic total k-mer abundance": 0.10,
    +    "non-var non-genomic coverage index": 218
    +}
    +```
    +
    +

    Raises:

    +
    - `ValueError`:  
    +- If any variable specified in `vars_order` is not present in the `vars` dictionary.
    +- This ensures that all variables intended for consumption are available for processing.
    +
    +

    Usage Example:

    +
    # Assume `variables_signatures` is a dictionary of variable-specific SnipeSig instances
    +variables_signatures = {
    +    "GTDB": sig_GTDB,
    +    "VIRALDB": sig_VIRALDB,
    +    "contaminant_X": sig_contaminant_x
    +}
    +
    +# Define the order in which variables should be processed
    +processing_order = ["GTDB", "VIRALDB", "contaminant_X"]
    +
    +# Consume non-reference k-mers and retrieve statistics
    +nonref_stats = qc.nonref_consume_from_vars(vars=variables_signatures, vars_order=processing_order)
    +
    +print(nonref_stats)
    +# Output Example:
    +# {
    +#     "GTDB non-genomic total k-mer abundance": 1500.0,
    +#     "GTDB non-genomic coverage index": 0.2,
    +#     "VIRALDB non-genomic total k-mer abundance": 3500.0,
    +#     "VIRALDB non-genomic coverage index": 0.70,
    +#     "contaminant_X non-genomic total k-mer abundance": 0.0,
    +#     "contaminant_X non-genomic coverage index": 0.0,
    +#     "non-var non-genomic total k-mer abundance": 100.0,
    +#     "non-var non-genomic coverage index": 0.1
    +# }
    +
    +

    Notes:

    +
    - **Variable Processing Order**:  
    +The `vars_order` list determines the sequence in which variables are processed. This order is crucial
    +when there is potential overlap in k-mers between variables, as earlier variables in the list have 
    +higher priority in consuming shared k-mers.
    +
    +- **Non-Reference K-mers Definition**:  
    +Non-reference k-mers are defined as those present in the sample signature but absent in the reference 
    +signature. This method focuses on characterizing these unique k-mers relative to provided variables.
    +
    + +
    + Source code in src/snipe/api/reference_QC.py +
    def nonref_consume_from_vars(self, *, vars: Dict[str, SnipeSig], vars_order: List[str], **kwargs) -> Dict[str, float]:
    +    r"""
    +    Consume and analyze non-reference k-mers from provided variable signatures.
    +
    +    This method processes non-reference k-mers in the sample signature by intersecting them with a set of
    +    variable-specific `SnipeSig` instances. It calculates coverage and total abundance metrics for each
    +    variable in a specified order, ensuring that each non-reference k-mer is accounted for without overlap
    +    between variables. The method updates internal statistics that reflect the distribution of non-reference
    +    k-mers across the provided variables.
    +
    +    **Process Overview**:
    +
    +    1. **Validation**:
    +    - Verifies that all variable names specified in `vars_order` are present in the `vars` dictionary.
    +    - Raises a `ValueError` if any variable in `vars_order` is missing from `vars`.
    +
    +    2. **Non-Reference K-mer Extraction**:
    +    - Computes the set of non-reference non-singleton k-mers by subtracting the reference signature from the sample signature.
    +    - If no non-reference k-mers are found, the method logs a warning and returns an empty dictionary.
    +
    +    3. **Variable-wise Consumption**:
    +    - Iterates over each variable name in `vars_order`.
    +    - For each variable:
    +        - Intersects the remaining non-reference k-mers with the variable-specific signature.
    +        - Calculates the total abundance and coverage index for the intersected k-mers.
    +        - Updates the `vars_nonref_stats` dictionary with the computed metrics.
    +        - Removes the consumed k-mers from the remaining non-reference set to prevent overlap.
    +
    +    4. **Final State Logging**:
    +    - Logs the final size and total abundance of the remaining non-reference k-mers after consumption.
    +
    +    **Parameters**:
    +
    +        - `vars` (`Dict[str, SnipeSig]`):  
    +        A dictionary mapping variable names to their corresponding `SnipeSig` instances. Each `SnipeSig` 
    +        represents a set of k-mers associated with a specific non-reference category or variable.
    +
    +        - `vars_order` (`List[str]`):  
    +        A list specifying the order in which variables should be processed. The order determines the priority 
    +        of consumption, ensuring that earlier variables in the list have their k-mers accounted for before 
    +        later ones.
    +
    +        - `**kwargs`:  
    +        Additional keyword arguments. Reserved for future extensions and should not be used in the current context.
    +
    +    **Returns**:
    +
    +        - `Dict[str, float]`:  
    +        A dictionary containing statistics for each variable name in `vars_order`, 
    +            - `"non-genomic total k-mer abundance"` (`float`):  
    +                The sum of abundances of non-reference k-mers associated with the variable.
    +            - `"non-genomic coverage index"` (`float`):  
    +                The ratio of unique non-reference k-mers associated with the variable to the total number 
    +                of non-reference k-mers in the sample before consumption.
    +
    +        Example Output:
    +        ```python
    +        {
    +            "variable_A non-genomic total k-mer abundance": 1500.0,
    +            "variable_A non-genomic coverage index": 0.20
    +            "variable_B non-genomic total k-mer abundance": 3500.0,
    +            "variable_B non-genomic coverage index": 0.70
    +            "non-var non-genomic total k-mer abundance": 0.10,
    +            "non-var non-genomic coverage index": 218
    +        }
    +        ```
    +
    +    **Raises**:
    +
    +        - `ValueError`:  
    +        - If any variable specified in `vars_order` is not present in the `vars` dictionary.
    +        - This ensures that all variables intended for consumption are available for processing.
    +
    +    **Usage Example**:
    +
    +    ```python
    +    # Assume `variables_signatures` is a dictionary of variable-specific SnipeSig instances
    +    variables_signatures = {
    +        "GTDB": sig_GTDB,
    +        "VIRALDB": sig_VIRALDB,
    +        "contaminant_X": sig_contaminant_x
    +    }
    +
    +    # Define the order in which variables should be processed
    +    processing_order = ["GTDB", "VIRALDB", "contaminant_X"]
    +
    +    # Consume non-reference k-mers and retrieve statistics
    +    nonref_stats = qc.nonref_consume_from_vars(vars=variables_signatures, vars_order=processing_order)
    +
    +    print(nonref_stats)
    +    # Output Example:
    +    # {
    +    #     "GTDB non-genomic total k-mer abundance": 1500.0,
    +    #     "GTDB non-genomic coverage index": 0.2,
    +    #     "VIRALDB non-genomic total k-mer abundance": 3500.0,
    +    #     "VIRALDB non-genomic coverage index": 0.70,
    +    #     "contaminant_X non-genomic total k-mer abundance": 0.0,
    +    #     "contaminant_X non-genomic coverage index": 0.0,
    +    #     "non-var non-genomic total k-mer abundance": 100.0,
    +    #     "non-var non-genomic coverage index": 0.1
    +    # }
    +    ```
    +
    +    **Notes**:
    +
    +        - **Variable Processing Order**:  
    +        The `vars_order` list determines the sequence in which variables are processed. This order is crucial
    +        when there is potential overlap in k-mers between variables, as earlier variables in the list have 
    +        higher priority in consuming shared k-mers.
    +
    +        - **Non-Reference K-mers Definition**:  
    +        Non-reference k-mers are defined as those present in the sample signature but absent in the reference 
    +        signature. This method focuses on characterizing these unique k-mers relative to provided variables.
    +    """
    +
    +    # check the all vars in vars_order are in vars
    +    if not all([var in vars for var in vars_order]):
    +        # report dict keys, and the vars order
    +        self.logger.debug("Provided vars_order: %s, and vars keys: %s", vars_order, list(vars.keys()))
    +        self.logger.error("All variables in vars_order must be present in vars.")
    +        raise ValueError("All variables in vars_order must be present in vars.")
    +
    +    self.logger.debug("Consuming non-reference k-mers from provided variables.")
    +    self.logger.debug("\t-Current size of the sample signature: %d hashes.", len(self.sample_sig))
    +
    +    sample_nonref = self.sample_sig - self.reference_sig
    +
    +    sample_nonref.trim_singletons()
    +
    +    sample_nonref_unique_hashes = len(sample_nonref)
    +
    +    self.logger.debug("\t-Size of non-reference k-mers in the sample signature: %d hashes.", len(sample_nonref))
    +    if len(sample_nonref) == 0:
    +        self.logger.warning("No non-reference k-mers found in the sample signature.")
    +        return {}
    +
    +    # intersect and report coverage and depth, then subtract from sample_nonref so sum will be 100%
    +    for var_name in vars_order:
    +        sample_nonref_var: SnipeSig = sample_nonref & vars[var_name]
    +        sample_nonref_var_total_abundance = sample_nonref_var.total_abundance
    +        sample_nonref_var_unique_hashes = len(sample_nonref_var)
    +        sample_nonref_var_coverage_index = sample_nonref_var_unique_hashes / sample_nonref_unique_hashes
    +        self.vars_nonref_stats.update({
    +            f"{var_name} non-genomic total k-mer abundance": sample_nonref_var_total_abundance,
    +            f"{var_name} non-genomic coverage index": sample_nonref_var_coverage_index
    +        })
    +
    +        self.logger.debug("\t-Consuming non-reference k-mers from variable '%s'.", var_name)
    +        sample_nonref -= sample_nonref_var
    +        self.logger.debug("\t-Size of remaining non-reference k-mers in the sample signature: %d hashes.", len(sample_nonref))
    +
    +    self.vars_nonref_stats["non-var non-genomic total k-mer abundance"] = sample_nonref.total_abundance
    +    self.vars_nonref_stats["non-var non-genomic coverage index"] = len(sample_nonref) / sample_nonref_unique_hashes if sample_nonref_unique_hashes > 0 else 0.0
    +
    +    self.logger.debug(
    +        "After consuming all vars from the non reference k-mers, the size of the sample signature is: %d hashes, "
    +        "with total abundance of %s.", 
    +        len(sample_nonref), sample_nonref.total_abundance
    +    )
    +
    +    return self.vars_nonref_stats
     
    @@ -5002,299 +6987,299 @@

    Source code in src/snipe/api/reference_QC.py -
    def predict_coverage(self, extra_fold: float, n: int = 30) -> float:
    -    r"""
    -    Predict the coverage index if additional sequencing is performed.
    -
    -    This method estimates the potential increase in the genome coverage index when the sequencing depth
    -    is increased by a specified fold (extra sequencing). It does so by:
    -
    -    1. **Cumulative Coverage Calculation**:
    -    - Splitting the sample signature into `n` random parts to simulate incremental sequencing data.
    -    - Calculating the cumulative coverage index and cumulative sequencing depth at each incremental step.
    -
    -    2. **Saturation Curve Fitting**:
    -    - Modeling the relationship between cumulative coverage and cumulative sequencing depth using
    -        a hyperbolic saturation function.
    -    - The saturation model reflects how coverage approaches a maximum limit as sequencing depth increases.
    -
    -    3. **Coverage Prediction**:
    -    - Using the fitted model to predict the coverage index at an increased sequencing depth (current depth
    -        multiplied by `1 + extra_fold`).
    -
    -    **Mathematical Explanation**:
    -
    -    - **Saturation Model**:
    -    The coverage index \( C \) as a function of sequencing depth \( D \) is modeled using the function:
    -
    -    $$
    -    C(D) = \frac{a \cdot D}{b + D}
    -    $$
    -
    -    Where:
    -    - \( a \) and \( b \) are parameters estimated from the data.
    -    - \( D \) is the cumulative sequencing depth (total abundance).
    -    - \( C(D) \) is the cumulative coverage index at depth \( D \).
    -
    -    - **Parameter Estimation**:
    -    The parameters \( a \) and \( b \) are determined by fitting the model to the observed cumulative
    -    coverage and depth data using non-linear least squares optimization.
    -
    -    - **Coverage Prediction**:
    -    The predicted coverage index \( C_{\text{pred}} \) at an increased sequencing depth \( D_{\text{pred}} \)
    -    is calculated as:
    -
    -    $$
    -    D_{\text{pred}} = D_{\text{current}} \times (1 + \text{extra\_fold})
    -    $$
    -
    -    $$
    -    C_{\text{pred}} = \frac{a \cdot D_{\text{pred}}}{b + D_{\text{pred}}}
    -    $$
    -
    -    **Parameters**:
    -
    -    - `extra_fold` (*float*):  
    -      The fold increase in sequencing depth to simulate. For example, extra_fold = 1.0 represents doubling
    -      the current sequencing depth.
    -
    -    - `n` (*int, optional*):  
    -      The number of parts to split the sample signature into for modeling the saturation curve.
    -      Default is 30.
    -
    -    **Returns**:
    -        - `float`:  
    -          The predicted genome coverage index at the increased sequencing depth.
    -
    -    **Raises**:
    -        - `RuntimeError`:  
    -          If the saturation model fails to converge during curve fitting.
    -
    -    **Usage Example**:
    -
    -    ```python
    -    # Create a ReferenceQC instance with sample and reference signatures
    -    qc = ReferenceQC(sample_sig=sample_sig, reference_sig=reference_sig)
    -
    -    # Predict coverage index after increasing sequencing depth by 50%
    -    predicted_coverage = qc.predict_coverage(extra_fold=0.5)
    -
    -    print(f"Predicted coverage index at 1.5x sequencing depth: {predicted_coverage:.4f}")
    -    ```
    -
    -    **Implementation Details**:
    +              
    def predict_coverage(self, extra_fold: float, n: int = 30) -> float:
    +    r"""
    +    Predict the coverage index if additional sequencing is performed.
    +
    +    This method estimates the potential increase in the genome coverage index when the sequencing depth
    +    is increased by a specified fold (extra sequencing). It does so by:
    +
    +    1. **Cumulative Coverage Calculation**:
    +    - Splitting the sample signature into `n` random parts to simulate incremental sequencing data.
    +    - Calculating the cumulative coverage index and cumulative sequencing depth at each incremental step.
    +
    +    2. **Saturation Curve Fitting**:
    +    - Modeling the relationship between cumulative coverage and cumulative sequencing depth using
    +        a hyperbolic saturation function.
    +    - The saturation model reflects how coverage approaches a maximum limit as sequencing depth increases.
    +
    +    3. **Coverage Prediction**:
    +    - Using the fitted model to predict the coverage index at an increased sequencing depth (current depth
    +        multiplied by `1 + extra_fold`).
     
    -    - **Splitting the Sample Signature**:
    -        - The sample signature is split into `n` random parts using a multinomial distribution based on k-mer abundances.
    -        - Each part represents an incremental addition of sequencing data.
    -
    -    - **Cumulative Calculations**:
    -        - At each incremental step, the cumulative signature is updated, and the cumulative coverage index and sequencing depth are calculated.
    -
    -    - **Curve Fitting**:
    -        - The `scipy.optimize.curve_fit` function is used to fit the saturation model to the cumulative data.
    -        - Initial parameter guesses are based on the observed data to aid convergence.
    -    """
    -    if extra_fold < 1:
    -        raise ValueError("extra_fold must be >= 1.0.")
    +    **Mathematical Explanation**:
    +
    +    - **Saturation Model**:
    +    The coverage index \( C \) as a function of sequencing depth \( D \) is modeled using the function:
    +
    +    $$
    +    C(D) = \frac{a \cdot D}{b + D}
    +    $$
    +
    +    Where:
    +    - \( a \) and \( b \) are parameters estimated from the data.
    +    - \( D \) is the cumulative sequencing depth (total abundance).
    +    - \( C(D) \) is the cumulative coverage index at depth \( D \).
     
    -    if n < 1 or not isinstance(n, int):
    -        raise ValueError("n must be a positive integer.")
    -
    -    self.logger.debug("Predicting coverage with extra fold: %f", extra_fold)
    -    coverage_depth_data = self.calculate_coverage_vs_depth(n=n)
    -
    -    # Extract cumulative total abundance and coverage index
    -    x_data = np.array([d["cumulative_total_abundance"] for d in coverage_depth_data])
    -    y_data = np.array([d["cumulative_coverage_index"] for d in coverage_depth_data])
    -
    -    # Saturation model function
    -    def saturation_model(x, a, b):
    -        return a * x / (b + x)
    -
    -    # Initial parameter guesses
    -    initial_guess = [y_data[-1], x_data[int(len(x_data) / 2)]]
    -
    -    # Fit the model to the data
    -    try:
    -        with warnings.catch_warnings():
    -            warnings.simplefilter("error", OptimizeWarning)
    -            params, covariance = curve_fit(
    -                saturation_model,
    -                x_data,
    -                y_data,
    -                p0=initial_guess,
    -                bounds=(0, np.inf),
    -                maxfev=10000
    -            )
    -    except (RuntimeError, OptimizeWarning) as exc:
    -        self.logger.error("Curve fitting failed.")
    -        raise RuntimeError("Saturation model fitting failed. Cannot predict coverage.") from exc
    -
    -    # Check if covariance contains inf or nan
    -    if np.isinf(covariance).any() or np.isnan(covariance).any():
    -        self.logger.error("Covariance of parameters could not be estimated.")
    -        raise RuntimeError("Saturation model fitting failed. Cannot predict coverage.")
    -
    -    a, b = params
    +    - **Parameter Estimation**:
    +    The parameters \( a \) and \( b \) are determined by fitting the model to the observed cumulative
    +    coverage and depth data using non-linear least squares optimization.
    +
    +    - **Coverage Prediction**:
    +    The predicted coverage index \( C_{\text{pred}} \) at an increased sequencing depth \( D_{\text{pred}} \)
    +    is calculated as:
    +
    +    $$
    +    D_{\text{pred}} = D_{\text{current}} \times (1 + \text{extra\_fold})
    +    $$
    +
    +    $$
    +    C_{\text{pred}} = \frac{a \cdot D_{\text{pred}}}{b + D_{\text{pred}}}
    +    $$
    +
    +    **Parameters**:
    +
    +    - `extra_fold` (*float*):  
    +      The fold increase in sequencing depth to simulate. For example, extra_fold = 1.0 represents doubling
    +      the current sequencing depth.
    +
    +    - `n` (*int, optional*):  
    +      The number of parts to split the sample signature into for modeling the saturation curve.
    +      Default is 30.
    +
    +    **Returns**:
    +        - `float`:  
    +          The predicted genome coverage index at the increased sequencing depth.
    +
    +    **Raises**:
    +        - `RuntimeError`:  
    +          If the saturation model fails to converge during curve fitting.
    +
    +    **Usage Example**:
    +
    +    ```python
    +    # Create a ReferenceQC instance with sample and reference signatures
    +    qc = ReferenceQC(sample_sig=sample_sig, reference_sig=reference_sig)
     
    -    # Predict coverage at increased sequencing depth
    -    total_abundance = x_data[-1]
    -    predicted_total_abundance = total_abundance * (1 + extra_fold)
    -    predicted_coverage = saturation_model(predicted_total_abundance, a, b)
    -
    -    # Ensure the predicted coverage does not exceed maximum possible coverage
    -    max_coverage = 1.0  # Coverage index cannot exceed 1
    -    predicted_coverage = min(predicted_coverage, max_coverage)
    -
    -    self.logger.debug("Predicted coverage at %.2f-fold increase: %f", extra_fold, predicted_coverage)
    -    return predicted_coverage
    +    # Predict coverage index after increasing sequencing depth by 50%
    +    predicted_coverage = qc.predict_coverage(extra_fold=0.5)
    +
    +    print(f"Predicted coverage index at 1.5x sequencing depth: {predicted_coverage:.4f}")
    +    ```
    +
    +    **Implementation Details**:
    +
    +    - **Splitting the Sample Signature**:
    +        - The sample signature is split into `n` random parts using a multinomial distribution based on k-mer abundances.
    +        - Each part represents an incremental addition of sequencing data.
    +
    +    - **Cumulative Calculations**:
    +        - At each incremental step, the cumulative signature is updated, and the cumulative coverage index and sequencing depth are calculated.
    +
    +    - **Curve Fitting**:
    +        - The `scipy.optimize.curve_fit` function is used to fit the saturation model to the cumulative data.
    +        - Initial parameter guesses are based on the observed data to aid convergence.
    +    """
    +    if extra_fold < 1:
    +        raise ValueError("extra_fold must be >= 1.0.")
    +
    +    if n < 1 or not isinstance(n, int):
    +        raise ValueError("n must be a positive integer.")
    +
    +    self.logger.debug("Predicting coverage with extra fold: %f", extra_fold)
    +    coverage_depth_data = self.calculate_coverage_vs_depth(n=n)
    +
    +    # Extract cumulative total abundance and coverage index
    +    x_data = np.array([d["cumulative_total_abundance"] for d in coverage_depth_data])
    +    y_data = np.array([d["cumulative_coverage_index"] for d in coverage_depth_data])
    +
    +    # Saturation model function
    +    def saturation_model(x, a, b):
    +        return a * x / (b + x)
    +
    +    # Initial parameter guesses
    +    initial_guess = [y_data[-1], x_data[int(len(x_data) / 2)]]
    +
    +    # Fit the model to the data
    +    try:
    +        with warnings.catch_warnings():
    +            warnings.simplefilter("error", OptimizeWarning)
    +            params, covariance = curve_fit(
    +                saturation_model,
    +                x_data,
    +                y_data,
    +                p0=initial_guess,
    +                bounds=(0, np.inf),
    +                maxfev=10000
    +            )
    +    except (RuntimeError, OptimizeWarning) as exc:
    +        self.logger.error("Curve fitting failed.")
    +        raise RuntimeError("Saturation model fitting failed. Cannot predict coverage.") from exc
    +
    +    # Check if covariance contains inf or nan
    +    if np.isinf(covariance).any() or np.isnan(covariance).any():
    +        self.logger.error("Covariance of parameters could not be estimated.")
    +        raise RuntimeError("Saturation model fitting failed. Cannot predict coverage.")
    +
    +    a, b = params
    +
    +    # Predict coverage at increased sequencing depth
    +    total_abundance = x_data[-1]
    +    predicted_total_abundance = total_abundance * (1 + extra_fold)
    +    predicted_coverage = saturation_model(predicted_total_abundance, a, b)
    +
    +    # Ensure the predicted coverage does not exceed maximum possible coverage
    +    max_coverage = 1.0  # Coverage index cannot exceed 1
    +    predicted_coverage = min(predicted_coverage, max_coverage)
    +
    +    self.logger.debug("Predicted coverage at %.2f-fold increase: %f", extra_fold, predicted_coverage)
    +    return predicted_coverage
     
    @@ -5341,48 +7326,7 @@

    Source code in src/snipe/api/reference_QC.py -
    612
    -613
    -614
    -615
    -616
    -617
    -618
    -619
    -620
    -621
    -622
    -623
    -624
    -625
    -626
    -627
    -628
    -629
    -630
    -631
    -632
    -633
    -634
    -635
    -636
    -637
    -638
    -639
    -640
    -641
    -642
    -643
    -644
    -645
    -646
    -647
    -648
    -649
    -650
    -651
    -652
    -653
    +              
    653
     654
     655
     656
    @@ -5394,60 +7338,127 @@ 

    662 663 664 -665

    def split_sig_randomly(self, n: int) -> List[SnipeSig]:
    -    r"""
    -    Split the sample signature into `n` random parts based on abundances.
    -
    -    This method distributes the k-mers of the sample signature into `n` parts using a multinomial distribution
    -    based on their abundances. Each k-mer's abundance is split across the `n` parts proportionally.
    -
    -    **Mathematical Explanation**:
    -
    -    For each k-mer with hash \( h \) and abundance \( a_h \), its abundance is distributed into \( n \) parts
    -    according to a multinomial distribution. Specifically, the abundance in each part \( i \) is given by:
    -
    -    $$
    -    a_{h,i} \sim \text{Multinomial}(a_h, \frac{1}{n}, \frac{1}{n}, \dots, \frac{1}{n})
    -    $$
    -
    -    Where:
    -    - \( a_{h,i} \) is the abundance of k-mer \( h \) in part \( i \).
    -    - Each \( a_{h,i} \) is a non-negative integer such that \( \sum_{i=1}^{n} a_{h,i} = a_h \).
    -
    -    **Parameters**:
    -
    -    - `n` (`int`): Number of parts to split into.
    -
    -    **Returns**:
    -
    -    - `List[SnipeSig]`:  
    -      List of `SnipeSig` instances representing the split parts.
    -
    -    **Usage Example**:
    -
    -    ```python
    -    split_sigs = qc.split_sig_randomly(n=3)
    -    for idx, sig in enumerate(split_sigs, 1):
    -        print(f"Signature part {idx}: {sig}")
    -    ```
    -    """
    -    self.logger.debug("Splitting sample signature into %d random parts.", n)
    -    # Get k-mers and abundances
    -    hash_to_abund = dict(zip(self.sample_sig.hashes, self.sample_sig.abundances))
    -    random_split_sigs = self.distribute_kmers_random(hash_to_abund, n)
    -    split_sigs = [
    -        SnipeSig.create_from_hashes_abundances(
    -            hashes=np.array(list(kmer_dict.keys()), dtype=np.uint64),
    -            abundances=np.array(list(kmer_dict.values()), dtype=np.uint32),
    -            ksize=self.sample_sig.ksize,
    -            scale=self.sample_sig.scale,
    -            name=f"{self.sample_sig.name}_part_{i+1}",
    -            filename=self.sample_sig.filename,
    -            enable_logging=self.enable_logging
    -        )
    -        for i, kmer_dict in enumerate(random_split_sigs)
    -    ]
    -    return split_sigs
    +665
    +666
    +667
    +668
    +669
    +670
    +671
    +672
    +673
    +674
    +675
    +676
    +677
    +678
    +679
    +680
    +681
    +682
    +683
    +684
    +685
    +686
    +687
    +688
    +689
    +690
    +691
    +692
    +693
    +694
    +695
    +696
    +697
    +698
    +699
    +700
    +701
    +702
    +703
    +704
    +705
    +706
    +707
    +708
    +709
    +710
    +711
    +712
    +713
    +714
    +715
    +716
    +717
    +718
    +719
    def split_sig_randomly(self, n: int) -> List[SnipeSig]:
    +    r"""
    +    Split the sample signature into `n` random parts based on abundances.
    +
    +    This method distributes the k-mers of the sample signature into `n` parts using a multinomial distribution
    +    based on their abundances. Each k-mer's abundance is split across the `n` parts proportionally.
    +
    +    **Mathematical Explanation**:
    +
    +    For each k-mer with hash \( h \) and abundance \( a_h \), its abundance is distributed into \( n \) parts
    +    according to a multinomial distribution. Specifically, the abundance in each part \( i \) is given by:
    +
    +    $$
    +    a_{h,i} \sim \text{Multinomial}(a_h, \frac{1}{n}, \frac{1}{n}, \dots, \frac{1}{n})
    +    $$
    +
    +    Where:
    +    - \( a_{h,i} \) is the abundance of k-mer \( h \) in part \( i \).
    +    - Each \( a_{h,i} \) is a non-negative integer such that \( \sum_{i=1}^{n} a_{h,i} = a_h \).
    +
    +    **Parameters**:
    +
    +    - `n` (`int`): Number of parts to split into.
    +
    +    **Returns**:
    +
    +    - `List[SnipeSig]`:  
    +      List of `SnipeSig` instances representing the split parts.
    +
    +    **Usage Example**:
    +
    +    ```python
    +    split_sigs = qc.split_sig_randomly(n=3)
    +    for idx, sig in enumerate(split_sigs, 1):
    +        print(f"Signature part {idx}: {sig}")
    +    ```
    +    """
    +    self.logger.debug("Attempting to split sample signature into %d random parts.", n)
    +
    +    # Check if the split for this n is already cached
    +    if n in self._split_cache:
    +        self.logger.debug("Using cached split signatures for n=%d.", n)
    +        # Return deep copies to prevent external modifications
    +        return [sig.copy() for sig in self._split_cache[n]]
    +
    +    self.logger.debug("No cached splits found for n=%d. Proceeding to split.", n)
    +    # Get k-mers and abundances
    +    hash_to_abund = dict(zip(self.sample_sig.hashes, self.sample_sig.abundances))
    +    random_split_sigs = self.distribute_kmers_random(hash_to_abund, n)
    +    split_sigs = [
    +        SnipeSig.create_from_hashes_abundances(
    +            hashes=np.array(list(kmer_dict.keys()), dtype=np.uint64),
    +            abundances=np.array(list(kmer_dict.values()), dtype=np.uint32),
    +            ksize=self.sample_sig.ksize,
    +            scale=self.sample_sig.scale,
    +            name=f"{self.sample_sig.name}_part_{i+1}",
    +            filename=self.sample_sig.filename,
    +            enable_logging=self.enable_logging
    +        )
    +        for i, kmer_dict in enumerate(random_split_sigs)
    +    ]
    +
    +    # Cache the split signatures
    +    self._split_cache[n] = split_sigs
    +    self.logger.debug("Cached split signatures for n=%d.", n)
    +
    +    return split_sigs
     
    @@ -5492,7 +7503,7 @@

    - October 13, 2024 + October 15, 2024 @@ -5502,7 +7513,7 @@

    - October 13, 2024 + October 15, 2024 diff --git a/Sketch/index.html b/Sketch/index.html new file mode 100644 index 0000000..8946c1c --- /dev/null +++ b/Sketch/index.html @@ -0,0 +1,3574 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Sketch - Snipe Documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + +
    + + + + + + +
    + + + + + + + +
    + +
    + + + + +
    +
    + + + +
    +
    +
    + + + + + + + +
    +
    +
    + + + +
    + +
    + + + +
    +
    + + + + +

    Python API Documentation

    + + +
    + + + + +
    + + + + + + + + +
    + + + + + + + + +
    + + + +

    + SnipeSketch + + +

    + + +
    + + +

    SnipeSketch is responsible for creating FracMinHash sketches from genomic data. +It supports parallel processing, progress monitoring, and different sketching modes +including sample, genome, and amplicon sketching.

    + + + + + + +
    + Source code in src/snipe/api/sketch.py +
     18
    + 19
    + 20
    + 21
    + 22
    + 23
    + 24
    + 25
    + 26
    + 27
    + 28
    + 29
    + 30
    + 31
    + 32
    + 33
    + 34
    + 35
    + 36
    + 37
    + 38
    + 39
    + 40
    + 41
    + 42
    + 43
    + 44
    + 45
    + 46
    + 47
    + 48
    + 49
    + 50
    + 51
    + 52
    + 53
    + 54
    + 55
    + 56
    + 57
    + 58
    + 59
    + 60
    + 61
    + 62
    + 63
    + 64
    + 65
    + 66
    + 67
    + 68
    + 69
    + 70
    + 71
    + 72
    + 73
    + 74
    + 75
    + 76
    + 77
    + 78
    + 79
    + 80
    + 81
    + 82
    + 83
    + 84
    + 85
    + 86
    + 87
    + 88
    + 89
    + 90
    + 91
    + 92
    + 93
    + 94
    + 95
    + 96
    + 97
    + 98
    + 99
    +100
    +101
    +102
    +103
    +104
    +105
    +106
    +107
    +108
    +109
    +110
    +111
    +112
    +113
    +114
    +115
    +116
    +117
    +118
    +119
    +120
    +121
    +122
    +123
    +124
    +125
    +126
    +127
    +128
    +129
    +130
    +131
    +132
    +133
    +134
    +135
    +136
    +137
    +138
    +139
    +140
    +141
    +142
    +143
    +144
    +145
    +146
    +147
    +148
    +149
    +150
    +151
    +152
    +153
    +154
    +155
    +156
    +157
    +158
    +159
    +160
    +161
    +162
    +163
    +164
    +165
    +166
    +167
    +168
    +169
    +170
    +171
    +172
    +173
    +174
    +175
    +176
    +177
    +178
    +179
    +180
    +181
    +182
    +183
    +184
    +185
    +186
    +187
    +188
    +189
    +190
    +191
    +192
    +193
    +194
    +195
    +196
    +197
    +198
    +199
    +200
    +201
    +202
    +203
    +204
    +205
    +206
    +207
    +208
    +209
    +210
    +211
    +212
    +213
    +214
    +215
    +216
    +217
    +218
    +219
    +220
    +221
    +222
    +223
    +224
    +225
    +226
    +227
    +228
    +229
    +230
    +231
    +232
    +233
    +234
    +235
    +236
    +237
    +238
    +239
    +240
    +241
    +242
    +243
    +244
    +245
    +246
    +247
    +248
    +249
    +250
    +251
    +252
    +253
    +254
    +255
    +256
    +257
    +258
    +259
    +260
    +261
    +262
    +263
    +264
    +265
    +266
    +267
    +268
    +269
    +270
    +271
    +272
    +273
    +274
    +275
    +276
    +277
    +278
    +279
    +280
    +281
    +282
    +283
    +284
    +285
    +286
    +287
    +288
    +289
    +290
    +291
    +292
    +293
    +294
    +295
    +296
    +297
    +298
    +299
    +300
    +301
    +302
    +303
    +304
    +305
    +306
    +307
    +308
    +309
    +310
    +311
    +312
    +313
    +314
    +315
    +316
    +317
    +318
    +319
    +320
    +321
    +322
    +323
    +324
    +325
    +326
    +327
    +328
    +329
    +330
    +331
    +332
    +333
    +334
    +335
    +336
    +337
    +338
    +339
    +340
    +341
    +342
    +343
    +344
    +345
    +346
    +347
    +348
    +349
    +350
    +351
    +352
    +353
    +354
    +355
    +356
    +357
    +358
    +359
    +360
    +361
    +362
    +363
    +364
    +365
    +366
    +367
    +368
    +369
    +370
    +371
    +372
    +373
    +374
    +375
    +376
    +377
    +378
    +379
    +380
    +381
    +382
    +383
    +384
    +385
    +386
    +387
    +388
    +389
    +390
    +391
    +392
    +393
    +394
    +395
    +396
    +397
    +398
    +399
    +400
    +401
    +402
    +403
    +404
    +405
    +406
    +407
    +408
    +409
    +410
    +411
    +412
    +413
    +414
    +415
    +416
    +417
    +418
    +419
    +420
    +421
    +422
    +423
    +424
    +425
    +426
    +427
    +428
    +429
    +430
    +431
    +432
    +433
    +434
    +435
    +436
    +437
    +438
    +439
    +440
    +441
    +442
    +443
    +444
    +445
    +446
    +447
    +448
    +449
    +450
    +451
    +452
    +453
    +454
    +455
    +456
    +457
    +458
    +459
    +460
    +461
    +462
    +463
    +464
    +465
    +466
    +467
    +468
    +469
    +470
    +471
    +472
    +473
    +474
    +475
    +476
    +477
    +478
    +479
    +480
    +481
    +482
    +483
    +484
    +485
    +486
    +487
    +488
    +489
    +490
    +491
    +492
    +493
    +494
    +495
    +496
    +497
    +498
    +499
    +500
    +501
    +502
    +503
    +504
    +505
    +506
    +507
    +508
    +509
    +510
    +511
    +512
    +513
    +514
    +515
    +516
    +517
    +518
    +519
    +520
    +521
    +522
    +523
    +524
    class SnipeSketch:
    +    """
    +    SnipeSketch is responsible for creating FracMinHash sketches from genomic data.
    +    It supports parallel processing, progress monitoring, and different sketching modes
    +    including sample, genome, and amplicon sketching.
    +    """
    +
    +    def __init__(self, enable_logging: bool) -> None:
    +        """
    +        Initialize the SnipeSketch instance.
    +
    +        Args:
    +            enable_logging (bool): Flag to enable or disable logging.
    +        """
    +        self.logger = logging.getLogger(self.__class__.__name__)
    +        self._configure_logging(enable_logging)
    +
    +    def _configure_logging(self, enable_logging: bool) -> None:
    +        """
    +        Configure the logging for the class.
    +
    +        Args:
    +            enable_logging (bool): Flag to enable or disable logging.
    +        """
    +        if enable_logging:
    +            self.logger.setLevel(logging.DEBUG)
    +            if not self.logger.hasHandlers():
    +                handler = logging.StreamHandler()
    +                handler.setLevel(logging.DEBUG)
    +                formatter = logging.Formatter(
    +                    "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    +                )
    +                handler.setFormatter(formatter)
    +                self.logger.addHandler(handler)
    +            self.logger.debug("Logging is enabled for SnipeSketch.")
    +        else:
    +            self.logger.setLevel(logging.CRITICAL)
    +
    +    # *******************************
    +    # *        Sketching            *
    +    # *******************************
    +
    +    def process_sequences(
    +        self,
    +        fasta_file: str,
    +        thread_id: int,
    +        total_threads: int,
    +        progress_queue: multiprocessing.Queue,
    +        batch_size: int = 100_000,
    +        ksize: int = 51,
    +        scaled: int = 10_000,
    +    ) -> sourmash.MinHash:
    +        """
    +        Process a subset of sequences to create a FracMinHash sketch.
    +
    +        Each process creates its own MinHash instance and processes sequences
    +        assigned based on the thread ID. Progress is reported via a shared queue.
    +
    +        Args:
    +            fasta_file (str): Path to the FASTA file.
    +            thread_id (int): Identifier for the current thread.
    +            total_threads (int): Total number of threads.
    +            progress_queue (multiprocessing.Queue): Queue for reporting progress.
    +            batch_size (int, optional): Number of sequences per progress update. Defaults to 100_000.
    +            ksize (int, optional): K-mer size. Defaults to 51.
    +            scaled (int, optional): Scaling factor for MinHash. Defaults to 10_000.
    +
    +        Returns:
    +            sourmash.MinHash: The resulting FracMinHash sketch.
    +        """
    +        self._register_signal_handler()
    +        try:
    +            fa_reader = SequenceReader(fasta_file)
    +            mh = sourmash.MinHash(
    +                n=0, ksize=ksize, scaled=scaled, track_abundance=True
    +            )
    +            local_count = 0
    +
    +            for idx, (_, seq) in enumerate(fa_reader):
    +                if idx % total_threads == thread_id:
    +                    mh.add_sequence(seq, force=True)
    +                    local_count += 1
    +
    +                    if local_count >= batch_size:
    +                        progress_queue.put(batch_size)
    +                        local_count = 0
    +
    +            if local_count > 0:
    +                progress_queue.put(local_count)
    +
    +            self.logger.debug(
    +                "Thread %d processed %d hashes.", thread_id, len(mh)
    +            )
    +            return mh
    +
    +        except KeyboardInterrupt:
    +            self.logger.warning("KeyboardInterrupt detected in process_sequences.")
    +            sys.exit(0)
    +        except Exception as e:
    +            self.logger.error("Error in process_sequences: %s", e)
    +            raise
    +
    +    def _register_signal_handler(self) -> None:
    +        """
    +        Register the signal handler for graceful shutdown.
    +        """
    +        signal.signal(signal.SIGINT, self._worker_signal_handler)
    +
    +    def progress_monitor(
    +        self,
    +        progress_queue: multiprocessing.Queue,
    +        progress_interval: int,
    +        total_threads: int,
    +        stop_event: threading.Event,
    +    ) -> None:
    +        """
    +        Monitor and display the progress of sequence processing.
    +
    +        Args:
    +            progress_queue (multiprocessing.Queue): Queue for receiving progress updates.
    +            progress_interval (int): Interval for progress updates.
    +            total_threads (int): Number of processing threads.
    +            stop_event (threading.Event): Event to signal the monitor to stop.
    +        """
    +        total = 0
    +        next_update = progress_interval
    +        try:
    +            while not stop_event.is_set() or not progress_queue.empty():
    +                try:
    +                    count = progress_queue.get(timeout=0.5)
    +                    total += count
    +                    if total >= next_update:
    +                        print(f"\rProcessed {next_update:,} sequences.", end="", flush=True)
    +                        next_update += progress_interval
    +                except queue.Empty:
    +                    continue
    +        except Exception as e:
    +            self.logger.error("Error in progress_monitor: %s", e)
    +        finally:
    +            print(f"\rProcessed {total:,} sequences in total.")
    +
    +    def _worker_signal_handler(self, signum: int, frame: Any) -> None:
    +        """
    +        Handle signals in worker processes to exit gracefully.
    +
    +        Args:
    +            signum (int): Signal number.
    +            frame (Any): Current stack frame.
    +        """
    +        self.logger.info("Received signal %d. Exiting worker.", signum)
    +        sys.exit(0)
    +
    +    def _sketch_sample(
    +        self,
    +        sample_name: str,
    +        fasta_file: str,
    +        num_processes: int = 4,
    +        progress_interval: int = 1_000_000,
    +        batch_size: int = 100_000,
    +        k_size: int = 51,
    +        scale: int = 10_000,
    +        **kwargs: Any,
    +    ) -> sourmash.SourmashSignature:
    +        """
    +        Create a FracMinHash sketch for a sample using parallel processing.
    +
    +        Args:
    +            sample_name (str): Name of the sample.
    +            fasta_file (str): Path to the FASTA file.
    +            num_processes (int, optional): Number of parallel processes. Defaults to 4.
    +            progress_interval (int, optional): Interval for progress updates. Defaults to 1_000_000.
    +            batch_size (int, optional): Number of sequences per progress update. Defaults to 100_000.
    +            k_size (int, optional): K-mer size. Defaults to 51.
    +            scale (int, optional): Scaling factor for MinHash. Defaults to 10_000.
    +            **kwargs (Any): Additional keyword arguments.
    +
    +        Returns:
    +            sourmash.SourmashSignature: The resulting Sourmash signature.
    +        """
    +        self.logger.info("Starting sketching with %d processes...", num_processes)
    +
    +        manager = multiprocessing.Manager()
    +        progress_queue = manager.Queue()
    +        stop_event = threading.Event()
    +
    +        monitor_thread = threading.Thread(
    +            target=self.progress_monitor,
    +            args=(progress_queue, progress_interval, num_processes, stop_event),
    +            daemon=True,
    +        )
    +        monitor_thread.start()
    +
    +        pool = Pool(nodes=num_processes)
    +        results: List[Any] = []
    +
    +        try:
    +            for thread_id in range(num_processes):
    +                result = pool.apipe(
    +                    self.process_sequences,
    +                    fasta_file,
    +                    thread_id,
    +                    num_processes,
    +                    progress_queue,
    +                    batch_size,
    +                    k_size,
    +                    scale,
    +                )
    +                results.append(result)
    +
    +            pool.close()
    +            pool.join()
    +
    +        except KeyboardInterrupt:
    +            self.logger.warning("Interrupt received. Terminating processes...")
    +            pool.terminate()
    +            pool.join()
    +            stop_event.set()
    +            monitor_thread.join()
    +            sys.exit(1)
    +
    +        except Exception as e:
    +            self.logger.error("Error during sketching: %s", e)
    +            pool.terminate()
    +            pool.join()
    +            stop_event.set()
    +            monitor_thread.join()
    +            raise
    +
    +        finally:
    +            stop_event.set()
    +            monitor_thread.join()
    +
    +        minhashes = []
    +        for idx, result in enumerate(results):
    +            try:
    +                mh = result.get()
    +                if mh:
    +                    minhashes.append(mh)
    +                    self.logger.debug("MinHash from thread %d collected.", idx)
    +            except Exception as e:
    +                self.logger.error("Error retrieving MinHash from thread %d: %s", idx, e)
    +
    +        if not minhashes:
    +            raise ValueError("No MinHashes were generated.")
    +
    +        # Merge all MinHashes into one
    +        mh_full = minhashes[0]
    +        for mh in minhashes[1:]:
    +            mh_full.merge(mh)
    +
    +        signature = sourmash.SourmashSignature(mh_full, name=sample_name)
    +        self.logger.info("Sketching completed for sample: %s", sample_name)
    +
    +        return signature
    +
    +    def sample_sketch(
    +        self,
    +        sample_name: str,
    +        filename: str,
    +        num_processes: int,
    +        batch_size: int,
    +        ksize: int,
    +        scale: int,
    +        **kwargs: Any,
    +    ) -> sourmash.SourmashSignature:
    +        """
    +        Generate a sketch for a sample and return its signature.
    +
    +        Args:
    +            sample_name (str): Name of the sample.
    +            filename (str): Path to the input FASTA file.
    +            num_processes (int): Number of processes to use.
    +            batch_size (int): Batch size for processing.
    +            ksize (int): K-mer size.
    +            scale (int): Scaling factor.
    +            **kwargs (Any): Additional keyword arguments.
    +
    +        Returns:
    +            sourmash.SourmashSignature: The generated signature.
    +
    +        Raises:
    +            RuntimeError: If an error occurs during sketching.
    +        """
    +        self.logger.info("Starting sample sketch for: %s", sample_name)
    +        try:
    +            signature = self._sketch_sample(
    +                sample_name=sample_name,
    +                fasta_file=filename,
    +                num_processes=num_processes,
    +                batch_size=batch_size,
    +                k_size=ksize,
    +                scale=scale,
    +                **kwargs,
    +            )
    +            self.logger.info("Sample sketch completed for: %s", sample_name)
    +            return signature
    +        except Exception as e:
    +            self.logger.error(
    +                "Error occurred during sample sketching: %s", str(e)
    +            )
    +            raise RuntimeError("Error occurred during sample sketching.") from e
    +
    +    # *******************************
    +    # *      Genome Sketching       *
    +    # *******************************
    +
    +    def parse_fasta_header(self, header: str) -> Tuple[str, str]:
    +        """
    +        Parse a FASTA header and categorize the sequence type.
    +
    +        Args:
    +            header (str): The FASTA header string.
    +
    +        Returns:
    +            Tuple[str, str]: A tuple containing the sequence type and name.
    +        """
    +        full_header = header.strip()
    +        header_lower = full_header.lower()
    +
    +        if header_lower.startswith(">"):
    +            header_lower = header_lower[1:]
    +            full_header = full_header[1:]
    +
    +        seq_type = "unknown"
    +        name = "unknown"
    +
    +        patterns = {
    +            "scaffold": re.compile(r"\b(scaffold|unplaced|unlocalized)\b"),
    +            "contig": re.compile(r"\bcontig\b"),
    +            "mitochondrial DNA": re.compile(r"\b(mt|mitochondrion|mitochondrial|mitochondria|mito|mtdna)\b"),
    +            "chloroplast DNA": re.compile(r"\b(chloroplast|cpdna|plastid)\b"),
    +            "plasmid": re.compile(r"\bplasmid\b"),
    +            "chromosome": re.compile(r"\bchromosome\b|\bchr\b"),
    +            "reference chromosome": re.compile(r"NC_\d{6}\.\d+"),
    +        }
    +
    +        for stype, pattern in patterns.items():
    +            if pattern.search(header_lower):
    +                if stype in {"scaffold", "contig", "plasmid"}:
    +                    match = re.match(r"(\S+)", full_header)
    +                    name = match.group(1) if match else "unknown"
    +                elif stype in {"mitochondrial DNA", "chloroplast DNA"}:
    +                    name = stype.split()[0]
    +                elif stype == "chromosome":
    +                    match = re.search(r"(?:chromosome|chr)[_\s]*([^\s,]+)", header_lower)
    +                    if match:
    +                        name = match.group(1).rstrip(".,")
    +                        if name.upper() in {"X", "Y", "W", "Z"}:
    +                            stype = "sex"
    +                        else:
    +                            stype = "autosome"
    +                elif stype == "reference chromosome":
    +                    match = pattern.search(full_header)
    +                    if match and not (patterns["scaffold"].search(header_lower) or patterns["contig"].search(header_lower)):
    +                        name = match.group()
    +                return stype, name
    +
    +        return seq_type, name
    +
    +    def parallel_genome_sketching(
    +        self,
    +        fasta_file: str,
    +        cores: int = 1,
    +        ksize: int = 51,
    +        scale: int = 10_000,
    +        assigned_genome_name: str = "full_genome",
    +        **kwargs: Any,
    +    ) -> Tuple[sourmash.SourmashSignature, Dict[str, sourmash.SourmashSignature]]:
    +        """
    +        Perform parallel genome sketching from a FASTA file.
    +
    +        Args:
    +            fasta_file (str): Path to the FASTA file.
    +            cores (int, optional): Number of parallel cores. Defaults to 1.
    +            ksize (int, optional): K-mer size. Defaults to 51.
    +            scale (int, optional): Scaling factor for MinHash. Defaults to 10_000.
    +            assigned_genome_name (str, optional): Name for the genome signature. Defaults to "full_genome".
    +            **kwargs (Any): Additional keyword arguments.
    +
    +        Returns:
    +            Tuple[sourmash.SourmashSignature, Dict[str, sourmash.SourmashSignature]]:
    +                The full genome signature and a dictionary of chromosome signatures.
    +        """
    +        self.logger.info("Starting parallel genome sketching with %d cores.", cores)
    +        fa_reader = SequenceReader(fasta_file, comment=True)
    +        mh_full = sourmash.MinHash(n=0, ksize=ksize, scaled=scale)
    +        chr_to_mh: Dict[str, sourmash.MinHash] = {}
    +
    +        mh_lock = threading.Lock()
    +        chr_lock = threading.Lock()
    +
    +        def process_sequence(
    +            name: str, seq: str, comment: Optional[str]
    +        ) -> None:
    +            header = f"{name} {comment}" if comment else name
    +            seq_type, seq_name = self.parse_fasta_header(header)
    +            current_mh = sourmash.MinHash(n=0, ksize=ksize, scaled=scale, track_abundance=True)
    +            current_mh.add_sequence(seq, force=True)
    +
    +            with mh_lock:
    +                mh_full.merge(current_mh)
    +
    +            if seq_type in {"sex", "autosome"}:
    +                with chr_lock:
    +                    key = f"{seq_type}-{seq_name}"
    +                    if key not in chr_to_mh:
    +                        chr_to_mh[key] = current_mh
    +                    else:
    +                        chr_to_mh[key].merge(current_mh)
    +
    +        try:
    +            with concurrent.futures.ThreadPoolExecutor(max_workers=cores) as executor:
    +                futures = [
    +                    executor.submit(
    +                        process_sequence,
    +                        name,
    +                        seq,
    +                        comment
    +                    )
    +                    for name, seq, comment in fa_reader
    +                ]
    +
    +                for future in concurrent.futures.as_completed(futures):
    +                    try:
    +                        future.result()
    +                    except Exception as e:
    +                        self.logger.error("Error processing sequence: %s", e)
    +
    +        except Exception as e:
    +            self.logger.error("Error during parallel genome sketching: %s", e)
    +            raise
    +
    +        mh_full_signature = sourmash.SourmashSignature(mh_full, name=assigned_genome_name)
    +        chr_signatures = {
    +            name: sourmash.SourmashSignature(mh, name=name)
    +            for name, mh in chr_to_mh.items()
    +        }
    +
    +        self.logger.info("Parallel genome sketching completed.")
    +        return mh_full_signature, chr_signatures
    +
    +    def amplicon_sketching(
    +        self,
    +        fasta_file: str,
    +        ksize: int = 51,
    +        scale: int = 10_000,
    +        amplicon_name: str = "amplicon",
    +    ) -> sourmash.SourmashSignature:
    +        """
    +        Create a FracMinHash sketch for an amplicon.
    +
    +        Args:
    +            fasta_file (str): Path to the FASTA file.
    +            ksize (int, optional): K-mer size. Defaults to 51.
    +            scale (int, optional): Scaling factor for MinHash. Defaults to 10_000.
    +            amplicon_name (str, optional): Name of the amplicon. Defaults to "amplicon".
    +
    +        Returns:
    +            sourmash.SourmashSignature: The resulting amplicon signature.
    +        """
    +        self.logger.info("Starting amplicon sketching for: %s", amplicon_name)
    +        try:
    +            fa_reader = SequenceReader(fasta_file)
    +            mh_full = sourmash.MinHash(n=0, ksize=ksize, scaled=scale)
    +            for _, seq in fa_reader:
    +                mh_full.add_sequence(seq, force=True)
    +
    +            amplicon_sig = sourmash.SourmashSignature(mh_full, name=amplicon_name)
    +            self.logger.info("Amplicon sketching completed for: %s", amplicon_name)
    +            return amplicon_sig
    +
    +        except Exception as e:
    +            self.logger.error("Error during amplicon sketching: %s", e)
    +            raise
    +
    +    # *******************************
    +    # *        Exporting            *
    +    # *******************************
    +
    +    @staticmethod
    +    def export_sigs_to_zip(
    +        sigs: List[sourmash.SourmashSignature], output_file: str
    +    ) -> None:
    +        """
    +        Export a list of signatures to a ZIP file.
    +
    +        Args:
    +            sigs (List[sourmash.SourmashSignature]): List of Sourmash signatures.
    +            output_file (str): Path to the output ZIP file.
    +
    +        Raises:
    +            ValueError: If the output file does not have a .zip extension.
    +            FileExistsError: If the output file already exists.
    +        """
    +        if not output_file.lower().endswith(".zip"):
    +            raise ValueError("Output file must have a .zip extension.")
    +
    +        if os.path.exists(output_file): 
    +            raise FileExistsError("Output file already exists.")
    +
    +        try:
    +            with sourmash.save_load.SaveSignatures_ZipFile(output_file) as save_sigs:
    +                for signature in sigs:
    +                    save_sigs.add(signature)
    +        except Exception as e:
    +            logging.error("Failed to export signatures to zip: %s", e)
    +            raise
    +
    +
    + + + +
    + + + + + + + + + +
    + + +

    + __init__(enable_logging) + +

    + + +
    + +

    Initialize the SnipeSketch instance.

    + + +

    Parameters:

    + + + + + + + + + + + + + + + + + +
    NameTypeDescriptionDefault
    + enable_logging + + bool + +
    +

    Flag to enable or disable logging.

    +
    +
    + required +
    + +
    + Source code in src/snipe/api/sketch.py +
    25
    +26
    +27
    +28
    +29
    +30
    +31
    +32
    +33
    def __init__(self, enable_logging: bool) -> None:
    +    """
    +    Initialize the SnipeSketch instance.
    +
    +    Args:
    +        enable_logging (bool): Flag to enable or disable logging.
    +    """
    +    self.logger = logging.getLogger(self.__class__.__name__)
    +    self._configure_logging(enable_logging)
    +
    +
    +
    + +
    + +
    + + +

    + amplicon_sketching(fasta_file, ksize=51, scale=10000, amplicon_name='amplicon') + +

    + + +
    + +

    Create a FracMinHash sketch for an amplicon.

    + + +

    Parameters:

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    NameTypeDescriptionDefault
    + fasta_file + + str + +
    +

    Path to the FASTA file.

    +
    +
    + required +
    + ksize + + int + +
    +

    K-mer size. Defaults to 51.

    +
    +
    + 51 +
    + scale + + int + +
    +

    Scaling factor for MinHash. Defaults to 10_000.

    +
    +
    + 10000 +
    + amplicon_name + + str + +
    +

    Name of the amplicon. Defaults to "amplicon".

    +
    +
    + 'amplicon' +
    + + +

    Returns:

    + + + + + + + + + + + + + +
    TypeDescription
    + SourmashSignature + +
    +

    sourmash.SourmashSignature: The resulting amplicon signature.

    +
    +
    + +
    + Source code in src/snipe/api/sketch.py +
    def amplicon_sketching(
    +    self,
    +    fasta_file: str,
    +    ksize: int = 51,
    +    scale: int = 10_000,
    +    amplicon_name: str = "amplicon",
    +) -> sourmash.SourmashSignature:
    +    """
    +    Create a FracMinHash sketch for an amplicon.
    +
    +    Args:
    +        fasta_file (str): Path to the FASTA file.
    +        ksize (int, optional): K-mer size. Defaults to 51.
    +        scale (int, optional): Scaling factor for MinHash. Defaults to 10_000.
    +        amplicon_name (str, optional): Name of the amplicon. Defaults to "amplicon".
    +
    +    Returns:
    +        sourmash.SourmashSignature: The resulting amplicon signature.
    +    """
    +    self.logger.info("Starting amplicon sketching for: %s", amplicon_name)
    +    try:
    +        fa_reader = SequenceReader(fasta_file)
    +        mh_full = sourmash.MinHash(n=0, ksize=ksize, scaled=scale)
    +        for _, seq in fa_reader:
    +            mh_full.add_sequence(seq, force=True)
    +
    +        amplicon_sig = sourmash.SourmashSignature(mh_full, name=amplicon_name)
    +        self.logger.info("Amplicon sketching completed for: %s", amplicon_name)
    +        return amplicon_sig
    +
    +    except Exception as e:
    +        self.logger.error("Error during amplicon sketching: %s", e)
    +        raise
    +
    +
    +
    + +
    + +
    + + +

    + export_sigs_to_zip(sigs, output_file) + + + staticmethod + + +

    + + +
    + +

    Export a list of signatures to a ZIP file.

    + + +

    Parameters:

    + + + + + + + + + + + + + + + + + + + + + + + +
    NameTypeDescriptionDefault
    + sigs + + List[SourmashSignature] + +
    +

    List of Sourmash signatures.

    +
    +
    + required +
    + output_file + + str + +
    +

    Path to the output ZIP file.

    +
    +
    + required +
    + + +

    Raises:

    + + + + + + + + + + + + + + + + + +
    TypeDescription
    + ValueError + +
    +

    If the output file does not have a .zip extension.

    +
    +
    + FileExistsError + +
    +

    If the output file already exists.

    +
    +
    + +
    + Source code in src/snipe/api/sketch.py +
    @staticmethod
    +def export_sigs_to_zip(
    +    sigs: List[sourmash.SourmashSignature], output_file: str
    +) -> None:
    +    """
    +    Export a list of signatures to a ZIP file.
    +
    +    Args:
    +        sigs (List[sourmash.SourmashSignature]): List of Sourmash signatures.
    +        output_file (str): Path to the output ZIP file.
    +
    +    Raises:
    +        ValueError: If the output file does not have a .zip extension.
    +        FileExistsError: If the output file already exists.
    +    """
    +    if not output_file.lower().endswith(".zip"):
    +        raise ValueError("Output file must have a .zip extension.")
    +
    +    if os.path.exists(output_file): 
    +        raise FileExistsError("Output file already exists.")
    +
    +    try:
    +        with sourmash.save_load.SaveSignatures_ZipFile(output_file) as save_sigs:
    +            for signature in sigs:
    +                save_sigs.add(signature)
    +    except Exception as e:
    +        logging.error("Failed to export signatures to zip: %s", e)
    +        raise
    +
    +
    +
    + +
    + +
    + + +

    + parallel_genome_sketching(fasta_file, cores=1, ksize=51, scale=10000, assigned_genome_name='full_genome', **kwargs) + +

    + + +
    + +

    Perform parallel genome sketching from a FASTA file.

    + + +

    Parameters:

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    NameTypeDescriptionDefault
    + fasta_file + + str + +
    +

    Path to the FASTA file.

    +
    +
    + required +
    + cores + + int + +
    +

    Number of parallel cores. Defaults to 1.

    +
    +
    + 1 +
    + ksize + + int + +
    +

    K-mer size. Defaults to 51.

    +
    +
    + 51 +
    + scale + + int + +
    +

    Scaling factor for MinHash. Defaults to 10_000.

    +
    +
    + 10000 +
    + assigned_genome_name + + str + +
    +

    Name for the genome signature. Defaults to "full_genome".

    +
    +
    + 'full_genome' +
    + **kwargs + + Any + +
    +

    Additional keyword arguments.

    +
    +
    + {} +
    + + +

    Returns:

    + + + + + + + + + + + + + +
    TypeDescription
    + Tuple[SourmashSignature, Dict[str, SourmashSignature]] + +
    +

    Tuple[sourmash.SourmashSignature, Dict[str, sourmash.SourmashSignature]]: +The full genome signature and a dictionary of chromosome signatures.

    +
    +
    + +
    + Source code in src/snipe/api/sketch.py +
    def parallel_genome_sketching(
    +    self,
    +    fasta_file: str,
    +    cores: int = 1,
    +    ksize: int = 51,
    +    scale: int = 10_000,
    +    assigned_genome_name: str = "full_genome",
    +    **kwargs: Any,
    +) -> Tuple[sourmash.SourmashSignature, Dict[str, sourmash.SourmashSignature]]:
    +    """
    +    Perform parallel genome sketching from a FASTA file.
    +
    +    Args:
    +        fasta_file (str): Path to the FASTA file.
    +        cores (int, optional): Number of parallel cores. Defaults to 1.
    +        ksize (int, optional): K-mer size. Defaults to 51.
    +        scale (int, optional): Scaling factor for MinHash. Defaults to 10_000.
    +        assigned_genome_name (str, optional): Name for the genome signature. Defaults to "full_genome".
    +        **kwargs (Any): Additional keyword arguments.
    +
    +    Returns:
    +        Tuple[sourmash.SourmashSignature, Dict[str, sourmash.SourmashSignature]]:
    +            The full genome signature and a dictionary of chromosome signatures.
    +    """
    +    self.logger.info("Starting parallel genome sketching with %d cores.", cores)
    +    fa_reader = SequenceReader(fasta_file, comment=True)
    +    mh_full = sourmash.MinHash(n=0, ksize=ksize, scaled=scale)
    +    chr_to_mh: Dict[str, sourmash.MinHash] = {}
    +
    +    mh_lock = threading.Lock()
    +    chr_lock = threading.Lock()
    +
    +    def process_sequence(
    +        name: str, seq: str, comment: Optional[str]
    +    ) -> None:
    +        header = f"{name} {comment}" if comment else name
    +        seq_type, seq_name = self.parse_fasta_header(header)
    +        current_mh = sourmash.MinHash(n=0, ksize=ksize, scaled=scale, track_abundance=True)
    +        current_mh.add_sequence(seq, force=True)
    +
    +        with mh_lock:
    +            mh_full.merge(current_mh)
    +
    +        if seq_type in {"sex", "autosome"}:
    +            with chr_lock:
    +                key = f"{seq_type}-{seq_name}"
    +                if key not in chr_to_mh:
    +                    chr_to_mh[key] = current_mh
    +                else:
    +                    chr_to_mh[key].merge(current_mh)
    +
    +    try:
    +        with concurrent.futures.ThreadPoolExecutor(max_workers=cores) as executor:
    +            futures = [
    +                executor.submit(
    +                    process_sequence,
    +                    name,
    +                    seq,
    +                    comment
    +                )
    +                for name, seq, comment in fa_reader
    +            ]
    +
    +            for future in concurrent.futures.as_completed(futures):
    +                try:
    +                    future.result()
    +                except Exception as e:
    +                    self.logger.error("Error processing sequence: %s", e)
    +
    +    except Exception as e:
    +        self.logger.error("Error during parallel genome sketching: %s", e)
    +        raise
    +
    +    mh_full_signature = sourmash.SourmashSignature(mh_full, name=assigned_genome_name)
    +    chr_signatures = {
    +        name: sourmash.SourmashSignature(mh, name=name)
    +        for name, mh in chr_to_mh.items()
    +    }
    +
    +    self.logger.info("Parallel genome sketching completed.")
    +    return mh_full_signature, chr_signatures
    +
    +
    +
    + +
    + +
    + + +

    + parse_fasta_header(header) + +

    + + +
    + +

    Parse a FASTA header and categorize the sequence type.

    + + +

    Parameters:

    + + + + + + + + + + + + + + + + + +
    NameTypeDescriptionDefault
    + header + + str + +
    +

    The FASTA header string.

    +
    +
    + required +
    + + +

    Returns:

    + + + + + + + + + + + + + +
    TypeDescription
    + Tuple[str, str] + +
    +

    Tuple[str, str]: A tuple containing the sequence type and name.

    +
    +
    + +
    + Source code in src/snipe/api/sketch.py +
    def parse_fasta_header(self, header: str) -> Tuple[str, str]:
    +    """
    +    Parse a FASTA header and categorize the sequence type.
    +
    +    Args:
    +        header (str): The FASTA header string.
    +
    +    Returns:
    +        Tuple[str, str]: A tuple containing the sequence type and name.
    +    """
    +    full_header = header.strip()
    +    header_lower = full_header.lower()
    +
    +    if header_lower.startswith(">"):
    +        header_lower = header_lower[1:]
    +        full_header = full_header[1:]
    +
    +    seq_type = "unknown"
    +    name = "unknown"
    +
    +    patterns = {
    +        "scaffold": re.compile(r"\b(scaffold|unplaced|unlocalized)\b"),
    +        "contig": re.compile(r"\bcontig\b"),
    +        "mitochondrial DNA": re.compile(r"\b(mt|mitochondrion|mitochondrial|mitochondria|mito|mtdna)\b"),
    +        "chloroplast DNA": re.compile(r"\b(chloroplast|cpdna|plastid)\b"),
    +        "plasmid": re.compile(r"\bplasmid\b"),
    +        "chromosome": re.compile(r"\bchromosome\b|\bchr\b"),
    +        "reference chromosome": re.compile(r"NC_\d{6}\.\d+"),
    +    }
    +
    +    for stype, pattern in patterns.items():
    +        if pattern.search(header_lower):
    +            if stype in {"scaffold", "contig", "plasmid"}:
    +                match = re.match(r"(\S+)", full_header)
    +                name = match.group(1) if match else "unknown"
    +            elif stype in {"mitochondrial DNA", "chloroplast DNA"}:
    +                name = stype.split()[0]
    +            elif stype == "chromosome":
    +                match = re.search(r"(?:chromosome|chr)[_\s]*([^\s,]+)", header_lower)
    +                if match:
    +                    name = match.group(1).rstrip(".,")
    +                    if name.upper() in {"X", "Y", "W", "Z"}:
    +                        stype = "sex"
    +                    else:
    +                        stype = "autosome"
    +            elif stype == "reference chromosome":
    +                match = pattern.search(full_header)
    +                if match and not (patterns["scaffold"].search(header_lower) or patterns["contig"].search(header_lower)):
    +                    name = match.group()
    +            return stype, name
    +
    +    return seq_type, name
    +
    +
    +
    + +
    + +
    + + +

    + process_sequences(fasta_file, thread_id, total_threads, progress_queue, batch_size=100000, ksize=51, scaled=10000) + +

    + + +
    + +

    Process a subset of sequences to create a FracMinHash sketch.

    +

    Each process creates its own MinHash instance and processes sequences +assigned based on the thread ID. Progress is reported via a shared queue.

    + + +

    Parameters:

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    NameTypeDescriptionDefault
    + fasta_file + + str + +
    +

    Path to the FASTA file.

    +
    +
    + required +
    + thread_id + + int + +
    +

    Identifier for the current thread.

    +
    +
    + required +
    + total_threads + + int + +
    +

    Total number of threads.

    +
    +
    + required +
    + progress_queue + + Queue + +
    +

    Queue for reporting progress.

    +
    +
    + required +
    + batch_size + + int + +
    +

    Number of sequences per progress update. Defaults to 100_000.

    +
    +
    + 100000 +
    + ksize + + int + +
    +

    K-mer size. Defaults to 51.

    +
    +
    + 51 +
    + scaled + + int + +
    +

    Scaling factor for MinHash. Defaults to 10_000.

    +
    +
    + 10000 +
    + + +

    Returns:

    + + + + + + + + + + + + + +
    TypeDescription
    + MinHash + +
    +

    sourmash.MinHash: The resulting FracMinHash sketch.

    +
    +
    + +
    + Source code in src/snipe/api/sketch.py +
    def process_sequences(
    +    self,
    +    fasta_file: str,
    +    thread_id: int,
    +    total_threads: int,
    +    progress_queue: multiprocessing.Queue,
    +    batch_size: int = 100_000,
    +    ksize: int = 51,
    +    scaled: int = 10_000,
    +) -> sourmash.MinHash:
    +    """
    +    Process a subset of sequences to create a FracMinHash sketch.
    +
    +    Each process creates its own MinHash instance and processes sequences
    +    assigned based on the thread ID. Progress is reported via a shared queue.
    +
    +    Args:
    +        fasta_file (str): Path to the FASTA file.
    +        thread_id (int): Identifier for the current thread.
    +        total_threads (int): Total number of threads.
    +        progress_queue (multiprocessing.Queue): Queue for reporting progress.
    +        batch_size (int, optional): Number of sequences per progress update. Defaults to 100_000.
    +        ksize (int, optional): K-mer size. Defaults to 51.
    +        scaled (int, optional): Scaling factor for MinHash. Defaults to 10_000.
    +
    +    Returns:
    +        sourmash.MinHash: The resulting FracMinHash sketch.
    +    """
    +    self._register_signal_handler()
    +    try:
    +        fa_reader = SequenceReader(fasta_file)
    +        mh = sourmash.MinHash(
    +            n=0, ksize=ksize, scaled=scaled, track_abundance=True
    +        )
    +        local_count = 0
    +
    +        for idx, (_, seq) in enumerate(fa_reader):
    +            if idx % total_threads == thread_id:
    +                mh.add_sequence(seq, force=True)
    +                local_count += 1
    +
    +                if local_count >= batch_size:
    +                    progress_queue.put(batch_size)
    +                    local_count = 0
    +
    +        if local_count > 0:
    +            progress_queue.put(local_count)
    +
    +        self.logger.debug(
    +            "Thread %d processed %d hashes.", thread_id, len(mh)
    +        )
    +        return mh
    +
    +    except KeyboardInterrupt:
    +        self.logger.warning("KeyboardInterrupt detected in process_sequences.")
    +        sys.exit(0)
    +    except Exception as e:
    +        self.logger.error("Error in process_sequences: %s", e)
    +        raise
    +
    +
    +
    + +
    + +
    + + +

    + progress_monitor(progress_queue, progress_interval, total_threads, stop_event) + +

    + + +
    + +

    Monitor and display the progress of sequence processing.

    + + +

    Parameters:

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    NameTypeDescriptionDefault
    + progress_queue + + Queue + +
    +

    Queue for receiving progress updates.

    +
    +
    + required +
    + progress_interval + + int + +
    +

    Interval for progress updates.

    +
    +
    + required +
    + total_threads + + int + +
    +

    Number of processing threads.

    +
    +
    + required +
    + stop_event + + Event + +
    +

    Event to signal the monitor to stop.

    +
    +
    + required +
    + +
    + Source code in src/snipe/api/sketch.py +
    def progress_monitor(
    +    self,
    +    progress_queue: multiprocessing.Queue,
    +    progress_interval: int,
    +    total_threads: int,
    +    stop_event: threading.Event,
    +) -> None:
    +    """
    +    Monitor and display the progress of sequence processing.
    +
    +    Args:
    +        progress_queue (multiprocessing.Queue): Queue for receiving progress updates.
    +        progress_interval (int): Interval for progress updates.
    +        total_threads (int): Number of processing threads.
    +        stop_event (threading.Event): Event to signal the monitor to stop.
    +    """
    +    total = 0
    +    next_update = progress_interval
    +    try:
    +        while not stop_event.is_set() or not progress_queue.empty():
    +            try:
    +                count = progress_queue.get(timeout=0.5)
    +                total += count
    +                if total >= next_update:
    +                    print(f"\rProcessed {next_update:,} sequences.", end="", flush=True)
    +                    next_update += progress_interval
    +            except queue.Empty:
    +                continue
    +    except Exception as e:
    +        self.logger.error("Error in progress_monitor: %s", e)
    +    finally:
    +        print(f"\rProcessed {total:,} sequences in total.")
    +
    +
    +
    + +
    + +
    + + +

    + sample_sketch(sample_name, filename, num_processes, batch_size, ksize, scale, **kwargs) + +

    + + +
    + +

    Generate a sketch for a sample and return its signature.

    + + +

    Parameters:

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    NameTypeDescriptionDefault
    + sample_name + + str + +
    +

    Name of the sample.

    +
    +
    + required +
    + filename + + str + +
    +

    Path to the input FASTA file.

    +
    +
    + required +
    + num_processes + + int + +
    +

    Number of processes to use.

    +
    +
    + required +
    + batch_size + + int + +
    +

    Batch size for processing.

    +
    +
    + required +
    + ksize + + int + +
    +

    K-mer size.

    +
    +
    + required +
    + scale + + int + +
    +

    Scaling factor.

    +
    +
    + required +
    + **kwargs + + Any + +
    +

    Additional keyword arguments.

    +
    +
    + {} +
    + + +

    Returns:

    + + + + + + + + + + + + + +
    TypeDescription
    + SourmashSignature + +
    +

    sourmash.SourmashSignature: The generated signature.

    +
    +
    + + +

    Raises:

    + + + + + + + + + + + + + +
    TypeDescription
    + RuntimeError + +
    +

    If an error occurs during sketching.

    +
    +
    + +
    + Source code in src/snipe/api/sketch.py +
    def sample_sketch(
    +    self,
    +    sample_name: str,
    +    filename: str,
    +    num_processes: int,
    +    batch_size: int,
    +    ksize: int,
    +    scale: int,
    +    **kwargs: Any,
    +) -> sourmash.SourmashSignature:
    +    """
    +    Generate a sketch for a sample and return its signature.
    +
    +    Args:
    +        sample_name (str): Name of the sample.
    +        filename (str): Path to the input FASTA file.
    +        num_processes (int): Number of processes to use.
    +        batch_size (int): Batch size for processing.
    +        ksize (int): K-mer size.
    +        scale (int): Scaling factor.
    +        **kwargs (Any): Additional keyword arguments.
    +
    +    Returns:
    +        sourmash.SourmashSignature: The generated signature.
    +
    +    Raises:
    +        RuntimeError: If an error occurs during sketching.
    +    """
    +    self.logger.info("Starting sample sketch for: %s", sample_name)
    +    try:
    +        signature = self._sketch_sample(
    +            sample_name=sample_name,
    +            fasta_file=filename,
    +            num_processes=num_processes,
    +            batch_size=batch_size,
    +            k_size=ksize,
    +            scale=scale,
    +            **kwargs,
    +        )
    +        self.logger.info("Sample sketch completed for: %s", sample_name)
    +        return signature
    +    except Exception as e:
    +        self.logger.error(
    +            "Error occurred during sample sketching: %s", str(e)
    +        )
    +        raise RuntimeError("Error occurred during sample sketching.") from e
    +
    +
    +
    + +
    + + + +
    + +
    + +
    + + + + +
    + +
    + +
    + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    + + + +
    + +
    + + + +
    +
    +
    +
    + + + + + + + + + + + + + + \ No newline at end of file diff --git a/SnipeSig/index.html b/SnipeSig/index.html index 0dcfb28..ea55ff8 100644 --- a/SnipeSig/index.html +++ b/SnipeSig/index.html @@ -421,16 +421,16 @@