Skip to content

Commit

Permalink
Do not keep empty blocks (#690, thanks @Tong-Chen) (#692)
Browse files Browse the repository at this point in the history
* debug bp seq

* Do not keep empty blocks (#690, thanks @Tong-Chen)

* Add test_empty_blocks

* move filter_blocks outside print_to_file
  • Loading branch information
tanghaibao authored Jul 22, 2024
1 parent d647955 commit 0e61632
Show file tree
Hide file tree
Showing 9 changed files with 665 additions and 21 deletions.
54 changes: 39 additions & 15 deletions jcvi/compara/base.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from collections import defaultdict
from typing import Dict, Tuple

from ..apps.base import logger
from ..formats.base import BaseFile, read_block, must_open
Expand Down Expand Up @@ -48,29 +49,52 @@ def make_ranges(self, order, clip=10):
ranges.append(r)
return ranges, block_pairs

def print_to_file(self, filename="stdout", accepted=None):
fw = must_open(filename, "w")
blocks = self.blocks
def filter_blocks(self, accepted: Dict[Tuple[str, str], str]):
"""
Filter the blocks based on the accepted pairs. This is used to update
the anchors so that they match the info in the LAST file.
"""
new_blocks = []
nremoved = 0
ncorrected = 0
for block in blocks:
print("###", file=fw)
nblocks_removed = 0
for block in self.blocks:
new_block = []
for line in block:
a, b, score = line
pair = (a, b)
if accepted:
if pair not in accepted:
nremoved += 1
continue
av = accepted[pair]
if score != av and score != av + "L":
score = av
ncorrected += 1
print("\t".join((a, b, score)), file=fw)
fw.close()
if pair not in accepted:
nremoved += 1
continue
av = accepted[pair]
if score != av and score != av + "L":
score = av
ncorrected += 1
new_block.append((a, b, score))
if new_block:
new_blocks.append(new_block)
else:
nblocks_removed += 1

logger.debug("Removed %d existing anchors", nremoved)
if nblocks_removed:
logger.debug("Removed %d empty blocks", nblocks_removed)
logger.debug("Corrected scores for %d anchors", ncorrected)
self.blocks = new_blocks

def print_to_file(self, filename="stdout"):
"""
Print the anchors to a file, optionally filtering them based on the
accepted pairs.
"""
fw = must_open(filename, "w")
for block in self.blocks:
print("###", file=fw)
for line in block:
a, b, score = line
print("\t".join((a, b, score)), file=fw)
fw.close()

logger.debug("Anchors written to `%s`", filename)

def blast(self, blastfile=None, outfile=None):
Expand Down
6 changes: 4 additions & 2 deletions jcvi/compara/synteny.py
Original file line number Diff line number Diff line change
Expand Up @@ -1869,9 +1869,11 @@ def liftover(args):
ac.blocks[block_id].append((query, subject, str(score) + "L"))
lifted += 1

logger.debug("{} new pairs found (dist={}).".format(lifted, dist))
logger.debug("%d new pairs found (dist=%d).", lifted, dist)
newanchorfile = anchor_file.rsplit(".", 1)[0] + ".lifted.anchors"
ac.print_to_file(filename=newanchorfile, accepted=accepted)
if accepted:
ac.filter_blocks(accepted)
ac.print_to_file(filename=newanchorfile)
summary([newanchorfile])

return newanchorfile
Expand Down
14 changes: 10 additions & 4 deletions jcvi/formats/maf.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@
from .base import BaseFile, logger


FLANK = 60


class Maf(BaseFile, dict):
def __init__(self, filename, index=False):
super().__init__(filename)
Expand Down Expand Up @@ -119,26 +122,29 @@ def breakpoints(args):
logger.info("Total alignments: %d", len(filtered_msa))

final = []
# Load the sequences
ar = next(SeqIO.parse(a_fasta, "fasta"))
br = next(SeqIO.parse(b_fasta, "fasta"))
for bp in bps:
i = bisect(filtered_msa, (bp,))
_, arec, brec = filtered_msa[i]
logger.info("Breakpoint at %d")
logger.info("%s", arec)
logger.info("%s", brec)
assert len(arec) == len(brec)
# Find the midpoint, safe to crossover there
midpoint = len(arec) // 2
aseq = arec.seq[:midpoint]
astart = arec.annotations["start"] + len(aseq) - aseq.count("-")
logger.info("%s|%s", aseq[-FLANK:], arec.seq[midpoint:][:FLANK])
bseq = brec.seq[:midpoint]
bstart = brec.annotations["start"] + len(bseq) - bseq.count("-")
logger.info("%s|%s", bseq[-FLANK:], brec.seq[midpoint:][:FLANK])
bpt = Breakpoint(arec.id, astart, brec.id, bstart)
logger.info("-" * FLANK * 2 + ">")
logger.info("%s|%s", ar.seq[:astart][-FLANK:], br.seq[bstart:][:FLANK])
final.append(bpt)

logger.info("Breakpoints found: %s", final)
# Load the sequences
ar = next(SeqIO.parse(a_fasta, "fasta"))
br = next(SeqIO.parse(b_fasta, "fasta"))
if len(final) == 2:
bp1, bp2 = final[:2]
# ====-------=======
Expand Down
12 changes: 12 additions & 0 deletions tests/compara/synteny.py/inputs/test_empty_blocks.a.bed
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
Chr4 54561438 54564013 evm.model.Chr4.1717 0 +
Chr4 54565362 54567326 evm.model.Chr4.1718.1 0 -
Chr4 121756560 121761288 evm.model.Chr4.3572 0 +
Chr4 121933314 121948423 evm.model.Chr4.3580 0 -
Chr4 180331940 180335210 evm.model.Chr4.5631 0 +
Chr4 180428386 180432960 evm.model.Chr4.5633 0 +
Chr4 180435251 180437141 evm.model.Chr4.5634 0 -
Chr4 217516296 217519765 evm.model.Chr4.7127 0 +
Chr4 217519781 217522525 evm.model.Chr4.7128 0 -
Chr4 227171285 227174253 evm.model.Chr4.7543 0 -
Chr4 227302006 227306154 evm.model.Chr4.7546 0 +
Chr4 231178304 231189633 evm.model.Chr4.7694 0 +
19 changes: 19 additions & 0 deletions tests/compara/synteny.py/inputs/test_empty_blocks.anchors
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
###
evm.model.Chr4.5631 mrna.AA559650.t1 919
evm.model.Chr4.5633 mrna.AA559640.t1 1600
evm.model.Chr4.5634 mrna.AA559630.t1 571
###
evm.model.Chr4.7694 mrna.AA560110.t1 2300
evm.model.Chr4.7694 mrna.AA560120.t1 1410
###
evm.model.Chr4.1717 mrna.AA564650.t1 1210
evm.model.Chr4.1718.1 mrna.AA564640.t1 389
###
evm.model.Chr4.7543 mrna.AA563300.t1 356
evm.model.Chr4.7546 mrna.AA563320.t1 639
###
evm.model.Chr4.3572 mrna.AA563680.t1 2030
evm.model.Chr4.3580 mrna.AA563690.t1 411
###
evm.model.Chr4.7127 mrna.AA565190.t1 1210
evm.model.Chr4.7128 mrna.AA565170.t1 1580
13 changes: 13 additions & 0 deletions tests/compara/synteny.py/inputs/test_empty_blocks.b.bed
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
PKPP01013534 12975 15522 mrna.AA559630.t1 GCTI12_AA559630 +
PKPP01013534 18440 23163 mrna.AA559640.t1 GCTI12_AA559640 -
PKPP01013534 25546 29984 mrna.AA559650.t1 GCTI12_AA559650 -
PKPP01013569 17807 21064 mrna.AA560110.t1 GCTI12_AA560110 -
PKPP01013569 23741 29216 mrna.AA560120.t1 GCTI12_AA560120 -
PKPP01013816 18770 21277 mrna.AA564640.t1 GCTI12_AA564640 +
PKPP01013816 23831 25817 mrna.AA564650.t1 GCTI12_AA564650 -
PKPP01013847 17713 19380 mrna.AA563300.t1 GCTI12_AA563300 -
PKPP01013847 27303 28529 mrna.AA563320.t1 GCTI12_AA563320 +
PKPP01013878 9875 15811 mrna.AA563680.t1 GCTI12_AA563680 +
PKPP01013878 15786 19578 mrna.AA563690.t1 GCTI12_AA563690 -
PKPP01013995 6766 9385 mrna.AA565170.t1 GCTI12_AA565170 +
PKPP01013995 13891 18772 mrna.AA565190.t1 GCTI12_AA565190 -
Loading

0 comments on commit 0e61632

Please sign in to comment.