Skip to content

Commit

Permalink
✨ Allow passing vcf as 2nd arg to macros (#45) (#46)
Browse files Browse the repository at this point in the history
* ✨ Allow passing vcf as 2nd arg to macros (#45)

* ⬆️ Update dependencies

* 🔖 0.6.0
  • Loading branch information
pwwang authored Sep 27, 2023
1 parent 5f1b6ec commit da19dc3
Show file tree
Hide file tree
Showing 31 changed files with 816 additions and 756 deletions.
5 changes: 5 additions & 0 deletions docs/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# Change Log

## 0.6.0

- ✨ Allow passing vcf as 2nd arg to macros (#45)
- ⬆️ Update dependencies

## 0.5.0

- ➕ Use argx instead of pyparam
Expand Down
13 changes: 13 additions & 0 deletions docs/macros.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,19 @@ def GTTYPEs(variant)

To get the genotype of the first sample (index 0) in a formula, use `GTTYPEs{0}`. You can also refer to a sample by name: `GTTYPEs{some sample}`.

A macro can also accept `vcf` (the `cyvcf2.VCF` instance) as its second argument. For example:

```python
from vcfstats.macros import cont

@cont
def MIXED_INFO(variant, vcf):
    ...
```

Check the [API documentation](https://brentp.github.io/cyvcf2/docstrings.html) of `cyvcf2` to see what information we can get from `vcf`.


## Macros with filters

`aggregation`s have a different syntax for filters. Here we discuss filters for `continuous` and `categorical` macros.
Expand Down
Binary file modified examples/allele-frequency-on-chromosome-12.density.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified examples/allele-frequency-on-each-chromosome-boxplot.boxplot.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified examples/allele-frequency-on-each-chromosome.violin.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified examples/depths-between-sample-1-and-2.scatter.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified examples/gq-vs-depth-sample-1.scatter.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified examples/mutant-genotypes-on-each-chromosome-sample-1.col.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified examples/number-of-substitutions-of-snps-passed.col.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified examples/number-of-substitutions-of-snps.col.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified examples/number-of-variants-on-each-chromosome-first-5.col.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified examples/number-of-variants-on-each-chromosome-modified.col.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified examples/number-of-variants-on-each-chromosome.col.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified examples/overall-allele-frequency-distribution.histogram.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified examples/types-of-variants-on-chromosome-1.pie.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified examples/types-of-variants-on-each-chromosome.col.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified examples/types-of-variants-on-whole-genome.col.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified examples/types-of-variants-on-whole-genome.pie.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1,402 changes: 707 additions & 695 deletions poetry.lock

Large diffs are not rendered by default.

9 changes: 4 additions & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "poetry.masonry.api"

[tool.poetry]
name = "vcfstats"
version = "0.5.0"
version = "0.6.0"
description = "Powerful statistics for VCF files"
authors = [ "pwwang <[email protected]>",]
license = "MIT"
Expand All @@ -17,12 +17,11 @@ include = ["vcfstats/args.toml"]
python = "^3.8"
cyvcf2 = "^0.30"
lark-parser = "^0.12"
plotnine = "^0.10"
plotnine-prism = "^0.2"
plotnine-prism = "^0.3"
python-slugify = "^8"
datar = { version = "^0.11", extras = ["pandas"] }
datar = { version = "^0.13", extras = ["pandas"] }
py = "^1.10"
argx = "^0.1.0"
argx = "^0.2"
rich = "^13"

[tool.poetry.dev-dependencies]
Expand Down
74 changes: 37 additions & 37 deletions tests/test_formula.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,28 +109,28 @@ def test_term_init():

def test_term_run(variants):
term = Term("FILTER", "PASS")
assert term.run(variants[0], passed=True) == False
assert term.run(variants[5], passed=True) == ["PASS"]
assert term.run(variants[0], None, passed=True) == False
assert term.run(variants[5], None, passed=True) == ["PASS"]

term = Term("FILTER2")
assert term.run(variants[5], passed=True) == False
assert term.run(variants[5], None, passed=True) == False

term = Term("GTTYPEs", None, ["0"])
term.set_samples(variants[-1])
assert term.run(variants[0], passed=False) == ["HOM_REF"]
assert term.run(variants[0], None, passed=False) == ["HOM_REF"]

term = Term("AAF", [0.126, None])
# .125
assert term.run(variants[0], passed=False) == False
assert term.run(variants[2], passed=False) == [0.25]
assert term.run(variants[0], None, passed=False) == False
assert term.run(variants[2], None, passed=False) == [0.25]
term = Term("AAF", [None, 0.24])
# .25
assert term.run(variants[0], passed=False) == [0.125]
assert term.run(variants[2], passed=False) == False
assert term.run(variants[0], None, passed=False) == [0.125]
assert term.run(variants[2], None, passed=False) == False

term = Term("FILTER", "PASS")
assert term.run(variants[0], passed=False) == False
assert term.run(variants[5], passed=False) == ["PASS"]
assert term.run(variants[0], None, passed=False) == False
assert term.run(variants[5], None, passed=False) == ["PASS"]


def test_aggr_init():
Expand Down Expand Up @@ -194,54 +194,54 @@ def test_aggr_run(variants):
aggr = Aggr(
"COUNT", One(), filter=Term("FILTER", ["PASS"]), group=Term("VARTYPE")
)
aggr.run(variants[0], passed=True)
aggr.run(variants[0], None, passed=True)
assert len(aggr.cache) == 0

aggr2 = Aggr("COUNT", One(), filter=Term("FILTER", ["PASS"]))
with pytest.raises(RuntimeError):
aggr2.run(variants[5], passed=True)
aggr2.run(variants[5], None, passed=True)

aggr3 = Aggr(
"COUNT", One(), filter=Term("FILTER", ["PASS"]), group=Term("FILTER2")
)
aggr3.run(variants[5], passed=False)
aggr3.run(variants[5], None, passed=False)
assert len(aggr3.cache) == 0

aggr4 = Aggr("COUNT", One(), Term("GTTYPEs"))
with pytest.raises(ValueError):
aggr4.run(variants[0], passed=False)
aggr4.run(variants[0], None, passed=False)

aggr5 = Aggr("COUNT", One(), Term("VARTYPE"))
aggr5.run(variants[0], passed=False)
aggr5.run(variants[0], None, passed=False)
assert aggr5.cache == {"snp": [1]}
aggr5.run(variants[1], passed=False)
aggr5.run(variants[1], None, passed=False)
assert aggr5.cache == {"snp": [1, 1]}
aggr5.run(variants[3], passed=False)
aggr5.run(variants[3], None, passed=False)
assert aggr5.cache == {"snp": [1, 1], "indel": [1]}

assert aggr5.dump() == {"snp": 2, "indel": 1}

aggr5.cache.clear()
aggr5.setxgroup(Term("FILTER", None))
aggr5.run(variants[0], passed=False)
aggr5.run(variants[0], None, passed=False)
assert aggr5.cache == {"MinMQ": {"snp": [1]}}

aggr5.cache.clear()
aggr5.setxgroup(Term("FILTER2", None))
aggr5.run(variants[0], passed=False)
aggr5.run(variants[0], None, passed=False)
assert aggr5.cache == {"MinMQ": {"snp": [1]}}
aggr5.run(variants[5], passed=False)
aggr5.run(variants[5], None, passed=False)
assert aggr5.cache == {"MinMQ": {"snp": [1]}}
aggr5.run(variants[1], passed=False)
aggr5.run(variants[1], None, passed=False)
assert aggr5.cache == {"MinMQ": {"snp": [1, 1]}}
assert aggr5.dump() == {"MinMQ": [(2, "snp")]}

aggr5.setxgroup(Term("GTTYPEs", ["HOM_REF", "HET"]))
with pytest.raises(ValueError):
aggr5.run(variants[0], passed=False)
aggr5.run(variants[0], None, passed=False)

aggr6 = Aggr("MEAN", Term("AAF", [".2", None]), Term("CHROM"))
aggr6.run(variants[0], passed=False) # .125
aggr6.run(variants[0], None, passed=False) # .125
assert len(aggr6.cache) == 0


Expand Down Expand Up @@ -272,14 +272,14 @@ def test_formula_run(variants):
data = []
fmula = Formula("AFs{0,1} ~ GTTYPEs{0-2}", variants[-1], False, "title")
with pytest.raises(RuntimeError):
fmula.run(variants[0], data.append, data.extend)
fmula.run(variants[0], None, data.append, data.extend)

fmula = Formula("FILTER2 ~ CHROM", variants[-1], False, "title")
fmula.run(variants[5], data.append, data.extend)
fmula.run(variants[5], None, data.append, data.extend)
assert data == []

fmula = Formula("GTTYPEs ~ CHROM", variants[-1], False, "title")
fmula.run(variants[0], data.append, data.extend)
fmula.run(variants[0], None, data.append, data.extend)
assert data == [
("HOM_REF", "1"),
("HOM_REF", "1"),
Expand All @@ -289,7 +289,7 @@ def test_formula_run(variants):

data = []
fmula = Formula("CHROM ~ GTTYPEs", variants[-1], False, "title")
fmula.run(variants[0], data.append, data.extend)
fmula.run(variants[0], None, data.append, data.extend)
assert data == [
("1", "HOM_REF"),
("1", "HOM_REF"),
Expand All @@ -304,8 +304,8 @@ def test_formula_run(variants):
False,
"title",
)
fmula.run(variants[0], data.append, data.extend)
fmula.run(variants[1], data.append, data.extend)
fmula.run(variants[0], None, data.append, data.extend)
fmula.run(variants[1], None, data.append, data.extend)
assert data == []
fmula.done(data.append, data.extend)
assert data == [(2, 2, "1")]
Expand All @@ -317,7 +317,7 @@ def test_formula_run(variants):
False,
"title",
)
fmula.run(variants[0], data.append, data.extend)
fmula.run(variants[0], None, data.append, data.extend)
assert data == []

with pytest.raises(ValueError):
Expand All @@ -329,24 +329,24 @@ def test_formula_run(variants):
)

fmula = Formula("COUNT(1) ~ CHROM", variants[-1], False, "title")
fmula.run(variants[0], data.append, data.extend)
fmula.run(variants[1], data.append, data.extend)
fmula.run(variants[0], None, data.append, data.extend)
fmula.run(variants[1], None, data.append, data.extend)
assert data == []
assert fmula.Y.cache == {"1": [1, 1]}
fmula.done(data.append, data.extend)
assert data == [(2, "1")]

fmula = Formula("CHROM ~ COUNT(1)", variants[-1], False, "title")
with pytest.raises(TypeError):
fmula.run(variants[0], data.append, data.extend)
fmula.run(variants[0], None, data.append, data.extend)

data = []
fmula = Formula(
"COUNT(1, group = VARTYPE) ~ CHROM", variants[-1], False, "title"
)
fmula.run(variants[0], data.append, data.extend)
fmula.run(variants[1], data.append, data.extend)
fmula.run(variants[2], data.append, data.extend)
fmula.run(variants[3], data.append, data.extend)
fmula.run(variants[0], None, data.append, data.extend)
fmula.run(variants[1], None, data.append, data.extend)
fmula.run(variants[2], None, data.append, data.extend)
fmula.run(variants[3], None, data.append, data.extend)
fmula.done(data.append, data.extend)
assert data == [(3, "1", "snp"), (1, "1", "indel")]
2 changes: 1 addition & 1 deletion tests/test_instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ def test_one_iterate(tmp_path):
False,
)
with pytest.raises(AttributeError):
instance.iterate(None)
instance.iterate(None, None)


# def test_summarize(instance):
Expand Down
20 changes: 20 additions & 0 deletions tests/test_macros.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from cyvcf2 import VCF

from vcfstats.macros import (
cat,
_ONE,
TITV,
VARTYPE,
Expand All @@ -26,6 +27,12 @@
HERE = Path(__file__).parent.resolve()


@cat
def mixedinfo(variant, vcf):
    """Two-argument test macro that mixes header and record information.

    Args:
        variant: The variant record; only its ``CHROM`` attribute is read.
        vcf: The VCF object passed as the second macro argument; only its
            ``raw_header`` attribute is read.

    Returns:
        The first six characters of ``vcf.raw_header`` followed by
        ``variant.CHROM``.
    """
    return vcf.raw_header[:6] + variant.CHROM


@pytest.fixture(scope="module")
def variants():
vcf = VCF(
Expand All @@ -35,6 +42,19 @@ def variants():
return list(vcf)


@pytest.fixture(scope="module")
def variants_vcf():
    """Provide the sample variants together with the VCF object they came from.

    Opens ``examples/sample.vcf`` (resolved relative to the parent of
    ``HERE``) with ``gts012=True``.

    Returns:
        A ``(variants, vcf)`` tuple: the list built by iterating the VCF
        object, and the VCF object itself (already iterated by ``list()``).
    """
    sample_path = HERE.parent / "examples" / "sample.vcf"
    reader = VCF(str(sample_path), gts012=True)
    records = list(reader)
    return records, reader


def test_variant_vcf(variants_vcf):
    """Check that a macro can be called with the VCF object as second argument.

    ``mixedinfo`` concatenates the six-character raw-header prefix with the
    first record's ``CHROM``; for the sample file the result is ``"##file1"``.
    """
    records, reader = variants_vcf
    first_record = records[0]
    assert mixedinfo(first_record, reader) == "##file1"


def test_vartype(variants):
assert VARTYPE(variants[0]) == "snp"
assert VARTYPE(variants[1]) == "snp"
Expand Down
2 changes: 1 addition & 1 deletion vcfstats/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@

from . import macros

__version__ = "0.5.0"
__version__ = "0.6.0"
2 changes: 1 addition & 1 deletion vcfstats/__main__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""Main entrance for `python -m vcfstats`"""

# pragma: no cover
from .cli import main

if __name__ == "__main__":
Expand Down
2 changes: 1 addition & 1 deletion vcfstats/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ def main():
for i, variant in enumerate(vcf):
for instance in ones:
# save entries, cache aggr
instance.iterate(variant)
instance.iterate(variant, vcf)
if i % 10000 == 0: # pragma: no cover
logger.debug("- %s variants read.", i)
logger.info(
Expand Down
29 changes: 16 additions & 13 deletions vcfstats/formula.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,11 +124,14 @@ def __eq__(self, other):
def __ne__(self, other):
return not self.__eq__(other)

def run(self, variant, passed):
def run(self, variant, vcf, passed):
"""Run the variant"""
if passed and variant.FILTER:
return False
value = self.term["func"](variant)
if self.term["nargs"] == 2:
value = self.term["func"](variant, vcf)
else:
value = self.term["func"](variant)

if value is False or value is None:
return False
Expand Down Expand Up @@ -206,21 +209,21 @@ def setxgroup(self, xvar):
else:
self.xgroup = xvar

def run(self, variant, passed):
def run(self, variant, vcf, passed):
"""Run each variant"""
if self.filter and self.filter.run(variant, passed) is False:
if self.filter and self.filter.run(variant, vcf, passed) is False:
return

if not self.group:
raise RuntimeError(
"No group specified, don't know how to aggregate."
)

group = self.group.run(variant, passed)
group = self.group.run(variant, vcf, passed)
if group is False:
return

value = self.term.run(variant, passed)
value = self.term.run(variant, vcf, passed)
if value is False:
return

Expand All @@ -233,7 +236,7 @@ def run(self, variant, passed):

xgroup = None
if self.xgroup:
xgroup = self.xgroup.run(variant, passed)
xgroup = self.xgroup.run(variant, vcf, passed)
if xgroup is False:
return
if len(xgroup) > 1 and len(value) != len(xgroup):
Expand Down Expand Up @@ -312,12 +315,12 @@ def __init__(self, formula, samples, passed, title):
):
self.passed = False

def run(self, variant, data_append, data_extend):
def run(self, variant, vcf, data_append, data_extend):
"""Run each variant"""
if isinstance(self.Y, Term) and isinstance(self.X, Term):
yvar, xvar = (
self.Y.run(variant, self.passed),
self.X.run(variant, self.passed),
self.Y.run(variant, vcf, self.passed),
self.X.run(variant, vcf, self.passed),
)
if yvar is False or xvar is False:
return
Expand All @@ -336,10 +339,10 @@ def run(self, variant, data_append, data_extend):

data_extend(((yvar[i], rvar) for i, rvar in enumerate(xvar)))
elif isinstance(self.Y, Aggr) and isinstance(self.X, Aggr):
self.Y.run(variant, self.passed)
self.X.run(variant, self.passed)
self.Y.run(variant, vcf, self.passed)
self.X.run(variant, vcf, self.passed)
elif isinstance(self.Y, Aggr) and isinstance(self.X, Term):
self.Y.run(variant, self.passed)
self.Y.run(variant, vcf, self.passed)
else:
raise TypeError(
"Cannot do 'TERM ~ AGGREGATION'. "
Expand Down
Loading

0 comments on commit da19dc3

Please sign in to comment.