Skip to content

Commit

Permalink
fix: gnomad_re should accept all nucleotide characters for ref/alt (#276)
Browse files Browse the repository at this point in the history

Co-authored-by: Alex H. Wagner, PhD <[email protected]>
  • Loading branch information
korikuzma and ahwagner authored Nov 8, 2023
1 parent 593508c commit 0419979
Show file tree
Hide file tree
Showing 4 changed files with 141 additions and 29 deletions.
5 changes: 4 additions & 1 deletion src/ga4gh/vrs/extras/translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,10 @@ class Translator:
"""

beacon_re = re.compile(r"(?P<chr>[^-]+)\s*:\s*(?P<pos>\d+)\s*(?P<ref>\w+)\s*>\s*(?P<alt>\w+)")
gnomad_re = re.compile(r"(?P<chr>[^-]+)-(?P<pos>\d+)-(?P<ref>[ACGTN]+)-(?P<alt>[ACGTN]+|\*|\.)", re.IGNORECASE)
gnomad_re = re.compile(
r"(?P<chr>[^-]+)-(?P<pos>\d+)-(?P<ref>[ACGTURYKMSWBDHVN]+)-(?P<alt>[ACGTURYKMSWBDHVN]+)",
re.IGNORECASE
)
hgvs_re = re.compile(r"[^:]+:[cgnpr]\.")
spdi_re = re.compile(r"(?P<ac>[^:]+):(?P<pos>\d+):(?P<del_len_or_seq>\w*):(?P<ins_seq>\w*)")

Expand Down
155 changes: 128 additions & 27 deletions tests/extras/cassettes/test_from_gnomad.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ interactions:
Connection:
- keep-alive
User-Agent:
- python-requests/2.28.1
- python-requests/2.31.0
method: GET
uri: http://localhost:5000/seqrepo/1/sequence/GRCh38:19?start=44908821&end=44908822
response:
Expand All @@ -23,7 +23,7 @@ interactions:
Content-Type:
- text/plain; charset=utf-8
Date:
- Mon, 16 Jan 2023 16:32:42 GMT
- Tue, 07 Nov 2023 23:05:59 GMT
Server:
- Werkzeug/2.2.2 Python/3.10.4
status:
Expand All @@ -39,33 +39,134 @@ interactions:
Connection:
- keep-alive
User-Agent:
- python-requests/2.28.1
- python-requests/2.31.0
method: GET
uri: http://localhost:5000/seqrepo/1/metadata/GRCh38:19
uri: http://localhost:5000/seqrepo/1/sequence/GRCh38:17?start=83129586&end=83129598
response:
body:
string: "{\n \"added\": \"2016-08-24T08:19:02Z\",\n \"aliases\": [\n \"Ensembl:19\",\n
\ \"ensembl:19\",\n \"GRCh38:19\",\n \"GRCh38:chr19\",\n \"GRCh38.p1:19\",\n
\ \"GRCh38.p1:chr19\",\n \"GRCh38.p10:19\",\n \"GRCh38.p10:chr19\",\n
\ \"GRCh38.p11:19\",\n \"GRCh38.p11:chr19\",\n \"GRCh38.p12:19\",\n
\ \"GRCh38.p12:chr19\",\n \"GRCh38.p2:19\",\n \"GRCh38.p2:chr19\",\n
\ \"GRCh38.p3:19\",\n \"GRCh38.p3:chr19\",\n \"GRCh38.p4:19\",\n \"GRCh38.p4:chr19\",\n
\ \"GRCh38.p5:19\",\n \"GRCh38.p5:chr19\",\n \"GRCh38.p6:19\",\n \"GRCh38.p6:chr19\",\n
\ \"GRCh38.p7:19\",\n \"GRCh38.p7:chr19\",\n \"GRCh38.p8:19\",\n \"GRCh38.p8:chr19\",\n
\ \"GRCh38.p9:19\",\n \"GRCh38.p9:chr19\",\n \"MD5:b0eba2c7bb5c953d1e06a508b5e487de\",\n
\ \"NCBI:NC_000019.10\",\n \"refseq:NC_000019.10\",\n \"SEGUID:AHxM5/L8jIX08UhBBkKXkiO5rhY\",\n
\ \"SHA1:007c4ce7f2fc8c85f4f148410642979223b9ae16\",\n \"VMC:GS_IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl\",\n
\ \"sha512t24u:IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl\",\n \"ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl\"\n
\ ],\n \"alphabet\": \"ACGNT\",\n \"length\": 58617616\n}\n"
string: GTTGWCACATGA
headers:
Connection:
- close
Content-Length:
- '1035'
- '12'
Content-Type:
- text/plain; charset=utf-8
Date:
- Tue, 07 Nov 2023 23:05:59 GMT
Server:
- Werkzeug/2.2.2 Python/3.10.4
status:
code: 200
message: OK
- request:
body: null
headers:
Accept:
- '*/*'
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
User-Agent:
- python-requests/2.31.0
method: GET
uri: http://localhost:5000/seqrepo/1/metadata/GRCh38:17
response:
body:
string: "{\n \"added\": \"2016-08-27T23:52:54Z\",\n \"aliases\": [\n \"GRCh38:17\",\n
\ \"GRCh38:chr17\",\n \"GRCh38.p1:17\",\n \"GRCh38.p1:chr17\",\n \"GRCh38.p10:17\",\n
\ \"GRCh38.p10:chr17\",\n \"GRCh38.p11:17\",\n \"GRCh38.p11:chr17\",\n
\ \"GRCh38.p12:17\",\n \"GRCh38.p12:chr17\",\n \"GRCh38.p2:17\",\n
\ \"GRCh38.p2:chr17\",\n \"GRCh38.p3:17\",\n \"GRCh38.p3:chr17\",\n
\ \"GRCh38.p4:17\",\n \"GRCh38.p4:chr17\",\n \"GRCh38.p5:17\",\n \"GRCh38.p5:chr17\",\n
\ \"GRCh38.p6:17\",\n \"GRCh38.p6:chr17\",\n \"GRCh38.p7:17\",\n \"GRCh38.p7:chr17\",\n
\ \"GRCh38.p8:17\",\n \"GRCh38.p8:chr17\",\n \"GRCh38.p9:17\",\n \"GRCh38.p9:chr17\",\n
\ \"MD5:f9a0fb01553adb183568e3eb9d8626db\",\n \"NCBI:NC_000017.11\",\n
\ \"refseq:NC_000017.11\",\n \"SEGUID:s2Skupj8o6wdjf0aPrgOipAr67Q\",\n
\ \"SHA1:b364a4ba98fca3ac1d8dfd1a3eb80e8a902bebb4\",\n \"VMC:GS_dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7\",\n
\ \"sha512t24u:dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7\",\n \"ga4gh:SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7\"\n
\ ],\n \"alphabet\": \"ACGKNRSTWY\",\n \"length\": 83257441\n}\n"
headers:
Connection:
- close
Content-Length:
- '1004'
Content-Type:
- application/json
Date:
- Tue, 07 Nov 2023 23:05:59 GMT
Server:
- Werkzeug/2.2.2 Python/3.10.4
status:
code: 200
message: OK
- request:
body: null
headers:
Accept:
- '*/*'
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
User-Agent:
- python-requests/2.31.0
method: GET
uri: http://localhost:5000/seqrepo/1/sequence/GRCh38:7?start=1&end=17
response:
body:
string: NNNNNNNNNNNNNNNN
headers:
Connection:
- close
Content-Length:
- '16'
Content-Type:
- text/plain; charset=utf-8
Date:
- Tue, 07 Nov 2023 23:05:59 GMT
Server:
- Werkzeug/2.2.2 Python/3.10.4
status:
code: 200
message: OK
- request:
body: null
headers:
Accept:
- '*/*'
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
User-Agent:
- python-requests/2.31.0
method: GET
uri: http://localhost:5000/seqrepo/1/metadata/GRCh38:7
response:
body:
string: "{\n \"added\": \"2016-08-27T21:23:35Z\",\n \"aliases\": [\n \"GRCh38:7\",\n
\ \"GRCh38:chr7\",\n \"GRCh38.p1:7\",\n \"GRCh38.p1:chr7\",\n \"GRCh38.p10:7\",\n
\ \"GRCh38.p10:chr7\",\n \"GRCh38.p11:7\",\n \"GRCh38.p11:chr7\",\n
\ \"GRCh38.p12:7\",\n \"GRCh38.p12:chr7\",\n \"GRCh38.p2:7\",\n \"GRCh38.p2:chr7\",\n
\ \"GRCh38.p3:7\",\n \"GRCh38.p3:chr7\",\n \"GRCh38.p4:7\",\n \"GRCh38.p4:chr7\",\n
\ \"GRCh38.p5:7\",\n \"GRCh38.p5:chr7\",\n \"GRCh38.p6:7\",\n \"GRCh38.p6:chr7\",\n
\ \"GRCh38.p7:7\",\n \"GRCh38.p7:chr7\",\n \"GRCh38.p8:7\",\n \"GRCh38.p8:chr7\",\n
\ \"GRCh38.p9:7\",\n \"GRCh38.p9:chr7\",\n \"MD5:cc044cc2256a1141212660fb07b6171e\",\n
\ \"NCBI:NC_000007.14\",\n \"refseq:NC_000007.14\",\n \"SEGUID:4+JjCcBVhPCr8vdIhUKFycPv8bY\",\n
\ \"SHA1:e3e26309c05584f0abf2f748854285c9c3eff1b6\",\n \"VMC:GS_F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul\",\n
\ \"sha512t24u:F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul\",\n \"ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul\"\n
\ ],\n \"alphabet\": \"ACGNRSTY\",\n \"length\": 159345973\n}\n"
headers:
Connection:
- close
Content-Length:
- '977'
Content-Type:
- application/json
Date:
- Mon, 16 Jan 2023 16:32:42 GMT
- Tue, 07 Nov 2023 23:05:59 GMT
Server:
- Werkzeug/2.2.2 Python/3.10.4
status:
Expand All @@ -81,7 +182,7 @@ interactions:
Connection:
- keep-alive
User-Agent:
- python-requests/2.28.1
- python-requests/2.31.0
method: GET
uri: http://localhost:5000/seqrepo/1/sequence/GRCh38:13?start=32936731&end=32936732
response:
Expand All @@ -95,7 +196,7 @@ interactions:
Content-Type:
- text/plain; charset=utf-8
Date:
- Mon, 16 Jan 2023 16:32:42 GMT
- Tue, 07 Nov 2023 23:05:59 GMT
Server:
- Werkzeug/2.2.2 Python/3.10.4
status:
Expand All @@ -111,7 +212,7 @@ interactions:
Connection:
- keep-alive
User-Agent:
- python-requests/2.28.1
- python-requests/2.31.0
method: GET
uri: http://localhost:5000/seqrepo/1/sequence/GRCh38:13?start=32936731&end=32936732
response:
Expand All @@ -125,7 +226,7 @@ interactions:
Content-Type:
- text/plain; charset=utf-8
Date:
- Mon, 16 Jan 2023 16:32:42 GMT
- Tue, 07 Nov 2023 23:05:59 GMT
Server:
- Werkzeug/2.2.2 Python/3.10.4
status:
Expand All @@ -141,7 +242,7 @@ interactions:
Connection:
- keep-alive
User-Agent:
- python-requests/2.28.1
- python-requests/2.31.0
method: GET
uri: http://localhost:5000/seqrepo/1/sequence/GRCh38:13?start=32936731&end=32936732
response:
Expand All @@ -155,7 +256,7 @@ interactions:
Content-Type:
- text/plain; charset=utf-8
Date:
- Mon, 16 Jan 2023 16:32:42 GMT
- Tue, 07 Nov 2023 23:05:59 GMT
Server:
- Werkzeug/2.2.2 Python/3.10.4
status:
Expand All @@ -171,7 +272,7 @@ interactions:
Connection:
- keep-alive
User-Agent:
- python-requests/2.28.1
- python-requests/2.31.0
method: GET
uri: http://localhost:5000/seqrepo/1/metadata/GRCh38:13
response:
Expand All @@ -197,7 +298,7 @@ interactions:
Content-Type:
- application/json
Date:
- Mon, 16 Jan 2023 16:32:42 GMT
- Tue, 07 Nov 2023 23:05:59 GMT
Server:
- Werkzeug/2.2.2 Python/3.10.4
status:
Expand Down
8 changes: 8 additions & 0 deletions tests/extras/test_translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,14 @@ def test_from_beacon(tlr):
def test_from_gnomad(tlr):
assert tlr._from_gnomad(snv_inputs["gnomad"]).as_dict() == snv_output

assert tlr._from_gnomad("17-83129587-GTTGWCACATGA-G")

# Test valid characters
assert tlr._from_gnomad(
"7-2-ACGTURYKMSWBDHVN-ACGTURYKMSWBDHVN",
require_validation=False
)

# Invalid input. Ref does not match regex
assert not tlr._from_gnomad("13-32936732-helloworld-C")

Expand Down
2 changes: 1 addition & 1 deletion tests/extras/test_vcf_annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,4 +87,4 @@ def test_get_vrs_object_invalid_input(vcf_annotator, caplog):

# No ALT
vcf_annotator._get_vrs_object("7-140753336-A-.", {}, [], "GRCh38")
assert "ValidationError when translating 7-140753336-A-. from gnomad" in caplog.text
assert "None was returned when translating 7-140753336-A-. from gnomad" in caplog.text

0 comments on commit 0419979

Please sign in to comment.