diff --git a/docs/source/modules/datasets.rst b/docs/source/modules/datasets.rst index 2104012d2..4a73772c0 100644 --- a/docs/source/modules/datasets.rst +++ b/docs/source/modules/datasets.rst @@ -169,19 +169,19 @@ of vocabs. - 115 - абвгдежзийклмнопрстуфхцчшщьюяАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЬЮЯ0123456789!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿ґіїєҐІЇЄ₴ * - vietnamese - - 236 - - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿áàảạãăắằẳẵặâấầẩẫậđéèẻẽẹêếềểễệóòỏõọôốồổộỗơớờởợỡúùủũụưứừửữựiíìỉĩịýỳỷỹỵÁÀẢẠÃĂẮẰẲẴẶÂẤẦẨẪẬĐÉÈẺẼẸÊẾỀỂỄỆÓÒỎÕỌÔỐỒỔỘỖƠỚỜỞỢỠÚÙỦŨỤƯỨỪỬỮỰIÍÌỈĨỊÝỲỶỸỴ + - 234 + - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿áàảạãăắằẳẵặâấầẩẫậđéèẻẽẹêếềểễệóòỏõọôốồổộỗơớờởợỡúùủũụưứừửữựíìỉĩịýỳỷỹỵÁÀẢẠÃĂẮẰẲẴẶÂẤẦẨẪẬĐÉÈẺẼẸÊẾỀỂỄỆÓÒỎÕỌÔỐỒỔỘỖƠỚỜỞỢỠÚÙỦŨỤƯỨỪỬỮỰÍÌỈĨỊÝỲỶỸỴ * - hebrew - 123 - 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~°£€¥¢฿אבגדהוזחטיכלמנסעפצקרשת₪ * - hindi - - 71 - - अआइईउऊऋॠऌॡएऐओऔअंअःकखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसह०१२३४५६७८९।,?!:्ॐ॰॥॰ + - 68 + - अआइईउऊऋॠऌॡएऐओऔंःकखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसह०१२३४५६७८९।,?!:्ॐ॰॥ * - bangla - 70 - অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহ়ঽািীুূৃেৈোৌ্ৎংঃঁ০১২৩৪৫৬৭৮৯ * - gujarati - - 106 + - 104 - અઆઇઈઉઊઋએઐઓઔઅંઅઃકખગઘચછજઝઞટઠડઢણતથદધનપફબભમયરલવશષસહળક્ષજ્ઞ૦૧૨૩૪૫૬૭૮૯૰ઽ◌ંઃ॥ૐ઼ઁ!"#$%&'()*+,-./:;<=>?@[\]^_{|}~ * - multilingual - 195 diff --git a/doctr/datasets/vocabs.py b/doctr/datasets/vocabs.py index 91c5b215d..29716804a 100644 --- a/doctr/datasets/vocabs.py +++ b/doctr/datasets/vocabs.py @@ -19,9 +19,9 @@ "arabic_digits": "٠١٢٣٤٥٦٧٨٩", "arabic_diacritics": "ًٌٍَُِّْ", "arabic_punctuation": "؟؛«»—", - "hindi_letters": "अआइईउऊऋॠऌॡएऐओऔअंअःकखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसह", + "hindi_letters": "अआइईउऊऋॠऌॡएऐओऔंःकखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसह", "hindi_digits": "०१२३४५६७८९", - "hindi_punctuation": "।,?!:्ॐ॰॥॰", + "hindi_punctuation": "।,?!:्ॐ॰॥", "gujarati_vowels": "અઆઇઈઉઊઋએઐઓઔઅંઅઃ", "gujarati_consonants":"કખગઘચછજઝઞટઠડઢણતથદધનપફબભમયરલવશષસહળક્ષજ્ઞ", "gujarati_digits":"૦૧૨૩૪૫૬૭૮૯", @@ -57,8 +57,8 @@ VOCABS["swedish"] = VOCABS["english"] + "åäöÅÄÖ" VOCABS["vietnamese"] = ( VOCABS["english"] - + "áàảạãăắằẳẵặâấầẩẫậđéèẻẽẹêếềểễệóòỏõọôốồổộỗơớờởợỡúùủũụưứừửữựiíìỉĩịýỳỷỹỵ" - + "ÁÀẢẠÃĂẮẰẲẴẶÂẤẦẨẪẬĐÉÈẺẼẸÊẾỀỂỄỆÓÒỎÕỌÔỐỒỔỘỖƠỚỜỞỢỠÚÙỦŨỤƯỨỪỬỮỰIÍÌỈĨỊÝỲỶỸỴ" + + "áàảạãăắằẳẵặâấầẩẫậđéèẻẽẹêếềểễệóòỏõọôốồổộỗơớờởợỡúùủũụưứừửữựíìỉĩịýỳỷỹỵ" + + "ÁÀẢẠÃĂẮẰẲẴẶÂẤẦẨẪẬĐÉÈẺẼẸÊẾỀỂỄỆÓÒỎÕỌÔỐỒỔỘỖƠỚỜỞỢỠÚÙỦŨỤƯỨỪỬỮỰÍÌỈĨỊÝỲỶỸỴ" ) VOCABS["hebrew"] = VOCABS["english"] + "אבגדהוזחטיכלמנסעפצקרשת" + "₪" VOCABS["hindi"] = VOCABS["hindi_letters"] + VOCABS["hindi_digits"] + VOCABS["hindi_punctuation"] diff --git a/tests/common/test_datasets_vocabs.py b/tests/common/test_datasets_vocabs.py new file mode 100644 index 000000000..cd84bf7ac --- /dev/null +++ b/tests/common/test_datasets_vocabs.py @@ -0,0 +1,11 @@ +from collections import Counter + +from doctr.datasets import VOCABS + + +def test_vocabs_duplicates(): + for key, vocab in VOCABS.items(): + assert isinstance(vocab, str) + + duplicates = [char for char, count in Counter(vocab).items() if count > 1] + assert not duplicates, f"Duplicate characters in {key} vocab: {duplicates}"