Skip to content

Commit

Permalink
adding Gujarati language vocabulary
Browse files Browse the repository at this point in the history
  • Loading branch information
Juneja Sarjil authored and bhavya-work committed Dec 2, 2024
1 parent a522f0e commit d9f0eef
Showing 1 changed file with 17 additions and 2 deletions.
19 changes: 17 additions & 2 deletions doctr/datasets/vocabs.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,23 @@
"hindi_letters": "अआइईउऊऋॠऌॡएऐओऔअंअःकखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसह",
"hindi_digits": "०१२३४५६७८९",
"hindi_punctuation": "।,?!:्ॐ॰॥॰",
"gujarati_consonants":"કખગઘઙચછજઝઞટઠડઢણતથદધનપફબભમયરલવશષસહષજ્ઞ",
"gujarati_vowels": "અઆઇઈઉઊઋએઐઓઔઅંઅઃ ",
"gujarati_digits":"૦૧૨૩૪૫૬૭૮૯",
"gujarati_diacritics":"""કકાકિકીકુકૂકૃકેકૈકોકૌકંકઃ ખખાખિખીખુખૂખૃખેખૈખોખૌખંખઃ ગગાગિગીગુગૂગૃગેગૈગોગૌગંગઃ ઘઘાઘિઘીઘુઘૂઘૃઘેઘૈઘોઘૌઘંઘઃ ઙઙાઙિઙીઙુઙૂઙૃઙેઙૈઙોઙૌઙંઙઃ ચચાચિચીચુચૂચૃચેચૈચોચૌચંચઃ
છછાછિછીછુછૂછૃછેછૈછોછૌછંછઃ જજાજિજીજુજુજૃજેજૈજોજૌજંજઃ ઝઝાઝિઝીઝુઝૂઝૃઝેઝૈઝોઝૌઝંઝઃ ઞઞાઞિઞીઞુઞૂઞૃઞેઞૈઞોઞૌઞંઞઃ ટટાટિટીટુટૂટૃટેટૈટોટૌટંટઃ ઠઠાઠિઠીઠુઠૂઠૃઠેઠૈઠોઠૌઠંઠઃ ડડાડિડીડુડૂડૃડેડૈડોડૌડંડઃ ઢઢાઢિઢીઢુઢૂઢૃઢેઢૈઢોઢૌઢંઢઃ ણણાણિણીણુણૂણૃણેણૈણોણૌણંણઃ
તતાતિતીતુતૂતૃતેતૈતોતૌતંતઃ થથાથિથીથુથૂથૃથીથૈથોથૌથંથઃ દદાદિદીદુદૂદૃદેદૈદોદૌદંદઃ ધધાધિધીધુધૂધૃધેધૈધોધૌધંધઃ નનાનિનીનુનૂનૃનેનૈનોનૌનંનઃ પપાપિપીપુપૂપૃપેપૈપોપૌપંપઃ ફફાફિફીફુફૂફૃફેફૈફોફૌફંફઃ બબાબિબીબુબૂબૃબેબૈબોબૌબંબઃ ભભાભિભીભુભૂભૃભેભૈભોભૌભંભઃ
મમામિમીમુમૂમૃમેમામોમાયમંમઃ યયાયિયીયુયુયૃયેયૈયોયૌયંયઃ રરારિરીરૂરૃરેરૈરોરૌરંરઃ લલાલિલીલુલૂલૃલેલૈલોલૌલંલઃ વવાવિવીવિવૂવૃવેવૈવોવૈવંવઃ શશાશિશીશુશૂશૃશેશૈશોશૌશંશઃ ષષાષિષીષુષૂષૃષેષૈષોષૌષંષઃ જ્ઞજ્ઞાજ્ઞિજ્ઞીજ્ઞુજ્ઞૂજ્ઞૃજ્ઞેજ્ઞૈજ્ઞોજ્ઞૌજ્ઞંજ્ઞઃ """,
"gujarati_punctuation":",.!?:;'()[]-_/|\✶૰૱`'",
"bangla_letters": "অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহ়ঽািীুূৃেৈোৌ্ৎংঃঁ",
"bangla_digits": "০১২৩৪৫৬৭৮৯",
"generic_cyrillic_letters": "абвгдежзийклмнопрстуфхцчшщьюяАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЬЮЯ",
}

VOCABS["latin"] = VOCABS["digits"] + VOCABS["ascii_letters"] + VOCABS["punctuation"]
VOCABS["english"] = VOCABS["latin"] + "°" + VOCABS["currency"]
VOCABS["english"] = VOCABS["latin"] + "©" +VOCABS["currency"].replace("¥",' ').replace('€','™').replace('¢','®')
VOCABS["legacy_french"] = VOCABS["latin"] + "°" + "àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ" + VOCABS["currency"]
VOCABS["french"] = VOCABS["english"] + "àâéèêëîïôùûüçÀÂÉÈÊËÎÏÔÙÛÜÇ"
VOCABS["french"] = VOCABS["english"] + "àâéèêëáîôùûüÀÂÉÈÊËÎÏÔÙÛÜÚÇ"

Check notice on line 42 in doctr/datasets/vocabs.py

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

doctr/datasets/vocabs.py#L42

Trailing whitespace
VOCABS["portuguese"] = VOCABS["english"] + "áàâãéêíïóôõúüçÁÀÂÃÉÊÍÏÓÔÕÚÜÇ"
VOCABS["spanish"] = VOCABS["english"] + "áéíóúüñÁÉÍÓÚÜÑ" + "¡¿"
VOCABS["italian"] = VOCABS["english"] + "àèéìíîòóùúÀÈÉÌÍÎÒÓÙÚ"
Expand Down Expand Up @@ -59,6 +67,13 @@
)
VOCABS["hebrew"] = VOCABS["english"] + "אבגדהוזחטיכלמנסעפצקרשת" + "₪"
VOCABS["hindi"] = VOCABS["hindi_letters"] + VOCABS["hindi_digits"] + VOCABS["hindi_punctuation"]
VOCABS["gujarati"] = (

Check notice on line 70 in doctr/datasets/vocabs.py

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

doctr/datasets/vocabs.py#L70

Trailing whitespace
VOCABS['gujarati_consonants']

Check notice on line 71 in doctr/datasets/vocabs.py

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

doctr/datasets/vocabs.py#L71

Trailing whitespace
+ VOCABS["gujarati_vowels"]

Check notice on line 72 in doctr/datasets/vocabs.py

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

doctr/datasets/vocabs.py#L72

Trailing whitespace
+ VOCABS['gujarati_digits']

Check notice on line 73 in doctr/datasets/vocabs.py

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

doctr/datasets/vocabs.py#L73

Trailing whitespace
+ VOCABS['gujarati_diacritics']

Check notice on line 74 in doctr/datasets/vocabs.py

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

doctr/datasets/vocabs.py#L74

Trailing whitespace
+ VOCABS['gujarati_punctuation']
)
VOCABS["bangla"] = VOCABS["bangla_letters"] + VOCABS["bangla_digits"]
VOCABS["ukrainian"] = (
VOCABS["generic_cyrillic_letters"] + VOCABS["digits"] + VOCABS["punctuation"] + VOCABS["currency"] + "ґіїєҐІЇЄ₴"
Expand Down

0 comments on commit d9f0eef

Please sign in to comment.