-
Notifications
You must be signed in to change notification settings - Fork 0
/
consolidate.py
62 lines (49 loc) · 1.95 KB
/
consolidate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
from tqdm import tqdm
import re
import pandas as pd
en_lang_pairs_count = {}
def add_pair(en_word, lang_word):
if type(en_word) is not str or type(lang_word) is not str:
return
en_word = en_word.replace('"', '').strip().lower().rstrip('.')
lang_word = lang_word.replace('"', '').strip().rstrip('.')
key = (en_word, lang_word)
if len(key[0])<2 or len(key[1])<2:
return
if key in en_lang_pairs_count:
en_lang_pairs_count[key] += 1
else:
en_lang_pairs_count[key] = 1
return
###################################################################
# https://github.com/linuxkathirvel/eng2tamildictionary
import json
data = json.load(open('raw/eng2tamildictionary/dictionary.json', encoding="utf-8"))
for record in tqdm(data, desc='eng2tamildictionary'):
try:
en_word, lang_words = record["eng"], record["tamil"]
except:
if 'word_list' in record:
continue
raise
en_word = re.sub('\([^)]*?\)', '', en_word) # Remove brackets
lang_words = re.sub('\([^)]*?\)', '', lang_words) # Remove brackets
lang_words = re.sub("[A-Za-z]\.?", ' ', lang_words) # Remove English words from tamil
lang_words = re.sub("-[1-9]", ' ', lang_words) # Remove numbered bullets
for lang_word in lang_words.split(','):
add_pair(en_word, lang_word)
# https://github.com/abuvanth/english-tamil-dictionary-api
# Same as above, so ignore.
# https://github.com/sathia27/dictionary
# Bad format. TODO: Clean and parse
# import sqlite3, os
# con = sqlite3.connect('raw/dictionary/word.db', isolation_level=None, detect_types=sqlite3.PARSE_COLNAMES)
# df = pd.read_sql_query("SELECT * FROM words", con)
# os.makedirs('tmp', exist_ok=True)
# df.to_csv("tmp/dict.csv")
## WRITE
out = open('consolidated.tsv', 'w', encoding='utf-8')
out.write(f"ENG\tLANG\tCOUNT\n")
for (en_word, lang_word), count in en_lang_pairs_count.items():
out.write(f"{en_word}\t{lang_word}\t{count}\n")
out.close()