-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgender_list.py
109 lines (100 loc) · 2.99 KB
/
gender_list.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#-*- coding:utf-8 -*-
"""gender_list.py
"""
from __future__ import print_function
import sys, re,codecs
def gender_freq(lines):
    """Count how often each gender tag occurs in the synonym lists.

    Only lines inside an entry are examined: an entry opens with a line
    starting with '<L>' and closes with a line starting with '<LEND>'.
    Within an entry, the first '<syns><s>...</s>' group of a line is read
    as a comma-separated list of 'word-gender' items, for example

        <eid>2294<syns><s>DArA-strI,praDi-puMstrI,nemi-strI</s>

    An item without a hyphen is counted under the gender 'ajYAta'.

    Args:
        lines: iterable of text lines (line endings already removed).

    Returns:
        dict mapping each gender string to its number of occurrences.
    """
    d = {}  # gender:count dictionary. Returned
    syns_pattern = re.compile(r'<syns><s>(.*?)</s>')
    item_separator = re.compile(r' *, *')
    in_entry = False
    for line in lines:
        if line.startswith('<L>'):
            in_entry = True
            continue
        if line.startswith('<LEND>'):
            in_entry = False
            continue
        if not in_entry:
            # text between entries is ignored
            continue
        m = syns_pattern.search(line)
        if m is None:
            continue
        for syngen in item_separator.split(m.group(1)):
            # Each item is echoed to stdout, as the script has always done.
            print(syngen)
            if '-' in syngen:
                # Split on the LAST hyphen only, so an item with more than
                # one hyphen no longer raises ValueError on unpacking.
                # NOTE(review): assumes the gender is the final
                # hyphen-separated part - confirm against the data.
                gen = syngen.rsplit('-', 1)[1]
            else:
                gen = 'ajYAta'
            d[gen] = d.get(gen, 0) + 1
    return d
def get_gender_names():
    """Return a new dict mapping gender abbreviations to descriptions.

    Most descriptions have the form '<expanded name>,<English gloss>'.
    The value '?' marks an abbreviation with no description recorded.

    Spelling fixes relative to the earlier table: 'indeclineable' ->
    'indeclinable' (matching the 'klIa' entry), 'kIba' -> 'klIba' and
    'musculine' -> 'masculine' in the 'tri' entry.

    Returns:
        dict of str -> str; a fresh dict is built on every call.
    """
    return {
        'a': 'avyayaH,indeclinable',
        'klI': 'klIba,neuter',
        'klIba': 'klIba bahuvacana,neuter plural',
        'klIdvi': 'klIba dvivacana,neuter dual',
        'klIa': 'klIba avyayaH vA,neuter or indeclinable',
        'puM': 'puMs,masculine',
        'puMba': 'puMs bahuvacana,masculine plural',
        'puMdvi': 'puMs dvivacana,masculine dual',
        'puMklI': 'puMs klIba vA,masculine or neuter',
        'puMklIba': 'puMs klIba vA bahuvacana,masculine or neuter plural',
        'puMklIdvi': 'puMs klIba vA dvivacana,masculine or neuter dual',
        'puMstrI': 'puMs strI vA,masculine or feminine',
        'puMstrIba': 'puMs strI vA bahuvacana,masculine or feminine plural',
        'sa': '?',
        'strI': 'strI,feminine',
        'strIba': 'strI bahuvacana,feminine plural',
        'strIdvi': 'strI dvivacana,feminine dual',
        'strIklI': 'strI klIba vA,feminine or neuter',
        'tri': 'puMs strI klIba vA, masculine feminine or neuter',
        'vA': '?',
        'vApuMklI': '?',
        'ajYAta': 'ajYAta, unknown',
    }
def make_outarr_1(dgen):
    """Format the gender counts as report lines, one line per gender.

    Each line is built with '%04d %s %s': the count zero-padded to four
    digits, the gender left-justified to ten characters, and the
    description returned by get_gender_names() for that gender.

    Args:
        dgen: dict mapping gender string to count.

    Returns:
        list of formatted strings, ordered by sorted gender key.
    """
    gender_names_d = get_gender_names()
    outarr = []
    for gen in sorted(dgen):  # Latin alphabetical order
        # The genders come from the data, so one may be missing from the
        # hand-written table.  Use '?', the table's own placeholder for
        # "no description", instead of aborting with KeyError.
        tip = gender_names_d.get(gen, '?')
        outarr.append('%04d %s %s' % (dgen[gen], gen.ljust(10), tip))
    return outarr
def make_outarr_2(dgen):
    """Preliminary helper: emit source lines for an empty gender-name table.

    The first line initialises a dict named d; it is followed by one
    assignment line per gender in dgen, in sorted key order, each giving
    that gender an empty-string value.

    Args:
        dgen: dict whose keys are gender strings.

    Returns:
        list of strings.
    """
    skeleton = [' d = {}']
    skeleton.extend(" d['%s'] = ''" % name for name in sorted(dgen.keys()))
    return skeleton
def write_gender(fileout,dgen):
    """Write the gender report for dgen to fileout as UTF-8 text.

    The report lines come from make_outarr_1(dgen); each is written
    followed by a newline.  A one-line summary is then printed to stdout.

    Args:
        fileout: path of the file to create or overwrite.
        dgen: dict mapping gender string to count.
    """
    report = make_outarr_1(dgen)
    with codecs.open(fileout, 'w', 'utf-8') as out:
        out.write(''.join(row + '\n' for row in report))
    print(len(report), "records written to", fileout)
if __name__ == "__main__":
    # Command line: <input text file> <output file for the gender counts>
    filein = sys.argv[1]
    fileout = sys.argv[2]
    # Read the whole input, dropping trailing CR/LF from every line.
    lines = []
    with codecs.open(filein, 'r', 'utf-8') as f:
        for raw in f:
            lines.append(raw.rstrip('\r\n'))
    # Count the genders and write the report.
    write_gender(fileout, gender_freq(lines))