-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
37 lines (27 loc) · 1.01 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# This file provides code which you may or may not find helpful.
# Use it if you want, or ignore it.
import random
import xor_data as xd
# Student identification record; the name and ID values are each joined with "_".
STUDENT = dict(
    name='Daniel Greenspan_Eilon Bashari',
    ID='308243948_308576933',
)
def read_data(fname):
    """Read a tab-separated file of labelled examples.

    Every line is stripped of surrounding whitespace, lower-cased, and split
    on its first tab character into a label and a text.

    Args:
        fname: Path of the file to read.

    Returns:
        A list of ``(label, text)`` tuples, one per line, in file order.

    Raises:
        ValueError: If a stripped line contains no tab (this includes blank
            lines), because the split does not yield two fields.
    """
    data = []
    # ``open`` replaces the Python-2-only ``file`` builtin, and the ``with``
    # block closes the handle even when a malformed line raises.
    with open(fname) as handle:
        for line in handle:
            # maxsplit=1 keeps any further tabs inside the text field.
            label, text = line.strip().lower().split("\t", 1)
            data.append((label, text))
    return data
def text_to_bigrams(text):
    """Return the adjacent-element pairs of *text*, each joined into one string.

    For a string this is the list of its character bigrams in order. An input
    with fewer than two elements yields an empty list.
    """
    bigrams = []
    # Pairing the sequence with itself shifted by one walks adjacent elements.
    for left, right in zip(text, text[1:]):
        bigrams.append("%s%s" % (left, right))
    return bigrams
def _bigram_examples(fname):
    """Load *fname* with read_data and replace each text by its bigram list."""
    examples = []
    for label, text in read_data(fname):
        examples.append((label, text_to_bigrams(text)))
    return examples


# Both datasets are built at import time from the files named "train" and "dev".
TRAIN = _bigram_examples("train")
DEV = _bigram_examples("dev")
# Each two-element entry of xd.data, re-packed as a (label, data) tuple.
XOR = [(label, point) for label, point in xd.data]
from collections import Counter
# Number of occurrences of every bigram across all training examples.
fc = Counter(bigram for _label, bigrams in TRAIN for bigram in bigrams)
# Vocabulary: the (up to) 600 most frequent bigrams in the training set.
vocab = {bigram for bigram, _count in fc.most_common(600)}
# Label string -> integer ID, numbered in sorted label order.
L2I = {
    label: idx
    for idx, label in enumerate(sorted({label for label, _feats in TRAIN}))
}
# Feature string (bigram) -> integer ID, numbered in sorted bigram order.
F2I = {bigram: idx for idx, bigram in enumerate(sorted(vocab))}