aggregation.py
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 28 14:18:00 2019
@author: i7 Laptop
"""
import re
import json
import collections

import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Load the annotated training questions (one JSON object per line).
# Stray backticks in the raw file break json.loads, so swap them for quotes.
data = []
with open(r'C:\Users\i7 Laptop\Documents\Masters articles\data\data\train1.jsonl') as annotated:
    for line in annotated:
        data.append(json.loads(line.replace("`", "'")))
print(data)
# Flatten the nested JSON records into a DataFrame, lowercase everything,
# and keep only the question text and the aggregation label (sql.agg).
df = pd.json_normalize(data)
df = df.apply(lambda x: x.astype(str).str.lower())
df = df.drop(['sql.conds', 'sql.sel', 'table_id', 'phase'], axis=1)
print(df)
def clean_text(text, remove_stopwords=True):
    """Lowercase the text, strip punctuation, and optionally drop English stopwords."""
    text = text.lower()
    text = re.sub(r'[_"\-;%()|+&*%.,!?:#$@\[\]/]', ' ', text)
    if remove_stopwords:
        stops = set(stopwords.words('english'))
        text = ' '.join(word for word in text.split() if word not in stops)
    return text
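# Quick sanity check of clean_text (the question below is a made-up example,
# not taken from the dataset): punctuation and the hyphen become spaces.
print(clean_text("Show me the 2-nd quarter!", remove_stopwords=False))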
aggs = df['sql.agg']
question = df['question']
print(question, aggs)

# Collect every token across all questions so the vocabulary is built from
# individual words rather than whole question strings.
words = []
for ques in question:
    words.extend(word_tokenize(clean_text(ques, remove_stopwords=False)))
def build_word_dict():
    """Map each word to an integer id, reserving ids 0-2 for special tokens."""
    word_counter = collections.Counter(words).most_common()
    word_dict = {"<pad>": 0, "<unk>": 1, "<eos>": 2}
    for word, _ in word_counter:
        word_dict[word] = len(word_dict)
    return word_dict
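# Sketch: with the reserved tokens in place, the vocabulary size is the
# number of distinct question words plus three.
print("vocabulary size:", len(build_word_dict()))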
def build_word_dataset(word_dict, document_max_len):
    """Encode each question as a fixed-length sequence of word ids."""
    x = list(map(lambda d: word_tokenize(clean_text(d, remove_stopwords=False)), df["question"]))
    x = list(map(lambda d: list(map(lambda w: word_dict.get(w, word_dict["<unk>"]), d)), x))
    # Append <eos>, truncate to document_max_len, then pad with <pad>.
    x = list(map(lambda d: d + [word_dict["<eos>"]], x))
    x = list(map(lambda d: d[:document_max_len], x))
    x = list(map(lambda d: d + (document_max_len - len(d)) * [word_dict["<pad>"]], x))
    # The lowercasing pass above stringified the labels, so cast back to int.
    y = [int(label) for label in df["sql.agg"]]
    return x, y
def build_char_dataset(document_max_len):
    """Encode each question as a fixed-length sequence of character ids."""
    alphabet = "abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/|_#$%^&*~`+=<>()[]{} "
    char_dict = {"<pad>": 0, "<unk>": 1}
    for c in alphabet:
        char_dict[c] = len(char_dict)
    alphabet_size = len(alphabet) + 2  # characters plus the two special tokens
    # Map each character of the question itself (not the whole Series) to its id.
    x = list(map(lambda content: [char_dict.get(ch, char_dict["<unk>"]) for ch in content.lower()], df["question"]))
    x = list(map(lambda d: d[:document_max_len], x))
    x = list(map(lambda d: d + (document_max_len - len(d)) * [char_dict["<pad>"]], x))
    y = [int(label) for label in df["sql.agg"]]
    return x, y, alphabet_size
def batch_iter(inputs, outputs, batch_size, num_epochs):
    """Yield (inputs, outputs) mini-batches for the given number of epochs."""
    inputs = np.array(inputs)
    outputs = np.array(outputs)
    num_batches_per_epoch = (len(inputs) - 1) // batch_size + 1
    for _ in range(num_epochs):
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, len(inputs))
            yield inputs[start_index:end_index], outputs[start_index:end_index]
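# A minimal end-to-end sketch of how these helpers fit together; the
# document_max_len, batch_size, and num_epochs values are illustrative
# assumptions, not values from the original script.
word_dict = build_word_dict()
train_x, train_y = build_word_dataset(word_dict, document_max_len=32)
# For a character-level model instead:
#   char_x, char_y, alphabet_size = build_char_dataset(document_max_len=128)
for batch_x, batch_y in batch_iter(train_x, train_y, batch_size=64, num_epochs=1):
    print(batch_x.shape, batch_y.shape)  # (batch_size, document_max_len), (batch_size,)
    break  # inspect only the first batch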