From 5df5b6b782d253e07476a66c7fb889a74d668413 Mon Sep 17 00:00:00 2001
From: Wonseok Hwang
Date: Mon, 30 Sep 2019 17:10:48 +0900
Subject: [PATCH 1/4] Fix bug of using text type for numeric columns. Also,
 code linted

---
 add_csv.py | 77 +++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 67 insertions(+), 10 deletions(-)

diff --git a/add_csv.py b/add_csv.py
index 0f37bd0..275f126 100755
--- a/add_csv.py
+++ b/add_csv.py
@@ -8,14 +8,20 @@
 # All columns are treated as text - no attempt is made to sniff the type of value
 # stored in the column.
 
-import argparse, csv, json, os
+import argparse, csv, json, os, re
 
 from sqlalchemy import Column, create_engine, MetaData, String, Table
 
+
 def get_table_name(table_id):
     return 'table_{}'.format(table_id)
 
-def csv_to_sqlite(table_id, csv_file_name, sqlite_file_name):
+
+def csv_to_sqlite(table_id, csv_file_name, sqlite_file_name, working_folder='.'):
+    sqlite_file_name = os.path.join(working_folder, sqlite_file_name)
+    csv_file_name = os.path.join(working_folder, csv_file_name)
+
     engine = create_engine('sqlite:///{}'.format(sqlite_file_name))
+
     with open(csv_file_name) as f:
         metadata = MetaData(bind=engine)
         cf = csv.DictReader(f, delimiter=',')
@@ -30,29 +36,80 @@ def csv_to_sqlite(table_id, csv_file_name, sqlite_file_name):
         table.insert().values(**row).execute()
     return engine
 
-def csv_to_json(table_id, csv_file_name, json_file_name):
+
+def is_num(val):
+    pattern = re.compile(r'[-+]?\d*\.\d+|\d+')
+    if pattern.search(val):
+        return True
+    else:
+        return False
+
+
+def get_types(rows):
+    types = []
+    row1 = rows[0]
+    types = []
+    for val in row1:
+        if is_num(val):
+            types.append('real')
+        else:
+            types.append('text')
+    return types
+
+
+def get_refined_rows(rows, types):
+    real_idx = []
+    for i, type in enumerate(types):
+        if type == 'real':
+            real_idx.append(i)
+
+    if len(real_idx) == 0:
+        rrs = rows
+    else:
+        rrs = []
+        for row in rows:
+            rr = row
+            for idx in real_idx:
+                rr[idx] = float(row[idx])
+            rrs.append(rr)
+    return rrs
+
+
+
+
+
+def csv_to_json(table_id, csv_file_name, json_file_name, working_folder='.'):
+    csv_file_name = os.path.join(working_folder, csv_file_name)
+    json_file_name = os.path.join(working_folder, json_file_name)
     with open(csv_file_name) as f:
         cf = csv.DictReader(f, delimiter=',')
         record = {}
         record['header'] = [(name or 'col{}'.format(i)) for i, name in enumerate(cf.fieldnames)]
         record['page_title'] = None
-        record['types'] = ['text'] * len(cf.fieldnames)
         record['id'] = table_id
         record['caption'] = None
         record['rows'] = [list(row.values()) for row in cf]
         record['name'] = get_table_name(table_id)
-        with open(json_file_name, 'a+') as fout:
-            json.dump(record, fout)
-            fout.write('\n')
+
+        # infer type based on first row
+        record['types'] = get_types(rows=record['rows'])
+        refined_rows = get_refined_rows(rows=record['rows'], types=record['types'])
+        record['rows'] = refined_rows
+
+        # save
+        with open(json_file_name, 'a+') as fout:
+            json.dump(record, fout)
+            fout.write('\n')
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('split')
     parser.add_argument('file', metavar='file.csv')
+    working_folder = './data_and_model'
     args = parser.parse_args()
 
     table_id = os.path.splitext(os.path.basename(args.file))[0]
-    csv_to_sqlite(table_id, args.file, '{}.db'.format(args.split))
-    csv_to_json(table_id, args.file, '{}.tables.jsonl'.format(args.split))
+    csv_to_sqlite(table_id, args.file, '{}.db'.format(args.split), working_folder)
+    csv_to_json(table_id, args.file,
+                '{}.tables.jsonl'.format(args.split), working_folder)
 
     print("Added table with id '{id}' (name '{name}') to {split}.db and {split}.tables.jsonl".format(
         id=table_id, name=get_table_name(table_id), split=args.split))
-

From fd9e897106b2447cae5b219a040db41f6275a2ce Mon Sep 17 00:00:00 2001
From: Wonseok Hwang
Date: Mon, 30 Sep 2019 17:11:28 +0900
Subject: [PATCH 2/4] load_jsonl added

---
 sqlova/utils/utils.py | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/sqlova/utils/utils.py b/sqlova/utils/utils.py
index 6d533c3..dac6b77 100644
--- a/sqlova/utils/utils.py
+++ b/sqlova/utils/utils.py
@@ -2,7 +2,8 @@
 # Apache License v2.0
 # Wonseok Hwang
 
-import os
+import os, json
+import random as python_random
 
 from matplotlib.pylab import *
 
@@ -65,3 +66,24 @@ def json_default_type_checker(o):
     """
     if isinstance(o, int64): return int(o)
     raise TypeError
+
+
+def load_jsonl(path_file, toy_data=False, toy_size=4, shuffle=False, seed=1):
+    data = []
+
+    with open(path_file, "r", encoding="utf-8") as f:
+        for idx, line in enumerate(f):
+            if toy_data and idx >= toy_size and (not shuffle):
+                break
+            t1 = json.loads(line.strip())
+            data.append(t1)
+
+    if shuffle and toy_data:
+        # When shuffling is required, load all the data, shuffle it, and take the first part.
+        print(
+            f"Toy data with shuffling: the whole file is loaded and shuffled before taking the first {toy_size} examples")
+
+        python_random.Random(seed).shuffle(data)  # fixed
+        data = data[:toy_size]
+
+    return data

From f17868a2802712f696ee319335d41b6fcea86245 Mon Sep 17 00:00:00 2001
From: Wonseok Hwang
Date: Mon, 30 Sep 2019 17:11:58 +0900
Subject: [PATCH 3/4] infer function (to test custom questions) has been added

---
 train.py | 325 +++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 222 insertions(+), 103 deletions(-)

diff --git a/train.py b/train.py
index 9c81742..5bfdb7b 100644
--- a/train.py
+++ b/train.py
@@ -3,6 +3,8 @@
 # Wonseok Hwang
 # Sep30, 2018
 
+
+
 import os, sys, argparse, re, json
 
 from matplotlib.pylab import *
@@ -17,12 +19,20 @@
 from bert.modeling import BertConfig, BertModel
 
 from sqlova.utils.utils_wikisql import *
+from sqlova.utils.utils import load_jsonl
 from sqlova.model.nl2sql.wikisql_models import *
 from sqlnet.dbengine import DBEngine
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
+
 def construct_hyper_param(parser):
+    parser.add_argument("--do_train", default=False, action='store_true')
+    parser.add_argument('--do_infer', default=False, action='store_true')
+    parser.add_argument('--infer_loop', default=False, action='store_true')
+
+    parser.add_argument("--trained", default=False, action='store_true')
+
     parser.add_argument('--tepoch', default=200, type=int)
     parser.add_argument("--bS", default=32, type=int,
                         help="Batch size")
@@ -41,7 +51,7 @@ def construct_hyper_param(parser):
                         default='vocab.txt', type=str,
                         help="The vocabulary file that the BERT model was trained on.")
     parser.add_argument("--max_seq_length",
-                        default=222, type=int, # Set based on maximum length of input tokens.
+                        default=222, type=int,  # Set based on maximum length of input tokens.
                         help="The maximum total input sequence length after WordPiece tokenization. Sequences "
Sequences " "longer than this will be truncated, and sequences shorter than this will be padded.") parser.add_argument("--num_target_layers", @@ -96,7 +106,7 @@ def construct_hyper_param(parser): if torch.cuda.is_available(): torch.cuda.manual_seed_all(args.seed) - #args.toy_model = not torch.cuda.is_available() + # args.toy_model = not torch.cuda.is_available() args.toy_model = False args.toy_size = 12 @@ -104,14 +114,10 @@ def construct_hyper_param(parser): def get_bert(BERT_PT_PATH, bert_type, do_lower_case, no_pretraining): - - bert_config_file = os.path.join(BERT_PT_PATH, f'bert_config_{bert_type}.json') vocab_file = os.path.join(BERT_PT_PATH, f'vocab_{bert_type}.txt') init_checkpoint = os.path.join(BERT_PT_PATH, f'pytorch_model_{bert_type}.bin') - - bert_config = BertConfig.from_json_file(bert_config_file) tokenizer = tokenization.FullTokenizer( vocab_file=vocab_file, do_lower_case=do_lower_case) @@ -127,6 +133,7 @@ def get_bert(BERT_PT_PATH, bert_type, do_lower_case, no_pretraining): return model_bert, tokenizer, bert_config + def get_opt(model, model_bert, fine_tune): if fine_tune: opt = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), @@ -141,6 +148,7 @@ def get_opt(model, model_bert, fine_tune): return opt, opt_bert + def get_models(args, BERT_PT_PATH, trained=False, path_model_bert=None, path_model=None): # some constants agg_ops = ['', 'MAX', 'MIN', 'COUNT', 'SUM', 'AVG'] @@ -188,8 +196,10 @@ def get_models(args, BERT_PT_PATH, trained=False, path_model_bert=None, path_mod return model, model_bert, tokenizer, bert_config + def get_data(path_wikisql, args): - train_data, train_table, dev_data, dev_table, _, _ = load_wikisql(path_wikisql, args.toy_model, args.toy_size, no_w2i=True, no_hs_tok=True) + train_data, train_table, dev_data, dev_table, _, _ = load_wikisql(path_wikisql, args.toy_model, args.toy_size, + no_w2i=True, no_hs_tok=True) train_loader, dev_loader = get_loader_wikisql(train_data, dev_data, args.bS, shuffle_train=True) return train_data, train_table, dev_data, dev_table, train_loader, dev_loader @@ -202,16 +212,16 @@ def train(train_loader, train_table, model, model_bert, opt, bert_config, tokeni model_bert.train() ave_loss = 0 - cnt = 0 # count the # of examples - cnt_sc = 0 # count the # of correct predictions of select column - cnt_sa = 0 # of selectd aggregation - cnt_wn = 0 # of where number - cnt_wc = 0 # of where column - cnt_wo = 0 # of where operator - cnt_wv = 0 # of where-value - cnt_wvi = 0 # of where-value index (on question tokens) + cnt = 0 # count the # of examples + cnt_sc = 0 # count the # of correct predictions of select column + cnt_sa = 0 # of selectd aggregation + cnt_wn = 0 # of where number + cnt_wc = 0 # of where column + cnt_wo = 0 # of where operator + cnt_wv = 0 # of where-value + cnt_wvi = 0 # of where-value index (on question tokens) cnt_lx = 0 # of logical form acc - cnt_x = 0 # of execution acc + cnt_x = 0 # of execution acc # Engine for SQL querying. 
     engine = DBEngine(os.path.join(path_db, f"{dset_name}.db"))
 
@@ -263,7 +273,7 @@ def train(train_loader, train_table, model, model_bert, opt, bert_config, tokeni
         loss = Loss_sw_se(s_sc, s_sa, s_wn, s_wc, s_wo, s_wv, g_sc, g_sa, g_wn, g_wc, g_wo, g_wvi)
 
         # Calculate gradient
-        if iB % accumulate_gradients == 0: # mode
+        if iB % accumulate_gradients == 0:  # mode
             # at start, perform zero_grad
             opt.zero_grad()
             if opt_bert:
@@ -273,7 +283,7 @@ def train(train_loader, train_table, model, model_bert, opt, bert_config, tokeni
                 opt.step()
                 if opt_bert:
                     opt_bert.step()
-        elif iB % accumulate_gradients == (accumulate_gradients-1):
+        elif iB % accumulate_gradients == (accumulate_gradients - 1):
             # at the final, take step with accumulated graident
             loss.backward()
             opt.step()
@@ -293,14 +303,13 @@ def train(train_loader, train_table, model, model_bert, opt, bert_config, tokeni
         pr_wc_sorted = sort_pr_wc(pr_wc, g_wc)
         pr_sql_i = generate_sql_i(pr_sc, pr_sa, pr_wn, pr_wc_sorted, pr_wo, pr_wv_str, nlu)
 
-
         # Cacluate accuracy
         cnt_sc1_list, cnt_sa1_list, cnt_wn1_list, \
         cnt_wc1_list, cnt_wo1_list, \
         cnt_wvi1_list, cnt_wv1_list = get_cnt_sw_list(g_sc, g_sa, g_wn, g_wc, g_wo, g_wvi,
-                                                    pr_sc, pr_sa, pr_wn, pr_wc, pr_wo, pr_wvi,
-                                                    sql_i, pr_sql_i,
-                                                    mode='train')
+                                                      pr_sc, pr_sa, pr_wn, pr_wc, pr_wo, pr_wvi,
+                                                      sql_i, pr_sql_i,
+                                                      mode='train')
 
         cnt_lx1_list = get_cnt_lx_list(cnt_sc1_list, cnt_sa1_list, cnt_wn1_list, cnt_wc1_list,
                                        cnt_wo1_list, cnt_wv1_list)
@@ -340,6 +349,7 @@ def train(train_loader, train_table, model, model_bert, opt, bert_config, tokeni
 
     return acc, aux_out
 
+
 def report_detail(hds, nlu,
                   g_sc, g_sa, g_wn, g_wc, g_wo, g_wv, g_wv_str, g_sql_q, g_ans,
                   pr_sc, pr_sa, pr_wn, pr_wc, pr_wo, pr_wv_str, pr_sql_q, pr_ans,
@@ -380,11 +390,12 @@ def report_detail(hds, nlu,
 
     print(cnt_list)
 
-    print(f'acc_lx = {cnt_lx/cnt:.3f}, acc_x = {cnt_x/cnt:.3f}\n',
-          f'acc_sc = {cnt_sc/cnt:.3f}, acc_sa = {cnt_sa/cnt:.3f}, acc_wn = {cnt_wn/cnt:.3f}\n',
-          f'acc_wc = {cnt_wc/cnt:.3f}, acc_wo = {cnt_wo/cnt:.3f}, acc_wv = {cnt_wv/cnt:.3f}')
+    print(f'acc_lx = {cnt_lx / cnt:.3f}, acc_x = {cnt_x / cnt:.3f}\n',
+          f'acc_sc = {cnt_sc / cnt:.3f}, acc_sa = {cnt_sa / cnt:.3f}, acc_wn = {cnt_wn / cnt:.3f}\n',
+          f'acc_wc = {cnt_wc / cnt:.3f}, acc_wo = {cnt_wo / cnt:.3f}, acc_wv = {cnt_wv / cnt:.3f}')
     print(f'===============================')
 
+
 def test(data_loader, data_table, model, model_bert, bert_config, tokenizer,
          max_seq_length, num_target_layers, detail=False, st_pos=0, cnt_tot=1, EG=False, beam_size=4,
@@ -464,17 +475,14 @@ def test(data_loader, data_table, model, model_bert, bert_config, tokenizer,
             pr_wc, pr_wo, pr_wv, pr_sql_i = sort_and_generate_pr_w(pr_sql_i)
 
             # Follosing variables are just for the consistency with no-EG case.
-            pr_wvi = None # not used
-            pr_wv_str=None
-            pr_wv_str_wp=None
+            pr_wvi = None  # not used
+            pr_wv_str = None
+            pr_wv_str_wp = None
             loss = torch.tensor([0])
 
-
-
         g_sql_q = generate_sql_q(sql_i, tb)
         pr_sql_q = generate_sql_q(pr_sql_i, tb)
 
-
         # Saving for the official evaluation later.
         for b, pr_sql_i1 in enumerate(pr_sql_i):
             results1 = {}
@@ -485,10 +493,10 @@ def test(data_loader, data_table, model, model_bert, bert_config, tokenizer,
 
         cnt_sc1_list, cnt_sa1_list, cnt_wn1_list, \
         cnt_wc1_list, cnt_wo1_list, \
-        cnt_wvi1_list, cnt_wv1_list = get_cnt_sw_list(g_sc, g_sa,g_wn, g_wc,g_wo, g_wvi,
-                                                    pr_sc, pr_sa, pr_wn, pr_wc, pr_wo, pr_wvi,
-                                                    sql_i, pr_sql_i,
-                                                    mode='test')
+        cnt_wvi1_list, cnt_wv1_list = get_cnt_sw_list(g_sc, g_sa, g_wn, g_wc, g_wo, g_wvi,
+                                                      pr_sc, pr_sa, pr_wn, pr_wc, pr_wo, pr_wvi,
+                                                      sql_i, pr_sql_i,
+                                                      mode='test')
 
         cnt_lx1_list = get_cnt_lx_list(cnt_sc1_list, cnt_sa1_list, cnt_wn1_list, cnt_wc1_list,
                                        cnt_wo1_list, cnt_wv1_list)
@@ -540,6 +548,86 @@ def test(data_loader, data_table, model, model_bert, bert_config, tokenizer,
     return acc, results, cnt_list
 
 
+def tokenize_corenlp(client, nlu1):
+    nlu1_tok = []
+    for sentence in client.annotate(nlu1):
+        for tok in sentence:
+            nlu1_tok.append(tok.originalText)
+    return nlu1_tok
+
+
+def tokenize_corenlp_direct_version(client, nlu1):
+    nlu1_tok = []
+    for sentence in client.annotate(nlu1).sentence:
+        for tok in sentence.token:
+            nlu1_tok.append(tok.originalText)
+    return nlu1_tok
+
+
+def infer(nlu1,
+          table_name, data_table, path_db, db_name,
+          model, model_bert, bert_config, max_seq_length, num_target_layers,
+          beam_size=4, show_table=False, show_answer_only=False):
+    # I know it is against the DRY principle, but to minimize the risk of introducing bugs, a separate infer function was introduced.
+    model.eval()
+    model_bert.eval()
+    engine = DBEngine(os.path.join(path_db, f"{db_name}.db"))
+
+    # Get inputs
+    nlu = [nlu1]
+    # nlu_t1 = tokenize_corenlp(client, nlu1)
+    nlu_t1 = tokenize_corenlp_direct_version(client, nlu1)
+    nlu_t = [nlu_t1]
+
+    tb1 = data_table[0]
+    hds1 = tb1['header']
+    tb = [tb1]
+    hds = [hds1]
+    hs_t = [[]]
+
+    wemb_n, wemb_h, l_n, l_hpu, l_hs, \
+    nlu_tt, t_to_tt_idx, tt_to_t_idx \
+        = get_wemb_bert(bert_config, model_bert, tokenizer, nlu_t, hds, max_seq_length,
+                        num_out_layers_n=num_target_layers, num_out_layers_h=num_target_layers)
+
+    prob_sca, prob_w, prob_wn_w, pr_sc, pr_sa, pr_wn, pr_sql_i = model.beam_forward(wemb_n, l_n, wemb_h, l_hpu,
+                                                                                    l_hs, engine, tb,
+                                                                                    nlu_t, nlu_tt,
+                                                                                    tt_to_t_idx, nlu,
+                                                                                    beam_size=beam_size)
+
+    # sort and generate
+    pr_wc, pr_wo, pr_wv, pr_sql_i = sort_and_generate_pr_w(pr_sql_i)
+    if len(pr_sql_i) != 1:
+        raise EnvironmentError
+    pr_sql_q1 = generate_sql_q(pr_sql_i, [tb1])
+    pr_sql_q = [pr_sql_q1]
+
+    try:
+        pr_ans, _ = engine.execute_return_query(tb[0]['id'], pr_sc[0], pr_sa[0], pr_sql_i[0]['conds'])
+    except:
+        pr_ans = ['Answer not found.']
+        pr_sql_q = ['Answer not found.']
+
+    if show_answer_only:
+        print(f'Q: {nlu[0]}')
+        print(f'A: {pr_ans[0]}')
+        print(f'SQL: {pr_sql_q}')
+
+    else:
+        print(f'START ============================================================= ')
+        print(f'{hds}')
+        if show_table:
+            print(engine.show_table(table_name))
+        print(f'nlu: {nlu}')
+        print(f'pr_sql_i : {pr_sql_i}')
+        print(f'pr_sql_q : {pr_sql_q}')
+        print(f'pr_ans: {pr_ans}')
+        print(f'---------------------------------------------------------------------')
+
+    return pr_sql_i, pr_ans
+
+
 def print_result(epoch, acc, dname):
     ave_loss, acc_sc, acc_sa, acc_wn, acc_wc, acc_wo, acc_wvi, acc_wv, acc_lx, acc_x = acc
 
@@ -549,6 +637,7 @@ def print_result(epoch, acc, dname):
         acc_wc: {acc_wc:.3f}, acc_wo: {acc_wo:.3f}, acc_wvi: {acc_wvi:.3f}, acc_wv: {acc_wv:.3f}, acc_lx: {acc_lx:.3f}, acc_x: {acc_x:.3f}"
     )
 
+
 if __name__ == '__main__':
 
     ## 1. Hyper parameters
@@ -556,13 +645,14 @@ def print_result(epoch, acc, dname):
     args = construct_hyper_param(parser)
 
     ## 2. Paths
-    path_h = '/home/wonseok'
-    path_wikisql = os.path.join(path_h, 'data', 'wikisql_tok')
+    path_h = './data_and_model'  # '/home/wonseok'
+    path_wikisql = './data_and_model'  # os.path.join(path_h, 'data', 'wikisql_tok')
     BERT_PT_PATH = path_wikisql
 
     path_save_for_evaluation = './'
 
     ## 3. Load data
+
     train_data, train_table, dev_data, dev_table, train_loader, dev_loader = get_data(path_wikisql, args)
     # test_data, test_table = load_wikisql_data(path_wikisql, mode='test', toy_model=args.toy_model, toy_size=args.toy_size, no_hs_tok=True)
     # test_loader = torch.utils.data.DataLoader(
@@ -573,72 +663,101 @@ def print_result(epoch, acc, dname):
     #     collate_fn=lambda x: x  # now dictionary values are not merged!
     # )
     ## 4. Build & Load models
-    model, model_bert, tokenizer, bert_config = get_models(args, BERT_PT_PATH)
-
-    ## 4.1.
-    # To start from the pre-trained models, un-comment following lines.
-    # path_model_bert =
-    # path_model =
-    # model, model_bert, tokenizer, bert_config = get_models(args, BERT_PT_PATH, trained=True, path_model_bert=path_model_bert, path_model=path_model)
+    if not args.trained:
+        model, model_bert, tokenizer, bert_config = get_models(args, BERT_PT_PATH)
+    else:
+        # To start from the pre-trained models, un-comment following lines.
+        path_model_bert = './data_and_model/model_bert_best.pt'
+        path_model = './data_and_model/model_best.pt'
+        model, model_bert, tokenizer, bert_config = get_models(args, BERT_PT_PATH, trained=True,
+                                                               path_model_bert=path_model_bert, path_model=path_model)
 
     ## 5. Get optimizers
-    opt, opt_bert = get_opt(model, model_bert, args.fine_tune)
-
-    ## 6. Train
-    acc_lx_t_best = -1
-    epoch_best = -1
-    for epoch in range(args.tepoch):
-        # train
-        acc_train, aux_out_train = train(train_loader,
-                                         train_table,
-                                         model,
-                                         model_bert,
-                                         opt,
-                                         bert_config,
-                                         tokenizer,
-                                         args.max_seq_length,
-                                         args.num_target_layers,
-                                         args.accumulate_gradients,
-                                         opt_bert=opt_bert,
-                                         st_pos=0,
-                                         path_db=path_wikisql,
-                                         dset_name='train')
-
-        # check DEV
-        with torch.no_grad():
-            acc_dev, results_dev, cnt_list = test(dev_loader,
-                                                  dev_table,
-                                                  model,
-                                                  model_bert,
-                                                  bert_config,
-                                                  tokenizer,
-                                                  args.max_seq_length,
-                                                  args.num_target_layers,
-                                                  detail=False,
-                                                  path_db=path_wikisql,
-                                                  st_pos=0,
-                                                  dset_name='dev', EG=args.EG)
-
-
-        print_result(epoch, acc_train, 'train')
-        print_result(epoch, acc_dev, 'dev')
-
-        # save results for the official evaluation
-        save_for_evaluation(path_save_for_evaluation, results_dev, 'dev')
-
-
-
-        # save best model
-        # Based on Dev Set logical accuracy lx
-        acc_lx_t = acc_dev[-2]
-        if acc_lx_t > acc_lx_t_best:
-            acc_lx_t_best = acc_lx_t
-            epoch_best = epoch
-            # save best model
-            state = {'model': model.state_dict()}
-            torch.save(state, os.path.join('.', 'model_best.pt') )
+    if args.do_train:
+        opt, opt_bert = get_opt(model, model_bert, args.fine_tune)
+
+        ## 6. Train
+        acc_lx_t_best = -1
+        epoch_best = -1
+        for epoch in range(args.tepoch):
+            # train
+            acc_train, aux_out_train = train(train_loader,
+                                             train_table,
+                                             model,
+                                             model_bert,
+                                             opt,
+                                             bert_config,
+                                             tokenizer,
+                                             args.max_seq_length,
+                                             args.num_target_layers,
+                                             args.accumulate_gradients,
+                                             opt_bert=opt_bert,
+                                             st_pos=0,
+                                             path_db=path_wikisql,
+                                             dset_name='train')
+
+            # check DEV
+            with torch.no_grad():
+                acc_dev, results_dev, cnt_list = test(dev_loader,
+                                                      dev_table,
+                                                      model,
+                                                      model_bert,
+                                                      bert_config,
+                                                      tokenizer,
+                                                      args.max_seq_length,
+                                                      args.num_target_layers,
+                                                      detail=False,
+                                                      path_db=path_wikisql,
+                                                      st_pos=0,
+                                                      dset_name='dev', EG=args.EG)
+
+            print_result(epoch, acc_train, 'train')
+            print_result(epoch, acc_dev, 'dev')
+
+            # save results for the official evaluation
+            save_for_evaluation(path_save_for_evaluation, results_dev, 'dev')
-            state = {'model_bert': model_bert.state_dict()}
-            torch.save(state, os.path.join('.', 'model_bert_best.pt'))
-
-    print(f" Best Dev lx acc: {acc_lx_t_best} at epoch: {epoch_best}")
+
+            # save best model
+            # Based on Dev Set logical accuracy lx
+            acc_lx_t = acc_dev[-2]
+            if acc_lx_t > acc_lx_t_best:
+                acc_lx_t_best = acc_lx_t
+                epoch_best = epoch
+                # save best model
+                state = {'model': model.state_dict()}
+                torch.save(state, os.path.join('.', 'model_best.pt'))
+
+                state = {'model_bert': model_bert.state_dict()}
+                torch.save(state, os.path.join('.', 'model_bert_best.pt'))
+
+        print(f" Best Dev lx acc: {acc_lx_t_best} at epoch: {epoch_best}")
+
+    if args.do_infer:
+        # To use recent corenlp: https://github.com/stanfordnlp/python-stanford-corenlp
+        # 1. pip install stanford-corenlp
+        # 2. download java version
+        # 3. export CORENLP_HOME=/Users/wonseok/utils/stanford-corenlp-full-2018-10-05
+
+        # from stanza.nlp.corenlp import CoreNLPClient
+        # client = CoreNLPClient(server='http://localhost:9000', default_annotators='ssplit,tokenize'.split(','))
+
+        import corenlp
+
+        client = corenlp.CoreNLPClient(annotators='ssplit,tokenize'.split(','))
+
+        nlu1 = "Which company have more than 100 employees?"
+        path_db = './data_and_model'
+        db_name = 'ctable'
+        data_table = load_jsonl('./data_and_model/ctable.tables.jsonl')
+        table_name = 'ftable1'
+        n_Q = 100000 if args.infer_loop else 1
+        for i in range(n_Q):
+            if n_Q > 1:
+                nlu1 = input('Type question: ')
+            pr_sql_i, pr_ans = infer(
+                nlu1,
+                table_name, data_table, path_db, db_name,
+                model, model_bert, bert_config, max_seq_length=args.max_seq_length,
+                num_target_layers=args.num_target_layers,
+                beam_size=1, show_table=False, show_answer_only=False
+            )

From 0e3a83bc745b00d5a4d0f20bbcbfa4ab6c7bcd36 Mon Sep 17 00:00:00 2001
From: Wonseok Hwang
Date: Mon, 30 Sep 2019 17:12:53 +0900
Subject: [PATCH 4/4] auxiliary bash scripts added

---
 run_infer.sh      | 1 +
 run_make_table.sh | 2 ++
 run_train.sh      | 1 +
 3 files changed, 4 insertions(+)
 create mode 100644 run_infer.sh
 create mode 100644 run_make_table.sh
 create mode 100644 run_train.sh

diff --git a/run_infer.sh b/run_infer.sh
new file mode 100644
index 0000000..7296cc7
--- /dev/null
+++ b/run_infer.sh
@@ -0,0 +1 @@
+python3 train.py --do_infer --infer_loop --trained --bert_type_abb uL --max_seq_leng 222
\ No newline at end of file

diff --git a/run_make_table.sh b/run_make_table.sh
new file mode 100644
index 0000000..767e4af
--- /dev/null
+++ b/run_make_table.sh
@@ -0,0 +1,2 @@
+python3 add_csv.py ctable ftable1.csv
+python3 add_csv.py ctable ftable2.csv
\ No newline at end of file

diff --git a/run_train.sh b/run_train.sh
new file mode 100644
index 0000000..5ab4056
--- /dev/null
+++ b/run_train.sh
@@ -0,0 +1 @@
+python3 train.py --do_train --seed 1 --bS 16 --accumulate_gradients 2 --bert_type_abb uS --fine_tune --lr 0.001 --lr_bert 0.00001 --max_seq_leng 222
\ No newline at end of file
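
Note: a minimal sketch of how the helpers added in PATCH 1/4 and PATCH 2/4 fit together, assuming ./data_and_model/ftable1.csv exists; the header names and the resulting types shown in the comment are hypothetical examples, not output from this patch set:

# Hypothetical Python walk-through (equivalent to what run_make_table.sh does from the shell).
from add_csv import csv_to_sqlite, csv_to_json
from sqlova.utils.utils import load_jsonl

table_id = 'ftable1'
csv_to_sqlite(table_id, 'ftable1.csv', 'ctable.db', working_folder='./data_and_model')
csv_to_json(table_id, 'ftable1.csv', 'ctable.tables.jsonl', working_folder='./data_and_model')

# Column types are now inferred from the first data row, so a record might look like:
# {"header": ["company", "employees"], "types": ["text", "real"], ...}
tables = load_jsonl('./data_and_model/ctable.tables.jsonl')
print(tables[0]['types'])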