build_decoy_db.py

import sqlite3
import itertools
import multiprocessing

from parsing import tokenize, frequency_table, paragraph_iter


def token_block_iter(idx, title, html):
    '''
    Yield (paragraph number, tokens, article index, article title,
    per-paragraph token weights) for each paragraph of a wikipedia article.
    '''
    paragraphs = list(paragraph_iter(html))
    token_table = map(tokenize, paragraphs)
    freq = frequency_table(token_table)

    # Tokens appearing in the article title are excluded from each paragraph
    title_tokens = set(tokenize(unicode(title)))

    # Column-normalize the frequency table
    freq /= freq.sum(axis=0)

    for para_n, tokens in enumerate(token_table):
        tokens = list(set(tokens).difference(title_tokens))
        local_freq = freq[tokens].ix[para_n]
        yield para_n, tokens, idx, title, local_freq
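
# Illustration (hypothetical data): frequency_table is assumed to return a
# pandas-style DataFrame with one row per paragraph and one column per token,
# which is what the freq[tokens].ix[para_n] lookup above relies on, e.g.
#
#   token_table = [["cat", "dog"], ["dog"]]
#   freq = frequency_table(token_table)
#   #      cat  dog
#   # 0    1.0  1.0
#   # 1    0.0  1.0
#
# After freq /= freq.sum(axis=0) each token column sums to one, so a
# paragraph's weight is its share of each token across the whole article.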


def SQL_result(item):
    # Row layout follows the wiki table columns: (index, title, html, length)
    idx, title, html, length = item
    print "Processing:", title, idx

    all_data = []
    for result in token_block_iter(idx, title, html):
        para_n, tokens, wiki_index, wiki_title, frq = result

        # Serialize the weights as a bracketed list of two-decimal floats
        f_str = '[{}]'.format(','.join(map("{:0.2f}".format,
                                           list(frq.values))))
        tokens = ' '.join(tokens)
        data = [wiki_index, title, para_n, tokens, f_str]
        all_data.append(data)

    return all_data
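
# Example of the serialized weight string (values illustrative): if
# frq.values is [0.25, 0.5], f_str becomes "[0.25,0.50]" -- a plain string
# that sqlite can store without a custom adapter.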


if __name__ == "__main__":

    # This is where I stored my wikipedia data dump; change it to your location
    f_wiki = "/media/travis/Seagate Expansion Drive/data_dump/wiki.db"
    conn = sqlite3.connect(f_wiki, check_same_thread=False)
    conn_decoy = sqlite3.connect("db/decoy.db")

    cmd_template = '''
    CREATE TABLE IF NOT EXISTS decoy (
        decoy_idx INTEGER PRIMARY KEY AUTOINCREMENT,
        wikipedia_idx INTEGER,
        wikipedia_title STRING,
        weights STRING, -- awful way to do it, but easy enough!
        paragraph_count INTEGER,
        tokens STRING
    )
    '''
    conn_decoy.executescript(cmd_template)
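
    # For reference, a stored row looks like (values illustrative):
    #   decoy_idx=1, wikipedia_idx=17, wikipedia_title=u"Aardvark",
    #   weights="[0.25,0.50]", paragraph_count=0, tokens="burrowing mammal"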

    # For a quick test run, append "LIMIT 7000" to this query
    cmd_search = "SELECT * FROM wiki ORDER BY title"
    WIKI = conn.execute(cmd_search)

    # Process articles in parallel; the serial fallback for debugging is
    # ITR = itertools.imap(SQL_result, WIKI)
    P = multiprocessing.Pool()
    ITR = P.imap(SQL_result, WIKI)
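
    # Note: Pool.imap yields results in input order. If per-task overhead
    # dominates, passing a chunksize, e.g. P.imap(SQL_result, WIKI, 100),
    # trades memory for fewer inter-process handoffs.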

    cmd_insert = u'''
    INSERT INTO decoy (
        wikipedia_idx, wikipedia_title,
        paragraph_count, tokens, weights)
    VALUES (?,?,?,?,?)
    '''

    for result in ITR:
        conn_decoy.executemany(cmd_insert, result)
        conn_decoy.commit()
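
    # Optional sanity check: read a few rows back out of the decoy table
    # (columns match the CREATE TABLE statement above)
    for row in conn_decoy.execute("SELECT wikipedia_idx, wikipedia_title, "
                                  "paragraph_count FROM decoy LIMIT 3"):
        print row

    conn_decoy.close()
    conn.close()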