# small_jobs.py
'''
@author: v-lianji
'''
import codecs
import math
import operator
import random
from collections import Counter, defaultdict

import numpy as np

import config
from helper import utils, data_loader, word_hashing
def gen_word_hashing(raw_file, outfile, topk):
    # Build a word-hashing vocabulary from the top-k most frequent words and persist it to disk.
    wh_model = word_hashing.word_hash_agent(topk)
    wh_model.do_hashing(raw_file)
    utils.dump_obj_to_file(outfile, wh_model)
def load_wordhash_data(filename):
    # Sanity check: load a saved word-hashing model and print its first few (word, frequency) pairs.
    wh_model = utils.load_obj_from_file(filename)
    cnt = 0
    for p in wh_model.word2freq.items():
        print(p)
        cnt += 1
        if cnt > 10:
            break
def prepare_autoencoder_files(infile, outfile):
    # Keep only the second tab-separated column of each line; assumes two columns
    # (id \t content) so the trailing newline stays attached to words[1].
    with codecs.open(infile, 'r', 'utf-8') as rd:
        with codecs.open(outfile, 'w', 'utf-8') as wt:
            for line in rd:
                words = line.split('\t')
                wt.write(words[1])
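
# A minimal usage sketch (file names are hypothetical, not part of this repo):
# prepare_autoencoder_files('article_tfidf.tsv', 'autoencoder_input.txt')
# Given an input line such as 'doc1\t3:0.52 9:0.31\n', only the second column
# '3:0.52 9:0.31\n' is written out for autoencoder training.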
def prepare_triple_autoencoder_files(article_raw, article_IFIDF, outfile):
    '''
    step 1: from article_raw, load a list of (article id, category) pairs
    step 2: shuffle the list
    step 3: split the list into two disjoint lists, A and B
    step 4: build an inverted index C from category to article list on B
    step 5: for each article in A, find one article from C with the same category
            and one article from C with a different category
    '''
    all_articles = load_article_category_as_list(article_raw)
    random.shuffle(all_articles)
    articles_parta, articles_partb = split_list(all_articles)
    category_list, category_prob, cate2articles, cate2cnt = build_category_probability_and_inverted_index(articles_partb)
    print(cate2cnt)
    debug_list = [(a, b) for a, b in zip(category_list, category_prob)]
    debug_list.sort(key=operator.itemgetter(1))
    print(debug_list)
    article2content = data_loader.load_article_content(article_IFIDF)  # id -> TF-IDF string
    with open(outfile, 'w') as wt:
        for article in articles_parta:
            # Skip articles without TF-IDF content or whose category was filtered out.
            if article[0] not in article2content or article[1] not in cate2cnt:
                continue
            # Draw a different (negative) category, proportional to category frequency.
            selected_category = article[1]
            while selected_category == article[1]:
                selected_category = np.random.choice(category_list, size=1, replace=True, p=category_prob)[0]
            article_same_cate = sample_one_article(cate2articles[article[1]], article2content)
            article_diff_cate = sample_one_article(cate2articles[selected_category], article2content)
            wt.write('{0}\t{1}\t{2}\n'.format(article2content[article[0]], article2content[article_same_cate], article2content[article_diff_cate]))
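
# Sketch of the resulting triple file (all ids and weights below are made up):
# each output line holds three tab-separated TF-IDF strings,
#   anchor_content \t same_category_content \t different_category_content
# e.g. '3:0.52 9:0.31\t7:0.44 8:0.10\t2:0.18 5:0.25'
# i.e. the (anchor, positive, negative) layout typical of triplet-style objectives.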
def sample_one_article(mylist, d):
    # Rejection-sample from mylist until an article id that exists in d is drawn.
    res = random.choice(mylist)
    while res not in d:
        res = random.choice(mylist)
    return res
def split_list(mylist):
    # Split a list into two (near-)equal halves.
    cnt = len(mylist)
    mid = cnt // 2
    return mylist[0:mid], mylist[mid:]
def build_category_probability_and_inverted_index(mylist):
    # Count articles per category and index article ids by category.
    cate2cnt = defaultdict(int)
    cate2articles = defaultdict(list)
    for p in mylist:
        cate2cnt[p[1]] += 1
        cate2articles[p[1]].append(p[0])
    # Keep only frequent, meaningful categories (>1000 articles, excluding placeholder labels).
    cate2cnt = dict([(p[0], p[1]) for p in cate2cnt.items() if p[1] > 1000 and not (p[0] == 'null' or p[0] == 'rt_unclassified')])
    total = sum(cate2cnt.values())
    cate_list, cate_prob = [], []
    for p in cate2cnt.items():
        cate_list.append(p[0])
        cate_prob.append(p[1] * 1.0 / total)
    return cate_list, cate_prob, cate2articles, cate2cnt
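
# Worked example with made-up counts: if the filtered cate2cnt were
# {'sports': 3000, 'finance': 2000}, then total = 5000, cate_list would be
# ['sports', 'finance'] and cate_prob [0.6, 0.4], so np.random.choice draws
# negative categories in proportion to how often they occur in part B.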
def load_article_category_as_list(infile):
    # Load (article id, lower-cased category) pairs from a 3-column TSV file.
    articles = []
    with codecs.open(infile, 'r', 'utf-8') as rd:
        for line in rd:
            words = line.strip().split('\t')
            if len(words) != 3:
                continue
            articles.append((words[0], words[1].lower()))
    return articles
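
# Example (hypothetical line): 'http://example.com/a1\tSports\tTeam wins final\n'
# yields the pair ('http://example.com/a1', 'sports'); malformed lines are skipped.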
def convert_raw_file_to_indexed(infile, outfile, word_hashing_file, norm=False):
    r'''
    input format:  id\t category\t title
    output format: id\t word:weight ...
    '''
    wh_model = utils.load_obj_from_file(word_hashing_file)
    with codecs.open(infile, 'r', 'utf-8') as rd:
        with codecs.open(outfile, 'w', 'utf-8') as wt:
            for line in rd:
                words = line.strip().split('\t')
                if len(words) != 3:  # previously this also required line.startswith('http')
                    continue
                r'''
                Earlier inline TF-IDF computation, superseded by utils.convert_line_to_tfidf:
                tokens = utils.clean_str(words[2]).split(' ')
                if len(tokens) < 2:
                    continue
                cur_word_dict = Counter(tokens)
                cur_word_list = [(wh_model.word2idx[k], v) for k, v in cur_word_dict.items() if k in wh_model.word2idx]
                if not cur_word_list:
                    continue
                cur_word_list.sort()
                doc_word_cnt = sum(p[1] for p in cur_word_list) * 1.0
                if doc_word_cnt <= 0.001:
                    continue
                wt.write(words[0] + '\t')
                for p in cur_word_list:
                    wt.write('{0}:{1:.2f} '.format(p[0], p[1] * 1.0 / doc_word_cnt * math.log2(1000000 * 1.0 / wh_model.word2freq[wh_model.idx2word[p[0]]])))
                wt.write('\n')
                '''
                cur_word_list = utils.convert_line_to_tfidf(words[2], wh_model, norm)
                if not cur_word_list:
                    continue
                wt.write(words[0] + '\t')
                for p in cur_word_list:
                    wt.write('{0}:{1:.2f} '.format(p[0], p[1]))
                wt.write('\n')
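
# Illustrative input/output (indices and weights are invented): the input line
#   'http://example.com/a1\tsports\tteam wins the final'
# would come out roughly as
#   'http://example.com/a1\t12:0.35 87:0.12 301:0.08'
# with each pair being word_index:tfidf_weight under the loaded word-hashing model.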
if __name__ == '__main__':
r'''
gen_word_hashing(r'D:\My Projects\data\news\artitle_title_2017-01-01-2017-01-31.tsv',
r'D:\My Projects\data\news\processed\word_hashing_title_top1w.obj', 10000)
'''
#load_wordhash_data(r'D:\My Projects\data\news\processed\word_hashing_title_2017-01-01-2017-01-31.obj')
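    # A hedged end-to-end sketch of how these jobs chain together; every path
    # below is a placeholder, not a file shipped with this repo:
    #
    # gen_word_hashing(r'...\article_title.tsv',
    #                  r'...\word_hashing.obj', 10000)
    # convert_raw_file_to_indexed(r'...\article_title.tsv',
    #                             r'...\article_title_tfidf.tsv',
    #                             r'...\word_hashing.obj')
    # prepare_autoencoder_files(r'...\article_title_tfidf.tsv',
    #                           r'...\autoencoder_input.txt')
    # prepare_triple_autoencoder_files(r'...\article_title.tsv',
    #                                  r'...\article_title_tfidf.tsv',
    #                                  r'...\triple_autoencoder_input.txt')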