-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathtf_idf_map.py
executable file
·44 lines (37 loc) · 1.49 KB
/
tf_idf_map.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#!/usr/bin/env python
from __future__ import print_function
import sys
import argparse
import map_reduce_utils as mru
from math import log
def map_tf_idf(corpus_size, input=sys.stdin, output=sys.stdout):
    """
    (word file_name) (n N m) --> (word file_name) (tfidf)

    Mapper that converts the raw counts attached to each key into tf-idf
    statistics and re-emits them under the same key.

    For every (key, value) pair produced by mru.json_loader(input), the
    value is expected to be a mapping with the entries:
        'word_freq' (n) -- numerator of the term frequency
        'doc_size'  (N) -- denominator of the term frequency
        'corp_freq' (m) -- denominator of the inverse document frequency
                           (presumably the number of documents containing
                           the word; confirm against the upstream job)

    With D = corpus_size, the emitted value holds:
        tf         = n / N
        idf        = D / m
        log idf    = log10(idf)
        tfidf      = tf * idf
        tf log idf = tf * log10(idf)
    together with the inputs n, N, m and D themselves.

    Records are written with mru.reducer_emit(key, value, output).
    A zero 'doc_size' or 'corp_freq' raises ZeroDivisionError.
    """
    for key, counts in mru.json_loader(input):
        word_freq = counts['word_freq']
        doc_size = counts['doc_size']
        corp_freq = counts['corp_freq']

        # float() on both operands keeps the divisions true divisions even
        # without `from __future__ import division` on Python 2.
        term_freq = float(word_freq) / float(doc_size)
        inv_doc_freq = float(corpus_size) / float(corp_freq)
        log_inv_doc_freq = log(inv_doc_freq, 10)

        # The key is passed through unchanged; only the value is replaced.
        stats = {
            'tfidf': term_freq * inv_doc_freq,
            'tf log idf': term_freq * log_inv_doc_freq,
            'log idf': log_inv_doc_freq,
            'idf': inv_doc_freq,
            'tf': term_freq,
            'word frequency': word_freq,
            'document length': doc_size,
            'corpus frequency': corp_freq,
            'corpus size': corpus_size,
        }
        mru.reducer_emit(key, stats, output)
if __name__ == '__main__':
    # Command-line entry point: read the corpus size from -s/--corpus_size
    # and run the mapper over stdin/stdout.
    parser = argparse.ArgumentParser()
    # The corpus size is the numerator of every idf value, so the mapper
    # cannot produce anything without it. Marking the flag as required lets
    # argparse reject a missing value immediately with a usage message,
    # instead of the mapper failing on the first record with a TypeError
    # from float(None).
    parser.add_argument('-s', '--corpus_size', dest='s', type=int,
                        required=True,
                        help='size of the corpus (D in idf = D / m)')
    args = parser.parse_args()
    corpus_size = args.s
    map_tf_idf(corpus_size)