-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathword_set_main_eng_news.py
95 lines (78 loc) · 2.72 KB
/
word_set_main_eng_news.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#libraries and pythonfile imports
import os
import word_set as ws
import text_divider as td
import time
import matplotlib.pyplot as plt
#lists and variables
add_time_for_1000 = [0]
max_depth_for_each_1000 = [0]
size_for_each_1000 = [0]
bucket_list_size_for_each_1000 = [0]
count = 0
path = os.path.abspath(os.path.dirname(os.path.abspath(__file__))) + "\data\eng_news_100K-sentences.txt"
# Program starts
word_set = ws.new_empty_set() # Creates a word set with 10 empty buckets
print("Spliting words...")
split_time_start = time.time()
try:
word_list = td.word_splitter(path) # Split words and put in a list
except FileNotFoundError:
print("File not found. Terminating...")
quit()
split_time_stop = time.time()
print("Total amount of words:",len(word_list))
no_duplicates = set(word_list) # Makes sure that we dont have any duplicates so we actually add all items
print("\nAdding words to set...")
tot_add_time_start = time.time()
every_1000_start = time.time()
for word in no_duplicates:
count += 1
ws.add(word_set,word) # add word via addfunction
if count == 1000: # collects data for plots for every 1000 element
add_1000_time = time.time()
add_time_for_1000.append(add_1000_time - every_1000_start)
max_depth_for_each_1000.append(ws.max_bucket_size(word_set))
size_for_each_1000.append(ws.size)
bucket_list_size_for_each_1000.append(ws.bucket_list_size(word_set))
count = 0 # reset count
every_1000_start = time.time() # reset timer
tot_add_time_stop = time.time()
# Hash specific data
print("Elements in set:",ws.count(word_set))
mx = ws.max_bucket_size(word_set)
print("Max bucket:", mx)
buckets = ws.bucket_list_size(word_set)
print("Bucket list size:", buckets)
# Time data
time_word_split = split_time_stop - split_time_start
time_set_add = tot_add_time_stop - tot_add_time_start
print("Time for word split:",time_word_split)
print("Time for adding to set:",time_set_add)
#plot 1
# x-axis: size_for_each_1000
# y-axis: bucket_list_size_for_each_1000
x = [x for x in size_for_each_1000]
y = [q for q in bucket_list_size_for_each_1000]
plt.xlabel('Size')
plt.ylabel('Bucket size')
plt.plot(x, y)
plt.show()
#plot 2
# x-axis: size_for_each_1000
# y-axis: add_time_for_1000
x = [x for x in size_for_each_1000]
y = [q for q in add_time_for_1000]
plt.xlabel('Size')
plt.ylabel('Time')
plt.plot(x, y)
plt.show()
#plot 3
# x-axis: size_for_each_1000
# y-axis: max_depth_for_each_1000
x = [x for x in size_for_each_1000]
y = [q for q in max_depth_for_each_1000]
plt.xlabel('Size')
plt.ylabel('Depth')
plt.plot(x, y)
plt.show()