-
Notifications
You must be signed in to change notification settings - Fork 2
/
generate_dataset.py
97 lines (88 loc) · 2.58 KB
/
generate_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# High-level algorithm:
# A small fraction of the embeddings (e.g. 1%) is used by a large fraction of the samples (e.g. 90%).
# We sample that fraction of embedding ids per table as the "hot" set, draw from it x% of the time, and choose the remaining ids uniformly at random.
import csv
import json
import math
import random
import numpy as np
# One entry per embedding table: the number of distinct ids in that table.
# Ids for table i are drawn from range(embedding_table_sizes[i]).
embedding_table_sizes = [
    1_460,
    583,
    10_131_227,
    2_202_608,
    305,
    24,
    12_517,
    633,
    3,
    93_145,
    5_683,
    8_351_593,
    3_194,
    27,
    14_992,
    5_461_306,
    10,
    5_652,
    2_173,
    4,
    7_046_547,
    18,
    15,
    286_181,
    105,
    142_572,
]

# Number of samples (output lines) the generator writes.
dataset_lines = 39_291_958
def main(
    topk_perc,
    use_sample,
    *,
    num_lines=None,
    table_sizes=None,
    output_path=None,
    seed=None,
):
    """Write a synthetic dataset whose embedding accesses are skewed.

    For every embedding table a "hot" set holding ``topk_perc`` percent of
    its ids is drawn once.  Each sparse lookup of each sample is then taken
    from that hot set with probability ``use_sample`` percent, and otherwise
    drawn uniformly from the whole table.  Read ``main(1, 90)`` as "1% of
    the embeddings are used by 90% of the lookups".

    One JSON object per line is written with the keys ``"label"`` (always
    1.0), ``"dense"`` (a fixed list of 13 floats) and ``"sparse"`` (one id
    per table).  Note the default file name ends in ``.csv`` even though the
    content is JSON lines; the name is kept for compatibility.

    Args:
        topk_perc: Percentage (0-100) of each table's ids that are hot.
        use_sample: Percentage (0-100) of lookups served from the hot set.
        num_lines: Number of samples to write; defaults to the module-level
            ``dataset_lines``.
        table_sizes: Sequence of table sizes; defaults to the module-level
            ``embedding_table_sizes``.
        output_path: Destination file; defaults to
            ``kaggle_artifiial_top_{topk_perc}_use_{use_sample}_fraction.csv``
            in the current directory.
        seed: If given, a private ``random.Random(seed)`` drives every random
            choice so the output is reproducible.  If ``None``, the global
            ``random`` module state is used.

    Raises:
        ValueError: If a percentage is outside [0, 100], or if the hot set
            would be empty (``topk_perc == 0``) while ``use_sample > 0``.
    """
    # Fail fast instead of part-way through a multi-hour generation run.
    if not 0 <= topk_perc <= 100:
        raise ValueError(f"topk_perc must be within [0, 100], got {topk_perc!r}")
    if not 0 <= use_sample <= 100:
        raise ValueError(f"use_sample must be within [0, 100], got {use_sample!r}")
    if topk_perc == 0 and use_sample > 0:
        raise ValueError("topk_perc must be > 0 when use_sample > 0")

    # Module-level defaults are only looked up when the caller did not
    # override them.
    if table_sizes is None:
        table_sizes = embedding_table_sizes
    if num_lines is None:
        num_lines = dataset_lines
    if output_path is None:
        output_path = (
            f"kaggle_artifiial_top_{topk_perc}_use_{use_sample}_fraction.csv"
        )

    # A single random source for both the hot-set choice and the per-lookup
    # draws, so that one seed reproduces the whole file.
    rng = random if seed is None else random.Random(seed)

    # Hot ids per table: ceil(topk_perc% of the table), sampled without
    # replacement.
    hot_ids = [
        rng.sample(range(size), math.ceil((topk_perc * size) / 100))
        for size in table_sizes
    ]
    hot_prob = use_sample / 100

    # The dense features are identical for every sample; build them once.
    dense_features = [
        2.5649492740631104,
        3.044522523880005,
        1.3862943649291992,
        1.3862943649291992,
        1.0986123085021973,
        1.3862943649291992,
        2.70805025100708,
        3.7841897010803223,
        3.8712010383605957,
        1.0986123085021973,
        1.3862943649291992,
        0.0,
        1.0986123085021973,
    ]

    # The context manager guarantees the file is flushed and closed, even if
    # generation is interrupted by an exception.
    with open(output_path, "w") as out_file:
        for _ in range(num_lines):
            # Independent Bernoulli(hot_prob) trial per table: hot id or
            # uniform id over the whole table.
            sparse_ids = [
                rng.choice(hot) if rng.random() < hot_prob else rng.randrange(size)
                for hot, size in zip(hot_ids, table_sizes)
            ]
            record = {
                "label": 1.0,
                "dense": dense_features,
                "sparse": sparse_ids,
            }
            out_file.write(json.dumps(record))
            out_file.write("\n")
if __name__ == "__main__":
    # Script entry point: hot set is 1% of each table, and 10% of the
    # lookups are drawn from it.
    main(topk_perc=1, use_sample=10)