# Given a number of output logs from zzt-puzzle,
# create a factorial experimental design that can be used to
# fit an estimated difficulty function later.
# Maybe I'll do Latin square or something else later?
import numpy as np
import itertools
def read_puzzle_metadata(puzzle_file, puzzle_data=None):
    # Default to None instead of {} to avoid the shared mutable
    # default argument pitfall.
    if puzzle_data is None:
        puzzle_data = {}
    for line in puzzle_file:
        # We're only interested in the metadata.
        if "summary:" not in line:
            continue
        metadata_words = line.split()
        name = metadata_words[1].rstrip(":")
        values = [float(x) for x in metadata_words[3:]]
        puzzle_data[name] = values
    return puzzle_data
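# A sketch of the log line shape the parser above assumes; the
# exact zzt-scan output format is my inference from the indexing
# (word 1 is the puzzle name, words 3 onward are the metrics):
#   "puzzle coolmaze: summary: 0.5 1.2 3.0"
# would yield puzzle_data["coolmaze"] == [0.5, 1.2, 3.0]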
# Return the row indices of the k rows in the haystack with
# the smallest Euclidean distance to the needle.
def k_best_euclidean(haystack, needle, k):
    distances = np.linalg.norm(haystack - needle, axis=1)
    best_k = np.argsort(distances)[:k]
    # Return a list of tuples giving both the k best row indices
    # and their ranks (0 being best). The ranks are used for
    # progressive sampling later: if I give up before all the
    # puzzles have been solved when testing difficulty levels,
    # the data will still be useful.
    return [(rank, best_k[rank]) for rank in range(k)]
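# A minimal usage sketch:
#   haystack = np.array([[0, 0], [3, 4], [1, 1]])
#   k_best_euclidean(haystack, np.array([0, 0]), 2)
# returns [(0, 0), (1, 2)]: row 0 (distance 0) gets rank 0 and
# row 2 (distance sqrt(2)) gets rank 1.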
# Read the files.
# You may want to add different file names here.
# These files are logs obtained by ./zzt-scan |tee filename.txt
y = {}
for filename in ["para_skip_test_III.txt", "para_skip_test_II.txt",
        "para_skip_test_IV.txt", "para_skip_test.txt", "para_skip_test_VII.txt",
        "para_skip_test_VI.txt", "para_skip_test_V.txt"]:
    with open(filename, "r") as puzzle_file:
        y = read_puzzle_metadata(puzzle_file, y)
print("Seen %d puzzles in total" % len(y))
# Get the keys and values
y_keys = tuple(y.keys())
y_values = np.array(list(y.values()))
# Remove the column with index 11 (along axis 1), as that's the
# raw nodes visited count, which I'm pretty sure isn't important.
reduced_y_values = np.delete(y_values, 11, axis=1)
# Get the percentiles that will stand in for each factor level.
levels = 2
percentiles = np.percentile(reduced_y_values, axis=0,
                            q=np.linspace(0, 100, levels))
# Get the possible combinations of factor levels that we can
# use. The use of percentiles works to standardize things here,
# although I'm not sure if it generalizes to more than two
# levels. If we have three, we should probably use min, (min+max)/2,
# and max, for instance, even if the distribution is very skewed.
products = np.array(list(itertools.product(*percentiles.T)))
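# With levels=2 the percentiles are just the per-column min and
# max, so for, say, two factors spanning [0, 4] and [1, 9] the
# products would be the four corner points:
#   [[0, 1], [0, 9], [4, 1], [4, 9]]
# Sanity check (a sketch): the full design has one point per
# combination of factor levels.
assert products.shape == (levels ** reduced_y_values.shape[1],
                          reduced_y_values.shape[1])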
# How many puzzles (examples) from each experiment do we want?
max_puzzles_per_experiment = 10
puzzle_index_ranks = {}
for i in range(len(products)):
    print("Progress: %.5f, puzzles: %d" % (i/len(products),
        len(puzzle_index_ranks)))
    best_ranks_indices = k_best_euclidean(
        reduced_y_values, products[i], max_puzzles_per_experiment)
    for rank, index in best_ranks_indices:
        if index not in puzzle_index_ranks:
            puzzle_index_ranks[index] = rank
        else:
            puzzle_index_ranks[index] = min(rank,
                puzzle_index_ranks[index])
# Now extract the puzzle indices, turn them into puzzle names,
# randomly shuffle, and then sort by rank. Since the sort is
# stable, this will place all the rank-0 puzzles first (in
# random order), then all the rank-1 ones, etc.
puzzle_rank_list = [(y_keys[x[0]], x[1]) for x in puzzle_index_ranks.items()]
np.random.shuffle(puzzle_rank_list)
puzzle_rank_list.sort(key=lambda in_tuple: in_tuple[1])
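# For example, if the list were [("a", 1), ("b", 0), ("c", 0)],
# the shuffle randomizes "b" and "c" relative to each other, and
# the stable sort then puts both of them before "a".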
# Extract just the puzzle names, then print them.
puzzles = [x[0] for x in puzzle_rank_list]
print(puzzles)
# And now we can do something like say
# open("factorial_design_10.txt", "w").write(str(puzzles))