-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathplot-cpp-data.py
267 lines (229 loc) · 11.6 KB
/
plot-cpp-data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import sys
import os
# Both the CSV path (sys.argv[1]) and the hyperparameter JSON path (sys.argv[2])
# are read unconditionally further down, so anything short of three argv
# entries cannot work and is rejected here with the usage message.
if len(sys.argv) < 3:
    print("Usage: python plot-cpp-data.py <path-to-csv-file> <path-to-hypparam-file> <optional: path-to-output-dir>")
    # sys.exit is used instead of the `exit` helper, which the `site` module
    # installs for interactive sessions.
    sys.exit(1)
def midi_to_timeseries(midiPath):
    """Parse a text dump of MIDI events into an array of the type-144 events.

    Every non-blank line must hold at least four whitespace-separated fields:
    a decimal timestamp, then the event type, note and velocity written in
    hexadecimal. Blank lines are skipped.

    Args:
        midiPath: Path of the text file to read.

    Returns:
        An integer numpy array of shape ``(n, 3)`` holding one
        ``[timestamp, note, velocity]`` row per event whose type is 144.
        ``n`` is 0 when the file contains no such event.

    Raises:
        ValueError: If a field cannot be parsed as an integer.
        IndexError: If a non-blank line has fewer than four fields.
    """
    keep_type = 144  # 0x90; presumably MIDI note-on - confirm
    known_types = (128, keep_type, 176)
    time_notes = []
    with open(midiPath) as midi_events:
        for line in midi_events:
            # split() without a separator tolerates repeated spaces, tabs and
            # the trailing newline; a blank line yields no fields at all.
            fields = line.split()
            if not fields:
                continue
            timestamp = int(fields[0])
            event_type = int(fields[1], 16)
            note = int(fields[2], 16)
            velocity = int(fields[3], 16)
            if event_type not in known_types:
                print("unexpected event type got! ", event_type)
            if event_type == keep_type:
                time_notes.append([timestamp, note, velocity])
    # reshape keeps the result two-dimensional even when nothing was kept, so
    # callers can always index columns with [:, i].
    return np.array(time_notes, dtype=int).reshape(-1, 3)
# Load the run configuration; the context manager closes the file handle as
# soon as the JSON has been parsed.
with open(sys.argv[2], 'r') as hypparam_file:
    hypparam = json.load(hypparam_file)

# Every match produced by the matcher, one row per match.
simsDFall = pd.read_csv(sys.argv[1], dtype={'source_timestamp': int, 'target_timestamp':int, 'score':float,
                                            'source_id_start':int, 'source_id_end':int, 'target_id_start':int, 'target_id_end':int,
                                            'match_len':int, 'match_time':int})
# Subset of matches scoring above the literal cutoff 0.7 (not `thresh`).
simsDF = simsDFall.loc[simsDFall['score']>0.7]
notes = midi_to_timeseries(hypparam["midiPath"])

# `dir` is the prefix prepended to every output file name: "<argv[3]>/" when
# an output directory is supplied, otherwise "" (current working directory).
# The name shadows the builtin but is kept because the plotting functions in
# this file read it as a module-level global.
if len(sys.argv) > 3:
    dir = sys.argv[3] + "/"
    if not os.path.isdir(dir):
        print("Making directory", dir)
        # makedirs also creates missing parent directories, which os.mkdir
        # refuses to do.
        os.makedirs(dir, exist_ok=True)
else:
    dir = ""

# Source timestamps to evaluate: start, start+skip, ... strictly below end.
start = int(hypparam["start"])
end = int(hypparam["end"])
skip = int(hypparam["skip"])
curr_times = np.arange(start, end, skip)
minNotes = int(hypparam["minNotes"])
maxNotes = int(hypparam["maxNotes"])
thresh = float(hypparam["thresh"])
def plot_above_thresh():
    """Scatter-plot the pre-filtered matches and save ``above_thresh_test.png``.

    Reads the module-level ``simsDF`` (source timestamp on x, target timestamp
    on y, marker size taken from the ``score`` column) and writes the image
    under the module-level ``dir`` prefix.
    """
    figure, axes = plt.subplots(figsize=(7, 5))
    # The title is fixed at 0.7 because simsDF is filtered with that literal
    # cutoff rather than with the configurable `thresh`.
    title = "All matches > 0.7"
    # Passing `ax` makes pandas draw into the figure created above; without it
    # pandas opens a figure of its own and the requested size is ignored.
    simsDF.plot.scatter(x="source_timestamp", y="target_timestamp", s='score', title=title, ax=axes)
    # Assigning `figure.autolayout` only sets an ad-hoc attribute; the layout
    # is tightened explicitly once all artists exist.
    figure.tight_layout()
    figure.savefig(dir+"above_thresh_test.png")
def plot_best_matches():
    """Plot the highest-scoring match per source timestamp and print statistics.

    For every source timestamp ``x`` in ``range(start, end, skip)`` the row of
    ``simsDFall`` with the largest score is selected and drawn as a blue point
    (source timestamp on x, target timestamp on y, score as marker size). A
    match counts as "on the line" when its target timestamp lies within
    ``acc`` of ``x - 191400``.

    The figure is saved as ``best_matches_test.png`` under the module-level
    ``dir`` prefix, and four percentages plus the lowest on-the-line score are
    printed.

    Returns:
        A tuple ``(distribution_hist, distribution_all, close_matches)`` that
        only covers on-the-line matches: the signed deviations from
        ``x - 191400``, ``[x, deviation]`` pairs, and the full DataFrame rows
        as lists. The first and third lists are appended to together, so they
        have equal length and matching order.
    """
    figure = plt.figure(figsize=(7,5))
    # NOTE(review): this assigns a plain attribute on the Figure object; the
    # matplotlib switch with this name is the rcParam set below - confirm this
    # line has any effect.
    figure.autolayout = True
    # Counters; each is divided by `total` after the loop to give a fraction.
    percentage_matches_any = 0
    percentage_matches_thresh = 0
    percentage_matches_line = 0
    percentage_matches_thresh_line = 0
    acc = 50 #ms accuracy for distance from line
    # rcParams are process-wide: these two assignments also apply to every
    # figure created after this function has run.
    plt.rcParams["figure.figsize"] = [7, 5]
    plt.rcParams["figure.autolayout"] = True
    bestMatches = []  # only referenced by the commented-out code below
    toPlot = simsDFall
    # toPlot = simsDFall_old
    total = 0
    distribution_hist = []
    distribution_all = []
    close_matches = []
    # Running minimum of the scores of on-the-line matches; starts at 1.
    min_good_score = 1
    for x in range(start, end, skip):
        # Scores of all matches whose source timestamp is exactly x.
        y = toPlot.loc[toPlot['source_timestamp'] == x]['score']
        total += 1
        # Series.any() is True only if at least one score is non-zero, so a
        # timestamp whose matches all score 0 is treated like "no match".
        if y.any():
            # From here on y is the index label of the best-scoring row.
            y = y.idxmax()
            percentage_matches_any += 1
            # NOTE(review): the nesting assumes the thresh_line counter sits
            # inside the score check and the line check below is its sibling
            # - confirm against the original layout.
            if toPlot['score'].loc[y] > thresh:
                percentage_matches_thresh += 1
                if np.abs(toPlot['target_timestamp'].loc[y]-(x-191400))<acc:
                    percentage_matches_thresh_line += 1
            # 191400 is the fixed offset between a source time and its
            # expected target time (presumably ms - confirm).
            if np.abs(toPlot['target_timestamp'].loc[y]-(x-191400))<acc:
                if toPlot['score'].loc[y] < min_good_score:
                    min_good_score = toPlot['score'].loc[y]
                distribution_hist.append((toPlot['target_timestamp'].loc[y]-(x-191400)))
                distribution_all.append([x,(toPlot['target_timestamp'].loc[y]-(x-191400))])
                percentage_matches_line += 1
                close_matches.append(toPlot.loc[y].tolist())
        else:
            # No usable match for this x: nothing to plot.
            continue
        # bestMatch = [toPlot['source_timestamp'].loc[y],toPlot['target_timestamp'].loc[y],toPlot['score'].loc[y],toPlot['source_id_start'].loc[y],toPlot['source_id_end'].loc[y],toPlot['target_id_start'].loc[y],toPlot['target_id_end'].loc[y]]
        # bestMatches.append(bestMatch)
        # Third positional argument of scatter is the marker size.
        plt.scatter(toPlot['source_timestamp'].loc[y],toPlot['target_timestamp'].loc[y],toPlot['score'].loc[y],c='blue')
    # bestMatches = np.array(bestMatches)
    # bestMatchesDF = pd.DataFrame(data=bestMatches, columns=["source_timestamp","target_timestamp", "score", "source_id_start", "source_id_end", "target_id_start", "target_id_end"])
    # Reference lines: the identity y = x, and y = x - 191400.
    plt.plot(np.arange(170000,400000),np.arange(170000,400000))
    plt.plot(np.arange(191400,391400),np.arange(200000))
    # NOTE(review): raises ZeroDivisionError if range(start, end, skip) is empty.
    percentage_matches_any /= total
    percentage_matches_thresh /= total
    percentage_matches_line /= total
    percentage_matches_thresh_line /= total
    print("Percentage matches found: {:.1f}%".format(percentage_matches_any*100))
    print("Percentage matches >{:.1f} found: {:.1f}%".format(thresh,percentage_matches_thresh*100))
    print("Percentage matches within {:d}ms of line: {:.1f}%".format(acc,percentage_matches_line*100))
    print("Percentage matches >{:.1f} and within {:d}ms of the line: {:.1f}%".format(thresh,acc,percentage_matches_thresh_line*100))
    print("Minimum score of a \"right match\": {:.2f}".format(min_good_score))
    print()
    # print("Number of loops sped up: {:d}".format(num_speedups))
    # print("Percentage loops sped up: {:.2f} - rough estimate".format((num_speedups/(len(curr_times)))*100))
    plt.title("Best match for \nMin Snippet Length=" + str(minNotes) + " notes, Max Notes=" + str(maxNotes))
    plt.xlabel("Source Timestamp (ms)")
    plt.ylabel("Best Match Timestamp")
    plt.savefig(dir+"best_matches_test.png")
    return distribution_hist, distribution_all, close_matches
def plot_expect_diff_histogram(distribution_hist):
    """Print mean/std of the deviations and save them as a histogram.

    Args:
        distribution_hist: Sequence of time differences between each best
            match and its expected position.

    The image is written to ``expect_diff_histogram.png`` under the
    module-level ``dir`` prefix. With no deviations there is nothing to
    summarise or plot, so the function reports that and returns without
    creating a figure.
    """
    # len() rather than truthiness so that numpy arrays are accepted as well.
    if len(distribution_hist) == 0:
        print("No deviations to plot; skipping expect_diff_histogram.png")
        return
    figure = plt.figure(figsize=(7,5))
    mean = np.mean(distribution_hist)
    std = np.std(distribution_hist)
    print("Distribution Mean:{:.2f}".format(mean))
    print("Distribution Standad Deviation:{:.2f}".format(std))
    # One bin per unit of spread, but never fewer than one: a spread below 1
    # would otherwise request zero bins, which matplotlib rejects.
    spread = max(distribution_hist) - min(distribution_hist)
    plt.hist(distribution_hist, bins=max(1, int(spread)))
    plt.title("Histogram of time difference from expected")
    plt.xlabel("Time difference (ms)")
    plt.ylabel("Frequency")
    # Assigning `figure.autolayout` only sets an ad-hoc attribute; tighten
    # the layout explicitly once every label is in place.
    figure.tight_layout()
    plt.savefig(dir+"expect_diff_histogram.png")
# Plotting k worst matches - matches on the line but farthest from it
def k_worst_matches(distribution_hist, close_matches, k = 1):
    """Plot the ``k`` close matches that deviate most from the expected time.

    Args:
        distribution_hist: Signed deviation of every entry of
            ``close_matches`` (same order, same length).
        close_matches: Rows of nine values each: source_timestamp,
            target_timestamp, score, source_id_start, source_id_end,
            target_id_start, target_id_end, match_len, match_time.
        k: How many matches to plot. Fewer are plotted when fewer exist.

    For each selected match two figures are produced through
    ``display_expected_actual`` and saved under the module-level ``dir``
    prefix as ``<rank>th_worst_match_1.png`` and ``<rank>th_worst_match_2.png``,
    where rank 1 is the match with the largest absolute deviation. The
    selected rows are printed, cast to int, before plotting. Nothing happens
    when ``k`` is not positive or there are no close matches.
    """
    if k <= 0 or len(close_matches) == 0:
        return
    columns = ['source_timestamp', 'target_timestamp', 'score',
               'source_id_start', 'source_id_end',
               'target_id_start', 'target_id_end',
               'match_len', 'match_time']
    close_matches_df = pd.DataFrame(data=np.array(close_matches), columns=columns)
    deviations = np.array(distribution_hist)
    close_matches_df['deviation'] = deviations
    # "Farthest from the line" is a distance, so rank by magnitude: sorting
    # the signed values would never select large negative deviations.
    order = np.argsort(np.abs(deviations), kind="stable")
    worst_matches = np.array(close_matches_df.iloc[order[-k:]], dtype=float)
    print(worst_matches.astype(int))
    selected = len(worst_matches)
    for position, match in enumerate(worst_matches):
        # Rows are in ascending order of |deviation|, so the last one is rank 1
        # even when fewer than k matches were available.
        rank = selected - position
        source_start = int(match[0])
        target_start = int(match[1])
        score = match[2]
        source_id_start = int(match[3])
        source_id_end = int(match[4])
        target_id_start = int(match[5])
        target_id_end = int(match[6])
        match_time = int(match[8])
        # plot worst match (in terms of distance from expected timestamp)
        diff = source_start - target_start
        # NOTE(review): the slices run from *_id_end to *_id_start+3, which
        # presumes the end index is the smaller of the two - confirm.
        sequence1 = np.copy(notes[source_id_end:source_id_start+3])
        # Shift the source notes onto the target's time axis.
        sequence1[:,0] = sequence1[:,0]-diff
        sequence2 = np.copy(notes[target_id_end:target_id_start+3])
        # Same target notes, moved by (191400 - diff); 191400 is the fixed
        # source-to-expected-target offset used elsewhere in this file.
        sequence3 = sequence2.copy()
        sequence3[:,0] = sequence2[:,0]-diff+191400
        display_expected_actual(sequence1,sequence2,source_start,target_start,source_start-diff,target_start,source_start-191400,diff,score,match_time,l1="Source",l2="Matching Target",l3="Expected Target")
        plt.savefig(dir+str(rank)+"th_worst_match_1.png")
        # Release the figure once written so repeated calls do not pile up
        # open figures.
        plt.close()
        display_expected_actual(sequence1,sequence3,source_start,target_start,source_start-191400,source_start-191400,target_start,diff,score,match_time,l1="Source",l2="Expected Target",l3="Matching Target")
        plt.savefig(dir+str(rank)+"th_worst_match_2.png")
        plt.close()
def display_expected_actual(sequence_source,sequence_target,source_start,target_start,t1,t2,t3,diff,score,lent,l1="Source",l2="Matching Target",l3="Expected Target"):
    """Draw two note sequences plus three labelled vertical time markers.

    A new figure is created and left as the current pyplot figure; saving it
    is up to the caller.

    Args:
        sequence_source: Rows whose element 0 is plotted on x and element 1 on
            y, drawn with '*' markers.
        sequence_target: Same layout, drawn with '.' markers.
        source_start: Integer shown in the title as the source time.
        target_start: Integer shown in the title as the target time.
        t1, t2, t3: x positions of the vertical lines labelled ``l1``, ``l2``
            and ``l3``; each line spans y = 0 to 88.
        diff: Offset between the bottom x axis and the secondary top axis
            (top = bottom + diff).
        score: Number shown in the title with two decimals.
        lent: Integer shown in the title as "Length (ms)".
        l1, l2, l3: Legend labels; ``l1`` and ``l2`` also appear in the title.
    """
    fig, ax1 = plt.subplots()

    # Conversions between the bottom (source) axis and the top (target) axis.
    def s2t(x):
        return x+diff

    def t2s(x):
        return x-diff

    y_span = [0, 88]
    ax1.plot(np.full(2, t1), y_span, linewidth=3, label=l1)
    ax1.plot(np.full(2, t2), y_span, label=l2)
    ax1.plot(np.full(2, t3), y_span, label=l3)
    # x = timestamp (element 0), y = note number (element 1) of each row.
    ax1.scatter([row[0] for row in sequence_source],
                [row[1] for row in sequence_source], marker='*')
    ax1.scatter([row[0] for row in sequence_target],
                [row[1] for row in sequence_target], marker='.')
    title = "Plot with alignment bw {:s} and {:s}: Source @ {:d} ms, Target @ {:d} ms, Score: {:.2f}, Length (ms): {:d}".format(l1,l2,source_start,target_start,score,lent)
    secax = ax1.secondary_xaxis('top', functions=(s2t, t2s))
    plt.title(title)
    ax1.set_xlabel("Source Snippet Time (in ms)")
    secax.set_xlabel("Target Snippet Time (in ms)")
    ax1.set_ylabel("Note (Integer Representation)")
    ax1.legend()
    # tight_layout is a one-shot adjustment, so it has to run after the
    # title, axis labels and legend exist for them to be taken into account.
    fig.tight_layout()
# Deviation of each best match from its expected target, against source time
def dist_from_line(distribution_all):
    """Save a line plot of deviation (column 1) against source time (column 0).

    Args:
        distribution_all: Sequence of ``[source_timestamp, deviation]`` pairs.

    The image is written to ``dist_from_line.png`` under the module-level
    ``dir`` prefix.
    """
    plt.figure(figsize=(10, 3.50))
    pairs = np.array(distribution_all)
    source_times = pairs[:, 0]
    deviations = pairs[:, 1]
    plt.plot(source_times, deviations)
    plt.title("Plot showing time difference betweeen Best Match and Expected Match for every source time in second playthrough")
    plt.ylabel("Target Timestamp Deviation from Expected (ms)")
    plt.xlabel("Source Timestamp (ms)")
    output_path = dir+"dist_from_line.png"
    plt.savefig(output_path)
def best_match_score():
    """Plot the best match score for every source timestamp in ``curr_times``.

    For each timestamp ``x`` the highest score among the rows of
    ``simsDFall`` with that source timestamp is plotted. The point is orange
    when, among the best-scoring rows with the largest ``match_len``, any
    target timestamp lies within 100 of ``x - 191400``; otherwise it is blue.
    A timestamp without any match is drawn at score 0 in blue.

    The image is written to ``best_match_score.png`` under the module-level
    ``dir`` prefix.
    """
    # Plotting best match score
    figure = plt.figure(figsize=(10, 3.50))
    # used for legend
    pop_a = mpatches.Patch(color='blue', label='Unexpected match')
    pop_b = mpatches.Patch(color='orange', label='On the line match')
    xs, ys, colors = [], [], []
    for x in curr_times:
        # Filter once per timestamp and narrow down from there instead of
        # re-scanning the whole DataFrame for every condition.
        rows = simsDFall.loc[simsDFall['source_timestamp'] == x]
        y = rows['score'].max()
        if np.isnan(y):
            # No match for x: plot it at score 0. The colour must be set here
            # too, otherwise it is undefined on the first iteration or
            # carried over from the previous timestamp.
            y = 0
            color = 'blue'
        else:
            best = rows.loc[rows['score'] == y]
            s = best['match_len'].max()
            longest = best.loc[best['match_len'] == s]
            on_line = (np.abs(longest['target_timestamp']-(x-191400))<100).any()
            color = 'orange' if on_line else 'blue'
        xs.append(x)
        ys.append(y)
        colors.append(color)
    if xs:
        # Single draw call for all points; marker size is fixed at 1.
        plt.scatter(xs, ys, 1, c=colors) # need to include s (len of best match)
    plt.title("Best match score for \nMin Snippet Length=" + str(minNotes) + " notes, Max Notes=" + str(maxNotes))
    plt.xlabel("Source Timestamp (ms)")
    plt.ylabel("Best Match Score")
    plt.legend(handles=[pop_a,pop_b])
    # Assigning `figure.autolayout` only sets an ad-hoc attribute; tighten
    # the layout explicitly once every label is in place.
    figure.tight_layout()
    plt.savefig(dir+"best_match_score.png")
# Produce every plot in turn. plot_best_matches() has to run before the three
# calls that consume its return values; it also changes matplotlib rcParams,
# which applies to the figures created by the calls after it.
plot_above_thresh()
distribution_hist, distribution_all, close_matches = plot_best_matches()
plot_expect_diff_histogram(distribution_hist)
# Plot the five on-the-line matches that deviate most from the expected time.
k_worst_matches(distribution_hist, close_matches, k=5)
dist_from_line(distribution_all)
best_match_score()