participant_statistics.py
import os
import sys

import pandas as pd
import numpy as np
from scipy import stats

# Calculate basic statistics for each participant.
# Reads the file 'RESULTS_FILE.txt' generated for each participant by EyeLink.
# Usage: python participant_statistics.py /path/to/raweyetrackingdata/
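#
# Input assumptions (inferred from the code below, not a formal spec): the data
# directory is expected to contain one folder per participant (names starting
# with "P"), each holding a tab-separated RESULTS_FILE.txt with at least the
# columns 'condition', 'paragraphid', 'question', 'QUESTION_ACCURACY',
# 'SENTENCE_RT', 'text', and 'speechid'.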


def comprehension_score(results_df):
    """Calculate the comprehension score (average accuracy over all answered questions)."""
    questions = results_df.loc[results_df['question'] != "NO QUESTION"]
    print(sum(questions['QUESTION_ACCURACY']), len(questions))
    average_accuracy = sum(questions['QUESTION_ACCURACY']) / len(questions)
    return average_accuracy, len(questions)


def wpm_rate(results_df):
    """Calculate the words-per-minute reading rate, averaged over screens."""
    # reading time per screen in seconds (sampling rate 1000 Hz)
    absolute_reading_times = results_df['SENTENCE_RT'] / 1000
    wpms = []
    for text, text_time in zip(results_df['text'], absolute_reading_times):
        words = len(text.split())
        wpm = words * (60 / text_time)
        # exclude screens that were skipped by mistake (implausibly fast "reading")
        if not wpm > 1000:
            wpms.append(wpm)
    avg_wpm = np.mean(wpms)
    return avg_wpm


def reading_time(results_df):
    """Return the average absolute reading time (seconds spent on each screen)
    and the average reading time normalized by the number of words on the screen."""
    # reading time per screen in seconds (sampling rate 1000 Hz)
    absolute_reading_times = results_df['SENTENCE_RT'] / 1000
    avg_absolute_reading_times = np.mean(absolute_reading_times)
    relative_reading_times = []
    for text, abs_time in zip(results_df['text'], absolute_reading_times):
        words = len(text.split())
        relative_reading_times.append(abs_time / words)
    avg_relative_reading_times = np.mean(relative_reading_times)
    return avg_absolute_reading_times, avg_relative_reading_times


def add_demographic_info(df_participants):
    """Merge anonymized demographic information into the participant table."""
    info = pd.read_excel(
        "./utils/ParticipantOverviewAnonymized.xlsx",
        usecols=["age", "sex", "dyslexia", "subj", "native_language",
                 "score_reading_comprehension_test", "vision", "pseudohomophone_score"])
    df_participants = pd.merge(df_participants, info, on='subj')
    return df_participants


def main():
    data_dir = sys.argv[1]
    participant_stats = pd.DataFrame(
        columns=['subj', 'comprehension_accuracy', 'number_of_speeches', 'number_of_questions',
                 'absolute_reading_time', 'relative_reading_time', 'words_per_minute'])
    speeches_read_all = []
    comprehension_accs = []
    questions = []
    speeches = []
    for item in os.listdir(data_dir):
        # Participant folders start with "P"; the last two characters encode the participant
        # number (<= 22 for typical readers, >= 23 for dyslexic participants).
        if item.startswith("P"):
            subject_id = item
            results_file_path = os.path.join(data_dir, item, 'RESULTS_FILE.txt')
            results = pd.read_csv(results_file_path, delimiter="\t")
            # remove practice trials
            results = results[results.condition != "practice"]
            # remove beginning-of-speech trials
            results = results[results.paragraphid != -1]
            avg_accuracy, question_no = comprehension_score(results)
            comprehension_accs.append(avg_accuracy)
            questions.append(question_no)
            abs_read_time, rel_read_time = reading_time(results)
            subj_wpm_rate = wpm_rate(results)
            speeches_read = list(set(results['speechid'].values))
            # DataFrame.append was removed in pandas 2.0; build the row with pd.concat instead
            new_row = pd.DataFrame([{
                'subj': subject_id,
                'comprehension_accuracy': "{:.2f}".format(avg_accuracy),
                'number_of_speeches': len(speeches_read),
                'number_of_questions': question_no,
                'absolute_reading_time': "{:.2f}".format(abs_read_time),
                'relative_reading_time': "{:.2f}".format(rel_read_time),
                'words_per_minute': "{:.2f}".format(subj_wpm_rate)}])
            participant_stats = pd.concat([participant_stats, new_row], ignore_index=True)
            speeches_read_all += speeches_read
            speeches.append(len(speeches_read))

    participant_stats = add_demographic_info(participant_stats)
    print(participant_stats.sort_values('subj'))
    print("Average WPM:", participant_stats['words_per_minute'].astype(float).mean())

    print("Correlation between comprehension accuracy and reading time:")
    print(stats.spearmanr(participant_stats['comprehension_accuracy'].astype(float),
                          participant_stats['absolute_reading_time'].astype(float)))

    print("OUTLIERS:")
    # outliers: participants whose reading time deviates by more than 2 standard deviations
    # from the mean over all participants
    reading_times = participant_stats['absolute_reading_time'].astype(float).to_numpy()
    upper = np.mean(reading_times) + 2 * np.std(reading_times)
    lower = np.mean(reading_times) - 2 * np.std(reading_times)
    for idx, row in participant_stats.iterrows():
        if float(row.absolute_reading_time) > upper or float(row.absolute_reading_time) < lower:
            print(row.subj)

    print("MEANS (compr. acc, no. of speeches, no. of questions, reading time, age):")
    print(np.mean(comprehension_accs), np.mean(speeches), np.mean(questions),
          np.mean(reading_times), np.nanmean(participant_stats['age'].tolist()))
    print("TOTAL (no. of speeches, no. of questions):")
    print(len(speeches_read_all), sum(questions))

    sorted_participant_stats = participant_stats.sort_values(by=["subj"])
    sorted_participant_stats.to_csv("participant_stats.csv", index=False)

    print("Total speeches read: ", len(speeches_read_all))
    print("Unique speeches read: ", len(set(speeches_read_all)))
    # how often each speech was read
    speech_freq = {i: speeches_read_all.count(i) for i in set(speeches_read_all)}
    print("How often each speech is read:")
    print(speech_freq)


if __name__ == '__main__':
    main()