-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathrandomize_words.py
77 lines (62 loc) · 2.74 KB
/
randomize_words.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import os
import nltk
import json
import random
import numpy as np
# Coherent sentences: rebuild every "Arg1 Connective Arg2" sentence from the
# JSON-lines input and write one lower-cased, tokenized sentence per line.
coherent_file = 'coherent_sentences.json'

# Each non-blank input line is parsed as one JSON record. The context manager
# closes the input handle as soon as the records are loaded.
with open("data/json/" + coherent_file, 'r') as json_lines:
    file_content = [json.loads(line) for line in json_lines if line.strip()]

outfile = "data/random/coherent_sentences.txt"

# A single 'wb' open truncates any previous output (no separate "clear" open
# is needed) and accepts the ASCII-encoded bytes written below; a text-mode
# handle would reject bytes on Python 3.
with open(outfile, 'wb') as coherent_output:
    for line in file_content:
        Arg1 = nltk.word_tokenize(line['Arg1Raw'].lower())
        Arg2 = nltk.word_tokenize(line['Arg2Raw'].lower())
        Conn = nltk.word_tokenize(line['ConnectiveRaw'].lower())
        sentence = " ".join(Arg1) + " " + " ".join(Conn) + " " + " ".join(Arg2) + "\n"
        # Characters outside ASCII are dropped instead of raising.
        coherent_output.write(sentence.encode('ascii', 'ignore'))
# Incoherent sentences: for every gamma, write a copy of the corpus in which
# a random subset of the Arg2 tokens of each sentence has been permuted.
def _shuffle_words(words, gamma, max_tries=100):
    """Return a copy of *words* with a random subset of positions permuted.

    Every position is selected independently with probability *gamma*. The
    words at the selected positions form a pool; each selected position is
    then refilled with a word drawn from that pool without replacement, so
    the result contains exactly the same words as the input. Unselected
    positions keep their word.

    A draw that equals the word already at that position is retried up to
    *max_tries* times. The original word can therefore still be kept when
    the pool holds nothing else (for example at the last selected position
    or with repeated words).

    If fewer than two positions are selected there is nothing to permute
    and an unchanged copy is returned.
    """
    selected = [random.uniform(0, 1) < gamma for _ in words]
    pool = [word for word, is_selected in zip(words, selected) if is_selected]
    if len(pool) < 2:
        return list(words)

    shuffled = []
    for word, is_selected in zip(words, selected):
        if not is_selected:
            shuffled.append(word)
            continue
        replacement = random.choice(pool)
        tries = 0
        # Prefer a word that differs from the one being replaced.
        while replacement == word and tries < max_tries:
            replacement = random.choice(pool)
            tries += 1
        shuffled.append(replacement)
        # Drawn words leave the pool so each one is used exactly once.
        pool.remove(replacement)
    return shuffled


file_to_randomize = 'incoherent_sentences_arg2_diff_sense.json'
# Shuffle probabilities to sweep; the stop value is 1.1 so that the sweep is
# meant to include 1.0 (np.arange excludes the stop value itself).
gammas = np.arange(0.0, 1.1, 0.1)

# Each non-blank input line is parsed as one JSON record. The context manager
# closes the input handle as soon as the records are loaded.
with open("data/json/" + file_to_randomize, 'r') as json_lines:
    file_content = [json.loads(line) for line in json_lines if line.strip()]

# Gammas represent the probability of a word being shuffled
for gamma in gammas:
    # One output file per gamma, named after the input file without its
    # extension; '{:g}' keeps the gamma short (e.g. 0.3, not 0.30000000000000004).
    outfile = "data/random/{}_gamma_{:g}.txt".format(
        os.path.splitext(file_to_randomize)[0], gamma)
    # A single 'wb' open truncates any previous output and accepts the
    # ASCII-encoded bytes written below; a text-mode handle would reject
    # bytes on Python 3. The handle is closed before the next gamma starts.
    with open(outfile, 'wb') as randomized_file:
        for line in file_content:
            # Get Arg1, Arg2 and Conn
            Arg1 = nltk.word_tokenize(line['Arg1Raw'].lower())
            Arg2 = nltk.word_tokenize(line['Arg2Raw'].lower())
            Conn = nltk.word_tokenize(line['ConnectiveRaw'].lower())
            shuffled_Arg2 = _shuffle_words(Arg2, gamma)
            # Shuffled sentence
            sentence = " ".join(Arg1) + " " + " ".join(Conn) + " " + " ".join(shuffled_Arg2) + "\n"
            # Characters outside ASCII are dropped instead of raising.
            randomized_file.write(sentence.encode('ascii', 'ignore'))