-
Notifications
You must be signed in to change notification settings - Fork 54
/
custom_datasets.py
89 lines (64 loc) · 2.52 KB
/
custom_datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import random
import datasets
# Marker placed between the "Question: ... Answer:" prefix and the answer text
# of each sample built by `load_pubmed`.
SEPARATOR = '<<<SEP>>>'

# Names accepted by `load`; each one is resolved to a module-level function
# called `load_<name>`.
DATASETS = [
    'writing',
    'english',
    'german',
    'pubmed',
]
def load_pubmed(cache_dir):
    """Build question/answer strings from the 'pqa_labeled' train split of 'pubmed_qa'.

    Args:
        cache_dir: Forwarded to `datasets.load_dataset` as `cache_dir`.

    Returns:
        A list of strings, one per record, of the form
        'Question: <question> Answer:<SEPARATOR><long_answer>'.
    """
    records = datasets.load_dataset(
        'pubmed_qa', 'pqa_labeled', split='train', cache_dir=cache_dir)
    questions = records['question']
    long_answers = records['long_answer']
    # SEPARATOR marks where the question part ends and the answer begins.
    return [
        f'Question: {question} Answer:{SEPARATOR}{answer}'
        for question, answer in zip(questions, long_answers)
    ]
def process_prompt(prompt):
    """Return `prompt` with every '[ WP ]' and '[ OT ]' tag removed.

    Only the tag text itself is dropped; whitespace around it is left as is.
    """
    cleaned = prompt
    # Tags are stripped one after the other, '[ WP ]' first.
    for tag in ('[ WP ]', '[ OT ]'):
        cleaned = cleaned.replace(tag, '')
    return cleaned
def process_spaces(story):
    """Undo tokenizer-style spacing in `story` and return the cleaned text.

    Applies a fixed sequence of literal substring replacements (spaces before
    punctuation, quote markers, the '<newline>' token, lowercase ' i ', ...)
    and finally strips leading and trailing whitespace.
    """
    # Applied strictly in this order: later rules see the output of earlier
    # ones, so reordering entries can change the result.
    substitutions = (
        (' ,', ','),
        (' .', '.'),
        (' ?', '?'),
        (' !', '!'),
        (' ;', ';'),
        (' \'', '\''),
        (' ’ ', '\''),
        (' :', ':'),
        ('<newline>', '\n'),
        ('`` ', '"'),
        (' \'\'', '"'),
        ('\'\'', '"'),
        ('.. ', '... '),
        (' )', ')'),
        ('( ', '('),
        (' n\'t', 'n\'t'),
        (' i ', ' I '),
        (' i\'', ' I\''),
        ('\\\'', '\''),
        ('\n ', '\n'),
    )
    text = story
    for old, new in substitutions:
        text = text.replace(old, new)
    return text.strip()
def load_writing(cache_dir=None):
    """Load prompt+story texts from the local 'data/writingPrompts' files.

    Reads 'valid.wp_source' (prompts) and 'valid.wp_target' (stories) line by
    line, pairs them up by position, cleans each pair, drops texts that
    mention NSFW, and shuffles the rest with a fixed seed.

    Args:
        cache_dir: Unused; accepted so that `load` can call every loader with
            the same keyword argument.

    Returns:
        A list of cleaned "prompt story" strings in a deterministic shuffled
        order.

    Raises:
        OSError: If either input file cannot be opened.

    Side effects:
        Reseeds the global `random` generator with 0.
    """
    writing_path = 'data/writingPrompts'
    # Decode explicitly as UTF-8 rather than with the platform's locale
    # encoding, so the same files yield the same text on every machine.
    # NOTE(review): assumes the files are UTF-8 (the clean-up step matches the
    # non-ASCII '’' character) — confirm against the data.
    with open(f'{writing_path}/valid.wp_source', 'r', encoding='utf-8') as f:
        prompts = f.readlines()
    with open(f'{writing_path}/valid.wp_target', 'r', encoding='utf-8') as f:
        stories = f.readlines()

    prompts = [process_prompt(prompt) for prompt in prompts]
    # zip() pairs line i of the prompts with line i of the stories and stops
    # at the shorter file.
    joined = [process_spaces(prompt + " " + story) for prompt, story in zip(prompts, stories)]
    filtered = [story for story in joined if 'nsfw' not in story and 'NSFW' not in story]

    # Fixed seed so every run sees the same ordering. This deliberately keeps
    # reseeding the module-level generator, as callers may rely on it.
    random.seed(0)
    random.shuffle(filtered)

    return filtered
def load_language(language, cache_dir):
    """Return mid-length texts in one language from the wmt16 'de-en' train split.

    Args:
        language: 'en' or 'de'; selects which side of each translation pair
            is kept.
        cache_dir: Forwarded to `datasets.load_dataset` as `cache_dir`.

    Returns:
        A list of texts whose whitespace-separated word count is strictly
        between 100 and 150.

    Raises:
        AssertionError: If `language` is neither 'en' nor 'de'.
    """
    assert language in ['en', 'de']

    corpus = datasets.load_dataset('wmt16', 'de-en', split='train', cache_dir=cache_dir)
    texts = [pair[language] for pair in corpus['translation']]
    # Keep only texts with 101..149 words (both bounds exclusive).
    return [text for text in texts if 100 < len(text.split()) < 150]
def load_german(cache_dir):
    """Return the German ('de') texts selected by `load_language`."""
    return load_language(language='de', cache_dir=cache_dir)
def load_english(cache_dir):
    """Return the English ('en') texts selected by `load_language`."""
    return load_language(language='en', cache_dir=cache_dir)
def load(name, cache_dir, **kwargs):
    """Load the dataset called `name` through its `load_<name>` function.

    Args:
        name: One of the entries in `DATASETS`.
        cache_dir: Passed to the loader as the `cache_dir` keyword argument.
        **kwargs: Extra keyword arguments forwarded to the loader unchanged.

    Returns:
        Whatever the selected loader returns.

    Raises:
        ValueError: If `name` is not listed in `DATASETS`.
    """
    if name not in DATASETS:
        raise ValueError(f'Unknown dataset {name}')

    # Loaders are looked up by naming convention among the module globals.
    loader = globals()[f'load_{name}']
    return loader(cache_dir=cache_dir, **kwargs)