-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathopenwebtext_pipeline_modern.py
95 lines (80 loc) · 3.59 KB
/
openwebtext_pipeline_modern.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import newspaper
import numpy as np
import tensorflow as tf
import tensorflow_text as text
def pipeline_definition(src_path):
    """Build the ordered step list of the OpenWebText preprocessing pipeline.

    The steps are: read the raw ``*.txt`` files, extract the main text from
    the scraped HTML, byte-pair encode it, look up an embedding for every
    token id, and finally pass the result through an identity op.

    Args:
        src_path: Directory containing the scraped ``*.txt`` files. It is
            concatenated with ``"/*.txt"`` to form the file glob.

    Returns:
        A list of dicts, one per step. Every dict has the keys ``"name"``,
        ``"type"`` (``"source"`` or ``"op"``), ``"op"`` and
        ``"output_schema"``; steps of type ``"op"`` also carry an
        ``"input_schema"``. For the source step ``"op"`` is a
        ``tf.data.Dataset``; for the other steps it is a callable.
    """
    # vocabulary size 50001, GPT2 originally used 50257
    vocabulary_size = 50001
    embedding_dimension = 768

    bpe_model_path = tf.keras.utils.get_file(
        "bpe_en_50k.model",
        "https://nlp.h-its.org/bpemb/en/en.wiki.bpe.vs50000.model",
    )
    # Read the serialized SentencePiece model inside a context manager so the
    # file handle is closed deterministically instead of being leaked.
    with open(bpe_model_path, "rb") as model_file:
        bpe_model = model_file.read()
    bpe_tokenizer = text.SentencepieceTokenizer(
        model=bpe_model, out_type=tf.dtypes.int32
    )

    return [
        {
            "name": "read files",
            "type": "source",
            "op": tf.data.Dataset.list_files(src_path + "/*.txt").map(tf.io.read_file),
            "output_schema": tf.TensorSpec([], tf.string),
        },
        {
            "name": "extract content",
            "type": "op",
            "op": _extract_html_tf,
            "input_schema": tf.TensorSpec([], tf.string),
            "output_schema": tf.TensorSpec([], tf.string),
        },
        {
            "name": "byte pair encoding",
            "type": "op",
            "op": _apply_bpe_tokenizer(bpe_tokenizer),
            "input_schema": tf.TensorSpec([], tf.string),
            "output_schema": tf.TensorSpec([None], tf.int32),
        },
        {
            "name": "lookup embedding",
            "type": "op",
            "op": _get_embedding_lookup(vocabulary_size, embedding_dimension),
            "input_schema": tf.TensorSpec([None], tf.int32),
            "output_schema": tf.TensorSpec([None, embedding_dimension], tf.float32),
        },
        {
            "name": "identity",
            "type": "op",
            "op": tf.identity,
            "input_schema": tf.TensorSpec([None, embedding_dimension], tf.float32),
            "output_schema": tf.TensorSpec([None, embedding_dimension], tf.float32),
        },
    ]
def _extract_html_tf(html_tensor):
    """Graph-compatible wrapper that extracts the main text from scraped HTML.

    Args:
        html_tensor: String tensor holding one raw HTML document.

    Returns:
        A scalar ``tf.string`` tensor with the extracted text.
    """
    # Leave the graph and run the plain-Python extraction routine.
    extracted = tf.py_function(_extract_html, [html_tensor], tf.string)
    # Pin the result to a scalar shape; presumably needed because the output
    # of py_function carries no static shape information -- TODO confirm.
    return tf.ensure_shape(extracted, ())
def _extract_html(eager_html_tensor):
    """Extract the main article text from one scraped HTML document.

    Intended to run eagerly (it is invoked through ``tf.py_function``), so
    the raw HTML is obtained with ``eager_html_tensor.numpy()``.

    Args:
        eager_html_tensor: Eager tensor whose ``.numpy()`` value is the HTML.

    Returns:
        The text extracted by ``newspaper``. If extraction fails, a single
        filler string of 44,000 characters is returned instead.
    """
    # newspaper raises some strange errors on this dataset, probably due to
    # malformed HTML. This happened 2-3 times in the whole dataset, so it
    # hopefully does not change the final throughput and processing time.
    # On average a text file has 42.77 KB, UTF-8 encoded; assuming 1 byte per
    # char, the filler is sized to roughly match:
    #   "Hello World" = 11 chars => 11 bytes * 4000 = 44000 bytes = 44 KB
    # The filler must be ONE string (not a list of 4000 strings) so that the
    # result stays a scalar, matching the scalar shape the caller enforces.
    filler_text = "Hello World" * 4000
    try:
        article = newspaper.Article(url="", fetch_images=False)
        article.set_html(eager_html_tensor.numpy())
        article.parse()
    except Exception:
        # Deliberate best-effort: any parsing failure yields the filler text.
        # ``Exception`` (not a bare ``except``) keeps KeyboardInterrupt and
        # SystemExit propagating.
        return filler_text
    return article.text
def _apply_bpe_tokenizer(bpe_tokernizer):
    """Wrap a tokenizer object in a plain single-argument callable.

    Args:
        bpe_tokernizer: Object exposing a ``tokenize(text)`` method.

    Returns:
        A function that forwards its argument to ``bpe_tokernizer.tokenize``
        and returns whatever that call returns.
    """

    def apply(text_tensor):
        # The method is looked up at call time, exactly once per invocation.
        token_ids = bpe_tokernizer.tokenize(text_tensor)
        return token_ids

    return apply
def _get_embedding_lookup(vocabulary_size, embedding_dimension):
    """Create an embedding lookup backed by a randomly filled table.

    There is no fitting real embedding on https://nlp.h-its.org/bpemb/en/,
    but the actual numbers are not important here. Positional encoding is
    intentionally left out as well, since
    https://huggingface.co/transformers/model_doc/gpt2.html#gpt2tokenizer
    does not do it either.

    Args:
        vocabulary_size: Number of rows of the embedding table.
        embedding_dimension: Number of columns of the embedding table.

    Returns:
        A callable mapping token ids to their rows of the table via
        ``tf.nn.embedding_lookup``.
    """
    table_shape = (vocabulary_size, embedding_dimension)
    embedding_table = np.random.random(table_shape).astype(np.float32)

    def lookup(token_ids):
        return tf.nn.embedding_lookup(embedding_table, token_ids)

    return lookup