# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for official.nlp.data.create_pretraining_data."""
import random

import tensorflow as tf

from official.nlp.data import create_pretraining_data as cpd
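
# Stand-in vocabulary for the tests: per the checks in assertTokens below, a
# masked output token must be "[MASK]", the original input token, or a random
# replacement drawn from these vocab words.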
_VOCAB_WORDS = ["vocab_1", "vocab_2"]


class CreatePretrainingDataTest(tf.test.TestCase):

  def assertTokens(self, input_tokens, output_tokens, masked_positions,
                   masked_labels):
    # Ensure the masked positions are unique.
    self.assertCountEqual(masked_positions, set(masked_positions))

    # Ensure we can reconstruct the input from the output. Copy output_tokens
    # first so the validity check below still sees the masked values rather
    # than an aliased, already-restored list.
    reconstructed_tokens = list(output_tokens)
    for pos, label in zip(masked_positions, masked_labels):
      reconstructed_tokens[pos] = label
    self.assertEqual(input_tokens, reconstructed_tokens)

    # Ensure each label is valid.
    for pos, label in zip(masked_positions, masked_labels):
      output_token = output_tokens[pos]
      if (output_token == "[MASK]" or output_token in _VOCAB_WORDS or
          output_token == input_tokens[pos]):
        continue
      self.fail("invalid mask value: {}".format(output_token))
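
  # `_wordpieces_to_grams` is exercised below. From the expected outputs, it
  # maps a wordpiece sequence to half-open (start, end) index ranges, one per
  # whole word: "##"-prefixed pieces merge into the preceding word, and
  # special tokens like "[CLS]" and "[SEP]" fall outside every range.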
  def test_wordpieces_to_grams(self):
    tests = [
        (["That", "cone"], [(0, 1), (1, 2)]),
        (["That", "cone", "##s"], [(0, 1), (1, 3)]),
        (["Swit", "##zer", "##land"], [(0, 3)]),
        (["[CLS]", "Up", "##dog"], [(1, 3)]),
        (["[CLS]", "Up", "##dog", "[SEP]", "Down"], [(1, 3), (4, 5)]),
    ]
    for inp, expected in tests:
      output = cpd._wordpieces_to_grams(inp)
      self.assertEqual(expected, output)
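
  # `_window` is exercised below as a sliding window over a list. A minimal
  # sketch of the semantics these cases expect (hypothetical; the real
  # implementation in cpd may differ internally):
  #
  #   def _window(seq, n):
  #     for i in range(len(seq) - n + 1):
  #       yield seq[i:i + n]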
  def test_window(self):
    input_list = [1, 2, 3, 4]
    window_outputs = [
        (1, [[1], [2], [3], [4]]),
        (2, [[1, 2], [2, 3], [3, 4]]),
        (3, [[1, 2, 3], [2, 3, 4]]),
        (4, [[1, 2, 3, 4]]),
        (5, []),
    ]
    for window, expected in window_outputs:
      output = cpd._window(input_list, window)
      self.assertEqual(expected, list(output))
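
  # With masked_lm_prob=1.0 there are more maskable tokens than the cap, so
  # the number of predictions is presumably clamped to
  # max_predictions_per_seq=3; the assertions below check exactly that.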
  def test_create_masked_lm_predictions(self):
    tokens = ["[CLS]", "a", "##a", "b", "##b", "c", "##c", "[SEP]"]
    rng = random.Random(123)
    for _ in range(0, 5):
      output_tokens, masked_positions, masked_labels = (
          cpd.create_masked_lm_predictions(
              tokens=tokens,
              masked_lm_prob=1.0,
              max_predictions_per_seq=3,
              vocab_words=_VOCAB_WORDS,
              rng=rng,
              do_whole_word_mask=False,
              max_ngram_size=None))
      self.assertEqual(len(masked_positions), 3)
      self.assertEqual(len(masked_labels), 3)
      self.assertTokens(tokens, output_tokens, masked_positions, masked_labels)
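
  # With do_whole_word_mask=True, masking presumably covers whole words (a
  # leading piece plus its "##" continuations) rather than single wordpieces,
  # which is what the assertIn check at the end verifies.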
  def test_create_masked_lm_predictions_whole_word(self):
    tokens = ["[CLS]", "a", "##a", "b", "##b", "c", "##c", "[SEP]"]
    rng = random.Random(345)
    for _ in range(0, 5):
      output_tokens, masked_positions, masked_labels = (
          cpd.create_masked_lm_predictions(
              tokens=tokens,
              masked_lm_prob=1.0,
              max_predictions_per_seq=3,
              vocab_words=_VOCAB_WORDS,
              rng=rng,
              do_whole_word_mask=True,
              max_ngram_size=None))
      # Since we can't get exactly three tokens without breaking a word, we
      # only take two.
      self.assertEqual(len(masked_positions), 2)
      self.assertEqual(len(masked_labels), 2)
      self.assertTokens(tokens, output_tokens, masked_positions, masked_labels)
      # Ensure that we took an entire word.
      self.assertIn(masked_labels, [["a", "##a"], ["b", "##b"], ["c", "##c"]])
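
  # max_ngram_size=3 presumably allows masking contiguous spans of up to
  # three whole words at a time; even so, with masked_lm_prob=1.0 the
  # prediction count should still be clamped to max_predictions_per_seq=76.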
  def test_create_masked_lm_predictions_ngram(self):
    tokens = ["[CLS]"] + ["tok{}".format(i) for i in range(0, 512)] + ["[SEP]"]
    rng = random.Random(345)
    for _ in range(0, 5):
      output_tokens, masked_positions, masked_labels = (
          cpd.create_masked_lm_predictions(
              tokens=tokens,
              masked_lm_prob=1.0,
              max_predictions_per_seq=76,
              vocab_words=_VOCAB_WORDS,
              rng=rng,
              do_whole_word_mask=True,
              max_ngram_size=3))
      self.assertEqual(len(masked_positions), 76)
      self.assertEqual(len(masked_labels), 76)
      self.assertTokens(tokens, output_tokens, masked_positions, masked_labels)
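
# Running this file directly executes the tests above via tf.test.main().
# Each test seeds random.Random explicitly (123 / 345), so the sampled mask
# positions are deterministic across runs.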
if __name__ == "__main__":
  tf.test.main()