From c476640d4dba21cfd22098d3007faa22276c70f2 Mon Sep 17 00:00:00 2001
From: Riccardo Orlando
Date: Tue, 23 Mar 2021 13:07:26 +0100
Subject: [PATCH] Bugfixes

---
 setup.py                          |  2 +-
 transformer_embedder/tokenizer.py | 14 ++++++++++----
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/setup.py b/setup.py
index f1d6523..759d8ef 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@
 
 setuptools.setup(
     name="transformer_embedder",  # Replace with your own username
-    version="1.6.6",
+    version="1.6.7",
     author="Riccardo Orlando",
     author_email="orlandoricc@gmail.com",
     description="Word level transformer based embeddings",
diff --git a/transformer_embedder/tokenizer.py b/transformer_embedder/tokenizer.py
index 7b3df85..aa10b7d 100644
--- a/transformer_embedder/tokenizer.py
+++ b/transformer_embedder/tokenizer.py
@@ -58,6 +58,12 @@ def __init__(self, model_name: str, language: str = "xx_sent_ud_sm"):
             "token_type_ids",
         }
 
+    def __len__(self):
+        """
+        Size of the full vocabulary with the added tokens.
+        """
+        return len(self.huggingface_tokenizer)
+
     def __call__(
         self,
         text: Union[List[List[str]], List[str], str],
@@ -263,7 +269,7 @@ def _build_tokens(
             input_ids += [self.huggingface_tokenizer.cls_token_id]
             token_type_ids += [token_type_id]
             # first offset
-            offsets.append((1, 1))
+            offsets.append((0, 0))
         else:
             token_type_id = self.token_type_id
             # check if the input needs an additional sep token
@@ -271,18 +277,18 @@
             if isinstance(self.huggingface_tokenizer, MODELS_WITH_DOUBLE_SEP):
                 input_ids += [self.huggingface_tokenizer.sep_token_id]
                 token_type_ids += [token_type_id]
-                offsets.append((len(input_ids), len(input_ids)))
+                offsets.append((len(input_ids) - 1, len(input_ids) - 1))
         for w in sentence:
             ids = self.huggingface_tokenizer(w, add_special_tokens=False)["input_ids"]
             # if max_len exceeded, stop (leave space for closing token)
             if len(input_ids) + len(ids) >= max_len - 1:
                 break
             # token offset before wordpiece, (start, end + 1)
-            offsets.append((len(input_ids) + 1, len(input_ids) + len(ids)))
+            offsets.append((len(input_ids), len(input_ids) + len(ids) - 1))
             input_ids += ids
             token_type_ids += [token_type_id] * len(ids)
         # last offset
-        offsets.append((len(input_ids) + 1, len(input_ids) + 1))
+        offsets.append((len(input_ids), len(input_ids)))
         input_ids += [self.huggingface_tokenizer.sep_token_id]
         token_type_ids += [token_type_id]
         return input_ids, token_type_ids, offsets
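
For reference, a minimal sketch (not part of the patch) of what the corrected offsets mean: they are now 0-based and end-inclusive, so a word's (start, end) pair indexes directly into the wordpiece sequence that begins with the [CLS] token. The snippet below mimics the patched loop with a plain HuggingFace tokenizer; the model name, example sentence, and variable names are illustrative only.

    # Sketch of the corrected offset convention; illustrative, not library code.
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    sentence = ["The", "quick", "fox"]

    input_ids = [tokenizer.cls_token_id]
    offsets = [(0, 0)]  # first offset now points at [CLS] itself (was (1, 1))
    for w in sentence:
        ids = tokenizer(w, add_special_tokens=False)["input_ids"]
        # word span over its wordpieces, start and end both inclusive
        offsets.append((len(input_ids), len(input_ids) + len(ids) - 1))
        input_ids += ids
    offsets.append((len(input_ids), len(input_ids)))  # last offset -> [SEP]
    input_ids += [tokenizer.sep_token_id]

    # Each word's span now slices its own wordpieces out of input_ids directly.
    for (start, end), word in zip(offsets[1:-1], sentence):
        pieces = tokenizer.convert_ids_to_tokens(input_ids[start : end + 1])
        print(word, "->", pieces)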