Bugfixes
Riccorl committed Mar 23, 2021
1 parent 608aaaa commit c476640
Showing 2 changed files with 11 additions and 5 deletions.
2 changes: 1 addition & 1 deletion setup.py
@@ -5,7 +5,7 @@
 
 setuptools.setup(
     name="transformer_embedder", # Replace with your own username
-    version="1.6.6",
+    version="1.6.7",
     author="Riccardo Orlando",
     author_email="[email protected]",
     description="Word level transformer based embeddings",
14 changes: 10 additions & 4 deletions transformer_embedder/tokenizer.py
@@ -58,6 +58,12 @@ def __init__(self, model_name: str, language: str = "xx_sent_ud_sm"):
"token_type_ids",
}

def __len__(self):
"""
Size of the full vocabulary with the added tokens.
"""
return len(self.huggingface_tokenizer)

def __call__(
self,
text: Union[List[List[str]], List[str], str],
@@ -263,26 +269,26 @@ def _build_tokens(
             input_ids += [self.huggingface_tokenizer.cls_token_id]
             token_type_ids += [token_type_id]
             # first offset
-            offsets.append((1, 1))
+            offsets.append((0, 0))
         else:
             token_type_id = self.token_type_id
             # check if the input needs an additional sep token
             # XLM-R for example wants an additional `</s>` between text pairs
             if isinstance(self.huggingface_tokenizer, MODELS_WITH_DOUBLE_SEP):
                 input_ids += [self.huggingface_tokenizer.sep_token_id]
                 token_type_ids += [token_type_id]
-                offsets.append((len(input_ids), len(input_ids)))
+                offsets.append((len(input_ids) - 1, len(input_ids) - 1))
         for w in sentence:
             ids = self.huggingface_tokenizer(w, add_special_tokens=False)["input_ids"]
             # if max_len exceeded, stop (leave space for closing token)
             if len(input_ids) + len(ids) >= max_len - 1:
                 break
             # token offset before wordpiece, (start, end + 1)
-            offsets.append((len(input_ids) + 1, len(input_ids) + len(ids)))
+            offsets.append((len(input_ids), len(input_ids) + len(ids) - 1))
             input_ids += ids
             token_type_ids += [token_type_id] * len(ids)
         # last offset
-        offsets.append((len(input_ids) + 1, len(input_ids) + 1))
+        offsets.append((len(input_ids), len(input_ids)))
         input_ids += [self.huggingface_tokenizer.sep_token_id]
         token_type_ids += [token_type_id]
         return input_ids, token_type_ids, offsets
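For context, the change in `_build_tokens` switches the word-to-wordpiece offsets from a one-shifted convention to plain 0-based, inclusive `(start, end)` indices into `input_ids`, with each special token mapped to its own position (the leading CLS now gets `(0, 0)`). The sketch below, which is not part of the commit, replays just that bookkeeping with a bare Hugging Face tokenizer to show what the fixed offsets look like; the checkpoint name and the stand-alone loop are illustrative assumptions, not `transformer_embedder`'s public API.

# Minimal sketch of the fixed offset bookkeeping, assuming an arbitrary
# Hugging Face checkpoint; not part of this commit.
from transformers import AutoTokenizer

hf_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
sentence = ["Transformer", "embeddings", "work"]

input_ids = [hf_tokenizer.cls_token_id]
offsets = [(0, 0)]  # special tokens now map to their own 0-based position

for w in sentence:
    ids = hf_tokenizer(w, add_special_tokens=False)["input_ids"]
    # each word spans wordpieces [start, end], inclusive and 0-based
    offsets.append((len(input_ids), len(input_ids) + len(ids) - 1))
    input_ids += ids

offsets.append((len(input_ids), len(input_ids)))  # trailing SEP
input_ids += [hf_tokenizer.sep_token_id]

# every (start, end) pair now indexes directly into input_ids,
# whereas the pre-fix code was shifted by one position
for word, (start, end) in zip(["[CLS]"] + sentence + ["[SEP]"], offsets):
    print(word, hf_tokenizer.convert_ids_to_tokens(input_ids[start : end + 1]))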
