# -*- coding: utf-8 -*-
import collections
import logging
import multiprocessing
import time
from itertools import combinations

import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers.data.metrics.squad_metrics import _compute_softmax, get_final_text, _get_best_indexes, \
    squad_evaluate, compute_predictions_logits
from transformers.data.processors.squad import SquadFeatures, _new_check_is_max_context, \
    MULTI_SEP_TOKENS_TOKENIZERS_SET, SquadResult, SquadExample
from transformers.tokenization_utils_base import TruncationStrategy

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger = logging.getLogger(__name__)


# Convert a torch.Tensor to a plain Python list.
def to_list(tensor):
    return tensor.detach().cpu().tolist()


# Generate all possible r-length combinations (r >= 2) of list `li`.
# len(all_combinations) == 2 ** len(li) - len(li) - 1
def get_all_combinations(li):
    all_combinations = []
    for length in range(2, len(li) + 1):
        all_combinations += list(combinations(li, length))
    return all_combinations
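
# An illustrative check of the count formula above (not part of the original
# module): for 3 items there are 2 ** 3 - 3 - 1 = 4 combinations of length >= 2.
# >>> get_all_combinations(["a", "b", "c"])
# [('a', 'b'), ('a', 'c'), ('b', 'c'), ('a', 'b', 'c')]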


# Ensemble several logits files generated by the same model type but with different weights.
# Suitable model_type: albert (with 3 params in SquadResult)
def ensemble_logits(logits_files, from_file=True):
    if not logits_files:
        return []
    logits = [torch.load(logits_file) for logits_file in logits_files] if from_file else logits_files
    num_models, num_features = len(logits), len(logits[0])
    # Every model must have produced results for the same number of features.
    for model_id in range(num_models):
        assert len(logits[model_id]) == num_features
    # Average the start/end logits feature by feature.
    ensembled_results = []
    for feature_id in range(num_features):
        # All models must agree on the feature's unique_id.
        assert len(set(logits[model_id][feature_id].unique_id for model_id in range(num_models))) == 1
        unique_id = logits[0][feature_id].unique_id
        start_logits_list = torch.tensor([logits[model_id][feature_id].start_logits for model_id in range(num_models)])
        start_logits = torch.mean(start_logits_list, 0)
        end_logits_list = torch.tensor([logits[model_id][feature_id].end_logits for model_id in range(num_models)])
        end_logits = torch.mean(end_logits_list, 0)
        ensembled_results.append(SquadResult(unique_id, start_logits, end_logits))
    return ensembled_results
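
# A hedged usage sketch; the .pt paths are placeholders, not files shipped
# with this module:
#   results = ensemble_logits(["albert_seed1_logits.pt", "albert_seed2_logits.pt"])
# Or ensemble in-memory results produced by run_model, skipping the file round-trip:
#   results = ensemble_logits([results_a, results_b], from_file=False)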


# Compute the prediction for one example at a time (single model or ensemble).
def run_prediction(question_text, context_text, models, tokenizer):
    # Build an example
    start_t = time.time()
    example = SquadExample(
        qas_id=str(0),
        question_text=question_text,
        context_text=context_text,
        answer_text=None,
        start_position_character=None,
        title="Predict",
        is_impossible=False,
        answers=None,
    )
    # Convert the example into features that can be fed directly to a model
    # (long contexts are split into several overlapping spans).
    features = squad_convert_example_to_features_custom(
        example,
        tokenizer,
        max_seq_length=384,
        doc_stride=128,
        max_query_length=64,
        padding_strategy="max_length",
    )
    for i in range(len(features)):
        features[i].unique_id = i
    # Convert to tensors and build the dataset and dataloader.
    all_input_ids = torch.tensor([f.input_ids for f in features]).long()
    all_attention_masks = torch.tensor([f.attention_mask for f in features]).long()
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features]).long()
    all_feature_index = torch.arange(len(features)).long()
    dataset = TensorDataset(
        all_input_ids,
        all_attention_masks,
        all_token_type_ids,
        all_feature_index,
    )
    eval_dataloader = DataLoader(dataset, batch_size=10)
    print('D: {:.0f}ms'.format((time.time() - start_t) * 1e3))
    # Feed the dataloader into each model, and collect start & end logits.
    start_t = time.time()
    # Model inference in serial:
    results_list = [run_model(features, eval_dataloader, model) for model in models]
    # Model inference in parallel (alternative):
    # pool = multiprocessing.Pool(processes=len(models))
    # process_list = []
    # for model in models:
    #     process_list.append(pool.apply_async(func=run_model, args=(features, eval_dataloader, model)))
    # pool.close()
    # pool.join()
    # results_list = [p.get() for p in process_list]
    results = ensemble_logits(results_list, from_file=False)
    print('EM: {:.0f}ms'.format((time.time() - start_t) * 1e3))
    # Decode start & end logits into the final answer text.
    start_t = time.time()
    prediction = compute_predictions_logits_custom(
        example,
        features,
        results,
        tokenizer,
    )
    print('C: {:.0f}ms'.format((time.time() - start_t) * 1e3))
    return prediction
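
# A hedged end-to-end sketch; the checkpoint name is a placeholder for any
# SQuAD2-style question-answering checkpoint:
#   from transformers import AutoModelForQuestionAnswering, AutoTokenizer
#   tokenizer = AutoTokenizer.from_pretrained("some/albert-squad2-checkpoint")
#   model = AutoModelForQuestionAnswering.from_pretrained("some/albert-squad2-checkpoint").to(device)
#   answer, score = run_prediction(
#       "Who wrote Hamlet?",
#       "Hamlet is a tragedy written by William Shakespeare.",
#       models=[model],
#       tokenizer=tokenizer,
#   )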


# Feed the dataloader into a single model, and collect start & end logits as SquadResults.
def run_model(features, eval_dataloader, model):
    start_t = time.time()
    results = []
    model.eval()  # set eval mode once, rather than per batch
    for batch in eval_dataloader:
        batch = tuple(t.to(device) for t in batch)
        # print("Batch:\n", batch)
        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            example_indices = batch[3]
            outputs = model(**inputs)
            # print("Output:\n", outputs)
        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            start_logits = to_list(outputs['start_logits'][i])
            end_logits = to_list(outputs['end_logits'][i])
            result = SquadResult(unique_id, start_logits, end_logits)
            results.append(result)
    print('SM: {:.0f}ms'.format((time.time() - start_t) * 1e3))
    return results


# Compute predictions for a full dataset and run the official SQuAD evaluation.
def eval_benchmark(examples, features, results, n_best_size, null_score_diff_threshold, tokenizer):
    # print('features: ', len(features))
    start_time = time.time()
    all_predictions = compute_predictions_logits(
        examples,
        features,
        results,
        n_best_size,
        30,     # max_answer_length
        True,   # do_lower_case
        None,   # output_prediction_file
        None,   # output_nbest_file
        None,   # output_null_log_odds_file
        False,  # verbose_logging
        True,   # version_2_with_negative
        null_score_diff_threshold,
        tokenizer,
    )
    benchmark_results = squad_evaluate(examples, all_predictions)
    # print("Predictions:\n", all_predictions)
    return benchmark_results, "{:.2f}s".format(time.time() - start_time)
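
# A hedged benchmarking sketch, assuming a local SQuAD v2 dev file; the paths
# and threshold are placeholders, and the helpers are the standard ones from
# transformers.data.processors.squad:
#   from transformers.data.processors.squad import SquadV2Processor, squad_convert_examples_to_features
#   examples = SquadV2Processor().get_dev_examples("data", filename="dev-v2.0.json")
#   features, dataset = squad_convert_examples_to_features(
#       examples=examples, tokenizer=tokenizer, max_seq_length=384,
#       doc_stride=128, max_query_length=64, is_training=False, return_dataset="pt",
#   )
#   # ... run each batch of `dataset` through the model to build `results` (see run_model) ...
#   metrics, elapsed = eval_benchmark(examples, features, results, 20, 0.0, tokenizer)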


# Convert a single example into model-ready features; mirrors the official
# squad_convert_example_to_features, specialized for inference on one example.
def squad_convert_example_to_features_custom(
    example,
    tokenizer,
    max_seq_length=384,
    doc_stride=128,
    max_query_length=64,
    padding_strategy="max_length",
):
    features = []

    tok_to_orig_index = []
    orig_to_tok_index = []
    all_doc_tokens = []
    for (i, token) in enumerate(example.doc_tokens):
        orig_to_tok_index.append(len(all_doc_tokens))
        sub_tokens = tokenizer.tokenize(token)
        for sub_token in sub_tokens:
            tok_to_orig_index.append(i)
            all_doc_tokens.append(sub_token)

    spans = []
    truncated_query = tokenizer.encode(
        example.question_text, add_special_tokens=False, truncation=True, max_length=max_query_length
    )

    # Tokenizers that insert 2 SEP tokens in-between <context> & <question> need special handling
    # in the way they compute the mask of added tokens.
    tokenizer_type = type(tokenizer).__name__.replace("Tokenizer", "").lower()
    sequence_added_tokens = (
        tokenizer.model_max_length - tokenizer.max_len_single_sentence + 1
        if tokenizer_type in MULTI_SEP_TOKENS_TOKENIZERS_SET
        else tokenizer.model_max_length - tokenizer.max_len_single_sentence
    )
    sequence_pair_added_tokens = tokenizer.model_max_length - tokenizer.max_len_sentences_pair

    span_doc_tokens = all_doc_tokens
    while len(spans) * doc_stride < len(all_doc_tokens):
        # Define the side we want to truncate / pad and the text/pair sorting
        if tokenizer.padding_side == "right":
            texts = truncated_query
            pairs = span_doc_tokens
            truncation = TruncationStrategy.ONLY_SECOND.value
        else:
            texts = span_doc_tokens
            pairs = truncated_query
            truncation = TruncationStrategy.ONLY_FIRST.value

        encoded_dict = tokenizer.encode_plus(  # TODO(thom) update this logic
            texts,
            pairs,
            truncation=truncation,
            padding=padding_strategy,
            max_length=max_seq_length,
            return_overflowing_tokens=True,
            stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,
            return_token_type_ids=True,
        )

        paragraph_len = min(
            len(all_doc_tokens) - len(spans) * doc_stride,
            max_seq_length - len(truncated_query) - sequence_pair_added_tokens,
        )

        if tokenizer.pad_token_id in encoded_dict["input_ids"]:
            if tokenizer.padding_side == "right":
                non_padded_ids = encoded_dict["input_ids"][: encoded_dict["input_ids"].index(tokenizer.pad_token_id)]
            else:
                last_padding_id_position = (
                    len(encoded_dict["input_ids"]) - 1
                    - encoded_dict["input_ids"][::-1].index(tokenizer.pad_token_id)
                )
                non_padded_ids = encoded_dict["input_ids"][last_padding_id_position + 1:]
        else:
            non_padded_ids = encoded_dict["input_ids"]

        tokens = tokenizer.convert_ids_to_tokens(non_padded_ids)

        token_to_orig_map = {}
        for i in range(paragraph_len):
            index = len(truncated_query) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i
            token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i]

        encoded_dict["paragraph_len"] = paragraph_len
        encoded_dict["tokens"] = tokens
        encoded_dict["token_to_orig_map"] = token_to_orig_map
        encoded_dict["truncated_query_with_special_tokens_length"] = len(truncated_query) + sequence_added_tokens
        encoded_dict["token_is_max_context"] = {}
        encoded_dict["start"] = len(spans) * doc_stride
        encoded_dict["length"] = paragraph_len

        spans.append(encoded_dict)

        if "overflowing_tokens" not in encoded_dict or len(encoded_dict["overflowing_tokens"]) == 0:
            break
        span_doc_tokens = encoded_dict["overflowing_tokens"]

    for doc_span_index in range(len(spans)):
        for j in range(spans[doc_span_index]["paragraph_len"]):
            is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j)
            index = (
                j
                if tokenizer.padding_side == "left"
                else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j
            )
            spans[doc_span_index]["token_is_max_context"][index] = is_max_context

    for span in spans:
        # Identify the position of the CLS token
        cls_index = span["input_ids"].index(tokenizer.cls_token_id)

        # p_mask: mask with 1 for tokens that cannot be in the answer (0 for tokens which can be in an answer).
        # The original TF implementation also keeps the classification token (set to 0).
        p_mask = np.ones_like(span["token_type_ids"])
        if tokenizer.padding_side == "right":
            p_mask[len(truncated_query) + sequence_added_tokens:] = 0
        else:
            p_mask[-len(span["tokens"]): -(len(truncated_query) + sequence_added_tokens)] = 0

        # `input_ids` is a Python list, so compare via a numpy array; a bare
        # `span["input_ids"] == tokenizer.pad_token_id` would evaluate to False
        # and leave padding positions unmasked.
        pad_token_indices = np.where(np.asarray(span["input_ids"]) == tokenizer.pad_token_id)
        special_token_indices = np.asarray(
            tokenizer.get_special_tokens_mask(span["input_ids"], already_has_special_tokens=True)
        ).nonzero()

        p_mask[pad_token_indices] = 1
        p_mask[special_token_indices] = 1

        # Set the CLS index to 0: the CLS index can be used for impossible answers
        p_mask[cls_index] = 0

        span_is_impossible = example.is_impossible
        start_position = 0
        end_position = 0

        features.append(
            SquadFeatures(
                span["input_ids"],
                span["attention_mask"],
                span["token_type_ids"],
                cls_index,
                p_mask.tolist(),
                example_index=0,
                # unique_id and example_index cannot be set here; they are set after multiprocessing.
                unique_id=0,
                paragraph_len=span["paragraph_len"],
                token_is_max_context=span["token_is_max_context"],
                tokens=span["tokens"],
                token_to_orig_map=span["token_to_orig_map"],
                start_position=start_position,
                end_position=end_position,
                is_impossible=span_is_impossible,
                qas_id=example.qas_id,
            )
        )
    return features


# Get the prediction for a single example; differs from the official
# `compute_predictions_logits`, which operates on a list of examples.
def compute_predictions_logits_custom(
    example,
    features,
    results,  # [start_logits, end_logits] * num_chunks
    tokenizer,
    n_best_size=10,
    max_answer_length=30,
    do_lower_case=True,
    verbose_logging=False,
    null_score_diff_threshold=-3.1,  # -5 for albert, -4.2 for roberta, -3.1 for albert 2~4 ensemble
):
    """Compute the final prediction text and its probability from the logits."""
    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
        "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]
    )

    prelim_predictions = []
    # Keep track of the minimum score of null start+end of position 0.
    score_null = 1000000  # large and positive
    min_null_feature_index = 0  # the paragraph slice with min null score
    null_start_logit = 0  # the start logit at the slice with min null score
    null_end_logit = 0  # the end logit at the slice with min null score
    for (feature_index, feature) in enumerate(features):
        # s, e = SoftMax(Linear(H))
        result = results[feature_index]
        start_indexes = _get_best_indexes(result.start_logits, n_best_size)
        end_indexes = _get_best_indexes(result.end_logits, n_best_size)
        # If we could have irrelevant answers, get the min score of irrelevant:
        # score_null = min(cls.s + cls.e) over all document spans.
        feature_null_score = result.start_logits[0] + result.end_logits[0]
        if feature_null_score < score_null:
            score_null = feature_null_score
            min_null_feature_index = feature_index
            null_start_logit = result.start_logits[0]
            null_end_logit = result.end_logits[0]
        for start_index in start_indexes:
            for end_index in end_indexes:
                # We could hypothetically create invalid predictions, e.g., predict that the start of the span
                # is in the question. We throw out all invalid predictions.
                if min(start_index, end_index) >= len(feature.tokens) or \
                        end_index < start_index or \
                        start_index not in feature.token_to_orig_map or \
                        end_index not in feature.token_to_orig_map or \
                        not feature.token_is_max_context.get(start_index, False) or \
                        end_index - start_index + 1 > max_answer_length:
                    continue
                prelim_predictions.append(
                    _PrelimPrediction(
                        feature_index=feature_index,
                        start_index=start_index,
                        end_index=end_index,
                        start_logit=result.start_logits[start_index],
                        end_logit=result.end_logits[end_index],
                    )
                )
    prelim_predictions.append(
        _PrelimPrediction(
            feature_index=min_null_feature_index,
            start_index=0,
            end_index=0,
            start_logit=null_start_logit,
            end_logit=null_end_logit,
        )
    )
    prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True)

    _NbestPrediction = collections.namedtuple("NbestPrediction", ["text", "start_logit", "end_logit"])

    seen_predictions = {}
    nbest = []
    for pred in prelim_predictions[:n_best_size]:
        feature = features[pred.feature_index]
        if pred.start_index > 0:  # this is a non-null prediction
            tok_tokens = feature.tokens[pred.start_index: pred.end_index + 1]
            orig_doc_start = feature.token_to_orig_map[pred.start_index]
            orig_doc_end = feature.token_to_orig_map[pred.end_index]
            orig_tokens = example.doc_tokens[orig_doc_start: orig_doc_end + 1]

            tok_text = tokenizer.convert_tokens_to_string(tok_tokens)

            # Clean whitespace
            tok_text = tok_text.strip()
            tok_text = " ".join(tok_text.split())
            orig_text = " ".join(orig_tokens)

            final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging)
            if final_text in seen_predictions:
                continue
            seen_predictions[final_text] = True
        else:
            final_text = ""
            seen_predictions[final_text] = True
        nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit))

    # If we didn't include the empty option in the n-best, include it.
    if "" not in seen_predictions:
        nbest.append(_NbestPrediction(text="", start_logit=null_start_logit, end_logit=null_end_logit))

    # In very rare edge cases we could have only a single null prediction or no valid predictions.
    # Create a nonce prediction in this case to avoid failure.
    if len(nbest) in (1, 0):
        nbest.insert(0, _NbestPrediction(text="empty", start_logit=null_start_logit, end_logit=null_end_logit))

    total_scores = []
    best_non_null_entry, best_non_null_id = None, None
    for i, entry in enumerate(nbest):
        total_scores.append(entry.start_logit + entry.end_logit)
        if not best_non_null_entry and entry.text:
            best_non_null_entry = entry
            best_non_null_id = i

    probs = _compute_softmax(total_scores)  # nbest <-> probs is a one-to-one match

    # Predict "" if (null score) - (score of best non-null) > threshold.
    # Returns: pred_answer (str), score (float).
    if best_non_null_entry is None:
        # No non-null candidate survived; fall back to the null answer.
        return "", probs[0]
    score_diff = score_null - best_non_null_entry.start_logit - best_non_null_entry.end_logit
    if score_diff > null_score_diff_threshold:
        # probs[0] is the probability of the top-ranked n-best entry, used here
        # as a proxy for the null answer's confidence.
        return "", probs[0]
    else:
        return best_non_null_entry.text, probs[best_non_null_id]
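
# A small worked example of the threshold rule above (numbers illustrative):
# with score_null = -1.0 and a best non-null span scoring 2.5, score_diff =
# -1.0 - 2.5 = -3.5, which is not greater than the default threshold of -3.1,
# so the span text is returned. A weaker span scoring 1.5 would give
# score_diff = -2.5 > -3.1, and the empty (no-answer) string would be
# returned instead.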


# Return the (start, end) span with the highest start+end logit score, subject to start <= end.
def get_best_span(span_start_logits: torch.Tensor, span_end_logits: torch.Tensor) -> torch.Tensor:
    if span_start_logits.dim() != 2 or span_end_logits.dim() != 2:
        raise ValueError("Input shapes must be (batch_size, passage_length)")
    batch_size, passage_length = span_start_logits.size()
    max_span_log_prob = [-1e20] * batch_size
    span_start_argmax = [0] * batch_size
    best_word_span = span_start_logits.new_zeros((batch_size, 2), dtype=torch.long)

    span_start_logits = span_start_logits.detach().cpu().numpy()
    span_end_logits = span_end_logits.detach().cpu().numpy()

    for b in range(batch_size):  # pylint: disable=invalid-name
        for j in range(passage_length):
            # Track the best start logit seen at or before position j.
            val1 = span_start_logits[b, span_start_argmax[b]]
            if val1 < span_start_logits[b, j]:
                span_start_argmax[b] = j
                val1 = span_start_logits[b, j]
            val2 = span_end_logits[b, j]
            if val1 + val2 > max_span_log_prob[b]:
                best_word_span[b, 0] = span_start_argmax[b]
                best_word_span[b, 1] = j
                max_span_log_prob[b] = val1 + val2
    return best_word_span
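
# A minimal illustrative check (tensors invented for the example): the best
# span pairs the strongest start at or before the chosen end position.
#   starts = torch.tensor([[0.1, 2.0, 0.3, 0.0]])
#   ends = torch.tensor([[0.0, 0.1, 1.5, 0.2]])
#   print(get_best_span(starts, ends))  # tensor([[1, 2]])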