Skip to content

Commit

Permalink
pxsim release
Browse files Browse the repository at this point in the history
  • Loading branch information
Kevin Heffernan committed Nov 30, 2023
1 parent a7905b9 commit 995c2f7
Show file tree
Hide file tree
Showing 5 changed files with 380 additions and 3 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@
LASER is a library to calculate and use multilingual sentence embeddings.

**NEWS**
* 2023/11/30 Released [**P-xSIM**](tasks/pxsim), a dual approach extension to multilingual similarity search (xSIM)
* 2023/11/16 Released [**laser_encoders**](laser_encoders), a pip-installable package supporting LASER-2 and LASER-3 models
* 2023/06/26 [**xSIM++**](https://arxiv.org/abs/2306.12907) evaluation pipeline and data [**released**](tasks/xsimplusplus/README.md)
* 2022/07/06 Updated LASER models with support for over 200 languages are [**now available**](nllb/README.md)
* 2022/07/06 Multilingual similarity search (**xsim**) evaluation pipeline [**released**](tasks/xsim/README.md)
* 2022/07/06 Multilingual similarity search (**xSIM**) evaluation pipeline [**released**](tasks/xsim/README.md)
* 2022/05/03 [**Librivox S2S is available**](tasks/librivox-s2s): Speech-to-Speech translations automatically mined in Librivox [9]
* 2019/11/08 [**CCMatrix is available**](tasks/CCMatrix): Mining billions of high-quality parallel sentences on the WEB [8]
* 2019/07/31 Gilles Bodard and Jérémy Rapin provided a [**Docker environment**](docker) to use LASER
Expand Down
251 changes: 251 additions & 0 deletions source/pxsim.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,251 @@
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
#
# LASER Language-Agnostic SEntence Representations
# is a toolkit to calculate multilingual sentence embeddings
# and to use them for various tasks such as document classification,
# and bitext filtering
#
# --------------------------------------------------------
#
# Tool to calculate the dual approach multilingual similarity error rate (P-xSIM)

import typing as tp
from pathlib import Path

import faiss
import numpy as np
import torch
from scipy.special import softmax
from sklearn.metrics.pairwise import cosine_similarity
from stopes.eval.auto_pcp.audio_comparator import Comparator, get_model_pred
from xsim import Margin, score_margin


def get_neighbors(
    x: np.ndarray, y: np.ndarray, k: int, margin: str
) -> tp.Tuple[np.ndarray, np.ndarray, int]:
    """
    Retrieve, for every source embedding, its k nearest target embeddings.

    Both inputs are copied, cast to float32 and L2-normalized before being
    searched with an inner-product index, so the search is a cosine
    similarity search and the caller's arrays are never modified.

    Parameters
    ----------
    x : np.ndarray
        source-side embedding array (2-D: examples x dim)
    y : np.ndarray
        target-side embedding array
    k : int
        number of neighbors in k-nn search
    margin : str
        margin scoring function; compared against Margin.ABSOLUTE.value,
        any other value is forwarded to score_margin
    Returns
    -------
    scores : np.ndarray
        for the absolute margin, the cosine similarities returned by the
        search; otherwise the output of score_margin
    indices : np.ndarray
        indices into y of the k neighbors found for each row of x
    nbex : int
        number of source examples (x.shape[0])
    """
    # normalize_L2 works in place, so it must run on private copies.
    # np.array(..., copy=True) casts, copies and makes the buffer
    # C-contiguous in a single pass (the former astype().copy() chain
    # duplicated each embedding matrix twice).
    x_copy = np.array(x, dtype=np.float32, order="C", copy=True)
    y_copy = np.array(y, dtype=np.float32, order="C", copy=True)
    nbex, dim = x.shape
    # L2 normalization needed for cosine distance
    faiss.normalize_L2(x_copy)
    faiss.normalize_L2(y_copy)
    # the target-side index is needed by every margin type
    idx_y = faiss.IndexFlatIP(dim)
    idx_y.add(y_copy)
    if margin == Margin.ABSOLUTE.value:
        scores, indices = idx_y.search(x_copy, k)
    else:
        # the source-side index is only queried for margin-based scoring,
        # so it is only built on this branch
        idx_x = faiss.IndexFlatIP(dim)
        idx_x.add(x_copy)
        # return cosine similarity and indices of k closest neighbors
        Cos_xy, Idx_xy = idx_y.search(x_copy, k)
        Cos_yx, _ = idx_x.search(y_copy, k)

        # average cosines
        Avg_xy = Cos_xy.mean(axis=1)
        Avg_yx = Cos_yx.mean(axis=1)

        scores = score_margin(Cos_xy, Idx_xy, Avg_xy, Avg_yx, margin, k)
        indices = Idx_xy
    return scores, indices, nbex


def get_cosine_scores(src_emb: np.ndarray, neighbor_embs: np.ndarray) -> np.ndarray:
    """
    Cosine similarity between one source embedding and each candidate.

    Parameters
    ----------
    src_emb : np.ndarray
        a single source embedding; its first dimension must equal the
        embedding dimension of neighbor_embs
    neighbor_embs : np.ndarray
        candidate neighbor embeddings, one per row
    Returns
    -------
    cosine_scores : np.ndarray
        one cosine score per row of neighbor_embs
    """
    assert src_emb.shape[0] == neighbor_embs.shape[1]
    # Compare the single source row against all candidates in one
    # (1 x k) call. Tiling the source k times and taking the diagonal of
    # the resulting (k x k) matrix yields the same values at k times the
    # cost.
    pairwise = cosine_similarity(np.expand_dims(src_emb, axis=0), neighbor_embs)
    return pairwise[0]


def get_comparator_scores(
    src_emb: np.ndarray,
    neighbor_embs: np.ndarray,
    comparator_model: tp.Any,
    symmetrize_comparator: bool,
) -> np.ndarray:
    """
    Score each candidate neighbor of one source embedding with a comparator.

    Parameters
    ----------
    src_emb : np.ndarray
        a single source embedding
    neighbor_embs : np.ndarray
        candidate neighbor embeddings, one per row
    comparator_model : tp.Any
        model forwarded to get_comparator_preds
    symmetrize_comparator : bool
        forwarded to get_comparator_preds
    Returns
    -------
    scores : np.ndarray
        softmax of the comparator predictions (scipy's default axis)
    """
    num_candidates = neighbor_embs.shape[0]
    # pair every candidate with its own copy of the source embedding
    tiled_src = np.repeat(np.expand_dims(src_emb, axis=0), num_candidates, axis=0)
    # restore depth dim (get_comparator_preds indexes it away with [:, 0])
    src_batch = torch.from_numpy(tiled_src).unsqueeze(1)
    tgt_batch = torch.from_numpy(neighbor_embs).unsqueeze(1)
    raw_preds = get_comparator_preds(
        src_batch, tgt_batch, comparator_model, symmetrize_comparator
    )
    return np.array(softmax(raw_preds))


def get_comparator_preds(
    src_emb: np.ndarray, tgt_emb: np.ndarray, model: tp.Any, symmetrize: bool
):
    """
    Run the comparator model on source/target pairs.

    With symmetrize set, the model is also run with source and target
    swapped and the two predictions are averaged.

    NOTE(review): the parameters are annotated as np.ndarray, but the caller
    in this file (get_comparator_scores) passes torch tensors - confirm which
    type get_model_pred expects.
    """

    def _predict(first, second):
        # drop the depth dimension, run the model one pair at a time and
        # keep the first output column as a numpy array
        output = get_model_pred(
            model,
            src=first[:, 0],
            mt=second[:, 0],
            use_gpu=model.use_gpu,
            batch_size=1,
        )
        return output[:, 0].cpu().numpy()

    forward = _predict(src_emb, tgt_emb)
    if not symmetrize:
        return forward
    backward = _predict(tgt_emb, src_emb)
    return (backward + forward) / 2


def get_blended_predictions(
    alpha: float,
    nbex: int,
    margin_scores: np.ndarray,
    x_aux: np.ndarray,
    y_aux: np.ndarray,
    neighbor_indices: np.ndarray,
    comparator_model: tp.Optional[tp.Any] = None,
    symmetrize_comparator: bool = False,
) -> tp.List[int]:
    """
    Pick, for every source example, the candidate with the best blended score.

    The blended score of a candidate is
    alpha * margin score + (1 - alpha) * auxiliary score, where the
    auxiliary score comes from the comparator model when one is given and
    from the cosine similarity of the auxiliary embeddings otherwise.

    Parameters
    ----------
    alpha : float
        parameter to weight blended score
    nbex : int
        number of source examples to predict for
    margin_scores : np.ndarray
        per-source scores of the candidate neighbors
    x_aux : np.ndarray
        source-side embedding array using auxiliary model
    y_aux : np.ndarray
        target-side embedding array using auxiliary model
    neighbor_indices : np.ndarray
        per-source indices (into y_aux) of the candidate neighbors
    comparator_model : tp.Optional[tp.Any]
        comparator used for the auxiliary score; None selects cosine
    symmetrize_comparator : bool
        whether to symmetrize the comparator predictions
    Returns
    -------
    predictions : tp.List[int]
        for each source example, the index of the selected neighbor
    """
    # Test against None explicitly: the argument is Optional, and a model
    # object's truthiness is not a reliable "was it provided" signal.
    use_comparator = comparator_model is not None
    predictions = []
    for src_index in range(nbex):
        neighbors = neighbor_indices[src_index]
        neighbor_embs = y_aux[neighbors].astype(np.float32)
        src_emb = x_aux[src_index].astype(np.float32)
        if use_comparator:
            aux_scores = get_comparator_scores(
                src_emb, neighbor_embs, comparator_model, symmetrize_comparator
            )
        else:
            aux_scores = get_cosine_scores(src_emb, neighbor_embs)
        # both score vectors must cover exactly the same candidates
        assert margin_scores[src_index].shape == aux_scores.shape
        blended_scores = alpha * margin_scores[src_index] + (1 - alpha) * aux_scores
        blended_neighbor_idx = blended_scores.argmax()
        predictions.append(neighbors[blended_neighbor_idx])
    return predictions


def PxSIM(
    x: np.ndarray,
    y: np.ndarray,
    x_aux: np.ndarray,
    y_aux: np.ndarray,
    alpha: float,
    margin: str = Margin.RATIO.value,
    k: int = 16,
    comparator_path: tp.Optional[Path] = None,
    symmetrize_comparator: bool = False,
) -> tp.Tuple[int, int, tp.List[int]]:
    """
    Compute the dual approach multilingual similarity error rate (P-xSIM).

    Candidate neighbors are retrieved with the primary embeddings, rescored
    with the auxiliary embeddings (or a comparator model), and a prediction
    counts as an error when it differs from the example's own index.
    The error rate is also printed as a percentage.

    Parameters
    ----------
    x : np.ndarray
        source-side embedding array
    y : np.ndarray
        target-side embedding array
    x_aux : np.ndarray
        source-side embedding array using auxiliary model
    y_aux : np.ndarray
        target-side embedding array using auxiliary model
    alpha : float
        parameter to weight blended score
    margin : str
        margin scoring function (e.g. ratio, absolute, distance)
    k : int
        number of neighbors in k-nn search
    comparator_path : Path
        path to AutoPCP model config
    symmetrize_comparator : bool
        whether to symmetrize the comparator predictions
    Returns
    -------
    err : int
        Number of errors
    nbex : int
        Number of examples
    preds : list[int]
        List of (index-based) predictions
    """
    assert Margin.has_value(margin), f"Margin type: {margin}, is not supported."
    comparator_model = Comparator.load(comparator_path) if comparator_path else None
    # get margin-based nearest neighbors
    margin_scores, neighbor_indices, nbex = get_neighbors(x, y, k=k, margin=margin)
    preds = get_blended_predictions(
        alpha,
        nbex,
        margin_scores,
        x_aux,
        y_aux,
        neighbor_indices,
        comparator_model,
        symmetrize_comparator,
    )
    # gold alignment is the identity: example i should be matched to index i.
    # int() keeps the declared return type even when the comparisons yield
    # numpy booleans.
    err = int(sum(idx != pred for idx, pred in enumerate(preds)))
    print(f"P-xSIM error: {100 * (err / nbex):.2f}")
    return err, nbex, preds


def load_embeddings(
    infile: Path, dim: int, fp16: bool = False, numpy_header: bool = False
) -> np.ndarray:
    """
    Load an embedding matrix from disk.

    Parameters
    ----------
    infile : Path
        file holding the embeddings
    dim : int
        embedding dimension used to shape raw binary input
        (ignored when numpy_header is set)
    fp16 : bool
        whether the raw binary file stores float16 values
    numpy_header : bool
        load the file with np.load instead of reading raw binary values
    Returns
    -------
    emb : np.ndarray
        for raw binary input, a float32 array of shape (num_examples, dim);
        with numpy_header, whatever np.load returns
    Raises
    ------
    ValueError
        if dim is not positive, or the number of stored values is not a
        multiple of dim
    """
    assert infile.exists(), f"file: {infile} does not exist."
    if numpy_header:
        return np.load(infile)
    if dim <= 0:
        raise ValueError(f"dim must be a positive integer, got {dim}.")
    emb = np.fromfile(infile, dtype=np.float16 if fp16 else np.float32)
    num_examples, leftover = divmod(emb.shape[0], dim)
    if leftover:
        # A partial trailing row means dim or fp16 does not match the file;
        # truncating silently would hand garbage embeddings to the search.
        raise ValueError(
            f"file: {infile} holds {emb.shape[0]} values, "
            f"which is not a multiple of dim={dim}."
        )
    emb = emb.reshape(num_examples, dim)
    if fp16:
        emb = emb.astype(np.float32)  # faiss currently only supports fp32
    return emb


def run(
    src_emb: Path,
    tgt_emb: Path,
    src_aux_emb: Path,
    tgt_aux_emb: Path,
    alpha: float,
    margin: str = Margin.RATIO.value,
    k: int = 16,
    emb_fp16: bool = False,
    aux_emb_fp16: bool = False,
    emb_dim: int = 1024,
    aux_emb_dim: int = 1024,
    numpy_header: bool = False,
    comparator_path: tp.Optional[Path] = None,
    symmetrize_comparator: bool = False,
    prediction_savepath: tp.Optional[Path] = None,
) -> None:
    """
    Load the four embedding files, compute P-xSIM and optionally save predictions.

    Parameters
    ----------
    src_emb : Path
        source-side embedding file
    tgt_emb : Path
        target-side embedding file
    src_aux_emb : Path
        source-side embedding file from the auxiliary model
    tgt_aux_emb : Path
        target-side embedding file from the auxiliary model
    alpha : float
        parameter to weight blended score
    margin : str
        margin scoring function, forwarded to PxSIM
    k : int
        number of neighbors in k-nn search
    emb_fp16 : bool
        whether src_emb / tgt_emb store float16 values
    aux_emb_fp16 : bool
        whether src_aux_emb / tgt_aux_emb store float16 values
    emb_dim : int
        embedding dimension of src_emb / tgt_emb
    aux_emb_dim : int
        embedding dimension of src_aux_emb / tgt_aux_emb
    numpy_header : bool
        whether all four files are loaded with np.load
    comparator_path : Path
        path to the comparator model, forwarded to PxSIM
    symmetrize_comparator : bool
        whether to symmetrize the comparator predictions
    prediction_savepath : Path
        if given, predictions are written to this file, one per line
    """
    x = load_embeddings(src_emb, emb_dim, emb_fp16, numpy_header)
    y = load_embeddings(tgt_emb, emb_dim, emb_fp16, numpy_header)
    x_aux = load_embeddings(src_aux_emb, aux_emb_dim, aux_emb_fp16, numpy_header)
    y_aux = load_embeddings(tgt_aux_emb, aux_emb_dim, aux_emb_fp16, numpy_header)
    assert (x.shape == y.shape) and (x_aux.shape == y_aux.shape)
    # Neighbors found with the primary embeddings are looked up by row index
    # in the auxiliary embeddings, so both must describe the same examples.
    assert x.shape[0] == x_aux.shape[0], (
        f"primary embeddings hold {x.shape[0]} examples but auxiliary "
        f"embeddings hold {x_aux.shape[0]}."
    )
    _, _, preds = PxSIM(
        x, y, x_aux, y_aux, alpha, margin, k, comparator_path, symmetrize_comparator
    )
    if prediction_savepath:
        with open(prediction_savepath, "w") as outf:
            for pred in preds:
                print(pred, file=outf)


if __name__ == "__main__":
    # func_argparse is only needed for command-line use, so it is imported
    # here rather than at module level.
    import func_argparse

    # NOTE(review): presumably exposes this module's functions (e.g. `run`)
    # as command-line entry points - confirm against the func_argparse docs.
    func_argparse.main()
4 changes: 2 additions & 2 deletions source/xsim.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def _load_embeddings(infile: str, dim: int, fp16: bool = False) -> np.ndarray:
return emb


def _score_margin(
def score_margin(
Dxy: np.ndarray,
Ixy: np.ndarray,
Ax: np.ndarray,
Expand Down Expand Up @@ -103,7 +103,7 @@ def _score_knn(x: np.ndarray, y: np.ndarray, k: int, margin: str) -> np.ndarray:
Avg_xy = Cos_xy.mean(axis=1)
Avg_yx = Cos_yx.mean(axis=1)

scores = _score_margin(Cos_xy, Idx_xy, Avg_xy, Avg_yx, margin, k)
scores = score_margin(Cos_xy, Idx_xy, Avg_xy, Avg_yx, margin, k)

# find best
best = scores.argmax(axis=1)
Expand Down
27 changes: 27 additions & 0 deletions tasks/pxsim/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# LASER: P-xSIM (dual approach multilingual similarity error rate)

This README shows how to calculate the P-xSIM error rate (Seamless Communication et al., 2023) for a given language pair.

P-xSIM returns the error rate for recreating gold alignments using a blended combination of two different approaches.
It works by performing a k-nearest-neighbor search and margin calculation (i.e. margin-based parallel alignment) using the
first approach, followed by the scoring of each candidate neighbor using an auxiliary model (the second approach). Finally,
the scores of both the margin-based alignment and the auxiliary model are combined together using a blended score defined as:

$$ \text{blended-score}(x, y) = \alpha \cdot \text{margin} + (1 - \alpha) \cdot \text{auxiliary-score} $$

where the parameter $\alpha$ controls the combination of both the margin-based and auxiliary scores. By default, the auxiliary-score will be calculated as the cosine between the source and candidate neighbors using the auxiliary embeddings. However, there is also an option to perform inference using a comparator model (Seamless Communication et al., 2023). In this instance, the auxiliary-score will be the AutoPCP outputs.

P-xSIM offers three margin-based scoring options (discussed in detail [here](https://arxiv.org/pdf/1811.01136.pdf)):
- distance
- ratio
- absolute

## Example usage

Simply run the example script `bash ./eval.sh` to download a sample dataset (flores200), sample encoders (laser2 and LaBSE),
and then perform P-xSIM. In this toy example, we use laser2 to provide the k-nearest-neighbors, followed by applying LaBSE as an
auxiliary model on each candidate neighbor, before then applying the blended scoring function defined above. Depending on
your data sources, you may want to alter the approach used for either margin-based parallel alignment, or the scoring of each candidate neighbor
(i.e. the auxiliary model).

In addition to LaBSE in the example above, you can also calculate P-xSIM using any model hosted on [HuggingFace sentence-transformers](https://huggingface.co/sentence-transformers).
Loading

0 comments on commit 995c2f7

Please sign in to comment.