diff --git a/.buildinfo b/.buildinfo new file mode 100644 index 00000000..8c8ae031 --- /dev/null +++ b/.buildinfo @@ -0,0 +1,4 @@ +# Sphinx build info version 1 +# This file records the configuration used when building these files. When it is not found, a full rebuild will be done. +config: 31899dca82da6cb45546146b22ddc4c6 +tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/.doctrees/API/abc.doctree b/.doctrees/API/abc.doctree new file mode 100644 index 00000000..8989dcf3 Binary files /dev/null and b/.doctrees/API/abc.doctree differ diff --git a/.doctrees/API/abc/evaluation.doctree b/.doctrees/API/abc/evaluation.doctree new file mode 100644 index 00000000..e53b1d75 Binary files /dev/null and b/.doctrees/API/abc/evaluation.doctree differ diff --git a/.doctrees/API/abc/evaluation/arguments.doctree b/.doctrees/API/abc/evaluation/arguments.doctree new file mode 100644 index 00000000..1d9bd5f3 Binary files /dev/null and b/.doctrees/API/abc/evaluation/arguments.doctree differ diff --git a/.doctrees/API/abc/evaluation/data_loader.doctree b/.doctrees/API/abc/evaluation/data_loader.doctree new file mode 100644 index 00000000..cda33e19 Binary files /dev/null and b/.doctrees/API/abc/evaluation/data_loader.doctree differ diff --git a/.doctrees/API/abc/evaluation/evaluator.doctree b/.doctrees/API/abc/evaluation/evaluator.doctree new file mode 100644 index 00000000..e8c34e83 Binary files /dev/null and b/.doctrees/API/abc/evaluation/evaluator.doctree differ diff --git a/.doctrees/API/abc/evaluation/runner.doctree b/.doctrees/API/abc/evaluation/runner.doctree new file mode 100644 index 00000000..eb190639 Binary files /dev/null and b/.doctrees/API/abc/evaluation/runner.doctree differ diff --git a/.doctrees/API/abc/evaluation/searcher.doctree b/.doctrees/API/abc/evaluation/searcher.doctree new file mode 100644 index 00000000..64a7fe8f Binary files /dev/null and b/.doctrees/API/abc/evaluation/searcher.doctree differ diff --git a/.doctrees/API/abc/finetune.doctree b/.doctrees/API/abc/finetune.doctree new file mode 100644 index 00000000..65455db1 Binary files /dev/null and b/.doctrees/API/abc/finetune.doctree differ diff --git a/.doctrees/API/abc/finetune/embedder.doctree b/.doctrees/API/abc/finetune/embedder.doctree new file mode 100644 index 00000000..51e35d55 Binary files /dev/null and b/.doctrees/API/abc/finetune/embedder.doctree differ diff --git a/.doctrees/API/abc/finetune/embedder/AbsArguments.doctree b/.doctrees/API/abc/finetune/embedder/AbsArguments.doctree new file mode 100644 index 00000000..dd735738 Binary files /dev/null and b/.doctrees/API/abc/finetune/embedder/AbsArguments.doctree differ diff --git a/.doctrees/API/abc/finetune/embedder/AbsDataset.doctree b/.doctrees/API/abc/finetune/embedder/AbsDataset.doctree new file mode 100644 index 00000000..d55bdd31 Binary files /dev/null and b/.doctrees/API/abc/finetune/embedder/AbsDataset.doctree differ diff --git a/.doctrees/API/abc/finetune/embedder/AbsModeling.doctree b/.doctrees/API/abc/finetune/embedder/AbsModeling.doctree new file mode 100644 index 00000000..74454f36 Binary files /dev/null and b/.doctrees/API/abc/finetune/embedder/AbsModeling.doctree differ diff --git a/.doctrees/API/abc/finetune/embedder/AbsRunner.doctree b/.doctrees/API/abc/finetune/embedder/AbsRunner.doctree new file mode 100644 index 00000000..40c72bd8 Binary files /dev/null and b/.doctrees/API/abc/finetune/embedder/AbsRunner.doctree differ diff --git a/.doctrees/API/abc/finetune/embedder/AbsTrainer.doctree b/.doctrees/API/abc/finetune/embedder/AbsTrainer.doctree new file 
mode 100644 index 00000000..82685214 Binary files /dev/null and b/.doctrees/API/abc/finetune/embedder/AbsTrainer.doctree differ diff --git a/.doctrees/API/abc/finetune/reranker.doctree b/.doctrees/API/abc/finetune/reranker.doctree new file mode 100644 index 00000000..fe161a86 Binary files /dev/null and b/.doctrees/API/abc/finetune/reranker.doctree differ diff --git a/.doctrees/API/abc/finetune/reranker/AbsArguments.doctree b/.doctrees/API/abc/finetune/reranker/AbsArguments.doctree new file mode 100644 index 00000000..b79c0934 Binary files /dev/null and b/.doctrees/API/abc/finetune/reranker/AbsArguments.doctree differ diff --git a/.doctrees/API/abc/finetune/reranker/AbsDataset.doctree b/.doctrees/API/abc/finetune/reranker/AbsDataset.doctree new file mode 100644 index 00000000..fb8bacb4 Binary files /dev/null and b/.doctrees/API/abc/finetune/reranker/AbsDataset.doctree differ diff --git a/.doctrees/API/abc/finetune/reranker/AbsModeling.doctree b/.doctrees/API/abc/finetune/reranker/AbsModeling.doctree new file mode 100644 index 00000000..a672fd07 Binary files /dev/null and b/.doctrees/API/abc/finetune/reranker/AbsModeling.doctree differ diff --git a/.doctrees/API/abc/finetune/reranker/AbsRunner.doctree b/.doctrees/API/abc/finetune/reranker/AbsRunner.doctree new file mode 100644 index 00000000..13cfa50b Binary files /dev/null and b/.doctrees/API/abc/finetune/reranker/AbsRunner.doctree differ diff --git a/.doctrees/API/abc/finetune/reranker/AbsTrainer.doctree b/.doctrees/API/abc/finetune/reranker/AbsTrainer.doctree new file mode 100644 index 00000000..a59527f4 Binary files /dev/null and b/.doctrees/API/abc/finetune/reranker/AbsTrainer.doctree differ diff --git a/.doctrees/API/abc/inference.doctree b/.doctrees/API/abc/inference.doctree new file mode 100644 index 00000000..05e2076e Binary files /dev/null and b/.doctrees/API/abc/inference.doctree differ diff --git a/.doctrees/API/abc/inference/AbsEmbedder.doctree b/.doctrees/API/abc/inference/AbsEmbedder.doctree new file mode 100644 index 00000000..66315200 Binary files /dev/null and b/.doctrees/API/abc/inference/AbsEmbedder.doctree differ diff --git a/.doctrees/API/abc/inference/AbsReranker.doctree b/.doctrees/API/abc/inference/AbsReranker.doctree new file mode 100644 index 00000000..e48d489c Binary files /dev/null and b/.doctrees/API/abc/inference/AbsReranker.doctree differ diff --git a/.doctrees/API/evaluation.doctree b/.doctrees/API/evaluation.doctree new file mode 100644 index 00000000..f32d2465 Binary files /dev/null and b/.doctrees/API/evaluation.doctree differ diff --git a/.doctrees/API/evaluation/airbench.doctree b/.doctrees/API/evaluation/airbench.doctree new file mode 100644 index 00000000..ab26edd0 Binary files /dev/null and b/.doctrees/API/evaluation/airbench.doctree differ diff --git a/.doctrees/API/evaluation/airbench/arguments.doctree b/.doctrees/API/evaluation/airbench/arguments.doctree new file mode 100644 index 00000000..7d31c074 Binary files /dev/null and b/.doctrees/API/evaluation/airbench/arguments.doctree differ diff --git a/.doctrees/API/evaluation/airbench/runner.doctree b/.doctrees/API/evaluation/airbench/runner.doctree new file mode 100644 index 00000000..028979df Binary files /dev/null and b/.doctrees/API/evaluation/airbench/runner.doctree differ diff --git a/.doctrees/API/evaluation/beir.doctree b/.doctrees/API/evaluation/beir.doctree new file mode 100644 index 00000000..0b39e8a6 Binary files /dev/null and b/.doctrees/API/evaluation/beir.doctree differ diff --git a/.doctrees/API/evaluation/beir/arguments.doctree 
b/.doctrees/API/evaluation/beir/arguments.doctree new file mode 100644 index 00000000..6efa71e6 Binary files /dev/null and b/.doctrees/API/evaluation/beir/arguments.doctree differ diff --git a/.doctrees/API/evaluation/beir/data_loader.doctree b/.doctrees/API/evaluation/beir/data_loader.doctree new file mode 100644 index 00000000..cd09aadb Binary files /dev/null and b/.doctrees/API/evaluation/beir/data_loader.doctree differ diff --git a/.doctrees/API/evaluation/beir/evaluator.doctree b/.doctrees/API/evaluation/beir/evaluator.doctree new file mode 100644 index 00000000..bcf200d7 Binary files /dev/null and b/.doctrees/API/evaluation/beir/evaluator.doctree differ diff --git a/.doctrees/API/evaluation/beir/runner.doctree b/.doctrees/API/evaluation/beir/runner.doctree new file mode 100644 index 00000000..b30df356 Binary files /dev/null and b/.doctrees/API/evaluation/beir/runner.doctree differ diff --git a/.doctrees/API/evaluation/miracl.doctree b/.doctrees/API/evaluation/miracl.doctree new file mode 100644 index 00000000..f0993e96 Binary files /dev/null and b/.doctrees/API/evaluation/miracl.doctree differ diff --git a/.doctrees/API/evaluation/miracl/data_loader.doctree b/.doctrees/API/evaluation/miracl/data_loader.doctree new file mode 100644 index 00000000..44cd9db7 Binary files /dev/null and b/.doctrees/API/evaluation/miracl/data_loader.doctree differ diff --git a/.doctrees/API/evaluation/miracl/runner.doctree b/.doctrees/API/evaluation/miracl/runner.doctree new file mode 100644 index 00000000..45fd5ff0 Binary files /dev/null and b/.doctrees/API/evaluation/miracl/runner.doctree differ diff --git a/.doctrees/API/evaluation/mkqa.doctree b/.doctrees/API/evaluation/mkqa.doctree new file mode 100644 index 00000000..161fe549 Binary files /dev/null and b/.doctrees/API/evaluation/mkqa.doctree differ diff --git a/.doctrees/API/evaluation/mkqa/data_loader.doctree b/.doctrees/API/evaluation/mkqa/data_loader.doctree new file mode 100644 index 00000000..6abe6684 Binary files /dev/null and b/.doctrees/API/evaluation/mkqa/data_loader.doctree differ diff --git a/.doctrees/API/evaluation/mkqa/evaluator.doctree b/.doctrees/API/evaluation/mkqa/evaluator.doctree new file mode 100644 index 00000000..e310c1a2 Binary files /dev/null and b/.doctrees/API/evaluation/mkqa/evaluator.doctree differ diff --git a/.doctrees/API/evaluation/mkqa/runner.doctree b/.doctrees/API/evaluation/mkqa/runner.doctree new file mode 100644 index 00000000..f5b7146c Binary files /dev/null and b/.doctrees/API/evaluation/mkqa/runner.doctree differ diff --git a/.doctrees/API/evaluation/mldr.doctree b/.doctrees/API/evaluation/mldr.doctree new file mode 100644 index 00000000..df371214 Binary files /dev/null and b/.doctrees/API/evaluation/mldr.doctree differ diff --git a/.doctrees/API/evaluation/mldr/data_loader.doctree b/.doctrees/API/evaluation/mldr/data_loader.doctree new file mode 100644 index 00000000..47774bdc Binary files /dev/null and b/.doctrees/API/evaluation/mldr/data_loader.doctree differ diff --git a/.doctrees/API/evaluation/mldr/runner.doctree b/.doctrees/API/evaluation/mldr/runner.doctree new file mode 100644 index 00000000..38fdc27f Binary files /dev/null and b/.doctrees/API/evaluation/mldr/runner.doctree differ diff --git a/.doctrees/API/evaluation/msmarco.doctree b/.doctrees/API/evaluation/msmarco.doctree new file mode 100644 index 00000000..55a896d1 Binary files /dev/null and b/.doctrees/API/evaluation/msmarco.doctree differ diff --git a/.doctrees/API/evaluation/msmarco/data_loader.doctree 
b/.doctrees/API/evaluation/msmarco/data_loader.doctree new file mode 100644 index 00000000..5935d39e Binary files /dev/null and b/.doctrees/API/evaluation/msmarco/data_loader.doctree differ diff --git a/.doctrees/API/evaluation/msmarco/runner.doctree b/.doctrees/API/evaluation/msmarco/runner.doctree new file mode 100644 index 00000000..5295146d Binary files /dev/null and b/.doctrees/API/evaluation/msmarco/runner.doctree differ diff --git a/.doctrees/API/evaluation/mteb.doctree b/.doctrees/API/evaluation/mteb.doctree new file mode 100644 index 00000000..473ec5d8 Binary files /dev/null and b/.doctrees/API/evaluation/mteb.doctree differ diff --git a/.doctrees/API/evaluation/mteb/arguments.doctree b/.doctrees/API/evaluation/mteb/arguments.doctree new file mode 100644 index 00000000..e76a4fbf Binary files /dev/null and b/.doctrees/API/evaluation/mteb/arguments.doctree differ diff --git a/.doctrees/API/evaluation/mteb/runner.doctree b/.doctrees/API/evaluation/mteb/runner.doctree new file mode 100644 index 00000000..04d2537f Binary files /dev/null and b/.doctrees/API/evaluation/mteb/runner.doctree differ diff --git a/.doctrees/API/evaluation/mteb/searcher.doctree b/.doctrees/API/evaluation/mteb/searcher.doctree new file mode 100644 index 00000000..0ff9c928 Binary files /dev/null and b/.doctrees/API/evaluation/mteb/searcher.doctree differ diff --git a/.doctrees/API/finetune.doctree b/.doctrees/API/finetune.doctree new file mode 100644 index 00000000..fc5cb8a5 Binary files /dev/null and b/.doctrees/API/finetune.doctree differ diff --git a/.doctrees/API/finetune/embedder.doctree b/.doctrees/API/finetune/embedder.doctree new file mode 100644 index 00000000..1fe13a7a Binary files /dev/null and b/.doctrees/API/finetune/embedder.doctree differ diff --git a/.doctrees/API/finetune/embedder/decoder_only.doctree b/.doctrees/API/finetune/embedder/decoder_only.doctree new file mode 100644 index 00000000..a83f1749 Binary files /dev/null and b/.doctrees/API/finetune/embedder/decoder_only.doctree differ diff --git a/.doctrees/API/finetune/embedder/decoder_only/base.doctree b/.doctrees/API/finetune/embedder/decoder_only/base.doctree new file mode 100644 index 00000000..81f401fd Binary files /dev/null and b/.doctrees/API/finetune/embedder/decoder_only/base.doctree differ diff --git a/.doctrees/API/finetune/embedder/decoder_only/base/arguments.doctree b/.doctrees/API/finetune/embedder/decoder_only/base/arguments.doctree new file mode 100644 index 00000000..4a27b861 Binary files /dev/null and b/.doctrees/API/finetune/embedder/decoder_only/base/arguments.doctree differ diff --git a/.doctrees/API/finetune/embedder/decoder_only/base/modeling.doctree b/.doctrees/API/finetune/embedder/decoder_only/base/modeling.doctree new file mode 100644 index 00000000..7de758bd Binary files /dev/null and b/.doctrees/API/finetune/embedder/decoder_only/base/modeling.doctree differ diff --git a/.doctrees/API/finetune/embedder/decoder_only/base/runner.doctree b/.doctrees/API/finetune/embedder/decoder_only/base/runner.doctree new file mode 100644 index 00000000..91e42825 Binary files /dev/null and b/.doctrees/API/finetune/embedder/decoder_only/base/runner.doctree differ diff --git a/.doctrees/API/finetune/embedder/decoder_only/base/trainer.doctree b/.doctrees/API/finetune/embedder/decoder_only/base/trainer.doctree new file mode 100644 index 00000000..3b9193a0 Binary files /dev/null and b/.doctrees/API/finetune/embedder/decoder_only/base/trainer.doctree differ diff --git a/.doctrees/API/finetune/embedder/decoder_only/icl.doctree 
b/.doctrees/API/finetune/embedder/decoder_only/icl.doctree new file mode 100644 index 00000000..ffe58141 Binary files /dev/null and b/.doctrees/API/finetune/embedder/decoder_only/icl.doctree differ diff --git a/.doctrees/API/finetune/embedder/decoder_only/icl/arguments.doctree b/.doctrees/API/finetune/embedder/decoder_only/icl/arguments.doctree new file mode 100644 index 00000000..b98750f8 Binary files /dev/null and b/.doctrees/API/finetune/embedder/decoder_only/icl/arguments.doctree differ diff --git a/.doctrees/API/finetune/embedder/decoder_only/icl/dataset.doctree b/.doctrees/API/finetune/embedder/decoder_only/icl/dataset.doctree new file mode 100644 index 00000000..6d698f1a Binary files /dev/null and b/.doctrees/API/finetune/embedder/decoder_only/icl/dataset.doctree differ diff --git a/.doctrees/API/finetune/embedder/decoder_only/icl/modeling.doctree b/.doctrees/API/finetune/embedder/decoder_only/icl/modeling.doctree new file mode 100644 index 00000000..8c331196 Binary files /dev/null and b/.doctrees/API/finetune/embedder/decoder_only/icl/modeling.doctree differ diff --git a/.doctrees/API/finetune/embedder/decoder_only/icl/runner.doctree b/.doctrees/API/finetune/embedder/decoder_only/icl/runner.doctree new file mode 100644 index 00000000..98c90e2c Binary files /dev/null and b/.doctrees/API/finetune/embedder/decoder_only/icl/runner.doctree differ diff --git a/.doctrees/API/finetune/embedder/decoder_only/icl/trainer.doctree b/.doctrees/API/finetune/embedder/decoder_only/icl/trainer.doctree new file mode 100644 index 00000000..9235ac99 Binary files /dev/null and b/.doctrees/API/finetune/embedder/decoder_only/icl/trainer.doctree differ diff --git a/.doctrees/API/finetune/embedder/encoder_only.doctree b/.doctrees/API/finetune/embedder/encoder_only.doctree new file mode 100644 index 00000000..13fa2f86 Binary files /dev/null and b/.doctrees/API/finetune/embedder/encoder_only.doctree differ diff --git a/.doctrees/API/finetune/embedder/encoder_only/base.doctree b/.doctrees/API/finetune/embedder/encoder_only/base.doctree new file mode 100644 index 00000000..5656bf1b Binary files /dev/null and b/.doctrees/API/finetune/embedder/encoder_only/base.doctree differ diff --git a/.doctrees/API/finetune/embedder/encoder_only/base/modeling.doctree b/.doctrees/API/finetune/embedder/encoder_only/base/modeling.doctree new file mode 100644 index 00000000..43a00ab4 Binary files /dev/null and b/.doctrees/API/finetune/embedder/encoder_only/base/modeling.doctree differ diff --git a/.doctrees/API/finetune/embedder/encoder_only/base/runner.doctree b/.doctrees/API/finetune/embedder/encoder_only/base/runner.doctree new file mode 100644 index 00000000..71bb9af7 Binary files /dev/null and b/.doctrees/API/finetune/embedder/encoder_only/base/runner.doctree differ diff --git a/.doctrees/API/finetune/embedder/encoder_only/base/trainer.doctree b/.doctrees/API/finetune/embedder/encoder_only/base/trainer.doctree new file mode 100644 index 00000000..52447ebc Binary files /dev/null and b/.doctrees/API/finetune/embedder/encoder_only/base/trainer.doctree differ diff --git a/.doctrees/API/finetune/embedder/encoder_only/m3.doctree b/.doctrees/API/finetune/embedder/encoder_only/m3.doctree new file mode 100644 index 00000000..a14c4266 Binary files /dev/null and b/.doctrees/API/finetune/embedder/encoder_only/m3.doctree differ diff --git a/.doctrees/API/finetune/embedder/encoder_only/m3/arguments.doctree b/.doctrees/API/finetune/embedder/encoder_only/m3/arguments.doctree new file mode 100644 index 00000000..61aa2ed3 Binary files 
/dev/null and b/.doctrees/API/finetune/embedder/encoder_only/m3/arguments.doctree differ diff --git a/.doctrees/API/finetune/embedder/encoder_only/m3/modeling.doctree b/.doctrees/API/finetune/embedder/encoder_only/m3/modeling.doctree new file mode 100644 index 00000000..5e06ff23 Binary files /dev/null and b/.doctrees/API/finetune/embedder/encoder_only/m3/modeling.doctree differ diff --git a/.doctrees/API/finetune/embedder/encoder_only/m3/runner.doctree b/.doctrees/API/finetune/embedder/encoder_only/m3/runner.doctree new file mode 100644 index 00000000..1ef8e02c Binary files /dev/null and b/.doctrees/API/finetune/embedder/encoder_only/m3/runner.doctree differ diff --git a/.doctrees/API/finetune/embedder/encoder_only/m3/trainer.doctree b/.doctrees/API/finetune/embedder/encoder_only/m3/trainer.doctree new file mode 100644 index 00000000..1ab7aa06 Binary files /dev/null and b/.doctrees/API/finetune/embedder/encoder_only/m3/trainer.doctree differ diff --git a/.doctrees/API/finetune/reranker.doctree b/.doctrees/API/finetune/reranker.doctree new file mode 100644 index 00000000..cc1ed8c1 Binary files /dev/null and b/.doctrees/API/finetune/reranker.doctree differ diff --git a/.doctrees/API/finetune/reranker/decoder_only.doctree b/.doctrees/API/finetune/reranker/decoder_only.doctree new file mode 100644 index 00000000..b4add9db Binary files /dev/null and b/.doctrees/API/finetune/reranker/decoder_only.doctree differ diff --git a/.doctrees/API/finetune/reranker/decoder_only/base.doctree b/.doctrees/API/finetune/reranker/decoder_only/base.doctree new file mode 100644 index 00000000..61117270 Binary files /dev/null and b/.doctrees/API/finetune/reranker/decoder_only/base.doctree differ diff --git a/.doctrees/API/finetune/reranker/decoder_only/base/arguments.doctree b/.doctrees/API/finetune/reranker/decoder_only/base/arguments.doctree new file mode 100644 index 00000000..bd4e44ef Binary files /dev/null and b/.doctrees/API/finetune/reranker/decoder_only/base/arguments.doctree differ diff --git a/.doctrees/API/finetune/reranker/decoder_only/base/modeling.doctree b/.doctrees/API/finetune/reranker/decoder_only/base/modeling.doctree new file mode 100644 index 00000000..7a03f49c Binary files /dev/null and b/.doctrees/API/finetune/reranker/decoder_only/base/modeling.doctree differ diff --git a/.doctrees/API/finetune/reranker/decoder_only/base/runner.doctree b/.doctrees/API/finetune/reranker/decoder_only/base/runner.doctree new file mode 100644 index 00000000..10969ddd Binary files /dev/null and b/.doctrees/API/finetune/reranker/decoder_only/base/runner.doctree differ diff --git a/.doctrees/API/finetune/reranker/decoder_only/base/trainer.doctree b/.doctrees/API/finetune/reranker/decoder_only/base/trainer.doctree new file mode 100644 index 00000000..0f416c69 Binary files /dev/null and b/.doctrees/API/finetune/reranker/decoder_only/base/trainer.doctree differ diff --git a/.doctrees/API/finetune/reranker/decoder_only/layerwise.doctree b/.doctrees/API/finetune/reranker/decoder_only/layerwise.doctree new file mode 100644 index 00000000..805fa178 Binary files /dev/null and b/.doctrees/API/finetune/reranker/decoder_only/layerwise.doctree differ diff --git a/.doctrees/API/finetune/reranker/decoder_only/layerwise/arguments.doctree b/.doctrees/API/finetune/reranker/decoder_only/layerwise/arguments.doctree new file mode 100644 index 00000000..f3b8d026 Binary files /dev/null and b/.doctrees/API/finetune/reranker/decoder_only/layerwise/arguments.doctree differ diff --git 
a/.doctrees/API/finetune/reranker/decoder_only/layerwise/modeling.doctree b/.doctrees/API/finetune/reranker/decoder_only/layerwise/modeling.doctree new file mode 100644 index 00000000..1391a32e Binary files /dev/null and b/.doctrees/API/finetune/reranker/decoder_only/layerwise/modeling.doctree differ diff --git a/.doctrees/API/finetune/reranker/decoder_only/layerwise/runner.doctree b/.doctrees/API/finetune/reranker/decoder_only/layerwise/runner.doctree new file mode 100644 index 00000000..ccb82204 Binary files /dev/null and b/.doctrees/API/finetune/reranker/decoder_only/layerwise/runner.doctree differ diff --git a/.doctrees/API/finetune/reranker/decoder_only/layerwise/trainer.doctree b/.doctrees/API/finetune/reranker/decoder_only/layerwise/trainer.doctree new file mode 100644 index 00000000..eb8ce689 Binary files /dev/null and b/.doctrees/API/finetune/reranker/decoder_only/layerwise/trainer.doctree differ diff --git a/.doctrees/API/finetune/reranker/encoder_only.doctree b/.doctrees/API/finetune/reranker/encoder_only.doctree new file mode 100644 index 00000000..9c57e1bb Binary files /dev/null and b/.doctrees/API/finetune/reranker/encoder_only.doctree differ diff --git a/.doctrees/API/finetune/reranker/encoder_only/base.doctree b/.doctrees/API/finetune/reranker/encoder_only/base.doctree new file mode 100644 index 00000000..9b43a3bb Binary files /dev/null and b/.doctrees/API/finetune/reranker/encoder_only/base.doctree differ diff --git a/.doctrees/API/finetune/reranker/encoder_only/base/modeling.doctree b/.doctrees/API/finetune/reranker/encoder_only/base/modeling.doctree new file mode 100644 index 00000000..def5e5f9 Binary files /dev/null and b/.doctrees/API/finetune/reranker/encoder_only/base/modeling.doctree differ diff --git a/.doctrees/API/finetune/reranker/encoder_only/base/runner.doctree b/.doctrees/API/finetune/reranker/encoder_only/base/runner.doctree new file mode 100644 index 00000000..70007851 Binary files /dev/null and b/.doctrees/API/finetune/reranker/encoder_only/base/runner.doctree differ diff --git a/.doctrees/API/finetune/reranker/encoder_only/base/trainer.doctree b/.doctrees/API/finetune/reranker/encoder_only/base/trainer.doctree new file mode 100644 index 00000000..3fad5f94 Binary files /dev/null and b/.doctrees/API/finetune/reranker/encoder_only/base/trainer.doctree differ diff --git a/.doctrees/API/inference.doctree b/.doctrees/API/inference.doctree new file mode 100644 index 00000000..c09f1a9b Binary files /dev/null and b/.doctrees/API/inference.doctree differ diff --git a/.doctrees/API/inference/FlagAutoModel.doctree b/.doctrees/API/inference/FlagAutoModel.doctree new file mode 100644 index 00000000..7a862ebe Binary files /dev/null and b/.doctrees/API/inference/FlagAutoModel.doctree differ diff --git a/.doctrees/API/inference/FlagAutoReranker.doctree b/.doctrees/API/inference/FlagAutoReranker.doctree new file mode 100644 index 00000000..283df588 Binary files /dev/null and b/.doctrees/API/inference/FlagAutoReranker.doctree differ diff --git a/.doctrees/API/inference/embedder/decoder_only/BaseLLMEmbedder.doctree b/.doctrees/API/inference/embedder/decoder_only/BaseLLMEmbedder.doctree new file mode 100644 index 00000000..77d4bd30 Binary files /dev/null and b/.doctrees/API/inference/embedder/decoder_only/BaseLLMEmbedder.doctree differ diff --git a/.doctrees/API/inference/embedder/decoder_only/ICLLLMEmbedder.doctree b/.doctrees/API/inference/embedder/decoder_only/ICLLLMEmbedder.doctree new file mode 100644 index 00000000..c8b4e4a3 Binary files /dev/null and 
b/.doctrees/API/inference/embedder/decoder_only/ICLLLMEmbedder.doctree differ diff --git a/.doctrees/API/inference/embedder/embedder.doctree b/.doctrees/API/inference/embedder/embedder.doctree new file mode 100644 index 00000000..bbe63b79 Binary files /dev/null and b/.doctrees/API/inference/embedder/embedder.doctree differ diff --git a/.doctrees/API/inference/embedder/encoder_only/BaseEmbedder.doctree b/.doctrees/API/inference/embedder/encoder_only/BaseEmbedder.doctree new file mode 100644 index 00000000..8df8154e Binary files /dev/null and b/.doctrees/API/inference/embedder/encoder_only/BaseEmbedder.doctree differ diff --git a/.doctrees/API/inference/embedder/encoder_only/M3Embedder.doctree b/.doctrees/API/inference/embedder/encoder_only/M3Embedder.doctree new file mode 100644 index 00000000..6df1a52c Binary files /dev/null and b/.doctrees/API/inference/embedder/encoder_only/M3Embedder.doctree differ diff --git a/.doctrees/API/inference/reranker/decoder_only/BaseLLMReranker.doctree b/.doctrees/API/inference/reranker/decoder_only/BaseLLMReranker.doctree new file mode 100644 index 00000000..8e84399e Binary files /dev/null and b/.doctrees/API/inference/reranker/decoder_only/BaseLLMReranker.doctree differ diff --git a/.doctrees/API/inference/reranker/decoder_only/LayerWiseLLMReranker.doctree b/.doctrees/API/inference/reranker/decoder_only/LayerWiseLLMReranker.doctree new file mode 100644 index 00000000..94aa795e Binary files /dev/null and b/.doctrees/API/inference/reranker/decoder_only/LayerWiseLLMReranker.doctree differ diff --git a/.doctrees/API/inference/reranker/decoder_only/LightweightLLMReranker.doctree b/.doctrees/API/inference/reranker/decoder_only/LightweightLLMReranker.doctree new file mode 100644 index 00000000..8054dcca Binary files /dev/null and b/.doctrees/API/inference/reranker/decoder_only/LightweightLLMReranker.doctree differ diff --git a/.doctrees/API/inference/reranker/encoder_only/BaseReranker.doctree b/.doctrees/API/inference/reranker/encoder_only/BaseReranker.doctree new file mode 100644 index 00000000..6350dbe9 Binary files /dev/null and b/.doctrees/API/inference/reranker/encoder_only/BaseReranker.doctree differ diff --git a/.doctrees/API/inference/reranker/reranker.doctree b/.doctrees/API/inference/reranker/reranker.doctree new file mode 100644 index 00000000..0c55e0c3 Binary files /dev/null and b/.doctrees/API/inference/reranker/reranker.doctree differ diff --git a/.doctrees/C-MTEB.doctree b/.doctrees/C-MTEB.doctree new file mode 100644 index 00000000..da8ec98d Binary files /dev/null and b/.doctrees/C-MTEB.doctree differ diff --git a/.doctrees/Introduction/installation.doctree b/.doctrees/Introduction/installation.doctree new file mode 100644 index 00000000..0184ec5d Binary files /dev/null and b/.doctrees/Introduction/installation.doctree differ diff --git a/.doctrees/Introduction/quick_start.doctree b/.doctrees/Introduction/quick_start.doctree new file mode 100644 index 00000000..7b41a428 Binary files /dev/null and b/.doctrees/Introduction/quick_start.doctree differ diff --git a/.doctrees/bge/bge_icl.doctree b/.doctrees/bge/bge_icl.doctree new file mode 100644 index 00000000..6b70d5f8 Binary files /dev/null and b/.doctrees/bge/bge_icl.doctree differ diff --git a/.doctrees/bge/bge_m3.doctree b/.doctrees/bge/bge_m3.doctree new file mode 100644 index 00000000..9072b246 Binary files /dev/null and b/.doctrees/bge/bge_m3.doctree differ diff --git a/.doctrees/bge/bge_reranker.doctree b/.doctrees/bge/bge_reranker.doctree new file mode 100644 index 00000000..b37532f8 Binary 
files /dev/null and b/.doctrees/bge/bge_reranker.doctree differ diff --git a/.doctrees/bge/bge_v1.doctree b/.doctrees/bge/bge_v1.doctree new file mode 100644 index 00000000..e53b878e Binary files /dev/null and b/.doctrees/bge/bge_v1.doctree differ diff --git a/.doctrees/bge/introduction.doctree b/.doctrees/bge/introduction.doctree new file mode 100644 index 00000000..607d0bd4 Binary files /dev/null and b/.doctrees/bge/introduction.doctree differ diff --git a/.doctrees/environment.pickle b/.doctrees/environment.pickle new file mode 100644 index 00000000..4809fdee Binary files /dev/null and b/.doctrees/environment.pickle differ diff --git a/.doctrees/index.doctree b/.doctrees/index.doctree new file mode 100644 index 00000000..d5b90d12 Binary files /dev/null and b/.doctrees/index.doctree differ diff --git a/.doctrees/tutorial/1_Embedding.doctree b/.doctrees/tutorial/1_Embedding.doctree new file mode 100644 index 00000000..d6e2623a Binary files /dev/null and b/.doctrees/tutorial/1_Embedding.doctree differ diff --git a/.doctrees/tutorial/1_Embedding/1.1.1.doctree b/.doctrees/tutorial/1_Embedding/1.1.1.doctree new file mode 100644 index 00000000..12fd649e Binary files /dev/null and b/.doctrees/tutorial/1_Embedding/1.1.1.doctree differ diff --git a/.doctrees/tutorial/1_Embedding/1.2.1.doctree b/.doctrees/tutorial/1_Embedding/1.2.1.doctree new file mode 100644 index 00000000..e44a8170 Binary files /dev/null and b/.doctrees/tutorial/1_Embedding/1.2.1.doctree differ diff --git a/.doctrees/tutorial/1_Embedding/1.2.2.doctree b/.doctrees/tutorial/1_Embedding/1.2.2.doctree new file mode 100644 index 00000000..2c176f92 Binary files /dev/null and b/.doctrees/tutorial/1_Embedding/1.2.2.doctree differ diff --git a/.doctrees/tutorial/1_Embedding/1.2.3.doctree b/.doctrees/tutorial/1_Embedding/1.2.3.doctree new file mode 100644 index 00000000..edf8adc2 Binary files /dev/null and b/.doctrees/tutorial/1_Embedding/1.2.3.doctree differ diff --git a/.doctrees/tutorial/2_Metrics.doctree b/.doctrees/tutorial/2_Metrics.doctree new file mode 100644 index 00000000..67eb10e4 Binary files /dev/null and b/.doctrees/tutorial/2_Metrics.doctree differ diff --git a/.doctrees/tutorial/2_Metrics/2.1.doctree b/.doctrees/tutorial/2_Metrics/2.1.doctree new file mode 100644 index 00000000..1ae9a4f1 Binary files /dev/null and b/.doctrees/tutorial/2_Metrics/2.1.doctree differ diff --git a/.doctrees/tutorial/2_Metrics/2.2.doctree b/.doctrees/tutorial/2_Metrics/2.2.doctree new file mode 100644 index 00000000..49ef1f2b Binary files /dev/null and b/.doctrees/tutorial/2_Metrics/2.2.doctree differ diff --git a/.doctrees/tutorial/3_Indexing.doctree b/.doctrees/tutorial/3_Indexing.doctree new file mode 100644 index 00000000..c8cf3931 Binary files /dev/null and b/.doctrees/tutorial/3_Indexing.doctree differ diff --git a/.doctrees/tutorial/3_Indexing/3.1.1.doctree b/.doctrees/tutorial/3_Indexing/3.1.1.doctree new file mode 100644 index 00000000..f2af08af Binary files /dev/null and b/.doctrees/tutorial/3_Indexing/3.1.1.doctree differ diff --git a/.doctrees/tutorial/3_Indexing/3.1.2.doctree b/.doctrees/tutorial/3_Indexing/3.1.2.doctree new file mode 100644 index 00000000..262907ac Binary files /dev/null and b/.doctrees/tutorial/3_Indexing/3.1.2.doctree differ diff --git a/.doctrees/tutorial/3_Indexing/3.1.3.doctree b/.doctrees/tutorial/3_Indexing/3.1.3.doctree new file mode 100644 index 00000000..ce105dd5 Binary files /dev/null and b/.doctrees/tutorial/3_Indexing/3.1.3.doctree differ diff --git a/.doctrees/tutorial/3_Indexing/3.1.4.doctree 
b/.doctrees/tutorial/3_Indexing/3.1.4.doctree new file mode 100644 index 00000000..69e519cf Binary files /dev/null and b/.doctrees/tutorial/3_Indexing/3.1.4.doctree differ diff --git a/.doctrees/tutorial/3_Indexing/3.1.5.doctree b/.doctrees/tutorial/3_Indexing/3.1.5.doctree new file mode 100644 index 00000000..ee023552 Binary files /dev/null and b/.doctrees/tutorial/3_Indexing/3.1.5.doctree differ diff --git a/.doctrees/tutorial/4_Evaluation.doctree b/.doctrees/tutorial/4_Evaluation.doctree new file mode 100644 index 00000000..d3d3dfde Binary files /dev/null and b/.doctrees/tutorial/4_Evaluation.doctree differ diff --git a/.doctrees/tutorial/4_Evaluation/4.1.1.doctree b/.doctrees/tutorial/4_Evaluation/4.1.1.doctree new file mode 100644 index 00000000..b2036595 Binary files /dev/null and b/.doctrees/tutorial/4_Evaluation/4.1.1.doctree differ diff --git a/.doctrees/tutorial/4_Evaluation/4.2.1.doctree b/.doctrees/tutorial/4_Evaluation/4.2.1.doctree new file mode 100644 index 00000000..d648a6eb Binary files /dev/null and b/.doctrees/tutorial/4_Evaluation/4.2.1.doctree differ diff --git a/.doctrees/tutorial/4_Evaluation/4.2.2.doctree b/.doctrees/tutorial/4_Evaluation/4.2.2.doctree new file mode 100644 index 00000000..a022c36f Binary files /dev/null and b/.doctrees/tutorial/4_Evaluation/4.2.2.doctree differ diff --git a/.doctrees/tutorial/4_Evaluation/4.3.1.doctree b/.doctrees/tutorial/4_Evaluation/4.3.1.doctree new file mode 100644 index 00000000..495bc9a3 Binary files /dev/null and b/.doctrees/tutorial/4_Evaluation/4.3.1.doctree differ diff --git a/.doctrees/tutorial/5_Reranking.doctree b/.doctrees/tutorial/5_Reranking.doctree new file mode 100644 index 00000000..a50d3ea9 Binary files /dev/null and b/.doctrees/tutorial/5_Reranking.doctree differ diff --git a/.doctrees/tutorial/5_Reranking/5.1.doctree b/.doctrees/tutorial/5_Reranking/5.1.doctree new file mode 100644 index 00000000..60cd01d9 Binary files /dev/null and b/.doctrees/tutorial/5_Reranking/5.1.doctree differ diff --git a/.doctrees/tutorial/6_RAG.doctree b/.doctrees/tutorial/6_RAG.doctree new file mode 100644 index 00000000..abb76722 Binary files /dev/null and b/.doctrees/tutorial/6_RAG.doctree differ diff --git a/.doctrees/tutorial/6_RAG/6.1.doctree b/.doctrees/tutorial/6_RAG/6.1.doctree new file mode 100644 index 00000000..73dcd204 Binary files /dev/null and b/.doctrees/tutorial/6_RAG/6.1.doctree differ diff --git a/.doctrees/tutorial/6_RAG/6.2.doctree b/.doctrees/tutorial/6_RAG/6.2.doctree new file mode 100644 index 00000000..d5c1b230 Binary files /dev/null and b/.doctrees/tutorial/6_RAG/6.2.doctree differ diff --git a/.doctrees/tutorial/6_RAG/6.3.doctree b/.doctrees/tutorial/6_RAG/6.3.doctree new file mode 100644 index 00000000..5ebbb812 Binary files /dev/null and b/.doctrees/tutorial/6_RAG/6.3.doctree differ diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/API/abc.html b/API/abc.html new file mode 100644 index 00000000..28e32e9d --- /dev/null +++ b/API/abc.html @@ -0,0 +1,737 @@ + + + + + + + + + Abstract Class - FlagEmbedding + + + + + + + + + + + + + + + + + Contents + + + + + + Menu + + + + + + + + Expand + + + + + + Light mode + + + + + + + + + + + + + + Dark mode + + + + + + + Auto light/dark, in light mode + + + + + + + + + + + + + + + Auto light/dark, in dark mode + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Skip to content + + + +

Abstract Class

diff --git a/API/abc/evaluation.html b/API/abc/evaluation.html
new file mode 100644

Evaluation

diff --git a/API/abc/evaluation/arguments.html b/API/abc/evaluation/arguments.html
new file mode 100644

Arguments

diff --git a/API/abc/evaluation/data_loader.html b/API/abc/evaluation/data_loader.html
new file mode 100644

dataset loader

+
+

Methods

diff --git a/API/abc/evaluation/evaluator.html b/API/abc/evaluation/evaluator.html
new file mode 100644

Evaluator

diff --git a/API/abc/evaluation/runner.html b/API/abc/evaluation/runner.html
new file mode 100644

runner

diff --git a/API/abc/evaluation/searcher.html b/API/abc/evaluation/searcher.html
new file mode 100644

searcher

+
+

EvalRetriever

+
+
+

EvalDenseRetriever

+
+
+

EvalReranker

diff --git a/API/abc/finetune.html b/API/abc/finetune.html
new file mode 100644

Finetune

diff --git a/API/abc/finetune/embedder.html b/API/abc/finetune/embedder.html
new file mode 100644

Embedder

diff --git a/API/abc/finetune/embedder/AbsArguments.html b/API/abc/finetune/embedder/AbsArguments.html
new file mode 100644

AbsArguments

+
+
+class FlagEmbedding.abc.finetune.reranker.AbsRerankerModelArguments(model_name_or_path: str, config_name: str | None = None, tokenizer_name: str | None = None, cache_dir: str | None = None, trust_remote_code: bool = False, model_type: str = 'encoder', token: str = <factory>)[source]
+

Abstract class for reranker model arguments.

+
+ +
+
+class FlagEmbedding.abc.finetune.reranker.AbsRerankerDataArguments(train_data: str | None = None, cache_path: str | None = None, train_group_size: int = 8, query_max_len: int = 32, passage_max_len: int = 128, max_len: int = 512, pad_to_multiple_of: int | None = None, max_example_num_per_dataset: int = 100000000, query_instruction_for_rerank: str | None = None, query_instruction_format: str = '{}{}', knowledge_distillation: bool = False, passage_instruction_for_rerank: str | None = None, passage_instruction_format: str | None = '{}{}', shuffle_ratio: float = 0.0, sep_token: str = '\n')[source]
+

Abstract class for reranker data arguments.

+
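For orientation, the two argument classes above are plain dataclass-style containers and can be constructed directly. A minimal sketch follows; the field values are illustrative and only the parameter names shown in the signatures above are assumed.

from FlagEmbedding.abc.finetune.reranker import (
    AbsRerankerModelArguments,
    AbsRerankerDataArguments,
)

# Illustrative values only; point these at a real checkpoint and training file.
model_args = AbsRerankerModelArguments(
    model_name_or_path="BAAI/bge-reranker-base",
)
data_args = AbsRerankerDataArguments(
    train_data="./reranker_train_data.jsonl",
    train_group_size=8,
    query_max_len=32,
    passage_max_len=128,
)
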
diff --git a/API/abc/finetune/embedder/AbsDataset.html b/API/abc/finetune/embedder/AbsDataset.html
new file mode 100644

AbsDataset

+
+

AbsEmbedderTrainDataset

+
+
+class FlagEmbedding.abc.finetune.embedder.AbsEmbedderTrainDataset(args: AbsEmbedderDataArguments, tokenizer: PreTrainedTokenizer)[source]
+

Abstract class for training dataset.

+
+
Parameters:

  • args (AbsEmbedderDataArguments) – Data arguments.

  • tokenizer (PreTrainedTokenizer) – Tokenizer to use.
+ +
+

Methods

+
+
+AbsEmbedderTrainDataset._load_dataset(file_path: str)[source]
+

Load dataset from path.

+
+
Parameters:
+

file_path (str) – Path to load the datasets from.

+
+
Raises:
+

ValueErrorpos_scores and neg_scores not found in the features of training data

+
+
Returns:
+

Loaded HF dataset.

+
+
Return type:
+

datasets.Dataset

+
+
+
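The pos_scores and neg_scores mentioned above are fields of each training record. A hedged illustration of one record as a Python dict is shown below; query/pos/neg are the conventional fields and the score lists are only needed for knowledge distillation, but the exact schema should be checked against your FlagEmbedding version.

# One training example as it would appear (JSON-serialized, one object per line)
# in the file passed via train_data. All values are made up for illustration.
example = {
    "query": "what is a dense embedding?",
    "pos": ["A dense embedding maps text to a low-dimensional real-valued vector ..."],
    "neg": ["BM25 is a sparse lexical scoring function ..."],
    "pos_scores": [0.95],  # optional teacher scores, used for knowledge distillation
    "neg_scores": [0.10],
}
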
+ +
+
+AbsEmbedderTrainDataset._shuffle_text(text)[source]
+

Shuffle the input text.

+
+
Parameters:
+

text (str) – Input text.

+
+
Returns:
+

Shuffled text.

+
+
Return type:
+

str

+
+
+
+ +
+
+
+

AbsEmbedderCollator

+
+
+class FlagEmbedding.abc.finetune.embedder.AbsEmbedderCollator(tokenizer: PreTrainedTokenizerBase, padding: bool | str | PaddingStrategy = True, max_length: int | None = None, pad_to_multiple_of: int | None = None, return_tensors: str = 'pt', query_max_len: int = 32, passage_max_len: int = 128, sub_batch_size: int = -1)[source]
+

The abstract embedder collator.

+
+ +
+
+

AbsEmbedderSameDatasetTrainDataset

+
+
+class FlagEmbedding.abc.finetune.embedder.AbsEmbedderSameDatasetTrainDataset(args: AbsEmbedderDataArguments, default_batch_size: int, seed: int, tokenizer: PreTrainedTokenizer, process_index: int = 0, num_processes: int = 1)[source]
+

Abstract class for training dataset that samples batches from the same dataset.

+
+
Parameters:
+
  • args (AbsEmbedderDataArguments) – Data arguments.

  • default_batch_size (int) – The default batch size for training.

  • seed (int) – Random seed.

  • tokenizer (PreTrainedTokenizer) – Tokenizer to use.

  • process_index (int, optional) – Current process index. Defaults to 0.

  • num_processes (int, optional) – Total number of processes. Defaults to 1.
+
+
+
+ +
+

Methods

+
+
+AbsEmbedderSameDatasetTrainDataset.refresh_epoch()[source]
+

Refresh data for epoch.

+
+ +
+
+AbsEmbedderSameDatasetTrainDataset._load_dataset(file_path: str)[source]
+

Load dataset from the given path.

+
+
Parameters:
+

file_path (str) – The path to load or download from HF hub.

+
+
Returns:
+

The loaded dataset.

+
+
Return type:
+

datasets.Dataset

+
+
+
+ +
+
+static AbsEmbedderSameDatasetTrainDataset._get_file_batch_size(temp_dataset: Dataset, default_batch_size: int)[source]
+

Get the appropriate batch size for the dataset.

+
+
Parameters:
+
  • temp_dataset (datasets.Dataset) – Loaded datasets.Dataset object.

  • default_batch_size (int) – The default batch size to use if not specified in the dataset.
+
+
Returns:
+

The final batch size to use.

+
+
Return type:
+

int

+
+
+
+ +
+
+AbsEmbedderSameDatasetTrainDataset._get_train_group_size(batch_raw_data)[source]
+

Get the training group size and data type.

+
+
Parameters:
+

batch_raw_data (datasets.Dataset) – One batch of raw data.

+
+
Returns:
+

The training group size. str: The type of data for the task.

+
+
Return type:
+

int

+
+
+
+ +
+
+AbsEmbedderSameDatasetTrainDataset._create_batch_data(batch_raw_data)[source]
+

Create a complete batch of data with queries, documents and teacher scores.

+
+
Parameters:
+

batch_raw_data (datasets.Dataset) – One batch of raw data.

+
+
Returns:
+

Queries with instruction format. List[str]: Documents with instruction format. List[float]: Teacher scores for model distillation.

+
+
Return type:
+

List[str]

+
+
+
+ +
+
+
+

AbsEmbedderSameDatasetCollator

+
+
+class FlagEmbedding.abc.finetune.embedder.AbsEmbedderSameDatasetCollator(tokenizer: PreTrainedTokenizerBase, padding: bool | str | PaddingStrategy = True, max_length: int | None = None, pad_to_multiple_of: int | None = None, return_tensors: str = 'pt', query_max_len: int = 32, passage_max_len: int = 128, sub_batch_size: int = -1)[source]
+

EmbedCollator for SameDataset. Note that after using this collator, the training_args should be set as:

+

training_args.per_device_train_batch_size = 1

+

training_args.dataloader_num_workers = 0    # avoid multi-processing

+
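A small sketch of the two settings above, using transformers.TrainingArguments as a stand-in for the embedder training-argument class (the real class name and any extra fields are not shown here):

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./embedder_output",      # illustrative path
    per_device_train_batch_size=1,       # each dataset item is already a full batch
    dataloader_num_workers=0,            # avoid multi-processing
)
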
+ +
+
+

EmbedderTrainerCallbackForDataRefresh

+
+
+class FlagEmbedding.abc.finetune.embedder.EmbedderTrainerCallbackForDataRefresh(train_dataset: AbsEmbedderSameDatasetTrainDataset)[source]
+

Callback class to inspect the state of the training loop and take decisions.

+
+ +
+

Methods

+
+
+EmbedderTrainerCallbackForDataRefresh.on_epoch_end(args: AbsEmbedderTrainingArguments, state: TrainerState, control: TrainerControl, **kwargs)[source]
+

Event called at the end of an epoch.

+
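In practice this callback wraps the same-dataset training set so that its data is re-sampled when an epoch ends. A hedged sketch follows; the trainer stands for any HuggingFace-style trainer (e.g. a concrete AbsEmbedderTrainer subclass) and is assumed to be constructed elsewhere.

from FlagEmbedding.abc.finetune.embedder import EmbedderTrainerCallbackForDataRefresh

# `train_dataset` is an AbsEmbedderSameDatasetTrainDataset and `trainer` a
# HuggingFace-style trainer; both are assumed to exist already.
trainer.add_callback(EmbedderTrainerCallbackForDataRefresh(train_dataset))
# on_epoch_end() can then trigger train_dataset.refresh_epoch() after every epoch.
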
diff --git a/API/abc/finetune/embedder/AbsModeling.html b/API/abc/finetune/embedder/AbsModeling.html
new file mode 100644


AbsModeling

+
+

AbsEmbedderModel

+
+
+class FlagEmbedding.abc.finetune.embedder.AbsEmbedderModel(base_model, tokenizer: AutoTokenizer | None = None, negatives_cross_device: bool = False, temperature: float = 1.0, sub_batch_size: int = -1, kd_loss_type: str = 'kl_div')[source]
+

Abstract class of embedding model for training.

+
+
Parameters:
+
  • base_model – The base model to train on.

  • tokenizer (AutoTokenizer, optional) – The tokenizer to use. Defaults to None.

  • negatives_cross_device (bool, optional) – If True, will compute cross-device negative loss. Defaults to False.

  • temperature (float, optional) – Temperature to control the scale of scores. Defaults to 1.0.

  • sub_batch_size (int, optional) – Sub-batch size during encoding. If negative, will not split to sub-batch. Defaults to -1.

  • kd_loss_type (str, optional) – Type of knowledge distillation loss. Defaults to "kl_div".
+
+
+
+ +
+

Methods

+
+
+abstract AbsEmbedderModel.encode(features)[source]
+

Abstract method encode and get the embedding.

+
+
Parameters:
+

features (Union[list, dict]) – Features feed to the model.

+
+
+
+ +
+
+abstract AbsEmbedderModel.compute_loss(scores, target)[source]
+

Abstract method compute the loss.

+
+
Parameters:
+
  • scores (torch.Tensor) – Computed score.

  • target (torch.Tensor) – The target value.
+
+
+
+ +
+
+abstract AbsEmbedderModel.compute_score(q_reps, p_reps)[source]
+

Abstract method to compute the score.

+
+
Parameters:
+
  • q_reps (torch.Tensor) – Queries representations.

  • p_reps (torch.Tensor) – Passages representations.
+
+
+
+ +
+
+abstract AbsEmbedderModel.save(output_dir: str)[source]
+

Abstract method to save the model.

+
+
Parameters:
+

output_dir (str) – Directory for saving the model.

+
+
+
+ +
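The four abstract methods above (encode, compute_loss, compute_score, save) are what a concrete embedder model must implement. A minimal, illustrative subclass follows; attribute names such as self.model and self.temperature are assumed from the constructor arguments, and this is a sketch rather than the library's actual implementation.

import torch
import torch.nn.functional as F
from FlagEmbedding.abc.finetune.embedder import AbsEmbedderModel

class ToyEmbedderModel(AbsEmbedderModel):
    def encode(self, features):
        # Use the first token's hidden state as the sentence embedding.
        out = self.model(**features, return_dict=True)
        return F.normalize(out.last_hidden_state[:, 0], dim=-1)

    def compute_score(self, q_reps, p_reps):
        # Scaled dot-product similarity between queries and passages.
        return q_reps @ p_reps.transpose(-2, -1) / self.temperature

    def compute_loss(self, scores, target):
        return F.cross_entropy(scores, target)

    def save(self, output_dir: str):
        self.model.save_pretrained(output_dir)
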
+
+AbsEmbedderModel.get_local_score(q_reps, p_reps, all_scores)[source]
+

Get the local score of queries and passages.

+
+
Parameters:
+
  • q_reps (torch.Tensor) – Queries representations.

  • p_reps (torch.Tensor) – Passages representations.

  • all_scores (torch.Tensor) – All the query-passage scores computed.
+
+
Returns:
+

Local scores to compute loss.

+
+
Return type:
+

torch.Tensor

+
+
+
+ +
+
+AbsEmbedderModel.compute_local_score(q_reps, p_reps, compute_score_func=None, **kwargs)[source]
+

Compute the local score of queries and passages.

+
+
Parameters:
+
  • q_reps (torch.Tensor) – Queries representations.

  • p_reps (torch.Tensor) – Passages representations.

  • compute_score_func (function, optional) – Function to compute score. Defaults to None, which will use self.compute_score().
+
+
Returns:
+

Local scores to compute loss.

+
+
Return type:
+

torch.Tensor

+
+
+
+ +
+
+AbsEmbedderModel.forward(queries: Dict[str, Tensor] | List[Dict[str, Tensor]] | None = None, passages: Dict[str, Tensor] | List[Dict[str, Tensor]] | None = None, teacher_scores: None | List[float] = None, no_in_batch_neg_flag: bool = False)[source]
+

The computation performed at every call.

+
+
Parameters:
+
  • queries (Union[Dict[str, Tensor], List[Dict[str, Tensor]]], optional) – Input queries. Defaults to None.

  • passages (Union[Dict[str, Tensor], List[Dict[str, Tensor]]], optional) – Input passages. Defaults to None.

  • teacher_scores (Union[None, List[float]], optional) – Teacher scores for distillation. Defaults to None.

  • no_in_batch_neg_flag (bool, optional) – If True, use no in-batch negatives and no cross-device negatives. Defaults to False.
+
+
Returns:
+

Output of the forward call of model.

+
+
Return type:
+

EmbedderOutput

+
+
+
+ +
+
+static AbsEmbedderModel.distill_loss(kd_loss_type, teacher_targets, student_scores, group_size=None)[source]
+

Compute the distillation loss.

+
+
Parameters:
+
  • kd_loss_type (str) – Type of knowledge distillation loss, supports “kl_div” and “m3_kd_loss”.

  • teacher_targets (torch.Tensor) – Targets from the teacher model.

  • student_scores (torch.Tensor) – Score of student model.

  • group_size (int, optional) – Group size used to reshape the student scores for “m3_kd_loss”. Defaults to None.
+
+
Raises:
+

ValueError – Invalid kd_loss_type

+
+
Returns:
+

A scalar of computed distillation loss.

+
+
Return type:
+

torch.Tensor

+
+
+
+ +
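For intuition, the “kl_div” option amounts to pushing the student's score distribution toward the teacher's soft targets. A standalone sketch of that idea follows (soft-target cross-entropy, which differs from true KL divergence only by a constant; not necessarily the library's exact formula).

import torch

def kl_div_style_distill_loss(teacher_targets: torch.Tensor,
                              student_scores: torch.Tensor) -> torch.Tensor:
    # teacher_targets: soft probabilities from the teacher, shape (batch, group)
    # student_scores:  raw similarity scores from the student, same shape
    log_probs = torch.log_softmax(student_scores, dim=-1)
    return -(teacher_targets * log_probs).sum(dim=-1).mean()
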
+
+AbsEmbedderModel._compute_no_in_batch_neg_loss(q_reps, p_reps, teacher_targets=None, compute_score_func=None, **kwargs)[source]
+

Compute loss when using no in-batch negatives and no cross-device negatives

+
+ +
+
+AbsEmbedderModel._compute_in_batch_neg_loss(q_reps, p_reps, teacher_targets=None, compute_score_func=None, **kwargs)[source]
+

Compute loss when only using in-batch negatives

+
+ +
+
+AbsEmbedderModel._compute_cross_device_neg_loss(q_reps, p_reps, teacher_targets=None, compute_score_func=None, **kwargs)[source]
+

Compute loss when using both in-batch negatives and cross-device negatives

+
+ +
+
+AbsEmbedderModel._dist_gather_tensor(t: Tensor | None)[source]
+

Gather a tensor from all processes in a distributed setting.

+
+
Parameters:
+

t (Optional[torch.Tensor]) – The input tensor to be gathered. If None, no gathering is performed.

+
+
Returns:
+

+
A concatenated tensor from all processes if t is not None; otherwise returns None.

+
+
+

+
+
Return type:
+

Union[torch.Tensor, None]

+
+
+
+ +
+
+
+

EmbedderOutput

+
+
+class FlagEmbedding.abc.finetune.embedder.EmbedderOutput(q_reps: Tensor | None = None, p_reps: Tensor | None = None, loss: Tensor | None = None, scores: Tensor | None = None)[source]
+

Output information returned by the model.

diff --git a/API/abc/finetune/embedder/AbsRunner.html b/API/abc/finetune/embedder/AbsRunner.html
new file mode 100644


AbsRunner

+
+

AbsEmbedderRunner

+
+
+class FlagEmbedding.abc.finetune.embedder.AbsEmbedderRunner(model_args: AbsEmbedderModelArguments, data_args: AbsEmbedderDataArguments, training_args: AbsEmbedderTrainingArguments)[source]
+

Abstract class to run embedding model fine-tuning.

+
+
Parameters:

  • model_args (AbsEmbedderModelArguments) – Model arguments.

  • data_args (AbsEmbedderDataArguments) – Data arguments.

  • training_args (AbsEmbedderTrainingArguments) – Training arguments.
+ +
+

Methods

+
+
+abstract AbsEmbedderRunner.load_tokenizer_and_model() → Tuple[PreTrainedTokenizer, AbsEmbedderModel][source]
+

Abstract method to load the tokenizer and model.

+
+
Returns:
+

Loaded tokenizer and model instances.

+
+
Return type:
+

Tuple[PreTrainedTokenizer, AbsEmbedderModel]

+
+
+
+ +
+
+abstract AbsEmbedderRunner.load_trainer() → AbsEmbedderTrainer[source]
+

Abstract method to load the trainer.

+
+
Returns:
+

The loaded trainer instance.

+
+
Return type:
+

AbsEmbedderTrainer

+
+
+
+ +
+
+AbsEmbedderRunner.load_train_dataset() AbsEmbedderTrainDataset[source]
+

Loads the training dataset based on data arguments.

+
+
Returns:
+

The loaded dataset instance.

+
+
Return type:
+

AbsEmbedderTrainDataset

+
+
+
+ +
+
+AbsEmbedderRunner.load_data_collator() AbsEmbedderCollator[source]
+

Loads the appropriate data collator.

+
+
Returns:
+

Loaded data collator.

+
+
Return type:
+

AbsEmbedderCollator

+
+
+
+ +
+
+AbsEmbedderRunner.run()[source]
+

Executes the training process.

+
+ +
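Taken together, a concrete runner typically wires these hooks up roughly as follows. This is only a sketch of the control flow implied by the methods above; the actual run() may additionally handle checkpoint resumption, saving, and logging:

def run_finetune(runner):
    # runner: an instance of a concrete AbsEmbedderRunner subclass
    tokenizer, model = runner.load_tokenizer_and_model()
    train_dataset = runner.load_train_dataset()
    data_collator = runner.load_data_collator()
    trainer = runner.load_trainer()   # expected to combine model, dataset and collator
    trainer.train()
    return trainer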
+
+
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/abc/finetune/embedder/AbsTrainer.html b/API/abc/finetune/embedder/AbsTrainer.html new file mode 100644 index 00000000..fe3a20ce --- /dev/null +++ b/API/abc/finetune/embedder/AbsTrainer.html @@ -0,0 +1,578 @@ AbsTrainer - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

AbsTrainer

+
+

AbsEmbedderTrainer

+
+
+class FlagEmbedding.abc.finetune.embedder.AbsEmbedderTrainer(model: PreTrainedModel | Module | None = None, args: TrainingArguments | None = None, data_collator: DataCollator | None = None, train_dataset: Dataset | IterableDataset | Dataset | None = None, eval_dataset: Dataset | Dict[str, Dataset] | Dataset | None = None, tokenizer: PreTrainedTokenizerBase | None = None, model_init: Callable[[], PreTrainedModel] | None = None, compute_metrics: Callable[[EvalPrediction], Dict] | None = None, callbacks: List[TrainerCallback] | None = None, optimizers: Tuple[Optimizer, LambdaLR] = (None, None), preprocess_logits_for_metrics: Callable[[Tensor, Tensor], Tensor] | None = None)[source]
+

Abstract class for the trainer of embedder.

+
+ +
+

Methods

+
+
+AbsEmbedderTrainer.compute_loss(model, inputs, return_outputs=False, **kwargs)[source]
+

How the loss is computed by Trainer. By default, all models return the loss in the first element.

+

Subclass and override for custom behavior.

+
+
Parameters:
+
    +
  • model (AbsEmbedderModel) – The model being trained.

  • +
  • inputs (dict) – A dictionary of input tensors to be passed to the model.

  • +
  • return_outputs (bool, optional) – If True, returns both the loss and the model’s outputs. Otherwise, +returns only the loss.

  • +
+
+
Returns:
+

+
The computed loss. If return_outputs is True,

also returns the model’s outputs in a tuple (loss, outputs).

+
+
+

+
+
Return type:
+

Union[torch.Tensor, tuple(torch.Tensor, EmbedderOutput)]

+
+
+
+ +
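For reference, a subclass override that follows the contract above might look like this sketch (assuming the model's forward pass returns an EmbedderOutput whose .loss field holds the training loss):

from FlagEmbedding.abc.finetune.embedder import AbsEmbedderTrainer

class MyEmbedderTrainer(AbsEmbedderTrainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        outputs = model(**inputs)   # forward pass of the embedder model
        loss = outputs.loss
        # return (loss, outputs) when the caller also needs the model outputs
        return (loss, outputs) if return_outputs else loss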
+
+
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/abc/finetune/reranker.html b/API/abc/finetune/reranker.html new file mode 100644 index 00000000..93de9e5d --- /dev/null +++ b/API/abc/finetune/reranker.html @@ -0,0 +1,594 @@ Reranker - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

Reranker

+ +
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/abc/finetune/reranker/AbsArguments.html b/API/abc/finetune/reranker/AbsArguments.html new file mode 100644 index 00000000..47f58ad4 --- /dev/null +++ b/API/abc/finetune/reranker/AbsArguments.html @@ -0,0 +1,545 @@ AbsArguments - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

AbsArguments

+
+
+class FlagEmbedding.abc.finetune.embedder.AbsEmbedderModelArguments(model_name_or_path: str, config_name: str | None = None, tokenizer_name: str | None = None, cache_dir: str | None = None, trust_remote_code: bool = False, token: str = <factory>)[source]
+

Abstract class for model arguments.

+
+ +
+
+class FlagEmbedding.abc.finetune.embedder.AbsEmbedderDataArguments(train_data: str | None = None, cache_path: str | None = None, train_group_size: int = 8, query_max_len: int = 32, passage_max_len: int = 128, pad_to_multiple_of: int | None = None, max_example_num_per_dataset: int = 100000000, query_instruction_for_retrieval: str | None = None, query_instruction_format: str = '{}{}', knowledge_distillation: bool = False, passage_instruction_for_retrieval: str | None = None, passage_instruction_format: str | None = '{}{}', shuffle_ratio: float = 0.0, same_dataset_within_batch: bool = False, small_threshold: int = 0, drop_threshold: int = 0)[source]
+

Abstract class for data arguments.

+
+ +
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/abc/finetune/reranker/AbsDataset.html b/API/abc/finetune/reranker/AbsDataset.html new file mode 100644 index 00000000..baeb005a --- /dev/null +++ b/API/abc/finetune/reranker/AbsDataset.html @@ -0,0 +1,667 @@ AbsDataset - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

AbsDataset

+
+

AbsRerankerTrainDataset

+
+
+class FlagEmbedding.abc.finetune.reranker.AbsRerankerTrainDataset(args: AbsRerankerDataArguments, tokenizer: PreTrainedTokenizer)[source]
+

Abstract class for reranker training dataset.

+
+
Parameters:
+
+
+
+
+ +
+

Methods

+
+
+AbsRerankerTrainDataset.create_one_example(qry_encoding: str, doc_encoding: str)[source]
+

Creates a single input example by encoding and preparing a query and document pair for the model.

+
+
Parameters:
+
    +
  • qry_encoding (str) – Query to be encoded.

  • +
  • doc_encoding (str) – Document to be encoded.

  • +
+
+
Returns:
+

A dictionary containing tokenized and prepared inputs, ready for model consumption.

+
+
Return type:
+

dict

+
+
+
+ +
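Conceptually, preparing one (query, document) example amounts to a joint tokenizer call over the pair, as in the sketch below (the checkpoint name and maximum length are placeholders, not the dataset's actual settings):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")   # placeholder checkpoint

def create_one_example(query: str, document: str, max_len: int = 160) -> dict:
    # encode the pair as a single cross-encoder input; padding is applied later by the collator
    return tokenizer(query, document, truncation=True, max_length=max_len, padding=False)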
+
+AbsRerankerTrainDataset._load_dataset(file_path: str)[source]
+

Load dataset from path.

+
+
Parameters:
+

file_path (str) – Path to load the datasets from.

+
+
Raises:
+

ValueError – pos_scores and neg_scores not found in the features of training data

+
+
Returns:
+

Loaded HF dataset.

+
+
Return type:
+

datasets.Dataset

+
+
+
+ +
+
+AbsRerankerTrainDataset._shuffle_text(text)[source]
+

shuffle the input text.

+
+
Parameters:
+

text (str) – Input text.

+
+
Returns:
+

Shuffled text.

+
+
Return type:
+

str

+
+
+
+ +
+
+
+

AbsRerankerCollator

+
+
+class FlagEmbedding.abc.finetune.reranker.AbsRerankerCollator(tokenizer: PreTrainedTokenizerBase, padding: bool | str | PaddingStrategy = True, max_length: int | None = None, pad_to_multiple_of: int | None = None, return_tensors: str = 'pt', query_max_len: int = 32, passage_max_len: int = 128)[source]
+

The abstract reranker collator.

+
+ +
+
+

AbsLLMRerankerTrainDataset

+
+
+class FlagEmbedding.abc.finetune.reranker.AbsLLMRerankerTrainDataset(args: AbsRerankerDataArguments, tokenizer: PreTrainedTokenizer)[source]
+

Abstract class for LLM reranker training dataset.

+
+
Parameters:
+
+
+
+
+ +
+
+

AbsLLMRerankerCollator

+
+
+class FlagEmbedding.abc.finetune.reranker.AbsLLMRerankerCollator(tokenizer: PreTrainedTokenizerBase, model: Any | None = None, padding: bool | str | PaddingStrategy = True, max_length: int | None = None, pad_to_multiple_of: int | None = None, label_pad_token_id: int = -100, return_tensors: str = 'pt', query_max_len: int = 32, passage_max_len: int = 128)[source]
+

Wrapper that converts a List[Tuple[encode_qry, encode_psg]] into List[qry], List[psg] and passes each batch separately to the actual collator, abstracting data details away from the model.

+
+ +
+
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/abc/finetune/reranker/AbsModeling.html b/API/abc/finetune/reranker/AbsModeling.html new file mode 100644 index 00000000..da181de4 --- /dev/null +++ b/API/abc/finetune/reranker/AbsModeling.html @@ -0,0 +1,658 @@ AbsModeling - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

AbsModeling

+
+

AbsRerankerModel

+
+
+class FlagEmbedding.abc.finetune.reranker.AbsRerankerModel(base_model: None, tokenizer: AutoTokenizer | None = None, train_batch_size: int = 4)[source]
+

Abstract class of reranker model for training.

+
+
Parameters:
+
    +
  • base_model – The base model to train on.

  • +
  • tokenizer (AutoTokenizer, optional) – The tokenizer to use. Defaults to None.

  • +
  • train_batch_size (int, optional) – Batch size used for training. Defaults to 4.

  • +
+
+
+
+ +
+

Methods

+
+
+abstract AbsRerankerModel.encode(features)[source]
+

Abstract method of encode.

+
+
Parameters:
+

features (dict) – Features to pass to the model.

+
+
+
+ +
+
+AbsRerankerModel.gradient_checkpointing_enable(**kwargs)[source]
+

Activates gradient checkpointing for the current model.

+
+ +
+
+AbsRerankerModel.enable_input_require_grads(**kwargs)[source]
+

Enables the gradients for the input embeddings.

+
+ +
+
+AbsRerankerModel.forward(pair: Dict[str, Tensor] | List[Dict[str, Tensor]] | None = None, teacher_scores: Tensor | None = None)[source]
+

The computation performed at every call.

+
+
Parameters:
+
    +
  • pair (Union[Dict[str, Tensor], List[Dict[str, Tensor]]], optional) – The query-document pair. Defaults to None.

  • +
  • teacher_scores (Optional[Tensor], optional) – Teacher scores of knowledge distillation. Defaults to None.

  • +
+
+
Returns:
+

Output of reranker model.

+
+
Return type:
+

RerankerOutput

+
+
+
+ +
+
+AbsRerankerModel.compute_loss(scores, target)[source]
+

Compute the loss.

+
+
Parameters:
+
    +
  • scores (torch.Tensor) – Computed scores.

  • +
  • target (torch.Tensor) – The target value.

  • +
+
+
Returns:
+

The computed loss.

+
+
Return type:
+

torch.Tensor

+
+
+
+ +
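As a rough illustration, a typical choice for this loss in cross-encoder rerankers is cross-entropy over the scores of each query's passage group (a sketch only; the library's actual objective may differ):

import torch
import torch.nn.functional as F

def compute_loss(scores: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
    # scores: (batch, group_size) logits for one positive and several negatives per query
    # target: (batch,) index of the positive passage within each group
    return F.cross_entropy(scores, target)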
+
+AbsRerankerModel.save(output_dir: str)[source]
+

Save the model.

+
+
Parameters:
+

output_dir (str) – Directory for saving the model.

+
+
+
+ +
+
+AbsRerankerModel.save_pretrained(*args, **kwargs)[source]
+

Save the tokenizer and model.

+
+ +
+
+
+

RerankerOutput

+
+
+class FlagEmbedding.abc.finetune.reranker.RerankerOutput(loss: torch.Tensor | None = None, scores: torch.Tensor | None = None)[source]
+
+ +
+
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/abc/finetune/reranker/AbsRunner.html b/API/abc/finetune/reranker/AbsRunner.html new file mode 100644 index 00000000..041f4dba --- /dev/null +++ b/API/abc/finetune/reranker/AbsRunner.html @@ -0,0 +1,626 @@ AbsRunner - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

AbsRunner

+
+

AbsRerankerRunner

+
+
+class FlagEmbedding.abc.finetune.reranker.AbsRerankerRunner(model_args: AbsRerankerModelArguments, data_args: AbsRerankerDataArguments, training_args: AbsRerankerTrainingArguments)[source]
+

Abstract class to run reranker model fine-tuning.

+
+
Parameters:
+
+
+
+
+ +
+

Methods

+
+
+abstract AbsRerankerRunner.load_tokenizer_and_model() Tuple[PreTrainedTokenizer, AbsRerankerModel][source]
+

Abstract method to load the tokenizer and model.

+
+
Returns:
+

Loaded tokenizer and model instances.

+
+
Return type:
+

Tuple[PreTrainedTokenizer, AbsRerankerModel]

+
+
+
+ +
+
+abstract AbsRerankerRunner.load_trainer() AbsRerankerTrainer[source]
+

Abstract method to load the trainer.

+
+
Returns:
+

The loaded trainer instance.

+
+
Return type:
+

AbsRerankerTrainer

+
+
+
+ +
+
+AbsRerankerRunner.load_train_dataset() AbsRerankerTrainDataset[source]
+

Loads the training dataset based on data arguments.

+
+
Returns:
+

The loaded dataset instance.

+
+
Return type:
+

AbsRerankerTrainDataset

+
+
+
+ +
+
+AbsRerankerRunner.load_data_collator() AbsRerankerCollator[source]
+

Loads the appropriate data collator.

+
+
Returns:
+

Loaded data collator.

+
+
Return type:
+

AbsRerankerCollator

+
+
+
+ +
+
+AbsRerankerRunner.run()[source]
+

Executes the training process.

+
+ +
+
+
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/abc/finetune/reranker/AbsTrainer.html b/API/abc/finetune/reranker/AbsTrainer.html new file mode 100644 index 00000000..c37fba8c --- /dev/null +++ b/API/abc/finetune/reranker/AbsTrainer.html @@ -0,0 +1,578 @@ AbsTrainer - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

AbsTrainer

+
+

AbsRerankerTrainer

+
+
+class FlagEmbedding.abc.finetune.reranker.AbsRerankerTrainer(model: PreTrainedModel | Module | None = None, args: TrainingArguments | None = None, data_collator: DataCollator | None = None, train_dataset: Dataset | IterableDataset | Dataset | None = None, eval_dataset: Dataset | Dict[str, Dataset] | Dataset | None = None, tokenizer: PreTrainedTokenizerBase | None = None, model_init: Callable[[], PreTrainedModel] | None = None, compute_metrics: Callable[[EvalPrediction], Dict] | None = None, callbacks: List[TrainerCallback] | None = None, optimizers: Tuple[Optimizer, LambdaLR] = (None, None), preprocess_logits_for_metrics: Callable[[Tensor, Tensor], Tensor] | None = None)[source]
+

Abstract class for the trainer of reranker.

+
+ +
+

Methods

+
+
+AbsRerankerTrainer.compute_loss(model, inputs, return_outputs=False, **kwargs)[source]
+

How the loss is computed by Trainer. By default, all models return the loss in the first element.

+

Subclass and override for custom behavior.

+
+
Parameters:
+
    +
  • model (AbsRerankerModel) – The model being trained.

  • +
  • inputs (dict) – A dictionary of input tensors to be passed to the model.

  • +
  • return_outputs (bool, optional) – If True, returns both the loss and the model’s outputs. Otherwise, +returns only the loss. Defaults to False.

  • +
+
+
Returns:
+

+
The computed loss. If return_outputs is True,

also returns the model’s outputs in a tuple (loss, outputs).

+
+
+

+
+
Return type:
+

Union[torch.Tensor, tuple(torch.Tensor, RerankerOutput)]

+
+
+
+ +
+
+
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/abc/inference.html b/API/abc/inference.html new file mode 100644 index 00000000..bcff12f5 --- /dev/null +++ b/API/abc/inference.html @@ -0,0 +1,551 @@ Inference - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+ + +
+
+ \ No newline at end of file diff --git a/API/abc/inference/AbsEmbedder.html b/API/abc/inference/AbsEmbedder.html new file mode 100644 index 00000000..bac08b8f --- /dev/null +++ b/API/abc/inference/AbsEmbedder.html @@ -0,0 +1,757 @@ AbsEmbedder - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

AbsEmbedder

+
+
+class FlagEmbedding.abc.inference.AbsEmbedder(model_name_or_path: str, normalize_embeddings: bool = True, use_fp16: bool = True, query_instruction_for_retrieval: str | None = None, query_instruction_format: str = '{}{}', devices: str | int | List[str] | List[int] | None = None, batch_size: int = 256, query_max_length: int = 512, passage_max_length: int = 512, convert_to_numpy: bool = True, **kwargs: Any)[source]
+

Base class for embedder. Extend this class and implement encode_queries(), encode_corpus(), encode() for custom embedders.

+
+
Parameters:
+
    +
  • model_name_or_path (str) – If it’s a path to a local model, it loads the model from the path. Otherwise tries to download and +load a model from HuggingFace Hub with the name.

  • +
  • normalize_embeddings (bool, optional) – If True, normalize the embedding vector. Defaults to True.

  • +
  • use_fp16 (bool, optional) – If true, use half-precision floating-point to speed up computation with a slight performance +degradation. Defaults to True.

  • +
  • query_instruction_for_retrieval – (Optional[str], optional): Query instruction for retrieval tasks, which will be used with query_instruction_format. Defaults to None.

  • +
  • query_instruction_format – (str, optional): The template for query_instruction_for_retrieval. Defaults to "{}{}".

  • +
  • devices (Optional[Union[str, int, List[str], List[int]]], optional) – Devices to use for model inference. Defaults to None.

  • +
  • batch_size (int, optional) – Batch size for inference. Defaults to 256.

  • +
  • query_max_length (int, optional) – Maximum length for query. Defaults to 512.

  • +
  • passage_max_length (int, optional) – Maximum length for passage. Defaults to 512.

  • +
  • convert_to_numpy (bool, optional) – If True, the output embedding will be a Numpy array. Otherwise, it will be a Torch Tensor. +Defaults to True.

  • +
  • kwargs (Dict[Any], optional) – Additional parameters for HuggingFace Transformers config or children classes.

  • +
+
+
+
+ +
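In practice these arguments are passed to a concrete subclass rather than to AbsEmbedder itself. A hedged usage sketch with FlagEmbedding's dense embedder (the checkpoint, instruction text, and device are example values):

from FlagEmbedding import FlagModel   # a concrete AbsEmbedder subclass

model = FlagModel(
    "BAAI/bge-large-en-v1.5",
    query_instruction_for_retrieval="Represent this sentence for searching relevant passages: ",
    use_fp16=True,
    devices=["cuda:0"],    # or None to let the embedder pick available devices
)

q_emb = model.encode_queries(["what is a large language model?"])
p_emb = model.encode_corpus(["A large language model is a neural network trained on text ..."])
scores = q_emb @ p_emb.T   # inner product; embeddings are normalized by default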
+

Methods

+
+
+static AbsEmbedder.get_target_devices(devices: str | int | List[str] | List[int]) List[str][source]
+
+
Parameters:
+

devices (Union[str, int, List[str], List[int]]) – specified devices, can be str, int, list of str, or list of int.

+
+
Raises:
+

ValueError – Devices should be a string or an integer or a list of strings or a list of integers.

+
+
Returns:
+

A list of target device strings.

+
+
Return type:
+

List[str]

+
+
+
+ +
+
+static AbsEmbedder.get_detailed_instruct(instruction_format: str, instruction: str, sentence: str)[source]
+

Combine the instruction and sentence along with the instruction format.

+
+
Parameters:
+
    +
  • instruction_format (str) – Format for instruction.

  • +
  • instruction (str) – The text of instruction.

  • +
  • sentence (str) – The sentence to concatenate with.

  • +
+
+
Returns:
+

The complete sentence with instruction

+
+
Return type:
+

str

+
+
+
+ +
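With the default “{}{}” template, the combination simply fills the two slots in order, instruction first. A small example (the instruction text is illustrative):

from FlagEmbedding.abc.inference import AbsEmbedder

detailed = AbsEmbedder.get_detailed_instruct(
    "{}{}",
    "Represent this sentence for searching relevant passages: ",
    "what is a large language model?",
)
# -> "Represent this sentence for searching relevant passages: what is a large language model?"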
+
+AbsEmbedder.encode_queries(queries: List[str] | str, batch_size: int | None = None, max_length: int | None = None, convert_to_numpy: bool | None = None, **kwargs: Any)[source]
+

encode the queries using the instruction if provided.

+
+
Parameters:
+
    +
  • queries (Union[List[str], str]) – Input queries to encode.

  • +
  • batch_size (Optional[int], optional) – Number of sentences for each iter. Defaults to None.

  • +
  • max_length (Optional[int], optional) – Maximum length of tokens. Defaults to None.

  • +
  • convert_to_numpy (Optional[bool], optional) – If True, the output embedding will be a Numpy array. Otherwise, it will +be a Torch Tensor. Defaults to None.

  • +
+
+
Returns:
+

Return the embedding vectors in a numpy array or tensor.

+
+
Return type:
+

Union[torch.Tensor, np.ndarray]

+
+
+
+ +
+
+AbsEmbedder.encode_corpus(corpus: List[str] | str, batch_size: int | None = None, max_length: int | None = None, convert_to_numpy: bool | None = None, **kwargs: Any)[source]
+

encode the corpus using the instruction if provided.

+
+
Parameters:
+
    +
  • corpus (Union[List[str], str]) – Input corpus to encode.

  • +
  • batch_size (Optional[int], optional) – Number of sentences for each iter. Defaults to None.

  • +
  • max_length (Optional[int], optional) – Maximum length of tokens. Defaults to None.

  • +
  • convert_to_numpy (Optional[bool], optional) – If True, the output embedding will be a Numpy array. Otherwise, it will +be a Torch Tensor. Defaults to None.

  • +
+
+
Returns:
+

Return the embedding vectors in a numpy array or tensor.

+
+
Return type:
+

Union[torch.Tensor, np.ndarray]

+
+
+
+ +
+
+AbsEmbedder.encode(sentences: List[str] | str, batch_size: int | None = None, max_length: int | None = None, convert_to_numpy: bool | None = None, instruction: str | None = None, instruction_format: str | None = None, **kwargs: Any)[source]
+

encode the input sentences with the embedding model.

+
+
Parameters:
+
    +
  • sentences (Union[List[str], str]) – Input sentences to encode.

  • +
  • batch_size (Optional[int], optional) – Number of sentences for each iter. Defaults to None.

  • +
  • max_length (Optional[int], optional) – Maximum length of tokens. Defaults to None.

  • +
  • convert_to_numpy (Optional[bool], optional) – If True, the output embedding will be a Numpy array. Otherwise, it will +be a Torch Tensor. Defaults to None.

  • +
  • instruction (Optional[str], optional) – The text of instruction. Defaults to None.

  • +
  • instruction_format (Optional[str], optional) – Format for instruction. Defaults to None.

  • +
+
+
Returns:
+

return the embedding vectors in a numpy array or tensor.

+
+
Return type:
+

Union[torch.Tensor, np.ndarray]

+
+
+
+ +
+
+abstract AbsEmbedder.encode_single_device(sentences: List[str] | str, batch_size: int = 256, max_length: int = 512, convert_to_numpy: bool = True, device: str | None = None, **kwargs: Any)[source]
+

This method should encode sentences and return embeddings on a single device.

+
+ +
+
+AbsEmbedder.start_multi_process_pool(process_target_func: Any) Dict[Literal['input', 'output', 'processes'], Any][source]
+

Starts a multi-process pool to process the encoding with several independent processes +via SentenceTransformer.encode_multi_process.

+

This method is recommended if you want to encode on multiple GPUs or CPUs. It is advised +to start only one process per GPU. This method works together with encode_multi_process +and stop_multi_process_pool.

+
+
Returns:
+

A dictionary with the target processes, an input queue, and an output queue.

+
+
Return type:
+

Dict[str, Any]

+
+
+
+ +
+
+static AbsEmbedder._encode_multi_process_worker(target_device: str, model: AbsEmbedder, input_queue: Queue, results_queue: Queue) None[source]
+

Internal worker process that encodes sentences in the multi-process setup.

+
+ +
+
+static AbsEmbedder.stop_multi_process_pool(pool: Dict[Literal['input', 'output', 'processes'], Any]) None[source]
+

Stops all processes started with start_multi_process_pool.

+
+
Parameters:
+

pool (Dict[str, object]) – A dictionary containing the input queue, output queue, and process list.

+
+
Returns:
+

None

+
+
+
+ +
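A rough sketch of the pool lifecycle using the methods above (in normal use encode() and encode_queries() manage this internally; the embedder instance and sentence list here are assumed to exist already):

# embedder: a concrete AbsEmbedder subclass instance; sentences: List[str]
pool = embedder.start_multi_process_pool(AbsEmbedder._encode_multi_process_worker)
try:
    embeddings = embedder.encode_multi_process(sentences, pool, batch_size=256)
finally:
    AbsEmbedder.stop_multi_process_pool(pool)   # terminate the worker processes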
+
+AbsEmbedder.encode_multi_process(sentences: List[str], pool: Dict[Literal['input', 'output', 'processes'], Any], **kwargs)[source]
+
+ +
+
+AbsEmbedder._concatenate_results_from_multi_process(results_list: List[Tensor | ndarray | Any])[source]
+

concatenate and return the results from all the processes

+
+
Parameters:
+

results_list (List[Union[torch.Tensor, np.ndarray, Any]]) – A list of results from all the processes.

+
+
Raises:
+

NotImplementedError – Unsupported type for results_list

+
+
Returns:
+

return the embedding vectors in a numpy array or tensor.

+
+
Return type:
+

Union[torch.Tensor, np.ndarray]

+
+
+
+ +
+
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/abc/inference/AbsReranker.html b/API/abc/inference/AbsReranker.html new file mode 100644 index 00000000..ab417b28 --- /dev/null +++ b/API/abc/inference/AbsReranker.html @@ -0,0 +1,699 @@ AbsReranker - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

AbsReranker

+
+
+class FlagEmbedding.abc.inference.AbsReranker(model_name_or_path: str, use_fp16: bool = False, query_instruction_for_rerank: str | None = None, query_instruction_format: str = '{}{}', passage_instruction_for_rerank: str | None = None, passage_instruction_format: str = '{}{}', devices: str | int | List[str] | List[int] | None = None, batch_size: int = 128, query_max_length: int | None = None, max_length: int = 512, normalize: bool = False, **kwargs: Any)[source]
+

Base class for Reranker. Extend this class and implement compute_score_single_gpu() for custom rerankers.

+
+
Parameters:
+
    +
  • model_name_or_path (str) – If it’s a path to a local model, it loads the model from the path. Otherwise tries to download and +load a model from HuggingFace Hub with the name.

  • +
  • use_fp16 (bool, optional) – If true, use half-precision floating-point to speed up computation with a slight performance +degradation. Defaults to False.

  • +
  • query_instruction_for_rerank – (Optional[str], optional): Query instruction for reranking, which will be used with query_instruction_format. Defaults to None.

  • +
  • query_instruction_format – (str, optional): The template for query_instruction_for_rerank. Defaults to "{}{}".

  • +
  • passage_instruction_for_rerank (Optional[str], optional) – Passage instruction for reranking. Defaults to None.

  • +
  • passage_instruction_format (str, optional) – Passage instruction format when using passage_instruction_for_rerank. +Defaults to "{}{}".

  • +
  • devices (Optional[Union[str, int, List[str], List[int]]], optional) – Devices to use for model inference. Defaults to None.

  • +
  • batch_size (int, optional) – Batch size for inference. Defaults to 128.

  • +
  • query_max_length (int, optional) – Maximum length for query. Defaults to None.

  • +
  • max_length (int, optional) – Maximum length. Defaults to 512.

  • +
  • normalize (bool, optional) – If true, normalize the result. Defaults to False.

  • +
  • kwargs (Dict[Any], optional) – Additional parameters for HuggingFace Transformers config or children classes.

  • +
+
+
+
+ +
+

Methods

+
+
+static AbsReranker.get_target_devices(devices: str | int | List[str] | List[int]) List[str][source]
+
+
Parameters:
+

devices (Union[str, int, List[str], List[int]]) – Specified devices, can be str, int, list of str, or list of int.

+
+
Raises:
+

ValueError – Devices should be a string or an integer or a list of strings or a list of integers.

+
+
Returns:
+

A list of target device strings.

+
+
Return type:
+

List[str]

+
+
+
+ +
+
+AbsReranker.get_detailed_instruct(instruction_format: str, instruction: str, sentence: str)[source]
+

Combine the instruction and sentence along with the instruction format.

+
+
Parameters:
+
    +
  • instruction_format (str) – Format for instruction.

  • +
  • instruction (str) – The text of instruction.

  • +
  • sentence (str) – The sentence to concatenate with.

  • +
+
+
Returns:
+

The complete sentence with instruction

+
+
Return type:
+

str

+
+
+
+ +
+
+AbsReranker.get_detailed_inputs(sentence_pairs: str | List[str])[source]
+

Get the detailed instruction for all the inputs.

+
+
Parameters:
+

sentence_pairs (Union[str, List[str]]) – Input sentence pairs

+
+
Returns:
+

The complete sentence pairs with instruction

+
+
Return type:
+

list[list[str]]

+
+
+
+ +
+
+AbsReranker.compute_score(sentence_pairs: List[Tuple[str, str]] | Tuple[str, str], **kwargs)[source]
+

Compute score for each sentence pair

+
+
Parameters:
+

sentence_pairs (Union[List[Tuple[str, str]], Tuple[str, str]]) – Input sentence pairs to compute.

+
+
Returns:
+

scores of all the sentence pairs.

+
+
Return type:
+

numpy.ndarray

+
+
+
+ +
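For example, with a concrete subclass such as FlagEmbedding's FlagReranker (the checkpoint and device are example values), compute_score takes (query, passage) pairs and returns one relevance score per pair:

from FlagEmbedding import FlagReranker   # a concrete AbsReranker subclass

reranker = FlagReranker("BAAI/bge-reranker-v2-m3", use_fp16=True, devices=["cuda:0"])

pairs = [
    ("what is panda?", "The giant panda is a bear species endemic to China."),
    ("what is panda?", "Paris is the capital of France."),
]
scores = reranker.compute_score(pairs)   # higher score means a more relevant pair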
+
+abstract AbsReranker.compute_score_single_gpu(sentence_pairs: List[Tuple[str, str]] | Tuple[str, str], batch_size: int = 256, query_max_length: int | None = None, max_length: int = 512, normalize: bool = False, device: str | None = None, **kwargs: Any)[source]
+

This method should compute the scores of sentence_pair and return scores.

+
+ +
+
+AbsReranker.start_multi_process_pool() Dict[Literal['input', 'output', 'processes'], Any][source]
+

Starts a multi-process pool to process the encoding with several independent processes +via SentenceTransformer.encode_multi_process.

+

This method is recommended if you want to encode on multiple GPUs or CPUs. It is advised +to start only one process per GPU. This method works together with encode_multi_process +and stop_multi_process_pool.

+
+
Returns:
+

A dictionary with the target processes, an input queue, and an output queue.

+
+
Return type:
+

Dict[str, Any]

+
+
+
+ +
+
+AbsReranker.encode_multi_process(sentence_pairs: List, pool: Dict[Literal['input', 'output', 'processes'], Any], **kwargs) ndarray[source]
+
+ +
+
+static AbsReranker._encode_multi_process_worker(target_device: str, model: AbsReranker, input_queue: Queue, results_queue: Queue) None[source]
+

Internal worker process that encodes sentence pairs in the multi-process setup.

+
+ +
+
+static AbsReranker.stop_multi_process_pool(pool: Dict[Literal['input', 'output', 'processes'], Any]) None[source]
+

Stops all processes started with start_multi_process_pool.

+
+
Parameters:
+

pool (Dict[str, object]) – A dictionary containing the input queue, output queue, and process list.

+
+
Returns:
+

None

+
+
+
+ +
+
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/evaluation.html b/API/evaluation.html new file mode 100644 index 00000000..286e2d36 --- /dev/null +++ b/API/evaluation.html @@ -0,0 +1,524 @@ Evaluation - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+ +
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/evaluation/airbench.html b/API/evaluation/airbench.html new file mode 100644 index 00000000..f1c705cf --- /dev/null +++ b/API/evaluation/airbench.html @@ -0,0 +1,543 @@ AIR-Bench - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

AIR-Bench

+

AIR-Bench (Automated heterogeneous Information Retrieval Benchmark) is a dynamic (actively updated) benchmark for information retrieval. The benchmark currently has two versions. Note that the testing data is generated by LLMs without human intervention, which makes it easier and faster to extend the evaluation to new domains and also makes it impossible for any model to have the test data covered in its training set.

+

You can evaluate a model’s performance on AIR-Bench by running our provided shell script:

+
chmod +x /examples/evaluation/air_bench/eval_air_bench.sh
+./examples/evaluation/air_bench/eval_air_bench.sh
+
+
+

Or by running:

+
python -m FlagEmbedding.evaluation.air_bench \
+--benchmark_version AIR-Bench_24.05 \
+--task_types qa long-doc \
+--domains arxiv \
+--languages en \
+--splits dev test \
+--output_dir ./air_bench/search_results \
+--search_top_k 1000 \
+--rerank_top_k 100 \
+--cache_dir /root/.cache/huggingface/hub \
+--overwrite False \
+--embedder_name_or_path BAAI/bge-m3 \
+--reranker_name_or_path BAAI/bge-reranker-v2-m3 \
+--devices cuda:0 cuda:1 \
+--model_cache_dir /root/.cache/huggingface/hub \
+--reranker_max_length 1024
+
+
+

Change the embedder, reranker, devices, and cache directory to your preference.

+
+
+
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/evaluation/airbench/arguments.html b/API/evaluation/airbench/arguments.html new file mode 100644 index 00000000..6584db24 --- /dev/null +++ b/API/evaluation/airbench/arguments.html @@ -0,0 +1,513 @@ arguments - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

arguments

+
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/evaluation/airbench/runner.html b/API/evaluation/airbench/runner.html new file mode 100644 index 00000000..dd88f869 --- /dev/null +++ b/API/evaluation/airbench/runner.html @@ -0,0 +1,513 @@ runner - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

runner

+
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/evaluation/beir.html b/API/evaluation/beir.html new file mode 100644 index 00000000..e089e2d9 --- /dev/null +++ b/API/evaluation/beir.html @@ -0,0 +1,546 @@ BEIR - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

BEIR

+

BEIR (Benchmarking-IR) is a heterogeneous evaluation benchmark for information retrieval. It is designed for evaluating the performance of NLP-based retrieval models and is widely used in research on modern embedding models.

+

You can evaluate a model’s performance on the BEIR benchmark by running our provided shell script:

+
chmod +x /examples/evaluation/beir/eval_beir.sh
+./examples/evaluation/beir/eval_beir.sh
+
+
+

Or by running:

+
python -m FlagEmbedding.evaluation.beir \
+--eval_name beir \
+--dataset_dir ./beir/data \
+--dataset_names fiqa arguana cqadupstack \
+--splits test dev \
+--corpus_embd_save_dir ./beir/corpus_embd \
+--output_dir ./beir/search_results \
+--search_top_k 1000 \
+--rerank_top_k 100 \
+--cache_path /root/.cache/huggingface/hub \
+--overwrite False \
+--k_values 10 100 \
+--eval_output_method markdown \
+--eval_output_path ./beir/beir_eval_results.md \
+--eval_metrics ndcg_at_10 recall_at_100 \
+--ignore_identical_ids True \
+--embedder_name_or_path BAAI/bge-large-en-v1.5 \
+--reranker_name_or_path BAAI/bge-reranker-v2-m3 \
+--devices cuda:0 cuda:1 \
+--reranker_max_length 1024 \
+
+
+

Change the embedder, devices, and cache directory to your preference.

+
+
+
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/evaluation/beir/arguments.html b/API/evaluation/beir/arguments.html new file mode 100644 index 00000000..0ef413d3 --- /dev/null +++ b/API/evaluation/beir/arguments.html @@ -0,0 +1,513 @@ arguments - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

arguments

+
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/evaluation/beir/data_loader.html b/API/evaluation/beir/data_loader.html new file mode 100644 index 00000000..dbf80efc --- /dev/null +++ b/API/evaluation/beir/data_loader.html @@ -0,0 +1,513 @@ data loader - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

data loader

+
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/evaluation/beir/evaluator.html b/API/evaluation/beir/evaluator.html new file mode 100644 index 00000000..9cab6ff6 --- /dev/null +++ b/API/evaluation/beir/evaluator.html @@ -0,0 +1,513 @@ evaluator - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

evaluator

+
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/evaluation/beir/runner.html b/API/evaluation/beir/runner.html new file mode 100644 index 00000000..76f9c3de --- /dev/null +++ b/API/evaluation/beir/runner.html @@ -0,0 +1,513 @@ runner - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

runner

+
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/evaluation/miracl.html b/API/evaluation/miracl.html new file mode 100644 index 00000000..1d27180a --- /dev/null +++ b/API/evaluation/miracl.html @@ -0,0 +1,549 @@ MIRACL - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

MIRACL

+

MIRACL (Multilingual Information Retrieval Across a Continuum of Languages) is a WSDM 2023 Cup challenge that focuses on search across 18 different languages. It releases a multilingual retrieval dataset containing train and dev sets for 16 “known languages” and only a dev set for 2 “surprise languages”. The topics are generated by native speakers of each language, who also label the relevance between the topics and a given document list. You can find the dataset on HuggingFace.

+

You can evaluate a model’s performance on MIRACL simply by running our provided shell script:

+
chmod +x /examples/evaluation/miracl/eval_miracl.sh
+./examples/evaluation/miracl/eval_miracl.sh
+
+
+

Or by running:

+
python -m FlagEmbedding.evaluation.miracl \
+--eval_name miracl \
+--dataset_dir ./miracl/data \
+--dataset_names bn hi sw te th yo \
+--splits dev \
+--corpus_embd_save_dir ./miracl/corpus_embd \
+--output_dir ./miracl/search_results \
+--search_top_k 1000 \
+--rerank_top_k 100 \
+--cache_path /root/.cache/huggingface/hub \
+--overwrite False \
+--k_values 10 100 \
+--eval_output_method markdown \
+--eval_output_path ./miracl/miracl_eval_results.md \
+--eval_metrics ndcg_at_10 recall_at_100 \
+--embedder_name_or_path BAAI/bge-m3 \
+--reranker_name_or_path BAAI/bge-reranker-v2-m3 \
+--devices cuda:0 cuda:1 \
+--cache_dir /root/.cache/huggingface/hub \
+--reranker_max_length 1024
+
+
+

Change the embedder, reranker, devices, and cache directory to your preference.

+
+
+
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/evaluation/miracl/data_loader.html b/API/evaluation/miracl/data_loader.html new file mode 100644 index 00000000..7cad120a --- /dev/null +++ b/API/evaluation/miracl/data_loader.html @@ -0,0 +1,535 @@ data_loader - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

data_loader

+
+

Methods

+
+
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/evaluation/miracl/runner.html b/API/evaluation/miracl/runner.html new file mode 100644 index 00000000..9d6968de --- /dev/null +++ b/API/evaluation/miracl/runner.html @@ -0,0 +1,513 @@ runner - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

runner

+
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/evaluation/mkqa.html b/API/evaluation/mkqa.html new file mode 100644 index 00000000..1addb402 --- /dev/null +++ b/API/evaluation/mkqa.html @@ -0,0 +1,586 @@ MKQA - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

MKQA

+

MKQA is an open-domain question answering evaluation set comprising 10k question-answer pairs aligned across 26 typologically diverse languages. The queries are sampled from the Google Natural Questions dataset (https://github.com/google-research-datasets/natural-questions).

+

Each example in the dataset has the following structure:

+
{
+    'example_id': 563260143484355911,
+    'queries': {
+        'en': "who sings i hear you knocking but you can't come in",
+        'ru': "кто поет i hear you knocking but you can't come in",
+        'ja': '「 I hear you knocking」は誰が歌っていますか',
+        'zh_cn': "《i hear you knocking but you can't come in》是谁演唱的",
+        ...
+    },
+    'query': "who sings i hear you knocking but you can't come in",
+    'answers': {
+        'en': [{
+            'type': 'entity',
+            'entity': 'Q545186',
+            'text': 'Dave Edmunds',
+            'aliases': [],
+        }],
+        'ru': [{
+            'type': 'entity',
+            'entity': 'Q545186',
+            'text': 'Эдмундс, Дэйв',
+            'aliases': ['Эдмундс', 'Дэйв Эдмундс', 'Эдмундс Дэйв', 'Dave Edmunds'],
+        }],
+        'ja': [{
+            'type': 'entity',
+            'entity': 'Q545186',
+            'text': 'デイヴ・エドモンズ',
+            'aliases': ['デーブ・エドモンズ', 'デイブ・エドモンズ'],
+        }],
+        'zh_cn': [{
+            'type': 'entity',
+            'text': '戴维·埃德蒙兹 ',
+            'entity': 'Q545186',
+        }],
+        ...
+    },
+}
+
+
+

You can evaluate a model’s performance on MKQA simply by running our provided shell script:

+
chmod +x /examples/evaluation/mkqa/eval_mkqa.sh
+./examples/evaluation/mkqa/eval_mkqa.sh
+
+
+

Or by running:

+
python -m FlagEmbedding.evaluation.mkqa \
+--eval_name mkqa \
+--dataset_dir ./mkqa/data \
+--dataset_names en zh_cn \
+--splits test \
+--corpus_embd_save_dir ./mkqa/corpus_embd \
+--output_dir ./mkqa/search_results \
+--search_top_k 1000 \
+--rerank_top_k 100 \
+--cache_path /root/.cache/huggingface/hub \
+--overwrite False \
+--k_values 20 \
+--eval_output_method markdown \
+--eval_output_path ./mkqa/mkqa_eval_results.md \
+--eval_metrics qa_recall_at_20 \
+--embedder_name_or_path BAAI/bge-m3 \
+--reranker_name_or_path BAAI/bge-reranker-v2-m3 \
+--devices cuda:0 cuda:1 \
+--cache_dir /root/.cache/huggingface/hub \
+--reranker_max_length 1024
+
+
+

Change the embedder, reranker, devices, and cache directory to your preference.

+
+
+
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/evaluation/mkqa/data_loader.html b/API/evaluation/mkqa/data_loader.html new file mode 100644 index 00000000..b8dba5ab --- /dev/null +++ b/API/evaluation/mkqa/data_loader.html @@ -0,0 +1,535 @@ data_loader - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

data_loader

+
+

Methods

+
+
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/evaluation/mkqa/evaluator.html b/API/evaluation/mkqa/evaluator.html new file mode 100644 index 00000000..d2973672 --- /dev/null +++ b/API/evaluation/mkqa/evaluator.html @@ -0,0 +1,513 @@ evaluator - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

evaluator

+
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/evaluation/mkqa/runner.html b/API/evaluation/mkqa/runner.html new file mode 100644 index 00000000..697cc453 --- /dev/null +++ b/API/evaluation/mkqa/runner.html @@ -0,0 +1,513 @@ runner - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

runner

+
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/evaluation/mldr.html b/API/evaluation/mldr.html new file mode 100644 index 00000000..b3942580 --- /dev/null +++ b/API/evaluation/mldr.html @@ -0,0 +1,590 @@ MLDR - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

MLDR

+

MLDR is a Multilingual Long-Document Retrieval dataset built on Wikipedia, Wudao and mC4, covering 13 typologically diverse languages. Specifically, we sample lengthy articles from the Wikipedia, Wudao and mC4 datasets and randomly choose paragraphs from them. Then we use GPT-3.5 to generate questions based on these paragraphs. The generated question and the sampled article constitute a new text pair in the dataset.

+

An example from the train set looks like:

+
{
+    'query_id': 'q-zh-<...>',
+    'query': '...',
+    'positive_passages': [
+        {
+            'docid': 'doc-zh-<...>',
+            'text': '...'
+        }
+    ],
+    'negative_passages': [
+        {
+            'docid': 'doc-zh-<...>',
+            'text': '...'
+        },
+        ...
+    ]
+}
+
+
+

An example from the dev and test sets looks like:

+
{
+    'query_id': 'q-zh-<...>',
+    'query': '...',
+    'positive_passages': [
+        {
+            'docid': 'doc-zh-<...>',
+            'text': '...'
+        }
+    ],
+    'negative_passages': []
+}
+
+
+

An example from the corpus looks like:

+
{
+    'docid': 'doc-zh-<...>',
+    'text': '...'
+}
+
+
+

You can evaluate a model’s performance on MLDR simply by running our provided shell script:

+
chmod +x /examples/evaluation/mldr/eval_mldr.sh
+./examples/evaluation/mldr/eval_mldr.sh
+
+
+

Or by running:

+
python -m FlagEmbedding.evaluation.mldr \
+--eval_name mldr \
+--dataset_dir ./mldr/data \
+--dataset_names hi \
+--splits test \
+--corpus_embd_save_dir ./mldr/corpus_embd \
+--output_dir ./mldr/search_results \
+--search_top_k 1000 \
+--rerank_top_k 100 \
+--cache_path /root/.cache/huggingface/hub \
+--overwrite False \
+--k_values 10 100 \
+--eval_output_method markdown \
+--eval_output_path ./mldr/mldr_eval_results.md \
+--eval_metrics ndcg_at_10 \
+--embedder_name_or_path BAAI/bge-m3 \
+--reranker_name_or_path BAAI/bge-reranker-v2-m3 \
+--devices cuda:0 cuda:1 \
+--cache_dir /root/.cache/huggingface/hub \
+--embedder_passage_max_length 8192 \
+--reranker_max_length 8192
+
+
+

Change the arguments for the embedder, reranker, devices, and cache directory to your preference.

+
+
+
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/evaluation/mldr/data_loader.html b/API/evaluation/mldr/data_loader.html new file mode 100644 index 00000000..1fc5d029 --- /dev/null +++ b/API/evaluation/mldr/data_loader.html @@ -0,0 +1,535 @@ data_loader - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

data_loader

+
+

Methods

+
+
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/evaluation/mldr/runner.html b/API/evaluation/mldr/runner.html new file mode 100644 index 00000000..d1d890e9 --- /dev/null +++ b/API/evaluation/mldr/runner.html @@ -0,0 +1,513 @@ runner - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

runner

+
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/evaluation/msmarco.html b/API/evaluation/msmarco.html new file mode 100644 index 00000000..ff95f721 --- /dev/null +++ b/API/evaluation/msmarco.html @@ -0,0 +1,546 @@ MSMARCO - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

MSMARCO

+

MS MARCO (Microsoft MAchine Reading COmprehension) is a large-scale, real-world reading comprehension dataset. It is widely used in information retrieval, question answering, and natural language processing research.

+

You can evaluate a model’s performance on MS MARCO simply by running our provided shell script:

+
chmod +x /examples/evaluation/msmarco/eval_msmarco.sh
+./examples/evaluation/msmarco/eval_msmarco.sh
+
+
+

Or by running:

+
python -m FlagEmbedding.evaluation.msmarco \
+--eval_name msmarco \
+--dataset_dir ./msmarco/data \
+--dataset_names passage \
+--splits dev \
+--corpus_embd_save_dir ./msmarco/corpus_embd \
+--output_dir ./msmarco/search_results \
+--search_top_k 1000 \
+--rerank_top_k 100 \
+--cache_path /root/.cache/huggingface/hub \
+--overwrite True \
+--k_values 10 100 \
+--eval_output_method markdown \
+--eval_output_path ./msmarco/msmarco_eval_results.md \
+--eval_metrics ndcg_at_10 recall_at_100 \
+--embedder_name_or_path BAAI/bge-large-en-v1.5 \
+--reranker_name_or_path BAAI/bge-reranker-v2-m3 \
+--devices cuda:0 cuda:1 cuda:2 cuda:3 cuda:4 cuda:5 cuda:6 cuda:7 \
+--cache_dir /root/.cache/huggingface/hub \
+--reranker_max_length 1024
+
+
+

Change the embedder, reranker, devices, and cache directory to your preference.

+
+
+
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/evaluation/msmarco/data_loader.html b/API/evaluation/msmarco/data_loader.html new file mode 100644 index 00000000..9c73dfdb --- /dev/null +++ b/API/evaluation/msmarco/data_loader.html @@ -0,0 +1,535 @@ data_loader - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

data_loader

+
+

Methods

+
+
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/evaluation/msmarco/runner.html b/API/evaluation/msmarco/runner.html new file mode 100644 index 00000000..41445105 --- /dev/null +++ b/API/evaluation/msmarco/runner.html @@ -0,0 +1,513 @@ runner - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

runner

+
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/evaluation/mteb.html b/API/evaluation/mteb.html new file mode 100644 index 00000000..44c2e6c8 --- /dev/null +++ b/API/evaluation/mteb.html @@ -0,0 +1,537 @@ + MTEB - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

MTEB

+

MTEB (Massive Text Embedding Benchmark) is a large-scale evaluation framework designed to assess the performance of text embedding models across a wide variety of NLP tasks. +Introduced to standardize and improve the evaluation of text embeddings, MTEB is crucial for assessing how well these models generalize across various real-world applications. +It contains a wide range of datasets covering eight main NLP task types and many languages, and provides an easy pipeline for evaluation. +It also hosts the well-known MTEB leaderboard, which ranks the latest top-performing embedding models.

+

You can evaluate a model’s performance on the whole MTEB benchmark by running our provided shell script:

+
chmod +x ./examples/evaluation/mteb/eval_mteb.sh
+./examples/evaluation/mteb/eval_mteb.sh
+
+
+

Or by running:

+
python -m FlagEmbedding.evaluation.mteb \
+--eval_name mteb \
+--output_dir ./mteb/search_results \
+--languages eng \
+--tasks NFCorpus BiorxivClusteringS2S SciDocsRR \
+--eval_output_path ./mteb/mteb_eval_results.json \
+--embedder_name_or_path BAAI/bge-large-en-v1.5 \
+--devices cuda:7 \
+--cache_dir /root/.cache/huggingface/hub
+
+
+

Change the embedder, devices, and cache directory to your preference.

+
+
+
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/evaluation/mteb/arguments.html b/API/evaluation/mteb/arguments.html new file mode 100644 index 00000000..fb568ac9 --- /dev/null +++ b/API/evaluation/mteb/arguments.html @@ -0,0 +1,513 @@ + arguments - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

arguments

+
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/evaluation/mteb/runner.html b/API/evaluation/mteb/runner.html new file mode 100644 index 00000000..d079d64c --- /dev/null +++ b/API/evaluation/mteb/runner.html @@ -0,0 +1,513 @@ + runner - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

runner

+
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/evaluation/mteb/searcher.html b/API/evaluation/mteb/searcher.html new file mode 100644 index 00000000..3154e136 --- /dev/null +++ b/API/evaluation/mteb/searcher.html @@ -0,0 +1,513 @@ + searcher - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

searcher

+
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/finetune.html b/API/finetune.html new file mode 100644 index 00000000..5442721d --- /dev/null +++ b/API/finetune.html @@ -0,0 +1,787 @@ + Finetune - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

Finetune

+
+ +
+
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/finetune/embedder.html b/API/finetune/embedder.html new file mode 100644 index 00000000..df897ef4 --- /dev/null +++ b/API/finetune/embedder.html @@ -0,0 +1,688 @@ + Embedder - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

Embedder

+
+ +
+
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/finetune/embedder/decoder_only.html b/API/finetune/embedder/decoder_only.html new file mode 100644 index 00000000..c689be5a --- /dev/null +++ b/API/finetune/embedder/decoder_only.html @@ -0,0 +1,597 @@ + Decoder Only - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

Decoder Only

+ +
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/finetune/embedder/decoder_only/base.html b/API/finetune/embedder/decoder_only/base.html new file mode 100644 index 00000000..5eaf7887 --- /dev/null +++ b/API/finetune/embedder/decoder_only/base.html @@ -0,0 +1,542 @@ + Base - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+ + +
+
+ \ No newline at end of file diff --git a/API/finetune/embedder/decoder_only/base/arguments.html b/API/finetune/embedder/decoder_only/base/arguments.html new file mode 100644 index 00000000..835f0ad4 --- /dev/null +++ b/API/finetune/embedder/decoder_only/base/arguments.html @@ -0,0 +1,538 @@ + Arguments - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

Arguments

+
+
+class FlagEmbedding.finetune.embedder.decoder_only.base.DecoderOnlyEmbedderModelArguments(model_name_or_path: str, config_name: str | None = None, tokenizer_name: str | None = None, cache_dir: str | None = None, trust_remote_code: bool = False, token: str = <factory>, peft_model_path: str = '', use_lora: bool = True, lora_rank: int = 64, lora_alpha: float = 16, lora_dropout: float = 0.1, target_modules: ~typing.List[str] = <factory>, use_flash_attn: bool = False, use_slow_tokenizer: bool = False, from_peft: str | None = None, modules_to_save: str | None = None, raw_peft: str | None = None, additional_special_tokens: str | None = None, save_merged_lora_model: bool = False)[source]
+

Model argument class for decoder only base model.

+
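As a minimal sketch of how this argument class might be filled in (the checkpoint name is a placeholder and every field not shown keeps the default listed above):

from FlagEmbedding.finetune.embedder.decoder_only.base import DecoderOnlyEmbedderModelArguments

# LoRA finetuning of a decoder-only backbone; unspecified fields keep their documented defaults
model_args = DecoderOnlyEmbedderModelArguments(
    model_name_or_path="BAAI/bge-multilingual-gemma2",  # placeholder checkpoint
    use_lora=True,
    lora_rank=64,
    lora_alpha=16,
    lora_dropout=0.1,
)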
+ +
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/finetune/embedder/decoder_only/base/modeling.html b/API/finetune/embedder/decoder_only/base/modeling.html new file mode 100644 index 00000000..62f90894 --- /dev/null +++ b/API/finetune/embedder/decoder_only/base/modeling.html @@ -0,0 +1,571 @@ + Modeling - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

Modeling

+
+
+class FlagEmbedding.finetune.reranker.decoder_only.base.CrossDecoderModel(base_model: PreTrainedModel, tokenizer: AutoTokenizer | None = None, train_batch_size: int = 4)[source]
+

Model class for decoder only reranker.

+
+
Parameters:
+
    +
  • base_model (PreTrainedModel) – The underlying pre-trained model used for encoding and scoring input pairs.

  • +
  • tokenizer (AutoTokenizer, optional) – The tokenizer for encoding input text. Defaults to None.

  • +
  • train_batch_size (int, optional) – The batch size to use. Defaults to 4.

  • +
+
+
+
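A rough construction sketch, assuming the backbone is a causal (decoder-only) language model loaded with transformers; the checkpoint name below is a placeholder, not something this page prescribes:

from transformers import AutoModelForCausalLM, AutoTokenizer
from FlagEmbedding.finetune.reranker.decoder_only.base import CrossDecoderModel

backbone = AutoModelForCausalLM.from_pretrained("gpt2")   # placeholder decoder-only checkpoint
tokenizer = AutoTokenizer.from_pretrained("gpt2")
reranker_model = CrossDecoderModel(
    base_model=backbone,
    tokenizer=tokenizer,
    train_batch_size=4,   # documented default
)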
+ +
+

Methods

+
+
+CrossDecoderModel.encode(features)[source]
+

Encodes input features to logits.

+
+
Parameters:
+

features (dict) – Dictionary with input features.

+
+
Returns:
+

The logits output from the model.

+
+
Return type:
+

torch.Tensor

+
+
+
+ +
+
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/finetune/embedder/decoder_only/base/runner.html b/API/finetune/embedder/decoder_only/base/runner.html new file mode 100644 index 00000000..7b4971e2 --- /dev/null +++ b/API/finetune/embedder/decoder_only/base/runner.html @@ -0,0 +1,586 @@ + Runner - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

Runner

+
+
+class FlagEmbedding.finetune.reranker.decoder_only.base.DecoderOnlyRerankerRunner(model_args: RerankerModelArguments, data_args: AbsRerankerDataArguments, training_args: AbsRerankerTrainingArguments)[source]
+

Decoder only reranker runner for finetuning.

+
+
Parameters:
+
+
+
+
+
+load_tokenizer_and_model() Tuple[PreTrainedTokenizer, AbsRerankerModel][source]
+

Load the tokenizer and model.

+
+
Returns:
+

Tokenizer and model instances.

+
+
Return type:
+

Tuple[PreTrainedTokenizer, AbsRerankerModel]

+
+
+
+ +
+
+load_trainer() DecoderOnlyRerankerTrainer[source]
+

Load the trainer.

+
+
Returns:
+

Loaded trainer instance.

+
+
Return type:
+

DecoderOnlyRerankerTrainer

+
+
+
+ +
+
+run()[source]
+

Run the finetuning.

+
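Tying the pieces together, a hedged usage sketch; it assumes model_args, data_args and training_args have already been built elsewhere (for example, parsed from the command line into the three dataclasses named in the constructor signature):

from FlagEmbedding.finetune.reranker.decoder_only.base import DecoderOnlyRerankerRunner

# model_args: RerankerModelArguments, data_args: AbsRerankerDataArguments,
# training_args: AbsRerankerTrainingArguments -- assumed to be populated already
runner = DecoderOnlyRerankerRunner(
    model_args=model_args,
    data_args=data_args,
    training_args=training_args,
)
runner.run()  # loads the tokenizer, model and trainer, then starts finetuning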
+ +
+ +
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/finetune/embedder/decoder_only/base/trainer.html b/API/finetune/embedder/decoder_only/base/trainer.html new file mode 100644 index 00000000..5319e205 --- /dev/null +++ b/API/finetune/embedder/decoder_only/base/trainer.html @@ -0,0 +1,538 @@ + Trainer - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

Trainer

+
+
+class FlagEmbedding.finetune.reranker.decoder_only.base.DecoderOnlyRerankerTrainer(model: PreTrainedModel | Module | None = None, args: TrainingArguments | None = None, data_collator: DataCollator | None = None, train_dataset: Dataset | IterableDataset | Dataset | None = None, eval_dataset: Dataset | Dict[str, Dataset] | Dataset | None = None, tokenizer: PreTrainedTokenizerBase | None = None, model_init: Callable[[], PreTrainedModel] | None = None, compute_metrics: Callable[[EvalPrediction], Dict] | None = None, callbacks: List[TrainerCallback] | None = None, optimizers: Tuple[Optimizer, LambdaLR] = (None, None), preprocess_logits_for_metrics: Callable[[Tensor, Tensor], Tensor] | None = None)[source]
+

Trainer class for decoder only base reranker models.

+
+ +
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/finetune/embedder/decoder_only/icl.html b/API/finetune/embedder/decoder_only/icl.html new file mode 100644 index 00000000..b500911c --- /dev/null +++ b/API/finetune/embedder/decoder_only/icl.html @@ -0,0 +1,566 @@ + ICL - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+ + +
+
+ \ No newline at end of file diff --git a/API/finetune/embedder/decoder_only/icl/arguments.html b/API/finetune/embedder/decoder_only/icl/arguments.html new file mode 100644 index 00000000..af53e361 --- /dev/null +++ b/API/finetune/embedder/decoder_only/icl/arguments.html @@ -0,0 +1,545 @@ + Arguments - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

Arguments

+
+
+class FlagEmbedding.finetune.embedder.decoder_only.icl.DecoderOnlyEmbedderICLModelArguments(model_name_or_path: str, config_name: str | None = None, tokenizer_name: str | None = None, cache_dir: str | None = None, trust_remote_code: bool = False, token: str = <factory>, peft_model_path: str = '', use_lora: bool = True, lora_rank: int = 64, lora_alpha: float = 16, lora_dropout: float = 0.1, target_modules: ~typing.List[str] = <factory>, use_flash_attn: bool = False, use_slow_tokenizer: bool = False, from_peft: str | None = None, modules_to_save: str | None = None, raw_peft: str | None = None, additional_special_tokens: str | None = None, save_merged_lora_model: bool = False)[source]
+

Model argument class for decoder only icl model.

+
+ +
+
+class FlagEmbedding.finetune.embedder.decoder_only.icl.DecoderOnlyEmbedderICLDataArguments(train_data: str | None = None, cache_path: str | None = None, train_group_size: int = 8, query_max_len: int = 32, passage_max_len: int = 128, pad_to_multiple_of: int | None = None, max_example_num_per_dataset: int = 100000000, query_instruction_for_retrieval: str | None = None, query_instruction_format: str = '{}{}', knowledge_distillation: bool = False, passage_instruction_for_retrieval: str | None = None, passage_instruction_format: str | None = '{}{}', shuffle_ratio: float = 0.0, same_dataset_within_batch: bool = False, small_threshold: int = 0, drop_threshold: int = 0, example_query_max_len: int = 64, example_passage_max_len: int = 96, retrieval_use_examples: bool = True, icl_suffix_str: str = '\nResponse:')[source]
+

Data argument class for decoder only icl model.

+
+ +
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/finetune/embedder/decoder_only/icl/dataset.html b/API/finetune/embedder/decoder_only/icl/dataset.html new file mode 100644 index 00000000..390dcfb9 --- /dev/null +++ b/API/finetune/embedder/decoder_only/icl/dataset.html @@ -0,0 +1,598 @@ + Dataset - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

Dataset

+
+

DecoderOnlyEmbedderICLSameDatasetTrainDataset

+
+
+class FlagEmbedding.finetune.embedder.decoder_only.icl.DecoderOnlyEmbedderICLSameDatasetTrainDataset(args: DecoderOnlyEmbedderICLDataArguments, default_batch_size: int, seed: int, tokenizer: PreTrainedTokenizer, process_index: int = 0, num_processes: int = 1)[source]
+

Dataset class for icl model.

+
+
Parameters:
+
    +
  • args (DecoderOnlyEmbedderICLDataArguments) – Data argument class for icl model.

  • +
  • default_batch_size (int) – The default batch size.

  • +
  • seed (int) – Random seed to use.

  • +
  • tokenizer (PreTrainedTokenizer) – Tokenizer.

  • +
  • process_index (int, optional) – Current process index. Defaults to 0.

  • +
  • num_processes (int, optional) – Total number of processes. Defaults to 1.

  • +
+
+
+
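A minimal construction sketch, assuming a local JSONL training file and an ICL-capable tokenizer checkpoint (both names below are placeholders):

from transformers import AutoTokenizer
from FlagEmbedding.finetune.embedder.decoder_only.icl import (
    DecoderOnlyEmbedderICLDataArguments,
    DecoderOnlyEmbedderICLSameDatasetTrainDataset,
)

data_args = DecoderOnlyEmbedderICLDataArguments(train_data="./finetune_data.jsonl")  # placeholder path
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-en-icl")                          # placeholder checkpoint
train_dataset = DecoderOnlyEmbedderICLSameDatasetTrainDataset(
    args=data_args,
    default_batch_size=8,
    seed=42,
    tokenizer=tokenizer,
    process_index=0,   # single-process setup
    num_processes=1,
)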
+ +
+

Methods

+
+
+DecoderOnlyEmbedderICLSameDatasetTrainDataset._create_batch_data(batch_raw_data)[source]
+

Create a complete batch of data with queries, documents, and teacher scores.

+
+
Parameters:
+

batch_raw_data (datasets.Dataset) – One batch of raw data.

+
+
Returns:
+

Queries with instruction format. +List[str]: Documents with instruction format. +List[float]: Teacher scores for model distillation.

+
+
Return type:
+

List[str]

+
+
+
+ +
+
+
+

AbsEmbedderSameDatasetCollator

+
+
+class FlagEmbedding.finetune.embedder.decoder_only.icl.AbsEmbedderSameDatasetCollator(tokenizer: PreTrainedTokenizerBase, padding: bool | str | PaddingStrategy = True, max_length: int | None = None, pad_to_multiple_of: int | None = None, return_tensors: str = 'pt', query_max_len: int = 32, passage_max_len: int = 128, sub_batch_size: int = -1)[source]
+

EmbedCollator for SameDataset. +Note that when using this collator, the training_args should be set as:

+

training_args.per_device_train_batch_size = 1

+

training_args.dataloader_num_workers = 0    # avoid multi-processing

+
+ +
+
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/finetune/embedder/decoder_only/icl/modeling.html b/API/finetune/embedder/decoder_only/icl/modeling.html new file mode 100644 index 00000000..2ef58932 --- /dev/null +++ b/API/finetune/embedder/decoder_only/icl/modeling.html @@ -0,0 +1,690 @@ + Modeling - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

Modeling

+
+
+class FlagEmbedding.finetune.embedder.decoder_only.icl.BiDecoderOnlyEmbedderICLModel(base_model: AutoModel, tokenizer: AutoTokenizer | None = None, negatives_cross_device: bool = False, temperature: float = 1.0, sub_batch_size: int = -1, kd_loss_type: str = 'kl_div', sentence_pooling_method: str = 'last_token', normalize_embeddings: bool = False)[source]
+

Embedder model class for decoder only model.

+
+
Parameters:
+
    +
  • base_model (AutoModel) – The base model to train on.

  • +
  • tokenizer (AutoTokenizer, optional) – The tokenizer to use. Defaults to None.

  • +
  • negatives_cross_device (bool, optional) – If True, will compute cross devices negative loss. Defaults to False.

  • +
  • temperature (float, optional) – Temperature to control the scale of scores. Defaults to 1.0.

  • +
  • sub_batch_size (int, optional) – Sub-batch size during encoding. If negative, will not split to sub-batch. +Defaults to -1.

  • +
  • kd_loss_type (str, optional) – Type of knowledge distillation loss. Defaults to 'kl_div'.

  • +
  • sentence_pooling_method (str, optional) – Pooling method to get sentence embedding. Defaults to 'last_token'.

  • +
  • normalize_embeddings (bool, optional) – If True, normalize the embedding vector. Defaults to False.

  • +
+
+
+
+ +
+

Methods

+
+
+BiDecoderOnlyEmbedderICLModel.encode(features)[source]
+

Encode and get the embedding.

+
+
Parameters:
+

features (Union[list, dict]) – Features feed to the model.

+
+
Returns:
+

The embedding vectors.

+
+
Return type:
+

torch.Tensor

+
+
+
+ +
+
+BiDecoderOnlyEmbedderICLModel.compute_score(q_reps, p_reps)[source]
+

Computes the scores between query and passage representations.

+
+
Parameters:
+
    +
  • q_reps (torch.Tensor) – Query representations.

  • +
  • p_reps (torch.Tensor) – Passage representations.

  • +
+
+
Returns:
+

The computed scores, adjusted by temperature.

+
+
Return type:
+

torch.Tensor

+
+
+
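Reading encode(), compute_score() and _compute_similarity() together, the scoring amounts to the following conceptual sketch (not the library’s exact code): query and passage embeddings are compared by inner product, and the result is scaled by the temperature.

import torch

# q_reps: [num_queries, dim], p_reps: [num_passages, dim], as produced by encode()
q_reps = torch.randn(4, 1024)
p_reps = torch.randn(32, 1024)
temperature = 1.0

similarity = q_reps @ p_reps.T      # inner-product similarity matrix
scores = similarity / temperature   # temperature-scaled scores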
+ +
+
+BiDecoderOnlyEmbedderICLModel.compute_loss(scores, target)[source]
+

Compute the loss using cross entropy.

+
+
Parameters:
+
    +
  • scores (torch.Tensor) – Computed score.

  • +
  • target (torch.Tensor) – The target value.

  • +
+
+
Returns:
+

The computed cross entropy loss.

+
+
Return type:
+

torch.Tensor

+
+
+
+ +
+
+BiDecoderOnlyEmbedderICLModel.gradient_checkpointing_enable(**kwargs)[source]
+

Activates gradient checkpointing for the current model.

+
+ +
+
+BiDecoderOnlyEmbedderICLModel.enable_input_require_grads(**kwargs)[source]
+

Enables the gradients for the input embeddings.

+
+ +
+
+BiDecoderOnlyEmbedderICLModel.save(output_dir: str)[source]
+

Save the model to the directory.

+
+
Parameters:
+

output_dir (str) – Directory for saving the model.

+
+
+
+ +
+
+BiDecoderOnlyEmbedderICLModel._sentence_embedding(last_hidden_state, attention_mask)[source]
+

Use the pooling method to get the sentence embedding.

+
+
Parameters:
+
    +
  • last_hidden_state (torch.Tensor) – The model output’s last hidden state.

  • +
  • attention_mask (torch.Tensor) – Mask out padding tokens during pooling.

  • +
+
+
Raises:
+

NotImplementedError – Specified pooling method not implemented.

+
+
Returns:
+

The sentence embeddings.

+
+
Return type:
+

torch.Tensor

+
+
+
+ +
+
+BiDecoderOnlyEmbedderICLModel._compute_similarity(q_reps, p_reps)[source]
+

Computes the similarity between query and passage representations using inner product.

+
+
Parameters:
+
    +
  • q_reps (torch.Tensor) – Query representations.

  • +
  • p_reps (torch.Tensor) – Passage representations.

  • +
+
+
Returns:
+

The computed similarity matrix.

+
+
Return type:
+

torch.Tensor

+
+
+
+ +
+
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/finetune/embedder/decoder_only/icl/runner.html b/API/finetune/embedder/decoder_only/icl/runner.html new file mode 100644 index 00000000..7322cb25 --- /dev/null +++ b/API/finetune/embedder/decoder_only/icl/runner.html @@ -0,0 +1,604 @@ + Runner - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

Runner

+
+
+class FlagEmbedding.finetune.embedder.decoder_only.icl.DecoderOnlyEmbedderICLRunner(model_args: DecoderOnlyEmbedderICLModelArguments, data_args: DecoderOnlyEmbedderICLDataArguments, training_args: AbsEmbedderTrainingArguments)[source]
+

Runner class for decoder only icl model.

+
+
Parameters:
+
+
+
+
+
+load_tokenizer_and_model() Tuple[PreTrainedTokenizer, AbsEmbedderModel][source]
+

Load tokenizer and model.

+
+
Returns:
+

Tokenizer and model instances.

+
+
Return type:
+

Tuple[PreTrainedTokenizer, AbsEmbedderModel]

+
+
+
+ +
+
+load_train_dataset() DecoderOnlyEmbedderICLSameDatasetTrainDataset[source]
+

Load the dataset instance for training.

+
+
Raises:
+

NotImplementedError – Only same_dataset_within_batch is supported for DecoderOnlyEmbedderICLRunner.

+
+
Returns:
+

The dataset instance.

+
+
Return type:
+

DecoderOnlyEmbedderICLSameDatasetTrainDataset

+
+
+
+ +
+
+load_trainer() DecoderOnlyEmbedderICLTrainer[source]
+

Load the trainer.

+
+
Returns:
+

Loaded trainer instance.

+
+
Return type:
+

DecoderOnlyEmbedderICLTrainer

+
+
+
+ +
+
+run()[source]
+

Run the finetuning.

+
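A hedged end-to-end sketch; the three argument objects are assumed to be built elsewhere, and note that, per load_train_dataset() above, same_dataset_within_batch must be enabled for this runner:

from FlagEmbedding.finetune.embedder.decoder_only.icl import DecoderOnlyEmbedderICLRunner

# model_args, data_args, training_args are assumed to be populated already;
# data_args.same_dataset_within_batch must be True for this runner
runner = DecoderOnlyEmbedderICLRunner(
    model_args=model_args,
    data_args=data_args,
    training_args=training_args,
)
runner.run()  # load tokenizer/model, dataset and trainer, then finetune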
+ +
+ +
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/finetune/embedder/decoder_only/icl/trainer.html b/API/finetune/embedder/decoder_only/icl/trainer.html new file mode 100644 index 00000000..f0ae64ec --- /dev/null +++ b/API/finetune/embedder/decoder_only/icl/trainer.html @@ -0,0 +1,538 @@ + Trainer - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

Trainer

+
+
+class FlagEmbedding.finetune.embedder.decoder_only.icl.DecoderOnlyEmbedderICLTrainer(model: PreTrainedModel | Module | None = None, args: TrainingArguments | None = None, data_collator: DataCollator | None = None, train_dataset: Dataset | IterableDataset | Dataset | None = None, eval_dataset: Dataset | Dict[str, Dataset] | Dataset | None = None, tokenizer: PreTrainedTokenizerBase | None = None, model_init: Callable[[], PreTrainedModel] | None = None, compute_metrics: Callable[[EvalPrediction], Dict] | None = None, callbacks: List[TrainerCallback] | None = None, optimizers: Tuple[Optimizer, LambdaLR] = (None, None), preprocess_logits_for_metrics: Callable[[Tensor, Tensor], Tensor] | None = None)[source]
+

Trainer class for decoder only ICL embedder models.

+
+ +
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/finetune/embedder/encoder_only.html b/API/finetune/embedder/encoder_only.html new file mode 100644 index 00000000..31decb00 --- /dev/null +++ b/API/finetune/embedder/encoder_only.html @@ -0,0 +1,602 @@ + Encoder Only - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

Encoder Only

+
+ +
+
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/finetune/embedder/encoder_only/base.html b/API/finetune/embedder/encoder_only/base.html new file mode 100644 index 00000000..af7ead73 --- /dev/null +++ b/API/finetune/embedder/encoder_only/base.html @@ -0,0 +1,544 @@ + Base - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+ + +
+
+ \ No newline at end of file diff --git a/API/finetune/embedder/encoder_only/base/modeling.html b/API/finetune/embedder/encoder_only/base/modeling.html new file mode 100644 index 00000000..68dfee8a --- /dev/null +++ b/API/finetune/embedder/encoder_only/base/modeling.html @@ -0,0 +1,690 @@ + Modeling - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

Modeling

+
+
+class FlagEmbedding.finetune.embedder.encoder_only.base.BiEncoderOnlyEmbedderModel(base_model: AutoModel, tokenizer: AutoTokenizer | None = None, negatives_cross_device: bool = False, temperature: float = 1.0, sub_batch_size: int = -1, kd_loss_type: str = 'kl_div', sentence_pooling_method: str = 'cls', normalize_embeddings: bool = False)[source]
+

Embedder class for encoder only model.

+
+
Parameters:
+
    +
  • base_model (AutoModel) – The base model to train on.

  • +
  • tokenizer (AutoTokenizer, optional) – The tokenizer to use. Defaults to None.

  • +
  • negatives_cross_device (bool, optional) – If True, will compute cross devices negative loss. Defaults to False.

  • +
  • temperature (float, optional) – Temperature to control the scale of scores. Defaults to 1.0.

  • +
  • sub_batch_size (int, optional) – Sub-batch size during encoding. If negative, will not split to sub-batch. +Defaults to -1.

  • +
  • kd_loss_type (str, optional) – Type of knowledge distillation loss. Defaults to "kl_div".

  • +
  • sentence_pooling_method (str, optional) – Pooling method to get sentence embedding. Defaults to 'cls'.

  • +
  • normalize_embeddings (bool, optional) – If True, normalize the embedding vector. Defaults to False.

  • +
+
+
+
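As a rough instantiation sketch (the checkpoint name is a placeholder; the keyword values simply mirror the options documented above):

from transformers import AutoModel, AutoTokenizer
from FlagEmbedding.finetune.embedder.encoder_only.base import BiEncoderOnlyEmbedderModel

backbone = AutoModel.from_pretrained("BAAI/bge-base-en-v1.5")   # placeholder checkpoint
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-base-en-v1.5")
model = BiEncoderOnlyEmbedderModel(
    base_model=backbone,
    tokenizer=tokenizer,
    sentence_pooling_method="cls",   # CLS pooling, the documented default
    normalize_embeddings=True,
)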
+ +
+

Methods

+
+
+BiEncoderOnlyEmbedderModel.encode(features)[source]
+

Encode and get the embedding.

+
+
Parameters:
+

features (Union[list, dict]) – Features feed to the model.

+
+
Returns:
+

The embedding vectors.

+
+
Return type:
+

torch.Tensor

+
+
+
+ +
+
+BiEncoderOnlyEmbedderModel.compute_score(q_reps, p_reps)[source]
+

Computes the scores between query and passage representations.

+
+
Parameters:
+
    +
  • q_reps (torch.Tensor) – Query representations.

  • +
  • p_reps (torch.Tensor) – Passage representations.

  • +
+
+
Returns:
+

The computed scores, adjusted by temperature.

+
+
Return type:
+

torch.Tensor

+
+
+
+ +
+
+BiEncoderOnlyEmbedderModel.compute_loss(scores, target)[source]
+

Compute the loss using cross entropy.

+
+
Parameters:
+
    +
  • scores (torch.Tensor) – Computed score.

  • +
  • target (torch.Tensor) – The target value.

  • +
+
+
Returns:
+

The computed cross entropy loss.

+
+
Return type:
+

torch.Tensor

+
+
+
+ +
+
+BiEncoderOnlyEmbedderModel.gradient_checkpointing_enable(**kwargs)[source]
+

Activates gradient checkpointing for the current model.

+
+ +
+
+BiEncoderOnlyEmbedderModel.enable_input_require_grads(**kwargs)[source]
+

Enables the gradients for the input embeddings.

+
+ +
+
+BiEncoderOnlyEmbedderModel.save(output_dir: str)[source]
+

Save the model to the directory.

+
+
Parameters:
+

output_dir (str) – Directory for saving the model.

+
+
+
+ +
+
+BiEncoderOnlyEmbedderModel._sentence_embedding(last_hidden_state, attention_mask)[source]
+

Use the pooling method to get the sentence embedding.

+
+
Parameters:
+
    +
  • last_hidden_state (torch.Tensor) – The model output’s last hidden state.

  • +
  • attention_mask (torch.Tensor) – Mask out padding tokens during pooling.

  • +
+
+
Raises:
+

NotImplementedError – Specified pooling method not implemented.

+
+
Returns:
+

The sentence embeddings.

+
+
Return type:
+

torch.Tensor

+
+
+
+ +
+
+BiEncoderOnlyEmbedderModel._compute_similarity(q_reps, p_reps)[source]
+

Computes the similarity between query and passage representations using inner product.

+
+
Parameters:
+
    +
  • q_reps (torch.Tensor) – Query representations.

  • +
  • p_reps (torch.Tensor) – Passage representations.

  • +
+
+
Returns:
+

The computed similarity matrix.

+
+
Return type:
+

torch.Tensor

+
+
+
+ +
+
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/finetune/embedder/encoder_only/base/runner.html b/API/finetune/embedder/encoder_only/base/runner.html new file mode 100644 index 00000000..23abda9d --- /dev/null +++ b/API/finetune/embedder/encoder_only/base/runner.html @@ -0,0 +1,570 @@ + Runner - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

Runner

+
+
+class FlagEmbedding.finetune.embedder.encoder_only.base.EncoderOnlyEmbedderRunner(model_args: AbsEmbedderModelArguments, data_args: AbsEmbedderDataArguments, training_args: AbsEmbedderTrainingArguments)[source]
+

Finetune Runner for base embedding models.

+
+
+load_tokenizer_and_model() Tuple[PreTrainedTokenizer, AbsEmbedderModel][source]
+

Load tokenizer and model.

+
+
Returns:
+

Tokenizer and model instances.

+
+
Return type:
+

Tuple[PreTrainedTokenizer, AbsEmbedderModel]

+
+
+
+ +
+
+load_trainer() EncoderOnlyEmbedderTrainer[source]
+

Load the trainer.

+
+
Returns:
+

Loaded trainer instance.

+
+
Return type:
+

EncoderOnlyEmbedderTrainer

+
+
+
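A hedged usage sketch, assuming the three argument objects have been built elsewhere (run() itself is inherited from the abstract runner and is therefore not listed on this page):

from FlagEmbedding.finetune.embedder.encoder_only.base import EncoderOnlyEmbedderRunner

# model_args: AbsEmbedderModelArguments, data_args: AbsEmbedderDataArguments,
# training_args: AbsEmbedderTrainingArguments -- assumed to be populated already
runner = EncoderOnlyEmbedderRunner(
    model_args=model_args,
    data_args=data_args,
    training_args=training_args,
)
runner.run()  # start finetuning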
+ +
+ +
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/finetune/embedder/encoder_only/base/trainer.html b/API/finetune/embedder/encoder_only/base/trainer.html new file mode 100644 index 00000000..2f903487 --- /dev/null +++ b/API/finetune/embedder/encoder_only/base/trainer.html @@ -0,0 +1,538 @@ + Trainer - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

Trainer

+
+
+class FlagEmbedding.finetune.embedder.encoder_only.base.EncoderOnlyEmbedderTrainer(model: PreTrainedModel | Module | None = None, args: TrainingArguments | None = None, data_collator: DataCollator | None = None, train_dataset: Dataset | IterableDataset | Dataset | None = None, eval_dataset: Dataset | Dict[str, Dataset] | Dataset | None = None, tokenizer: PreTrainedTokenizerBase | None = None, model_init: Callable[[], PreTrainedModel] | None = None, compute_metrics: Callable[[EvalPrediction], Dict] | None = None, callbacks: List[TrainerCallback] | None = None, optimizers: Tuple[Optimizer, LambdaLR] = (None, None), preprocess_logits_for_metrics: Callable[[Tensor, Tensor], Tensor] | None = None)[source]
+

Trainer class for base encoder models.

+
+ +
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/finetune/embedder/encoder_only/m3.html b/API/finetune/embedder/encoder_only/m3.html new file mode 100644 index 00000000..0548b957 --- /dev/null +++ b/API/finetune/embedder/encoder_only/m3.html @@ -0,0 +1,569 @@ + M3 - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

M3

+ +
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/finetune/embedder/encoder_only/m3/arguments.html b/API/finetune/embedder/encoder_only/m3/arguments.html new file mode 100644 index 00000000..5963ca2e --- /dev/null +++ b/API/finetune/embedder/encoder_only/m3/arguments.html @@ -0,0 +1,545 @@ + Arguments - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

Arguments

+
+
+class FlagEmbedding.finetune.embedder.encoder_only.m3.EncoderOnlyEmbedderM3ModelArguments(model_name_or_path: str, config_name: str | None = None, tokenizer_name: str | None = None, cache_dir: str | None = None, trust_remote_code: bool = False, token: str = <factory>, colbert_dim: int = -1)[source]
+

Model argument class for M3.

+
+ +
+
+class FlagEmbedding.finetune.embedder.encoder_only.m3.EncoderOnlyEmbedderM3TrainingArguments(output_dir: str, overwrite_output_dir: bool = False, do_train: bool = False, do_eval: bool = False, do_predict: bool = False, eval_strategy: ~transformers.trainer_utils.IntervalStrategy | str = 'no', prediction_loss_only: bool = False, per_device_train_batch_size: int = 8, per_device_eval_batch_size: int = 8, per_gpu_train_batch_size: int | None = None, per_gpu_eval_batch_size: int | None = None, gradient_accumulation_steps: int = 1, eval_accumulation_steps: int | None = None, eval_delay: float | None = 0, torch_empty_cache_steps: int | None = None, learning_rate: float = 5e-05, weight_decay: float = 0.0, adam_beta1: float = 0.9, adam_beta2: float = 0.999, adam_epsilon: float = 1e-08, max_grad_norm: float = 1.0, num_train_epochs: float = 3.0, max_steps: int = -1, lr_scheduler_type: ~transformers.trainer_utils.SchedulerType | str = 'linear', lr_scheduler_kwargs: dict | str | None = <factory>, warmup_ratio: float = 0.0, warmup_steps: int = 0, log_level: str | None = 'passive', log_level_replica: str | None = 'warning', log_on_each_node: bool = True, logging_dir: str | None = None, logging_strategy: ~transformers.trainer_utils.IntervalStrategy | str = 'steps', logging_first_step: bool = False, logging_steps: float = 500, logging_nan_inf_filter: bool = True, save_strategy: ~transformers.trainer_utils.IntervalStrategy | str = 'steps', save_steps: float = 500, save_total_limit: int | None = None, save_safetensors: bool | None = True, save_on_each_node: bool = False, save_only_model: bool = False, restore_callback_states_from_checkpoint: bool = False, no_cuda: bool = False, use_cpu: bool = False, use_mps_device: bool = False, seed: int = 42, data_seed: int | None = None, jit_mode_eval: bool = False, use_ipex: bool = False, bf16: bool = False, fp16: bool = False, fp16_opt_level: str = 'O1', half_precision_backend: str = 'auto', bf16_full_eval: bool = False, fp16_full_eval: bool = False, tf32: bool | None = None, local_rank: int = -1, ddp_backend: str | None = None, tpu_num_cores: int | None = None, tpu_metrics_debug: bool = False, debug: str | ~typing.List[~transformers.debug_utils.DebugOption] = '', dataloader_drop_last: bool = False, eval_steps: float | None = None, dataloader_num_workers: int = 0, dataloader_prefetch_factor: int | None = None, past_index: int = -1, run_name: str | None = None, disable_tqdm: bool | None = None, remove_unused_columns: bool | None = True, label_names: ~typing.List[str] | None = None, load_best_model_at_end: bool | None = False, metric_for_best_model: str | None = None, greater_is_better: bool | None = None, ignore_data_skip: bool = False, fsdp: ~typing.List[~transformers.trainer_utils.FSDPOption] | str | None = '', fsdp_min_num_params: int = 0, fsdp_config: dict | str | None = None, fsdp_transformer_layer_cls_to_wrap: str | None = None, accelerator_config: dict | str | None = None, deepspeed: dict | str | None = None, label_smoothing_factor: float = 0.0, optim: ~transformers.training_args.OptimizerNames | str = 'adamw_torch', optim_args: str | None = None, adafactor: bool = False, group_by_length: bool = False, length_column_name: str | None = 'length', report_to: None | str | ~typing.List[str] = None, ddp_find_unused_parameters: bool | None = None, ddp_bucket_cap_mb: int | None = None, ddp_broadcast_buffers: bool | None = None, dataloader_pin_memory: bool = True, dataloader_persistent_workers: bool = False, skip_memory_metrics: bool = True, use_legacy_prediction_loop: 
bool = False, push_to_hub: bool = False, resume_from_checkpoint: str | None = None, hub_model_id: str | None = None, hub_strategy: ~transformers.trainer_utils.HubStrategy | str = 'every_save', hub_token: str | None = None, hub_private_repo: bool = False, hub_always_push: bool = False, gradient_checkpointing: bool = False, gradient_checkpointing_kwargs: dict | str | None = None, include_inputs_for_metrics: bool = False, eval_do_concat_batches: bool = True, fp16_backend: str = 'auto', evaluation_strategy: ~transformers.trainer_utils.IntervalStrategy | str | None = None, push_to_hub_model_id: str | None = None, push_to_hub_organization: str | None = None, push_to_hub_token: str | None = None, mp_parameters: str = '', auto_find_batch_size: bool = False, full_determinism: bool = False, torchdynamo: str | None = None, ray_scope: str | None = 'last', ddp_timeout: int | None = 1800, torch_compile: bool = False, torch_compile_backend: str | None = None, torch_compile_mode: str | None = None, dispatch_batches: bool | None = None, split_batches: bool | None = None, include_tokens_per_second: bool | None = False, include_num_input_tokens_seen: bool | None = False, neftune_noise_alpha: float | None = None, optim_target_modules: None | str | ~typing.List[str] = None, batch_eval_metrics: bool = False, eval_on_start: bool = False, eval_use_gather_object: bool | None = False, negatives_cross_device: bool = False, temperature: float | None = 0.02, fix_position_embedding: bool = False, sentence_pooling_method: str = 'cls', normalize_embeddings: bool = True, sub_batch_size: int | None = None, kd_loss_type: str = 'kl_div', unified_finetuning: bool = False, use_self_distill: bool = False, fix_encoder: bool = False, self_distill_start_step: int = -1)[source]
+

Training argument class for M3.

+
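A minimal sketch of filling in the M3 argument classes (the checkpoint name and output directory are placeholders; everything else keeps the defaults listed in the signatures above):

from FlagEmbedding.finetune.embedder.encoder_only.m3 import (
    EncoderOnlyEmbedderM3ModelArguments,
    EncoderOnlyEmbedderM3TrainingArguments,
)

m3_model_args = EncoderOnlyEmbedderM3ModelArguments(
    model_name_or_path="BAAI/bge-m3",   # placeholder checkpoint
    colbert_dim=-1,
)
m3_training_args = EncoderOnlyEmbedderM3TrainingArguments(
    output_dir="./m3_finetune_output",  # placeholder path
    unified_finetuning=True,            # also train the colbert and sparse heads
    normalize_embeddings=True,
)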
+ +
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/finetune/embedder/encoder_only/m3/modeling.html b/API/finetune/embedder/encoder_only/m3/modeling.html new file mode 100644 index 00000000..42b99998 --- /dev/null +++ b/API/finetune/embedder/encoder_only/m3/modeling.html @@ -0,0 +1,940 @@ + Modeling - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

Modeling

+
+

EncoderOnlyEmbedderM3Model

+
+
+class FlagEmbedding.finetune.embedder.encoder_only.m3.EncoderOnlyEmbedderM3Model(base_model: Dict[str, Any], tokenizer: AutoTokenizer | None = None, negatives_cross_device: bool = False, temperature: float = 1, sub_batch_size: int = -1, kd_loss_type: str = 'm3_kd_loss', sentence_pooling_method: str = 'cls', normalize_embeddings: bool = False, unified_finetuning: bool = True, use_self_distill: bool = False, self_distill_start_step: int = -1)[source]
+

Embedder class for M3 model.

+
+
Parameters:
+
    +
  • base_model (AutoModel) – The base model to train on.

  • +
  • tokenizer (AutoTokenizer, optional) – The tokenizer to use. Defaults to None.

  • +
  • negatives_cross_device (bool, optional) – If True, will compute cross devices negative loss. Defaults to False.

  • +
  • temperature (float, optional) – Temperature to control the scale of scores. Defaults to 1.0.

  • +
  • sub_batch_size (int, optional) – Sub-batch size during encoding. If negative, will not split to sub-batch. +Defaults to -1.

  • +
  • kd_loss_type (str, optional) – Type of knowledge distillation loss. Defaults to 'm3_kd_loss'.

  • +
  • sentence_pooling_method (str, optional) – Pooling method to get sentence embedding. Defaults to 'cls'.

  • +
  • normalize_embeddings (bool, optional) – If True, normalize the embedding vector. Defaults to False.

  • +
  • unified_finetuning (bool, optional) – If True, will finetune the colbert vectors and sparse embedding. Defaults to True.

  • +
  • use_self_distill (bool, optional) – If True, will do self distillation. Defaults to False.

  • +
  • self_distill_start_step (int, optional) – Step num to start self distillation. Defaults to -1.

  • +
+
+
+
+ +
+

Methods

+
+
+EncoderOnlyEmbedderM3Model.encode(features)[source]
+

Encode and get the embedding.

+
+
Parameters:
+

features (Union[list, dict]) – Features feed to the model.

+
+
Returns:
+

Dense embeddings. +torch.Tensor: Sparse embeddings. +torch.Tensor: Colbert vectors.

+
+
Return type:
+

torch.Tensor

+
+
+
+ +
+
+EncoderOnlyEmbedderM3Model.compute_score(q_reps, p_reps, q_mask: Tensor, dense_weight: float = 1.0, sparse_weight: float = 0.3, colbert_weight: float = 1.0)[source]
+

Compute the overall relevance score by combining the dense, sparse, and colbert scores with their respective weights.

+
+
Parameters:
+
    +
  • q_reps (_type_) – Query representations.

  • +
  • p_reps (_type_) – Passage representations.

  • +
  • q_mask (torch.Tensor) – Attention mask of the queries, used for the colbert score.

  • +
  • dense_weight (float, optional) – Weight of the dense score. Defaults to 1.0.

  • +
  • sparse_weight (float, optional) – Weight of the sparse score. Defaults to 0.3.

  • +
  • colbert_weight (float, optional) – Weight of the colbert score. Defaults to 1.0.

  • +
+
+
Returns:
+

The weighted combination of the dense, sparse, and colbert scores.

+
+
Return type:
+

torch.Tensor

+
+
+
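Judging from the parameter names and the per-method scores documented below, the combined score is, conceptually, a weighted sum of the three components (a sketch, not the library’s exact code):

# dense_score, sparse_score, colbert_score: per-pair scores from the three methods
score = (
    dense_weight * dense_score        # defaults to 1.0
    + sparse_weight * sparse_score    # defaults to 0.3
    + colbert_weight * colbert_score  # defaults to 1.0
)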
+ +
+
+EncoderOnlyEmbedderM3Model.compute_dense_score(q_reps, p_reps)[source]
+

Compute the dense score.

+
+
Parameters:
+
    +
  • q_reps (torch.Tensor) – Query representations.

  • +
  • p_reps (torch.Tensor) – Passage representations.

  • +
+
+
Returns:
+

The computed dense scores, adjusted by temperature.

+
+
Return type:
+

torch.Tensor

+
+
+
+ +
+
+EncoderOnlyEmbedderM3Model.compute_sparse_score(q_reps, p_reps)[source]
+

Compute the sparse score.

+
+
Parameters:
+
    +
  • q_reps (torch.Tensor) – Query representations.

  • +
  • p_reps (torch.Tensor) – Passage representations.

  • +
+
+
Returns:
+

The computed sparse scores, adjusted by temperature.

+
+
Return type:
+

torch.Tensor

+
+
+
+ +
+
+EncoderOnlyEmbedderM3Model.compute_colbert_score(q_reps, p_reps, q_mask: Tensor | None = None)[source]
+

Compute the colbert score.

+
+
Parameters:
+
    +
  • q_reps (torch.Tensor) – Query representations.

  • +
  • p_reps (torch.Tensor) – Passage representations.

  • +
+
+
Returns:
+

The computed colbert scores, adjusted by temperature.

+
+
Return type:
+

torch.Tensor

+
+
+
+ +
+
+EncoderOnlyEmbedderM3Model.ensemble_score(q_reps, p_reps, dense_scores=None, sparse_scores=None, colbert_scores=None)[source]
+

Compute the ensemble score of the three methods.

+
+
Parameters:
+
    +
  • q_reps (torch.Tensor) – Query representations.

  • +
  • p_reps (torch.Tensor) – Passage representations.

  • +
  • dense_scores (torch.Tensor, optional) – The dense scores. Defaults to None.

  • +
  • sparse_scores (torch.Tensor, optional) – The sparse scores. Defaults to None.

  • +
  • colbert_scores (torch.Tensor, optional) – The colbert scores. Defaults to None.

  • +
+
+
Raises:
+

ValueError – dense_scores, sparse_scores, colbert_scores must be provided

+
+
Returns:
+

The ensemble score of the three methods.

+
+
Return type:
+

_type_

+
+
+
+ +
+
+EncoderOnlyEmbedderM3Model.forward(queries: Dict[str, Tensor] | List[Dict[str, Tensor]] | None = None, passages: Dict[str, Tensor] | List[Dict[str, Tensor]] | None = None, teacher_scores: None | List[float] = None, no_in_batch_neg_flag: bool = False)[source]
+

The computation performed at every call.

+
+
Parameters:
+
    +
  • queries (Union[Dict[str, Tensor], List[Dict[str, Tensor]]], optional) – Input queries. Defaults to None.

  • +
  • passages (Union[Dict[str, Tensor], List[Dict[str, Tensor]]], optional) – Input passages. Defaults to None.

  • +
  • teacher_scores (Union[None, List[float]], optional) – Teacher scores for distillation. Defaults to None.

  • +
  • no_in_batch_neg_flag (bool, optional) – If True, use no in-batch negatives and no cross-device negatives. Defaults to False.

  • +
+
+
Returns:
+

Output of the forward call of model.

+
+
Return type:
+

EmbedderOutput

+
+
+
+ +
+
+EncoderOnlyEmbedderM3Model.compute_loss(scores, target)[source]
+

Compute the loss using cross entropy.

+
+
Parameters:
+
    +
  • scores (torch.Tensor) – Computed score.

  • +
  • target (torch.Tensor) – The target value.

  • +
+
+
Returns:
+

The computed cross entropy loss.

+
+
Return type:
+

torch.Tensor

+
+
+
+ +
+
+EncoderOnlyEmbedderM3Model.gradient_checkpointing_enable(**kwargs)[source]
+

Activates gradient checkpointing for the current model.

+
+ +
+
+EncoderOnlyEmbedderM3Model.enable_input_require_grads(**kwargs)[source]
+

Enables the gradients for the input embeddings.

+
+ +
+
+EncoderOnlyEmbedderM3Model.save(output_dir: str)[source]
+

Save the model to the directory.

+
+
Parameters:
+

output_dir (str) – Directory for saving the model.

+
+
+
+ +
+
+EncoderOnlyEmbedderM3Model._dense_embedding(last_hidden_state, attention_mask)[source]
+

Use the pooling method to get the dense embedding.

+
+
Parameters:
+
    +
  • last_hidden_state (torch.Tensor) – The model output’s last hidden state.

  • +
  • attention_mask (torch.Tensor) – Mask out padding tokens during pooling.

  • +
+
+
Raises:
+

NotImplementedError – Specified pooling method not implemented.

+
+
Returns:
+

The dense embeddings.

+
+
Return type:
+

torch.Tensor

+
+
+
+ +
+
+EncoderOnlyEmbedderM3Model._sparse_embedding(hidden_state, input_ids, return_embedding: bool = True)[source]
+

Compute and return the sparse embedding.

+
+
Parameters:
+
    +
  • hidden_state (torch.Tensor) – The model output’s last hidden state.

  • +
  • input_ids (torch.Tensor) – Ids from the input features.

  • +
  • return_embedding (bool, optional) – If True, return the computed embedding, otherwise just return the token weights. +Defaults to True.

  • +
+
+
Returns:
+

The sparse embedding or just the token weights.

+
+
Return type:
+

torch.Tensor

+
+
+
+ +
+
+EncoderOnlyEmbedderM3Model._colbert_embedding(last_hidden_state, mask)[source]
+

Get the colbert vectors.

+
+
Parameters:
+
    +
  • last_hidden_state (torch.Tensor) – The model output’s last hidden state.

  • +
  • attention_mask (torch.Tensor) – Mask out padding tokens during pooling.

  • +
+
+
Returns:
+

The colbert vectors.

+
+
Return type:
+

torch.Tensor

+
+
+
+ +
+
+EncoderOnlyEmbedderM3Model._encode(features)[source]
+

Helper function to encode using input features.

+
+
Parameters:
+

features (Union[list, dict]) – Features feed to the model.

+
+
Returns:
+

Dense embedding. +torch.Tensor: Sparse embedding. +torch.Tensor: Colbert vector.

+
+
Return type:
+

torch.Tensor

+
+
+
+ +
+
+EncoderOnlyEmbedderM3Model._compute_similarity(q_reps, p_reps)[source]
+

Computes the similarity between query and passage representations using inner product.

+
+
Parameters:
+
    +
  • q_reps (torch.Tensor) – Query representations.

  • +
  • p_reps (torch.Tensor) – Passage representations.

  • +
+
+
Returns:
+

The computed similarity matrix.

+
+
Return type:
+

torch.Tensor

+
+
+
+ +
+
+EncoderOnlyEmbedderM3Model._get_queries_attention_mask(queries: Dict[str, Tensor] | List[Dict[str, Tensor]])[source]
+

Get the padding attention mask used for the colbert score.

+
+
Parameters:
+

queries (Union[Dict[str, Tensor], List[Dict[str, Tensor]]]) – Input queries.

+
+
Returns:
+

The query attention mask.

+
+
Return type:
+

torch.Tensor

+
+
+
+ +
+
+
+

EncoderOnlyEmbedderM3ModelForInference

+
+
+class FlagEmbedding.finetune.embedder.encoder_only.m3.EncoderOnlyEmbedderM3ModelForInference(base_model: Dict[str, Any], tokenizer: AutoTokenizer | None = None, negatives_cross_device: bool = False, temperature: float = 1, sub_batch_size: int = -1, kd_loss_type: str = 'm3_kd_loss', sentence_pooling_method: str = 'cls', normalize_embeddings: bool = False, unified_finetuning: bool = True, use_self_distill: bool = False, self_distill_start_step: int = -1)[source]
+

Inference class of M3 model.

+
+
+forward(text_input: Dict[str, Tensor] | None = None, return_dense: bool = True, return_sparse: bool = False, return_colbert_vecs: bool = False, return_sparse_embedding: bool = False)[source]
+

Encode the text input using the selected way.

+
+
Parameters:
+
    +
  • text_input (Dict[str, Tensor], optional) – Text inputs. Defaults to None.

  • +
  • return_dense (bool, optional) – If True, return the dense embedding. Defaults to True.

  • +
  • return_sparse (bool, optional) – If True, return the sparse embedding. Defaults to False.

  • +
  • return_colbert_vecs (bool, optional) – If True, return the colbert vectors. Defaults to False.

  • +
  • return_sparse_embedding (bool, optional) – Parameter for _sparse_embedding(). If True, will return sparse embedding. +Otherwise, return the token weights. Defaults to False.

  • +
+
+
Returns:
+

A dictionary containing the three types of embeddings.

+
+
Return type:
+

dict

+
+
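A hedged call sketch; model is assumed to be an already-loaded EncoderOnlyEmbedderM3ModelForInference and text_input an already-tokenized batch (a dict of tensors), since neither is constructed on this page:

# model: EncoderOnlyEmbedderM3ModelForInference, text_input: Dict[str, Tensor]
outputs = model(
    text_input=text_input,
    return_dense=True,          # dense embeddings
    return_sparse=True,         # sparse (lexical) weights or embeddings
    return_colbert_vecs=False,  # skip the colbert vectors
)
# `outputs` is a dict holding the requested embedding types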
+
+ +
+ +
+
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/finetune/embedder/encoder_only/m3/runner.html b/API/finetune/embedder/encoder_only/m3/runner.html new file mode 100644 index 00000000..853e89c6 --- /dev/null +++ b/API/finetune/embedder/encoder_only/m3/runner.html @@ -0,0 +1,603 @@ + Runner - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

Runner

+
+
+class FlagEmbedding.finetune.embedder.encoder_only.m3.EncoderOnlyEmbedderM3Runner(model_args: EncoderOnlyEmbedderM3ModelArguments, data_args: AbsEmbedderDataArguments, training_args: EncoderOnlyEmbedderM3TrainingArguments)[source]
+

M3 model runner for finetuning.

+
+
Parameters:
+
+
+
+
+
+static get_model(model_name_or_path: str, trust_remote_code: bool = False, colbert_dim: int = -1, cache_dir: str | None = None)[source]
+

Get the model.

+
+
Parameters:
+
    +
  • model_name_or_path (str) – If it’s a path to a local model, it loads the model from the path. Otherwise tries to download and +load a model from HuggingFace Hub with the name.

  • +
  • trust_remote_code (bool, optional) – trust_remote_code to use when loading models from HF. Defaults to False.

  • +
  • colbert_dim (int, optional) – Colbert dim to set. Defaults to -1.

  • +
  • cache_dir (str, optional) – HF cache dir to store the model. Defaults to None.

  • +
+
+
Returns:
+

A dictionary containing the model, colbert linear and sparse linear.

+
+
Return type:
+

dict

+
+
+
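Since get_model() is a static method, it can be called directly to fetch the backbone and the extra heads (the checkpoint name is a placeholder):

from FlagEmbedding.finetune.embedder.encoder_only.m3 import EncoderOnlyEmbedderM3Runner

parts = EncoderOnlyEmbedderM3Runner.get_model(
    model_name_or_path="BAAI/bge-m3",   # placeholder checkpoint
    trust_remote_code=False,
    colbert_dim=-1,
    cache_dir=None,
)
# `parts` is a dict containing the model plus the colbert and sparse linear layers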
+ +
+
+load_tokenizer_and_model() Tuple[PreTrainedTokenizer, AbsEmbedderModel][source]
+

Load the tokenizer and model.

+
+
Returns:
+

Tokenizer and model instances.

+
+
Return type:
+

Tuple[PreTrainedTokenizer, AbsEmbedderModel]

+
+
+
+ +
+
+load_trainer() EncoderOnlyEmbedderM3Trainer[source]
+

Load the M3 trainer.

+
+
Returns:
+

M3 Trainer instance.

+
+
Return type:
+

EncoderOnlyEmbedderM3Trainer

+
+
+
+ +
+ +
+ +
+
+ +
+ +
+
+ \ No newline at end of file diff --git a/API/finetune/embedder/encoder_only/m3/trainer.html b/API/finetune/embedder/encoder_only/m3/trainer.html new file mode 100644 index 00000000..898baf65 --- /dev/null +++ b/API/finetune/embedder/encoder_only/m3/trainer.html @@ -0,0 +1,538 @@ + Trainer - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+ + + + + Back to top + +
+ +
+ +
+ +
+
+
+

Trainer

+
+
+class FlagEmbedding.finetune.embedder.encoder_only.m3.EncoderOnlyEmbedderM3Trainer(model: PreTrainedModel | Module | None = None, args: TrainingArguments | None = None, data_collator: DataCollator | None = None, train_dataset: Dataset | IterableDataset | Dataset | None = None, eval_dataset: Dataset | Dict[str, Dataset] | Dataset | None = None, tokenizer: PreTrainedTokenizerBase | None = None, model_init: Callable[[], PreTrainedModel] | None = None, compute_metrics: Callable[[EvalPrediction], Dict] | None = None, callbacks: List[TrainerCallback] | None = None, optimizers: Tuple[Optimizer, LambdaLR] = (None, None), preprocess_logits_for_metrics: Callable[[Tensor, Tensor], Tensor] | None = None)[source]
+

Trainer class for M3.

diff --git a/API/finetune/reranker.html b/API/finetune/reranker.html new file mode 100644 index 00000000..c3358859 --- /dev/null +++ b/API/finetune/reranker.html

Reranker

diff --git a/API/finetune/reranker/decoder_only.html b/API/finetune/reranker/decoder_only.html new file mode 100644 index 00000000..d097aa3e --- /dev/null +++ b/API/finetune/reranker/decoder_only.html

Decoder Only

diff --git a/API/finetune/reranker/decoder_only/base.html b/API/finetune/reranker/decoder_only/base.html new file mode 100644 index 00000000..29f4dda7 --- /dev/null +++ b/API/finetune/reranker/decoder_only/base.html

Base

diff --git a/API/finetune/reranker/decoder_only/base/arguments.html b/API/finetune/reranker/decoder_only/base/arguments.html new file mode 100644 index 00000000..37d2fb04 --- /dev/null +++ b/API/finetune/reranker/decoder_only/base/arguments.html

Arguments

+
+
+class FlagEmbedding.finetune.reranker.decoder_only.base.RerankerModelArguments(model_name_or_path: str, config_name: str | None = None, tokenizer_name: str | None = None, cache_dir: str | None = None, trust_remote_code: bool = False, model_type: str = 'encoder', token: str = <factory>, use_lora: bool = True, lora_rank: int = 64, lora_alpha: float = 16, lora_dropout: float = 0.1, target_modules: ~typing.List[str] = <factory>, modules_to_save: ~typing.List[str] | None = None, use_flash_attn: bool = False, from_peft: str | None = None, raw_peft: ~typing.List[str] | None = None, save_merged_lora_model: bool = False)[source]
+

Model argument class for decoder only reranker.
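For orientation, a hedged sketch of constructing these arguments directly in Python (normally they are filled from the command line via HfArgumentParser). All field names come from the signature above; only the base model id is a placeholder.

from FlagEmbedding.finetune.reranker.decoder_only.base import RerankerModelArguments

# Field names are taken from the signature above; the model id is a placeholder.
model_args = RerankerModelArguments(
    model_name_or_path="meta-llama/Llama-2-7b-hf",  # placeholder base model
    use_lora=True,          # finetune with LoRA adapters instead of full weights
    lora_rank=64,
    lora_alpha=16,
    lora_dropout=0.1,
    use_flash_attn=False,
    save_merged_lora_model=False,
)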

diff --git a/API/finetune/reranker/decoder_only/base/modeling.html b/API/finetune/reranker/decoder_only/base/modeling.html new file mode 100644 index 00000000..d81892ba --- /dev/null +++ b/API/finetune/reranker/decoder_only/base/modeling.html

Modeling

+
+
+class FlagEmbedding.finetune.embedder.decoder_only.base.BiDecoderOnlyEmbedderModel(base_model: AutoModel, tokenizer: AutoTokenizer | None = None, negatives_cross_device: bool = False, temperature: float = 1.0, sub_batch_size: int = -1, kd_loss_type: str = 'kl_div', sentence_pooling_method: str = 'last_token', normalize_embeddings: bool = False)[source]
+

Embedder model class for decoder only model.

+
+
Parameters:
+
    +
  • base_model (AutoModel) – The base model to train on.

  • +
  • tokenizer (AutoTokenizer, optional) – The tokenizer to use. Defaults to None.

  • +
  • negatives_cross_device (bool, optional) – If True, will compute cross devices negative loss. Defaults to False.

  • +
  • temperature (float, optional) – Temperature to control the scale of scores. Defaults to 1.0.

  • +
  • sub_batch_size (int, optional) – Sub-batch size during encoding. If negative, will not split to sub-batch. +Defaults to -1.

  • +
  • kd_loss_type (str, optional) – Type of knowledge distillation loss. Defaults to 'kl_div'.

  • +
  • sentence_pooling_method (str, optional) – Pooling method to get sentence embedding. Defaults to 'last_token'.

  • +
  • normalize_embeddings (bool, optional) – If True, normalize the embedding vector. Defaults to False.

  • +
+
+
+
+ +
+

Methods

+
+
+BiDecoderOnlyEmbedderModel.encode(features)[source]
+

Encode and get the embedding.

+
+
Parameters:
+

features (Union[list, dict]) – Features feed to the model.

+
+
Returns:
+

The embedding vectors.

+
+
Return type:
+

torch.Tensor

+
+
+
+ +
+
+BiDecoderOnlyEmbedderModel.compute_score(q_reps, p_reps)[source]
+

Computes the scores between query and passage representations.

+
+
Parameters:
+
    +
  • q_reps (torch.Tensor) – Query representations.

  • +
  • p_reps (torch.Tensor) – Passage representations.

  • +
+
+
Returns:
+

The computed scores, adjusted by temperature.

+
+
Return type:
+

torch.Tensor
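In other words, the score is simply the similarity between query and passage representations scaled by the temperature. A minimal sketch of that computation follows; it is an illustration, not the library's exact code.

import torch

def compute_score_sketch(q_reps: torch.Tensor, p_reps: torch.Tensor, temperature: float = 1.0) -> torch.Tensor:
    # Inner-product similarity between every query and every passage, scaled by temperature.
    return torch.matmul(q_reps, p_reps.transpose(-2, -1)) / temperature

q = torch.randn(4, 768)   # 4 query embeddings
p = torch.randn(32, 768)  # 32 passage embeddings (e.g. positives plus in-batch negatives)
scores = compute_score_sketch(q, p)  # shape (4, 32)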

+
+
+
+ +
+
+BiDecoderOnlyEmbedderModel.compute_loss(scores, target)[source]
+

Compute the loss using cross entropy.

+
+
Parameters:
+
    +
  • scores (torch.Tensor) – Computed score.

  • +
  • target (torch.Tensor) – The target value.

  • +
+
+
Returns:
+

The computed cross entropy loss.

+
+
Return type:
+

torch.Tensor

+
+
+
+ +
+
+BiDecoderOnlyEmbedderModel.gradient_checkpointing_enable(**kwargs)[source]
+

Activates gradient checkpointing for the current model.

+
+ +
+
+BiDecoderOnlyEmbedderModel.enable_input_require_grads(**kwargs)[source]
+

Enables the gradients for the input embeddings.

+
+ +
+
+BiDecoderOnlyEmbedderModel.save(output_dir: str)[source]
+

Save the model to the directory.

+
+
Parameters:
+

output_dir (str) – Directory for saving the model.

+
+
+
+ +
+
+BiDecoderOnlyEmbedderModel._sentence_embedding(last_hidden_state, attention_mask)[source]
+

Use the pooling method to get the sentence embedding.

+
+
Parameters:
+
    +
  • last_hidden_state (torch.Tensor) – The model output’s last hidden state.

  • +
  • attention_mask (torch.Tensor) – Mask out padding tokens during pooling.

  • +
+
+
Raises:
+

NotImplementedError – Specified pooling method not implemented.

+
+
Returns:
+

The sentence embeddings.

+
+
Return type:
+

torch.Tensor
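Since the default sentence_pooling_method for this class is 'last_token', the pooling step roughly amounts to picking the hidden state of the last non-padding token in each sequence. A sketch, assuming right padding; this is an illustration rather than the library's exact implementation.

import torch

def last_token_pooling_sketch(last_hidden_state: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    # Index of the last non-padding token for each sequence (assumes right padding).
    seq_lengths = attention_mask.sum(dim=1) - 1                       # (batch,)
    batch_idx = torch.arange(last_hidden_state.size(0), device=last_hidden_state.device)
    return last_hidden_state[batch_idx, seq_lengths]                  # (batch, hidden)

hidden = torch.randn(2, 5, 8)                          # (batch, seq_len, hidden)
mask = torch.tensor([[1, 1, 1, 0, 0], [1, 1, 1, 1, 1]])
embeddings = last_token_pooling_sketch(hidden, mask)   # (2, 8)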

+
+
+
+ +
+
+BiDecoderOnlyEmbedderModel._compute_similarity(q_reps, p_reps)[source]
+

Computes the similarity between query and passage representations using inner product.

+
+
Parameters:
+
    +
  • q_reps (torch.Tensor) – Query representations.

  • +
  • p_reps (torch.Tensor) – Passage representations.

  • +
+
+
Returns:
+

The computed similarity matrix.

+
+
Return type:
+

torch.Tensor

diff --git a/API/finetune/reranker/decoder_only/base/runner.html b/API/finetune/reranker/decoder_only/base/runner.html new file mode 100644 index 00000000..f26c160e --- /dev/null +++ b/API/finetune/reranker/decoder_only/base/runner.html

Runner

+
+
+class FlagEmbedding.finetune.embedder.decoder_only.base.DecoderOnlyEmbedderRunner(model_args: DecoderOnlyEmbedderModelArguments, data_args: AbsEmbedderDataArguments, training_args: AbsEmbedderTrainingArguments)[source]
+

Runner class for decoder only embedding model.

+
+
Parameters:
+
+
+
+
+
+load_tokenizer_and_model() Tuple[PreTrainedTokenizer, AbsEmbedderModel][source]
+

Load tokenizer and model.

+
+
Returns:
+

Tokenizer and model instances.

+
+
Return type:
+

Tuple[PreTrainedTokenizer, AbsEmbedderModel]

+
+
+
+ +
+
+load_trainer() DecoderOnlyEmbedderTrainer[source]
+

Load the trainer.

+
+
Returns:
+

Loaded trainer instance.

+
+
Return type:
+

DecoderOnlyEmbedderTrainer

+
+
+
+ +
+
+run()[source]
+

Run the finetuning.
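Conceptually, run() chains the two loaders above and then trains and saves. A hedged sketch of that flow is given below; the actual implementation may differ in details such as resuming from checkpoints or how the final model is saved.

from FlagEmbedding.finetune.embedder.decoder_only.base import DecoderOnlyEmbedderRunner

def run_sketch(runner: DecoderOnlyEmbedderRunner) -> None:
    # Rough sketch of what run() does, based on the methods documented above.
    tokenizer, model = runner.load_tokenizer_and_model()  # build tokenizer + embedder model
    trainer = runner.load_trainer()                       # wrap them in the trainer
    trainer.train()                                       # standard transformers training loop
    trainer.save_model()                                  # persist the finetuned embedder (assumed step)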

diff --git a/API/finetune/reranker/decoder_only/base/trainer.html b/API/finetune/reranker/decoder_only/base/trainer.html new file mode 100644 index 00000000..2727d9a6 --- /dev/null +++ b/API/finetune/reranker/decoder_only/base/trainer.html

Trainer

+
+
+class FlagEmbedding.finetune.embedder.decoder_only.base.DecoderOnlyEmbedderTrainer(model: PreTrainedModel | Module | None = None, args: TrainingArguments | None = None, data_collator: DataCollator | None = None, train_dataset: Dataset | IterableDataset | Dataset | None = None, eval_dataset: Dataset | Dict[str, Dataset] | Dataset | None = None, tokenizer: PreTrainedTokenizerBase | None = None, model_init: Callable[[], PreTrainedModel] | None = None, compute_metrics: Callable[[EvalPrediction], Dict] | None = None, callbacks: List[TrainerCallback] | None = None, optimizers: Tuple[Optimizer, LambdaLR] = (None, None), preprocess_logits_for_metrics: Callable[[Tensor, Tensor], Tensor] | None = None)[source]
+

Trainer class for decoder only base embedder models.

diff --git a/API/finetune/reranker/decoder_only/layerwise.html b/API/finetune/reranker/decoder_only/layerwise.html new file mode 100644 index 00000000..aebad9dd --- /dev/null +++ b/API/finetune/reranker/decoder_only/layerwise.html

Layerwise

diff --git a/API/finetune/reranker/decoder_only/layerwise/arguments.html b/API/finetune/reranker/decoder_only/layerwise/arguments.html new file mode 100644 index 00000000..c1210ad7 --- /dev/null +++ b/API/finetune/reranker/decoder_only/layerwise/arguments.html

Arguments

+
+
+class FlagEmbedding.finetune.reranker.decoder_only.layerwise.RerankerModelArguments(model_name_or_path: str, config_name: str | None = None, tokenizer_name: str | None = None, cache_dir: str | None = None, trust_remote_code: bool = False, model_type: str = 'from_raw_model', token: str = <factory>, use_lora: bool = True, lora_rank: int = 64, lora_alpha: float = 16, lora_dropout: float = 0.1, target_modules: ~typing.List[str] = <factory>, modules_to_save: ~typing.List[str] | None = None, use_flash_attn: bool = False, from_peft: str | None = None, raw_peft: ~typing.List[str] | None = None, save_merged_lora_model: bool = False, start_layer: int = 8, head_multi: bool = False, head_type: str = 'simple')[source]
+

Model argument class for decoder only reranker.

diff --git a/API/finetune/reranker/decoder_only/layerwise/modeling.html b/API/finetune/reranker/decoder_only/layerwise/modeling.html new file mode 100644 index 00000000..fae96420 --- /dev/null +++ b/API/finetune/reranker/decoder_only/layerwise/modeling.html

Modeling

+
+
+class FlagEmbedding.finetune.reranker.decoder_only.layerwise.CrossDecoderModel(base_model: PreTrainedModel, tokenizer: AutoTokenizer | None = None, train_batch_size: int = 4, start_layer: int = 8)[source]
+

Model class for decoder only reranker.

+
+
Parameters:
+
    +
  • base_model (PreTrainedModel) – The underlying pre-trained model used for encoding and scoring input pairs.

  • +
  • tokenizer (AutoTokenizer, optional) – The tokenizer for encoding input text. Defaults to None.

  • +
  • train_batch_size (int, optional) – The batch size to use. Defaults to 4.

  • +
  • start_layer (int, optional) – Starting layer for layerwise. Defaults to 8.

  • +
+
+
+
+ +
+

Methods

+
+
+CrossDecoderModel.encode(features)[source]
+

Abstract method of encode.

+
+
Parameters:
+

features (dict) – Features to pass to the model.

+
+
+
+ +
+
+CrossDecoderModel.forward(pair: Dict[str, Tensor] | List[Dict[str, Tensor]] | None = None, teacher_scores: Tensor | None = None)[source]
+

The computation performed at every call.

+
+
Parameters:
+
    +
  • pair (Union[Dict[str, Tensor], List[Dict[str, Tensor]]], optional) – The query-document pair. Defaults to None.

  • +
  • teacher_scores (Optional[Tensor], optional) – Teacher scores of knowledge distillation. Defaults to None.

  • +
+
+
Returns:
+

Output of reranker model.

+
+
Return type:
+

RerankerOutput
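At inference time the corresponding layerwise reranker can be driven through the top-level wrapper. A minimal sketch follows; the model id and the cutoff_layers value follow the public BGE layerwise reranker usage and are assumptions here.

from FlagEmbedding import LayerWiseFlagLLMReranker

# Minimal sketch of layerwise reranking at inference time.
reranker = LayerWiseFlagLLMReranker(
    "BAAI/bge-reranker-v2-minicpm-layerwise",  # assumed model id
    use_fp16=True,
)

# Score a single query-passage pair, taking the score from layer 28 (assumed cutoff).
score = reranker.compute_score(
    ["what is panda?", "The giant panda is a bear species endemic to China."],
    cutoff_layers=[28],
)
print(score)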

diff --git a/API/finetune/reranker/decoder_only/layerwise/runner.html b/API/finetune/reranker/decoder_only/layerwise/runner.html new file mode 100644 index 00000000..272bc25d --- /dev/null +++ b/API/finetune/reranker/decoder_only/layerwise/runner.html

Runner

+
+
+class FlagEmbedding.finetune.reranker.decoder_only.layerwise.DecoderOnlyRerankerRunner(model_args: RerankerModelArguments, data_args: AbsRerankerDataArguments, training_args: AbsRerankerTrainingArguments)[source]
+

Decoder only layerwise reranker runner for finetuning.

+
+
Parameters:
+
+
+
+
+
+load_tokenizer_and_model() Tuple[PreTrainedTokenizer, AbsRerankerModel][source]
+

Load the tokenizer and model.

+
+
Returns:
+

Tokenizer and model instances.

+
+
Return type:
+

Tuple[PreTrainedTokenizer, AbsRerankerModel]

+
+
+
+ +
+
+load_trainer() DecoderOnlyRerankerTrainer[source]
+

Load the trainer.

+
+
Returns:
+

Loaded trainer instance.

+
+
Return type:
+

DecoderOnlyRerankerTrainer

+
+
+
+ +
+
+run()[source]
+

Run the finetuning.

diff --git a/API/finetune/reranker/decoder_only/layerwise/trainer.html b/API/finetune/reranker/decoder_only/layerwise/trainer.html new file mode 100644 index 00000000..d26897a2 --- /dev/null +++ b/API/finetune/reranker/decoder_only/layerwise/trainer.html

Trainer

+
+
+class FlagEmbedding.finetune.reranker.decoder_only.layerwise.DecoderOnlyRerankerTrainer(model: PreTrainedModel | Module | None = None, args: TrainingArguments | None = None, data_collator: DataCollator | None = None, train_dataset: Dataset | IterableDataset | Dataset | None = None, eval_dataset: Dataset | Dict[str, Dataset] | Dataset | None = None, tokenizer: PreTrainedTokenizerBase | None = None, model_init: Callable[[], PreTrainedModel] | None = None, compute_metrics: Callable[[EvalPrediction], Dict] | None = None, callbacks: List[TrainerCallback] | None = None, optimizers: Tuple[Optimizer, LambdaLR] = (None, None), preprocess_logits_for_metrics: Callable[[Tensor, Tensor], Tensor] | None = None)[source]
+

Trainer class for decoder only layerwise reranker models.

diff --git a/API/finetune/reranker/encoder_only.html b/API/finetune/reranker/encoder_only.html new file mode 100644 index 00000000..01812238 --- /dev/null +++ b/API/finetune/reranker/encoder_only.html

Encoder Only

diff --git a/API/finetune/reranker/encoder_only/base.html b/API/finetune/reranker/encoder_only/base.html new file mode 100644 index 00000000..8f17ee23 --- /dev/null +++ b/API/finetune/reranker/encoder_only/base.html

Base

diff --git a/API/finetune/reranker/encoder_only/base/modeling.html b/API/finetune/reranker/encoder_only/base/modeling.html new file mode 100644 index 00000000..c11ebf4a --- /dev/null +++ b/API/finetune/reranker/encoder_only/base/modeling.html

Modeling

+
+
+class FlagEmbedding.finetune.reranker.encoder_only.base.CrossEncoderModel(base_model: PreTrainedModel, tokenizer: AutoTokenizer | None = None, train_batch_size: int = 4)[source]
+

Model class for reranker.

+
+
Parameters:
+
    +
  • base_model (PreTrainedModel) – The underlying pre-trained model used for encoding and scoring input pairs.

  • +
  • tokenizer (AutoTokenizer, optional) – The tokenizer for encoding input text. Defaults to None.

  • +
  • train_batch_size (int, optional) – The batch size to use. Defaults to 4.

  • +
+
+
+
+ +
+

Methods

+
+
+CrossEncoderModel.encode(features)[source]
+

Encodes input features to logits.

+
+
Parameters:
+

features (dict) – Dictionary with input features.

+
+
Returns:
+

The logits output from the model.

+
+
Return type:
+

torch.Tensor
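For comparison, scoring with a finetuned encoder-only reranker at inference time goes through the FlagReranker wrapper. A short sketch follows; the model id is an assumption.

from FlagEmbedding import FlagReranker

# Minimal sketch of reranking with an encoder-only cross-encoder at inference time.
reranker = FlagReranker("BAAI/bge-reranker-v2-m3", use_fp16=True)  # assumed model id

scores = reranker.compute_score(
    [
        ["what is panda?", "The giant panda is a bear species endemic to China."],
        ["what is panda?", "PANDA is a software toolkit."],
    ],
    normalize=True,  # map raw logits to 0-1 scores with a sigmoid
)
print(scores)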

diff --git a/API/finetune/reranker/encoder_only/base/runner.html b/API/finetune/reranker/encoder_only/base/runner.html new file mode 100644 index 00000000..873692dc --- /dev/null +++ b/API/finetune/reranker/encoder_only/base/runner.html

Runner

+
+
+class FlagEmbedding.finetune.reranker.encoder_only.base.EncoderOnlyRerankerRunner(model_args: AbsRerankerModelArguments, data_args: AbsRerankerDataArguments, training_args: AbsRerankerTrainingArguments)[source]
+

Encoder only reranker runner for finetuning.

+
+
+load_tokenizer_and_model() Tuple[PreTrainedTokenizer, AbsRerankerModel][source]
+

Load the tokenizer and model.

+
+
Returns:
+

Tokenizer and model instances.

+
+
Return type:
+

Tuple[PreTrainedTokenizer, AbsRerankerModel]

+
+
+
+ +
+
+load_trainer() EncoderOnlyRerankerTrainer[source]
+

Load the trainer.

+
+
Returns:
+

Loaded trainer instance.

+
+
Return type:
+

EncoderOnlyRerankerTrainer

diff --git a/API/finetune/reranker/encoder_only/base/trainer.html b/API/finetune/reranker/encoder_only/base/trainer.html new file mode 100644 index 00000000..062db1d7 --- /dev/null +++ b/API/finetune/reranker/encoder_only/base/trainer.html

Trainer

+
+
+class FlagEmbedding.finetune.reranker.encoder_only.base.EncoderOnlyRerankerTrainer(model: PreTrainedModel | Module | None = None, args: TrainingArguments | None = None, data_collator: DataCollator | None = None, train_dataset: Dataset | IterableDataset | Dataset | None = None, eval_dataset: Dataset | Dict[str, Dataset] | Dataset | None = None, tokenizer: PreTrainedTokenizerBase | None = None, model_init: Callable[[], PreTrainedModel] | None = None, compute_metrics: Callable[[EvalPrediction], Dict] | None = None, callbacks: List[TrainerCallback] | None = None, optimizers: Tuple[Optimizer, LambdaLR] = (None, None), preprocess_logits_for_metrics: Callable[[Tensor, Tensor], Tensor] | None = None)[source]
+

Trainer class for encoder only base reranker models.

diff --git a/API/inference.html b/API/inference.html new file mode 100644 index 00000000..a4ae2a58 --- /dev/null +++ b/API/inference.html

Inference

diff --git a/API/inference/FlagAutoModel.html b/API/inference/FlagAutoModel.html new file mode 100644 index 00000000..08a68183 --- /dev/null +++ b/API/inference/FlagAutoModel.html

FlagAutoModel

+
+
+class FlagEmbedding.inference.FlagAutoModel[source]
+

Automatically choose the appropriate class to load the embedding model.

+
+ +
+

Methods

+
+
+classmethod FlagAutoModel.from_finetuned(model_name_or_path: str, model_class: str | EmbedderModelClass | None = None, normalize_embeddings: bool = True, use_fp16: bool = True, query_instruction_for_retrieval: str | None = None, devices: None | str | List[str] = None, pooling_method: str | None = None, trust_remote_code: bool | None = None, query_instruction_format: str | None = None, **kwargs)[source]
+

Load a finetuned model according to the provided vars.

+
+
Parameters:
+
    +
  • model_name_or_path (str) – If it’s a path to a local model, it loads the model from the path. Otherwise tries to download and load a model from HuggingFace Hub with the name.

  • +
  • model_class (Optional[Union[str, EmbedderModelClass]], optional) – The embedder class to use. Defaults to None.

  • +
  • normalize_embeddings (bool, optional) – If True, normalize the embedding vector. Defaults to True.

  • +
  • use_fp16 (bool, optional) – If true, use half-precision floating-point to speed up computation with a slight performance +degradation. Defaults to True.

  • +
  • query_instruction_for_retrieval (Optional[str], optional) – Query instruction for retrieval tasks, which will be used with +query_instruction_format. Defaults to None.

  • +
  • devices (Optional[Union[str, List[str]]], optional) – Devices to use for model inference. Defaults to None.

  • +
  • pooling_method (Optional[str], optional) – Pooling method to get embedding vector from the last hidden state. Defaults to None.

  • +
  • trust_remote_code (Optional[bool], optional) – trust_remote_code for HF datasets or models. Defaults to None.

  • +
  • query_instruction_format (Optional[str], optional) – The template for query_instruction_for_retrieval. Defaults to None.

  • +
+
+
Raises:
+

ValueError

+
+
Returns:
+

The model class to load model, which is child class of AbsEmbedder.

+
+
Return type:
+

AbsEmbedder
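Typical usage, as a short sketch; the model id and the instruction string are assumptions following the public BGE examples.

from FlagEmbedding import FlagAutoModel

model = FlagAutoModel.from_finetuned(
    "BAAI/bge-base-en-v1.5",  # assumed model id; the right embedder class is picked automatically
    query_instruction_for_retrieval="Represent this sentence for searching relevant passages:",
    use_fp16=True,
)

queries = ["how much protein should a female eat"]
passages = ["As a general guideline, the CDC's average requirement of protein for women is 46 grams per day."]

q_embeddings = model.encode_queries(queries)
p_embeddings = model.encode_corpus(passages)
scores = q_embeddings @ p_embeddings.T  # inner product on normalized embeddings equals cosine similarity
print(scores)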

diff --git a/API/inference/FlagAutoReranker.html b/API/inference/FlagAutoReranker.html new file mode 100644 index 00000000..3b8dd886 --- /dev/null +++ b/API/inference/FlagAutoReranker.html

FlagAutoReranker

+
+
+class FlagEmbedding.inference.FlagAutoReranker[source]
+

Automatically choose the appropriate class to load the reranker model.

+
+ +
+

Methods

+
+
+classmethod FlagAutoReranker.from_finetuned(model_name_or_path: str, model_class: str | RerankerModelClass | None = None, use_fp16: bool = False, trust_remote_code: bool | None = None, **kwargs)[source]
+

Load a finetuned model according to the provided vars.

+
+
Parameters:
+
    +
  • model_name_or_path (str) – If it’s a path to a local model, it loads the model from the path. Otherwise tries to download and load a model from HuggingFace Hub with the name.

  • +
  • model_class (Optional[Union[str, RerankerModelClass]], optional) – The reranker class to use. Defaults to None.

  • +
  • use_fp16 (bool, optional) – If true, use half-precision floating-point to speed up computation with a slight performance +degradation. Defaults to False.

  • +
  • trust_remote_code (Optional[bool], optional) – trust_remote_code for HF datasets or models. Defaults to None.

  • +
+
+
Raises:
+

ValueError

+
+
Returns:
+

The reranker class to load model, which is child class of AbsReranker.

+
+
Return type:
+

AbsReranker
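And the reranker counterpart, again as a hedged sketch with an assumed model id.

from FlagEmbedding import FlagAutoReranker

reranker = FlagAutoReranker.from_finetuned("BAAI/bge-reranker-v2-m3", use_fp16=True)  # assumed model id

pairs = [
    ["what is panda?", "The giant panda is a bear species endemic to China."],
    ["what is panda?", "Pandas is a Python library for data analysis."],
]
scores = reranker.compute_score(pairs)  # one relevance score per query-passage pair
print(scores)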

diff --git a/API/inference/embedder/decoder_only/BaseLLMEmbedder.html b/API/inference/embedder/decoder_only/BaseLLMEmbedder.html new file mode 100644 index 00000000..1d4a1b8d --- /dev/null +++ b/API/inference/embedder/decoder_only/BaseLLMEmbedder.html

BaseEmbedder

+
+
+class FlagEmbedding.inference.embedder.decoder_only.base.BaseLLMEmbedder(model_name_or_path: str, normalize_embeddings: bool = True, use_fp16: bool = True, query_instruction_for_retrieval: str | None = None, query_instruction_format: str = 'Instruct: {}\nQuery: {}', devices: None | str | List[str] = None, trust_remote_code: bool = False, cache_dir: str | None = None, batch_size: int = 256, query_max_length: int = 512, passage_max_length: int = 512, convert_to_numpy: bool = True, **kwargs: Any)[source]
+

Base embedder class for LLM like decoder only models.

+
+
Parameters:
+
    +
  • model_name_or_path (str) – If it’s a path to a local model, it loads the model from the path. Otherwise tries to download and load a model from HuggingFace Hub with the name.

  • +
  • normalize_embeddings (bool, optional) – If True, normalize the embedding vector. Defaults to True.

  • +
  • use_fp16 (bool, optional) – If true, use half-precision floating-point to speed up computation with a slight performance +degradation. Defaults to True.

  • +
  • query_instruction_for_retrieval (Optional[str], optional) – Query instruction for retrieval tasks, which will be used with query_instruction_format. Defaults to None.

  • +
  • query_instruction_format (str, optional) – The template for query_instruction_for_retrieval. Defaults to "Instruct: {}\nQuery: {}".

  • +
  • devices (Optional[Union[str, int, List[str], List[int]]], optional) – Devices to use for model inference. Defaults to None.

  • +
  • trust_remote_code (bool, optional) – trust_remote_code for HF datasets or models. Defaults to False.

  • +
  • cache_dir (Optional[str], optional) – Cache directory for the model. Defaults to None.

  • +
  • batch_size (int, optional) – Batch size for inference. Defaults to 256.

  • +
  • query_max_length (int, optional) – Maximum length for query. Defaults to 512.

  • +
  • passage_max_length (int, optional) – Maximum length for passage. Defaults to 512.

  • +
  • convert_to_numpy (bool, optional) – If True, the output embedding will be a Numpy array. Otherwise, it will be a Torch Tensor. +Defaults to True.

  • +
+
+
+
+
+DEFAULT_POOLING_METHOD
+

The default pooling method when running the model.

+
+ +
+ +
+

Methods

+
+
+BaseLLMEmbedder.encode_queries(queries: List[str] | str, batch_size: int | None = None, max_length: int | None = None, convert_to_numpy: bool | None = None, **kwargs: Any) ndarray | Tensor[source]
+

Encode the queries.

+
+
Parameters:
+
    +
  • queries (Union[List[str], str]) – Input queries to encode.

  • +
  • batch_size (Optional[int], optional) – Number of sentences for each iter. Defaults to None.

  • +
  • max_length (Optional[int], optional) – Maximum length of tokens. Defaults to None.

  • +
  • convert_to_numpy (Optional[bool], optional) – If True, the output embedding will be a Numpy array. Otherwise, it will +be a Torch Tensor. Defaults to None.

  • +
+
+
Returns:
+

Return the embedding vectors in a numpy array or tensor.

+
+
Return type:
+

Union[torch.Tensor, np.ndarray]
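A short usage sketch for an instruction-following decoder-only embedder; the model id and the instruction are assumptions, while the call pattern follows the methods documented here.

from FlagEmbedding.inference.embedder.decoder_only.base import BaseLLMEmbedder

# Sketch: the model id and task instruction below are assumptions.
embedder = BaseLLMEmbedder(
    "BAAI/bge-multilingual-gemma2",
    query_instruction_for_retrieval="Given a web search query, retrieve relevant passages that answer the query.",
    use_fp16=True,
)

queries = ["how much protein should a female eat"]
passages = ["As a general guideline, the CDC's average protein requirement for women is 46 grams per day."]

q_embs = embedder.encode_queries(queries)   # instruction is applied to queries only
p_embs = embedder.encode_corpus(passages)   # passages are encoded as-is
print(q_embs @ p_embs.T)                    # inner-product relevance scores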

+
+
+
+ +
+
+BaseLLMEmbedder.encode_corpus(corpus: List[str] | str, batch_size: int | None = None, max_length: int | None = None, convert_to_numpy: bool | None = None, **kwargs: Any) ndarray | Tensor[source]
+

Encode the corpus.

+
+
Parameters:
+
    +
  • corpus (Union[List[str], str]) – Input corpus to encode.

  • +
  • batch_size (Optional[int], optional) – Number of sentences for each iter. Defaults to None.

  • +
  • max_length (Optional[int], optional) – Maximum length of tokens. Defaults to None.

  • +
  • convert_to_numpy (Optional[bool], optional) – If True, the output embedding will be a Numpy array. Otherwise, it will +be a Torch Tensor. Defaults to None.

  • +
+
+
Returns:
+

Return the embedding vectors in a numpy array or tensor.

+
+
Return type:
+

Union[torch.Tensor, np.ndarray]

+
+
+
+ +
+
+BaseLLMEmbedder.encode(sentences: List[str] | str, batch_size: int | None = None, max_length: int | None = None, convert_to_numpy: bool | None = None, **kwargs: Any) ndarray | Tensor[source]
+

Encode the input sentences with the embedding model.

+
+
Parameters:
+
    +
  • sentences (Union[List[str], str]) – Input sentences to encode.

  • +
  • batch_size (Optional[int], optional) – Number of sentences for each iter. Defaults to None.

  • +
  • max_length (Optional[int], optional) – Maximum length of tokens. Defaults to None.

  • +
  • convert_to_numpy (Optional[bool], optional) – If True, the output embedding will be a Numpy array. Otherwise, it will +be a Torch Tensor. Defaults to None.

  • +
+
+
Returns:
+

return the embedding vectors in a numpy array or tensor.

+
+
Return type:
+

Union[torch.Tensor, np.ndarray]

+
+
+
+ +
+
+BaseLLMEmbedder.encode_single_device(sentences: List[str] | str, batch_size: int = 256, max_length: int = 512, convert_to_numpy: bool = True, device: str | None = None, **kwargs: Any)[source]
+

Encode input sentences by a single device.

+
+
Parameters:
+
    +
  • sentences (Union[List[str], str]) – Input sentences to encode.

  • +
  • batch_size (int, optional) – Number of sentences for each iter. Defaults to 256.

  • +
  • max_length (int, optional) – Maximum length of tokens. Defaults to 512.

  • +
  • convert_to_numpy (bool, optional) – If True, the output embedding will be a Numpy array. Otherwise, it will +be a Torch Tensor. Defaults to True.

  • +
  • device (Optional[str], optional) – Device to use for encoding. Defaults to None.

  • +
+
+
Returns:
+

return the embedding vectors in a numpy array or tensor.

+
+
Return type:
+

Union[torch.Tensor, np.ndarray]

diff --git a/API/inference/embedder/decoder_only/ICLLLMEmbedder.html b/API/inference/embedder/decoder_only/ICLLLMEmbedder.html new file mode 100644 index 00000000..6591949e --- /dev/null +++ b/API/inference/embedder/decoder_only/ICLLLMEmbedder.html

ICLLLMEmbedder

+
+
+class FlagEmbedding.inference.embedder.decoder_only.icl.ICLLLMEmbedder(model_name_or_path: str, normalize_embeddings: bool = True, use_fp16: bool = True, query_instruction_for_retrieval: str | None = None, query_instruction_format: str = '<instruct>{}\n<query>{}', suffix: str = '\n<response>', devices: None | str | List[str] = None, examples_for_task: List[dict] | None = None, examples_instruction_format: str = '<instruct>{}\n<query>{}\n<response>{}', trust_remote_code: bool = False, cache_dir: str | None = None, batch_size: int = 256, query_max_length: int = 512, passage_max_length: int = 512, convert_to_numpy: bool = True, **kwargs: Any)[source]
+

Embedder class for BGE-EN-icl.

+
+
Parameters:
+
    +
  • model_name_or_path (str) – If it’s a path to a local model, it loads the model from the path. Otherwise tries to download and load a model from HuggingFace Hub with the name.

  • +
  • normalize_embeddings (bool, optional) – If True, normalize the embedding vector. Defaults to True.

  • +
  • use_fp16 (bool, optional) – If true, use half-precision floating-point to speed up computation with a slight performance degradation. Defaults to True.

  • +
  • query_instruction_for_retrieval (Optional[str], optional) – Query instruction for retrieval tasks, which will be used with query_instruction_format. Defaults to None.

  • +
  • query_instruction_format (str, optional) – The template for query_instruction_for_retrieval. Defaults to "<instruct>{}\n<query>{}".

  • +
  • devices (Optional[Union[str, int, List[str], List[int]]], optional) – Devices to use for model inference. Defaults to None.

  • +
  • examples_for_task (Optional[List[dict]], optional) – Few-shot examples for the model to enhance model’s ability. +Defaults to None.

  • +
  • examples_instruction_format (str, optional) – Example format when using examples_for_task. Defaults to "<instruct>{}\n<query>{}\n<response>{}".

  • +
  • trust_remote_code (bool, optional) – trust_remote_code for HF datasets or models. Defaults to False.

  • +
  • cache_dir (Optional[str], optional) – Cache directory for the model. Defaults to None.

  • +
  • batch_size (int, optional) – Batch size for inference. Defaults to 256.

  • +
  • query_max_length (int, optional) – Maximum length for query. Defaults to 512.

  • +
  • passage_max_length (int, optional) – Maximum length for passage. Defaults to 512.

  • +
  • convert_to_numpy (bool, optional) – If True, the output embedding will be a Numpy array. Otherwise, it will be a Torch Tensor. +Defaults to True.

  • +
+
+
+
+
+DEFAULT_POOLING_METHOD
+

The default pooling method when running the model.

+
+ +
+ +
+

Methods

+
+
+ICLLLMEmbedder.encode_queries(queries: List[str] | str, batch_size: int | None = None, max_length: int | None = None, convert_to_numpy: bool | None = None, **kwargs: Any) ndarray | Tensor[source]
+

Encode the queries.

+
+
Parameters:
+
    +
  • queries (Union[List[str], str]) – Input queries to encode.

  • +
  • batch_size (Optional[int], optional) – Number of sentences for each iter. Defaults to None.

  • +
  • max_length (Optional[int], optional) – Maximum length of tokens. Defaults to None.

  • +
  • convert_to_numpy (Optional[bool], optional) – If True, the output embedding will be a Numpy array. Otherwise, it will +be a Torch Tensor. Defaults to None.

  • +
+
+
Returns:
+

Return the embedding vectors in a numpy array or tensor.

+
+
Return type:
+

Union[torch.Tensor, np.ndarray]

+
+
+
+ +
+
+ICLLLMEmbedder.encode_corpus(corpus: List[str] | str, batch_size: int | None = None, max_length: int | None = None, convert_to_numpy: bool | None = None, **kwargs: Any) ndarray | Tensor[source]
+

Encode the corpus.

+
+
Parameters:
+
    +
  • corpus (Union[List[str], str]) – Input corpus to encode.

  • +
  • batch_size (Optional[int], optional) – Number of sentences for each iter. Defaults to None.

  • +
  • max_length (Optional[int], optional) – Maximum length of tokens. Defaults to None.

  • +
  • convert_to_numpy (Optional[bool], optional) – If True, the output embedding will be a Numpy array. Otherwise, it will +be a Torch Tensor. Defaults to None.

  • +
+
+
Returns:
+

Return the embedding vectors in a numpy array or tensor.

+
+
Return type:
+

Union[torch.Tensor, np.ndarray]

+
+
+
+ +
+
+ICLLLMEmbedder.encode(sentences: List[str] | str, batch_size: int | None = None, max_length: int | None = None, convert_to_numpy: bool | None = None, **kwargs: Any) ndarray | Tensor[source]
+

Encode the input sentences with the embedding model.

+
+
Parameters:
+
    +
  • sentences (Union[List[str], str]) – Input sentences to encode.

  • +
  • batch_size (Optional[int], optional) – Number of sentences for each iter. Defaults to None.

  • +
  • max_length (Optional[int], optional) – Maximum length of tokens. Defaults to None.

  • +
  • convert_to_numpy (Optional[bool], optional) – If True, the output embedding will be a Numpy array. Otherwise, it will +be a Torch Tensor. Defaults to None.

  • +
+
+
Returns:
+

return the embedding vectors in a numpy array or tensor.

+
+
Return type:
+

Union[torch.Tensor, np.ndarray]

+
+
+
+ +
+
+ICLLLMEmbedder.set_examples(examples_for_task: List[dict] | None = None)[source]
+

Set the prefix to the provided examples.

+
+
Parameters:
+

examples_for_task (Optional[List[dict]], optional) – Few-shot examples for the model to enhance model’s ability. +Defaults to None.
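The expected shape of examples_for_task follows the examples_instruction_format template. A hedged sketch is shown below; the dictionary keys are assumptions inferred from that template and from get_detailed_example below, and the model id is a placeholder.

from FlagEmbedding.inference.embedder.decoder_only.icl import ICLLLMEmbedder

# Assumed few-shot example format, with keys inferred from get_detailed_example below.
examples_for_task = [
    {
        "instruct": "Given a web search query, retrieve relevant passages that answer the query.",
        "query": "what is a virtual interface",
        "response": "A virtual interface is a software-defined abstraction of a physical network interface.",
    },
]

embedder = ICLLLMEmbedder(
    "BAAI/bge-en-icl",                    # assumed model id
    examples_for_task=examples_for_task,  # used to build the few-shot prompt prefix
)

# The prefix can also be swapped later without reloading the model.
embedder.set_examples(examples_for_task)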

+
+
+
+ +
+
+static ICLLLMEmbedder.get_detailed_example(instruction_format: str, instruction: str, query: str, response: str)[source]
+

Combine the instruction and sentence along with the instruction format.

+
+
Parameters:
+
    +
  • instruction_format (str) – Format for instruction.

  • +
  • instruction (str) – The text of instruction.

  • +
  • query (str) – The text of example query.

  • +
  • response (str) – The text of example response.

  • +
+
+
Returns:
+

The complete example following the given format.

+
+
Return type:
+

str

+
+
+
+ +
+
+ICLLLMEmbedder.encode_queries_single_device(queries: List[str] | str, batch_size: int = 256, max_length: int = 512, convert_to_numpy: bool = True, device: str | None = None, **kwargs: Any)[source]
+

Encode queries by a single device.

+
+
Parameters:
+
    +
  • queries (Union[List[str], str]) – Input queries to encode.

  • +
  • batch_size (int, optional) – Number of queries for each iter. Defaults to 256.

  • +
  • max_length (int, optional) – Maximum length of tokens. Defaults to 512.

  • +
  • convert_to_numpy (bool, optional) – If True, the output embedding will be a Numpy array. Otherwise, it will +be a Torch Tensor. Defaults to True.

  • +
  • device (Optional[str], optional) – Device to use for encoding. Defaults to None.

  • +
+
+
Returns:
+

return the embedding vectors in a numpy array or tensor.

+
+
Return type:
+

Union[torch.Tensor, np.ndarray]

+
+
+
+ +
+
+ICLLLMEmbedder.encode_single_device(sentences: List[str] | str, batch_size: int = 256, max_length: int = 512, convert_to_numpy: bool = True, device: str | None = None, **kwargs: Any)[source]
+

Encode input sentences by a single device.

+
+
Parameters:
+
    +
  • sentences (Union[List[str], str]) – Input sentences to encode.

  • +
  • batch_size (int, optional) – Number of sentences for each iter. Defaults to 256.

  • +
  • max_length (int, optional) – Maximum length of tokens. Defaults to 512.

  • +
  • convert_to_numpy (bool, optional) – If True, the output embedding will be a Numpy array. Otherwise, it will +be a Torch Tensor. Defaults to True.

  • +
  • device (Optional[str], optional) – Device to use for encoding. Defaults to None.

  • +
+
+
Returns:
+

return the embedding vectors in a numpy array or tensor.

+
+
Return type:
+

Union[torch.Tensor, np.ndarray]

diff --git a/API/inference/embedder/embedder.html b/API/inference/embedder/embedder.html new file mode 100644 index 00000000..c9f6f626 --- /dev/null +++ b/API/inference/embedder/embedder.html

Embedder

diff --git a/API/inference/embedder/encoder_only/BaseEmbedder.html b/API/inference/embedder/encoder_only/BaseEmbedder.html new file mode 100644 index 00000000..9afbaeb0 --- /dev/null +++ b/API/inference/embedder/encoder_only/BaseEmbedder.html

BaseEmbedder

+
+
+class FlagEmbedding.inference.embedder.encoder_only.base.BaseEmbedder(model_name_or_path: str, normalize_embeddings: bool = True, use_fp16: bool = True, query_instruction_for_retrieval: str | None = None, query_instruction_format: str = '{}{}', devices: None | str | List[str] = None, pooling_method: str = 'cls', trust_remote_code: bool = False, cache_dir: str | None = None, batch_size: int = 256, query_max_length: int = 512, passage_max_length: int = 512, convert_to_numpy: bool = True, **kwargs: Any)[source]
+

Base embedder for encoder only models.

+
+
Parameters:
+
    +
  • model_name_or_path (str) – If it’s a path to a local model, it loads the model from the path. Otherwise tries to download and load a model from HuggingFace Hub with the name.

  • +
  • normalize_embeddings (bool, optional) – If True, normalize the embedding vector. Defaults to True.

  • +
  • use_fp16 (bool, optional) – If true, use half-precision floating-point to speed up computation with a slight performance +degradation. Defaults to True.

  • +
  • query_instruction_for_retrieval (Optional[str], optional) – Query instruction for retrieval tasks, which will be used with query_instruction_format. Defaults to None.

  • +
  • query_instruction_format (str, optional) – The template for query_instruction_for_retrieval. Defaults to "{}{}".

  • +
  • devices (Optional[Union[str, int, List[str], List[int]]], optional) – Devices to use for model inference. Defaults to None.

  • +
  • pooling_method (str, optional) – Pooling method to get embedding vector from the last hidden state. Defaults to "cls".

  • +
  • trust_remote_code (bool, optional) – trust_remote_code for HF datasets or models. Defaults to False.

  • +
  • cache_dir (Optional[str], optional) – Cache directory for the model. Defaults to None.

  • +
  • batch_size (int, optional) – Batch size for inference. Defaults to 256.

  • +
  • query_max_length (int, optional) – Maximum length for query. Defaults to 512.

  • +
  • passage_max_length (int, optional) – Maximum length for passage. Defaults to 512.

  • +
  • convert_to_numpy (bool, optional) – If True, the output embedding will be a Numpy array. Otherwise, it will be a Torch Tensor. +Defaults to True.

  • +
+
+
+
+
+DEFAULT_POOLING_METHOD
+

The default pooling method when running the model.

+
+ +
+ +
+

Methods

+
+
+BaseEmbedder.encode_queries(queries: List[str] | str, batch_size: int | None = None, max_length: int | None = None, convert_to_numpy: bool | None = None, **kwargs: Any) ndarray | Tensor[source]
+

Encode the queries.

+
+
Parameters:
+
    +
  • queries (Union[List[str], str]) – Input queries to encode.

  • +
  • batch_size (Optional[int], optional) – Number of sentences for each iter. Defaults to None.

  • +
  • max_length (Optional[int], optional) – Maximum length of tokens. Defaults to None.

  • +
  • convert_to_numpy (Optional[bool], optional) – If True, the output embedding will be a Numpy array. Otherwise, it will +be a Torch Tensor. Defaults to None.

  • +
+
+
Returns:
+

Return the embedding vectors in a numpy array or tensor.

+
+
Return type:
+

Union[torch.Tensor, np.ndarray]

+
+
+
+ +
+
+BaseEmbedder.encode_corpus(corpus: List[str] | str, batch_size: int | None = None, max_length: int | None = None, convert_to_numpy: bool | None = None, **kwargs: Any) ndarray | Tensor[source]
+

Encode the corpus using the instruction if provided.

+
+
Parameters:
+
    +
  • corpus (Union[List[str], str]) – Input corpus to encode.

  • +
  • batch_size (Optional[int], optional) – Number of sentences for each iter. Defaults to None.

  • +
  • max_length (Optional[int], optional) – Maximum length of tokens. Defaults to None.

  • +
  • convert_to_numpy (Optional[bool], optional) – If True, the output embedding will be a Numpy array. Otherwise, it will +be a Torch Tensor. Defaults to None.

  • +
+
+
Returns:
+

Return the embedding vectors in a numpy array or tensor.

+
+
Return type:
+

Union[torch.Tensor, np.ndarray]

+
+
+
+ +
+
+BaseEmbedder.encode(sentences: List[str] | str, batch_size: int | None = None, max_length: int | None = None, convert_to_numpy: bool | None = None, **kwargs: Any) ndarray | Tensor[source]
+

Encode the input sentences with the embedding model.

+
+
Parameters:
+
    +
  • sentences (Union[List[str], str]) – Input sentences to encode.

  • +
  • batch_size (Optional[int], optional) – Number of sentences for each iter. Defaults to None.

  • +
  • max_length (Optional[int], optional) – Maximum length of tokens. Defaults to None.

  • +
  • convert_to_numpy (Optional[bool], optional) – If True, the output embedding will be a Numpy array. Otherwise, it will +be a Torch Tensor. Defaults to None.

  • +
+
+
Returns:
+

return the embedding vectors in a numpy array or tensor.

+
+
Return type:
+

Union[torch.Tensor, np.ndarray]
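A compact usage sketch tying the encode methods together; the model id and the instruction are assumptions following the public BGE examples.

from FlagEmbedding.inference.embedder.encoder_only.base import BaseEmbedder

embedder = BaseEmbedder(
    "BAAI/bge-base-en-v1.5",  # assumed model id
    query_instruction_for_retrieval="Represent this sentence for searching relevant passages:",
    use_fp16=True,
)

queries = ["What is the capital of France?"]
corpus = ["Paris is the capital and most populous city of France.", "Berlin is the capital of Germany."]

q_embs = embedder.encode_queries(queries)  # instruction prepended to each query
p_embs = embedder.encode_corpus(corpus)    # corpus encoded without the instruction
scores = q_embs @ p_embs.T                 # cosine similarity, since embeddings are normalized by default
print(scores)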

+
+
+
+ +
+
+BaseEmbedder.encode_single_device(sentences: List[str] | str, batch_size: int = 256, max_length: int = 512, convert_to_numpy: bool = True, device: str | None = None, **kwargs: Any)[source]
+

Encode input sentences by a single device.

+
+
Parameters:
+
    +
  • sentences (Union[List[str], str]) – Input sentences to encode.

  • +
  • batch_size (int, optional) – Number of sentences for each iter. Defaults to 256.

  • +
  • max_length (int, optional) – Maximum length of tokens. Defaults to 512.

  • +
  • convert_to_numpy (bool, optional) – If True, the output embedding will be a Numpy array. Otherwise, it will +be a Torch Tensor. Defaults to True.

  • +
  • device (Optional[str], optional) – Device to use for encoding. Defaults to None.

  • +
+
+
Returns:
+

return the embedding vectors in a numpy array or tensor.

+
+
Return type:
+

Union[torch.Tensor, np.ndarray]

+
+
+
+ +
+
+BaseEmbedder.pooling(last_hidden_state: Tensor, attention_mask: Tensor | None = None)[source]
+

The pooling function.

+
+
Parameters:
+
    +
  • last_hidden_state (torch.Tensor) – The last hidden state of the model.

  • +
  • attention_mask (Optional[torch.Tensor], optional) – Attention mask. Defaults to None.

  • +
+
+
Raises:
+

NotImplementedError – pooling method not implemented.

+
+
Returns:
+

The embedding vectors after pooling.

+
+
Return type:
+

torch.Tensor
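The two most common choices behind this method are CLS pooling and mean pooling. A small sketch of both follows; it is an illustration, not the library's exact code.

import torch

def pooling_sketch(last_hidden_state: torch.Tensor,
                   attention_mask: torch.Tensor,
                   method: str = "cls") -> torch.Tensor:
    if method == "cls":
        # Take the hidden state of the first ([CLS]) token.
        return last_hidden_state[:, 0]
    if method == "mean":
        # Average over non-padding tokens only.
        mask = attention_mask.unsqueeze(-1).float()
        summed = (last_hidden_state * mask).sum(dim=1)
        counts = mask.sum(dim=1).clamp(min=1e-9)
        return summed / counts
    raise NotImplementedError(f"pooling method {method} not implemented")

hidden = torch.randn(2, 4, 8)
mask = torch.tensor([[1, 1, 1, 0], [1, 1, 1, 1]])
print(pooling_sketch(hidden, mask, "cls").shape)   # (2, 8)
print(pooling_sketch(hidden, mask, "mean").shape)  # (2, 8)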

diff --git a/API/inference/embedder/encoder_only/M3Embedder.html b/API/inference/embedder/encoder_only/M3Embedder.html new file mode 100644 index 00000000..4eef719c --- /dev/null +++ b/API/inference/embedder/encoder_only/M3Embedder.html

M3Embedder

+
+
+class FlagEmbedding.inference.embedder.encoder_only.m3.M3Embedder(model_name_or_path: str, normalize_embeddings: bool = True, use_fp16: bool = True, query_instruction_for_retrieval: str | None = None, query_instruction_format: str = '{}{}', devices: None | str | List[str] = None, pooling_method: str = 'cls', trust_remote_code: bool = False, cache_dir: str | None = None, colbert_dim: int = -1, batch_size: int = 256, query_max_length: int = 512, passage_max_length: int = 512, return_dense: bool = True, return_sparse: bool = False, return_colbert_vecs: bool = False, **kwargs: Any)[source]
+

Embedder class for BGE-M3.

+
+
Parameters:
+
    +
  • model_name_or_path (str) – If it’s a path to a local model, it loads the model from the path. Otherwise tries to download and load a model from HuggingFace Hub with the name.

  • +
  • normalize_embeddings (bool, optional) – If True, normalize the dense embedding vector. Defaults to True.

  • +
  • use_fp16 (bool, optional) – If true, use half-precision floating-point to speed up computation with a slight performance +degradation. Defaults to True.

  • +
  • query_instruction_for_retrieval (Optional[str], optional) – Query instruction for retrieval tasks, which will be used with query_instruction_format. Defaults to None.

  • query_instruction_format (str, optional) – The template for query_instruction_for_retrieval. Defaults to "{}{}".

  • +
  • devices (Optional[Union[str, int, List[str], List[int]]], optional) – Devices to use for model inference. Defaults to None.

  • +
  • pooling_method (str, optional) – Pooling method to get embedding vector from the last hidden state. Defaults to "cls".

  • +
  • trust_remote_code (bool, optional) – trust_remote_code for HF datasets or models. Defaults to False.

  • +
  • cache_dir (Optional[str], optional) – Cache directory for the model. Defaults to None.

  • +
  • colbert_dim (int, optional) – Dimension of colbert linear. Return the hidden_size if -1. Defaults to -1.

  • +
  • batch_size (int, optional) – Batch size for inference. Defaults to 256.

  • +
  • query_max_length (int, optional) – Maximum length for query. Defaults to 512.

  • +
  • passage_max_length (int, optional) – Maximum length for passage. Defaults to 512.

  • +
  • return_dense (bool, optional) – If true, will return the dense embedding. Defaults to True.

  • +
  • return_sparse (bool, optional) – If true, will return the sparse embedding. Defaults to False.

  • +
  • return_colbert_vecs (bool, optional) – If true, will return the colbert vectors. Defaults to False.

  • +
+
+
+
+
+DEFAULT_POOLING_METHOD
+

The default pooling method when running the model.

+
+ +
+ +
+

Methods

+
+
+M3Embedder.encode_queries(queries: List[str] | str, batch_size: int | None = None, max_length: int | None = None, return_dense: bool | None = None, return_sparse: bool | None = None, return_colbert_vecs: bool | None = None, **kwargs: Any) Dict[Literal['dense_vecs', 'lexical_weights', 'colbert_vecs'], ndarray | List[Dict[str, float]] | List[ndarray]][source]
+

Encode the queries using the specified representation types.

+
+
Parameters:
+
    +
  • queries (Union[List[str], str]) – The input queries to encode.

  • +
  • batch_size (Optional[int], optional) – Number of sentences for each iter. Defaults to None.

  • +
  • max_length (Optional[int], optional) – Maximum length of tokens. Defaults to None.

  • +
  • return_dense (Optional[bool], optional) – If True, compute and return dense embedding. Defaults to None.

  • +
  • return_sparse (Optional[bool], optional) – If True, compute and return sparse embedding. Defaults to None.

  • +
  • return_colbert_vecs (Optional[bool], optional) – If True, compute and return colbert vectors. Defaults to None.

  • +
+
+
Returns:
+

Dict[Literal[“dense_vecs”, “lexical_weights”, “colbert_vecs”], Union[np.ndarray, List[Dict[str, float]], List[np.ndarray]]]

+
+
+
+ +
+
+M3Embedder.encode_corpus(corpus: List[str] | str, batch_size: int | None = None, max_length: int | None = None, return_dense: bool | None = None, return_sparse: bool | None = None, return_colbert_vecs: bool | None = None, **kwargs: Any) Dict[Literal['dense_vecs', 'lexical_weights', 'colbert_vecs'], ndarray | List[Dict[str, float]] | List[ndarray]][source]
+

Encode the corpus using the specified representation types.

+
+
Parameters:
+
    +
  • corpus (Union[List[str], str]) – The input corpus to encode.

  • +
  • batch_size (Optional[int], optional) – Number of sentences for each iter. Defaults to None.

  • +
  • max_length (Optional[int], optional) – Maximum length of tokens. Defaults to None.

  • +
  • return_dense (Optional[bool], optional) – If True, compute and return dense embedding. Defaults to None.

  • +
  • return_sparse (Optional[bool], optional) – If True, compute and return sparse embedding. Defaults to None.

  • +
  • return_colbert_vecs (Optional[bool], optional) – If True, compute and return colbert vectors. Defaults to None.

  • +
+
+
Returns:
+

Dict[Literal[“dense_vecs”, “lexical_weights”, “colbert_vecs”], Union[np.ndarray, List[Dict[str, float]], List[np.ndarray]]]

+
+
+
+ +
+
+M3Embedder.encode(sentences: List[str] | str, batch_size: int | None = None, max_length: int | None = None, return_dense: bool | None = None, return_sparse: bool | None = None, return_colbert_vecs: bool | None = None, **kwargs: Any) Dict[Literal['dense_vecs', 'lexical_weights', 'colbert_vecs'], ndarray | List[Dict[str, float]] | List[ndarray]][source]
+

Encode the sentences using the specified representation types.

+
+
Parameters:
+
    +
  • sentences (Union[List[str], str]) – The input sentences to encode.

  • +
  • batch_size (Optional[int], optional) – Number of sentences for each iter. Defaults to None.

  • +
  • max_length (Optional[int], optional) – Maximum length of tokens. Defaults to None.

  • +
  • return_dense (Optional[bool], optional) – If True, compute and return dense embedding. Defaults to None.

  • +
  • return_sparse (Optional[bool], optional) – If True, compute and return sparse embedding. Defaults to None.

  • +
  • return_colbert_vecs (Optional[bool], optional) – If True, compute and return colbert vectors. Defaults to None.

  • +
+
+
Returns:
+

Dict[Literal[“dense_vecs”, “lexical_weights”, “colbert_vecs”], Union[np.ndarray, List[Dict[str, float]], List[np.ndarray]]]

+
+
+
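A small usage sketch of encode, assuming the model instance constructed in the earlier sketch; the output keys follow the return annotation above.

sentences = ["BGE-M3 supports dense, sparse and multi-vector retrieval."]
out = model.encode(
    sentences,
    return_dense=True,
    return_sparse=True,
    return_colbert_vecs=True,
)
print(out["dense_vecs"].shape)       # dense embeddings, one row per sentence
print(out["lexical_weights"][0])     # token-id -> weight dict for the first sentence
print(out["colbert_vecs"][0].shape)  # per-token multi-vector embeddings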
+ +
+
+M3Embedder.convert_id_to_token(lexical_weights: List[Dict])[source]
+

Convert the ids back to tokens.

+
+
Parameters:
+

lexical_weights (List[Dict]) – A list of dictionaries of id & weights.

+
+
Returns:
+

A list of dictionaries of tokens & weights.

+
+
Return type:
+

List[Dict]

+
+
+
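For example, the lexical weights returned by encode use token ids as keys; the sketch below, reusing the out dict from the previous sketch, maps them back to readable tokens.

readable = model.convert_id_to_token(out["lexical_weights"])
print(readable[0])  # e.g. a dict of tokens to weights (values shown here would be illustrative)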
+ +
+
+M3Embedder.compute_lexical_matching_score(lexical_weights_1: Dict[str, float] | List[Dict[str, float]], lexical_weights_2: Dict[str, float] | List[Dict[str, float]]) ndarray | float[source]
+

Compute the lexical matching score of two given lexical weights.

+
+
Parameters:
+
    +
  • lexical_weights_1 (Union[Dict[str, float], List[Dict[str, float]]]) – First array of lexical weights.

  • +
  • lexical_weights_2 (Union[Dict[str, float], List[Dict[str, float]]]) – Second array of lexical weights.

  • +
+
+
Returns:
+

The computed lexical matching scores across the two arrays of lexical weights.

+
+
Return type:
+

Union[np.ndarray, float]

+
+
+
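A minimal sketch, reusing the model from the earlier sketches: encode a query and a passage with return_sparse=True, then score the overlap of their lexical weights.

q = model.encode(["what is BGE-M3?"], return_sparse=True)
p = model.encode(["BGE-M3 is a multi-functional embedding model."], return_sparse=True)
score = model.compute_lexical_matching_score(
    q["lexical_weights"][0], p["lexical_weights"][0]
)
print(score)  # a single float for one pair of lexical-weight dicts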
+ +
+
+M3Embedder.colbert_score(q_reps, p_reps)[source]
+

Compute colbert scores of input queries and passages.

+
+
Parameters:
+
    +
  • q_reps (np.ndarray) – Multi-vector embeddings for queries.

  • +
  • p_reps (np.ndarray) – Multi-vector embeddings for passages/corpus.

  • +
+
+
Returns:
+

Computed colbert scores.

+
+
Return type:
+

torch.Tensor

+
+
+
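Another sketch reusing the same model: encode with return_colbert_vecs=True and score one query against one passage, where q_reps and p_reps are the per-token vectors of a single query and passage.

q = model.encode(["what is BGE-M3?"], return_colbert_vecs=True)
p = model.encode(["BGE-M3 is a multi-functional embedding model."], return_colbert_vecs=True)
score = model.colbert_score(q["colbert_vecs"][0], p["colbert_vecs"][0])
print(float(score))  # late-interaction (colbert) relevance score for the pair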
+ +
+
+M3Embedder.encode_single_device(sentences: List[str] | str, batch_size: int = 256, max_length: int = 512, return_dense: bool = True, return_sparse: bool = False, return_colbert_vecs: bool = False, device: str | None = None, **kwargs: Any)[source]
+

Encode the input sentences using a single device.

+
+
Parameters:
+
    +
  • sentences (Union[List[str], str]) – The input sentences to encode.

  • +
  • batch_size (Optional[int], optional) – Number of sentences for each iter. Defaults to 256.

  • +
  • max_length (Optional[int], optional) – Maximum length of tokens. Defaults to 512.

  • +
  • return_dense (Optional[bool], optional) – If True, compute and return dense embedding. Defaults to True.

  • +
  • return_sparse (Optional[bool], optional) – If True, compute and return sparse embedding. Defaults to False.

  • +
  • return_colbert_vecs (Optional[bool], optional) – If True, compute and return colbert vectors. Defaults to False.

  • +
  • device (Optional[str], optional) – The device to use. Defaults to None.

  • +
+
+
Returns:
+

Dict[Literal[“dense_vecs”, “lexical_weights”, “colbert_vecs”], Union[np.ndarray, List[Dict[str, float]], List[np.ndarray]]]

+
+
+
+ +
+
+M3Embedder.compute_score(sentence_pairs: List[Tuple[str, str]] | Tuple[str, str], batch_size: int | None = None, max_query_length: int | None = None, max_passage_length: int | None = None, weights_for_different_modes: None | List[float] = None, **kwargs: Any) Dict[Literal['colbert', 'sparse', 'dense', 'sparse+dense', 'colbert+sparse+dense'], List[float]][source]
+

Compute relevance scores from the dense, sparse, and colbert representations, along with their weighted combinations.

+
+
Parameters:
+
    +
  • sentence_pairs (Union[List[Tuple[str, str]], Tuple[str, str]]) – Pairs of sentences to compute the scores for.

  • +
  • batch_size (Optional[int], optional) – Number of sentence pairs in each batch. Defaults to None.

  • +
  • max_query_length (Optional[int], optional) – Maximum length of tokens for queries. Defaults to None.

  • +
  • max_passage_length (Optional[int], optional) – Maximum length of tokens for passages. Defaults to None.

  • +
  • weights_for_different_modes (Optional[List[float]], optional) – The weights for combining scores from the different modes. Defaults to None.

  • +
+
+
Returns:
+

Dict[Literal[“colbert”, “sparse”, “dense”, “sparse+dense”, “colbert+sparse+dense”], List[float]]

+
+
+
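An end-to-end sketch of compute_score, assuming the same model instance; the three weights are an illustrative choice for combining the dense, sparse, and colbert scores.

pairs = [
    ("what is BGE-M3?", "BGE-M3 is a multi-functional embedding model."),
    ("what is BGE-M3?", "The weather is nice today."),
]
scores = model.compute_score(
    pairs,
    weights_for_different_modes=[0.4, 0.2, 0.4],  # illustrative weights for the different modes
)
print(scores["dense"])                  # dense-only scores
print(scores["colbert+sparse+dense"])   # weighted combination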
+ +
+
+M3Embedder.compute_score_multi_process(sentence_pairs: List[Tuple[str, str]], pool: Dict[Literal['input', 'output', 'processes'], Any], **kwargs)[source]
+
+ +
+
+M3Embedder.compute_score_single_device(sentence_pairs: List[Tuple[str, str]] | Tuple[str, str], batch_size: int = 256, max_query_length: int = 512, max_passage_length: int = 512, weights_for_different_modes: List[float] | None = None, device: str | None = None, **kwargs: Any) Dict[Literal['colbert', 'sparse', 'dense', 'sparse+dense', 'colbert+sparse+dense'], List[float]][source]
+

Compute relevance scores from the dense, sparse, and colbert representations, along with their weighted combinations.

+
+
Parameters:
+
    +
  • sentence_pairs (Union[List[Tuple[str, str]], Tuple[str, str]]) – Pairs of sentences to compute the score.

  • +
  • batch_size (int, optional) – Number of sentence pairs in each batch. Defaults to 256.

  • +
  • max_query_length (int, optional) – Maximum length of tokens for queries. Defaults to 512.

  • +
  • max_passage_length (int, optional) – Maximum length of tokens for passages. Defaults to 512.

  • +
  • weights_for_different_modes (Optional[List[float]], optional) – The weights for different methods. Defaults to None.

  • +
  • device (Optional[str], optional) – The device to use. Defaults to None.

  • +
+
+
Returns:
+

Dict[Literal[“colbert”, “sparse”, “dense”, “sparse+dense”, “colbert+sparse+dense”], List[float]]

+
+
+
+ +
+
+ +
+
+ +
+ +
+
+ + + + + \ No newline at end of file diff --git a/API/inference/reranker/decoder_only/BaseLLMReranker.html b/API/inference/reranker/decoder_only/BaseLLMReranker.html new file mode 100644 index 00000000..c577f874 --- /dev/null +++ b/API/inference/reranker/decoder_only/BaseLLMReranker.html @@ -0,0 +1,601 @@ + BaseLLMReranker - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+
+ +
+ +
+ +
+
+
+

BaseLLMReranker

+
+
+class FlagEmbedding.inference.reranker.decoder_only.base.BaseLLMReranker(model_name_or_path: str, peft_path: str | None = None, use_fp16: bool = False, use_bf16: bool = False, query_instruction_for_rerank: str = 'A: ', query_instruction_format: str = '{}{}', passage_instruction_for_rerank: str = 'B: ', passage_instruction_format: str = '{}{}', cache_dir: str | None = None, trust_remote_code: bool = False, devices: str | List[str] | List[int] | None = None, prompt: str | None = None, batch_size: int = 128, query_max_length: int | None = None, max_length: int = 512, normalize: bool = False, **kwargs: Any)[source]
+

Base reranker class for LLM-style, decoder-only models.

+
+
Parameters:
+
    +
  • model_name_or_path (str) – If it’s a path to a local model, it loads the model from the path. Otherwise tries to download and +load a model from HuggingFace Hub with the name.

  • +
  • peft_path (Optional[str], optional) – Path to the PEFT config. Defaults to None.

  • +
  • use_fp16 (bool, optional) – If true, use half-precision floating-point to speed up computation with a slight performance degradation. Defaults to False.

  • +
  • use_bf16 (bool, optional) – Another type of half-precision floating-point; you can use bf16 if the hardware supports it. Defaults to False.

  • +
  • query_instruction_for_rerank (str, optional) – Query instruction for retrieval tasks, which will be used with query_instruction_format. Defaults to "A: ".

  • +
  • query_instruction_format (str, optional) – The template for query_instruction_for_rerank. Defaults to "{}{}".

  • +
  • passage_instruction_for_rerank (str, optional) – Passage instruction for retrieval tasks, which will be used with passage_instruction_format. Defaults to "B: ".

  • +
  • passage_instruction_format (str, optional) – The template for passage. Defaults to “{}{}”.

  • +
  • cache_dir (Optional[str], optional) – Cache directory for the model. Defaults to None.

  • +
  • trust_remote_code (bool, optional) – Whether to trust remote code when loading the model. Defaults to False.

  • +
  • devices (Union[str, List[str], List[int]], optional) – Devices to use for model inference, such as [“cuda:0”] or [“0”]. +Defaults to None.

  • +
  • prompt (Optional[str], optional) – Prompt for the specific task. Defaults to None.

  • +
  • batch_size (int, optional) – Batch size for inference. Defaults to 128.

  • +
  • query_max_length (int, optional) – Maximum length for queries. If not specified, will be 3/4 of max_length. +Defaults to None.

  • +
  • max_length (int, optional) – Maximum length of passages. Defaults to 512.

  • +
  • normalize (bool, optional) – If True, use Sigmoid to normalize the results. Defaults to False.

  • +
+
+
+
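For illustration, a minimal sketch of constructing the reranker documented above and scoring a pair; BAAI/bge-reranker-v2-gemma is an assumed example checkpoint, and compute_score is assumed to be the inherited entry point from the abstract reranker base.

from FlagEmbedding.inference.reranker.decoder_only.base import BaseLLMReranker  # path from the signature above

# Assumed example checkpoint for an LLM-based (decoder-only) reranker.
reranker = BaseLLMReranker("BAAI/bge-reranker-v2-gemma", use_fp16=True, devices=["cuda:0"])

pairs = [("what is panda?", "The giant panda is a bear species endemic to China.")]
scores = reranker.compute_score(pairs)  # assumed to be inherited from the abstract reranker
print(scores)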
+ +
+

Methods

+
+
+class FlagEmbedding.inference.reranker.decoder_only.base.BaseLLMReranker.compute_score_single_gpu(self, sentence_pairs: List[Tuple[str, str]] | Tuple[str, str], batch_size: int | None = None, query_max_length: int | None = None, max_length: int | None = None, prompt: str | None = None, normalize: bool | None = None, use_dataloader: bool = False, num_workers: int = None, device: str | None = None, **kwargs: Any)
+

Compute the relevance scores using a single GPU.

+
+
Parameters:
+
    +
  • sentence_pairs (Union[List[Tuple[str, str]], Tuple[str, str]]) – Input sentence pairs to compute scores.

  • +
  • batch_size (Optional[int], optional) – Number of inputs for each iter. Defaults to None.

  • +
  • query_max_length (Optional[int], optional) – Maximum length of tokens of queries. Defaults to None.

  • +
  • max_length (Optional[int], optional) – Maximum length of tokens. Defaults to None.

  • +
  • prompt (Optional[str], optional) – Prompt for the specific task. Defaults to None.

  • +
  • normalize (Optional[bool], optional) – If True, use Sigmoid to normalize the results. Defaults to None.

  • +
  • use_dataloader (bool, optional) – If True, will use the dataloader to load the datasets. Defaults to False.

  • +
  • num_workers (int, optional) – Number of workers for dataloader. Defaults to None.

  • +
  • device (Optional[str], optional) – Device to use for computation. Defaults to None.

  • +
+
+
Returns:
+

The computed scores.

+
+
Return type:
+

List[float]

+
+
+
+ +
+
+ +
+
+ +
+ +
+
+ + + + + \ No newline at end of file diff --git a/API/inference/reranker/decoder_only/LayerWiseLLMReranker.html b/API/inference/reranker/decoder_only/LayerWiseLLMReranker.html new file mode 100644 index 00000000..afbf560a --- /dev/null +++ b/API/inference/reranker/decoder_only/LayerWiseLLMReranker.html @@ -0,0 +1,603 @@ + LayerWiseLLMReranker - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+
+ +
+ +
+ +
+
+
+

LayerWiseLLMReranker

+
+
+class FlagEmbedding.inference.reranker.decoder_only.layerwise.LayerWiseLLMReranker(model_name_or_path: str, peft_path: str | None = None, use_fp16: bool = False, use_bf16: bool = False, query_instruction_for_rerank: str = 'A: ', query_instruction_format: str = '{}{}', passage_instruction_for_rerank: str = 'B: ', passage_instruction_format: str = '{}{}', cache_dir: str | None = None, trust_remote_code: bool = False, devices: str | List[str] | List[int] | None = None, cutoff_layers: List[int] | None = None, prompt: str | None = None, batch_size: int = 128, query_max_length: int | None = None, max_length: int = 512, normalize: bool = False, **kwargs: Any)[source]
+

Base reranker class for layerwise, LLM-style decoder-only models.

+
+
Parameters:
+
    +
  • model_name_or_path (str) – If it’s a path to a local model, it loads the model from the path. Otherwise tries to download and +load a model from HuggingFace Hub with the name.

  • +
  • peft_path (Optional[str], optional) – Path to the PEFT config. Defaults to None.

  • +
  • use_fp16 (bool, optional) – If true, use half-precision floating-point to speed up computation with a slight performance degradation. Defaults to False.

  • +
  • use_bf16 (bool, optional) – Another type of half-precision floating-point; you can use bf16 if the hardware supports it. Defaults to False.

  • +
  • query_instruction_for_rerank (str, optional) – Query instruction for retrieval tasks, which will be used with query_instruction_format. Defaults to "A: ".

  • +
  • query_instruction_format (str, optional) – The template for query_instruction_for_rerank. Defaults to "{}{}".

  • +
  • passage_instruction_for_rerank (str, optional) – Passage instruction for retrieval tasks, which will be used with passage_instruction_format. Defaults to "B: ".

  • +
  • passage_instruction_format (str, optional) – The template for passage. Defaults to “{}{}”.

  • +
  • cache_dir (Optional[str], optional) – Cache directory for the model. Defaults to None.

  • +
  • trust_remote_code (bool, optional) – Whether to trust remote code when loading the model. Defaults to False.

  • +
  • devices (Union[str, List[str], List[int]], optional) – Devices to use for model inference, such as [“cuda:0”] or [“0”]. +Defaults to None.

  • +
  • cutoff_layers (Optional[List[int]]) – Pick which layers are used for computing the score. Defaults to None.

  • +
  • prompt (Optional[str], optional) – Prompt for the specific task. Defaults to None.

  • +
  • batch_size (int, optional) – Batch size for inference. Defaults to 128.

  • +
  • query_max_length (int, optional) – Maximum length for queries. If not specified, will be 3/4 of max_length. +Defaults to None.

  • +
  • max_length (int, optional) – Maximum length of passages. Defaults to 512.

  • +
  • normalize (bool, optional) – If True, use Sigmoid to normalize the results. Defaults to False.

  • +
+
+
+
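A hedged sketch for the layerwise variant; BAAI/bge-reranker-v2-minicpm-layerwise is an assumed example checkpoint, the cutoff_layers value is an illustrative choice, and compute_score is assumed to be inherited from the abstract reranker base.

from FlagEmbedding.inference.reranker.decoder_only.layerwise import LayerWiseLLMReranker  # path from the signature above

# Assumed example checkpoint for a layerwise LLM reranker.
reranker = LayerWiseLLMReranker(
    "BAAI/bge-reranker-v2-minicpm-layerwise",
    use_fp16=True,
    cutoff_layers=[28],  # score using layer 28 only (illustrative choice)
)

pairs = [("what is panda?", "The giant panda is a bear species endemic to China.")]
scores = reranker.compute_score(pairs)  # assumed to be inherited from the abstract reranker
print(scores)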
+ +
+

Methods

+
+
+class FlagEmbedding.inference.reranker.decoder_only.layerwise.LayerWiseLLMReranker.compute_score_single_gpu(self, sentence_pairs: List[Tuple[str, str]] | Tuple[str, str], batch_size: int | None = None, query_max_length: int | None = None, max_length: int | None = None, cutoff_layers: List[int] | None = None, prompt: str | None = None, normalize: bool | None = None, use_dataloader: bool = False, num_workers: int | None = None, device: str | None = None, **kwargs: Any)
+

Compute the relevance scores using a single GPU.

+
+
Parameters:
+
    +
  • sentence_pairs (Union[List[Tuple[str, str]], Tuple[str, str]]) – Input sentence pairs to compute scores.

  • +
  • batch_size (Optional[int], optional) – Number of inputs for each iter. Defaults to None.

  • +
  • query_max_length (Optional[int], optional) – Maximum length of tokens of queries. Defaults to None.

  • +
  • max_length (Optional[int], optional) – Maximum length of tokens. Defaults to None.

  • +
  • cutoff_layers (Optional[List[int]], optional) – Pick which layers are used for computing the score. Defaults to None.

  • +
  • prompt (Optional[str], optional) – Prompt for the specific task. Defaults to None.

  • +
  • normalize (Optional[bool], optional) – If True, use Sigmoid to normalize the results. Defaults to None.

  • +
  • use_dataloader (bool, optional) – If True, will use the dataloader to load the datasets. Defaults to False.

  • +
  • num_workers (int, optional) – Number of workers for dataloader. Defaults to None.

  • +
  • device (Optional[str], optional) – Device to use for computation. Defaults to None.

  • +
+
+
Returns:
+

The computed scores.

+
+
Return type:
+

List[float]

+
+
+
+ +
+
+ +
+
+ +
+ +
+
+ + + + + \ No newline at end of file diff --git a/API/inference/reranker/decoder_only/LightweightLLMReranker.html b/API/inference/reranker/decoder_only/LightweightLLMReranker.html new file mode 100644 index 00000000..f80a990d --- /dev/null +++ b/API/inference/reranker/decoder_only/LightweightLLMReranker.html @@ -0,0 +1,608 @@ + LightweightLLMReranker - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+
+ +
+ +
+ +
+
+
+

LightweightLLMReranker

+
+
+class FlagEmbedding.inference.reranker.decoder_only.lightweight.LightweightLLMReranker(model_name_or_path: str, peft_path: str | None = None, use_fp16: bool = False, use_bf16: bool = False, query_instruction_for_rerank: str = 'A: ', query_instruction_format: str = '{}{}', passage_instruction_for_rerank: str = 'B: ', passage_instruction_format: str = '{}{}', cache_dir: str | None = None, trust_remote_code: bool = False, devices: str | List[str] | List[int] | None = None, cutoff_layers: List[int] | None = None, compress_layers: List[int] = [8], compress_ratio: int = 1, prompt: str | None = None, batch_size: int = 128, query_max_length: int | None = None, max_length: int = 512, normalize: bool = False, **kwargs: Any)[source]
+

Base reranker class for lightweight, LLM-style decoder-only models.

+
+
Parameters:
+
    +
  • model_name_or_path (str) – If it’s a path to a local model, it loads the model from the path. Otherwise tries to download and +load a model from HuggingFace Hub with the name.

  • +
  • peft_path (Optional[str], optional) – Path to the PEFT config. Defaults to None.

  • +
  • use_fp16 (bool, optional) – If true, use half-precision floating-point to speed up computation with a slight performance degradation. Defaults to False.

  • +
  • use_bf16 (bool, optional) – Another type of half-precision floating-point; you can use bf16 if the hardware supports it. Defaults to False.

  • +
  • query_instruction_for_rerank (str, optional) – Query instruction for retrieval tasks, which will be used with query_instruction_format. Defaults to "A: ".

  • +
  • query_instruction_format (str, optional) – The template for query_instruction_for_rerank. Defaults to "{}{}".

  • +
  • passage_instruction_for_rerank (str, optional) – Passage instruction for retrieval tasks, which will be used with passage_instruction_format. Defaults to "B: ".

  • +
  • passage_instruction_format (str, optional) – The template for passage. Defaults to “{}{}”.

  • +
  • cache_dir (Optional[str], optional) – Cache directory for the model. Defaults to None.

  • +
  • trust_remote_code (bool, optional) – Whether to trust remote code when loading the model. Defaults to False.

  • +
  • devices (Union[str, List[str], List[int]], optional) – Devices to use for model inference, such as [“cuda:0”] or [“0”]. +Defaults to None.

  • +
  • cutoff_layers (Optional[List[int]]) – Pick which layers are used for computing the score. Defaults to None.

  • +
  • compress_layers (List[int], optional) – Choose the layers to compress. Defaults to [8].

  • +
  • compress_ratio (int, optional) – Ratio to compress the selected layers, supported ratios: [1, 2, 4, 8]. +Defaults to 1.

  • +
  • prompt (Optional[str], optional) – Prompt for the specific task. Defaults to None.

  • +
  • batch_size (int, optional) – Batch size for inference. Defaults to 128.

  • +
  • query_max_length (int, optional) – Maximum length for queries. If not specified, will be 3/4 of max_length. +Defaults to None.

  • +
  • max_length (int, optional) – Maximum length of passages. Defaults to 512.

  • +
  • normalize (bool, optional) – If True, use Sigmoid to normalize the results. Defaults to False.

  • +
+
+
+
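A hedged sketch for the lightweight variant; BAAI/bge-reranker-v2.5-gemma2-lightweight is an assumed example checkpoint, the cutoff_layers, compress_layers, and compress_ratio values are illustrative choices, and compute_score is assumed to be inherited from the abstract reranker base.

from FlagEmbedding.inference.reranker.decoder_only.lightweight import LightweightLLMReranker  # path from the signature above

# Assumed example checkpoint for a lightweight LLM reranker.
reranker = LightweightLLMReranker(
    "BAAI/bge-reranker-v2.5-gemma2-lightweight",
    use_fp16=True,
    cutoff_layers=[28],        # illustrative layer choice for scoring
    compress_layers=[24, 40],  # illustrative layers to compress
    compress_ratio=2,          # one of the supported ratios [1, 2, 4, 8]
)

pairs = [("what is panda?", "The giant panda is a bear species endemic to China.")]
scores = reranker.compute_score(pairs)  # assumed to be inherited from the abstract reranker
print(scores)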
+ +
+

Methods

+
+
+class FlagEmbedding.inference.reranker.decoder_only.lightweight.LightweightLLMReranker.compute_score_single_gpu(self, sentence_pairs: List[Tuple[str, str]] | Tuple[str, str], batch_size: int | None = None, query_max_length: int | None = None, max_length: int | None = None, cutoff_layers: List[int] | None = None, compress_layer: List[int] | None = None, compress_layers: List[int] | None = None, compress_ratio: int | None = None, prompt: str | None = None, normalize: bool | None = None, device: str | None = None, **kwargs: Any)
+

Compute the relevance scores using a single GPU.

+
+
Parameters:
+
    +
  • sentence_pairs (Union[List[Tuple[str, str]], Tuple[str, str]]) – Input sentence pairs to compute scores.

  • +
  • batch_size (Optional[int], optional) – Number of inputs for each iter. Defaults to None.

  • +
  • query_max_length (Optional[int], optional) – Maximum length of tokens of queries. Defaults to None.

  • +
  • max_length (Optional[int], optional) – Maximum length of tokens. Defaults to None.

  • +
  • cutoff_layers (Optional[List[int]], optional) – Pick which layers are used for computing the score. Defaults to None.

  • +
  • compress_layer (Optional[List[int]]) – Deprecated, use compress_layers instead. Defaults to None.

  • +
  • compress_layers (Optional[List[int]]) – Selected layers to compress. Defaults to None.

  • +
  • compress_ratio (Optional[int]) – Ratio to compress the selected layers, supported ratios: [1, 2, 4, 8]. +Defaults to None.

  • +
  • prompt (Optional[str], optional) – Prompt for the specific task. Defaults to None.

  • +
  • normalize (Optional[bool], optional) – If True, use Sigmoid to normalize the results. Defaults to None.

  • +
  • device (Optional[str], optional) – Device to use for computation. Defaults to None.

  • +
+
+
Returns:
+

The computed scores.

+
+
Return type:
+

List[float]

+
+
+
+ +
+
+ +
+
+ +
+ +
+
+ + + + + \ No newline at end of file diff --git a/API/inference/reranker/encoder_only/BaseReranker.html b/API/inference/reranker/encoder_only/BaseReranker.html new file mode 100644 index 00000000..bca4b124 --- /dev/null +++ b/API/inference/reranker/encoder_only/BaseReranker.html @@ -0,0 +1,590 @@ + BaseReranker - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+
+ +
+ +
+ +
+
+
+

BaseReranker

+
+
+class FlagEmbedding.inference.reranker.encoder_only.base.BaseReranker(model_name_or_path: str, use_fp16: bool = False, query_instruction_for_rerank: str | None = None, query_instruction_format: str = '{}{}', passage_instruction_for_rerank: str | None = None, passage_instruction_format: str = '{}{}', trust_remote_code: bool = False, cache_dir: str | None = None, devices: str | List[str] | List[int] | None = None, batch_size: int = 128, query_max_length: int | None = None, max_length: int = 512, normalize: bool = False, **kwargs: Any)[source]
+

Base reranker class for encoder-only models.

+
+
Parameters:
+
    +
  • model_name_or_path (str) – If it’s a path to a local model, it loads the model from the path. Otherwise tries to download and +load a model from HuggingFace Hub with the name.

  • +
  • use_fp16 (bool, optional) – If true, use half-precision floating-point to speed up computation with a slight performance +degradation. Defaults to False.

  • +
  • query_instruction_for_rerank (Optional[str], optional) – Query instruction for retrieval tasks, which will be used with query_instruction_format. Defaults to None.

  • +
  • query_instruction_format (str, optional) – The template for query_instruction_for_rerank. Defaults to "{}{}".

  • +
  • passage_instruction_format (str, optional) – The template for passage. Defaults to “{}{}”.

  • +
  • cache_dir (Optional[str], optional) – Cache directory for the model. Defaults to None.

  • +
  • devices (Optional[Union[str, List[str], List[int]]], optional) – Devices to use for model inference. Defaults to None.

  • +
  • batch_size (int, optional) – Batch size for inference. Defaults to 128.

  • +
  • query_max_length (Optional[int], optional) – Maximum length for queries. If not specified, will be 3/4 of max_length. +Defaults to None.

  • +
  • max_length (int, optional) – Maximum length of passages. Defaults to 512.

  • +
  • normalize (bool, optional) – If True, use Sigmoid to normalize the results. Defaults to False.

  • +
+
+
+
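For illustration, a minimal sketch of the encoder-only reranker documented above; BAAI/bge-reranker-large is an assumed example checkpoint, and compute_score is assumed to be inherited from the abstract reranker, forwarding normalize to the single-GPU scorer.

from FlagEmbedding.inference.reranker.encoder_only.base import BaseReranker  # path from the signature above

# Assumed example checkpoint for an encoder-only (cross-encoder) reranker.
reranker = BaseReranker("BAAI/bge-reranker-large", use_fp16=True)

pairs = [
    ("what is panda?", "The giant panda is a bear species endemic to China."),
    ("what is panda?", "Paris is the capital of France."),
]
scores = reranker.compute_score(pairs, normalize=True)  # sigmoid-normalized scores in [0, 1]
print(scores)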
+ +
+

Methods

+
+
+class FlagEmbedding.inference.reranker.encoder_only.base.BaseReranker.compute_score_single_gpu(self, sentence_pairs: List[Tuple[str, str]] | Tuple[str, str], batch_size: int | None = None, query_max_length: int | None = None, max_length: int | None = None, normalize: bool | None = None, device: str | None = None, **kwargs: Any)
+

Compute the relevance scores of the input sentence pairs using a single GPU.

+
+
Parameters:
+
    +
  • sentence_pairs (Union[List[Tuple[str, str]], Tuple[str, str]]) – Input sentence pairs to compute scores.

  • +
  • batch_size (Optional[int], optional) – Number of inputs for each iter. Defaults to None.

  • +
  • query_max_length (Optional[int], optional) – Maximum length of tokens of queries. Defaults to None.

  • +
  • max_length (Optional[int], optional) – Maximum length of tokens. Defaults to None.

  • +
  • normalize (Optional[bool], optional) – If True, use Sigmoid to normalize the results. Defaults to None.

  • +
  • device (Optional[str], optional) – Device to use for computation. Defaults to None.

  • +
+
+
Returns:
+

Computed scores of queries and passages.

+
+
Return type:
+

List[float]

+
+
+
+ +
+
+ +
+
+ +
+ +
+
+ + + + + \ No newline at end of file diff --git a/API/inference/reranker/reranker.html b/API/inference/reranker/reranker.html new file mode 100644 index 00000000..8e915b03 --- /dev/null +++ b/API/inference/reranker/reranker.html @@ -0,0 +1,549 @@ + Reranker - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+
+ +
+ +
+ +
+ +
+ +
+ +
+
+ + + + + \ No newline at end of file diff --git a/C-MTEB.html b/C-MTEB.html new file mode 100644 index 00000000..a61e481a --- /dev/null +++ b/C-MTEB.html @@ -0,0 +1,492 @@ + <no title> - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+
+ +
+ +
+ +
+
+ +
+
+
+ + +
+
+
+
+ +
+
+ +
+
+ +
+
+ + + + + \ No newline at end of file diff --git a/Introduction/installation.html b/Introduction/installation.html new file mode 100644 index 00000000..aa054340 --- /dev/null +++ b/Introduction/installation.html @@ -0,0 +1,563 @@ + Installation - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+
+ +
+ +
+ +
+
+
+

Installation

+
+

Using pip:

+

If you do not want to finetune the models, you can install the package without the finetune dependency:

+
pip install -U FlagEmbedding
+
+
+

If you want to finetune the models, you can install the package with the finetune dependency:

+
pip install -U FlagEmbedding[finetune]
+
+
+
+
+

Install from source:

+

Clone the repository and install

+
git clone https://github.com/FlagOpen/FlagEmbedding.git
+cd FlagEmbedding
+# If you do not want to finetune the models, you can install the package without the finetune dependency:
+pip install  .
+# If you want to finetune the models, you can install the package with the finetune dependency:
+pip install  .[finetune]
+
+
+

For development in editable mode:

+
# If you do not want to finetune the models, you can install the package without the finetune dependency:
+pip install -e .
+# If you want to finetune the models, you can install the package with the finetune dependency:
+pip install -e .[finetune]
+
+
+
+
+ +
+
+ +
+ +
+
+ + + + + \ No newline at end of file diff --git a/Introduction/quick_start.html b/Introduction/quick_start.html new file mode 100644 index 00000000..6d265a6c --- /dev/null +++ b/Introduction/quick_start.html @@ -0,0 +1,792 @@ + Quick Start - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+
+ +
+ +
+ +
+
+
+

Quick Start

+

In this tutorial, we will show how to use BGE models on a text retrieval task in 5 minutes.

+
+

Step 0: Preparation

+

First, install FlagEmbedding in the environment.

+
+
+
%pip install -U FlagEmbedding
+
+
+
+
+

Below is a super tiny corpus with only 10 sentences, which will be the dataset we use.

+

Each sentence is a concise description of a famous person in a specific domain.

+
+
+
corpus = [
+    "Michael Jackson was a legendary pop icon known for his record-breaking music and dance innovations.",
+    "Fei-Fei Li is a professor in Stanford University, revolutionized computer vision with the ImageNet project.",
+    "Brad Pitt is a versatile actor and producer known for his roles in films like 'Fight Club' and 'Once Upon a Time in Hollywood.'",
+    "Geoffrey Hinton, as a foundational figure in AI, received Turing Award for his contribution in deep learning.",
+    "Eminem is a renowned rapper and one of the best-selling music artists of all time.",
+    "Taylor Swift is a Grammy-winning singer-songwriter known for her narrative-driven music.",
+    "Sam Altman leads OpenAI as its CEO, with astonishing works of GPT series and pursuing safe and beneficial AI.",
+    "Morgan Freeman is an acclaimed actor famous for his distinctive voice and diverse roles.",
+    "Andrew Ng spread AI knowledge globally via public courses on Coursera and Stanford University.",
+    "Robert Downey Jr. is an iconic actor best known for playing Iron Man in the Marvel Cinematic Universe.",
+]
+
+
+
+
+

We want to know which of these people could be an expert on neural networks, and who that person is.

+

Thus we generate the following query:

+
+
+
query = "Who could be an expert of neural network?"
+
+
+
+
+
+
+

Step 1: Text -> Embedding

+

First, let’s use a BGE embedding model to create sentence embeddings for the corpus.

+
+
+
from FlagEmbedding import FlagModel
+
+# get the BGE embedding model
+model = FlagModel('BAAI/bge-base-en-v1.5',
+                  query_instruction_for_retrieval="Represent this sentence for searching relevant passages:",
+                  use_fp16=True)
+
+# get the embedding of the query and corpus
+corpus_embeddings = model.encode(corpus)
+query_embedding = model.encode(query)
+
+
+
+
+

The embedding of each sentence is a vector with length 768.

+
+
+
print("shape of the query embedding:  ", query_embedding.shape)
+print("shape of the corpus embeddings:", corpus_embeddings.shape)
+
+
+
+
+
shape of the query embedding:   (768,)
+shape of the corpus embeddings: (10, 768)
+
+
+
+
+

Run the following print line to take a look at the first 10 elements of the query embedding vector.

+
+
+
print(query_embedding[:10])
+
+
+
+
+
[-0.00790005 -0.00683443 -0.00806659  0.00756918  0.04374858  0.02838556
+  0.02357143 -0.02270943 -0.03611493 -0.03038301]
+
+
+
+
+
+
+

Step 2: Calculate Similarity

+

Now, we have the embeddings of the query and the corpus. The next step is to calculate the similarity between the query and each sentence in the corpus. Here we use the dot product/inner product as our similarity metric.

+
+
+
sim_scores = query_embedding @ corpus_embeddings.T
+print(sim_scores)
+
+
+
+
+
[0.39290053 0.6031525  0.32672375 0.6082418  0.39446455 0.35350388
+ 0.4626108  0.40196604 0.5284606  0.36792332]
+
+
+
+
+

The result is a list of scores representing the query’s similarity to: [sentence 0, sentence 1, sentence 2, …]

+
+
+

Step 3: Ranking

+

After we have the similarity score of the query to each sentence in the corpus, we can rank them from large to small.

+
+
+
# get the indices in sorted order
+sorted_indices = sorted(range(len(sim_scores)), key=lambda k: sim_scores[k], reverse=True)
+print(sorted_indices)
+
+
+
+
+
[3, 1, 8, 6, 7, 4, 0, 9, 5, 2]
+
+
+
+
+

Now from the ranking, the sentence with index 3 is the best answer to our query “Who could be an expert of neural network?”

+

And that person is Geoffrey Hinton!

+
+
+
print(corpus[3])
+
+
+
+
+
Geoffrey Hinton, as a foundational figure in AI, received Turing Award for his contribution in deep learning.
+
+
+
+
+

According to the order of indices, we can print out the ranking of people that our little retriever produced.

+
+
+
# iteratively print the score and corresponding sentences in descending order
+
+for i in sorted_indices:
+    print(f"Score of {sim_scores[i]:.3f}: \"{corpus[i]}\"")
+
+
+
+
+
Score of 0.608: "Geoffrey Hinton, as a foundational figure in AI, received Turing Award for his contribution in deep learning."
+Score of 0.603: "Fei-Fei Li is a professor in Stanford University, revolutionized computer vision with the ImageNet project."
+Score of 0.528: "Andrew Ng spread AI knowledge globally via public courses on Coursera and Stanford University."
+Score of 0.463: "Sam Altman leads OpenAI as its CEO, with astonishing works of GPT series and pursuing safe and beneficial AI."
+Score of 0.402: "Morgan Freeman is an acclaimed actor famous for his distinctive voice and diverse roles."
+Score of 0.394: "Eminem is a renowned rapper and one of the best-selling music artists of all time."
+Score of 0.393: "Michael Jackson was a legendary pop icon known for his record-breaking music and dance innovations."
+Score of 0.368: "Robert Downey Jr. is an iconic actor best known for playing Iron Man in the Marvel Cinematic Universe."
+Score of 0.354: "Taylor Swift is a Grammy-winning singer-songwriter known for her narrative-driven music."
+Score of 0.327: "Brad Pitt is a versatile actor and producer known for his roles in films like 'Fight Club' and 'Once Upon a Time in Hollywood.'"
+
+
+
+
+

From the ranking, not surprisingly, the similarity scores between the query and the descriptions of Geoffrey Hinton and Fei-Fei Li are much higher than the others, followed by those of Andrew Ng and Sam Altman.

+

While the key phrase “neural network” in the query does not appear in any of those descriptions, the BGE embedding model is still powerful enough to capture the semantic meaning of the query and corpus.

+
+
+

Step 4: Evaluate

+

We’ve seen that the embedding model performed pretty well on the “neural network” query. How does it perform in general?

+

Let’s generate a very small dataset of queries and corresponding ground truth answers. Note that the ground truth answers are the indices of sentences in the corpus.

+
+
+
queries = [
+    "Who could be an expert of neural network?",
+    "Who might had won Grammy?",
+    "Won Academy Awards",
+    "One of the most famous female singers.",
+    "Inventor of AlexNet",
+]
+
+
+
+
+
+
+
ground_truth = [
+    [1, 3],
+    [0, 4, 5],
+    [2, 7, 9],
+    [5],
+    [3],
+]
+
+
+
+
+

Here we repeat the steps we covered above to get the predicted ranking of each query.

+
+
+
# use bge model to generate embeddings for all the queries
+queries_embedding = model.encode(queries)
+# compute similarity scores
+scores = queries_embedding @ corpus_embeddings.T
+# get the final rankings
+rankings = [sorted(range(len(sim_scores)), key=lambda k: sim_scores[k], reverse=True) for sim_scores in scores]
+rankings
+
+
+
+
+
[[3, 1, 8, 6, 7, 4, 0, 9, 5, 2],
+ [5, 0, 3, 4, 1, 9, 7, 2, 6, 8],
+ [3, 2, 7, 5, 9, 0, 1, 4, 6, 8],
+ [5, 0, 4, 7, 1, 9, 2, 3, 6, 8],
+ [3, 1, 8, 6, 0, 7, 5, 9, 4, 2]]
+
+
+
+
+

Mean Reciprocal Rank (MRR) is a widely used metric in information retrieval to evaluate the effectiveness of a system. Here we use it to get a very rough idea of how our system performs.

+
+
+
def MRR(preds, labels, cutoffs):
+    mrr = [0 for _ in range(len(cutoffs))]
+    for pred, label in zip(preds, labels):
+        for i, c in enumerate(cutoffs):
+            for j, index in enumerate(pred):
+                if j < c and index in label:
+                    mrr[i] += 1/(j+1)
+                    break
+    mrr = [k/len(preds) for k in mrr]
+    return mrr
+
+
+
+
+

We choose 1 and 5 as our cutoffs, with results of 0.8 and 0.9, respectively.

+
+
+
cutoffs = [1, 5]
+mrrs = MRR(rankings, ground_truth, cutoffs)
+for i, c in enumerate(cutoffs):
+    print(f"MRR@{c}: {mrrs[i]}")
+
+
+
+
+
MRR@1: 0.8
+MRR@5: 0.9
+
+
+
+
+
+
+ +
+
+ +
+ +
+
+ + + + + \ No newline at end of file diff --git a/_images/BAAI_logo.png b/_images/BAAI_logo.png new file mode 100644 index 00000000..c39cc6fd Binary files /dev/null and b/_images/BAAI_logo.png differ diff --git a/_images/bge_logo.jpg b/_images/bge_logo.jpg new file mode 100644 index 00000000..e9560649 Binary files /dev/null and b/_images/bge_logo.jpg differ diff --git a/_modules/FlagEmbedding/abc/finetune/embedder/AbsArguments.html b/_modules/FlagEmbedding/abc/finetune/embedder/AbsArguments.html new file mode 100644 index 00000000..6312e4d9 --- /dev/null +++ b/_modules/FlagEmbedding/abc/finetune/embedder/AbsArguments.html @@ -0,0 +1,622 @@ + FlagEmbedding.abc.finetune.embedder.AbsArguments - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+
+
+ +
+ +
+
+

Source code for FlagEmbedding.abc.finetune.embedder.AbsArguments

+import os
+from typing import Optional
+from dataclasses import dataclass, field
+
+from transformers import TrainingArguments
+
+
+
+[docs] +@dataclass +class AbsEmbedderModelArguments: + """ + Abstract class for model arguments. + """ + + model_name_or_path: str = field( + metadata={"help": "The model checkpoint for initialization."} + ) + config_name: str = field( + default=None, + metadata={"help": "Pretrained config name or path if not the same as model_name."} + ) + tokenizer_name: str = field( + default=None, + metadata={"help": "Pretrained tokenizer name or path if not the same as model_name."} + ) + cache_dir: str = field( + default=None, + metadata={"help": "Where do you want to store the pre-trained models downloaded from s3."} + ) + trust_remote_code: bool = field( + default=False, + metadata={"help": "Trust remote code"} + ) + token: str = field( + default_factory=lambda: os.getenv('HF_TOKEN', None), + metadata={"help": "The token to use when accessing the model."} + )
+ + + +
+[docs] +@dataclass +class AbsEmbedderDataArguments: + """ + Abstract class for data arguments. + """ + train_data: str = field( + default=None, metadata={ + "help": "One or more paths to training data. `query: str`, `pos: List[str]`, `neg: List[str]` are required in the training data.", + "nargs": "+" + } + ) + cache_path: Optional[str] = field( + default=None, metadata={"help": "Where do you want to store the cached data"} + ) + train_group_size: int = field(default=8) + + query_max_len: int = field( + default=32, + metadata={ + "help": "The maximum total input sequence length after tokenization for passage. Sequences longer than this will be truncated." + }, + ) + + passage_max_len: int = field( + default=128, + metadata={ + "help": "The maximum total input sequence length after tokenization for passage. Sequences longer than this will be truncated." + }, + ) + + pad_to_multiple_of: Optional[int] = field( + default=None, + metadata={ + "help": "If set will pad the sequence to be a multiple of the provided value." + }, + ) + + max_example_num_per_dataset: int = field( + default=100000000, metadata={"help": "the max number of examples for each dataset"} + ) + + query_instruction_for_retrieval: str= field( + default=None, metadata={"help": "instruction for query"} + ) + query_instruction_format: str = field( + default="{}{}", metadata={"help": "format for query instruction"} + ) + + knowledge_distillation: bool = field( + default=False, + metadata={"help": "Use knowledge distillation when `pos_scores: List[float]` and `neg_scores: List[float]` are in features of training data"} + ) + + passage_instruction_for_retrieval: Optional[str] = field( + default=None, metadata={"help": "instruction for passage"} + ) + passage_instruction_format: Optional[str] = field( + default="{}{}", metadata={"help": "format for passage instruction"} + ) + + shuffle_ratio: float = field( + default=0.0, metadata={"help": "The ratio of shuffling the text"} + ) + + # Parameters for SameDatasetDataArguments + same_dataset_within_batch: bool = field( + default=False, metadata={"help": "All samples in the same batch comes from the same dataset."} + ) + small_threshold: int = field( + default=0, + metadata={"help": "The threshold of small dataset. All small dataset in the same directory will be merged into one dataset."} + ) + drop_threshold: int = field( + default=0, + metadata={"help": "The threshold for dropping merged small dataset. If the number of examples in the merged small dataset is less than this threshold, it will be dropped."} + ) + + def __post_init__(self): + for train_dir in self.train_data: + if not os.path.exists(train_dir): + raise FileNotFoundError(f"cannot find file: {train_dir}, please set a true path")
+ + + +@dataclass +class AbsEmbedderTrainingArguments(TrainingArguments): + negatives_cross_device: bool = field(default=False, metadata={"help": "share negatives across devices"}) + temperature: Optional[float] = field(default=0.02, metadata={"help": "temperature used for similarity score"}) + fix_position_embedding: bool = field(default=False, metadata={"help": "Freeze the parameters of position embeddings"}) + sentence_pooling_method: str = field(default='cls', metadata={"help": "the pooling method. Available options: cls, mean, last_token. Default: cls", "choices": ['cls', 'mean', 'last_token']}) + normalize_embeddings: bool = field(default=True, metadata={"help": "whether to normalize the embeddings"}) + sub_batch_size: Optional[int] = field(default=None, metadata={"help": "sub batch size for training"}) + kd_loss_type: str = field(default='kl_div', metadata={"help": "the loss type for knowledge distillation. Available options: kl_div, m3_kd_loss. Default: kl_div.", "choices": ['kl_div', 'm3_kd_loss']}) +
+
+
+
+ + +
+
+
+
+ +
+
+ +
+
+ +
+
+ + + + + \ No newline at end of file diff --git a/_modules/FlagEmbedding/abc/finetune/embedder/AbsDataset.html b/_modules/FlagEmbedding/abc/finetune/embedder/AbsDataset.html new file mode 100644 index 00000000..f9924adf --- /dev/null +++ b/_modules/FlagEmbedding/abc/finetune/embedder/AbsDataset.html @@ -0,0 +1,1141 @@ + FlagEmbedding.abc.finetune.embedder.AbsDataset - FlagEmbedding
+
+
+ +
+ +
+
+ +
+ +
+
+ +
+
+
+
+
+ +
+ +
+
+

Source code for FlagEmbedding.abc.finetune.embedder.AbsDataset

+import os
+import math
+import random
+import logging
+import datasets
+import numpy as np
+import torch.distributed as dist
+from dataclasses import dataclass
+from torch.utils.data import Dataset
+from transformers import (
+    PreTrainedTokenizer, 
+    DataCollatorWithPadding,
+    TrainerCallback,
+    TrainerState,
+    TrainerControl
+)
+
+from .AbsArguments import AbsEmbedderDataArguments, AbsEmbedderTrainingArguments
+
+logger = logging.getLogger(__name__)
+
+
+
+[docs] +class AbsEmbedderTrainDataset(Dataset): + """Abstract class for training dataset. + + Args: + args (AbsEmbedderDataArguments): Data arguments. + tokenizer (PreTrainedTokenizer): Tokenizer to use. + """ + def __init__( + self, + args: AbsEmbedderDataArguments, + tokenizer: PreTrainedTokenizer + ): + self.args = args + self.tokenizer = tokenizer + self.shuffle_ratio = args.shuffle_ratio + + train_datasets = [] + for data_dir in args.train_data: + if not os.path.isdir(data_dir): + if not (data_dir.endswith('.json') or data_dir.endswith('.jsonl')): continue + temp_dataset = self._load_dataset(data_dir) + if len(temp_dataset) == 0: continue + train_datasets.append(temp_dataset) + else: + for file in os.listdir(data_dir): + if not (file.endswith('.json') or file.endswith('.jsonl')): continue + temp_dataset = self._load_dataset(os.path.join(data_dir, file)) + if len(temp_dataset) == 0: continue + train_datasets.append(temp_dataset) + self.dataset = datasets.concatenate_datasets(train_datasets) + +
+[docs] + def _load_dataset(self, file_path: str): + """Load dataset from path. + + Args: + file_path (str): Path to load the datasets from. + + Raises: + ValueError: `pos_scores` and `neg_scores` not found in the features of training data + + Returns: + datasets.Dataset: Loaded HF dataset. + """ + if dist.get_rank() == 0: + logger.info(f'loading data from {file_path} ...') + + temp_dataset = datasets.load_dataset('json', data_files=file_path, split='train', cache_dir=self.args.cache_path) + if len(temp_dataset) > self.args.max_example_num_per_dataset: + temp_dataset = temp_dataset.select(random.sample(list(range(len(temp_dataset))), self.args.max_example_num_per_dataset)) + if not self.args.knowledge_distillation: + if 'pos_scores' in temp_dataset.column_names: + temp_dataset = temp_dataset.remove_columns(['pos_scores']) + if 'neg_scores' in temp_dataset.column_names: + temp_dataset = temp_dataset.remove_columns(['neg_scores']) + else: + if 'pos_scores' not in temp_dataset.column_names or 'neg_scores' not in temp_dataset.column_names: + raise ValueError(f"`pos_scores` and `neg_scores` not found in the features of training data in {file_path}, which is necessary when using knowledge distillation.") + return temp_dataset
+ + +
+[docs] + def _shuffle_text(self, text): + """shuffle the input text. + + Args: + text (str): Input text. + + Returns: + str: Shuffled text. + """ + if self.shuffle_ratio > 0 and len(text) > 100 and random.random() < self.shuffle_ratio: + split_text = [] + chunk_size = len(text)//3 + 1 + for i in range(0, len(text), chunk_size): + split_text.append(text[i:i+chunk_size]) + random.shuffle(split_text) + return " ".join(split_text) + else: + return text
+ + + def __len__(self): + return len(self.dataset) + + def __getitem__(self, item): + data = self.dataset[item] + train_group_size = self.args.train_group_size + + query = data['query'] + if self.args.query_instruction_for_retrieval is not None: + query = self.args.query_instruction_format.format( + data['prompt'] if 'prompt' in data else self.args.query_instruction_for_retrieval, + query + ) + + passages = [] + teacher_scores = [] + + assert isinstance(data['pos'], list) and isinstance(data['neg'], list) + + pos_idx = random.choice(list(range(len(data['pos'])))) + passages.append(self._shuffle_text(data['pos'][pos_idx])) + + neg_all_idx = list(range(len(data['neg']))) + if len(data['neg']) < train_group_size - 1: + num = math.ceil((train_group_size - 1) / len(data['neg'])) + neg_idxs = random.sample(neg_all_idx * num, train_group_size - 1) + else: + neg_idxs = random.sample(neg_all_idx, self.args.train_group_size - 1) + for neg_idx in neg_idxs: + passages.append(data['neg'][neg_idx]) + + if self.args.knowledge_distillation: + assert isinstance(data['pos_scores'], list) and isinstance(data['neg_scores'], list) + teacher_scores.append(data['pos_scores'][pos_idx]) + for neg_idx in neg_idxs: + teacher_scores.append(data['neg_scores'][neg_idx]) + if not all(isinstance(score, (int, float)) for score in teacher_scores): + raise ValueError(f"pos_score or neg_score must be digit") + else: + teacher_scores = None + + if self.args.passage_instruction_for_retrieval is not None: + passages = [ + self.args.passage_instruction_format.format( + self.args.passage_instruction_for_retrieval, p + ) + for p in passages + ] + + return query, passages, teacher_scores
+ + +
+[docs] +@dataclass +class AbsEmbedderCollator(DataCollatorWithPadding): + """ + The abstract embedder collator. + """ + query_max_len: int = 32 + passage_max_len: int = 128 + sub_batch_size: int = -1 + + def __call__(self, features): + queries = [f[0] for f in features] + passages = [f[1] for f in features] + teacher_scores = [f[2] for f in features] + if teacher_scores[0] is None: + teacher_scores = None + elif isinstance(teacher_scores[0], list): + teacher_scores = sum(teacher_scores, []) + + if isinstance(queries[0], list): + queries = sum(queries, []) + if isinstance(passages[0], list): + passages = sum(passages, []) + + queries_inputs = self.tokenizer( + queries, + truncation=True, + max_length=self.query_max_len, + return_tensors=None + ) + passages_inputs = self.tokenizer( + passages, + truncation=True, + max_length=self.passage_max_len, + return_tensors=None + ) + + if self.sub_batch_size is None or self.sub_batch_size <= 0: + q_collated = self.tokenizer.pad( + queries_inputs, + padding=self.padding, + max_length=self.query_max_len, + pad_to_multiple_of=self.pad_to_multiple_of, + return_tensors=self.return_tensors + ) + d_collated = self.tokenizer.pad( + passages_inputs, + padding=self.padding, + max_length=self.passage_max_len, + pad_to_multiple_of=self.pad_to_multiple_of, + return_tensors=self.return_tensors + ) + else: + batch_size = self.sub_batch_size + + q_collated = [] + for i in range(0, len(queries_inputs['attention_mask']), batch_size): + start = i + end = min(len(queries_inputs['attention_mask']), i + batch_size) + sub_features = {} + for k, v in queries_inputs.items(): + sub_features[k] = v[start:end] + q_collated.append(self.tokenizer.pad( + sub_features, + padding=self.padding, + max_length=self.passage_max_len, + pad_to_multiple_of=self.pad_to_multiple_of, + return_tensors=self.return_tensors + )) + + d_collated = [] + for i in range(0, len(passages_inputs['attention_mask']), batch_size): + start = i + end = min(len(passages_inputs['attention_mask']), i + batch_size) + sub_features = {} + + for k, v in passages_inputs.items(): + sub_features[k] = v[start:end] + d_collated.append(self.tokenizer.pad( + sub_features, + padding=self.padding, + max_length=self.passage_max_len, + pad_to_multiple_of=self.pad_to_multiple_of, + return_tensors=self.return_tensors + )) + return { + "queries": q_collated, + "passages": d_collated, + "teacher_scores": teacher_scores, + "no_in_batch_neg_flag": False + }
+ + + +
+[docs] +class AbsEmbedderSameDatasetTrainDataset(AbsEmbedderTrainDataset): + """Abstract class for training dataset that samples batches from same dataset. + + Args: + args (AbsEmbedderDataArguments): Data arguments. + default_batch_size (int): The default batch size for training. + seed (int): Random seed. + tokenizer (PreTrainedTokenizer): Tokenizer to use. + process_index (int, optional): Current process index. Defaults to 0. + num_processes (int, optional): Total number of processes. Defaults to 1. + """ + def __init__( + self, + args: AbsEmbedderDataArguments, + default_batch_size: int, + seed: int, + tokenizer: PreTrainedTokenizer, + process_index: int=0, + num_processes: int=1 + ): + self.args = args + self.shuffle_ratio = args.shuffle_ratio + self.defaut_batch_size = default_batch_size + self.deterministic_generator = np.random.default_rng(seed) + self.tokenizer = tokenizer + self.process_index = process_index + self.num_processes = num_processes + + self.step = 0 + + train_datasets = [] + each_data_idxs = [] + batch_size_idxs = [] + no_in_batch_neg_flags = [] + cur_all_num = 0 + + small_threshold = args.small_threshold + drop_threshold = args.drop_threshold + + for data_dir in args.train_data: + if not os.path.isdir(data_dir): + # Add `no_in_batch_neg` **suffix** to `data_dir` to indicate that this dataset does not use in-batch negatives + no_in_batch_neg_flag = data_dir.split('.')[-2].endswith('no_in_batch_neg') + if not (data_dir.endswith('.json') or data_dir.endswith('.jsonl')): continue + temp_dataset = self._load_dataset(data_dir) + + if len(temp_dataset) == 0 or len(temp_dataset) < small_threshold: continue + else: + train_datasets.append(temp_dataset) + each_data_idxs.append(np.arange(len(temp_dataset)) + cur_all_num) + cur_all_num += len(temp_dataset) + batch_size_idxs.append(self._get_file_batch_size(temp_dataset, default_batch_size)) + no_in_batch_neg_flags.append(no_in_batch_neg_flag) + + else: + small_datasets = [] + small_batch_size = math.inf + + # Add `no_in_batch_neg` **suffix** to `data_dir` to indicate that this dataset does not use in-batch negatives + no_in_batch_neg_flag = data_dir.endswith('no_in_batch_neg') + for file in os.listdir(data_dir): + if not (file.endswith('.json') or file.endswith('.jsonl')): continue + temp_dataset = self._load_dataset(os.path.join(data_dir, file)) + + if len(temp_dataset) == 0: continue + elif len(temp_dataset) < small_threshold: + small_datasets.append(temp_dataset) + small_batch_size = min(small_batch_size, self._get_file_batch_size(temp_dataset, default_batch_size)) + else: + train_datasets.append(temp_dataset) + each_data_idxs.append(np.arange(len(temp_dataset)) + cur_all_num) + cur_all_num += len(temp_dataset) + batch_size_idxs.append(self._get_file_batch_size(temp_dataset, default_batch_size)) + no_in_batch_neg_flags.append(no_in_batch_neg_flag) + + if len(small_datasets) > 0: + small_dataset = datasets.concatenate_datasets(small_datasets) + if len(small_dataset) >= drop_threshold: + train_datasets.append(small_dataset) + each_data_idxs.append(np.arange(len(small_dataset)) + cur_all_num) + cur_all_num += len(small_dataset) + batch_size_idxs.append(small_batch_size) + no_in_batch_neg_flags.append(no_in_batch_neg_flag) + + self.dataset = datasets.concatenate_datasets(train_datasets) + self.each_data_idxs = each_data_idxs + self.datasets_inxs = np.arange(len(each_data_idxs)) + self.batch_size_idxs = batch_size_idxs + self.no_in_batch_neg_flags = no_in_batch_neg_flags + + self.refresh_epoch() + +
+[docs]
+    def _load_dataset(self, file_path: str):
+        """Load a dataset from the given path.
+
+        Args:
+            file_path (str): The path to load from, or to download from the HF hub.
+
+        Returns:
+            datasets.Dataset: The loaded dataset.
+        """
+        if dist.get_rank() == 0:
+            logger.info(f'loading data from {file_path} ...')
+
+        temp_dataset = datasets.load_dataset('json', data_files=file_path, split='train', cache_dir=self.args.cache_path)
+        if len(temp_dataset) > self.args.max_example_num_per_dataset:
+            temp_dataset = temp_dataset.select(random.sample(list(range(len(temp_dataset))), self.args.max_example_num_per_dataset))
+        if not self.args.knowledge_distillation:
+            if 'pos_scores' in temp_dataset.column_names:
+                temp_dataset = temp_dataset.remove_columns(['pos_scores'])
+            if 'neg_scores' in temp_dataset.column_names:
+                temp_dataset = temp_dataset.remove_columns(['neg_scores'])
+        return temp_dataset
+ + +
+[docs] + @staticmethod + def _get_file_batch_size(temp_dataset: datasets.Dataset, default_batch_size: int): + """Get the appropriate batch size for the dataset. + + Args: + temp_dataset (datasets.Dataset): Loaded :data:`datasets.Dataset` object. + default_batch_size (int): The default batch size to use if not specified in the dataset. + + Returns: + int: The final batch size to use. + """ + if 'batch_size' in temp_dataset.column_names: + return temp_dataset['batch_size'][0] + if 'type' in temp_dataset.column_names: + data_type = temp_dataset['type'][0] + if 'symmetric' in data_type: + return default_batch_size // 2 # make the symmetric data have smaller batch size + return default_batch_size
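As a quick check of the rules above (a hedged illustration; the toy datasets are invented): an explicit ``batch_size`` column always wins, a file tagged with a ``symmetric`` type falls back to half the default, and everything else uses the default.

# Hypothetical illustration of the batch-size resolution implemented above.
import datasets

explicit  = datasets.Dataset.from_dict({"query": ["q"], "pos": [["p"]], "neg": [["n"]], "batch_size": [4]})
symmetric = datasets.Dataset.from_dict({"query": ["q"], "pos": [["p"]], "neg": [["n"]], "type": ["symmetric_sts"]})
plain     = datasets.Dataset.from_dict({"query": ["q"], "pos": [["p"]], "neg": [["n"]]})

assert AbsEmbedderSameDatasetTrainDataset._get_file_batch_size(explicit, default_batch_size=32) == 4
assert AbsEmbedderSameDatasetTrainDataset._get_file_batch_size(symmetric, default_batch_size=32) == 16
assert AbsEmbedderSameDatasetTrainDataset._get_file_batch_size(plain, default_batch_size=32) == 32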
+ + +
+[docs]
+    def refresh_epoch(self):
+        """
+        Refresh the data for a new epoch.
+        """
+        logger.info(f'-- Rank {self.process_index}: refresh data --')
+        self.deterministic_generator.shuffle(self.datasets_inxs)
+
+        batch_datas = []
+        for dataset_inx in self.datasets_inxs:
+            self.deterministic_generator.shuffle(self.each_data_idxs[dataset_inx])
+            cur_batch_size = self.batch_size_idxs[dataset_inx]*self.num_processes
+            no_in_batch_neg_flag = self.no_in_batch_neg_flags[dataset_inx]
+            for start_index in range(0, len(self.each_data_idxs[dataset_inx]), cur_batch_size):
+                # drop the last batch if it is incomplete
+                if len(self.each_data_idxs[dataset_inx]) - start_index < cur_batch_size:
+                    break
+                batch_datas.append((
+                    self.each_data_idxs[dataset_inx][start_index:start_index+cur_batch_size],
+                    no_in_batch_neg_flag
+                ))
+        self.deterministic_generator.shuffle(batch_datas)
+        self.batch_datas = batch_datas
+        self.step = 0
+ + + def __len__(self): + return len(self.batch_datas) * self.num_processes + + def __getitem__(self, _): + batch_indices, no_in_batch_neg_flag = self.batch_datas[self.step] # extend here + cur_batch_size = int(len(batch_indices) / self.num_processes) + batch_indices = batch_indices[self.process_index * cur_batch_size: (self.process_index + 1) * cur_batch_size] + batch_data = self.dataset[batch_indices] + self.step += 1 + queries, passages, teacher_scores = self._create_batch_data(batch_raw_data=batch_data) + return queries, passages, teacher_scores, no_in_batch_neg_flag + +
+[docs] + def _get_train_group_size(self, batch_raw_data): + """Get the training group size and data type. + + Args: + batch_raw_data (datasets.Dataset): One batch of raw data. + + Returns: + int: The training group size. + str: The type of data for the task. + """ + if 'type' in batch_raw_data: + data_type = batch_raw_data['type'][0] + if data_type in ['only_1neg']: + return 2, data_type + elif data_type in ['symmetric_class']: + return min(len(batch_raw_data['neg'][0]) + 1, self.args.train_group_size), data_type + else: + return self.args.train_group_size, data_type + return self.args.train_group_size, None
+ + +
+[docs] + def _create_batch_data(self, batch_raw_data): + """Create a comple batch of data with queries, documents and teacher scores. + + Args: + batch_raw_data (datasets.Dataset): One batch of raw data. + + Returns: + List[str]: Queries with instruction format. + List[str]: Documents with instruction format. + List[float]: Teacher scores for model distillation. + """ + queries, passages, teacher_scores = [], [], [] + + train_group_size, data_type = self._get_train_group_size(batch_raw_data) + + for i in range(len(batch_raw_data['query'])): + if data_type is not None: + assert batch_raw_data['type'][i] == data_type, f"Data type is not consistent in the same batch" + + queries.append( + self.args.query_instruction_format.format( + batch_raw_data['prompt'][i] if 'prompt' in batch_raw_data else self.args.query_instruction_for_retrieval, + batch_raw_data['query'][i] + ) + ) + tmp_passages = [] + pos_idx = random.choice(list(range(len(batch_raw_data['pos'][i])))) + pos = self._shuffle_text(batch_raw_data['pos'][i][pos_idx]) + tmp_passages.append(pos) + + neg_all_idx = list(range(len(batch_raw_data['neg'][i]))) + if len(batch_raw_data['neg'][i]) < train_group_size - 1: + num = math.ceil((train_group_size - 1) / len(batch_raw_data['neg'][i])) + neg_idxs = random.sample(neg_all_idx * num, train_group_size - 1) + else: + neg_idxs = random.sample(neg_all_idx, train_group_size - 1) + for neg_idx in neg_idxs: + tmp_passages.append(batch_raw_data['neg'][i][neg_idx]) + + if self.args.knowledge_distillation: + if 'pos_scores' in batch_raw_data and batch_raw_data['pos_scores'][i] is not None: + teacher_scores.append(batch_raw_data['pos_scores'][i][pos_idx]) + for neg_idx in neg_idxs: + if 'neg_scores' in batch_raw_data and batch_raw_data['neg_scores'][i] is not None: + teacher_scores.append(batch_raw_data['neg_scores'][i][neg_idx]) + else: + teacher_scores = None + + if data_type is not None and data_type in ['symmetric_sts', 'symmetric_clustering']: + tmp_passages = [ + self.args.query_instruction_format.format( + batch_raw_data['prompt'][i] if 'prompt' in batch_raw_data else self.args.query_instruction_for_retrieval, + p + ) for p in tmp_passages + ] + else: + if self.args.passage_instruction_for_retrieval is not None: + tmp_passages = [ + self.args.passage_instruction_format.format( + self.args.passage_instruction_for_retrieval, p + ) for p in tmp_passages + ] + + passages.extend(tmp_passages) + + if teacher_scores is not None: + if len(teacher_scores) > 0 and len(passages) > 0: + assert len(teacher_scores) == len(passages) + + return queries, passages, teacher_scores
+
+ + + +
+[docs] +@dataclass +class AbsEmbedderSameDatasetCollator(DataCollatorWithPadding): + """ + EmbedCollator for SameDataset. + Note that after using this collator, the training_args should be set as: + + ``training_args.per_device_train_batch_size = 1`` + + ``training_args.dataloader_num_workers = 0 # avoid multi-processing`` + """ + query_max_len: int = 32 + passage_max_len: int = 128 + sub_batch_size: int = -1 + + def __call__(self, features): + queries = features[0][0] + passages = features[0][1] + teacher_scores = features[0][2] + no_in_batch_neg_flag = features[0][3] + + queries_inputs = self.tokenizer( + queries, + truncation=True, + max_length=self.query_max_len, + return_tensors=None + ) + passages_inputs = self.tokenizer( + passages, + truncation=True, + max_length=self.passage_max_len, + return_tensors=None + ) + + if self.sub_batch_size is None or self.sub_batch_size <= 0: + q_collated = self.tokenizer.pad( + queries_inputs, + padding=self.padding, + max_length=self.query_max_len, + pad_to_multiple_of=self.pad_to_multiple_of, + return_tensors=self.return_tensors, + ) + + d_collated = self.tokenizer.pad( + passages_inputs, + padding=self.padding, + max_length=self.passage_max_len, + pad_to_multiple_of=self.pad_to_multiple_of, + return_tensors=self.return_tensors, + ) + else: + batch_size = self.sub_batch_size + + q_collated = [] + for i in range(0, len(queries_inputs['attention_mask']), batch_size): + start = i + end = min(len(queries_inputs['attention_mask']), i + batch_size) + sub_features = {} + for k, v in queries_inputs.items(): + sub_features[k] = v[start:end] + q_collated.append(self.tokenizer.pad( + sub_features, + padding=self.padding, + max_length=self.query_max_len, + pad_to_multiple_of=self.pad_to_multiple_of, + return_tensors=self.return_tensors, + )) + + d_collated = [] + for i in range(0, len(passages_inputs['attention_mask']), batch_size): + start = i + end = min(len(passages_inputs['attention_mask']), i + batch_size) + sub_features = {} + + for k, v in passages_inputs.items(): + sub_features[k] = v[start:end] + d_collated.append(self.tokenizer.pad( + sub_features, + padding=self.padding, + max_length=self.passage_max_len, + pad_to_multiple_of=self.pad_to_multiple_of, + return_tensors=self.return_tensors, + )) + + if isinstance(teacher_scores, list) and len(teacher_scores) == 0: + teacher_scores = None + + return { + "queries": q_collated, + "passages": d_collated, + "teacher_scores": teacher_scores, + "no_in_batch_neg_flag": no_in_batch_neg_flag + }
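A hedged wiring sketch for the same-dataset path, following the class docstring and the runner shown later in this diff (the checkpoint name is an assumption):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-base-en-v1.5")  # assumed checkpoint
collator = AbsEmbedderSameDatasetCollator(
    tokenizer=tokenizer,
    query_max_len=32,
    passage_max_len=128,
    sub_batch_size=-1,        # do not split into sub-batches
    padding=True,
    return_tensors="pt",
)
# Per the docstring, the dataset already yields whole batches, so the Trainer
# must be told not to batch or fork workers itself:
#   training_args.per_device_train_batch_size = 1
#   training_args.dataloader_num_workers = 0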
+ + + +
+[docs] +class EmbedderTrainerCallbackForDataRefresh(TrainerCallback): + """ + Callback class to inspect the state of the training loop and take decision. + """ + def __init__(self, train_dataset: AbsEmbedderSameDatasetTrainDataset): + self.train_dataset = train_dataset + +
+[docs] + def on_epoch_end( + self, + args: AbsEmbedderTrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs + ): + """ + Event called at the end of an epoch. + """ + self.train_dataset.refresh_epoch()
+
\ No newline at end of file
diff --git a/_modules/FlagEmbedding/abc/finetune/embedder/AbsModeling.html b/_modules/FlagEmbedding/abc/finetune/embedder/AbsModeling.html
new file mode 100644
index 00000000..03754876
--- /dev/null
+++ b/_modules/FlagEmbedding/abc/finetune/embedder/AbsModeling.html
@@ -0,0 +1,868 @@

Source code for FlagEmbedding.abc.finetune.embedder.AbsModeling

+import torch
+from torch import nn, Tensor
+import torch.nn.functional as F
+import torch.distributed as dist
+from transformers import AutoTokenizer
+from transformers.file_utils import ModelOutput
+
+import logging
+from dataclasses import dataclass
+from abc import ABC, abstractmethod
+from typing import Dict, Optional, List, Union
+
+logger = logging.getLogger(__name__)
+
+
+
+[docs] +@dataclass +class EmbedderOutput(ModelOutput): + """ + Output information returned by the model. + """ + q_reps: Optional[Tensor] = None + p_reps: Optional[Tensor] = None + loss: Optional[Tensor] = None + scores: Optional[Tensor] = None
+ + + +
+[docs] +class AbsEmbedderModel(ABC, nn.Module): + """Abstract class of embedding model for training. + + Args: + base_model: The base model to train on. + tokenizer (AutoTokenizer, optional): The tokenizer to use. Defaults to ``None``. + negatives_cross_device (bool, optional): If True, will compute cross devices negative loss. Defaults to ``False``. + temperature (float, optional): Temperature to control the scale of scores. Defaults to ``1.0``. + sub_batch_size (int, optional): Sub-batch size during encoding. If negative, will not split to sub-batch. + Defaults to ``-1``. + kd_loss_type (str, optional): Type of knowledge distillation loss. Defaults to ``"kl_div"``. + """ + def __init__( + self, + base_model, + tokenizer: AutoTokenizer = None, + negatives_cross_device: bool = False, + temperature: float = 1.0, + sub_batch_size: int = -1, + kd_loss_type: str = 'kl_div', + ): + super().__init__() + self.model = base_model + self.tokenizer = tokenizer + + self.temperature = temperature + self.negatives_cross_device = negatives_cross_device + if self.negatives_cross_device: + if not dist.is_initialized(): + raise ValueError('Distributed training has not been initialized for representation all gather.') + self.process_rank = dist.get_rank() + self.world_size = dist.get_world_size() + + self.sub_batch_size = sub_batch_size + self.kd_loss_type = kd_loss_type + +
+[docs] + @abstractmethod + def encode(self, features): + """Abstract method encode and get the embedding. + + Args: + features (Union[list, dict]): Features feed to the model. + """ + pass
+ + +
+[docs] + @abstractmethod + def compute_loss(self, scores, target): + """Abstract method compute the loss. + + Args: + scores (torch.Tensor): Computed score. + target (torch.Tensor): The target value. + """ + pass
+ + +
+[docs]
+    @abstractmethod
+    def compute_score(self, q_reps, p_reps):
+        """Abstract method to compute the score.
+
+        Args:
+            q_reps (torch.Tensor): Queries representations.
+            p_reps (torch.Tensor): Passages representations.
+        """
+        pass
+ + +
+[docs] + @abstractmethod + def save(self, output_dir: str): + """Abstract method to save the model. + + Args: + output_dir (str): Directory for saving the model. + """ + pass
+ + +
+[docs]
+    def get_local_score(self, q_reps, p_reps, all_scores):
+        """Get the local score of queries and passages.
+
+        Args:
+            q_reps (torch.Tensor): Queries representations.
+            p_reps (torch.Tensor): Passages representations.
+            all_scores (torch.Tensor): All the query-passage scores computed.
+
+        Returns:
+            torch.Tensor: Local scores to compute loss.
+        """
+        group_size = p_reps.size(0) // q_reps.size(0)
+        indices = torch.arange(0, q_reps.size(0), device=q_reps.device) * group_size
+        specific_scores = []
+        for i in range(group_size):
+            specific_scores.append(
+                all_scores[torch.arange(q_reps.size(0), device=q_reps.device), indices + i]
+            )
+        return torch.stack(specific_scores, dim=1).view(q_reps.size(0), -1)
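A small numeric illustration of the indexing above (hypothetical values; ``model`` stands for any concrete subclass): with 2 queries and 3 passages per query, passage ``j * group_size + i`` belongs to query ``j``, so each row of the result keeps only that query's own group of scores.

import torch

q_reps = torch.zeros(2, 4)                                  # 2 queries
p_reps = torch.zeros(6, 4)                                  # 2 * 3 passages
all_scores = torch.arange(12, dtype=torch.float).view(2, 6)
# model.get_local_score(q_reps, p_reps, all_scores) would return
# tensor([[ 0.,  1.,  2.],
#         [ 9., 10., 11.]])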
+ + +
+[docs]
+    def compute_local_score(self, q_reps, p_reps, compute_score_func=None, **kwargs):
+        """Compute the local score of queries and passages.
+
+        Args:
+            q_reps (torch.Tensor): Queries representations.
+            p_reps (torch.Tensor): Passages representations.
+            compute_score_func (function, optional): Function to compute score. Defaults to ``None``, which will use
+                :meth:`self.compute_score`.
+
+        Returns:
+            torch.Tensor: Local scores to compute loss.
+        """
+        if compute_score_func is None:
+            all_scores = self.compute_score(q_reps, p_reps)
+        else:
+            all_scores = compute_score_func(q_reps, p_reps, **kwargs)
+        local_scores = self.get_local_score(q_reps, p_reps, all_scores)
+        return local_scores
+ + +
+[docs] + def _compute_no_in_batch_neg_loss(self, q_reps, p_reps, teacher_targets=None, compute_score_func=None, **kwargs): + """ + Compute loss when using no in-batch negatives and no cross-device negatives + """ + group_size = p_reps.size(0) // q_reps.size(0) + + local_scores = self.compute_local_score(q_reps, p_reps, compute_score_func, **kwargs) # (batch_size, group_size) + + if teacher_targets is not None: + # compute kd loss + loss = self.distill_loss(self.kd_loss_type, teacher_targets, local_scores, group_size=group_size) + + # add normal loss if needed + if self.kd_loss_type == "kl_div": + local_targets = torch.zeros(local_scores.size(0), device=local_scores.device, dtype=torch.long) # (batch_size) + loss += self.compute_loss(local_scores, local_targets) + else: + local_targets = torch.zeros(local_scores.size(0), device=local_scores.device, dtype=torch.long) # (batch_size) + loss = self.compute_loss(local_scores, local_targets) + + return local_scores, loss
+ + +
+[docs] + def _compute_in_batch_neg_loss(self, q_reps, p_reps, teacher_targets=None, compute_score_func=None, **kwargs): + """ + Compute loss when only using in-batch negatives + """ + group_size = p_reps.size(0) // q_reps.size(0) + + if compute_score_func is None: + scores = self.compute_score(q_reps, p_reps) # (batch_size, batch_size * group_size) + else: + scores = compute_score_func(q_reps, p_reps, **kwargs) # (batch_size, batch_size * group_size) + + if teacher_targets is not None: + # compute kd loss + if self.kd_loss_type == "kl_div": + student_scores = self.get_local_score(q_reps, p_reps, scores) # (batch_size, group_size) + + loss = self.distill_loss(self.kd_loss_type, teacher_targets, student_scores, group_size) + + idxs = torch.arange(q_reps.size(0), device=q_reps.device, dtype=torch.long) + targets = idxs * (p_reps.size(0) // q_reps.size(0)) # (batch_size) + loss += self.compute_loss(scores, targets) + elif self.kd_loss_type == "m3_kd_loss": + loss = self.distill_loss(self.kd_loss_type, teacher_targets, scores, group_size) + else: + raise ValueError(f"Invalid kd_loss_type: {self.kd_loss_type}") + else: + idxs = torch.arange(q_reps.size(0), device=q_reps.device, dtype=torch.long) + targets = idxs * group_size # (batch_size) + loss = self.compute_loss(scores, targets) + + return scores, loss
+ + +
+[docs] + def _compute_cross_device_neg_loss(self, q_reps, p_reps, teacher_targets=None, compute_score_func=None, **kwargs): + """ + Compute loss when using both in-batch negatives and cross-device negatives + """ + group_size = p_reps.size(0) // q_reps.size(0) + + cross_q_reps = self._dist_gather_tensor(q_reps) # (world_size * batch_size, dim) + cross_p_reps = self._dist_gather_tensor(p_reps) # (world_size * batch_size * group_size, dim) + + if compute_score_func is None: + cross_scores = self.compute_score(cross_q_reps, cross_p_reps) # (world_size * batch_size, world_size * batch_size * group_size) + else: + cross_scores = compute_score_func(cross_q_reps, cross_p_reps, **kwargs) # (world_size * batch_size, world_size * batch_size * group_size) + + if teacher_targets is not None: + # compute kd loss + if self.kd_loss_type == "kl_div": + student_scores = self.get_local_score(cross_q_reps, cross_p_reps, cross_scores) # (world_size * batch_size, group_size) + student_scores = student_scores[ + q_reps.size(0)*self.process_rank : q_reps.size(0)*(self.process_rank+1) + ] # (batch_size, group_size) + + loss = self.distill_loss(self.kd_loss_type, teacher_targets, student_scores, group_size) + + cross_idxs = torch.arange(cross_q_reps.size(0), device=cross_q_reps.device, dtype=torch.long) + cross_targets = cross_idxs * group_size # (world_size * batch_size) + loss += self.compute_loss(cross_scores, cross_targets) + elif self.kd_loss_type == "m3_kd_loss": + cross_teacher_targets = self._dist_gather_tensor(teacher_targets) # (world_size * batch_size, group_size) + + loss = self.distill_loss(self.kd_loss_type, cross_teacher_targets, cross_scores, group_size) + else: + raise ValueError(f"Invalid kd_loss_type: {self.kd_loss_type}") + else: + cross_idxs = torch.arange(cross_q_reps.size(0), device=cross_q_reps.device, dtype=torch.long) + cross_targets = cross_idxs * group_size # (world_size * batch_size) + loss = self.compute_loss(cross_scores, cross_targets) + + return cross_scores, loss
+ + +
+[docs] + def forward( + self, + queries: Union[Dict[str, Tensor], List[Dict[str, Tensor]]] = None, + passages: Union[Dict[str, Tensor], List[Dict[str, Tensor]]] = None, + teacher_scores: Union[None, List[float]] = None, + no_in_batch_neg_flag: bool = False, + ): + """The computation performed at every call. + + Args: + queries (Union[Dict[str, Tensor], List[Dict[str, Tensor]]], optional): Input queries. Defaults to ``None``. + passages (Union[Dict[str, Tensor], List[Dict[str, Tensor]]], optional): Input passages. Defaults to ``None``. + teacher_scores (Union[None, List[float]], optional): Teacher scores for distillation. Defaults to ``None``. + no_in_batch_neg_flag (bool, optional): If True, use no in-batch negatives and no cross-device negatives. Defaults to ``False``. + + Returns: + EmbedderOutput: Output of the forward call of model. + """ + q_reps = self.encode(queries) # (batch_size, dim) + p_reps = self.encode(passages) # (batch_size * group_size, dim) + + if self.training: + if teacher_scores is not None: + teacher_scores = torch.tensor(teacher_scores, device=q_reps.device) + teacher_scores = teacher_scores.view(q_reps.size(0), -1).detach() # (batch_size, group_size) + teacher_targets = F.softmax(teacher_scores, dim=-1) # (batch_size, group_size) + else: + teacher_targets = None + + if no_in_batch_neg_flag: + compute_loss_func = self._compute_no_in_batch_neg_loss + else: + if self.negatives_cross_device: + compute_loss_func = self._compute_cross_device_neg_loss + else: + compute_loss_func = self._compute_in_batch_neg_loss + + scores, loss = compute_loss_func(q_reps, p_reps, teacher_targets=teacher_targets) + else: + loss = None + + return EmbedderOutput( + loss=loss, + )
+ + +
+[docs]
+    @staticmethod
+    def distill_loss(kd_loss_type, teacher_targets, student_scores, group_size=None):
+        """Compute the distillation loss.
+
+        Args:
+            kd_loss_type (str): Type of knowledge distillation loss, supports "kl_div" and "m3_kd_loss".
+            teacher_targets (torch.Tensor): Targets from the teacher model.
+            student_scores (torch.Tensor): Scores of the student model.
+            group_size (int, optional): Number of passages associated with each query (the group size). Defaults to ``None``.
+
+        Raises:
+            ValueError: Invalid kd_loss_type
+
+        Returns:
+            torch.Tensor: A scalar of computed distillation loss.
+        """
+        if kd_loss_type == 'kl_div':
+            # teacher_targets: (batch_size, group_size) / (world_size * batch_size, group_size)
+            # student_scores: (batch_size, group_size) / (world_size * batch_size, group_size)
+            return - torch.mean(
+                torch.sum(torch.log_softmax(student_scores, dim=-1) * teacher_targets, dim=-1)
+            )
+        elif kd_loss_type == 'm3_kd_loss':
+            # teacher_targets: (batch_size, group_size) / (world_size * batch_size, group_size)
+            # student_scores: (batch_size, batch_size * group_size) / (world_size * batch_size, world_size * batch_size * group_size)
+            labels = torch.arange(student_scores.size(0), device=student_scores.device, dtype=torch.long)
+            labels = labels * group_size
+
+            loss = 0
+            mask = torch.zeros_like(student_scores)
+            for i in range(group_size):
+                temp_target = labels + i
+                temp_scores = student_scores + mask
+                temp_loss = F.cross_entropy(temp_scores, temp_target, reduction="none")  # B
+                loss += torch.mean(teacher_targets[:, i] * temp_loss)
+                mask = torch.scatter(mask, dim=-1, index=temp_target.unsqueeze(-1),
+                                     value=torch.finfo(student_scores.dtype).min)
+            return loss
+        else:
+            raise ValueError(f"Invalid kd_loss_type: {kd_loss_type}")
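Written out as a formula (our notation, not from the source), the ``kl_div`` branch is the soft cross-entropy between the teacher distribution t and the student's log-softmax over each group of G candidate passages:

\[
\mathcal{L}_{\mathrm{kl\_div}} \;=\; -\,\frac{1}{B}\sum_{b=1}^{B}\sum_{i=1}^{G} t_{b,i}\,\log\frac{\exp(s_{b,i})}{\sum_{j=1}^{G}\exp(s_{b,j})}
\]

The ``m3_kd_loss`` branch instead takes a teacher-weighted cross-entropy over the full in-batch score matrix, one group position at a time, masking out each position once it has served as the target.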
+ + +
+[docs] + def _dist_gather_tensor(self, t: Optional[torch.Tensor]): + """Gather a tensor from all processes in a distributed setting. + + Args: + t (Optional[torch.Tensor]): The input tensor to be gathered. If `None`, no gathering is performed. + + Returns: + Union[torch.Tensor, None]: A concatenated tensor from all processes if ``t`` is not ``None``, + otherwise returns ``None``. + """ + if t is None: + return None + t = t.contiguous() + + all_tensors = [torch.empty_like(t) for _ in range(self.world_size)] + dist.all_gather(all_tensors, t) + + all_tensors[self.process_rank] = t + all_tensors = torch.cat(all_tensors, dim=0) + + return all_tensors
+
\ No newline at end of file
diff --git a/_modules/FlagEmbedding/abc/finetune/embedder/AbsRunner.html b/_modules/FlagEmbedding/abc/finetune/embedder/AbsRunner.html
new file mode 100644
index 00000000..e8dc1b94
--- /dev/null
+++ b/_modules/FlagEmbedding/abc/finetune/embedder/AbsRunner.html
@@ -0,0 +1,654 @@

Source code for FlagEmbedding.abc.finetune.embedder.AbsRunner

+import os
+import logging
+from pathlib import Path
+from typing import Tuple
+from abc import ABC, abstractmethod
+from transformers import set_seed, PreTrainedTokenizer
+
+
+from .AbsArguments import (
+    AbsEmbedderModelArguments,
+    AbsEmbedderDataArguments,
+    AbsEmbedderTrainingArguments
+)
+from .AbsTrainer import AbsEmbedderTrainer
+from .AbsModeling import AbsEmbedderModel
+from .AbsDataset import (
+    AbsEmbedderTrainDataset, AbsEmbedderCollator,
+    AbsEmbedderSameDatasetTrainDataset, AbsEmbedderSameDatasetCollator
+)
+
+logger = logging.getLogger(__name__)
+
+
+
+[docs] +class AbsEmbedderRunner(ABC): + """Abstract class to run embedding model fine-tuning. + + Args: + model_args (AbsEmbedderModelArguments): Model arguments + data_args (AbsEmbedderDataArguments): Data arguments. + training_args (AbsEmbedderTrainingArguments): Training arguments. + """ + def __init__( + self, + model_args: AbsEmbedderModelArguments, + data_args: AbsEmbedderDataArguments, + training_args: AbsEmbedderTrainingArguments + ): + self.model_args = model_args + self.data_args = data_args + self.training_args = training_args + + if ( + os.path.exists(training_args.output_dir) + and os.listdir(training_args.output_dir) + and training_args.do_train + and not training_args.overwrite_output_dir + ): + raise ValueError( + f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." + ) + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, + ) + logger.warning( + "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", + training_args.local_rank, + training_args.device, + training_args.n_gpu, + bool(training_args.local_rank != -1), + training_args.fp16, + ) + logger.info("Training/evaluation parameters %s", training_args) + logger.info("Model parameters %s", model_args) + logger.info("Data parameters %s", data_args) + + # Set seed + set_seed(training_args.seed) + + self.tokenizer, self.model = self.load_tokenizer_and_model() + self.train_dataset = self.load_train_dataset() + self.data_collator = self.load_data_collator() + self.trainer = self.load_trainer() + +
+[docs] + @abstractmethod + def load_tokenizer_and_model(self) -> Tuple[PreTrainedTokenizer, AbsEmbedderModel]: + """Abstract method to load the tokenizer and model. + + Returns: + Tuple[PreTrainedTokenizer, AbsEmbedderModel]: Loaded tokenizer and model instances. + """ + pass
+ + +
+[docs] + @abstractmethod + def load_trainer(self) -> AbsEmbedderTrainer: + """Abstract method to load the trainer. + + Returns: + AbsEmbedderTrainer: The loaded trainer instance. + """ + pass
+ + +
+[docs] + def load_train_dataset(self) -> AbsEmbedderTrainDataset: + """Loads the training dataset based on data arguments. + + Returns: + AbsEmbedderTrainDataset: The loaded dataset instance. + """ + if self.data_args.same_dataset_within_batch: + train_dataset = AbsEmbedderSameDatasetTrainDataset( + args=self.data_args, + default_batch_size=self.training_args.per_device_train_batch_size, + seed=self.training_args.seed, + tokenizer=self.tokenizer, + process_index=self.training_args.process_index, + num_processes=self.training_args.world_size + ) + self.training_args.per_device_train_batch_size = 1 + self.training_args.dataloader_num_workers = 0 # avoid multi-processing + else: + train_dataset = AbsEmbedderTrainDataset( + args=self.data_args, + tokenizer=self.tokenizer + ) + return train_dataset
+ + +
+[docs] + def load_data_collator(self) -> AbsEmbedderCollator: + """Loads the appropriate data collator. + + Returns: + AbsEmbedderCollator: Loaded data collator. + """ + if self.data_args.same_dataset_within_batch: + EmbedCollator = AbsEmbedderSameDatasetCollator + else: + EmbedCollator = AbsEmbedderCollator + + data_collator = EmbedCollator( + tokenizer=self.tokenizer, + query_max_len=self.data_args.query_max_len, + passage_max_len=self.data_args.passage_max_len, + sub_batch_size=self.training_args.sub_batch_size, + pad_to_multiple_of=self.data_args.pad_to_multiple_of, + padding=True, + return_tensors="pt" + ) + return data_collator
+ + +
+[docs] + def run(self): + """ + Executes the training process. + """ + Path(self.training_args.output_dir).mkdir(parents=True, exist_ok=True) + + # Training + self.trainer.train(resume_from_checkpoint=self.training_args.resume_from_checkpoint) + self.trainer.save_model()
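A structural sketch of how a concrete runner is driven (the class and variable names here are hypothetical; only the two abstract loaders must be provided):

class MyEmbedderRunner(AbsEmbedderRunner):          # hypothetical subclass
    def load_tokenizer_and_model(self):
        ...                                         # return (tokenizer, AbsEmbedderModel subclass)

    def load_trainer(self):
        ...                                         # return an AbsEmbedderTrainer wired with model, dataset, collator

runner = MyEmbedderRunner(model_args, data_args, training_args)  # argument objects parsed elsewhere
runner.run()   # creates output_dir, trains (optionally resuming), saves the model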
+
\ No newline at end of file
diff --git a/_modules/FlagEmbedding/abc/finetune/embedder/AbsTrainer.html b/_modules/FlagEmbedding/abc/finetune/embedder/AbsTrainer.html
new file mode 100644
index 00000000..f2781466
--- /dev/null
+++ b/_modules/FlagEmbedding/abc/finetune/embedder/AbsTrainer.html
@@ -0,0 +1,529 @@

Source code for FlagEmbedding.abc.finetune.embedder.AbsTrainer

+import logging
+from typing import Optional
+from abc import ABC, abstractmethod
+from transformers.trainer import Trainer
+
+logger = logging.getLogger(__name__)
+
+
+
+[docs] +class AbsEmbedderTrainer(ABC, Trainer): + """ + Abstract class for the trainer of embedder. + """ + @abstractmethod + def _save(self, output_dir: Optional[str] = None, state_dict=None): + pass + +
+[docs] + def compute_loss(self, model, inputs, return_outputs=False, **kwargs): + """ + How the loss is computed by Trainer. By default, all models return the loss in the first element. + + Subclass and override for custom behavior. + + Args: + model (AbsEmbedderModel): The model being trained. + inputs (dict): A dictionary of input tensors to be passed to the model. + return_outputs (bool, optional): If ``True``, returns both the loss and the model's outputs. Otherwise, + returns only the loss. + + Returns: + Union[torch.Tensor, tuple(torch.Tensor, EmbedderOutput)]: The computed loss. If ``return_outputs`` is ``True``, + also returns the model's outputs in a tuple ``(loss, outputs)``. + """ + + outputs = model(**inputs) + loss = outputs.loss + + return (loss, outputs) if return_outputs else loss
+
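Since ``_save`` is the only abstract method, a concrete trainer can be quite small; a minimal hedged sketch (the subclass name is hypothetical, and it assumes the wrapped model implements ``AbsEmbedderModel.save``):

import os

class MyEmbedderTrainer(AbsEmbedderTrainer):             # hypothetical subclass
    def _save(self, output_dir=None, state_dict=None):
        output_dir = output_dir if output_dir is not None else self.args.output_dir
        os.makedirs(output_dir, exist_ok=True)
        self.model.save(output_dir)                      # delegate checkpointing to the embedder model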
\ No newline at end of file
diff --git a/_modules/FlagEmbedding/abc/finetune/reranker/AbsArguments.html b/_modules/FlagEmbedding/abc/finetune/reranker/AbsArguments.html
new file mode 100644
index 00000000..df4a3009
--- /dev/null
+++ b/_modules/FlagEmbedding/abc/finetune/reranker/AbsArguments.html
@@ -0,0 +1,622 @@

Source code for FlagEmbedding.abc.finetune.reranker.AbsArguments

+import os
+from typing import Optional
+from dataclasses import dataclass, field
+
+from transformers import TrainingArguments
+
+
+
+[docs] +@dataclass +class AbsRerankerModelArguments: + """ + Abstract class for reranker model arguments. + """ + + model_name_or_path: str = field( + metadata={"help": "The model checkpoint for initialization."} + ) + config_name: str = field( + default=None, + metadata={"help": "Pretrained config name or path if not the same as model_name."} + ) + tokenizer_name: str = field( + default=None, + metadata={"help": "Pretrained tokenizer name or path if not the same as model_name."} + ) + cache_dir: str = field( + default=None, + metadata={"help": "Where do you want to store the pre-trained models downloaded from s3."} + ) + trust_remote_code: bool = field( + default=False, + metadata={"help": "Trust remote code"} + ) + model_type: str = field( + default='encoder', + metadata={"help": "Type of finetune, ['encoder', 'decoder']"} + ) + token: str = field( + default_factory=lambda: os.getenv('HF_TOKEN', None), + metadata={"help": "The token to use when accessing the model."} + )
+ + # finetune_type: str = field( + # default='sratch', + # metadata={"help": "Type of finetune, ['sratch', 'finetune']"} + # ) + + +
+[docs] +@dataclass +class AbsRerankerDataArguments: + """ + Abstract class for reranker data arguments. + """ + train_data: str = field( + default=None, metadata={ + "help": "One or more paths to training data. `query: str`, `pos: List[str]`, `neg: List[str]` are required in the training data.", + "nargs": "+" + } + ) + cache_path: Optional[str] = field( + default=None, metadata={"help": "Where do you want to store the cached data"} + ) + train_group_size: int = field(default=8) + + query_max_len: int = field( + default=32, + metadata={ + "help": "The maximum total input sequence length after tokenization for passage. Sequences longer than this will be truncated." + }, + ) + + passage_max_len: int = field( + default=128, + metadata={ + "help": "The maximum total input sequence length after tokenization for passage. Sequences longer than this will be truncated." + }, + ) + + max_len: int = field( + default=512, + metadata={ + "help": "The maximum total input sequence length after tokenization. Sequences longer than this will be truncated." + }, + ) + + pad_to_multiple_of: Optional[int] = field( + default=None, + metadata={ + "help": "If set will pad the sequence to be a multiple of the provided value." + }, + ) + + max_example_num_per_dataset: int = field( + default=100000000, metadata={"help": "the max number of examples for each dataset"} + ) + + query_instruction_for_rerank: str= field( + default=None, metadata={"help": "instruction for query"} + ) + query_instruction_format: str = field( + default="{}{}", metadata={"help": "format for query instruction"} + ) + + knowledge_distillation: bool = field( + default=False, + metadata={"help": "Use knowledge distillation when `pos_scores: List[float]` and `neg_scores: List[float]` are in features of training data"} + ) + + passage_instruction_for_rerank: Optional[str] = field( + default=None, metadata={"help": "instruction for passage"} + ) + passage_instruction_format: Optional[str] = field( + default="{}{}", metadata={"help": "format for passage instruction"} + ) + + shuffle_ratio: float = field( + default=0.0, metadata={"help": "The ratio of shuffling the text"} + ) + + sep_token: str = field( + default='\n', metadata={"help": "The sep token for LLM reranker to discriminate between query and passage"} + )
+ + + # def __post_init__(self): + # for train_dir in self.train_data: + # if not os.path.exists(train_dir): + # raise FileNotFoundError(f"cannot find file: {train_dir}, please set a true path") + + +@dataclass +class AbsRerankerTrainingArguments(TrainingArguments): + sub_batch_size: Optional[int] = field(default=None, metadata={"help": "sub batch size for training, not implemented yet"}) +
\ No newline at end of file
diff --git a/_modules/FlagEmbedding/abc/finetune/reranker/AbsDataset.html b/_modules/FlagEmbedding/abc/finetune/reranker/AbsDataset.html
new file mode 100644
index 00000000..2da69ec6
--- /dev/null
+++ b/_modules/FlagEmbedding/abc/finetune/reranker/AbsDataset.html
@@ -0,0 +1,907 @@

Source code for FlagEmbedding.abc.finetune.reranker.AbsDataset

+import os
+import math
+import random
+import logging
+import datasets
+import numpy as np
+import torch.distributed as dist
+from dataclasses import dataclass
+from torch.utils.data import Dataset
+from transformers import (
+    PreTrainedTokenizer, 
+    DataCollatorWithPadding,
+    BatchEncoding,
+    DataCollatorForSeq2Seq
+)
+from typing import List
+
+from .AbsArguments import AbsRerankerDataArguments
+
+logger = logging.getLogger(__name__)
+
+
+
+[docs] +class AbsRerankerTrainDataset(Dataset): + """Abstract class for reranker training dataset. + + Args: + args (AbsRerankerDataArguments): Data arguments. + tokenizer (PreTrainedTokenizer): Tokenizer to use. + """ + def __init__( + self, + args: AbsRerankerDataArguments, + tokenizer: PreTrainedTokenizer + ): + self.args = args + self.tokenizer = tokenizer + + train_datasets = [] + for data_dir in args.train_data: + if not os.path.isdir(data_dir): + if not (data_dir.endswith('.json') or data_dir.endswith('.jsonl')): continue + temp_dataset = self._load_dataset(data_dir) + if len(temp_dataset) == 0: continue + train_datasets.append(temp_dataset) + else: + for file in os.listdir(data_dir): + if not (file.endswith('.json') or file.endswith('.jsonl')): continue + temp_dataset = self._load_dataset(os.path.join(data_dir, file)) + if len(temp_dataset) == 0: continue + train_datasets.append(temp_dataset) + self.dataset = datasets.concatenate_datasets(train_datasets) + + self.max_length = self.args.query_max_len + self.args.passage_max_len + +
+[docs] + def _load_dataset(self, file_path: str): + """Load dataset from path. + + Args: + file_path (str): Path to load the datasets from. + + Raises: + ValueError: `pos_scores` and `neg_scores` not found in the features of training data + + Returns: + datasets.Dataset: Loaded HF dataset. + """ + if dist.get_rank() == 0: + logger.info(f'loading data from {file_path} ...') + + temp_dataset = datasets.load_dataset('json', data_files=file_path, split='train', cache_dir=self.args.cache_path) + if len(temp_dataset) > self.args.max_example_num_per_dataset: + temp_dataset = temp_dataset.select(random.sample(list(range(len(temp_dataset))), self.args.max_example_num_per_dataset)) + if not self.args.knowledge_distillation: + if 'pos_scores' in temp_dataset.column_names: + temp_dataset = temp_dataset.remove_columns(['pos_scores']) + if 'neg_scores' in temp_dataset.column_names: + temp_dataset = temp_dataset.remove_columns(['neg_scores']) + else: + if 'pos_scores' not in temp_dataset.column_names or 'neg_scores' not in temp_dataset.column_names: + raise ValueError(f"`pos_scores` and `neg_scores` not found in the features of training data in {file_path}, which is necessary when using knowledge distillation.") + return temp_dataset
+ + +
+[docs] + def _shuffle_text(self, text): + """shuffle the input text. + + Args: + text (str): Input text. + + Returns: + str: Shuffled text. + """ + if self.args.shuffle_ratio > 0 and len(text) > 100 and random.random() < self.args.shuffle_ratio: + split_text = [] + chunk_size = len(text)//3 + 1 + for i in range(0, len(text), chunk_size): + split_text.append(text[i:i+chunk_size]) + random.shuffle(split_text) + return " ".join(split_text) + else: + return text
+ + + def __len__(self): + return len(self.dataset) + +
+[docs] + def create_one_example(self, qry_encoding: str, doc_encoding: str): + """Creates a single input example by encoding and preparing a query and document pair for the model. + + Args: + qry_encoding (str): Query to be encoded. + doc_encoding (str): Document to be encoded. + + Returns: + dict: A dictionary containing tokenized and prepared inputs, ready for model consumption. + """ + qry_inputs = self.tokenizer.encode(qry_encoding, truncation=True, max_length=self.args.query_max_len + self.args.passage_max_len // 4, add_special_tokens=False) + doc_inputs = self.tokenizer.encode(doc_encoding, truncation=True, max_length=self.args.passage_max_len + self.args.query_max_len // 2, add_special_tokens=False) + item = self.tokenizer.prepare_for_model( + qry_inputs, + doc_inputs, + truncation='only_second', + max_length=self.args.query_max_len + self.args.passage_max_len, + padding=False, + ) + return item
+ + + def __getitem__(self, item): + data = self.dataset[item] + train_group_size = self.args.train_group_size + + query = data['query'] + if self.args.query_instruction_for_rerank is not None: + query = self.args.query_instruction_format.format( + data['query_prompt'] if 'query_prompt' in data else self.args.query_instruction_for_rerank, + query + ) + + passages = [] + teacher_scores = [] + + assert isinstance(data['pos'], list) and isinstance(data['neg'], list) + + pos_idx = random.choice(list(range(len(data['pos'])))) + passages.append(self._shuffle_text(data['pos'][pos_idx])) + + neg_all_idx = list(range(len(data['neg']))) + if len(data['neg']) < train_group_size - 1: + num = math.ceil((train_group_size - 1) / len(data['neg'])) + neg_idxs = random.sample(neg_all_idx * num, train_group_size - 1) + else: + neg_idxs = random.sample(neg_all_idx, self.args.train_group_size - 1) + for neg_idx in neg_idxs: + passages.append(data['neg'][neg_idx]) + + if self.args.knowledge_distillation: + assert isinstance(data['pos_scores'], list) and isinstance(data['neg_scores'], list) + teacher_scores.append(data['pos_scores'][pos_idx]) + for neg_idx in neg_idxs: + teacher_scores.append(data['neg_scores'][neg_idx]) + if not all(isinstance(score, (int, float)) for score in teacher_scores): + raise ValueError(f"pos_score or neg_score must be digit") + else: + teacher_scores = None + + if self.args.passage_instruction_for_rerank is not None: + passages = [ + self.args.passage_instruction_format.format( + data['passage_prompt'] if 'passage_prompt' in data else self.args.passage_instruction_for_rerank, p + ) + for p in passages + ] + + batch_data = [] + for passage in passages: + batch_data.append(self.create_one_example(query, passage)) + + return batch_data, teacher_scores
+ + +
+[docs] +@dataclass +class AbsRerankerCollator(DataCollatorWithPadding): + """ + The abstract reranker collator. + """ + query_max_len: int = 32 + passage_max_len: int = 128 + + def __call__(self, features) -> list[BatchEncoding]: + teacher_scores = [f[1] for f in features] + if teacher_scores[0] is None: + teacher_scores = None + elif isinstance(teacher_scores[0], list): + teacher_scores = sum(teacher_scores, []) + + features = [f[0] for f in features] + if isinstance(features[0], list): + features = sum(features, []) + + collated = self.tokenizer.pad( + features, + padding=self.padding, + max_length=self.query_max_len + self.passage_max_len, + pad_to_multiple_of=self.pad_to_multiple_of, + return_tensors=self.return_tensors, + ) + + return { + "pair": collated, + "teacher_scores": teacher_scores, + }
+ + +
+[docs] +class AbsLLMRerankerTrainDataset(AbsRerankerTrainDataset): + """Abstract class for LLM reranker training dataset. + + Args: + args (AbsRerankerDataArguments): Data arguments. + tokenizer (PreTrainedTokenizer): Tokenizer to use. + """ + def __init__( + self, + args: AbsRerankerDataArguments, + tokenizer: PreTrainedTokenizer + ): + super().__init__(args, tokenizer) + sep = self.args.sep_token + self.sep_inputs = self.tokenizer( + sep, + return_tensors=None, + add_special_tokens=False + )['input_ids'] + + def __getitem__(self, item) -> List[BatchEncoding]: + data = self.dataset[item] + train_group_size = self.args.train_group_size + + query = data['query'] + if self.args.query_instruction_for_rerank is not None: + query = self.args.query_instruction_format.format( + data['query_prompt'] if 'query_prompt' in data else self.args.query_instruction_for_rerank, + query + ) + + passages = [] + teacher_scores = [] + + assert isinstance(data['pos'], list) and isinstance(data['neg'], list) + + pos_idx = random.choice(list(range(len(data['pos'])))) + passages.append(self._shuffle_text(data['pos'][pos_idx])) + + neg_all_idx = list(range(len(data['neg']))) + if len(data['neg']) < train_group_size - 1: + num = math.ceil((train_group_size - 1) / len(data['neg'])) + neg_idxs = random.sample(neg_all_idx * num, train_group_size - 1) + else: + neg_idxs = random.sample(neg_all_idx, self.args.train_group_size - 1) + for neg_idx in neg_idxs: + passages.append(data['neg'][neg_idx]) + + if self.args.knowledge_distillation: + assert isinstance(data['pos_scores'], list) and isinstance(data['neg_scores'], list) + teacher_scores.append(data['pos_scores'][pos_idx]) + for neg_idx in neg_idxs: + teacher_scores.append(data['neg_scores'][neg_idx]) + if not all(isinstance(score, (int, float)) for score in teacher_scores): + raise ValueError(f"pos_score or neg_score must be digit") + else: + teacher_scores = None + + if self.args.passage_instruction_for_rerank is not None: + passages = [ + self.args.passage_instruction_format.format( + data['passage_prompt'] if 'passage_prompt' in data else self.args.passage_instruction_for_rerank, p + ) + for p in passages + ] + + prompt = self.dataset[item].get('prompt', "Given a query A and a passage B, determine whether the passage contains an answer to the query by providing a prediction of either 'Yes' or 'No'.") + + query_inputs = self.tokenizer( + query, + return_tensors=None, + max_length=self.args.query_max_len + self.args.passage_max_len // 4, + truncation=True, + add_special_tokens=False + ) + + prompt_inputs = self.tokenizer( + prompt, + return_tensors=None, + add_special_tokens=False + )['input_ids'] + + max_length = self.max_length - len(prompt_inputs) - len(self.sep_inputs) + + passages_inputs = [] + for i, passage in enumerate(passages): + passage_inputs = self.tokenizer( + passage, + return_tensors=None, + max_length=self.args.passage_max_len + self.args.query_max_len // 2, + truncation=True, + add_special_tokens=False + ) + if self.tokenizer.bos_token_id is not None and self.tokenizer.bos_token_id != self.tokenizer.pad_token_id: + item = self.tokenizer.prepare_for_model( + [self.tokenizer.bos_token_id] + query_inputs['input_ids'], + self.sep_inputs + passage_inputs['input_ids'], + truncation='only_second', + max_length=max_length, + padding=False, + return_attention_mask=False, + return_token_type_ids=False, + add_special_tokens=False + ) + else: + item = self.tokenizer.prepare_for_model( + query_inputs['input_ids'], + self.sep_inputs + passage_inputs['input_ids'], 
+ truncation='only_second', + max_length=max_length, + padding=False, + return_attention_mask=False, + return_token_type_ids=False, + add_special_tokens=False + ) + + passage_inputs['input_ids'] = item['input_ids'] + self.sep_inputs + prompt_inputs + + passage_inputs['attention_mask'] = [1] * len(passage_inputs['input_ids']) + # passage_inputs['labels'] = passage_inputs['input_ids'].copy() + # passage_inputs['labels'] = [-100] * (len(passage_inputs['input_ids']) - 1) + passage_inputs['labels'][(len(passage_inputs['input_ids']) - 1):] + passage_inputs.pop('token_type_ids') if 'token_type_ids' in passage_inputs.keys() else None + if 'position_ids' in passage_inputs.keys(): + passage_inputs['position_ids'] = list(range(len(passage_inputs['input_ids']))) + passages_inputs.append(passage_inputs) + + return passages_inputs, teacher_scores
+ + + +
+[docs] +@dataclass +class AbsLLMRerankerCollator(DataCollatorForSeq2Seq): + """ + Wrapper that does conversion from List[Tuple[encode_qry, encode_psg]] to List[qry], List[psg] + and pass batch separately to the actual collator. + Abstract out data detail for the model. + """ + query_max_len: int = 32 + passage_max_len: int = 128 + + def __call__(self, features, return_tensors='pt'): + if return_tensors is None: + return_tensors = self.return_tensors + + teacher_scores = [f[1] for f in features] + if teacher_scores[0] is None: + teacher_scores = None + elif isinstance(teacher_scores[0], list): + teacher_scores = sum(teacher_scores, []) + + features = [f[0] for f in features] + if isinstance(features[0], list): + features = sum(features, []) + + labels = [feature["labels"] for feature in features] if "labels" in features[0].keys() else None + # We have to pad the labels before calling `tokenizer.pad` as this method won't pad them and needs them of the + # same length to return tensors. + if labels is not None: + max_label_length = max(len(l) for l in labels) + # print(max_label_length) + if self.pad_to_multiple_of is not None: + max_label_length = ( + (max_label_length + self.pad_to_multiple_of - 1) + // self.pad_to_multiple_of + * self.pad_to_multiple_of + ) + + padding_side = self.tokenizer.padding_side + for feature in features: + remainder = [self.label_pad_token_id] * (max_label_length - len(feature["labels"])) + if isinstance(feature["labels"], list): + feature["labels"] = ( + feature["labels"] + remainder + if padding_side == "right" else remainder + feature["labels"] + ) + elif padding_side == "right": + feature["labels"] = np.concatenate([feature["labels"], remainder]).astype(np.int64) + else: + feature["labels"] = np.concatenate([remainder, feature["labels"]]).astype(np.int64) + + collated = self.tokenizer.pad( + features, + padding=self.padding, + max_length=self.query_max_len + self.passage_max_len, + return_tensors=return_tensors, + pad_to_multiple_of=self.pad_to_multiple_of, + ) + + return { + "pair": collated, + "teacher_scores": teacher_scores, + }
\ No newline at end of file
diff --git a/_modules/FlagEmbedding/abc/finetune/reranker/AbsModeling.html b/_modules/FlagEmbedding/abc/finetune/reranker/AbsModeling.html
new file mode 100644
index 00000000..255e95d5
--- /dev/null
+++ b/_modules/FlagEmbedding/abc/finetune/reranker/AbsModeling.html
@@ -0,0 +1,645 @@

Source code for FlagEmbedding.abc.finetune.reranker.AbsModeling

+import torch
+from torch import nn, Tensor
+from transformers import AutoTokenizer
+from transformers.file_utils import ModelOutput
+
+import logging
+from dataclasses import dataclass
+from abc import ABC, abstractmethod
+from typing import Dict, Optional, List, Union
+
+logger = logging.getLogger(__name__)
+
+
+
+[docs] +@dataclass +class RerankerOutput(ModelOutput): + loss: Optional[Tensor] = None + scores: Optional[Tensor] = None
+ + + +
+[docs]
+class AbsRerankerModel(ABC, nn.Module):
+    """Abstract class of reranker model for training.
+
+    Args:
+        base_model: The base model to train on.
+        tokenizer (AutoTokenizer, optional): The tokenizer to use. Defaults to ``None``.
+        train_batch_size (int, optional): Batch size used for training. Defaults to ``4``.
+    """
+    def __init__(
+        self,
+        base_model: None,
+        tokenizer: AutoTokenizer = None,
+        train_batch_size: int = 4,
+    ):
+        super().__init__()
+        self.model = base_model
+        self.tokenizer = tokenizer
+        self.cross_entropy = nn.CrossEntropyLoss(reduction='mean')
+
+        if self.model.config.pad_token_id is None:
+            self.model.config.pad_token_id = self.tokenizer.pad_token_id
+        self.config = self.model.config
+
+        self.train_batch_size = train_batch_size
+
+        self.yes_loc = self.tokenizer('Yes', add_special_tokens=False)['input_ids'][-1]
+
+[docs] + def gradient_checkpointing_enable(self, **kwargs): + """ + Activates gradient checkpointing for the current model. + """ + self.model.gradient_checkpointing_enable(**kwargs)
+ + +
+[docs] + def enable_input_require_grads(self, **kwargs): + """ + Enables the gradients for the input embeddings. + """ + self.model.enable_input_require_grads(**kwargs)
+ + +
+[docs]
+    @abstractmethod
+    def encode(self, features):
+        """Abstract method of encode.
+
+        Args:
+            features (dict): Features to pass to the model.
+        """
+        pass
+ + +
+[docs] + def forward(self, pair: Union[Dict[str, Tensor], List[Dict[str, Tensor]]] = None, teacher_scores: Optional[Tensor] = None): + """The computation performed at every call. + + Args: + pair (Union[Dict[str, Tensor], List[Dict[str, Tensor]]], optional): The query-document pair. Defaults to ``None``. + teacher_scores (Optional[Tensor], optional): Teacher scores of knowledge distillation. Defaults to None. + + Returns: + RerankerOutput: Output of reranker model. + """ + ranker_logits = self.encode(pair) # (batch_size * num, dim) + if teacher_scores is not None: + teacher_scores = torch.Tensor(teacher_scores) + teacher_targets = teacher_scores.view(self.train_batch_size, -1) + teacher_targets = torch.softmax(teacher_targets.detach(), dim=-1) + + if self.training: + grouped_logits = ranker_logits.view(self.train_batch_size, -1) + target = torch.zeros(self.train_batch_size, device=grouped_logits.device, dtype=torch.long) + loss = self.compute_loss(grouped_logits, target) + if teacher_scores is not None: + teacher_targets = teacher_targets.to(grouped_logits.device) + # print(teacher_targets, torch.mean(torch.sum(torch.log_softmax(grouped_logits, dim=-1) * teacher_targets, dim=-1))) + loss += - torch.mean(torch.sum(torch.log_softmax(grouped_logits, dim=-1) * teacher_targets, dim=-1)) + else: + loss = None + + # print(loss) + return RerankerOutput( + loss=loss, + scores=ranker_logits, + )
+ + +
+[docs] + def compute_loss(self, scores, target): + """Compute the loss. + + Args: + scores (torch.Tensor): Computed scores. + target (torch.Tensor): The target value. + + Returns: + torch.Tensor: The computed loss. + """ + return self.cross_entropy(scores, target)
+ + +
+[docs] + def save(self, output_dir: str): + """Save the model. + + Args: + output_dir (str): Directory for saving the model. + """ + # self.model.save_pretrained(output_dir) + state_dict = self.model.state_dict() + state_dict = type(state_dict)( + {k: v.clone().cpu() + for k, + v in state_dict.items()}) + self.model.save_pretrained(output_dir, state_dict=state_dict)
+ + +
+[docs] + def save_pretrained(self, *args, **kwargs): + """ + Save the tokenizer and model. + """ + self.tokenizer.save_pretrained(*args, **kwargs) + return self.model.save_pretrained(*args, **kwargs)
+
\ No newline at end of file
diff --git a/_modules/FlagEmbedding/abc/finetune/reranker/AbsRunner.html b/_modules/FlagEmbedding/abc/finetune/reranker/AbsRunner.html
new file mode 100644
index 00000000..ab01ff75
--- /dev/null
+++ b/_modules/FlagEmbedding/abc/finetune/reranker/AbsRunner.html
@@ -0,0 +1,647 @@

Source code for FlagEmbedding.abc.finetune.reranker.AbsRunner

+import os
+import logging
+from pathlib import Path
+from typing import Tuple
+from abc import ABC, abstractmethod
+from transformers import set_seed, PreTrainedTokenizer
+
+
+from .AbsArguments import (
+    AbsRerankerModelArguments,
+    AbsRerankerDataArguments,
+    AbsRerankerTrainingArguments
+)
+from .AbsTrainer import AbsRerankerTrainer
+from .AbsModeling import AbsRerankerModel
+from .AbsDataset import (
+    AbsRerankerTrainDataset, AbsRerankerCollator,
+    AbsLLMRerankerTrainDataset, AbsLLMRerankerCollator
+)
+
+logger = logging.getLogger(__name__)
+
+
+
+[docs] +class AbsRerankerRunner(ABC): + """Abstract class to run reranker model fine-tuning. + + Args: + model_args (AbsRerankerModelArguments): Model arguments + data_args (AbsRerankerDataArguments): Data arguments. + training_args (AbsRerankerTrainingArguments): Training arguments. + """ + def __init__( + self, + model_args: AbsRerankerModelArguments, + data_args: AbsRerankerDataArguments, + training_args: AbsRerankerTrainingArguments + ): + self.model_args = model_args + self.data_args = data_args + self.training_args = training_args + + if ( + os.path.exists(training_args.output_dir) + and os.listdir(training_args.output_dir) + and training_args.do_train + and not training_args.overwrite_output_dir + ): + raise ValueError( + f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." + ) + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, + ) + logger.warning( + "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", + training_args.local_rank, + training_args.device, + training_args.n_gpu, + bool(training_args.local_rank != -1), + training_args.fp16, + ) + logger.info("Training/evaluation parameters %s", training_args) + logger.info("Model parameters %s", model_args) + logger.info("Data parameters %s", data_args) + + # Set seed + set_seed(training_args.seed) + + self.tokenizer, self.model = self.load_tokenizer_and_model() + self.train_dataset = self.load_train_dataset() + self.data_collator = self.load_data_collator() + self.trainer = self.load_trainer() + +
+[docs] + @abstractmethod + def load_tokenizer_and_model(self) -> Tuple[PreTrainedTokenizer, AbsRerankerModel]: + """Abstract method to load the tokenizer and model. + + Returns: + Tuple[PreTrainedTokenizer, AbsRerankerModel]: Loaded tokenizer and model instances. + """ + pass
+ + +
+[docs] + @abstractmethod + def load_trainer(self) -> AbsRerankerTrainer: + """Abstract method to load the trainer. + + Returns: + AbsRerankerTrainer: The loaded trainer instance. + """ + pass
+ + +
+[docs] + def load_train_dataset(self) -> AbsRerankerTrainDataset: + """Loads the training dataset based on data arguments. + + Returns: + AbsRerankerTrainDataset: The loaded dataset instance. + """ + if self.model_args.model_type == 'encoder': + train_dataset = AbsRerankerTrainDataset( + args=self.data_args, + tokenizer=self.tokenizer + ) + else: + train_dataset = AbsLLMRerankerTrainDataset( + args=self.data_args, + tokenizer=self.tokenizer + ) + return train_dataset
+ + +
+[docs] + def load_data_collator(self) -> AbsRerankerCollator: + """Loads the appropriate data collator. + + Returns: + AbsRerankerCollator: Loaded data collator. + """ + if self.model_args.model_type == 'encoder': + RerankerCollator = AbsRerankerCollator + else: + RerankerCollator = AbsLLMRerankerCollator + + data_collator = RerankerCollator( + tokenizer=self.tokenizer, + query_max_len=self.data_args.query_max_len, + passage_max_len=self.data_args.passage_max_len, + pad_to_multiple_of=self.data_args.pad_to_multiple_of, + padding=True, + return_tensors="pt" + ) + return data_collator
+ + +
+[docs] + def run(self): + """ + Executes the training process. + """ + Path(self.training_args.output_dir).mkdir(parents=True, exist_ok=True) + + # Training + self.trainer.train(resume_from_checkpoint=self.training_args.resume_from_checkpoint) + self.trainer.save_model()
+
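Only load_tokenizer_and_model and load_trainer are abstract here; dataset and collator selection, logging, seeding, and run all come from the base class. A minimal sketch of a concrete runner, assuming MyRerankerModel (an AbsRerankerModel subclass) and MyRerankerTrainer (an AbsRerankerTrainer subclass) are defined elsewhere; both names, and the model_name_or_path field, are assumptions made for illustration:

from typing import Tuple
from transformers import AutoModelForSequenceClassification, AutoTokenizer, PreTrainedTokenizer

class MyRerankerRunner(AbsRerankerRunner):
    def load_tokenizer_and_model(self) -> Tuple[PreTrainedTokenizer, AbsRerankerModel]:
        tokenizer = AutoTokenizer.from_pretrained(self.model_args.model_name_or_path)
        base_model = AutoModelForSequenceClassification.from_pretrained(
            self.model_args.model_name_or_path, num_labels=1
        )
        # MyRerankerModel is a hypothetical AbsRerankerModel subclass defined elsewhere.
        return tokenizer, MyRerankerModel(base_model, tokenizer=tokenizer)

    def load_trainer(self) -> AbsRerankerTrainer:
        # MyRerankerTrainer is a hypothetical AbsRerankerTrainer subclass that implements _save().
        return MyRerankerTrainer(
            model=self.model,
            args=self.training_args,
            train_dataset=self.train_dataset,
            data_collator=self.data_collator,
            tokenizer=self.tokenizer,
        )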
\ No newline at end of file
diff --git a/_modules/FlagEmbedding/abc/finetune/reranker/AbsTrainer.html b/_modules/FlagEmbedding/abc/finetune/reranker/AbsTrainer.html
new file mode 100644
index 00000000..07de2a56
--- /dev/null
+++ b/_modules/FlagEmbedding/abc/finetune/reranker/AbsTrainer.html
@@ -0,0 +1,529 @@

Source code for FlagEmbedding.abc.finetune.reranker.AbsTrainer

+import logging
+from typing import Optional
+from abc import ABC, abstractmethod
+from transformers.trainer import Trainer
+
+logger = logging.getLogger(__name__)
+
+
+
+[docs] +class AbsRerankerTrainer(ABC, Trainer): + """ + Abstract class for the trainer of reranker. + """ + @abstractmethod + def _save(self, output_dir: Optional[str] = None, state_dict=None): + pass + +
+[docs] + def compute_loss(self, model, inputs, return_outputs=False, **kwargs): + """ + How the loss is computed by Trainer. By default, all models return the loss in the first element. + + Subclass and override for custom behavior. + + Args: + model (AbsRerankerModel): The model being trained. + inputs (dict): A dictionary of input tensors to be passed to the model. + return_outputs (bool, optional): If ``True``, returns both the loss and the model's outputs. Otherwise, + returns only the loss. Defaults to ``False``. + + Returns: + Union[torch.Tensor, tuple(torch.Tensor, RerankerOutput)]: The computed loss. If ``return_outputs`` is ``True``, + also returns the model's outputs in a tuple ``(loss, outputs)``. + """ + + outputs = model(**inputs) + loss = outputs.loss + + return (loss, outputs) if return_outputs else loss
+
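Because compute_loss just forwards the batch and reads outputs.loss, a concrete trainer usually only needs to fill in _save. A minimal sketch that mirrors the decoder-only trainer later in this diff; it assumes the wrapped model exposes a save(output_dir) method, as the modeling classes in this code base do:

import os
from typing import Optional

import torch

class SimpleRerankerTrainer(AbsRerankerTrainer):
    def _save(self, output_dir: Optional[str] = None, state_dict=None):
        output_dir = output_dir if output_dir is not None else self.args.output_dir
        os.makedirs(output_dir, exist_ok=True)
        # Delegate checkpointing to the wrapped model's own save() method.
        self.model.save(output_dir)
        if self.tokenizer is not None and self.is_world_process_zero():
            self.tokenizer.save_pretrained(output_dir)
        torch.save(self.args, os.path.join(output_dir, "training_args.bin"))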
\ No newline at end of file
diff --git a/_modules/FlagEmbedding/abc/inference/AbsEmbedder.html b/_modules/FlagEmbedding/abc/inference/AbsEmbedder.html
new file mode 100644
index 00000000..5bdc89f1
--- /dev/null
+++ b/_modules/FlagEmbedding/abc/inference/AbsEmbedder.html
@@ -0,0 +1,945 @@

Source code for FlagEmbedding.abc.inference.AbsEmbedder

+import logging
+from tqdm import tqdm, trange
+from abc import ABC, abstractmethod
+from typing import Any, Union, List, Dict, Literal, Optional
+
+import queue
+import multiprocessing as mp
+from multiprocessing import Queue
+
+import math
+import gc
+import torch
+import numpy as np
+from transformers import is_torch_npu_available
+
+logger = logging.getLogger(__name__)
+
+
+
+[docs] +class AbsEmbedder(ABC): + """ + Base class for embedder. + Extend this class and implement :meth:`encode_queries`, :meth:`encode_corpus`, :meth:`encode` for custom embedders. + + Args: + model_name_or_path (str): If it's a path to a local model, it loads the model from the path. Otherwise tries to download and + load a model from HuggingFace Hub with the name. + normalize_embeddings (bool, optional): If True, normalize the embedding vector. Defaults to :data:`True`. + use_fp16 (bool, optional): If true, use half-precision floating-point to speed up computation with a slight performance + degradation. Defaults to :data:`True`. + query_instruction_for_retrieval: (Optional[str], optional): Query instruction for retrieval tasks, which will be used with + with :attr:`query_instruction_format`. Defaults to :data:`None`. + query_instruction_format: (str, optional): The template for :attr:`query_instruction_for_retrieval`. Defaults to :data:`"{}{}"`. + devices (Optional[Union[str, int, List[str], List[int]]], optional): Devices to use for model inference. Defaults to :data:`None`. + batch_size (int, optional): Batch size for inference. Defaults to :data:`256`. + query_max_length (int, optional): Maximum length for query. Defaults to :data:`512`. + passage_max_length (int, optional): Maximum length for passage. Defaults to :data:`512`. + convert_to_numpy (bool, optional): If True, the output embedding will be a Numpy array. Otherwise, it will be a Torch Tensor. + Defaults to :data:`True`. + kwargs (Dict[Any], optional): Additional parameters for HuggingFace Transformers config or children classes. + """ + + def __init__( + self, + model_name_or_path: str, + normalize_embeddings: bool = True, + use_fp16: bool = True, + query_instruction_for_retrieval: Optional[str] = None, + query_instruction_format: str = "{}{}", # specify the format of query_instruction_for_retrieval + devices: Optional[Union[str, int, List[str], List[int]]] = None, + # inference + batch_size: int = 256, + query_max_length: int = 512, + passage_max_length: int = 512, + convert_to_numpy: bool = True, + **kwargs: Any, + ): + query_instruction_format = query_instruction_format.replace('\\n', '\n') + self.model_name_or_path = model_name_or_path + self.normalize_embeddings = normalize_embeddings + self.use_fp16 = use_fp16 + self.query_instruction_for_retrieval = query_instruction_for_retrieval + self.query_instruction_format = query_instruction_format + self.target_devices = self.get_target_devices(devices) + + self.batch_size = batch_size + self.query_max_length = query_max_length + self.passage_max_length = passage_max_length + self.convert_to_numpy = convert_to_numpy + + for k in kwargs: + setattr(self, k, kwargs[k]) + + self.kwargs = kwargs + + # tokenizer and model are initialized in the child class + self.tokenizer = None + self.model = None + self.pool = None + + def stop_self_pool(self): + if self.pool is not None: + self.stop_multi_process_pool(self.pool) + self.pool = None + try: + self.model.to('cpu') + torch.cuda.empty_cache() + except: + pass + gc.collect() + +
+[docs] + @staticmethod + def get_target_devices(devices: Union[str, int, List[str], List[int]]) -> List[str]: + """ + + Args: + devices (Union[str, int, List[str], List[int]]): specified devices, can be `str`, `int`, list of `str`, or list of `int`. + + Raises: + ValueError: Devices should be a string or an integer or a list of strings or a list of integers. + + Returns: + List[str]: A list of target devices in format. + """ + if devices is None: + if torch.cuda.is_available(): + return [f"cuda:{i}" for i in range(torch.cuda.device_count())] + elif is_torch_npu_available(): + return [f"npu:{i}" for i in range(torch.npu.device_count())] + elif torch.backends.mps.is_available(): + return [f"mps:{i}" for i in range(torch.mps.device_count())] + else: + return ["cpu"] + elif isinstance(devices, str): + return [devices] + elif isinstance(devices, int): + return [f"cuda:{devices}"] + elif isinstance(devices, list): + if isinstance(devices[0], str): + return devices + elif isinstance(devices[0], int): + return [f"cuda:{device}" for device in devices] + else: + raise ValueError("devices should be a string or an integer or a list of strings or a list of integers.") + else: + raise ValueError("devices should be a string or an integer or a list of strings or a list of integers.")
+ + +
+[docs] + @staticmethod + def get_detailed_instruct(instruction_format: str, instruction: str, sentence: str): + """Combine the instruction and sentence along with the instruction format. + + Args: + instruction_format (str): Format for instruction. + instruction (str): The text of instruction. + sentence (str): The sentence to concatenate with. + + Returns: + str: The complete sentence with instruction + """ + return instruction_format.format(instruction, sentence)
+ + +
+[docs] + def encode_queries( + self, + queries: Union[List[str], str], + batch_size: Optional[int] = None, + max_length: Optional[int] = None, + convert_to_numpy: Optional[bool] = None, + **kwargs: Any + ): + """encode the queries using the instruction if provided. + + Args: + queries (Union[List[str], str]): Input queries to encode. + batch_size (Optional[int], optional): Number of sentences for each iter. Defaults to :data:`None`. + max_length (Optional[int], optional): Maximum length of tokens. Defaults to :data:`None`. + convert_to_numpy (Optional[bool], optional): If True, the output embedding will be a Numpy array. Otherwise, it will + be a Torch Tensor. Defaults to :data:`None`. + + Returns: + Union[torch.Tensor, np.ndarray]: Return the embedding vectors in a numpy array or tensor. + """ + if batch_size is None: batch_size = self.batch_size + if max_length is None: max_length = self.query_max_length + if convert_to_numpy is None: convert_to_numpy = self.convert_to_numpy + + return self.encode( + queries, + batch_size=batch_size, + max_length=max_length, + convert_to_numpy=convert_to_numpy, + instruction=self.query_instruction_for_retrieval, + instruction_format=self.query_instruction_format, + **kwargs + )
+ + +
+[docs] + def encode_corpus( + self, + corpus: Union[List[str], str], + batch_size: Optional[int] = None, + max_length: Optional[int] = None, + convert_to_numpy: Optional[bool] = None, + **kwargs: Any + ): + """encode the corpus using the instruction if provided. + + Args: + corpus (Union[List[str], str]): Input corpus to encode. + batch_size (Optional[int], optional): Number of sentences for each iter. Defaults to :data:`None`. + max_length (Optional[int], optional): Maximum length of tokens. Defaults to :data:`None`. + convert_to_numpy (Optional[bool], optional): If True, the output embedding will be a Numpy array. Otherwise, it will + be a Torch Tensor. Defaults to :data:`None`. + + Returns: + Union[torch.Tensor, np.ndarray]: Return the embedding vectors in a numpy array or tensor. + """ + passage_instruction_for_retrieval = self.kwargs.get("passage_instruction_for_retrieval", None) + passage_instruction_format = self.kwargs.get("passage_instruction_format", "{}{}") + + if batch_size is None: batch_size = self.batch_size + if max_length is None: max_length = self.passage_max_length + if convert_to_numpy is None: convert_to_numpy = self.convert_to_numpy + + return self.encode( + corpus, + batch_size=batch_size, + max_length=max_length, + convert_to_numpy=convert_to_numpy, + instruction=passage_instruction_for_retrieval, + instruction_format=passage_instruction_format, + **kwargs + )
+ + +
+[docs] + def encode( + self, + sentences: Union[List[str], str], + batch_size: Optional[int] = None, + max_length: Optional[int] = None, + convert_to_numpy: Optional[bool] = None, + instruction: Optional[str] = None, + instruction_format: Optional[str] = None, + **kwargs: Any + ): + """encode the input sentences with the embedding model. + + Args: + sentences (Union[List[str], str]): Input sentences to encode. + batch_size (Optional[int], optional): Number of sentences for each iter. Defaults to :data:`None`. + max_length (Optional[int], optional): Maximum length of tokens. Defaults to :data:`None`. + convert_to_numpy (Optional[bool], optional): If True, the output embedding will be a Numpy array. Otherwise, it will + be a Torch Tensor. Defaults to :data:`None`. + instruction (Optional[str], optional): The text of instruction. Defaults to :data:`None`. + instruction_format (Optional[str], optional): Format for instruction. Defaults to :data:`None`. + + Returns: + Union[torch.Tensor, np.ndarray]: return the embedding vectors in a numpy array or tensor. + """ + if batch_size is None: batch_size = self.batch_size + if max_length is None: max_length = self.passage_max_length + if convert_to_numpy is None: convert_to_numpy = self.convert_to_numpy + + if instruction is not None: + if isinstance(sentences, str): + sentences = self.get_detailed_instruct(instruction_format, instruction, sentences) + else: + sentences = [self.get_detailed_instruct(instruction_format, instruction, sentence) for sentence in + sentences] + + if isinstance(sentences, str) or len(self.target_devices) == 1: + return self.encode_single_device( + sentences, + batch_size=batch_size, + max_length=max_length, + convert_to_numpy=convert_to_numpy, + device=self.target_devices[0], + **kwargs + ) + + if self.pool is None: + self.pool = self.start_multi_process_pool(AbsEmbedder._encode_multi_process_worker) + embeddings = self.encode_multi_process( + sentences, + self.pool, + batch_size=batch_size, + max_length=max_length, + convert_to_numpy=convert_to_numpy, + **kwargs + ) + return embeddings
+ + + def __del__(self): + self.stop_self_pool() + +
+[docs] + @abstractmethod + def encode_single_device( + self, + sentences: Union[List[str], str], + batch_size: int = 256, + max_length: int = 512, + convert_to_numpy: bool = True, + device: Optional[str] = None, + **kwargs: Any, + ): + """ + This method should encode sentences and return embeddings on a single device. + """ + pass
+ + + # adapted from https://github.com/UKPLab/sentence-transformers/blob/1802076d4eae42ff0a5629e1b04e75785d4e193b/sentence_transformers/SentenceTransformer.py#L807 +
+[docs] + def start_multi_process_pool( + self, + process_target_func: Any, + ) -> Dict[Literal["input", "output", "processes"], Any]: + """ + Starts a multi-process pool to process the encoding with several independent processes + via :meth:`SentenceTransformer.encode_multi_process <sentence_transformers.SentenceTransformer.encode_multi_process>`. + + This method is recommended if you want to encode on multiple GPUs or CPUs. It is advised + to start only one process per GPU. This method works together with encode_multi_process + and stop_multi_process_pool. + + Returns: + Dict[str, Any]: A dictionary with the target processes, an input queue, and an output queue. + """ + if self.model is None: + raise ValueError("Model is not initialized.") + + logger.info("Start multi-process pool on devices: {}".format(", ".join(map(str, self.target_devices)))) + + self.model.to("cpu") + self.model.share_memory() + ctx = mp.get_context("spawn") + input_queue = ctx.Queue() + output_queue = ctx.Queue() + processes = [] + + for device_id in tqdm(self.target_devices, desc='initial target device'): + p = ctx.Process( + target=process_target_func, + args=(device_id, self, input_queue, output_queue), + daemon=True, + ) + p.start() + processes.append(p) + + return {"input": input_queue, "output": output_queue, "processes": processes}
+ + + # adapted from https://github.com/UKPLab/sentence-transformers/blob/1802076d4eae42ff0a5629e1b04e75785d4e193b/sentence_transformers/SentenceTransformer.py#L976 +
+[docs] + @staticmethod + def _encode_multi_process_worker( + target_device: str, model: 'AbsEmbedder', input_queue: Queue, results_queue: Queue + ) -> None: + """ + Internal working process to encode sentences in multi-process setup + """ + while True: + try: + chunk_id, sentences, kwargs = ( + input_queue.get() + ) + embeddings = model.encode_single_device( + sentences, + device=target_device, + **kwargs + ) + + results_queue.put([chunk_id, embeddings]) + except queue.Empty: + break
+ + + # copied from https://github.com/UKPLab/sentence-transformers/blob/1802076d4eae42ff0a5629e1b04e75785d4e193b/sentence_transformers/SentenceTransformer.py#L857 +
+[docs] + @staticmethod + def stop_multi_process_pool(pool: Dict[Literal["input", "output", "processes"], Any]) -> None: + """ + Stops all processes started with start_multi_process_pool. + + Args: + pool (Dict[str, object]): A dictionary containing the input queue, output queue, and process list. + + Returns: + None + """ + for p in pool["processes"]: + p.terminate() + + for p in pool["processes"]: + p.join() + p.close() + + pool["input"].close() + pool["output"].close() + pool = None
+ + + # adapted from https://github.com/UKPLab/sentence-transformers/blob/1802076d4eae42ff0a5629e1b04e75785d4e193b/sentence_transformers/SentenceTransformer.py#L877 +
+[docs] + def encode_multi_process( + self, + sentences: List[str], + pool: Dict[Literal["input", "output", "processes"], Any], + **kwargs + ): + chunk_size = math.ceil(len(sentences) / len(pool["processes"])) + + input_queue = pool["input"] + last_chunk_id = 0 + chunk = [] + + for sentence in sentences: + chunk.append(sentence) + if len(chunk) >= chunk_size: + input_queue.put( + [last_chunk_id, chunk, kwargs] + ) + last_chunk_id += 1 + chunk = [] + + if len(chunk) > 0: + input_queue.put([last_chunk_id, chunk, kwargs]) + last_chunk_id += 1 + + output_queue = pool["output"] + results_list = sorted( + [output_queue.get() for _ in trange(last_chunk_id, desc="Chunks")], + key=lambda x: x[0], + ) + embeddings = self._concatenate_results_from_multi_process([result[1] for result in results_list]) + return embeddings
+ + +
+[docs] + def _concatenate_results_from_multi_process(self, results_list: List[Union[torch.Tensor, np.ndarray, Any]]): + """concatenate and return the results from all the processes + + Args: + results_list (List[Union[torch.Tensor, np.ndarray, Any]]): A list of results from all the processes. + + Raises: + NotImplementedError: Unsupported type for results_list + + Returns: + Union[torch.Tensor, np.ndarray]: return the embedding vectors in a numpy array or tensor. + """ + if isinstance(results_list[0], torch.Tensor): + return torch.cat(results_list, dim=0) + elif isinstance(results_list[0], np.ndarray): + return np.concatenate(results_list, axis=0) + else: + raise NotImplementedError("Unsupported type for results_list")
+
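In practice this abstract class is consumed through a concrete embedder such as FlagModel; the snippet below shows the typical single-process call path, where encode_queries prepends the retrieval instruction and encode_corpus does not. The model name and instruction string are illustrative, and the exact constructor arguments may differ between library versions:

from FlagEmbedding import FlagModel

model = FlagModel(
    "BAAI/bge-base-en-v1.5",
    query_instruction_for_retrieval="Represent this sentence for searching relevant passages: ",
    use_fp16=True,
)

queries = ["What is dense retrieval?"]
corpus = ["Dense retrieval maps queries and documents into a shared vector space."]

q_emb = model.encode_queries(queries)   # instruction is prepended to each query
p_emb = model.encode_corpus(corpus)     # passages are encoded as-is
print(q_emb @ p_emb.T)                  # inner-product similarity scores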
\ No newline at end of file
diff --git a/_modules/FlagEmbedding/abc/inference/AbsReranker.html b/_modules/FlagEmbedding/abc/inference/AbsReranker.html
new file mode 100644
index 00000000..af1ff3f2
--- /dev/null
+++ b/_modules/FlagEmbedding/abc/inference/AbsReranker.html
@@ -0,0 +1,860 @@

Source code for FlagEmbedding.abc.inference.AbsReranker

+import logging
+from abc import ABC, abstractmethod
+from typing import Any, Union, List, Tuple, Dict, Literal, Optional
+
+import multiprocessing as mp
+from multiprocessing import Queue
+
+import math
+import gc
+import torch
+import numpy as np
+from tqdm import tqdm, trange
+from transformers import is_torch_npu_available
+
+logger = logging.getLogger(__name__)
+
+
+
+[docs] +class AbsReranker(ABC): + """ + Base class for Reranker. + Extend this class and implement :meth:`compute_score_single_gpu` for custom rerankers. + + Args: + model_name_or_path (str): If it's a path to a local model, it loads the model from the path. Otherwise tries to download and + load a model from HuggingFace Hub with the name. + use_fp16 (bool, optional): If true, use half-precision floating-point to speed up computation with a slight performance + degradation. Defaults to :data:`False`. + query_instruction_for_rerank: (Optional[str], optional): Query instruction for reranking, which will be used with + with :attr:`query_instruction_format`. Defaults to :data:`None`. + query_instruction_format: (str, optional): The template for :attr:`query_instruction_for_rerank`. Defaults to :data:`"{}{}"`. + passage_instruction_for_rerank (Optional[str], optional): Passage instruction for reranking. Defaults to :data:`None`. + passage_instruction_format (str, optional): Passage instruction format when using :attr:`passage_instruction_for_rerank`. + Defaults to :data:`"{}{}"`. + devices (Optional[Union[str, int, List[str], List[int]]], optional): Devices to use for model inference. Defaults to :data:`None`. + batch_size (int, optional): Batch size for inference. Defaults to :data:`128`. + query_max_length (int, optional): Maximum length for query. Defaults to :data:`None`. + max_length (int, optional): Maximum length. Defaults to :data:`512`. + normalize (bool, optional): If true, normalize the result. Defaults to :data:`False`. + kwargs (Dict[Any], optional): Additional parameters for HuggingFace Transformers config or children classes. + """ + + def __init__( + self, + model_name_or_path: str, + use_fp16: bool = False, + query_instruction_for_rerank: Optional[str] = None, + query_instruction_format: str = "{}{}", # specify the format of query_instruction_for_rerank + passage_instruction_for_rerank: Optional[str] = None, + passage_instruction_format: str = "{}{}", # specify the format of passage_instruction_for_rerank + devices: Optional[Union[str, int, List[str], List[int]]] = None, + # inference + batch_size: int = 128, + query_max_length: Optional[int] = None, + max_length: int = 512, + normalize: bool = False, + **kwargs: Any, + ): + self.model_name_or_path = model_name_or_path + self.use_fp16 = use_fp16 + self.query_instruction_for_rerank = query_instruction_for_rerank + self.query_instruction_format = query_instruction_format + self.passage_instruction_for_rerank = passage_instruction_for_rerank + self.passage_instruction_format = passage_instruction_format + self.target_devices = self.get_target_devices(devices) + + self.batch_size = batch_size + self.query_max_length = query_max_length + self.max_length = max_length + self.normalize = normalize + + for k in kwargs: + setattr(self, k, kwargs[k]) + + self.kwargs = kwargs + + # tokenizer and model are initialized in the child class + self.model = None + self.tokenizer = None + self.pool = None + + def stop_self_pool(self): + if self.pool is not None: + self.stop_multi_process_pool(self.pool) + self.pool = None + try: + self.model.to('cpu') + torch.cuda.empty_cache() + except: + pass + gc.collect() + +
+[docs] + @staticmethod + def get_target_devices(devices: Union[str, int, List[str], List[int]]) -> List[str]: + """ + + Args: + devices (Union[str, int, List[str], List[int]]): Specified devices, can be `str`, `int`, list of `str`, or list of `int`. + + Raises: + ValueError: Devices should be a string or an integer or a list of strings or a list of integers. + + Returns: + List[str]: A list of target devices in format + """ + if devices is None: + if torch.cuda.is_available(): + return [f"cuda:{i}" for i in range(torch.cuda.device_count())] + elif is_torch_npu_available(): + return [f"npu:{i}" for i in range(torch.npu.device_count())] + elif torch.backends.mps.is_available(): + return ["mps"] + else: + return ["cpu"] + elif isinstance(devices, str): + return [devices] + elif isinstance(devices, int): + return [f"cuda:{devices}"] + elif isinstance(devices, list): + if isinstance(devices[0], str): + return devices + elif isinstance(devices[0], int): + return [f"cuda:{device}" for device in devices] + else: + raise ValueError("devices should be a string or an integer or a list of strings or a list of integers.") + else: + raise ValueError("devices should be a string or an integer or a list of strings or a list of integers.")
+ + +
+[docs] + def get_detailed_instruct(self, instruction_format: str, instruction: str, sentence: str): + """Combine the instruction and sentence along with the instruction format. + + Args: + instruction_format (str): Format for instruction. + instruction (str): The text of instruction. + sentence (str): The sentence to concatenate with. + + Returns: + str: The complete sentence with instruction + """ + return instruction_format.format(instruction, sentence)
+ + +
+[docs] + def get_detailed_inputs(self, sentence_pairs: Union[str, List[str]]): + """get detailed instruct for all the inputs + + Args: + sentence_pairs (Union[str, List[str]]): Input sentence pairs + + Returns: + list[list[str]]: The complete sentence pairs with instruction + """ + if isinstance(sentence_pairs, str): + sentence_pairs = [sentence_pairs] + + if self.query_instruction_for_rerank is not None: + if self.passage_instruction_for_rerank is None: + return [ + [ + self.get_detailed_instruct(self.query_instruction_format, self.query_instruction_for_rerank, sentence_pair[0]), + sentence_pair[1] + ] for sentence_pair in sentence_pairs + ] + else: + return [ + [ + self.get_detailed_instruct(self.query_instruction_format, self.query_instruction_for_rerank, sentence_pair[0]), + self.get_detailed_instruct(self.passage_instruction_format, self.passage_instruction_for_rerank, sentence_pair[1]) + ] for sentence_pair in sentence_pairs + ] + else: + if self.passage_instruction_for_rerank is None: + return [ + [ + sentence_pair[0], + sentence_pair[1] + ] for sentence_pair in sentence_pairs + ] + else: + return [ + [ + sentence_pair[0], + self.get_detailed_instruct(self.passage_instruction_format, self.passage_instruction_for_rerank, sentence_pair[1]) + ] for sentence_pair in sentence_pairs + ]
+ + +
+[docs] + def compute_score( + self, + sentence_pairs: Union[List[Tuple[str, str]], Tuple[str, str]], + **kwargs + ): + """Compute score for each sentence pair + + Args: + sentence_pairs (Union[List[Tuple[str, str]], Tuple[str, str]]): Input sentence pairs to compute. + + Returns: + numpy.ndarray: scores of all the sentence pairs. + """ + if isinstance(sentence_pairs[0], str): + sentence_pairs = [sentence_pairs] + sentence_pairs = self.get_detailed_inputs(sentence_pairs) + + if isinstance(sentence_pairs, str) or len(self.target_devices) == 1: + return self.compute_score_single_gpu( + sentence_pairs, + device=self.target_devices[0], + **kwargs + ) + + if self.pool is None: + self.pool = self.start_multi_process_pool() + scores = self.encode_multi_process(sentence_pairs, + self.pool, + **kwargs) + return scores
+ + + def __del__(self): + self.stop_self_pool() + +
+[docs] + @abstractmethod + def compute_score_single_gpu( + self, + sentence_pairs: Union[List[Tuple[str, str]], Tuple[str, str]], + batch_size: int = 256, + query_max_length: Optional[int] = None, + max_length: int = 512, + normalize: bool = False, + device: Optional[str] = None, + **kwargs: Any, + ): + """ + This method should compute the scores of sentence_pair and return scores. + """ + pass
+ + + # copied from https://github.com/UKPLab/sentence-transformers/blob/1802076d4eae42ff0a5629e1b04e75785d4e193b/sentence_transformers/SentenceTransformer.py#L857 +
+[docs] + def start_multi_process_pool(self) -> Dict[Literal["input", "output", "processes"], Any]: + """ + Starts a multi-process pool to process the encoding with several independent processes + via :meth:`SentenceTransformer.encode_multi_process <sentence_transformers.SentenceTransformer.encode_multi_process>`. + + This method is recommended if you want to encode on multiple GPUs or CPUs. It is advised + to start only one process per GPU. This method works together with encode_multi_process + and stop_multi_process_pool. + + Returns: + Dict[str, Any]: A dictionary with the target processes, an input queue, and an output queue. + """ + logger.info("Start multi-process pool on devices: {}".format(", ".join(map(str, self.target_devices)))) + + self.model.to("cpu") + self.model.share_memory() + ctx = mp.get_context("spawn") + input_queue = ctx.Queue() + output_queue = ctx.Queue() + processes = [] + + for device_id in tqdm(self.target_devices, desc='initial target device'): + p = ctx.Process( + target=AbsReranker._encode_multi_process_worker, + args=(device_id, self, input_queue, output_queue), + daemon=True, + ) + p.start() + processes.append(p) + + return {"input": input_queue, "output": output_queue, "processes": processes}
+ + + # copied from https://github.com/UKPLab/sentence-transformers/blob/1802076d4eae42ff0a5629e1b04e75785d4e193b/sentence_transformers/SentenceTransformer.py#L857 +
+[docs] + def encode_multi_process( + self, + sentence_pairs: List, + pool: Dict[Literal["input", "output", "processes"], Any], + **kwargs + ) -> np.ndarray: + chunk_size = math.ceil(len(sentence_pairs) / len(pool["processes"])) + + input_queue = pool["input"] + last_chunk_id = 0 + chunk = [] + + for sentence_pair in sentence_pairs: + chunk.append(sentence_pair) + if len(chunk) >= chunk_size: + input_queue.put( + [last_chunk_id, chunk, kwargs] + ) + last_chunk_id += 1 + chunk = [] + + if len(chunk) > 0: + input_queue.put([last_chunk_id, chunk, kwargs]) + last_chunk_id += 1 + + output_queue = pool["output"] + results_list = sorted( + [output_queue.get() for _ in trange(last_chunk_id, desc="Chunks")], + key=lambda x: x[0], + ) + scores = np.concatenate([result[1] for result in results_list]) + return scores
+ + + # copied from https://github.com/UKPLab/sentence-transformers/blob/1802076d4eae42ff0a5629e1b04e75785d4e193b/sentence_transformers/SentenceTransformer.py#L857 +
+[docs] + @staticmethod + def _encode_multi_process_worker( + target_device: str, model: 'AbsReranker', input_queue: Queue, results_queue: Queue + ) -> None: + """ + Internal working process to encode sentences in multi-process setup + """ + while True: + try: + chunk_id, sentences, kwargs = ( + input_queue.get() + ) + embeddings = model.compute_score_single_gpu( + sentences, + device=target_device, + **kwargs + ) + + results_queue.put([chunk_id, embeddings]) + except: + break
+ + + # copied from https://github.com/UKPLab/sentence-transformers/blob/1802076d4eae42ff0a5629e1b04e75785d4e193b/sentence_transformers/SentenceTransformer.py#L857 +
+[docs] + @staticmethod + def stop_multi_process_pool(pool: Dict[Literal["input", "output", "processes"], Any]) -> None: + """ + Stops all processes started with start_multi_process_pool. + + Args: + pool (Dict[str, object]): A dictionary containing the input queue, output queue, and process list. + + Returns: + None + """ + for p in pool["processes"]: + p.terminate() + + for p in pool["processes"]: + p.join() + p.close() + + pool["input"].close() + pool["output"].close()
+
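The reranker counterpart is usually used through a concrete class such as FlagReranker. compute_score accepts a single (query, passage) pair or a list of pairs, and extra keyword arguments (for example normalize) are forwarded to compute_score_single_gpu as shown above. The model name is illustrative:

from FlagEmbedding import FlagReranker

reranker = FlagReranker("BAAI/bge-reranker-base", use_fp16=True)

pairs = [
    ["what is panda?", "The giant panda is a bear species endemic to China."],
    ["what is panda?", "Paris is the capital of France."],
]
scores = reranker.compute_score(pairs)                        # raw logits, higher = more relevant
normalized = reranker.compute_score(pairs, normalize=True)    # typically squashed to (0, 1)
print(scores, normalized)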
\ No newline at end of file
diff --git a/_modules/FlagEmbedding/finetune/embedder/decoder_only/base/arguments.html b/_modules/FlagEmbedding/finetune/embedder/decoder_only/base/arguments.html
new file mode 100644
index 00000000..c30cb441
--- /dev/null
+++ b/_modules/FlagEmbedding/finetune/embedder/decoder_only/base/arguments.html
@@ -0,0 +1,560 @@

Source code for FlagEmbedding.finetune.embedder.decoder_only.base.arguments

+from typing import Optional, List
+from dataclasses import dataclass, field
+
+from FlagEmbedding.abc.finetune.embedder import AbsEmbedderModelArguments
+
+
+def default_target_modules() -> List[int]:
+    return ['v_proj', 'q_proj', 'k_proj', 'gate_proj', 'down_proj', 'o_proj', 'up_proj']
+
+
+
+[docs] +@dataclass +class DecoderOnlyEmbedderModelArguments(AbsEmbedderModelArguments): + """ + Model argument class for decoder only base model. + """ + peft_model_path: str = field( + default='', metadata={"help": "The peft model checkpoint for initialization."} + ) + use_lora: bool = field( + default=True, + metadata={"help": "If passed, will use LORA (low-rank parameter-efficient training) to train the model."} + ) + lora_rank: int = field( + default=64, + metadata={"help": "The rank of lora."} + ) + lora_alpha: float = field( + default=16, + metadata={"help": "The alpha parameter of lora."} + ) + lora_dropout: float = field( + default=0.1, + metadata={"help": "The dropout rate of lora modules."} + ) + target_modules: List[str] = field( + default_factory=default_target_modules, + metadata={"help": "The target modules to apply LORA."} + ) + use_flash_attn: bool = field( + default=False, + metadata={"help": "If passed, will use flash attention to train the model."} + ) + use_slow_tokenizer: bool = field( + default=False, + metadata={"help": "If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library)."} + ) + # low_cpu_mem_usage: bool = field( + # default=False, + # metadata={"help": "It is an option to create the model as an empty shell," + # "then only materialize its parameters when the pretrained weights are loaded." + # "If passed, LLM loading time and RAM consumption will be benefited."} + # ) + from_peft: str = field( + default=None + ) + modules_to_save: str = field( + default=None + ) + raw_peft: str = field( + default=None + ) + + additional_special_tokens: Optional[str] = field( + default=None, + metadata={"help": "additional special tokens", "nargs": "+"} + ) + + save_merged_lora_model: bool = field( + default=False, + metadata={"help": "If passed, will merge the lora modules and save the entire model."} + )
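These dataclasses are meant to be driven by transformers.HfArgumentParser, so every LoRA hyper-parameter becomes a CLI flag. A hedged sketch of parsing them in isolation; it assumes the inherited model_name_or_path field is the only required argument, which may not hold for every version of the base class, and the checkpoint name is a placeholder:

from transformers import HfArgumentParser

parser = HfArgumentParser(DecoderOnlyEmbedderModelArguments)
(model_args,) = parser.parse_args_into_dataclasses(args=[
    "--model_name_or_path", "meta-llama/Llama-2-7b-hf",   # placeholder checkpoint
    "--use_lora", "True",
    "--lora_rank", "32",
    "--lora_alpha", "64",
    "--target_modules", "q_proj", "k_proj", "v_proj", "o_proj",
])
print(model_args.lora_rank, model_args.target_modules)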
\ No newline at end of file
diff --git a/_modules/FlagEmbedding/finetune/embedder/decoder_only/base/modeling.html b/_modules/FlagEmbedding/finetune/embedder/decoder_only/base/modeling.html
new file mode 100644
index 00000000..8df36256
--- /dev/null
+++ b/_modules/FlagEmbedding/finetune/embedder/decoder_only/base/modeling.html
@@ -0,0 +1,705 @@

Source code for FlagEmbedding.finetune.embedder.decoder_only.base.modeling

+import logging
+
+import torch
+from transformers import AutoModel, AutoTokenizer
+
+from FlagEmbedding.abc.finetune.embedder import AbsEmbedderModel
+
+logger = logging.getLogger(__name__)
+
+
+
+[docs] +class BiDecoderOnlyEmbedderModel(AbsEmbedderModel): + """Embedder model class for decoder only model. + + Args: + base_model (AutoModel): The base model to train on. + tokenizer (AutoTokenizer, optional): The tokenizer to use. Defaults to ``None``. + negatives_cross_device (bool, optional): If True, will compute cross devices negative loss. Defaults to ``False``. + temperature (float, optional): Temperature to control the scale of scores. Defaults to ``1.0``. + sub_batch_size (int, optional): Sub-batch size during encoding. If negative, will not split to sub-batch. + Defaults to ``-1``. + kd_loss_type (str, optional): Type of knowledge distillation loss. Defaults to ``'kl_div'``. + sentence_pooling_method (str, optional): Pooling method to get sentence embedding. Defaults to ``'last_token'``. + normalize_embeddings (bool, optional): If True, normalize the embedding vector. Defaults to ``False``. + """ + TRANSFORMER_CLS = AutoModel + + def __init__( + self, + base_model: AutoModel, + tokenizer: AutoTokenizer = None, + negatives_cross_device: bool = False, + temperature: float = 1.0, + sub_batch_size: int = -1, + kd_loss_type: str = 'kl_div', + sentence_pooling_method: str = 'last_token', + normalize_embeddings: bool = False, + ): + super().__init__( + base_model, + tokenizer=tokenizer, + negatives_cross_device=negatives_cross_device, + temperature=temperature, + sub_batch_size=sub_batch_size, + kd_loss_type=kd_loss_type, + ) + self.sentence_pooling_method = sentence_pooling_method + self.normalize_embeddings = normalize_embeddings + self.cross_entropy = torch.nn.CrossEntropyLoss(reduction='mean') + +
+[docs] + def encode(self, features): + """ + Encode and get the embedding. + + Args: + features (Union[list, dict]): Features feed to the model. + + Returns: + torch.Tensor: The embedding vectors. + """ + if features is None: + return None + if not isinstance(features, list): + if self.sub_batch_size is not None and self.sub_batch_size > 0: + all_p_reps = [] + for i in range(0, len(features['attention_mask']), self.sub_batch_size): + end_inx = min(i + self.sub_batch_size, len(features['attention_mask'])) + sub_features = {} + for k, v in features.items(): + sub_features[k] = v[i:end_inx] + last_hidden_state = self.model(**sub_features, return_dict=True).last_hidden_state + p_reps = self._sentence_embedding(last_hidden_state, sub_features['attention_mask']) + all_p_reps.append(p_reps) + all_p_reps = torch.cat(all_p_reps, 0).contiguous() + if self.normalize_embeddings: + all_p_reps = torch.nn.functional.normalize(all_p_reps, dim=-1) + return all_p_reps.contiguous() + else: + last_hidden_state = self.model(**features, return_dict=True).last_hidden_state + all_p_reps = self._sentence_embedding(last_hidden_state, features['attention_mask']) + if self.normalize_embeddings: + all_p_reps = torch.nn.functional.normalize(all_p_reps, dim=-1) + return all_p_reps.contiguous() + else: + all_p_reps = [] + for sub_features in features: + last_hidden_state = self.model(**sub_features, return_dict=True).last_hidden_state + p_reps = self._sentence_embedding(last_hidden_state, sub_features['attention_mask']) + all_p_reps.append(p_reps) + all_p_reps = torch.cat(all_p_reps, 0).contiguous() + if self.normalize_embeddings: + all_p_reps = torch.nn.functional.normalize(all_p_reps, dim=-1) + return all_p_reps.contiguous()
+ + +
+[docs] + def _sentence_embedding(self, last_hidden_state, attention_mask): + """Use the pooling method to get the sentence embedding. + + Args: + last_hidden_state (torch.Tensor): The model output's last hidden state. + attention_mask (torch.Tensor): Mask out padding tokens during pooling. + + Raises: + NotImplementedError: Specified pooling method not implemented. + + Returns: + torch.Tensor: The sentence embeddings. + """ + if self.sentence_pooling_method == "cls": + return last_hidden_state[:, 0] + elif self.sentence_pooling_method == "mean": + s = torch.sum( + last_hidden_state * attention_mask.unsqueeze(-1).float(), dim=1 + ) + d = attention_mask.sum(dim=1, keepdim=True).float() + return s / d + elif self.sentence_pooling_method == "last_token": + left_padding = attention_mask[:, -1].sum() == attention_mask.shape[0] + if left_padding: + return last_hidden_state[:, -1] + else: + sequence_lengths = attention_mask.sum(dim=1) - 1 + batch_size = last_hidden_state.shape[0] + return last_hidden_state[ + torch.arange(batch_size, device=last_hidden_state.device), + sequence_lengths, + ] + else: + raise NotImplementedError(f"pooling method {self.sentence_pooling_method} not implemented")
+ + +
+[docs] + def compute_score(self, q_reps, p_reps): + """Computes the scores between query and passage representations. + + Args: + q_reps (torch.Tensor): Query representations. + p_reps (torch.Tensor): Passage representations. + + Returns: + torch.Tensor: The computed scores, adjusted by temperature. + """ + scores = self._compute_similarity(q_reps, p_reps) / self.temperature + scores = scores.view(q_reps.size(0), -1) + return scores
+ + +
+[docs] + def _compute_similarity(self, q_reps, p_reps): + """Computes the similarity between query and passage representations using inner product. + + Args: + q_reps (torch.Tensor): Query representations. + p_reps (torch.Tensor): Passage representations. + + Returns: + torch.Tensor: The computed similarity matrix. + """ + if len(p_reps.size()) == 2: + return torch.matmul(q_reps, p_reps.transpose(0, 1)) + return torch.matmul(q_reps, p_reps.transpose(-2, -1))
+ + +
+[docs] + def compute_loss(self, scores, target): + """Compute the loss using cross entropy. + + Args: + scores (torch.Tensor): Computed score. + target (torch.Tensor): The target value. + + Returns: + torch.Tensor: The computed cross entropy loss. + """ + return self.cross_entropy(scores, target)
+ + +
+[docs] + def gradient_checkpointing_enable(self, **kwargs): + """ + Activates gradient checkpointing for the current model. + """ + self.model.gradient_checkpointing_enable(**kwargs)
+ + +
+[docs] + def enable_input_require_grads(self, **kwargs): + """ + Enables the gradients for the input embeddings. + """ + self.model.enable_input_require_grads(**kwargs)
+ + +
+[docs] + def save(self, output_dir: str): + """Save the model to the directory. + + Args: + output_dir (str): Directory for saving the model. + """ + state_dict = self.model.state_dict() + state_dict = type(state_dict)( + {k: v.clone().cpu() + for k, + v in state_dict.items()}) + self.model.save_pretrained(output_dir, state_dict=state_dict)
+
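The last_token branch of _sentence_embedding is worth sanity-checking in isolation: with right padding, each sequence's embedding is the hidden state at its last non-padded position. A small self-contained check with toy tensors (no real model involved):

import torch

last_hidden_state = torch.arange(24, dtype=torch.float).reshape(2, 3, 4)  # (batch, seq_len, hidden)
attention_mask = torch.tensor([[1, 1, 1],    # full-length sequence
                               [1, 1, 0]])   # right-padded by one token

# Same logic as the right-padding branch above.
sequence_lengths = attention_mask.sum(dim=1) - 1               # tensor([2, 1])
pooled = last_hidden_state[torch.arange(2), sequence_lengths]  # picks position 2 and position 1
print(pooled.shape)                                            # torch.Size([2, 4])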
\ No newline at end of file
diff --git a/_modules/FlagEmbedding/finetune/embedder/decoder_only/base/runner.html b/_modules/FlagEmbedding/finetune/embedder/decoder_only/base/runner.html
new file mode 100644
index 00000000..cb493d8c
--- /dev/null
+++ b/_modules/FlagEmbedding/finetune/embedder/decoder_only/base/runner.html
@@ -0,0 +1,625 @@

Source code for FlagEmbedding.finetune.embedder.decoder_only.base.runner

+import logging
+from typing import Tuple
+from pathlib import Path
+from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizer
+
+from FlagEmbedding.abc.finetune.embedder.AbsArguments import AbsEmbedderDataArguments, AbsEmbedderTrainingArguments
+from FlagEmbedding.abc.finetune.embedder import AbsEmbedderRunner, AbsEmbedderModel, EmbedderTrainerCallbackForDataRefresh
+
+from .arguments import DecoderOnlyEmbedderModelArguments
+from .trainer import DecoderOnlyEmbedderTrainer
+from .modeling import BiDecoderOnlyEmbedderModel
+from .load_model import get_model, save_merged_model
+
+logger = logging.getLogger(__name__)
+
+
+
+[docs] +class DecoderOnlyEmbedderRunner(AbsEmbedderRunner): + """Runner class for decoder only embedding model. + + Args: + model_args (DecoderOnlyEmbedderModelArguments): Model arguments instance. + data_args (AbsEmbedderDataArguments): Data arguments instance. + training_args (AbsEmbedderTrainingArguments): Trainer arguments. + """ + def __init__( + self, + model_args: DecoderOnlyEmbedderModelArguments, + data_args: AbsEmbedderDataArguments, + training_args: AbsEmbedderTrainingArguments + ): + super().__init__(model_args, data_args, training_args) + +
+[docs] + def load_tokenizer_and_model(self) -> Tuple[PreTrainedTokenizer, AbsEmbedderModel]: + """Load tokenizer and model. + + Returns: + Tuple[PreTrainedTokenizer, AbsEmbedderModel]: Tokenizer and model instances. + """ + tokenizer = AutoTokenizer.from_pretrained( + self.model_args.tokenizer_name if self.model_args.tokenizer_name else self.model_args.model_name_or_path, + token=self.model_args.token, + cache_dir=self.model_args.cache_dir, + use_fast=False, + add_eos_token=True + ) + + if tokenizer.pad_token is None: + if tokenizer.unk_token is not None: + tokenizer.pad_token = tokenizer.unk_token + tokenizer.pad_token_id = tokenizer.unk_token_id + else: + tokenizer.pad_token = tokenizer.eos_token + tokenizer.pad_token_id = tokenizer.eos_token_id + tokenizer.padding_side = 'left' + + resize = False + if self.model_args.additional_special_tokens is not None: + special_tokens_dict = {'additional_special_tokens': self.model_args.additional_special_tokens} + add_num = tokenizer.add_special_tokens(special_tokens_dict) + if add_num > 0: + resize = True + logger.info(f"Add {add_num} special tokens to the tokenizer. Special tokens: {self.model_args.additional_special_tokens}") + else: + logger.warning(f"Special tokens {self.model_args.additional_special_tokens} already exists in the tokenizer.") + base_model = get_model(self.model_args, self.training_args.output_dir, resize, len(tokenizer)) + + num_labels = 1 + config = AutoConfig.from_pretrained( + self.model_args.config_name if self.model_args.config_name else self.model_args.model_name_or_path, + num_labels=num_labels, + cache_dir=self.model_args.cache_dir, + token=self.model_args.token, + trust_remote_code=self.model_args.trust_remote_code, + ) + logger.info('Config: %s', config) + + model = BiDecoderOnlyEmbedderModel( + base_model, + tokenizer=tokenizer, + negatives_cross_device=self.training_args.negatives_cross_device, + temperature=self.training_args.temperature, + sub_batch_size=self.training_args.sub_batch_size, + kd_loss_type=self.training_args.kd_loss_type, + sentence_pooling_method=self.training_args.sentence_pooling_method, + normalize_embeddings=self.training_args.normalize_embeddings + ) + + if self.training_args.gradient_checkpointing: + model.enable_input_require_grads() + + if self.training_args.fix_position_embedding: + for k, v in model.named_parameters(): + if "position_embeddings" in k: + logging.info(f"Freeze the parameters for {k}") + v.requires_grad = False + return tokenizer, model
+ + +
+[docs] + def load_trainer(self) -> DecoderOnlyEmbedderTrainer: + """Load the trainer. + + Returns: + DecoderOnlyEmbedderTrainer: Loaded trainer instance. + """ + trainer = DecoderOnlyEmbedderTrainer( + model=self.model, + args=self.training_args, + train_dataset=self.train_dataset, + data_collator=self.data_collator, + tokenizer=self.tokenizer + ) + if self.data_args.same_dataset_within_batch: + trainer.add_callback(EmbedderTrainerCallbackForDataRefresh(self.train_dataset)) + return trainer
+ + +
+[docs] + def run(self): + """ + Run the finetune. + """ + Path(self.training_args.output_dir).mkdir(parents=True, exist_ok=True) + + # Training + self.trainer.train(resume_from_checkpoint=self.training_args.resume_from_checkpoint) + self.trainer.save_model() + + # save merged model + if self.model_args.save_merged_lora_model and self.training_args.process_index == 0: + save_merged_model(self.model_args, self.training_args.output_dir)
+
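Wiring the runner up follows the usual three-dataclass pattern; the library's own entry points do essentially this. A hedged sketch of a launch script: the import paths are inferred from the module layout shown in this diff, and the required fields of the data and training argument classes depend on the version, so the CLI flags passed at launch time must match them:

from transformers import HfArgumentParser

from FlagEmbedding.abc.finetune.embedder.AbsArguments import (
    AbsEmbedderDataArguments,
    AbsEmbedderTrainingArguments,
)
from FlagEmbedding.finetune.embedder.decoder_only.base.arguments import DecoderOnlyEmbedderModelArguments
from FlagEmbedding.finetune.embedder.decoder_only.base.runner import DecoderOnlyEmbedderRunner

if __name__ == "__main__":
    parser = HfArgumentParser((
        DecoderOnlyEmbedderModelArguments,
        AbsEmbedderDataArguments,
        AbsEmbedderTrainingArguments,
    ))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    runner = DecoderOnlyEmbedderRunner(
        model_args=model_args,
        data_args=data_args,
        training_args=training_args,
    )
    runner.run()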
\ No newline at end of file
diff --git a/_modules/FlagEmbedding/finetune/embedder/decoder_only/base/trainer.html b/_modules/FlagEmbedding/finetune/embedder/decoder_only/base/trainer.html
new file mode 100644
index 00000000..e5c1ef57
--- /dev/null
+++ b/_modules/FlagEmbedding/finetune/embedder/decoder_only/base/trainer.html
@@ -0,0 +1,534 @@

Source code for FlagEmbedding.finetune.embedder.decoder_only.base.trainer

+import os
+import torch
+import logging
+from typing import Optional
+
+from FlagEmbedding.abc.finetune.embedder import AbsEmbedderTrainer
+
+logger = logging.getLogger(__name__)
+
+
+
+[docs] +class DecoderOnlyEmbedderTrainer(AbsEmbedderTrainer): + """ + Trainer class for the decoder-only base embedder model. + """ + def _save(self, output_dir: Optional[str] = None, state_dict=None): + """Save the model to directory. + + Args: + output_dir (Optional[str], optional): Output directory to save the model. Defaults to ``None``. + + Raises: + NotImplementedError + """ + output_dir = output_dir if output_dir is not None else self.args.output_dir + os.makedirs(output_dir, exist_ok=True) + logger.info("Saving model checkpoint to %s", output_dir) + # Save a trained model and configuration using `save_pretrained()`. + # They can then be reloaded using `from_pretrained()` + if not hasattr(self.model, 'save'): + raise NotImplementedError( + f'MODEL {self.model.__class__.__name__} ' + f'does not support save interface') + else: + self.model.save(output_dir) + + if self.tokenizer is not None and self.is_world_process_zero(): + self.tokenizer.save_pretrained(output_dir) + + torch.save(self.args, os.path.join(output_dir, "training_args.bin"))
+ + + # save the checkpoint for sentence-transformers library + # if self.is_world_process_zero(): + # save_ckpt_for_sentence_transformers(output_dir, + # pooling_mode=self.args.sentence_pooling_method, + # normlized=self.args.normlized) +
\ No newline at end of file
diff --git a/_modules/FlagEmbedding/finetune/embedder/decoder_only/icl/arguments.html b/_modules/FlagEmbedding/finetune/embedder/decoder_only/icl/arguments.html
new file mode 100644
index 00000000..bea23260
--- /dev/null
+++ b/_modules/FlagEmbedding/finetune/embedder/decoder_only/icl/arguments.html
@@ -0,0 +1,589 @@

Source code for FlagEmbedding.finetune.embedder.decoder_only.icl.arguments

+from typing import Optional, List
+from dataclasses import dataclass, field
+
+from FlagEmbedding.abc.finetune.embedder import (
+    AbsEmbedderModelArguments,
+    AbsEmbedderDataArguments,
+)
+
+
+def default_target_modules() -> List[int]:
+    return ['v_proj', 'q_proj', 'k_proj', 'gate_proj', 'down_proj', 'o_proj', 'up_proj']
+
+
+
+[docs] +@dataclass +class DecoderOnlyEmbedderICLModelArguments(AbsEmbedderModelArguments): + """ + Model argument class for decoder only icl model. + """ + peft_model_path: str = field( + default='', metadata={"help": "The peft model checkpoint for initialization."} + ) + use_lora: bool = field( + default=True, + metadata={"help": "If passed, will use LORA (low-rank parameter-efficient training) to train the model."} + ) + lora_rank: int = field( + default=64, + metadata={"help": "The rank of lora."} + ) + lora_alpha: float = field( + default=16, + metadata={"help": "The alpha parameter of lora."} + ) + lora_dropout: float = field( + default=0.1, + metadata={"help": "The dropout rate of lora modules."} + ) + target_modules: List[str] = field( + default_factory=default_target_modules, + metadata={"help": "The target modules to apply LORA."} + ) + use_flash_attn: bool = field( + default=False, + metadata={"help": "If passed, will use flash attention to train the model."} + ) + use_slow_tokenizer: bool = field( + default=False, + metadata={"help": "If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library)."} + ) + # low_cpu_mem_usage: bool = field( + # default=False, + # metadata={"help": "It is an option to create the model as an empty shell," + # "then only materialize its parameters when the pretrained weights are loaded." + # "If passed, LLM loading time and RAM consumption will be benefited."} + # ) + from_peft: str = field( + default=None + ) + modules_to_save: str = field( + default=None, + ) + raw_peft: str = field( + default=None + ) + + additional_special_tokens: Optional[str] = field( + default=None, + metadata={"help": "additional special tokens", "nargs": "+"} + ) + + save_merged_lora_model: bool = field( + default=False, + metadata={"help": "If passed, will merge the lora modules and save the entire model."} + )
+ + + +
+[docs] +@dataclass +class DecoderOnlyEmbedderICLDataArguments(AbsEmbedderDataArguments): + """ + Data argument class for decoder only icl model. + """ + example_query_max_len: int = field( + default=64, + metadata={"help": "The max length of example query."} + ) + example_passage_max_len: int = field( + default=96, + metadata={"help": "The max length of example passage."} + ) + retrieval_use_examples: bool = field( + default=True, + metadata={"help": "If passed, will use examples for retrieval."} + ) + icl_suffix_str: str = field( + default='\nResponse:', + metadata={"help": "The suffix string for ICL dataset."} + )
+ +
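The two example-length fields cap how much of an in-context demonstration survives truncation, and icl_suffix_str is the glue between a demonstration query and its passage; the dataset later in this diff assembles demonstrations exactly this way. A small hedged illustration (the tokenizer checkpoint and the texts are placeholders, any causal-LM tokenizer works as a stand-in):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder tokenizer

example_query_max_len = 64        # defaults from the dataclass above
example_passage_max_len = 96
icl_suffix_str = "\nResponse:"

query = "What is dense retrieval?"
passage = "Dense retrieval encodes queries and documents as vectors and ranks by similarity."

# Truncate each side to its token budget, then join with the suffix,
# mirroring how icl_pairs are built in the dataset that follows.
q_ids = tokenizer(query, add_special_tokens=False)["input_ids"][:example_query_max_len]
p_ids = tokenizer(passage, add_special_tokens=False)["input_ids"][:example_passage_max_len]
demonstration = icl_suffix_str.join([tokenizer.decode(q_ids), tokenizer.decode(p_ids)])
print(demonstration)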
\ No newline at end of file
diff --git a/_modules/FlagEmbedding/finetune/embedder/decoder_only/icl/dataset.html b/_modules/FlagEmbedding/finetune/embedder/decoder_only/icl/dataset.html
new file mode 100644
index 00000000..35d18b12
--- /dev/null
+++ b/_modules/FlagEmbedding/finetune/embedder/decoder_only/icl/dataset.html
@@ -0,0 +1,785 @@

Source code for FlagEmbedding.finetune.embedder.decoder_only.icl.dataset

+import math
+import random
+import logging
+from dataclasses import dataclass
+from transformers import (
+    PreTrainedTokenizer, 
+    DataCollatorWithPadding,
+)
+
+from FlagEmbedding.abc.finetune.embedder import AbsEmbedderSameDatasetTrainDataset
+
+from .arguments import DecoderOnlyEmbedderICLDataArguments
+
+logger = logging.getLogger(__name__)
+
+
+
+[docs] +class DecoderOnlyEmbedderICLSameDatasetTrainDataset(AbsEmbedderSameDatasetTrainDataset): + """Dataset class for icl model. + + Args: + args (DecoderOnlyEmbedderICLDataArguments): Data argument class for icl model. + default_batch_size (int): The default batch size. + seed (int): Random seed to use. + tokenizer (PreTrainedTokenizer): Tokenizer. + process_index (int, optional): Current process index. Defaults to 0. + num_processes (int, optional): Total number of processes. Defaults to 1. + """ + def __init__( + self, + args: DecoderOnlyEmbedderICLDataArguments, + default_batch_size: int, + seed: int, + tokenizer: PreTrainedTokenizer, + process_index: int=0, + num_processes: int=1 + ): + super().__init__( + args=args, + default_batch_size=default_batch_size, + seed=seed, + tokenizer=tokenizer, + process_index=process_index, + num_processes=num_processes + ) + self.args: DecoderOnlyEmbedderICLDataArguments + + self.suffix = self.tokenizer(f"{self.args.icl_suffix_str}{self.tokenizer.eos_token}", add_special_tokens=False)['input_ids'] + + self.prefix = self.tokenizer(f"{self.tokenizer.bos_token}", add_special_tokens=False)['input_ids'] +
+[docs] + def _create_batch_data(self, batch_raw_data): + """Create a comple batch of data with queries, documents and teacher scores. + + Args: + batch_raw_data (datasets.Dataset): One batch of raw data. + + Returns: + List[str]: Queries with instruction format. + List[str]: Documents with instruction format. + List[float]: Teacher scores for model distillation. + """ + queries, passages, teacher_scores = [], [], [] + + train_group_size, data_type = self._get_train_group_size(batch_raw_data) + + icl_pairs = [] + + for i in range(len(batch_raw_data['query'])): + if data_type is not None: + assert batch_raw_data['type'][i] == data_type, f"Data type is not consistent in the same batch" + + queries.append( + self.args.query_instruction_format.format( + batch_raw_data['prompt'][i] if 'prompt' in batch_raw_data else self.args.query_instruction_for_retrieval, + batch_raw_data['query'][i] + ) + ) + tmp_passages = [] + pos_idx = random.choice(list(range(len(batch_raw_data['pos'][i])))) + pos = self._shuffle_text(batch_raw_data['pos'][i][pos_idx]) + tmp_passages.append(pos) + + neg_all_idx = list(range(len(batch_raw_data['neg'][i]))) + if len(batch_raw_data['neg'][i]) < train_group_size - 1: + num = math.ceil((train_group_size - 1) / len(batch_raw_data['neg'][i])) + neg_idxs = random.sample(neg_all_idx * num, train_group_size - 1) + else: + neg_idxs = random.sample(neg_all_idx, train_group_size - 1) + for neg_idx in neg_idxs: + tmp_passages.append(batch_raw_data['neg'][i][neg_idx]) + + if self.args.knowledge_distillation: + if 'pos_scores' in batch_raw_data and batch_raw_data['pos_scores'][i] is not None: + teacher_scores.append(batch_raw_data['pos_scores'][i][pos_idx]) + for neg_idx in neg_idxs: + if 'neg_scores' in batch_raw_data and batch_raw_data['neg_scores'][i] is not None: + teacher_scores.append(batch_raw_data['neg_scores'][i][neg_idx]) + else: + teacher_scores = None + + if data_type is not None and data_type in ['symmetric_sts', 'symmetric_clustering']: + tmp_passages = [ + self.args.query_instruction_format.format( + batch_raw_data['prompt'][i] if 'prompt' in batch_raw_data else self.args.query_instruction_for_retrieval, + p + ) for p in tmp_passages + ] + tmp_passages = self.tokenizer.batch_decode( + self.tokenizer( + tmp_passages, + max_length=self.args.passage_max_len - 1 - len(self.suffix), + truncation=True, + add_special_tokens=False, + )['input_ids'] + ) + for j in range(len(tmp_passages)): + tmp_passages[j] += self.args.icl_suffix_str + else: + if self.args.passage_instruction_for_retrieval is not None: + tmp_passages = [ + self.args.passage_instruction_format.format( + self.args.passage_instruction_for_retrieval, p + ) for p in tmp_passages + ] + + passages.extend(tmp_passages) + + if len(teacher_scores) > 0 and len(passages) > 0: + assert len(teacher_scores) == len(passages) + + # add icl pairs + if self.args.retrieval_use_examples or ( + data_type in ['symmetric_sts', 'symmetric_clustering', 'symmetric_class'] + ): + if data_type == 'symmetric_clustering': + icl_pairs.append(( + self.tokenizer.decode( + self.tokenizer( + queries[-1], + add_special_tokens=False + )['input_ids'][:self.args.example_query_max_len] + ), + self.tokenizer.decode( + self.tokenizer( + batch_raw_data['category'][i], # use category as example + add_special_tokens=False + )['input_ids'][:self.args.example_passage_max_len] + ) + )) + else: + icl_pairs.append(( + self.tokenizer.decode( + self.tokenizer( + queries[-1], + add_special_tokens=False + )['input_ids'][:self.args.example_query_max_len] + ), + 
self.tokenizer.decode( + self.tokenizer( + pos, + add_special_tokens=False + )['input_ids'][:self.args.example_passage_max_len] + ) + )) + else: + icl_pairs = [] + + # handle queries + for i in range(len(queries)): + choices = random.choice([0, 1, 2, 3, 4, 5]) + if choices > 0 and len(icl_pairs) > 0: + prefix_ids = random.sample(list(range(len(icl_pairs))), min(choices + 1, len(icl_pairs))) + if i in prefix_ids: + prefix_ids.remove(i) + prefix_ids = prefix_ids[:choices] + + prefix = '' + for idx in prefix_ids: + tmp = prefix + self.args.icl_suffix_str.join(icl_pairs[idx]) + '\n\n' + if len(self.tokenizer(tmp)['input_ids']) > self.args.query_max_len - 512: + break + prefix = tmp + else: + prefix = '' + + queries[i] = prefix + queries[i] + queries[i] = self.tokenizer.decode( + self.tokenizer( + queries[i], + max_length=self.args.query_max_len - len(self.prefix) - len(self.suffix), + truncation=True, + add_special_tokens=False + )['input_ids'] + ) + self.args.icl_suffix_str + + return queries, passages, teacher_scores
+
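To make the in-context format used above concrete, the following standalone sketch (toy strings only) mirrors how ``_create_batch_data`` joins each (query, passage) example with ``icl_suffix_str``, separates examples with blank lines, and finally appends the suffix to the query itself:

icl_suffix_str = "\nResponse:"

# Hypothetical (query, passage) example pairs sampled from the same batch.
icl_pairs = [
    ("what is the capital of France", "Paris is the capital and largest city of France."),
    ("who wrote Hamlet", "Hamlet is a tragedy written by William Shakespeare."),
]

prefix = ""
for pair in icl_pairs:
    # each example becomes "<query>\nResponse:<passage>"; examples are separated by blank lines
    prefix = prefix + icl_suffix_str.join(pair) + "\n\n"

query = prefix + "who painted the Mona Lisa" + icl_suffix_str
print(query)

In the library code the prefix is additionally capped against ``query_max_len - 512`` and the final query is re-truncated through the tokenizer before the suffix is appended.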
+ + + +
+[docs] +@dataclass +class AbsEmbedderSameDatasetCollator(DataCollatorWithPadding): + """ + EmbedCollator for SameDataset. + Note that after using this collator, the training_args should be set as: + + ``training_args.per_device_train_batch_size = 1`` + + ``training_args.dataloader_num_workers = 0 # avoid multi-processing`` + """ + query_max_len: int = 32 + passage_max_len: int = 128 + sub_batch_size: int = -1 + + def __call__(self, features): + queries = features[0][0] + passages = features[0][1] + teacher_scores = features[0][2] + no_in_batch_neg_flag = features[0][3] + + queries_inputs = self.tokenizer( + queries, + truncation=True, + max_length=self.query_max_len, + return_tensors=None + ) + passages_inputs = self.tokenizer( + passages, + truncation=True, + max_length=self.passage_max_len, + return_tensors=None + ) + + if self.sub_batch_size is None or self.sub_batch_size <= 0: + q_collated = self.tokenizer.pad( + queries_inputs, + padding=self.padding, + max_length=self.query_max_len, + pad_to_multiple_of=self.pad_to_multiple_of, + return_tensors=self.return_tensors, + ) + + d_collated = self.tokenizer.pad( + passages_inputs, + padding=self.padding, + max_length=self.passage_max_len, + pad_to_multiple_of=self.pad_to_multiple_of, + return_tensors=self.return_tensors, + ) + else: + batch_size = self.sub_batch_size + + q_collated = [] + for i in range(0, len(queries_inputs['attention_mask']), batch_size): + start = i + end = min(len(queries_inputs['attention_mask']), i + batch_size) + sub_features = {} + for k, v in queries_inputs.items(): + sub_features[k] = v[start:end] + q_collated.append(self.tokenizer.pad( + sub_features, + padding=self.padding, + max_length=self.passage_max_len, + pad_to_multiple_of=self.pad_to_multiple_of, + return_tensors=self.return_tensors, + )) + + d_collated = [] + for i in range(0, len(passages_inputs['attention_mask']), batch_size): + start = i + end = min(len(passages_inputs['attention_mask']), i + batch_size) + sub_features = {} + + for k, v in passages_inputs.items(): + sub_features[k] = v[start:end] + d_collated.append(self.tokenizer.pad( + sub_features, + padding=self.padding, + max_length=self.passage_max_len, + pad_to_multiple_of=self.pad_to_multiple_of, + return_tensors=self.return_tensors, + )) + + if isinstance(teacher_scores, list) and len(teacher_scores) == 0: + teacher_scores = None + + return { + "queries": q_collated, + "passages": d_collated, + "teacher_scores": teacher_scores, + "no_in_batch_neg_flag": no_in_batch_neg_flag + }
+ +
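A hedged usage sketch for the collator above. The checkpoint name and the import path are assumptions; the single feature tuple mimics what the same-dataset train dataset yields, namely ``(queries, passages, teacher_scores, no_in_batch_neg_flag)``:

from transformers import AutoTokenizer
from FlagEmbedding.finetune.embedder.decoder_only.icl.dataset import AbsEmbedderSameDatasetCollator  # assumed path

tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-en-icl")   # placeholder checkpoint
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token                  # mirror the runner's pad-token fallback

collator = AbsEmbedderSameDatasetCollator(
    tokenizer=tokenizer,
    query_max_len=512,
    passage_max_len=512,
    sub_batch_size=-1,   # -1: pad queries and passages as single batches
)

features = [(
    ["what is the capital of France\nResponse:"],
    ["Paris is the capital of France.", "Berlin is the capital of Germany."],
    None,    # no teacher scores
    False,   # use in-batch negatives
)]
batch = collator(features)
print(batch["queries"]["input_ids"].shape, batch["passages"]["input_ids"].shape)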
\ No newline at end of file diff --git a/_modules/FlagEmbedding/finetune/embedder/decoder_only/icl/modeling.html b/_modules/FlagEmbedding/finetune/embedder/decoder_only/icl/modeling.html new file mode 100644 index 00000000..0f87d106 --- /dev/null +++ b/_modules/FlagEmbedding/finetune/embedder/decoder_only/icl/modeling.html @@ -0,0 +1,705 @@

Source code for FlagEmbedding.finetune.embedder.decoder_only.icl.modeling

+import logging
+
+import torch
+from transformers import AutoModel, AutoTokenizer
+
+from FlagEmbedding.abc.finetune.embedder import AbsEmbedderModel
+
+logger = logging.getLogger(__name__)
+
+
+
+[docs] +class BiDecoderOnlyEmbedderICLModel(AbsEmbedderModel): + """Embedder model class for decoder only model. + + Args: + base_model (AutoModel): The base model to train on. + tokenizer (AutoTokenizer, optional): The tokenizer to use. Defaults to ``None``. + negatives_cross_device (bool, optional): If True, will compute cross devices negative loss. Defaults to ``False``. + temperature (float, optional): Temperature to control the scale of scores. Defaults to ``1.0``. + sub_batch_size (int, optional): Sub-batch size during encoding. If negative, will not split to sub-batch. + Defaults to ``-1``. + kd_loss_type (str, optional): Type of knowledge distillation loss. Defaults to ``'kl_div'``. + sentence_pooling_method (str, optional): Pooling method to get sentence embedding. Defaults to ``'last_token'``. + normalize_embeddings (bool, optional): If True, normalize the embedding vector. Defaults to ``False``. + """ + TRANSFORMER_CLS = AutoModel + + def __init__( + self, + base_model: AutoModel, + tokenizer: AutoTokenizer = None, + negatives_cross_device: bool = False, + temperature: float = 1.0, + sub_batch_size: int = -1, + kd_loss_type: str = 'kl_div', + sentence_pooling_method: str = 'last_token', + normalize_embeddings: bool = False, + ): + super().__init__( + base_model, + tokenizer=tokenizer, + negatives_cross_device=negatives_cross_device, + temperature=temperature, + sub_batch_size=sub_batch_size, + kd_loss_type=kd_loss_type, + ) + self.sentence_pooling_method = sentence_pooling_method + self.normalize_embeddings = normalize_embeddings + self.cross_entropy = torch.nn.CrossEntropyLoss(reduction='mean') + +
+[docs] + def encode(self, features): + """ + Encode and get the embedding. + + Args: + features (Union[list, dict]): Features feed to the model. + + Returns: + torch.Tensor: The embedding vectors. + """ + if features is None: + return None + if not isinstance(features, list): + if self.sub_batch_size is not None and self.sub_batch_size > 0: + all_p_reps = [] + for i in range(0, len(features['attention_mask']), self.sub_batch_size): + end_inx = min(i + self.sub_batch_size, len(features['attention_mask'])) + sub_features = {} + for k, v in features.items(): + sub_features[k] = v[i:end_inx] + last_hidden_state = self.model(**sub_features, return_dict=True).last_hidden_state + p_reps = self._sentence_embedding(last_hidden_state, sub_features['attention_mask']) + all_p_reps.append(p_reps) + all_p_reps = torch.cat(all_p_reps, 0).contiguous() + if self.normalize_embeddings: + all_p_reps = torch.nn.functional.normalize(all_p_reps, dim=-1) + return all_p_reps.contiguous() + else: + last_hidden_state = self.model(**features, return_dict=True).last_hidden_state + all_p_reps = self._sentence_embedding(last_hidden_state, features['attention_mask']) + if self.normalize_embeddings: + all_p_reps = torch.nn.functional.normalize(all_p_reps, dim=-1) + return all_p_reps.contiguous() + else: + all_p_reps = [] + for sub_features in features: + last_hidden_state = self.model(**sub_features, return_dict=True).last_hidden_state + p_reps = self._sentence_embedding(last_hidden_state, sub_features['attention_mask']) + all_p_reps.append(p_reps) + all_p_reps = torch.cat(all_p_reps, 0).contiguous() + if self.normalize_embeddings: + all_p_reps = torch.nn.functional.normalize(all_p_reps, dim=-1) + return all_p_reps.contiguous()
+ + +
+[docs] + def _sentence_embedding(self, last_hidden_state, attention_mask): + """Use the pooling method to get the sentence embedding. + + Args: + last_hidden_state (torch.Tensor): The model output's last hidden state. + attention_mask (torch.Tensor): Mask out padding tokens during pooling. + + Raises: + NotImplementedError: Specified pooling method not implemented. + + Returns: + torch.Tensor: The sentence embeddings. + """ + if self.sentence_pooling_method == "cls": + return last_hidden_state[:, 0] + elif self.sentence_pooling_method == "mean": + s = torch.sum( + last_hidden_state * attention_mask.unsqueeze(-1).float(), dim=1 + ) + d = attention_mask.sum(dim=1, keepdim=True).float() + return s / d + elif self.sentence_pooling_method == "last_token": + left_padding = attention_mask[:, -1].sum() == attention_mask.shape[0] + if left_padding: + return last_hidden_state[:, -1] + else: + sequence_lengths = attention_mask.sum(dim=1) - 1 + batch_size = last_hidden_state.shape[0] + return last_hidden_state[ + torch.arange(batch_size, device=last_hidden_state.device), + sequence_lengths, + ] + else: + raise NotImplementedError(f"pooling method {self.sentence_pooling_method} not implemented")
+ + +
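The ``last_token`` branch above picks the hidden state of the final non-padding token (or simply the last position when the whole batch is left-padded). A standalone sketch of the same pooling on toy tensors:

import torch

torch.manual_seed(0)
batch_size, seq_len, hidden = 2, 5, 4
last_hidden_state = torch.randn(batch_size, seq_len, hidden)

# Right-padded batch: the second sequence ends with two padding positions.
attention_mask = torch.tensor([
    [1, 1, 1, 1, 1],
    [1, 1, 1, 0, 0],
])

left_padding = attention_mask[:, -1].sum() == attention_mask.shape[0]
if left_padding:
    emb = last_hidden_state[:, -1]
else:
    sequence_lengths = attention_mask.sum(dim=1) - 1   # index of the last real token
    emb = last_hidden_state[torch.arange(batch_size), sequence_lengths]

print(emb.shape)   # torch.Size([2, 4])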
+[docs] + def compute_score(self, q_reps, p_reps): + """Computes the scores between query and passage representations. + + Args: + q_reps (torch.Tensor): Query representations. + p_reps (torch.Tensor): Passage representations. + + Returns: + torch.Tensor: The computed scores, adjusted by temperature. + """ + scores = self._compute_similarity(q_reps, p_reps) / self.temperature + scores = scores.view(q_reps.size(0), -1) + return scores
+ + +
+[docs] + def _compute_similarity(self, q_reps, p_reps): + """Computes the similarity between query and passage representations using inner product. + + Args: + q_reps (torch.Tensor): Query representations. + p_reps (torch.Tensor): Passage representations. + + Returns: + torch.Tensor: The computed similarity matrix. + """ + if len(p_reps.size()) == 2: + return torch.matmul(q_reps, p_reps.transpose(0, 1)) + return torch.matmul(q_reps, p_reps.transpose(-2, -1))
+ + +
+[docs] + def compute_loss(self, scores, target): + """Compute the loss using cross entropy. + + Args: + scores (torch.Tensor): Computed score. + target (torch.Tensor): The target value. + + Returns: + torch.Tensor: The computed cross entropy loss. + """ + return self.cross_entropy(scores, target)
+ + +
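``compute_loss`` only receives a score matrix and integer targets; the target construction happens in the shared base class and is not shown on this page. A common construction for in-batch negatives, sketched here purely for illustration: with ``batch_size`` queries and ``group_size`` passages per query (one positive followed by negatives), the positive of query ``i`` sits at column ``i * group_size``.

import torch

batch_size, group_size, dim = 4, 3, 8
q_reps = torch.randn(batch_size, dim)
p_reps = torch.randn(batch_size * group_size, dim)

temperature = 0.02
scores = (q_reps @ p_reps.T) / temperature        # (batch_size, batch_size * group_size)
target = torch.arange(batch_size) * group_size    # column of each query's positive passage

loss = torch.nn.functional.cross_entropy(scores, target)
print(loss.item())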
+[docs] + def gradient_checkpointing_enable(self, **kwargs): + """ + Activates gradient checkpointing for the current model. + """ + self.model.gradient_checkpointing_enable(**kwargs)
+ + +
+[docs] + def enable_input_require_grads(self, **kwargs): + """ + Enables the gradients for the input embeddings. + """ + self.model.enable_input_require_grads(**kwargs)
+ + +
+[docs] + def save(self, output_dir: str): + """Save the model to the directory. + + Args: + output_dir (str): Directory for saving the model. + """ + state_dict = self.model.state_dict() + state_dict = type(state_dict)( + {k: v.clone().cpu() + for k, + v in state_dict.items()}) + self.model.save_pretrained(output_dir, state_dict=state_dict)
+
+ +
\ No newline at end of file diff --git a/_modules/FlagEmbedding/finetune/embedder/decoder_only/icl/runner.html b/_modules/FlagEmbedding/finetune/embedder/decoder_only/icl/runner.html new file mode 100644 index 00000000..0f1fe874 --- /dev/null +++ b/_modules/FlagEmbedding/finetune/embedder/decoder_only/icl/runner.html @@ -0,0 +1,656 @@

Source code for FlagEmbedding.finetune.embedder.decoder_only.icl.runner

+import logging
+from typing import Tuple
+from pathlib import Path
+from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizer
+
+from FlagEmbedding.abc.finetune.embedder.AbsArguments import AbsEmbedderTrainingArguments
+from FlagEmbedding.abc.finetune.embedder import AbsEmbedderRunner, AbsEmbedderModel, EmbedderTrainerCallbackForDataRefresh
+
+from .arguments import DecoderOnlyEmbedderICLModelArguments, DecoderOnlyEmbedderICLDataArguments
+from .trainer import DecoderOnlyEmbedderICLTrainer
+from .modeling import BiDecoderOnlyEmbedderICLModel
+from .dataset import DecoderOnlyEmbedderICLSameDatasetTrainDataset
+from .load_model import get_model, save_merged_model
+
+logger = logging.getLogger(__name__)
+
+
+
+[docs] +class DecoderOnlyEmbedderICLRunner(AbsEmbedderRunner): + """Runner class for decoder only icl model. + + Args: + model_args (DecoderOnlyEmbedderICLModelArguments): Model arguments instance. + data_args (DecoderOnlyEmbedderICLDataArguments): Data arguments instance. + training_args (AbsEmbedderTrainingArguments): Trainer arguments. + """ + def __init__( + self, + model_args: DecoderOnlyEmbedderICLModelArguments, + data_args: DecoderOnlyEmbedderICLDataArguments, + training_args: AbsEmbedderTrainingArguments + ): + super().__init__(model_args, data_args, training_args) + self.model_args: DecoderOnlyEmbedderICLModelArguments + self.data_args: DecoderOnlyEmbedderICLDataArguments + self.training_args: AbsEmbedderTrainingArguments + +
+[docs] + def load_tokenizer_and_model(self) -> Tuple[PreTrainedTokenizer, AbsEmbedderModel]: + """Load tokenizer and model. + + Returns: + Tuple[PreTrainedTokenizer, AbsEmbedderModel]: Tokenizer and model instances. + """ + tokenizer = AutoTokenizer.from_pretrained( + self.model_args.tokenizer_name if self.model_args.tokenizer_name else self.model_args.model_name_or_path, + token=self.model_args.token, + cache_dir=self.model_args.cache_dir, + use_fast=False, + add_eos_token=True + ) + + if tokenizer.pad_token is None: + if tokenizer.unk_token is not None: + tokenizer.pad_token = tokenizer.unk_token + tokenizer.pad_token_id = tokenizer.unk_token_id + else: + tokenizer.pad_token = tokenizer.eos_token + tokenizer.pad_token_id = tokenizer.eos_token_id + tokenizer.padding_side = 'left' + + resize = False + if self.model_args.additional_special_tokens is not None: + special_tokens_dict = {'additional_special_tokens': self.model_args.additional_special_tokens} + add_num = tokenizer.add_special_tokens(special_tokens_dict) + if add_num > 0: + resize = True + logger.info(f"Add {add_num} special tokens to the tokenizer. Special tokens: {self.model_args.additional_special_tokens}") + else: + logger.warning(f"Special tokens {self.model_args.additional_special_tokens} already exists in the tokenizer.") + base_model = get_model(self.model_args, self.training_args.output_dir, resize, len(tokenizer)) + + num_labels = 1 + config = AutoConfig.from_pretrained( + self.model_args.config_name if self.model_args.config_name else self.model_args.model_name_or_path, + num_labels=num_labels, + cache_dir=self.model_args.cache_dir, + token=self.model_args.token, + trust_remote_code=self.model_args.trust_remote_code, + ) + logger.info('Config: %s', config) + + model = BiDecoderOnlyEmbedderICLModel( + base_model, + tokenizer=tokenizer, + negatives_cross_device=self.training_args.negatives_cross_device, + temperature=self.training_args.temperature, + sub_batch_size=self.training_args.sub_batch_size, + kd_loss_type=self.training_args.kd_loss_type, + sentence_pooling_method=self.training_args.sentence_pooling_method, + normalize_embeddings=self.training_args.normalize_embeddings + ) + + if self.training_args.gradient_checkpointing: + model.enable_input_require_grads() + + if self.training_args.fix_position_embedding: + for k, v in model.named_parameters(): + if "position_embeddings" in k: + logging.info(f"Freeze the parameters for {k}") + v.requires_grad = False + return tokenizer, model
+ + +
+[docs] + def load_trainer(self) -> DecoderOnlyEmbedderICLTrainer: + """Load the trainer. + + Returns: + DecoderOnlyEmbedderICLTrainer: Loaded trainer instance. + """ + trainer = DecoderOnlyEmbedderICLTrainer( + model=self.model, + args=self.training_args, + train_dataset=self.train_dataset, + data_collator=self.data_collator, + tokenizer=self.tokenizer + ) + if self.data_args.same_dataset_within_batch: + trainer.add_callback(EmbedderTrainerCallbackForDataRefresh(self.train_dataset)) + return trainer
+ + +
+[docs] + def load_train_dataset(self) -> DecoderOnlyEmbedderICLSameDatasetTrainDataset: + """Load the dataset instance for training. + + Raises: + NotImplementedError: Only support `same_dataset_within_batch` for `DecoderOnlyEmbedderICLRunner`. + + Returns: + DecoderOnlyEmbedderICLSameDatasetTrainDataset: The dataset instance. + """ + if self.data_args.same_dataset_within_batch: + train_dataset = DecoderOnlyEmbedderICLSameDatasetTrainDataset( + args=self.data_args, + default_batch_size=self.training_args.per_device_train_batch_size, + seed=self.training_args.seed, + tokenizer=self.tokenizer, + process_index=self.training_args.process_index, + num_processes=self.training_args.world_size + ) + self.training_args.per_device_train_batch_size = 1 + self.training_args.dataloader_num_workers = 0 # avoid multi-processing + else: + raise NotImplementedError("Only support `same_dataset_within_batch` for `DecoderOnlyEmbedderICLRunner`.") + return train_dataset
+ + +
+[docs] + def run(self): + """ + Run the finetune. + """ + Path(self.training_args.output_dir).mkdir(parents=True, exist_ok=True) + + # Training + self.trainer.train(resume_from_checkpoint=self.training_args.resume_from_checkpoint) + self.trainer.save_model() + + # save merged model + if self.model_args.save_merged_lora_model and self.training_args.process_index == 0: + save_merged_model(self.model_args, self.training_args.output_dir)
+
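A hedged sketch of driving this runner directly from Python; in practice the module is usually launched with ``torchrun``, and required fields such as ``model_name_or_path`` and ``train_data`` are assumed to be supplied on the command line:

from transformers import HfArgumentParser

from FlagEmbedding.abc.finetune.embedder.AbsArguments import AbsEmbedderTrainingArguments
from FlagEmbedding.finetune.embedder.decoder_only.icl.arguments import (
    DecoderOnlyEmbedderICLModelArguments,
    DecoderOnlyEmbedderICLDataArguments,
)
from FlagEmbedding.finetune.embedder.decoder_only.icl.runner import DecoderOnlyEmbedderICLRunner


def main():
    parser = HfArgumentParser((
        DecoderOnlyEmbedderICLModelArguments,
        DecoderOnlyEmbedderICLDataArguments,
        AbsEmbedderTrainingArguments,
    ))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    runner = DecoderOnlyEmbedderICLRunner(
        model_args=model_args,
        data_args=data_args,
        training_args=training_args,
    )
    runner.run()


if __name__ == "__main__":
    main()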
+ +
\ No newline at end of file diff --git a/_modules/FlagEmbedding/finetune/embedder/decoder_only/icl/trainer.html b/_modules/FlagEmbedding/finetune/embedder/decoder_only/icl/trainer.html new file mode 100644 index 00000000..b352be40 --- /dev/null +++ b/_modules/FlagEmbedding/finetune/embedder/decoder_only/icl/trainer.html @@ -0,0 +1,534 @@

Source code for FlagEmbedding.finetune.embedder.decoder_only.icl.trainer

+import os
+import torch
+import logging
+from typing import Optional
+
+from FlagEmbedding.abc.finetune.embedder import AbsEmbedderTrainer
+
+logger = logging.getLogger(__name__)
+
+
+
+[docs] +class DecoderOnlyEmbedderICLTrainer(AbsEmbedderTrainer): + """ + Trainer class for decoder only icl model. + """ + def _save(self, output_dir: Optional[str] = None, state_dict=None): + """Save the model to directory. + + Args: + output_dir (Optional[str], optional): Output directory to save the model. Defaults to ``None``. + + Raises: + NotImplementedError + """ + output_dir = output_dir if output_dir is not None else self.args.output_dir + os.makedirs(output_dir, exist_ok=True) + logger.info("Saving model checkpoint to %s", output_dir) + # Save a trained model and configuration using `save_pretrained()`. + # They can then be reloaded using `from_pretrained()` + if not hasattr(self.model, 'save'): + raise NotImplementedError( + f'MODEL {self.model.__class__.__name__} ' + f'does not support save interface') + else: + self.model.save(output_dir) + + if self.tokenizer is not None and self.is_world_process_zero(): + self.tokenizer.save_pretrained(output_dir) + + torch.save(self.args, os.path.join(output_dir, "training_args.bin"))
+ + + # save the checkpoint for sentence-transformers library + # if self.is_world_process_zero(): + # save_ckpt_for_sentence_transformers(output_dir, + # pooling_mode=self.args.sentence_pooling_method, + # normlized=self.args.normlized) +
\ No newline at end of file diff --git a/_modules/FlagEmbedding/finetune/embedder/encoder_only/base/modeling.html b/_modules/FlagEmbedding/finetune/embedder/encoder_only/base/modeling.html new file mode 100644 index 00000000..a3525074 --- /dev/null +++ b/_modules/FlagEmbedding/finetune/embedder/encoder_only/base/modeling.html @@ -0,0 +1,704 @@

Source code for FlagEmbedding.finetune.embedder.encoder_only.base.modeling

+import logging
+
+import torch
+from transformers import AutoModel, AutoTokenizer
+
+from FlagEmbedding.abc.finetune.embedder import AbsEmbedderModel
+
+logger = logging.getLogger(__name__)
+
+
+
+[docs] +class BiEncoderOnlyEmbedderModel(AbsEmbedderModel): + """Embedder class for encoder only model. + + Args: + base_model (AutoModel): The base model to train on. + tokenizer (AutoTokenizer, optional): The tokenizer to use. Defaults to ``None``. + negatives_cross_device (bool, optional): If True, will compute cross devices negative loss. Defaults to ``False``. + temperature (float, optional): Temperature to control the scale of scores. Defaults to ``1.0``. + sub_batch_size (int, optional): Sub-batch size during encoding. If negative, will not split to sub-batch. + Defaults to ``-1``. + kd_loss_type (str, optional): Type of knowledge distillation loss. Defaults to ``"kl_div"``. + sentence_pooling_method (str, optional): Pooling method to get sentence embedding. Defaults to ``'cls'``. + normalize_embeddings (bool, optional): If True, normalize the embedding vector. Defaults to ``False``. + """ + TRANSFORMER_CLS = AutoModel + + def __init__( + self, + base_model: AutoModel, + tokenizer: AutoTokenizer = None, + negatives_cross_device: bool = False, + temperature: float = 1.0, + sub_batch_size: int = -1, + kd_loss_type: str = 'kl_div', + sentence_pooling_method: str = 'cls', + normalize_embeddings: bool = False, + ): + super().__init__( + base_model, + tokenizer=tokenizer, + negatives_cross_device=negatives_cross_device, + temperature=temperature, + sub_batch_size=sub_batch_size, + kd_loss_type=kd_loss_type, + ) + self.sentence_pooling_method = sentence_pooling_method + self.normalize_embeddings = normalize_embeddings + self.cross_entropy = torch.nn.CrossEntropyLoss(reduction='mean') + +
+[docs] + def encode(self, features): + """Encode and get the embedding. + + Args: + features (Union[list, dict]): Features feed to the model. + + Returns: + torch.Tensor: The embedding vectors. + """ + if features is None: + return None + if not isinstance(features, list): + if self.sub_batch_size is not None and self.sub_batch_size > 0: + all_p_reps = [] + for i in range(0, len(features['attention_mask']), self.sub_batch_size): + end_inx = min(i + self.sub_batch_size, len(features['attention_mask'])) + sub_features = {} + for k, v in features.items(): + sub_features[k] = v[i:end_inx] + last_hidden_state = self.model(**sub_features, return_dict=True).last_hidden_state + p_reps = self._sentence_embedding(last_hidden_state, sub_features['attention_mask']) + all_p_reps.append(p_reps) + all_p_reps = torch.cat(all_p_reps, 0).contiguous() + if self.normalize_embeddings: + all_p_reps = torch.nn.functional.normalize(all_p_reps, dim=-1) + return all_p_reps.contiguous() + else: + last_hidden_state = self.model(**features, return_dict=True).last_hidden_state + all_p_reps = self._sentence_embedding(last_hidden_state, features['attention_mask']) + if self.normalize_embeddings: + all_p_reps = torch.nn.functional.normalize(all_p_reps, dim=-1) + return all_p_reps.contiguous() + else: + all_p_reps = [] + for sub_features in features: + last_hidden_state = self.model(**sub_features, return_dict=True).last_hidden_state + p_reps = self._sentence_embedding(last_hidden_state, sub_features['attention_mask']) + all_p_reps.append(p_reps) + all_p_reps = torch.cat(all_p_reps, 0).contiguous() + if self.normalize_embeddings: + all_p_reps = torch.nn.functional.normalize(all_p_reps, dim=-1) + return all_p_reps.contiguous()
+ + +
+[docs] + def _sentence_embedding(self, last_hidden_state, attention_mask): + """Use the pooling method to get the sentence embedding. + + Args: + last_hidden_state (torch.Tensor): The model output's last hidden state. + attention_mask (torch.Tensor): Mask out padding tokens during pooling. + + Raises: + NotImplementedError: Specified pooling method not implemented. + + Returns: + torch.Tensor: The sentence embeddings. + """ + if self.sentence_pooling_method == "cls": + return last_hidden_state[:, 0] + elif self.sentence_pooling_method == "mean": + s = torch.sum( + last_hidden_state * attention_mask.unsqueeze(-1).float(), dim=1 + ) + d = attention_mask.sum(dim=1, keepdim=True).float() + return s / d + elif self.sentence_pooling_method == "last_token": + left_padding = attention_mask[:, -1].sum() == attention_mask.shape[0] + if left_padding: + return last_hidden_state[:, -1] + else: + sequence_lengths = attention_mask.sum(dim=1) - 1 + batch_size = last_hidden_state.shape[0] + return last_hidden_state[ + torch.arange(batch_size, device=last_hidden_state.device), + sequence_lengths, + ] + else: + raise NotImplementedError(f"pooling method {self.sentence_pooling_method} not implemented")
+ + +
+[docs] + def compute_score(self, q_reps, p_reps): + """Computes the scores between query and passage representations. + + Args: + q_reps (torch.Tensor): Query representations. + p_reps (torch.Tensor): Passage representations. + + Returns: + torch.Tensor: The computed scores, adjusted by temperature. + """ + scores = self._compute_similarity(q_reps, p_reps) / self.temperature + scores = scores.view(q_reps.size(0), -1) + return scores
+ + +
+[docs] + def _compute_similarity(self, q_reps, p_reps): + """Computes the similarity between query and passage representations using inner product. + + Args: + q_reps (torch.Tensor): Query representations. + p_reps (torch.Tensor): Passage representations. + + Returns: + torch.Tensor: The computed similarity matrix. + """ + if len(p_reps.size()) == 2: + return torch.matmul(q_reps, p_reps.transpose(0, 1)) + return torch.matmul(q_reps, p_reps.transpose(-2, -1))
+ + +
+[docs] + def compute_loss(self, scores, target): + """Compute the loss using cross entropy. + + Args: + scores (torch.Tensor): Computed score. + target (torch.Tensor): The target value. + + Returns: + torch.Tensor: The computed cross entropy loss. + """ + return self.cross_entropy(scores, target)
+ + +
+[docs] + def gradient_checkpointing_enable(self, **kwargs): + """ + Activates gradient checkpointing for the current model. + """ + self.model.gradient_checkpointing_enable(**kwargs)
+ + +
+[docs] + def enable_input_require_grads(self, **kwargs): + """ + Enables the gradients for the input embeddings. + """ + self.model.enable_input_require_grads(**kwargs)
+ + +
+[docs] + def save(self, output_dir: str): + """Save the model to the directory. + + Args: + output_dir (str): Directory for saving the model. + """ + state_dict = self.model.state_dict() + state_dict = type(state_dict)( + {k: v.clone().cpu() + for k, + v in state_dict.items()}) + self.model.save_pretrained(output_dir, state_dict=state_dict)
+
+ +
\ No newline at end of file diff --git a/_modules/FlagEmbedding/finetune/embedder/encoder_only/base/runner.html b/_modules/FlagEmbedding/finetune/embedder/encoder_only/base/runner.html new file mode 100644 index 00000000..76fa53a5 --- /dev/null +++ b/_modules/FlagEmbedding/finetune/embedder/encoder_only/base/runner.html @@ -0,0 +1,579 @@

Source code for FlagEmbedding.finetune.embedder.encoder_only.base.runner

+import logging
+from typing import Tuple
+from transformers import (
+    AutoModel, AutoConfig,
+    AutoTokenizer, PreTrainedTokenizer
+)
+
+from FlagEmbedding.abc.finetune.embedder import AbsEmbedderRunner, AbsEmbedderModel, EmbedderTrainerCallbackForDataRefresh
+from .modeling import BiEncoderOnlyEmbedderModel
+from .trainer import EncoderOnlyEmbedderTrainer
+
+logger = logging.getLogger(__name__)
+
+
+
+[docs] +class EncoderOnlyEmbedderRunner(AbsEmbedderRunner): + """ + Finetune Runner for base embedding models. + """ +
+[docs] + def load_tokenizer_and_model(self) -> Tuple[PreTrainedTokenizer, AbsEmbedderModel]: + """Load tokenizer and model. + + Returns: + Tuple[PreTrainedTokenizer, AbsEmbedderModel]: Tokenizer and model instances. + """ + tokenizer = AutoTokenizer.from_pretrained( + self.model_args.model_name_or_path, + cache_dir=self.model_args.cache_dir, + token=self.model_args.token, + trust_remote_code=self.model_args.trust_remote_code + ) + base_model = AutoModel.from_pretrained( + self.model_args.model_name_or_path, + cache_dir=self.model_args.cache_dir, + token=self.model_args.token, + trust_remote_code=self.model_args.trust_remote_code + ) + + num_labels = 1 + config = AutoConfig.from_pretrained( + self.model_args.config_name if self.model_args.config_name else self.model_args.model_name_or_path, + num_labels=num_labels, + cache_dir=self.model_args.cache_dir, + token=self.model_args.token, + trust_remote_code=self.model_args.trust_remote_code, + ) + logger.info('Config: %s', config) + + model = BiEncoderOnlyEmbedderModel( + base_model, + tokenizer=tokenizer, + negatives_cross_device=self.training_args.negatives_cross_device, + temperature=self.training_args.temperature, + sub_batch_size=self.training_args.sub_batch_size, + kd_loss_type=self.training_args.kd_loss_type, + sentence_pooling_method=self.training_args.sentence_pooling_method, + normalize_embeddings=self.training_args.normalize_embeddings + ) + + if self.training_args.gradient_checkpointing: + model.enable_input_require_grads() + + if self.training_args.fix_position_embedding: + for k, v in model.named_parameters(): + if "position_embeddings" in k: + logging.info(f"Freeze the parameters for {k}") + v.requires_grad = False + return tokenizer, model
+ + +
+[docs] + def load_trainer(self) -> EncoderOnlyEmbedderTrainer: + """Load the trainer. + + Returns: + EncoderOnlyEmbedderTrainer: Loaded trainer instance. + """ + trainer = EncoderOnlyEmbedderTrainer( + model=self.model, + args=self.training_args, + train_dataset=self.train_dataset, + data_collator=self.data_collator, + tokenizer=self.tokenizer + ) + if self.data_args.same_dataset_within_batch: + trainer.add_callback(EmbedderTrainerCallbackForDataRefresh(self.train_dataset)) + return trainer
+
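Once the runner has finished and the checkpoint has been written, the finetuned encoder can be used like any other BGE-style embedder. A hedged inference sketch using FlagEmbedding's ``FlagModel`` wrapper; the output path and instruction string are placeholders:

from FlagEmbedding import FlagModel

model = FlagModel(
    "./output",   # directory written by the trainer (placeholder)
    query_instruction_for_retrieval="Represent this sentence for searching relevant passages: ",
    use_fp16=True,
)

queries = ["what is the capital of France"]
passages = ["Paris is the capital and largest city of France."]

q_emb = model.encode_queries(queries)
p_emb = model.encode(passages)
print(q_emb @ p_emb.T)   # inner product; with normalized embeddings this behaves like cosine similarity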
+ +
\ No newline at end of file diff --git a/_modules/FlagEmbedding/finetune/embedder/encoder_only/base/trainer.html b/_modules/FlagEmbedding/finetune/embedder/encoder_only/base/trainer.html new file mode 100644 index 00000000..11bd652e --- /dev/null +++ b/_modules/FlagEmbedding/finetune/embedder/encoder_only/base/trainer.html @@ -0,0 +1,533 @@

Source code for FlagEmbedding.finetune.embedder.encoder_only.base.trainer

+import os
+import torch
+import logging
+from typing import Optional
+
+from FlagEmbedding.abc.finetune.embedder import AbsEmbedderTrainer
+
+logger = logging.getLogger(__name__)
+
+
+
+[docs] +class EncoderOnlyEmbedderTrainer(AbsEmbedderTrainer): + """ + Trainer class for base encoder models. + """ + def _save(self, output_dir: Optional[str] = None, state_dict=None): + """Save the model to directory. + + Args: + output_dir (Optional[str], optional): Output directory to save the model. Defaults to ``None``. + + Raises: + NotImplementedError + """ + output_dir = output_dir if output_dir is not None else self.args.output_dir + os.makedirs(output_dir, exist_ok=True) + logger.info("Saving model checkpoint to %s", output_dir) + # Save a trained model and configuration using `save_pretrained()`. + # They can then be reloaded using `from_pretrained()` + if not hasattr(self.model, 'save'): + raise NotImplementedError( + f'MODEL {self.model.__class__.__name__} ' + f'does not support save interface') + else: + self.model.save(output_dir) + if self.tokenizer is not None and self.is_world_process_zero(): + self.tokenizer.save_pretrained(output_dir) + + torch.save(self.args, os.path.join(output_dir, "training_args.bin"))
+ + + # save the checkpoint for sentence-transformers library + # if self.is_world_process_zero(): + # save_ckpt_for_sentence_transformers(output_dir, + # pooling_mode=self.args.sentence_pooling_method, + # normlized=self.args.normlized) +
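The ``_save`` override above writes the backbone weights via ``save_pretrained``, the tokenizer files, and ``training_args.bin`` into the output directory. A hedged sketch of loading those artifacts back (the path is a placeholder):

import torch
from transformers import AutoModel, AutoTokenizer

output_dir = "./output"   # placeholder path used during training

model = AutoModel.from_pretrained(output_dir)
tokenizer = AutoTokenizer.from_pretrained(output_dir)

# training_args.bin is a pickled arguments object, so weights_only must be disabled
training_args = torch.load(f"{output_dir}/training_args.bin", weights_only=False)
print(training_args.output_dir)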
\ No newline at end of file diff --git a/_modules/FlagEmbedding/finetune/embedder/encoder_only/m3/arguments.html b/_modules/FlagEmbedding/finetune/embedder/encoder_only/m3/arguments.html new file mode 100644 index 00000000..84ec9440 --- /dev/null +++ b/_modules/FlagEmbedding/finetune/embedder/encoder_only/m3/arguments.html @@ -0,0 +1,517 @@

Source code for FlagEmbedding.finetune.embedder.encoder_only.m3.arguments

+from dataclasses import dataclass, field
+
+from FlagEmbedding.abc.finetune.embedder import (
+    AbsEmbedderTrainingArguments,
+    AbsEmbedderModelArguments
+)
+
+
+
+[docs] +@dataclass +class EncoderOnlyEmbedderM3ModelArguments(AbsEmbedderModelArguments): + """ + Model argument class for M3. + """ + colbert_dim: int = field(default=-1, metadata={"help": "Dim of colbert linear"})
+ + + +
+[docs] +@dataclass +class EncoderOnlyEmbedderM3TrainingArguments(AbsEmbedderTrainingArguments): + """ + Training argument class for M3. + """ + unified_finetuning: bool = field(default=False, metadata={"help": "use unified fine-tuning"}) + use_self_distill: bool = field(default=False, metadata={"help": "use self-distill when using unified fine-tuning"}) + fix_encoder: bool = field(default=False, metadata={"help": "Freeze the parameters of the encoder"}) + self_distill_start_step: int = field(default=-1, metadata={"help": "The step to start self-distillation when using unified fine-tuning"})
+ +
\ No newline at end of file diff --git a/_modules/FlagEmbedding/finetune/embedder/encoder_only/m3/modeling.html b/_modules/FlagEmbedding/finetune/embedder/encoder_only/m3/modeling.html new file mode 100644 index 00000000..4a84b616 --- /dev/null +++ b/_modules/FlagEmbedding/finetune/embedder/encoder_only/m3/modeling.html @@ -0,0 +1,1092 @@

Source code for FlagEmbedding.finetune.embedder.encoder_only.m3.modeling

+import os
+import logging
+from typing import Dict, List, Union, Any
+
+import torch
+from torch import Tensor
+import torch.nn.functional as F
+from transformers import AutoTokenizer
+
+from FlagEmbedding.abc.finetune.embedder import AbsEmbedderModel, EmbedderOutput
+
+logger = logging.getLogger(__name__)
+
+
+
+[docs] +class EncoderOnlyEmbedderM3Model(AbsEmbedderModel): + """Embedder class for M3 model. + + Args: + base_model (AutoModel): The base model to train on. + tokenizer (AutoTokenizer, optional): The tokenizer to use. Defaults to ``None``. + negatives_cross_device (bool, optional): If True, will compute cross devices negative loss. Defaults to ``False``. + temperature (float, optional): Temperature to control the scale of scores. Defaults to ``1.0``. + sub_batch_size (int, optional): Sub-batch size during encoding. If negative, will not split to sub-batch. + Defaults to ``-1``. + kd_loss_type (str, optional): Type of knowledge distillation loss. Defaults to ``'m3_kd_loss'``. + sentence_pooling_method (str, optional): Pooling method to get sentence embedding. Defaults to ``'cls'``. + normalize_embeddings (bool, optional): If True, normalize the embedding vector. Defaults to ``False``. + unified_finetuning (bool, optional): If True, will finetune colbert vector and sparce embedding. Defaults to ``True``. + use_self_distill (bool, optional): If True, will do self distillation. Defaults to ``False``. + self_distill_start_step (int, optional): Step num to start self distillation. Defaults to ``-1``. + """ + def __init__( + self, + base_model: Dict[str, Any], + tokenizer: AutoTokenizer = None, + negatives_cross_device: bool = False, + temperature: float = 1, + sub_batch_size: int = -1, + kd_loss_type: str = 'm3_kd_loss', + sentence_pooling_method: str = 'cls', + normalize_embeddings: bool = False, + unified_finetuning: bool = True, + use_self_distill: bool = False, + self_distill_start_step: int = -1 + ): + super().__init__( + base_model, + tokenizer=tokenizer, + negatives_cross_device=negatives_cross_device, + temperature=temperature, + sub_batch_size=sub_batch_size, + kd_loss_type=kd_loss_type, + ) + self.sentence_pooling_method = sentence_pooling_method + self.normalize_embeddings = normalize_embeddings + self.cross_entropy = torch.nn.CrossEntropyLoss(reduction='mean') + + self.unified_finetuning = unified_finetuning + if not self.unified_finetuning: + self.model = base_model['model'] + self.colbert_linear = None + self.sparse_linear = None + else: + self.model = base_model['model'] + self.colbert_linear = base_model['colbert_linear'] + self.sparse_linear = base_model['sparse_linear'] + + self.config = self.model.config + + self.vocab_size = self.model.config.vocab_size + self.use_self_distill = use_self_distill + self.self_distill_start_step = self_distill_start_step + self.step = 0 + +
+[docs] + def _dense_embedding(self, last_hidden_state, attention_mask): + """Use the pooling method to get the dense embedding. + + Args: + last_hidden_state (torch.Tensor): The model output's last hidden state. + attention_mask (torch.Tensor): Mask out padding tokens during pooling. + + Raises: + NotImplementedError: Specified pooling method not implemented. + + Returns: + torch.Tensor: The dense embeddings. + """ + if self.sentence_pooling_method == "cls": + return last_hidden_state[:, 0] + elif self.sentence_pooling_method == "mean": + s = torch.sum( + last_hidden_state * attention_mask.unsqueeze(-1).float(), dim=1 + ) + d = attention_mask.sum(dim=1, keepdim=True).float() + return s / d + elif self.sentence_pooling_method == "last_token": + left_padding = attention_mask[:, -1].sum() == attention_mask.shape[0] + if left_padding: + return last_hidden_state[:, -1] + else: + sequence_lengths = attention_mask.sum(dim=1) - 1 + batch_size = last_hidden_state.shape[0] + return last_hidden_state[ + torch.arange(batch_size, device=last_hidden_state.device), + sequence_lengths, + ] + else: + raise NotImplementedError(f"pooling method {self.sentence_pooling_method} not implemented")
+ + +
+[docs] + def _sparse_embedding(self, hidden_state, input_ids, return_embedding: bool = True): + """Compute and return the sparse embedding. + + Args: + hidden_state (torch.Tensor): The model output's last hidden state. + input_ids (_type_): Ids from input features. + return_embedding (bool, optional): If True, return the computed embedding, otherwise just return the token weights. + Defaults to ``True``. + + Returns: + torch.Tensor: The sparse embedding or just the token weights. + """ + token_weights = torch.relu(self.sparse_linear(hidden_state)) + if not return_embedding: return token_weights + + sparse_embedding = torch.zeros( + input_ids.size(0), input_ids.size(1), self.vocab_size, + dtype=token_weights.dtype, + device=token_weights.device + ) + sparse_embedding = torch.scatter(sparse_embedding, dim=-1, index=input_ids.unsqueeze(-1), src=token_weights) + + unused_tokens = [ + self.tokenizer.cls_token_id, self.tokenizer.eos_token_id, + self.tokenizer.pad_token_id, self.tokenizer.unk_token_id + ] + sparse_embedding = torch.max(sparse_embedding, dim=1).values + sparse_embedding[:, unused_tokens] *= 0. + return sparse_embedding
+ + +
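The sparse (lexical) embedding above turns per-token ReLU weights into a vocabulary-sized vector: each weight is scattered to its token id, the result is max-pooled over the sequence, and special-token positions are zeroed. A toy standalone sketch of that pipeline:

import torch

vocab_size, hidden = 10, 4
sparse_linear = torch.nn.Linear(hidden, 1)

input_ids = torch.tensor([[2, 5, 5, 0]])    # (batch, seq_len); token id 0 plays the special-token role here
hidden_state = torch.randn(1, 4, hidden)

token_weights = torch.relu(sparse_linear(hidden_state))        # (batch, seq_len, 1)

sparse_embedding = torch.zeros(1, 4, vocab_size)
sparse_embedding = torch.scatter(
    sparse_embedding, dim=-1, index=input_ids.unsqueeze(-1), src=token_weights
)

sparse_embedding = torch.max(sparse_embedding, dim=1).values   # max-pool over the sequence
sparse_embedding[:, [0]] *= 0.0                                # zero out special tokens
print(sparse_embedding)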
+[docs] + def _colbert_embedding(self, last_hidden_state, mask): + """Get the colbert vectors. + + Args: + last_hidden_state (torch.Tensor): The model output's last hidden state. + attention_mask (torch.Tensor): Mask out padding tokens during pooling. + + Returns: + torch.Tensor: The colbert vectors. + """ + colbert_vecs = self.colbert_linear(last_hidden_state[:, 1:]) + colbert_vecs = colbert_vecs * mask[:, 1:][:, :, None].float() + return colbert_vecs
+ + +
+[docs] + def compute_score( + self, q_reps, p_reps, q_mask: torch.Tensor, + dense_weight: float = 1.0, sparse_weight: float = 0.3, colbert_weight: float = 1.0 + ): + """Compute the weighted sum of the dense, sparse and colbert scores. + + Args: + q_reps (torch.Tensor): Query representations. + p_reps (torch.Tensor): Passage representations. + q_mask (torch.Tensor): Attention mask of the queries, used by the colbert score. + dense_weight (float, optional): Weight of the dense score. Defaults to 1.0. + sparse_weight (float, optional): Weight of the sparse score. Defaults to 0.3. + colbert_weight (float, optional): Weight of the colbert score. Defaults to 1.0. + + Returns: + torch.Tensor: The weighted combined score. + """ + dense_score = self.compute_dense_score(q_reps, p_reps) + sparse_score = self.compute_sparse_score(q_reps, p_reps) + colbert_score = self.compute_colbert_score(q_reps, p_reps, q_mask=q_mask) + return dense_score * dense_weight + sparse_score * sparse_weight + colbert_score * colbert_weight
+ + +
+[docs] + def compute_dense_score(self, q_reps, p_reps): + """Compute the dense score. + + Args: + q_reps (torch.Tensor): Query representations. + p_reps (torch.Tensor): Passage representations. + + Returns: + torch.Tensor: The computed dense scores, adjusted by temperature. + """ + scores = self._compute_similarity(q_reps, p_reps) / self.temperature + scores = scores.view(q_reps.size(0), -1) + return scores
+ + +
+[docs] + def compute_sparse_score(self, q_reps, p_reps): + """Compute the sparse score. + + Args: + q_reps (torch.Tensor): Query representations. + p_reps (torch.Tensor): Passage representations. + + Returns: + torch.Tensor: The computed sparse scores, adjusted by temperature. + """ + scores = self._compute_similarity(q_reps, p_reps) / self.temperature + scores = scores.view(q_reps.size(0), -1) + return scores
+ + +
+[docs] + def compute_colbert_score(self, q_reps, p_reps, q_mask: torch.Tensor=None): + """Compute the colbert score. + + Args: + q_reps (torch.Tensor): Query representations. + p_reps (torch.Tensor): Passage representations. + q_mask (torch.Tensor, optional): Attention mask of the queries. Defaults to ``None``. + + Returns: + torch.Tensor: The computed colbert scores, adjusted by temperature. + """ + token_scores = torch.einsum('qin,pjn->qipj', q_reps, p_reps) + scores, _ = token_scores.max(-1) + scores = scores.sum(1) / q_mask[:, 1:].sum(-1, keepdim=True) + scores = scores / self.temperature + return scores
+ + +
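The colbert score above is a late-interaction score: every query token is matched against its best passage token, and the per-token maxima are averaged over the real (non-CLS) query length. A standalone toy sketch:

import torch

num_queries, num_passages, q_len, p_len, dim = 2, 2, 4, 6, 8
q_reps = torch.randn(num_queries, q_len - 1, dim)    # colbert vectors skip the CLS position
p_reps = torch.randn(num_passages, p_len - 1, dim)
q_mask = torch.ones(num_queries, q_len)              # full attention mask, CLS included
temperature = 0.02

token_scores = torch.einsum("qin,pjn->qipj", q_reps, p_reps)   # all query-token x passage-token dot products
scores, _ = token_scores.max(-1)                               # best passage token for each query token
scores = scores.sum(1) / q_mask[:, 1:].sum(-1, keepdim=True)   # average over real query tokens
scores = scores / temperature
print(scores.shape)   # torch.Size([2, 2])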
+[docs] + def ensemble_score(self, q_reps, p_reps, dense_scores=None, sparse_scores=None, colbert_scores=None): + """Compute the ensemble score of the three methods. + + Args: + q_reps (torch.Tensor): Query representations. + p_reps (torch.Tensor): Passage representations. + dense_scores (torch.Tensor, optional): The dense scores. Defaults to ``None``. + sparse_scores (torch.Tensor, optional): The sparse scores. Defaults to ``None``. + colbert_scores (torch.Tensor, optional): The colbert scores. Defaults to ``None``. + + Raises: + ValueError: dense_scores, sparse_scores, colbert_scores must be provided + + Returns: + _type_: The ensemble score of the three methods. + """ + if dense_scores is None or sparse_scores is None or colbert_scores is None: + raise ValueError("dense_scores, sparse_scores, colbert_scores must be provided!") + return dense_scores + 0.3 * sparse_scores + colbert_scores
+ + +
+[docs] + def _encode(self, features): + """Helper function to encode using input features. + + Args: + features (Union[list, dict]): Features fed to the model. + + Returns: + torch.Tensor: Dense embedding. + torch.Tensor: Sparse embedding. + torch.Tensor: Colbert vector. + """ + dense_vecs, sparse_vecs, colbert_vecs = None, None, None + last_hidden_state = self.model(**features, return_dict=True).last_hidden_state + dense_vecs = self._dense_embedding(last_hidden_state, features['attention_mask']) + if self.unified_finetuning: + sparse_vecs = self._sparse_embedding(last_hidden_state, features['input_ids']) + colbert_vecs = self._colbert_embedding(last_hidden_state, features['attention_mask']) + if self.normalize_embeddings: + dense_vecs = F.normalize(dense_vecs, dim=-1) + if self.unified_finetuning: + colbert_vecs = F.normalize(colbert_vecs, dim=-1) + return dense_vecs, sparse_vecs, colbert_vecs
+ + +
+[docs] + def encode(self, features): + """Encode and get the embedding. + + Args: + features (Union[list, dict]): Features feed to the model. + + Returns: + torch.Tensor: Dense embeddings. + torch.Tensor: Sparce embeddings. + torch.Tensor: Colbert vectors. + """ + if features is None: + return None + + if not isinstance(features, list): + if self.sub_batch_size is not None and self.sub_batch_size != -1: + all_dense_vecs, all_sparse_vecs, all_colbert_vecs = [], [], [] + for i in range(0, len(features['attention_mask']), self.sub_batch_size): + end_inx = min(i + self.sub_batch_size, len(features['attention_mask'])) + sub_features = {} + for k, v in features.items(): + sub_features[k] = v[i:end_inx] + + dense_vecs, sparse_vecs, colbert_vecs = self._encode(sub_features) + all_dense_vecs.append(dense_vecs) + all_sparse_vecs.append(sparse_vecs) + all_colbert_vecs.append(colbert_vecs) + + dense_vecs = torch.cat(all_dense_vecs, 0) + if self.unified_finetuning: + sparse_vecs = torch.cat(all_sparse_vecs, 0) + colbert_vecs = torch.cat(all_colbert_vecs, 0) + else: + dense_vecs, sparse_vecs, colbert_vecs = self._encode(features) + else: + all_dense_vecs, all_sparse_vecs, all_colbert_vecs = [], [], [] + for sub_features in features: + dense_vecs, sparse_vecs, colbert_vecs = self._encode(sub_features) + all_dense_vecs.append(dense_vecs) + all_sparse_vecs.append(sparse_vecs) + all_colbert_vecs.append(colbert_vecs) + + dense_vecs = torch.cat(all_dense_vecs, 0) + if self.unified_finetuning: + sparse_vecs = torch.cat(all_sparse_vecs, 0) + colbert_vecs = torch.cat(all_colbert_vecs, 0) + + if self.unified_finetuning: + return dense_vecs.contiguous(), sparse_vecs.contiguous(), colbert_vecs.contiguous() + else: + return dense_vecs.contiguous(), None, None
+ + +
+[docs] + def _compute_similarity(self, q_reps, p_reps): + """Computes the similarity between query and passage representations using inner product. + + Args: + q_reps (torch.Tensor): Query representations. + p_reps (torch.Tensor): Passage representations. + + Returns: + torch.Tensor: The computed similarity matrix. + """ + if len(p_reps.size()) == 2: + return torch.matmul(q_reps, p_reps.transpose(0, 1)) + return torch.matmul(q_reps, p_reps.transpose(-2, -1))
+ + +
+[docs] + def _get_queries_attention_mask(self, queries: Union[Dict[str, Tensor], List[Dict[str, Tensor]]]): + """padding attention mask for colbert + + Args: + queries (Union[Dict[str, Tensor], List[Dict[str, Tensor]]]): Input queries. + + Returns: + torch.Tensor: The query attention mask. + """ + if not isinstance(queries, list): + q_mask = queries['attention_mask'] + else: + q_mask_list = [sub_features['attention_mask'] for sub_features in queries] + _length = max([mask.shape[1] for mask in q_mask_list]) + if self.tokenizer.padding_side == 'right': + q_mask = torch.cat([ + F.pad(mask, (0, _length - mask.shape[1]), value=0) + for mask in q_mask_list + ], dim=0) + else: + q_mask = torch.cat([ + F.pad(mask, (_length - mask.shape[1], 0), value=0) + for mask in q_mask_list + ], dim=0) + return q_mask
+ + +
+[docs] + def forward( + self, + queries: Union[Dict[str, Tensor], List[Dict[str, Tensor]]] = None, + passages: Union[Dict[str, Tensor], List[Dict[str, Tensor]]] = None, + teacher_scores: Union[None, List[float]] = None, + no_in_batch_neg_flag: bool = False, + ): + """The computation performed at every call. + + Args: + queries (Union[Dict[str, Tensor], List[Dict[str, Tensor]]], optional): Input queries. Defaults to ``None``. + passages (Union[Dict[str, Tensor], List[Dict[str, Tensor]]], optional): Input passages. Defaults to ``None``. + teacher_scores (Union[None, List[float]], optional): Teacher scores for distillation. Defaults to ``None``. + no_in_batch_neg_flag (bool, optional): If True, use no in-batch negatives and no cross-device negatives. Defaults to ``False``. + + Returns: + EmbedderOutput: Output of the forward call of model. + """ + q_dense_vecs, q_sparse_vecs, q_colbert_vecs = self.encode(queries) # (batch_size, dim) + p_dense_vecs, p_sparse_vecs, p_colbert_vecs = self.encode(passages) # (batch_size * group_size, dim) + + if self.training: + if teacher_scores is not None: + teacher_scores = torch.tensor(teacher_scores, device=q_dense_vecs.device) + teacher_scores = teacher_scores.view(q_dense_vecs.size(0), -1).detach() # (batch_size, group_size) + teacher_targets = F.softmax(teacher_scores, dim=-1) # (batch_size, group_size) + else: + teacher_targets = None + + if no_in_batch_neg_flag: + compute_loss_func = self._compute_no_in_batch_neg_loss + else: + if self.negatives_cross_device: + compute_loss_func = self._compute_cross_device_neg_loss + else: + compute_loss_func = self._compute_in_batch_neg_loss + + # dense loss + dense_scores, loss = compute_loss_func( + q_dense_vecs, p_dense_vecs, teacher_targets=teacher_targets, + compute_score_func=self.compute_dense_score + ) + + if self.unified_finetuning: + # disable cross device negatives for unified finetuning + if no_in_batch_neg_flag: + compute_loss_func = self._compute_no_in_batch_neg_loss + else: + compute_loss_func = self._compute_in_batch_neg_loss + + # sparse loss + sparse_scores, sparse_loss = compute_loss_func( + q_sparse_vecs, p_sparse_vecs, teacher_targets=teacher_targets, + compute_score_func=self.compute_sparse_score + ) + + # colbert loss + colbert_scores, colbert_loss = compute_loss_func( + q_colbert_vecs, p_colbert_vecs, teacher_targets=teacher_targets, + compute_score_func=self.compute_colbert_score, + q_mask=self._get_queries_attention_mask(queries) + ) + + # get dense scores of current process + if not no_in_batch_neg_flag and self.negatives_cross_device: + dense_scores = dense_scores[ + q_dense_vecs.size(0)*self.process_rank : q_dense_vecs.size(0)*(self.process_rank+1), + p_dense_vecs.size(0)*self.process_rank : p_dense_vecs.size(0)*(self.process_rank+1) + ] # (batch_size, batch_size * group_size) + + # ensemble loss + ensemble_scores, ensemble_loss = compute_loss_func( + q_dense_vecs, p_dense_vecs, teacher_targets=teacher_targets, + compute_score_func=self.ensemble_score, + dense_scores=dense_scores, + sparse_scores=sparse_scores, + colbert_scores=colbert_scores + ) + + loss = (loss + ensemble_loss + 0.1 * sparse_loss + colbert_loss) / 4 + + if self.use_self_distill and self.step > self.self_distill_start_step: + self_teacher_targets = torch.softmax(ensemble_scores.detach(), dim=-1) + + dense_self_distill_loss = self.distill_loss("kl_div", self_teacher_targets, dense_scores) + sparse_self_distill_loss = self.distill_loss("kl_div", self_teacher_targets, sparse_scores) + colbert_self_distill_loss = 
self.distill_loss("kl_div", self_teacher_targets, colbert_scores) + + loss += (dense_self_distill_loss + 0.1 * sparse_self_distill_loss + colbert_self_distill_loss) / 3 + loss = loss / 2 + self.step += 1 + else: + loss = None + + return EmbedderOutput( + loss=loss, + )
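To make the weighting above easier to follow, here is a small sketch with stand-in scalar losses (not real model outputs) of how the final training loss is assembled when unified_finetuning is on and self-distillation has started.

# Stand-in scalars; in the real forward pass these come from the
# dense / sparse / colbert / ensemble loss computations above.
dense_loss, sparse_loss, colbert_loss, ensemble_loss = 0.40, 0.90, 0.55, 0.45

# The sparse loss is down-weighted by 0.1 and the four terms are averaged.
loss = (dense_loss + ensemble_loss + 0.1 * sparse_loss + colbert_loss) / 4

# Self-distillation: KL-style losses of each head against the ensemble scores
# are averaged in, and the total is halved again.
dense_sd, sparse_sd, colbert_sd = 0.30, 0.80, 0.35
loss += (dense_sd + 0.1 * sparse_sd + colbert_sd) / 3
loss = loss / 2
print(loss)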
+ + +
+[docs] + def compute_loss(self, scores, target): + """Compute the loss using cross entropy. + + Args: + scores (torch.Tensor): Computed score. + target (torch.Tensor): The target value. + + Returns: + torch.Tensor: The computed cross entropy loss. + """ + return self.cross_entropy(scores, target)
+ + +
+[docs] + def gradient_checkpointing_enable(self, **kwargs): + """ + Activates gradient checkpointing for the current model. + """ + self.model.gradient_checkpointing_enable(**kwargs)
+ + +
+[docs] + def enable_input_require_grads(self, **kwargs): + """ + Enables the gradients for the input embeddings. + """ + self.model.enable_input_require_grads(**kwargs)
+ + +
+[docs] + def save(self, output_dir: str): + """Save the model to the directory. + + Args: + output_dir (str): Directory for saving the model. + """ + def _trans_state_dict(state_dict): + state_dict = type(state_dict)( + {k: v.clone().cpu() + for k, + v in state_dict.items()}) + return state_dict + + self.model.save_pretrained(output_dir, state_dict=_trans_state_dict(self.model.state_dict())) + + if self.unified_finetuning: + torch.save(_trans_state_dict(self.colbert_linear.state_dict()), + os.path.join(output_dir, 'colbert_linear.pt')) + torch.save(_trans_state_dict(self.sparse_linear.state_dict()), + os.path.join(output_dir, 'sparse_linear.pt'))
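For reference, ``save`` writes the backbone via ``save_pretrained`` and, when unified finetuning is enabled, two extra state dicts next to it. A hedged sketch of loading those heads back (the path is illustrative, and the colbert head shape assumes the default ``colbert_dim <= 0``, i.e. hidden size):

import os
import torch
from transformers import AutoModel

output_dir = "path/to/saved_m3"  # illustrative path
backbone = AutoModel.from_pretrained(output_dir)

# The projection heads are plain state dicts saved next to the backbone.
colbert_linear = torch.nn.Linear(backbone.config.hidden_size, backbone.config.hidden_size)
sparse_linear = torch.nn.Linear(backbone.config.hidden_size, 1)
colbert_linear.load_state_dict(torch.load(os.path.join(output_dir, 'colbert_linear.pt'), map_location='cpu'))
sparse_linear.load_state_dict(torch.load(os.path.join(output_dir, 'sparse_linear.pt'), map_location='cpu'))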
+
+ + + +
+[docs] +class EncoderOnlyEmbedderM3ModelForInference(EncoderOnlyEmbedderM3Model): + """ + Inference class of M3 model. + """ +
+[docs] + def forward(self, + text_input: Dict[str, Tensor] = None, + return_dense: bool = True, + return_sparse: bool = False, + return_colbert_vecs: bool = False, + return_sparse_embedding: bool = False): + """Encode the text input using the selected way. + + Args: + text_input (Dict[str, Tensor], optional): Text inputs. Defaults to ``None``. + return_dense (bool, optional): If True, return the dense embedding. Defaults to ``True``. + return_sparse (bool, optional): If True, return the sparse embedding. Defaults to ``False``. + return_colbert_vecs (bool, optional): If True, return the colbert vectors. Defaults to ``False``. + return_sparse_embedding (bool, optional): Parameter for :meth:`_sparse_embedding()`. If True, will return sparse embedding. + Otherwise, return the token weights. Defaults to ``False``. + + Returns: + dict: A dictionary containing the three types of embeddings. + """ + assert return_dense or return_sparse or return_colbert_vecs, 'Must choose one or more from `return_colbert_vecs`, `return_sparse`, `return_dense` to set `True`!' + + last_hidden_state = self.model(**text_input, return_dict=True).last_hidden_state + + output = {} + if return_dense: + dense_vecs = self._dense_embedding(last_hidden_state, text_input['attention_mask']) + output['dense_vecs'] = dense_vecs + if return_sparse: + sparse_vecs = self._sparse_embedding( + last_hidden_state, text_input['input_ids'], + return_embedding=return_sparse_embedding + ) + output['sparse_vecs'] = sparse_vecs + if return_colbert_vecs: + colbert_vecs = self._colbert_embedding(last_hidden_state, text_input['attention_mask']) + output['colbert_vecs'] = colbert_vecs + + if self.normalize_embeddings: + if 'dense_vecs' in output: + output['dense_vecs'] = F.normalize(output['dense_vecs'], dim=-1) + if 'colbert_vecs' in output: + output['colbert_vecs'] = F.normalize(output['colbert_vecs'], dim=-1) + + return output
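A minimal usage sketch for the inference forward above; ``model`` and ``tokenizer`` are assumed to be an already-constructed EncoderOnlyEmbedderM3ModelForInference and its tokenizer.

import torch

inputs = tokenizer(["what is bge-m3?"], return_tensors="pt", padding=True)
with torch.no_grad():
    out = model(inputs, return_dense=True, return_sparse=True, return_colbert_vecs=True)

dense = out['dense_vecs']      # one sentence-level vector per input
sparse = out['sparse_vecs']    # per-token lexical weights (or embeddings if return_sparse_embedding=True)
colbert = out['colbert_vecs']  # one vector per token, used for late interaction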
\ No newline at end of file diff --git a/_modules/FlagEmbedding/finetune/embedder/encoder_only/m3/runner.html b/_modules/FlagEmbedding/finetune/embedder/encoder_only/m3/runner.html new file mode 100644 index 00000000..fd79c2a6 --- /dev/null +++ b/_modules/FlagEmbedding/finetune/embedder/encoder_only/m3/runner.html @@ -0,0 +1,660 @@

Source code for FlagEmbedding.finetune.embedder.encoder_only.m3.runner

+import os
+import torch
+import logging
+from typing import Tuple
+from transformers import (
+    AutoModel, AutoConfig,
+    AutoTokenizer, PreTrainedTokenizer
+)
+from huggingface_hub import snapshot_download
+
+from FlagEmbedding.abc.finetune.embedder import (
+    AbsEmbedderRunner, AbsEmbedderModel,
+    AbsEmbedderDataArguments, EmbedderTrainerCallbackForDataRefresh
+)
+from .modeling import EncoderOnlyEmbedderM3Model
+from .trainer import EncoderOnlyEmbedderM3Trainer
+from .arguments import EncoderOnlyEmbedderM3ModelArguments, EncoderOnlyEmbedderM3TrainingArguments
+
+logger = logging.getLogger(__name__)
+
+
+
+[docs] +class EncoderOnlyEmbedderM3Runner(AbsEmbedderRunner): + """ + M3 model runner for finetuning. + + Args: + model_args (EncoderOnlyEmbedderM3ModelArguments): Model arguments + data_args (AbsEmbedderDataArguments): Data arguments. + training_args (EncoderOnlyEmbedderM3TrainingArguments): Training arguments. + """ + def __init__( + self, + model_args: EncoderOnlyEmbedderM3ModelArguments, + data_args: AbsEmbedderDataArguments, + training_args: EncoderOnlyEmbedderM3TrainingArguments + ): + super().__init__(model_args, data_args, training_args) + self.model_args: EncoderOnlyEmbedderM3ModelArguments + self.data_args: AbsEmbedderDataArguments + self.training_args: EncoderOnlyEmbedderM3TrainingArguments + +
+[docs] + @staticmethod + def get_model( + model_name_or_path: str, + trust_remote_code: bool = False, + colbert_dim: int = -1, + cache_dir: str = None + ): + """Get the model. + + Args: + model_name_or_path (str): If it's a path to a local model, it loads the model from the path. Otherwise tries to download and + load a model from HuggingFace Hub with the name. + trust_remote_code (bool, optional): trust_remote_code to use when loading models from HF. Defaults to ``False``. + colbert_dim (int, optional): Colbert dim to set. Defaults to ``-1``. + cache_dir (str, optional): HF cache dir to store the model. Defaults to ``None``. + + Returns: + dict: A dictionary containing the model, colbert linear and sparse linear. + """ + cache_folder = os.getenv('HF_HUB_CACHE', None) if cache_dir is None else cache_dir + if not os.path.exists(model_name_or_path): + model_name_or_path = snapshot_download( + repo_id=model_name_or_path, + cache_dir=cache_folder, + ignore_patterns=['flax_model.msgpack', 'rust_model.ot', 'tf_model.h5'] + ) + + model = AutoModel.from_pretrained( + model_name_or_path, + cache_dir=cache_folder, + trust_remote_code=trust_remote_code + ) + colbert_linear = torch.nn.Linear( + in_features=model.config.hidden_size, + out_features=model.config.hidden_size if colbert_dim <= 0 else colbert_dim + ) + sparse_linear = torch.nn.Linear( + in_features=model.config.hidden_size, + out_features=1 + ) + + colbert_model_path = os.path.join(model_name_or_path, 'colbert_linear.pt') + sparse_model_path = os.path.join(model_name_or_path, 'sparse_linear.pt') + if os.path.exists(colbert_model_path) and os.path.exists(sparse_model_path): + logger.info('loading existing colbert_linear and sparse_linear---------') + colbert_state_dict = torch.load(colbert_model_path, map_location='cpu', weights_only=True) + sparse_state_dict = torch.load(sparse_model_path, map_location='cpu', weights_only=True) + colbert_linear.load_state_dict(colbert_state_dict) + sparse_linear.load_state_dict(sparse_state_dict) + else: + logger.info('The parameters of colbert_linear and sparse_linear are newly initialized. Make sure the model is loaded for training, not for inference.') + + return { + 'model': model, + 'colbert_linear': colbert_linear, + 'sparse_linear': sparse_linear + }
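A usage sketch of the static helper above; the model id is illustrative, and a local checkpoint directory works the same way.

modules = EncoderOnlyEmbedderM3Runner.get_model('BAAI/bge-m3', trust_remote_code=False, colbert_dim=-1)
backbone = modules['model']                 # the HF encoder backbone
colbert_linear = modules['colbert_linear']  # per-token projection head
sparse_linear = modules['sparse_linear']    # token-weight head (hidden_size -> 1)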
+ + +
+[docs] + def load_tokenizer_and_model(self) -> Tuple[PreTrainedTokenizer, AbsEmbedderModel]: + """Load the tokenizer and model. + + Returns: + Tuple[PreTrainedTokenizer, AbsEmbedderModel]: Tokenizer and model instances. + """ + tokenizer = AutoTokenizer.from_pretrained( + self.model_args.model_name_or_path, + cache_dir=self.model_args.cache_dir, + token=self.model_args.token, + trust_remote_code=self.model_args.trust_remote_code + ) + + num_labels = 1 + config = AutoConfig.from_pretrained( + self.model_args.config_name if self.model_args.config_name else self.model_args.model_name_or_path, + num_labels=num_labels, + cache_dir=self.model_args.cache_dir, + token=self.model_args.token, + trust_remote_code=self.model_args.trust_remote_code, + ) + logger.info('Config: %s', config) + + model = EncoderOnlyEmbedderM3Model( + self.get_model(self.model_args.model_name_or_path, self.model_args.trust_remote_code, self.model_args.colbert_dim), + tokenizer=tokenizer, + negatives_cross_device=self.training_args.negatives_cross_device, + temperature=self.training_args.temperature, + sub_batch_size=self.training_args.sub_batch_size, + kd_loss_type=self.training_args.kd_loss_type, + sentence_pooling_method=self.training_args.sentence_pooling_method, + normalize_embeddings=self.training_args.normalize_embeddings, + unified_finetuning=self.training_args.unified_finetuning, + use_self_distill=self.training_args.use_self_distill, + self_distill_start_step=self.training_args.self_distill_start_step + ) + + if self.training_args.gradient_checkpointing: + model.enable_input_require_grads() + + if self.training_args.fix_position_embedding: + for k, v in model.named_parameters(): + if "position_embeddings" in k: + logging.info(f"Freeze the parameters for {k}") + v.requires_grad = False + return tokenizer, model
+ + +
+[docs] + def load_trainer(self) -> EncoderOnlyEmbedderM3Trainer: + """Load the M3 trainer. + + Returns: + EncoderOnlyEmbedderM3Trainer: M3 Trainer instance. + """ + trainer = EncoderOnlyEmbedderM3Trainer( + model=self.model, + args=self.training_args, + train_dataset=self.train_dataset, + data_collator=self.data_collator, + tokenizer=self.tokenizer + ) + if self.data_args.same_dataset_within_batch: + trainer.add_callback(EmbedderTrainerCallbackForDataRefresh(self.train_dataset)) + return trainer
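A hedged end-to-end sketch of driving this runner; the flags are supplied on the command line as usual, and ``run()`` is assumed to be the entry point inherited from AbsEmbedderRunner.

from transformers import HfArgumentParser

parser = HfArgumentParser((
    EncoderOnlyEmbedderM3ModelArguments,
    AbsEmbedderDataArguments,
    EncoderOnlyEmbedderM3TrainingArguments
))
model_args, data_args, training_args = parser.parse_args_into_dataclasses()

runner = EncoderOnlyEmbedderM3Runner(model_args, data_args, training_args)
runner.run()  # assumed: inherited from AbsEmbedderRunner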
\ No newline at end of file diff --git a/_modules/FlagEmbedding/finetune/embedder/encoder_only/m3/trainer.html b/_modules/FlagEmbedding/finetune/embedder/encoder_only/m3/trainer.html new file mode 100644 index 00000000..914762dc --- /dev/null +++ b/_modules/FlagEmbedding/finetune/embedder/encoder_only/m3/trainer.html @@ -0,0 +1,533 @@

Source code for FlagEmbedding.finetune.embedder.encoder_only.m3.trainer

+import os
+import torch
+import logging
+from typing import Optional
+
+from FlagEmbedding.abc.finetune.embedder import AbsEmbedderTrainer
+
+logger = logging.getLogger(__name__)
+
+
+
+[docs] +class EncoderOnlyEmbedderM3Trainer(AbsEmbedderTrainer): + """ + Trainer class for M3. + """ + def _save(self, output_dir: Optional[str] = None, state_dict=None): + """Save the model to directory. + + Args: + output_dir (Optional[str], optional): Output directory to save the model. Defaults to ``None``. + + Raises: + NotImplementedError + """ + output_dir = output_dir if output_dir is not None else self.args.output_dir + os.makedirs(output_dir, exist_ok=True) + logger.info("Saving model checkpoint to %s", output_dir) + # Save a trained model and configuration using `save_pretrained()`. + # They can then be reloaded using `from_pretrained()` + if not hasattr(self.model, 'save'): + raise NotImplementedError( + f'MODEL {self.model.__class__.__name__} ' + f'does not support save interface') + else: + self.model.save(output_dir) + if self.tokenizer is not None and self.is_world_process_zero(): + self.tokenizer.save_pretrained(output_dir) + + torch.save(self.args, os.path.join(output_dir, "training_args.bin"))
+ + + # save the checkpoint for sentence-transformers library + # if self.is_world_process_zero(): + # save_ckpt_for_sentence_transformers(output_dir, + # pooling_mode=self.args.sentence_pooling_method, + # normlized=self.args.normlized) +
\ No newline at end of file diff --git a/_modules/FlagEmbedding/finetune/reranker/decoder_only/base/arguments.html b/_modules/FlagEmbedding/finetune/reranker/decoder_only/base/arguments.html new file mode 100644 index 00000000..071c062e --- /dev/null +++ b/_modules/FlagEmbedding/finetune/reranker/decoder_only/base/arguments.html @@ -0,0 +1,547 @@

Source code for FlagEmbedding.finetune.reranker.decoder_only.base.arguments

+from typing import List
+from dataclasses import dataclass, field
+
+from FlagEmbedding.abc.finetune.reranker import AbsRerankerModelArguments
+
+
+def default_target_modules() -> List[str]:
+    return ['v_proj', 'q_proj', 'k_proj', 'gate_proj', 'down_proj', 'o_proj', 'up_proj']
+
+
+
+[docs] +@dataclass +class RerankerModelArguments(AbsRerankerModelArguments): + """ + Model argument class for decoder only reranker. + """ + use_lora: bool = field( + default=True, + metadata={"help": "If passed, will use LORA (low-rank parameter-efficient training) to train the model."} + ) + lora_rank: int = field( + default=64, + metadata={"help": "The rank of lora."} + ) + lora_alpha: float = field( + default=16, + metadata={"help": "The alpha parameter of lora."} + ) + lora_dropout: float = field( + default=0.1, + metadata={"help": "The dropout rate of lora modules."} + ) + target_modules: List[str] = field( + default_factory=default_target_modules, + metadata={"help": "The target modules to apply LORA."} + ) + modules_to_save: List[str] = field( + default=None, + metadata={"help": "List of modules that should be saved in the final checkpoint."} + ) + use_flash_attn: bool = field( + default=False, + metadata={"help": "If passed, will use flash attention to train the model."} + ) + # use_slow_tokenizer: bool = field( + # default=False, + # metadata={"help": "If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library)."} + # ) + from_peft: str = field( + default=None + ) + raw_peft: List[str] = field( + default=None + ) + + save_merged_lora_model: bool = field( + default=False, + metadata={"help": "If passed, will merge the lora modules and save the entire model."} + )
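For example, the dataclass above can be instantiated directly to override the LoRA defaults; the base model id is illustrative, and ``model_name_or_path`` is assumed to be a field inherited from AbsRerankerModelArguments.

args = RerankerModelArguments(
    model_name_or_path='meta-llama/Llama-2-7b-hf',  # illustrative; assumed inherited field
    lora_rank=32,
    lora_alpha=64,
    lora_dropout=0.05,
)
print(args.use_lora)        # True by default
print(args.target_modules)  # the projection modules from default_target_modules()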
\ No newline at end of file diff --git a/_modules/FlagEmbedding/finetune/reranker/decoder_only/base/modeling.html b/_modules/FlagEmbedding/finetune/reranker/decoder_only/base/modeling.html new file mode 100644 index 00000000..68d6b121 --- /dev/null +++ b/_modules/FlagEmbedding/finetune/reranker/decoder_only/base/modeling.html @@ -0,0 +1,543 @@

Source code for FlagEmbedding.finetune.reranker.decoder_only.base.modeling

+import torch
+from transformers import PreTrainedModel, AutoTokenizer
+import logging
+
+from FlagEmbedding.abc.finetune.reranker import AbsRerankerModel
+
+logger = logging.getLogger(__name__)
+
+
+
+[docs] +class CrossDecoderModel(AbsRerankerModel): + """ + Model class for decoder only reranker. + + Args: + base_model (PreTrainedModel): The underlying pre-trained model used for encoding and scoring input pairs. + tokenizer (AutoTokenizer, optional): The tokenizer for encoding input text. Defaults to ``None``. + train_batch_size (int, optional): The batch size to use. Defaults to ``4``. + """ + def __init__( + self, + base_model: PreTrainedModel, + tokenizer: AutoTokenizer = None, + train_batch_size: int = 4, + ): + super().__init__( + base_model, + tokenizer=tokenizer, + train_batch_size=train_batch_size, + ) + +
+[docs] + def encode(self, features): + """Encodes input features to logits. + + Args: + features (dict): Dictionary with input features. + + Returns: + torch.Tensor: The logits output from the model. + """ + if features is None: + return None + outputs = self.model(input_ids=features['input_ids'], + attention_mask=features['attention_mask'], + position_ids=features['position_ids'] if 'position_ids' in features.keys() else None, + output_hidden_states=True) + # _, max_indices = torch.max(features['labels'], dim=1) + # predict_indices = max_indices + # logits = [outputs.logits[i, predict_indices[i], :] for i in range(outputs.logits.shape[0])] + # logits = torch.stack(logits, dim=0) + scores = outputs.logits[:, -1, self.yes_loc] + return scores.contiguous()
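The score above is a single vocabulary logit: at the last position of each sequence, the logit of the "Yes" token (whose id is presumably stored in ``self.yes_loc`` during model setup) serves as the relevance score. A toy sketch of the same indexing:

import torch

batch_size, seq_len, vocab_size = 2, 5, 11
yes_loc = 7                      # illustrative token id of "Yes"
logits = torch.randn(batch_size, seq_len, vocab_size)

scores = logits[:, -1, yes_loc]  # one scalar relevance score per query-passage pair
print(scores.shape)              # torch.Size([2])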
\ No newline at end of file diff --git a/_modules/FlagEmbedding/finetune/reranker/decoder_only/base/runner.html b/_modules/FlagEmbedding/finetune/reranker/decoder_only/base/runner.html new file mode 100644 index 00000000..3ea8752c --- /dev/null +++ b/_modules/FlagEmbedding/finetune/reranker/decoder_only/base/runner.html @@ -0,0 +1,606 @@

Source code for FlagEmbedding.finetune.reranker.decoder_only.base.runner

+import logging
+from typing import Tuple
+from pathlib import Path
+from FlagEmbedding.abc.finetune.reranker.AbsArguments import AbsRerankerDataArguments, AbsRerankerTrainingArguments
+from transformers import (
+    AutoTokenizer, PreTrainedTokenizer
+)
+
+from FlagEmbedding.abc.finetune.reranker import AbsRerankerRunner, AbsRerankerModel
+
+from .modeling import CrossDecoderModel
+from .arguments import RerankerModelArguments
+from .trainer import DecoderOnlyRerankerTrainer
+from .load_model import get_model, save_merged_model
+
+logger = logging.getLogger(__name__)
+
+
+
+[docs] +class DecoderOnlyRerankerRunner(AbsRerankerRunner): + """ + Decoder only reranker runner for finetuning. + + Args: + model_args (RerankerModelArguments): Model arguments instance. + data_args (AbsRerankerDataArguments): Data arguments instance. + training_args (AbsRerankerTrainingArguments): Trainer arguments. + """ + def __init__( + self, + model_args: RerankerModelArguments, + data_args: AbsRerankerDataArguments, + training_args: AbsRerankerTrainingArguments + ): + super().__init__(model_args, data_args, training_args) + +
+[docs] + def load_tokenizer_and_model(self) -> Tuple[PreTrainedTokenizer, AbsRerankerModel]: + """Load the tokenizer and model. + + Returns: + Tuple[PreTrainedTokenizer, AbsEmbedderModel]: Tokenizer and model instances. + """ + tokenizer = AutoTokenizer.from_pretrained( + self.model_args.tokenizer_name if self.model_args.tokenizer_name else self.model_args.model_name_or_path, + token=self.model_args.token, + cache_dir=self.model_args.cache_dir, + use_fast=False, + add_eos_token=False, + trust_remote_code=self.model_args.trust_remote_code, + ) + + if tokenizer.pad_token is None: + if tokenizer.unk_token is not None: + tokenizer.pad_token = tokenizer.unk_token + tokenizer.pad_token_id = tokenizer.unk_token_id + elif tokenizer.eod_id is not None: + tokenizer.pad_token = tokenizer.eod + tokenizer.pad_token_id = tokenizer.eod_id + tokenizer.bos_token = tokenizer.im_start + tokenizer.bos_token_id = tokenizer.im_start_id + tokenizer.eos_token = tokenizer.im_end + tokenizer.eos_token_id = tokenizer.im_end_id + else: + tokenizer.pad_token = tokenizer.eos_token + tokenizer.pad_token_id = tokenizer.eos_token_id + # if 'mistral' in self.model_args.model_name_or_path.lower(): + tokenizer.padding_side = 'left' + + base_model = get_model(self.model_args) + + model = CrossDecoderModel( + base_model, + tokenizer=tokenizer, + train_batch_size=self.training_args.per_device_train_batch_size, + ) + + if self.training_args.gradient_checkpointing: + model.enable_input_require_grads() + + return tokenizer, model
+ + +
+[docs] + def load_trainer(self) -> DecoderOnlyRerankerTrainer: + """Load the trainer. + + Returns: + DecoderOnlyRerankerTrainer: Loaded trainer instance. + """ + trainer = DecoderOnlyRerankerTrainer( + model=self.model, + args=self.training_args, + train_dataset=self.train_dataset, + data_collator=self.data_collator, + tokenizer=self.tokenizer + ) + return trainer
+ + +
+[docs] + def run(self): + """ + Run the finetuning. + """ + Path(self.training_args.output_dir).mkdir(parents=True, exist_ok=True) + + # Training + self.trainer.train(resume_from_checkpoint=self.training_args.resume_from_checkpoint) + self.trainer.save_model() + + # save merged model + if self.model_args.save_merged_lora_model and self.training_args.process_index == 0: + save_merged_model(self.model_args, self.training_args.output_dir)
\ No newline at end of file diff --git a/_modules/FlagEmbedding/finetune/reranker/decoder_only/base/trainer.html b/_modules/FlagEmbedding/finetune/reranker/decoder_only/base/trainer.html new file mode 100644 index 00000000..e4c16d03 --- /dev/null +++ b/_modules/FlagEmbedding/finetune/reranker/decoder_only/base/trainer.html @@ -0,0 +1,541 @@

Source code for FlagEmbedding.finetune.reranker.decoder_only.base.trainer

+import os
+import torch
+import logging
+from typing import Optional
+# from transformers.deepspeed import is_deepspeed_zero3_enabled
+
+from FlagEmbedding.abc.finetune.reranker import AbsRerankerTrainer
+from peft import get_peft_model_state_dict
+
+logger = logging.getLogger(__name__)
+
+
+
+[docs] +class DecoderOnlyRerankerTrainer(AbsRerankerTrainer): + """ + Trainer class for decoder only base reranker models. + """ + def _save(self, output_dir: Optional[str] = None, state_dict=None): + """Save the model to directory. + + Args: + output_dir (Optional[str], optional): Output directory to save the model. Defaults to ``None``. + + Raises: + NotImplementedError + """ + output_dir = output_dir if output_dir is not None else self.args.output_dir + os.makedirs(output_dir, exist_ok=True) + logger.info("Saving model checkpoint to %s", output_dir) + # Save a trained model and configuration using `save_pretrained()`. + # They can then be reloaded using `from_pretrained()` + if not hasattr(self.model, 'save'): + raise NotImplementedError( + f'MODEL {self.model.__class__.__name__} ' + f'does not support save interface') + else: + self.model.save(output_dir) + + if self.tokenizer is not None and self.is_world_process_zero(): + self.tokenizer.save_pretrained(output_dir) + + torch.save(self.args, os.path.join(output_dir, "training_args.bin"))
+ + + # if is_deepspeed_zero3_enabled(): + # if state_dict is None: + # state_dict = self.model.state_dict() + # prefix = 'model.' + # assert all(k.startswith(prefix) for k in state_dict.keys()), list(state_dict.keys()) + # state_dict = {k[len(prefix):]: v for k, v in state_dict.items()} + # lora_state_dict = get_peft_model_state_dict(self.model.model, state_dict) + # if self.args.process_index <= 0: + # torch.save(lora_state_dict, os.path.join(output_dir, "adapter_model.bin")) + # print(f"Save adapter model at {output_dir}") +
\ No newline at end of file diff --git a/_modules/FlagEmbedding/finetune/reranker/decoder_only/layerwise/arguments.html b/_modules/FlagEmbedding/finetune/reranker/decoder_only/layerwise/arguments.html new file mode 100644 index 00000000..cbefc0e4 --- /dev/null +++ b/_modules/FlagEmbedding/finetune/reranker/decoder_only/layerwise/arguments.html @@ -0,0 +1,567 @@

Source code for FlagEmbedding.finetune.reranker.decoder_only.layerwise.arguments

+from typing import List
+from dataclasses import dataclass, field
+
+from FlagEmbedding.abc.finetune.reranker import AbsRerankerModelArguments
+
+
+def default_target_modules() -> List[str]:
+    return ['v_proj', 'q_proj', 'k_proj', 'gate_proj', 'down_proj', 'o_proj', 'up_proj']
+
+
+
+[docs] +@dataclass +class RerankerModelArguments(AbsRerankerModelArguments): + """ + Model argument class for decoder only reranker. + """ + use_lora: bool = field( + default=True, + metadata={"help": "If passed, will use LORA (low-rank parameter-efficient training) to train the model."} + ) + lora_rank: int = field( + default=64, + metadata={"help": "The rank of lora."} + ) + lora_alpha: float = field( + default=16, + metadata={"help": "The alpha parameter of lora."} + ) + lora_dropout: float = field( + default=0.1, + metadata={"help": "The dropout rate of lora modules."} + ) + target_modules: List[str] = field( + default_factory=default_target_modules, + metadata={"help": "The target modules to apply LORA."} + ) + modules_to_save: List[str] = field( + default=None, + metadata={"help": "List of modules that should be saved in the final checkpoint."} + ) + use_flash_attn: bool = field( + default=False, + metadata={"help": "If passed, will use flash attention to train the model."} + ) + # use_slow_tokenizer: bool = field( + # default=False, + # metadata={"help": "If passed, will use a slow tokenizer (not backed by the 🤗 Tokenizers library)."} + # ) + from_peft: str = field( + default=None + ) + raw_peft: List[str] = field( + default=None + ) + + save_merged_lora_model: bool = field( + default=False, + metadata={"help": "If passed, will merge the lora modules and save the entire model."} + ) + + model_type: str = field( + default='from_raw_model' # should be one of ['from_raw_model', 'from_finetuned_model'] + # from_raw_model -- openbmb/MiniCPM-2B-dpo-bf16 + # from_finetuned_model -- BAAI/bge-reranker-v2-minicpm-layerwise + ) + + start_layer: int = field( + default=8, + metadata={"help": "which layer to start to compute score"} + ) + + head_multi: bool = field( + default=False, + metadata={"help": "use one / multi classifier"} + ) + head_type: str = field( + default='simple', + metadata={"help": "the type of the classifier"} + )
\ No newline at end of file diff --git a/_modules/FlagEmbedding/finetune/reranker/decoder_only/layerwise/modeling.html b/_modules/FlagEmbedding/finetune/reranker/decoder_only/layerwise/modeling.html new file mode 100644 index 00000000..43bc36de --- /dev/null +++ b/_modules/FlagEmbedding/finetune/reranker/decoder_only/layerwise/modeling.html @@ -0,0 +1,584 @@

Source code for FlagEmbedding.finetune.reranker.decoder_only.layerwise.modeling

+import torch
+from transformers import PreTrainedModel, AutoTokenizer
+import logging
+from typing import List, Union, Dict, Optional
+from torch import Tensor
+
+from FlagEmbedding.abc.finetune.reranker import AbsRerankerModel, RerankerOutput
+
+logger = logging.getLogger(__name__)
+
+
+
+[docs] +class CrossDecoderModel(AbsRerankerModel): + """ + Model class for decoder only reranker. + + Args: + base_model (PreTrainedModel): The underlying pre-trained model used for encoding and scoring input pairs. + tokenizer (AutoTokenizer, optional): The tokenizer for encoding input text. Defaults to ``None``. + train_batch_size (int, optional): The batch size to use. Defaults to ``4``. + start_layer (int, optional): Starting layer for layerwise. Defaults to ``8``. + """ + def __init__( + self, + base_model: PreTrainedModel, + tokenizer: AutoTokenizer = None, + train_batch_size: int = 4, + start_layer: int = 8 + ): + super().__init__( + base_model, + tokenizer=tokenizer, + train_batch_size=train_batch_size, + ) + + self.start_layer = start_layer + +
+[docs] + def encode(self, features): + if features is None: + return None + outputs = self.model(input_ids=features['input_ids'], + attention_mask=features['attention_mask'], + position_ids=features['position_ids'] if 'position_ids' in features.keys() else None, + output_hidden_states=True) + all_logits = outputs.logits + all_scores = [] + for logits in all_logits: + all_scores.append(logits[:, -1].contiguous()) + return all_scores
+ + +
+[docs] + def forward(self, pair: Union[Dict[str, Tensor], List[Dict[str, Tensor]]] = None, teacher_scores: Optional[Tensor] = None): + ranker_logits = self.encode(pair) # (batch_size * num, dim) + + if self.training: + loss = 0 + for logits in ranker_logits: + grouped_logits = logits.view(self.train_batch_size, -1) + target = torch.zeros(self.train_batch_size, device=grouped_logits.device, dtype=torch.long) + loss += self.compute_loss(grouped_logits, target) + + if teacher_scores is None: + teacher_scores = ranker_logits[-1].view( + self.train_batch_size, + -1 + ) + teacher_targets = torch.softmax(teacher_scores.detach(), dim=-1) + for logits in ranker_logits[:-1]: + student_scores = logits.view( + self.train_batch_size, + -1 + ) + loss += - torch.mean(torch.sum(torch.log_softmax(student_scores, dim=-1) * teacher_targets, dim=-1)) + else: + teacher_scores = torch.Tensor(teacher_scores) + teacher_scores = teacher_scores.view(self.train_batch_size, -1) + teacher_targets = torch.softmax(teacher_scores.detach(), dim=-1).to(ranker_logits[-1].device) + for logits in ranker_logits: + student_scores = logits.view( + self.train_batch_size, + -1 + ) + loss += - torch.mean(torch.sum(torch.log_softmax(student_scores, dim=-1) * teacher_targets, dim=-1)) + else: + loss = None + + # print(loss) + return RerankerOutput( + loss=loss, + scores=ranker_logits, + )
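To unpack the training loss above: every layer's scores contribute a cross-entropy term against the positive-at-index-0 target, and earlier layers are additionally distilled towards the last layer's score distribution (or, when teacher scores are provided, every layer is distilled towards them). A small sketch of that distillation term with toy tensors:

import torch

train_batch_size, group_size = 2, 4                         # toy sizes
teacher_scores = torch.randn(train_batch_size, group_size)  # e.g. last-layer scores
student_scores = torch.randn(train_batch_size, group_size)  # e.g. an earlier layer

teacher_targets = torch.softmax(teacher_scores.detach(), dim=-1)
# Same form as above: soft cross entropy between teacher targets and student log-probs.
distill_loss = -torch.mean(
    torch.sum(torch.log_softmax(student_scores, dim=-1) * teacher_targets, dim=-1)
)
print(distill_loss)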
\ No newline at end of file diff --git a/_modules/FlagEmbedding/finetune/reranker/decoder_only/layerwise/runner.html b/_modules/FlagEmbedding/finetune/reranker/decoder_only/layerwise/runner.html new file mode 100644 index 00000000..c0777a58 --- /dev/null +++ b/_modules/FlagEmbedding/finetune/reranker/decoder_only/layerwise/runner.html @@ -0,0 +1,607 @@

Source code for FlagEmbedding.finetune.reranker.decoder_only.layerwise.runner

+import os
+import logging
+from typing import Tuple
+from pathlib import Path
+from FlagEmbedding.abc.finetune.reranker.AbsArguments import AbsRerankerDataArguments, AbsRerankerTrainingArguments
+from transformers import (
+    AutoTokenizer, PreTrainedTokenizer
+)
+
+from FlagEmbedding.abc.finetune.reranker import AbsRerankerRunner, AbsRerankerModel
+from FlagEmbedding.finetune.reranker.decoder_only.layerwise.modeling import CrossDecoderModel
+from FlagEmbedding.finetune.reranker.decoder_only.layerwise.arguments import RerankerModelArguments
+from FlagEmbedding.finetune.reranker.decoder_only.layerwise.trainer import DecoderOnlyRerankerTrainer
+from FlagEmbedding.finetune.reranker.decoder_only.layerwise.load_model import get_model, save_merged_model
+
+logger = logging.getLogger(__name__)
+
+
+[docs] +class DecoderOnlyRerankerRunner(AbsRerankerRunner): + """ + Decoder only layerwise reranker runner for finetuning. + + Args: + model_args (RerankerModelArguments): Model arguments instance. + data_args (AbsRerankerDataArguments): Data arguments instance. + training_args (AbsRerankerTrainingArguments): Trainer arguments. + """ + def __init__( + self, + model_args: RerankerModelArguments, + data_args: AbsRerankerDataArguments, + training_args: AbsRerankerTrainingArguments + ): + super().__init__(model_args, data_args, training_args) + +
+[docs] + def load_tokenizer_and_model(self) -> Tuple[PreTrainedTokenizer, AbsRerankerModel]: + """Load the tokenizer and model. + + Returns: + Tuple[PreTrainedTokenizer, AbsEmbedderModel]: Tokenizer and model instances. + """ + # print(self.model_args.model_name_or_path) + tokenizer = AutoTokenizer.from_pretrained( + self.model_args.tokenizer_name if self.model_args.tokenizer_name else self.model_args.model_name_or_path, + token=self.model_args.token, + cache_dir=self.model_args.cache_dir, + # use_fast=False, + add_eos_token=False, + trust_remote_code=self.model_args.trust_remote_code + ) + + if tokenizer.pad_token is None: + if tokenizer.unk_token is not None: + tokenizer.pad_token = tokenizer.unk_token + tokenizer.pad_token_id = tokenizer.unk_token_id + elif tokenizer.eod_id is not None: + tokenizer.pad_token = tokenizer.eod + tokenizer.pad_token_id = tokenizer.eod_id + tokenizer.bos_token = tokenizer.im_start + tokenizer.bos_token_id = tokenizer.im_start_id + tokenizer.eos_token = tokenizer.im_end + tokenizer.eos_token_id = tokenizer.im_end_id + else: + tokenizer.pad_token = tokenizer.eos_token + tokenizer.pad_token_id = tokenizer.eos_token_id + # if 'mistral' in self.model_args.model_name_or_path.lower(): + tokenizer.padding_side = 'left' + + base_model = get_model(self.model_args, tokenizer('Yes', add_special_tokens=False)['input_ids'][-1]) + + model = CrossDecoderModel( + base_model, + tokenizer=tokenizer, + train_batch_size=self.training_args.per_device_train_batch_size, + start_layer=self.model_args.start_layer + ) + + if self.training_args.gradient_checkpointing: + model.enable_input_require_grads() + + return tokenizer, model
+ + +
+[docs] + def load_trainer(self) -> DecoderOnlyRerankerTrainer: + """Load the trainer. + + Returns: + DecoderOnlyRerankerTrainer: Loaded trainer instance. + """ + trainer = DecoderOnlyRerankerTrainer( + model=self.model, + args=self.training_args, + train_dataset=self.train_dataset, + data_collator=self.data_collator, + tokenizer=self.tokenizer + ) + return trainer
+ + +
+[docs] + def run(self): + """ + Run the finetuning. + """ + Path(self.training_args.output_dir).mkdir(parents=True, exist_ok=True) + + # Training + self.trainer.train(resume_from_checkpoint=self.training_args.resume_from_checkpoint) + self.trainer.save_model() + + # save merged model + if self.model_args.save_merged_lora_model and self.training_args.process_index == 0: + save_merged_model(self.model_args, self.training_args.output_dir)
\ No newline at end of file diff --git a/_modules/FlagEmbedding/finetune/reranker/decoder_only/layerwise/trainer.html b/_modules/FlagEmbedding/finetune/reranker/decoder_only/layerwise/trainer.html new file mode 100644 index 00000000..33fa640d --- /dev/null +++ b/_modules/FlagEmbedding/finetune/reranker/decoder_only/layerwise/trainer.html @@ -0,0 +1,541 @@

Source code for FlagEmbedding.finetune.reranker.decoder_only.layerwise.trainer

+import os
+import torch
+import logging
+from typing import Optional
+# from transformers.deepspeed import is_deepspeed_zero3_enabled
+from peft import get_peft_model_state_dict
+
+from FlagEmbedding.abc.finetune.reranker import AbsRerankerTrainer
+
+logger = logging.getLogger(__name__)
+
+
+
+[docs] +class DecoderOnlyRerankerTrainer(AbsRerankerTrainer): + """ + Trainer class for decoder only layerwise reranker models. + """ + def _save(self, output_dir: Optional[str] = None, state_dict=None): + """Save the model to directory. + + Args: + output_dir (Optional[str], optional): Output directory to save the model. Defaults to ``None``. + + Raises: + NotImplementedError + """ + output_dir = output_dir if output_dir is not None else self.args.output_dir + os.makedirs(output_dir, exist_ok=True) + logger.info("Saving model checkpoint to %s", output_dir) + # Save a trained model and configuration using `save_pretrained()`. + # They can then be reloaded using `from_pretrained()` + if not hasattr(self.model, 'save'): + raise NotImplementedError( + f'MODEL {self.model.__class__.__name__} ' + f'does not support save interface') + else: + self.model.save(output_dir) + + if self.tokenizer is not None and self.is_world_process_zero(): + self.tokenizer.save_pretrained(output_dir) + + torch.save(self.args, os.path.join(output_dir, "training_args.bin"))
+ + + # if is_deepspeed_zero3_enabled(): + # if state_dict is None: + # state_dict = self.model.state_dict() + # prefix = 'model.' + # assert all(k.startswith(prefix) for k in state_dict.keys()), list(state_dict.keys()) + # state_dict = {k[len(prefix):]: v for k, v in state_dict.items()} + # lora_state_dict = get_peft_model_state_dict(self.model.model, state_dict) + # if self.args.process_index <= 0: + # torch.save(lora_state_dict, os.path.join(output_dir, "adapter_model.bin")) + # print(f"Save adapter model at {output_dir}") +
\ No newline at end of file diff --git a/_modules/FlagEmbedding/finetune/reranker/encoder_only/base/modeling.html b/_modules/FlagEmbedding/finetune/reranker/encoder_only/base/modeling.html new file mode 100644 index 00000000..683a6b9f --- /dev/null +++ b/_modules/FlagEmbedding/finetune/reranker/encoder_only/base/modeling.html @@ -0,0 +1,530 @@

Source code for FlagEmbedding.finetune.reranker.encoder_only.base.modeling

+from transformers import PreTrainedModel, AutoTokenizer
+import logging
+
+from FlagEmbedding.abc.finetune.reranker import AbsRerankerModel
+
+logger = logging.getLogger(__name__)
+
+
+
+[docs] +class CrossEncoderModel(AbsRerankerModel): + """Model class for reranker. + + Args: + base_model (PreTrainedModel): The underlying pre-trained model used for encoding and scoring input pairs. + tokenizer (AutoTokenizer, optional): The tokenizer for encoding input text. Defaults to ``None``. + train_batch_size (int, optional): The batch size to use. Defaults to ``4``. + """ + def __init__( + self, + base_model: PreTrainedModel, + tokenizer: AutoTokenizer = None, + train_batch_size: int = 4, + ): + super().__init__( + base_model, + tokenizer=tokenizer, + train_batch_size=train_batch_size, + ) + +
+[docs] + def encode(self, features): + """Encodes input features to logits. + + Args: + features (dict): Dictionary with input features. + + Returns: + torch.Tensor: The logits output from the model. + """ + return self.model(**features, return_dict=True).logits
\ No newline at end of file diff --git a/_modules/FlagEmbedding/finetune/reranker/encoder_only/base/runner.html b/_modules/FlagEmbedding/finetune/reranker/encoder_only/base/runner.html new file mode 100644 index 00000000..5eb2680c --- /dev/null +++ b/_modules/FlagEmbedding/finetune/reranker/encoder_only/base/runner.html @@ -0,0 +1,570 @@

Source code for FlagEmbedding.finetune.reranker.encoder_only.base.runner

+import logging
+from typing import Tuple
+from transformers import (
+    AutoModelForSequenceClassification, AutoConfig,
+    AutoTokenizer, PreTrainedTokenizer
+)
+
+from FlagEmbedding.abc.finetune.reranker import AbsRerankerRunner, AbsRerankerModel
+from FlagEmbedding.finetune.reranker.encoder_only.base.modeling import CrossEncoderModel
+from FlagEmbedding.finetune.reranker.encoder_only.base.trainer import EncoderOnlyRerankerTrainer
+
+logger = logging.getLogger(__name__)
+
+
+
+[docs] +class EncoderOnlyRerankerRunner(AbsRerankerRunner): + """ + Encoder only reranker runner for finetuning. + """ +
+[docs] + def load_tokenizer_and_model(self) -> Tuple[PreTrainedTokenizer, AbsRerankerModel]: + """Load the tokenizer and model. + + Returns: + Tuple[PreTrainedTokenizer, AbsEmbedderModel]: Tokenizer and model instances. + """ + tokenizer = AutoTokenizer.from_pretrained( + self.model_args.model_name_or_path, + cache_dir=self.model_args.cache_dir, + token=self.model_args.token, + trust_remote_code=self.model_args.trust_remote_code + ) + + num_labels = 1 + config = AutoConfig.from_pretrained( + self.model_args.config_name if self.model_args.config_name else self.model_args.model_name_or_path, + num_labels=num_labels, + cache_dir=self.model_args.cache_dir, + token=self.model_args.token, + trust_remote_code=self.model_args.trust_remote_code, + ) + logger.info('Config: %s', config) + + base_model = AutoModelForSequenceClassification.from_pretrained( + self.model_args.model_name_or_path, + config=config, + cache_dir=self.model_args.cache_dir, + token=self.model_args.token, + from_tf=bool(".ckpt" in self.model_args.model_name_or_path), + trust_remote_code=self.model_args.trust_remote_code + ) + + model = CrossEncoderModel( + base_model, + tokenizer=tokenizer, + train_batch_size=self.training_args.per_device_train_batch_size, + ) + + if self.training_args.gradient_checkpointing: + model.enable_input_require_grads() + + return tokenizer, model
+ + +
+[docs] + def load_trainer(self) -> EncoderOnlyRerankerTrainer: + """Load the trainer. + + Returns: + EncoderOnlyRerankerTrainer: Loaded trainer instance. + """ + trainer = EncoderOnlyRerankerTrainer( + model=self.model, + args=self.training_args, + train_dataset=self.train_dataset, + data_collator=self.data_collator, + tokenizer=self.tokenizer + ) + return trainer
\ No newline at end of file diff --git a/_modules/FlagEmbedding/finetune/reranker/encoder_only/base/trainer.html b/_modules/FlagEmbedding/finetune/reranker/encoder_only/base/trainer.html new file mode 100644 index 00000000..40a60095 --- /dev/null +++ b/_modules/FlagEmbedding/finetune/reranker/encoder_only/base/trainer.html @@ -0,0 +1,526 @@

Source code for FlagEmbedding.finetune.reranker.encoder_only.base.trainer

+import os
+import torch
+import logging
+from typing import Optional
+
+from FlagEmbedding.abc.finetune.reranker import AbsRerankerTrainer
+
+logger = logging.getLogger(__name__)
+
+
+
+[docs] +class EncoderOnlyRerankerTrainer(AbsRerankerTrainer): + """ + Trainer class for encoder only base reranker models. + """ + def _save(self, output_dir: Optional[str] = None, state_dict=None): + """Save the model to directory. + + Args: + output_dir (Optional[str], optional): Output directory to save the model. Defaults to ``None``. + + Raises: + NotImplementedError + """ + output_dir = output_dir if output_dir is not None else self.args.output_dir + os.makedirs(output_dir, exist_ok=True) + logger.info("Saving model checkpoint to %s", output_dir) + # Save a trained model and configuration using `save_pretrained()`. + # They can then be reloaded using `from_pretrained()` + if not hasattr(self.model, 'save_pretrained'): + raise NotImplementedError(f'MODEL {self.model.__class__.__name__} ' f'does not support save_pretrained interface') + else: + self.model.save_pretrained(output_dir) + if self.tokenizer is not None and self.is_world_process_zero(): + self.tokenizer.save_pretrained(output_dir) + + # Good practice: save your training arguments together with the trained model + torch.save(self.args, os.path.join(output_dir, "training_args.bin"))
\ No newline at end of file diff --git a/_modules/FlagEmbedding/inference/auto_embedder.html b/_modules/FlagEmbedding/inference/auto_embedder.html new file mode 100644 index 00000000..7768d193 --- /dev/null +++ b/_modules/FlagEmbedding/inference/auto_embedder.html @@ -0,0 +1,603 @@

Source code for FlagEmbedding.inference.auto_embedder

+import os
+import logging
+from typing import List, Union, Optional
+
+from FlagEmbedding.inference.embedder.model_mapping import (
+    EmbedderModelClass,
+    AUTO_EMBEDDER_MAPPING, EMBEDDER_CLASS_MAPPING
+)
+
+logger = logging.getLogger(__name__)
+
+
+
+[docs] +class FlagAutoModel: + """ + Automatically choose the appropriate class to load the embedding model. + """ + def __init__(self): + raise EnvironmentError( + "FlagAutoModel is designed to be instantiated using the `FlagAutoModel.from_finetuned(model_name_or_path)` method." + ) + +
+[docs] + @classmethod + def from_finetuned( + cls, + model_name_or_path: str, + model_class: Optional[Union[str, EmbedderModelClass]] = None, + normalize_embeddings: bool = True, + use_fp16: bool = True, + query_instruction_for_retrieval: Optional[str] = None, + devices: Optional[Union[str, List[str]]] = None, + pooling_method: Optional[str] = None, + trust_remote_code: Optional[bool] = None, + query_instruction_format: Optional[str] = None, + **kwargs, + ): + """ + Load a finetuned model according to the provided vars. + + Args: + model_name_or_path (str): If it's a path to a local model, it loads the model from the path. Otherwise tries to download and + load a model from HuggingFace Hub with the name. + model_class (Optional[Union[str, EmbedderModelClass]], optional): The embedder class to use. Defaults to :data:`None`. + normalize_embeddings (bool, optional): If True, the output embedding will be a Numpy array. Otherwise, it will be a Torch Tensor. + Defaults to :data:`True`. + use_fp16 (bool, optional): If true, use half-precision floating-point to speed up computation with a slight performance + degradation. Defaults to :data:`True`. + query_instruction_for_retrieval (Optional[str], optional): Query instruction for retrieval tasks, which will be used with + :attr:`query_instruction_format`. Defaults to :data:`None`. + devices (Optional[Union[str, List[str]]], optional): Devices to use for model inference. Defaults to :data:`None`. + pooling_method (Optional[str], optional): Pooling method to get embedding vector from the last hidden state. Defaults to :data:`None`. + trust_remote_code (Optional[bool], optional): trust_remote_code for HF datasets or models. Defaults to :data:`None`. + query_instruction_format (Optional[str], optional): The template for :attr:`query_instruction_for_retrieval`. Defaults to :data:`None`. + + Raises: + ValueError + + Returns: + AbsEmbedder: The model class to load model, which is child class of :class:`AbsEmbedder`. + """ + model_name = os.path.basename(model_name_or_path) + if model_name.startswith("checkpoint-"): + model_name = os.path.basename(os.path.dirname(model_name_or_path)) + + if model_class is not None: + _model_class = EMBEDDER_CLASS_MAPPING[EmbedderModelClass(model_class)] + if pooling_method is None: + pooling_method = _model_class.DEFAULT_POOLING_METHOD + logger.warning( + f"`pooling_method` is not specified, use default pooling method '{pooling_method}'." + ) + if trust_remote_code is None: + trust_remote_code = False + logger.warning( + f"`trust_remote_code` is not specified, set to default value '{trust_remote_code}'." + ) + if query_instruction_format is None: + query_instruction_format = "{}{}" + logger.warning( + f"`query_instruction_format` is not specified, set to default value '{query_instruction_format}'." + ) + else: + if model_name not in AUTO_EMBEDDER_MAPPING: + raise ValueError( + f"Model name '{model_name}' not found in the model mapping. You can pull request to add the model to " + "`https://github.com/FlagOpen/FlagEmbedding/tree/master/FlagEmbedding/inference/embedder/model_mapping.py`. " + "If need, you can create a new `<model>.py` file in `https://github.com/FlagOpen/FlagEmbedding/tree/master/FlagEmbedding/inference/embedder/encoder_only` " + "or `https://github.com/FlagOpen/FlagEmbedding/tree/master/FlagEmbedding/inference/embedder/decoder_only`. " + "Welcome to contribute! You can also directly specify the corresponding `model_class` to instantiate the model." 
+ ) + + model_config = AUTO_EMBEDDER_MAPPING[model_name] + + _model_class = model_config.model_class + if pooling_method is None: + pooling_method = model_config.pooling_method.value + if trust_remote_code is None: + trust_remote_code = model_config.trust_remote_code + if query_instruction_format is None: + query_instruction_format = model_config.query_instruction_format + + return _model_class( + model_name_or_path, + normalize_embeddings=normalize_embeddings, + use_fp16=use_fp16, + query_instruction_for_retrieval=query_instruction_for_retrieval, + query_instruction_format=query_instruction_format, + devices=devices, + pooling_method=pooling_method, + trust_remote_code=trust_remote_code, + **kwargs, + )
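A typical call to the factory above; the model name is illustrative (any key in AUTO_EMBEDDER_MAPPING, or an explicit ``model_class``, works), the import path is assumed to be the package root, and ``encode`` is assumed from the AbsEmbedder interface of the returned class.

from FlagEmbedding import FlagAutoModel  # assumed re-export at package root

model = FlagAutoModel.from_finetuned(
    'BAAI/bge-m3',        # illustrative model name
    use_fp16=True,
    devices=['cuda:0'],   # or None to let the embedder decide
)
embeddings = model.encode(["what is bge-m3?"])  # assumed AbsEmbedder method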
+
\ No newline at end of file diff --git a/_modules/FlagEmbedding/inference/auto_reranker.html b/_modules/FlagEmbedding/inference/auto_reranker.html new file mode 100644 index 00000000..e48dc2d7 --- /dev/null +++ b/_modules/FlagEmbedding/inference/auto_reranker.html @@ -0,0 +1,573 @@

Source code for FlagEmbedding.inference.auto_reranker

+import os
+import logging
+from typing import Union, Optional
+
+from FlagEmbedding.inference.reranker.model_mapping import (
+    RerankerModelClass,
+    RERANKER_CLASS_MAPPING,
+    AUTO_RERANKER_MAPPING
+)
+
+logger = logging.getLogger(__name__)
+
+
+
+[docs] +class FlagAutoReranker: + """ + Automatically choose the appropriate class to load the reranker model. + """ + def __init__(self): + raise EnvironmentError( + "FlagAutoReranker is designed to be instantiated using the `FlagAutoReranker.from_finetuned(model_name_or_path)` method." + ) + +
+[docs] + @classmethod + def from_finetuned( + cls, + model_name_or_path: str, + model_class: Optional[Union[str, RerankerModelClass]] = None, + use_fp16: bool = False, + trust_remote_code: Optional[bool] = None, + **kwargs, + ): + """ + Load a finetuned model according to the provided vars. + + Args: + model_name_or_path (str): If it's a path to a local model, it loads the model from the path. Otherwise tries to download and + load a model from HuggingFace Hub with the name. + model_class (Optional[Union[str, RerankerModelClass]], optional): The reranker class to use.. Defaults to :data:`None`. + use_fp16 (bool, optional): If true, use half-precision floating-point to speed up computation with a slight performance + degradation. Defaults to :data:`False`. + trust_remote_code (Optional[bool], optional): trust_remote_code for HF datasets or models. Defaults to :data:`None`. + + Raises: + ValueError + + Returns: + AbsReranker: The reranker class to load model, which is child class of :class:`AbsReranker`. + """ + model_name = os.path.basename(model_name_or_path) + if model_name.startswith("checkpoint-"): + model_name = os.path.basename(os.path.dirname(model_name_or_path)) + + if model_class is not None: + _model_class = RERANKER_CLASS_MAPPING[RerankerModelClass(model_class)] + if trust_remote_code is None: + trust_remote_code = False + logging.warning( + f"`trust_remote_code` is not specified, set to default value '{trust_remote_code}'." + ) + else: + if model_name not in AUTO_RERANKER_MAPPING: + raise ValueError( + f"Model name '{model_name}' not found in the model mapping. You can pull request to add the model to " + "`https://github.com/FlagOpen/FlagEmbedding/tree/master/FlagEmbedding/inference/reranker/model_mapping.py`. " + "If need, you can create a new `<model>.py` file in `https://github.com/FlagOpen/FlagEmbedding/tree/master/FlagEmbedding/inference/reranker/encoder_only` " + "or `https://github.com/FlagOpen/FlagEmbedding/tree/master/FlagEmbedding/inference/reranker/decoder_only`. " + "Welcome to contribute! You can also directly specify the corresponding `model_class` to instantiate the model." + ) + + model_config = AUTO_RERANKER_MAPPING[model_name] + + _model_class = model_config.model_class + if trust_remote_code is None: + trust_remote_code = model_config.trust_remote_code + + return _model_class( + model_name_or_path, + use_fp16=use_fp16, + trust_remote_code=trust_remote_code, + **kwargs, + )
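And the reranker counterpart; the model name is illustrative, and ``compute_score`` is assumed from the AbsReranker interface of the returned class.

from FlagEmbedding import FlagAutoReranker  # assumed re-export at package root

reranker = FlagAutoReranker.from_finetuned('BAAI/bge-reranker-v2-m3', use_fp16=True)
scores = reranker.compute_score([
    ['what is panda?', 'The giant panda is a bear species endemic to China.']
])  # assumed AbsReranker method
print(scores)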
\ No newline at end of file diff --git a/_modules/FlagEmbedding/inference/embedder/decoder_only/base.html b/_modules/FlagEmbedding/inference/embedder/decoder_only/base.html new file mode 100644 index 00000000..f7836028 --- /dev/null +++ b/_modules/FlagEmbedding/inference/embedder/decoder_only/base.html @@ -0,0 +1,796 @@

Source code for FlagEmbedding.inference.embedder.decoder_only.base

+from tqdm import tqdm, trange
+from typing import cast, Any, List, Union, Optional
+
+import torch
+import numpy as np
+from transformers import AutoModel, AutoTokenizer
+
+from FlagEmbedding.abc.inference import AbsEmbedder
+
+
+# Pooling function for LLM-based embedding models
+def last_token_pool(last_hidden_states: torch.Tensor,
+                    attention_mask: torch.Tensor) -> torch.Tensor:
+    """Last token pooling method.
+
+    Args:
+        last_hidden_states (torch.Tensor): The last hidden states output by the model.
+        attention_mask (torch.Tensor): The attention mask of the input batch.
+
+    Returns:
+        torch.Tensor: The embedding vectors after pooling.
+    """
+    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
+    if left_padding:
+        return last_hidden_states[:, -1]
+    else:
+        sequence_lengths = attention_mask.sum(dim=1) - 1
+        batch_size = last_hidden_states.shape[0]
+        return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]
+
+
+
+[docs] +class BaseLLMEmbedder(AbsEmbedder): + """Base embedder class for LLM like decoder only models. + + Args: + model_name_or_path (str): If it's a path to a local model, it loads the model from the path. Otherwise tries to download and + load a model from HuggingFace Hub with the name. + normalize_embeddings (bool, optional): If True, normalize the embedding vector. Defaults to :data:`True`. + use_fp16 (bool, optional): If true, use half-precision floating-point to speed up computation with a slight performance + degradation. Defaults to :data:`True`. + query_instruction_for_retrieval (Optional[str], optional): Query instruction for retrieval tasks, which will be used with + with :attr:`query_instruction_format`. Defaults to :data:`None`. + query_instruction_format (str, optional): The template for :attr:`query_instruction_for_retrieval`. Defaults to :data:`"{}{}"`. + devices (Optional[Union[str, int, List[str], List[int]]], optional): Devices to use for model inference. Defaults to :data:`None`. + trust_remote_code (bool, optional): trust_remote_code for HF datasets or models. Defaults to :data:`False`. + cache_dir (Optional[str], optional): Cache directory for the model. Defaults to :data:`None`. + batch_size (int, optional): Batch size for inference. Defaults to :data:`256`. + query_max_length (int, optional): Maximum length for query. Defaults to :data:`512`. + passage_max_length (int, optional): Maximum length for passage. Defaults to :data:`512`. + convert_to_numpy (bool, optional): If True, the output embedding will be a Numpy array. Otherwise, it will be a Torch Tensor. + Defaults to :data:`True`. + + Attributes: + DEFAULT_POOLING_METHOD: The default pooling method when running the model. + """ + DEFAULT_POOLING_METHOD = "last_token" + + def __init__( + self, + model_name_or_path: str, + normalize_embeddings: bool = True, + use_fp16: bool = True, + query_instruction_for_retrieval: Optional[str] = None, + query_instruction_format: str = "Instruct: {}\nQuery: {}", # specify the format of query_instruction_for_retrieval + devices: Optional[Union[str, List[str]]] = None, # specify devices, such as "cuda:0" or ["cuda:0", "cuda:1"] + # Additional parameters for BaseLLMEmbedder + trust_remote_code: bool = False, + cache_dir: Optional[str] = None, + # inference + batch_size: int = 256, + query_max_length: int = 512, + passage_max_length: int = 512, + convert_to_numpy: bool = True, + **kwargs: Any, + ): + super().__init__( + model_name_or_path, + normalize_embeddings=normalize_embeddings, + use_fp16=use_fp16, + query_instruction_for_retrieval=query_instruction_for_retrieval, + query_instruction_format=query_instruction_format, + devices=devices, + batch_size=batch_size, + query_max_length=query_max_length, + passage_max_length=passage_max_length, + convert_to_numpy=convert_to_numpy, + **kwargs + ) + + self.tokenizer = AutoTokenizer.from_pretrained( + model_name_or_path, + trust_remote_code=trust_remote_code, + cache_dir=cache_dir + ) + self.model = AutoModel.from_pretrained( + model_name_or_path, + trust_remote_code=trust_remote_code, + cache_dir=cache_dir + ) + + if self.kwargs.get("pooling_method", "last_token") != "last_token": + raise ValueError("Pooling method must be 'last_token' for LLM-based models.") + +
+[docs] + def encode_queries( + self, + queries: Union[List[str], str], + batch_size: Optional[int] = None, + max_length: Optional[int] = None, + convert_to_numpy: Optional[bool] = None, + **kwargs: Any + ) -> Union[np.ndarray, torch.Tensor]: + """Encode the queries. + + Args: + queries (Union[List[str], str]): Input queries to encode. + batch_size (Optional[int], optional): Number of sentences for each iter. Defaults to :data:`None`. + max_length (Optional[int], optional): Maximum length of tokens. Defaults to :data:`None`. + convert_to_numpy (Optional[bool], optional): If True, the output embedding will be a Numpy array. Otherwise, it will + be a Torch Tensor. Defaults to :data:`None`. + + Returns: + Union[torch.Tensor, np.ndarray]: Return the embedding vectors in a numpy array or tensor. + """ + return super().encode_queries( + queries, + batch_size=batch_size, + max_length=max_length, + convert_to_numpy=convert_to_numpy, + **kwargs + )
+ + +
+[docs] + def encode_corpus( + self, + corpus: Union[List[str], str], + batch_size: Optional[int] = None, + max_length: Optional[int] = None, + convert_to_numpy: Optional[bool] = None, + **kwargs: Any + ) -> Union[np.ndarray, torch.Tensor]: + """Encode the corpus. + + Args: + corpus (Union[List[str], str]): Input corpus to encode. + batch_size (Optional[int], optional): Number of sentences for each iter. Defaults to :data:`None`. + max_length (Optional[int], optional): Maximum length of tokens. Defaults to :data:`None`. + convert_to_numpy (Optional[bool], optional): If True, the output embedding will be a Numpy array. Otherwise, it will + be a Torch Tensor. Defaults to :data:`None`. + + Returns: + Union[torch.Tensor, np.ndarray]: Return the embedding vectors in a numpy array or tensor. + """ + return super().encode_corpus( + corpus, + batch_size=batch_size, + max_length=max_length, + convert_to_numpy=convert_to_numpy, + **kwargs + )
+ + +
+[docs] + def encode( + self, + sentences: Union[List[str], str], + batch_size: Optional[int] = None, + max_length: Optional[int] = None, + convert_to_numpy: Optional[bool] = None, + **kwargs: Any + ) -> Union[np.ndarray, torch.Tensor]: + """Encode the input sentences with the embedding model. + + Args: + sentences (Union[List[str], str]): Input sentences to encode. + batch_size (Optional[int], optional): Number of sentences for each iter. Defaults to :data:`None`. + max_length (Optional[int], optional): Maximum length of tokens. Defaults to :data:`None`. + convert_to_numpy (Optional[bool], optional): If True, the output embedding will be a Numpy array. Otherwise, it will + be a Torch Tensor. Defaults to :data:`None`. + + Returns: + Union[torch.Tensor, np.ndarray]: return the embedding vectors in a numpy array or tensor. + """ + return super().encode( + sentences, + batch_size=batch_size, + max_length=max_length, + convert_to_numpy=convert_to_numpy, + **kwargs + )
+ + +
+[docs] + @torch.no_grad() + def encode_single_device( + self, + sentences: Union[List[str], str], + batch_size: int = 256, + max_length: int = 512, + convert_to_numpy: bool = True, + device: Optional[str] = None, + **kwargs: Any # add `pad_to_multiple_of=8` for bge-multilingual-gemmma2 + ): + """Encode input sentences by a single device. + + Args: + sentences (Union[List[str], str]): Input sentences to encode. + batch_size (int, optional): Number of sentences for each iter. Defaults to :data:`256`. + max_length (int, optional): Maximum length of tokens. Defaults to :data:`512`. + convert_to_numpy (bool, optional): If True, the output embedding will be a Numpy array. Otherwise, it will + be a Torch Tensor. Defaults to :data:`True`. + device (Optional[str], optional): Device to use for encoding. Defaults to None. + + Returns: + Union[torch.Tensor, np.ndarray]: return the embedding vectors in a numpy array or tensor. + """ + if device is None: + device = self.target_devices[0] + + if device == "cpu": self.use_fp16 = False + if self.use_fp16: self.model.half() + + self.model.to(device) + self.model.eval() + + input_was_string = False + if isinstance(sentences, str): + sentences = [sentences] + input_was_string = True + + # tokenize without padding to get the correct length + all_inputs = [] + for start_index in trange(0, len(sentences), batch_size, desc='pre tokenize', + disable=len(sentences) < 256): + sentences_batch = sentences[start_index:start_index + batch_size] + inputs_batch = self.tokenizer( + sentences_batch, + truncation=True, + max_length=max_length, + **kwargs + ) + inputs_batch = [{ + k: inputs_batch[k][i] for k in inputs_batch.keys() + } for i in range(len(sentences_batch))] + all_inputs.extend(inputs_batch) + + # sort by length for less padding + length_sorted_idx = np.argsort([-len(x['input_ids']) for x in all_inputs]) + all_inputs_sorted = [all_inputs[i] for i in length_sorted_idx] + + # adjust batch size + flag = False + while flag is False: + try: + inputs_batch = self.tokenizer.pad( + all_inputs_sorted[: batch_size], + padding=True, + return_tensors='pt', + **kwargs + ).to(device) + last_hidden_state = self.model(**inputs_batch, return_dict=True).last_hidden_state + embeddings = last_token_pool(last_hidden_state, inputs_batch['attention_mask']) + flag = True + except RuntimeError as e: + batch_size = batch_size * 3 // 4 + except torch.OutofMemoryError as e: + batch_size = batch_size * 3 // 4 + + # encode + all_embeddings = [] + for start_index in tqdm(range(0, len(sentences), batch_size), desc="Inference Embeddings", + disable=len(sentences) < 256): + inputs_batch = all_inputs_sorted[start_index:start_index + batch_size] + inputs_batch = self.tokenizer.pad( + inputs_batch, + padding=True, + return_tensors='pt', + **kwargs + ).to(device) + last_hidden_state = self.model(**inputs_batch, return_dict=True).last_hidden_state + embeddings = last_token_pool(last_hidden_state, inputs_batch['attention_mask']) + if self.normalize_embeddings: + embeddings = torch.nn.functional.normalize(embeddings, dim=-1) + embeddings = cast(torch.Tensor, embeddings) + + if convert_to_numpy: + embeddings = embeddings.cpu().numpy() + all_embeddings.append(embeddings) + + if convert_to_numpy: + all_embeddings = np.concatenate(all_embeddings, axis=0) + else: + all_embeddings = torch.cat(all_embeddings, dim=0) + + # adjust the order of embeddings + all_embeddings = all_embeddings[np.argsort(length_sorted_idx)] + + # return the embeddings + if input_was_string: + return all_embeddings[0] + return 
all_embeddings
+
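# Illustrative usage sketch (not part of the module above). "BAAI/bge-multilingual-gemma2"
# is only an example of a decoder-only embedding checkpoint, and the instruction text is a
# typical retrieval instruction; both can be replaced freely.
embedder = BaseLLMEmbedder(
    "BAAI/bge-multilingual-gemma2",
    query_instruction_for_retrieval="Given a web search query, retrieve relevant passages that answer the query.",
    use_fp16=True,
)
q_emb = embedder.encode_queries(["what is a panda?"])
p_emb = embedder.encode_corpus(["The giant panda is a bear species endemic to China."])
print(q_emb @ p_emb.T)  # embeddings are normalized by default, so this is cosine similarity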
\ No newline at end of file diff --git a/_modules/FlagEmbedding/inference/embedder/decoder_only/icl.html b/_modules/FlagEmbedding/inference/embedder/decoder_only/icl.html new file mode 100644 index 00000000..057f0283 --- /dev/null +++ b/_modules/FlagEmbedding/inference/embedder/decoder_only/icl.html @@ -0,0 +1,1067 @@

Source code for FlagEmbedding.inference.embedder.decoder_only.icl

+from tqdm import tqdm, trange
+from typing import cast, Any, List, Union, Optional
+
+import queue
+from multiprocessing import Queue
+
+import gc
+import torch
+import numpy as np
+from transformers import AutoModel, AutoTokenizer
+
+from FlagEmbedding.abc.inference import AbsEmbedder
+
+
+# Pooling function for LLM-based embedding models
+def last_token_pool(last_hidden_states: torch.Tensor,
+                    attention_mask: torch.Tensor) -> torch.Tensor:
+    """Last token pooling method.
+
+    Args:
+        last_hidden_states (torch.Tensor): The last hidden states output by the model.
+        attention_mask (torch.Tensor): The attention mask of the input batch.
+
+    Returns:
+        torch.Tensor: The embedding vectors after pooling.
+    """
+    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
+    if left_padding:
+        return last_hidden_states[:, -1]
+    else:
+        sequence_lengths = attention_mask.sum(dim=1) - 1
+        batch_size = last_hidden_states.shape[0]
+        return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]
+
+
+
+[docs] +class ICLLLMEmbedder(AbsEmbedder): + """ + Embedder class for BGE-EN-icl. + + Args: + model_name_or_path (str): If it's a path to a local model, it loads the model from the path. Otherwise tries to download and + load a model from HuggingFace Hub with the name. + normalize_embeddings (bool, optional): If True, normalize the embedding vector. Defaults to :data:`True`. + use_fp16 (bool, optional) If true, use half-precision floating-point to speed up computation with a slight performance + degradation. Defaults to :data:`True`. + query_instruction_for_retrieval (Optional[str], optional): Query instruction for retrieval tasks, which will be used with + with :attr:`query_instruction_format`. Defaults to :data:`None`. + query_instruction_format (str, optional): The template for :attr:`query_instruction_for_retrieval`. Defaults to :data:`"{}{}"`. + devices (Optional[Union[str, int, List[str], List[int]]], optional): Devices to use for model inference. Defaults to :data:`None`. + examples_for_task (Optional[List[dict]], optional): Few-shot examples for the model to enhance model's ability. + Defaults to :data:`None`. + examples_instruction_format (str, optional): Example format when using :attr:`examples_for_task`. + trust_remote_code (bool, optional): trust_remote_code for HF datasets or models. Defaults to :data:`False`. + cache_dir (Optional[str], optional): Cache directory for the model. Defaults to :data:`None`. + batch_size (int, optional): Batch size for inference. Defaults to :data:`256`. + query_max_length (int, optional): Maximum length for query. Defaults to :data:`512`. + passage_max_length (int, optional): Maximum length for passage. Defaults to :data:`512`. + convert_to_numpy (bool, optional): If True, the output embedding will be a Numpy array. Otherwise, it will be a Torch Tensor. + Defaults to :data:`True`. + + Attributes: + DEFAULT_POOLING_METHOD: The default pooling method when running the model. 
+ """ + DEFAULT_POOLING_METHOD = "last_token" + + def __init__( + self, + model_name_or_path: str, + normalize_embeddings: bool = True, + use_fp16: bool = True, + query_instruction_for_retrieval: Optional[str] = None, + query_instruction_format: str = "<instruct>{}\n<query>{}", # specify the format of query_instruction_for_retrieval + suffix: str = '\n<response>', + devices: Optional[Union[str, List[str]]] = None, # specify devices, such as "cuda:0" or ["cuda:0", "cuda:1"] + # Additional parameters for ICLLLMEmbedder + examples_for_task: Optional[List[dict]] = None, + examples_instruction_format: str = "<instruct>{}\n<query>{}\n<response>{}", # specify the format of examples_for_task + trust_remote_code: bool = False, + cache_dir: Optional[str] = None, + # inference + batch_size: int = 256, + query_max_length: int = 512, + passage_max_length: int = 512, + convert_to_numpy: bool = True, + **kwargs: Any, + ): + query_instruction_format = query_instruction_format.replace('\\n', '\n') + examples_instruction_format = examples_instruction_format.replace('\\n', '\n') + super().__init__( + model_name_or_path, + normalize_embeddings=normalize_embeddings, + use_fp16=use_fp16, + query_instruction_for_retrieval=query_instruction_for_retrieval, + query_instruction_format=query_instruction_format, + devices=devices, + batch_size=batch_size, + query_max_length=query_max_length, + passage_max_length=passage_max_length, + convert_to_numpy=convert_to_numpy, + **kwargs + ) + + self.tokenizer = AutoTokenizer.from_pretrained( + model_name_or_path, + trust_remote_code=trust_remote_code, + cache_dir=cache_dir + ) + self.model = AutoModel.from_pretrained( + model_name_or_path, + trust_remote_code=trust_remote_code, + cache_dir=cache_dir + ) + self.examples_for_task = examples_for_task + self.examples_instruction_format = examples_instruction_format + + if self.kwargs.get("pooling_method", "last_token") != "last_token": + raise ValueError("Pooling method must be 'last_token' for LLM-based models.") + + self.set_examples() + self.suffix = suffix + + self.query_pool = None + + def __del__(self): + self.stop_self_pool() + self.stop_self_query_pool() + +
+[docs] + def set_examples(self, examples_for_task: Optional[List[dict]] = None): + """Set the prefix to the provided examples. + + Args: + examples_for_task (Optional[List[dict]], optional): Few-shot examples for the model to enhance model's ability. + Defaults to :data:`None`. + """ + if examples_for_task is None and self.examples_for_task is None: + self.prefix = '' + elif examples_for_task is not None: + eg_paris = [] + for i in range(len(examples_for_task)): + eg_paris.append( + self.get_detailed_example( + self.examples_instruction_format, + examples_for_task[i].get('instruct', self.query_instruction_for_retrieval), + examples_for_task[i].get('query', ''), + examples_for_task[i].get('response', '') + ) + ) + self.prefix = '\n\n'.join(eg_paris) + '\n\n' + else: + eg_paris = [] + for i in range(len(self.examples_for_task)): + eg_paris.append( + self.get_detailed_example( + self.examples_instruction_format, + self.examples_for_task[i].get('instruct', self.query_instruction_for_retrieval), + self.examples_for_task[i].get('query', ''), + self.examples_for_task[i].get('response', '') + ) + ) + self.prefix = '\n\n'.join(eg_paris) + '\n\n'
+ + +
+[docs] + @staticmethod + def get_detailed_example(instruction_format: str, instruction: str, query: str, response: str): + """Combine the instruction and sentence along with the instruction format. + + Args: + instruction_format (str): Format for instruction. + instruction (str): The text of instruction. + query (str): The text of example query. + response (str): The text of example response. + + Returns: + str: The complete example following the given format. + """ + return instruction_format.format(instruction, query, response)
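# Worked example of the example formatting above (values are invented for illustration):
example_text = ICLLLMEmbedder.get_detailed_example(
    "<instruct>{}\n<query>{}\n<response>{}",
    "Given a web search query, retrieve relevant passages that answer the query.",
    "what is a panda?",
    "The giant panda is a bear species endemic to China.",
)
# example_text places the three fields on separate lines:
#   <instruct>Given a web search query, retrieve relevant passages that answer the query.
#   <query>what is a panda?
#   <response>The giant panda is a bear species endemic to China.
# set_examples() joins such strings with blank lines to build the prompt prefix.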
+ + + def stop_self_query_pool(self): + if self.query_pool is not None: + self.stop_multi_process_pool(self.query_pool) + self.query_pool = None + try: + self.model.to('cpu') + torch.cuda.empty_cache() + except: + pass + gc.collect() + +
+[docs] + def encode_queries( + self, + queries: Union[List[str], str], + batch_size: Optional[int] = None, + max_length: Optional[int] = None, + convert_to_numpy: Optional[bool] = None, + **kwargs: Any + ) -> Union[np.ndarray, torch.Tensor]: + """Encode the queries. + + Args: + queries (Union[List[str], str]): Input queries to encode. + batch_size (Optional[int], optional): Number of sentences for each iter. Defaults to :data:`None`. + max_length (Optional[int], optional): Maximum length of tokens. Defaults to :data:`None`. + convert_to_numpy (Optional[bool], optional): If True, the output embedding will be a Numpy array. Otherwise, it will + be a Torch Tensor. Defaults to :data:`None`. + + Returns: + Union[torch.Tensor, np.ndarray]: Return the embedding vectors in a numpy array or tensor. + """ + if batch_size is None: batch_size = self.batch_size + if max_length is None: max_length = self.query_max_length + if convert_to_numpy is None: convert_to_numpy = self.convert_to_numpy + + if isinstance(queries, str) or len(self.target_devices) == 1: + return self.encode_queries_single_device( + queries, + batch_size=batch_size, + max_length=max_length, + convert_to_numpy=convert_to_numpy, + device=self.target_devices[0], + **kwargs + ) + + self.stop_self_pool() + if self.query_pool is None: + self.query_pool = self.start_multi_process_pool(ICLLLMEmbedder._encode_queries_multi_process_worker) + embeddings = self.encode_multi_process( + queries, + self.query_pool, + batch_size=batch_size, + max_length=max_length, + convert_to_numpy=convert_to_numpy, + **kwargs + ) + return embeddings
+ + +
+[docs] + def encode_corpus( + self, + corpus: Union[List[str], str], + batch_size: Optional[int] = None, + max_length: Optional[int] = None, + convert_to_numpy: Optional[bool] = None, + **kwargs: Any + ) -> Union[np.ndarray, torch.Tensor]: + """Encode the corpus. + + Args: + corpus (Union[List[str], str]): Input corpus to encode. + batch_size (Optional[int], optional): Number of sentences for each iter. Defaults to :data:`None`. + max_length (Optional[int], optional): Maximum length of tokens. Defaults to :data:`None`. + convert_to_numpy (Optional[bool], optional): If True, the output embedding will be a Numpy array. Otherwise, it will + be a Torch Tensor. Defaults to :data:`None`. + + Returns: + Union[torch.Tensor, np.ndarray]: Return the embedding vectors in a numpy array or tensor. + """ + self.stop_self_query_pool() + return super().encode_corpus( + corpus, + batch_size=batch_size, + max_length=max_length, + convert_to_numpy=convert_to_numpy, + **kwargs + )
+ + +
+[docs] + def encode( + self, + sentences: Union[List[str], str], + batch_size: Optional[int] = None, + max_length: Optional[int] = None, + convert_to_numpy: Optional[bool] = None, + **kwargs: Any + ) -> Union[np.ndarray, torch.Tensor]: + """Encode the input sentences with the embedding model. + + Args: + sentences (Union[List[str], str]): Input sentences to encode. + batch_size (Optional[int], optional): Number of sentences for each iter. Defaults to :data:`None`. + max_length (Optional[int], optional): Maximum length of tokens. Defaults to :data:`None`. + convert_to_numpy (Optional[bool], optional): If True, the output embedding will be a Numpy array. Otherwise, it will + be a Torch Tensor. Defaults to :data:`None`. + + Returns: + Union[torch.Tensor, np.ndarray]: return the embedding vectors in a numpy array or tensor. + """ + return super().encode( + sentences, + batch_size=batch_size, + max_length=max_length, + convert_to_numpy=convert_to_numpy, + **kwargs + )
+ + + # adapted from https://github.com/UKPLab/sentence-transformers/blob/1802076d4eae42ff0a5629e1b04e75785d4e193b/sentence_transformers/SentenceTransformer.py#L976 + @staticmethod + def _encode_queries_multi_process_worker( + target_device: str, model: 'ICLLLMEmbedder', input_queue: Queue, results_queue: Queue + ) -> None: + """ + Internal working process to encode sentences in multi-process setup + """ + while True: + try: + chunk_id, sentences, kwargs = ( + input_queue.get() + ) + embeddings = model.encode_queries_single_device( + sentences, + device=target_device, + **kwargs + ) + + results_queue.put([chunk_id, embeddings]) + except queue.Empty: + break + +
+[docs] + @torch.no_grad() + def encode_queries_single_device( + self, + queries: Union[List[str], str], + batch_size: int = 256, + max_length: int = 512, + convert_to_numpy: bool = True, + device: Optional[str] = None, + **kwargs: Any + ): + """Encode queries by a single device. + + Args: + queries (Union[List[str], str]): Input queries to encode. + batch_size (int, optional): Number of queries for each iter. Defaults to :data:`256`. + max_length (int, optional): Maximum length of tokens. Defaults to :data:`512`. + convert_to_numpy (bool, optional): If True, the output embedding will be a Numpy array. Otherwise, it will + be a Torch Tensor. Defaults to :data:`True`. + device (Optional[str], optional): Device to use for encoding. Defaults to None. + + Returns: + Union[torch.Tensor, np.ndarray]: return the embedding vectors in a numpy array or tensor. + """ + if device is None: + device = self.target_devices[0] + + if device == "cpu": self.use_fp16 = False + if self.use_fp16: self.model.half() + + self.model.to(device) + self.model.eval() + + input_was_string = False + if isinstance(queries, str): + queries = [queries] + input_was_string = True + + if self.query_instruction_for_retrieval is not None: + if isinstance(queries, str): + input_texts = self.get_detailed_instruct(self.query_instruction_format, self.query_instruction_for_retrieval, queries) + else: + input_texts = [self.get_detailed_instruct(self.query_instruction_format, self.query_instruction_for_retrieval, query) for query in queries] + else: + input_texts = queries + + prefix_ids = self.tokenizer(self.prefix, add_special_tokens=False)['input_ids'] + suffix_ids = self.tokenizer(self.suffix, add_special_tokens=False)['input_ids'] + + _len_1 = len(self.tokenizer('<s>', add_special_tokens=False)['input_ids']) + _len_2 = len(self.tokenizer(f'{self.suffix}</s>', add_special_tokens=False)['input_ids']) + new_max_length = (len(prefix_ids) + len(suffix_ids) + max_length + 8) // 8 * 8 + 8 + + # tokenize without padding to get the correct length + all_inputs = [] + for start_index in trange(0, len(input_texts), batch_size, desc='pre tokenize'): + sentences_batch = input_texts[start_index:start_index + batch_size] + inputs_batch = self.tokenizer( + sentences_batch, + truncation=True, + max_length=max_length - _len_1 - _len_2, + add_special_tokens=False, + **kwargs + ) + sentences_batch = self.tokenizer.batch_decode(inputs_batch['input_ids']) + for i in range(len(sentences_batch)): + sentences_batch[i] = self.prefix + sentences_batch[i] + self.suffix + inputs_batch = self.tokenizer( + sentences_batch, + truncation=True, + max_length=new_max_length, + **kwargs + ) + inputs_batch = [{ + k: inputs_batch[k][i] for k in inputs_batch.keys() + } for i in range(len(sentences_batch))] + all_inputs.extend(inputs_batch) + + # sort by length for less padding + length_sorted_idx = np.argsort([-len(x['input_ids']) for x in all_inputs]) + all_inputs_sorted = [all_inputs[i] for i in length_sorted_idx] + sentences_sorted = [input_texts[i] for i in length_sorted_idx] + + # adjust batch size + flag = False + while flag is False: + try: + inputs_batch = self.tokenizer.pad( + all_inputs_sorted[: batch_size], + padding=True, + return_tensors='pt', + **kwargs + ).to(device) + last_hidden_state = self.model(**inputs_batch, return_dict=True).last_hidden_state + embeddings = last_token_pool(last_hidden_state, inputs_batch['attention_mask']) + flag = True + except RuntimeError as e: + batch_size = batch_size * 3 // 4 + except torch.OutofMemoryError as e: + batch_size 
= batch_size * 3 // 4 + + # encode + all_embeddings = [] + for start_index in tqdm(range(0, len(sentences_sorted), batch_size), desc="Inference Embeddings", + disable=len(sentences_sorted) < 256): + inputs_batch = all_inputs_sorted[start_index:start_index + batch_size] + inputs_batch = self.tokenizer.pad( + inputs_batch, + padding=True, + return_tensors='pt', + **kwargs + ).to(device) + + last_hidden_state = self.model(**inputs_batch, return_dict=True).last_hidden_state + embeddings = last_token_pool(last_hidden_state, inputs_batch['attention_mask']) + if self.normalize_embeddings: + embeddings = torch.nn.functional.normalize(embeddings, dim=-1) + embeddings = cast(torch.Tensor, embeddings) + + if convert_to_numpy: + embeddings = embeddings.cpu().numpy() + all_embeddings.append(embeddings) + + if convert_to_numpy: + all_embeddings = np.concatenate(all_embeddings, axis=0) + else: + all_embeddings = torch.cat(all_embeddings, dim=0) + + # adjust the order of embeddings + all_embeddings = all_embeddings[np.argsort(length_sorted_idx)] + + # return the embeddings + if input_was_string: + return all_embeddings[0] + return all_embeddings
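# Arithmetic behind new_max_length above, with toy numbers: a 100-token prefix,
# a 5-token suffix and max_length=512 give (100 + 5 + 512 + 8) // 8 * 8 + 8 == 632,
# i.e. the query budget plus the in-context prefix and suffix, rounded to a multiple
# of 8 with a little headroom for special tokens.
assert (100 + 5 + 512 + 8) // 8 * 8 + 8 == 632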
+ + +
+[docs] + @torch.no_grad() + def encode_single_device( + self, + sentences: Union[List[str], str], + batch_size: int = 256, + max_length: int = 512, + convert_to_numpy: bool = True, + device: Optional[str] = None, + **kwargs: Any + ): + """Encode input sentences by a single device. + + Args: + sentences (Union[List[str], str]): Input sentences to encode. + batch_size (int, optional): Number of sentences for each iter. Defaults to :data:`256`. + max_length (int, optional): Maximum length of tokens. Defaults to :data:`512`. + convert_to_numpy (bool, optional): If True, the output embedding will be a Numpy array. Otherwise, it will + be a Torch Tensor. Defaults to :data:`True`. + device (Optional[str], optional): Device to use for encoding. Defaults to None. + + Returns: + Union[torch.Tensor, np.ndarray]: return the embedding vectors in a numpy array or tensor. + """ + if device is None: + device = self.target_devices[0] + + if device == "cpu": self.use_fp16 = False + if self.use_fp16: self.model.half() + + self.model.to(device) + self.model.eval() + + input_was_string = False + if isinstance(sentences, str): + sentences = [sentences] + input_was_string = True + + # tokenize without padding to get the correct length + all_inputs = [] + for start_index in trange(0, len(sentences), batch_size, desc='pre tokenize', + disable=len(sentences) < 256): + sentences_batch = sentences[start_index:start_index + batch_size] + inputs_batch = self.tokenizer( + sentences_batch, + truncation=True, + max_length=max_length, + **kwargs + ) + inputs_batch = [{ + k: inputs_batch[k][i] for k in inputs_batch.keys() + } for i in range(len(sentences_batch))] + all_inputs.extend(inputs_batch) + + # sort by length for less padding + length_sorted_idx = np.argsort([-len(x['input_ids']) for x in all_inputs]) + all_inputs_sorted = [all_inputs[i] for i in length_sorted_idx] + + # adjust batch size + flag = False + while flag is False: + try: + inputs_batch = self.tokenizer.pad( + all_inputs_sorted[: batch_size], + padding=True, + return_tensors='pt', + **kwargs + ).to(device) + last_hidden_state = self.model(**inputs_batch, return_dict=True).last_hidden_state + embeddings = last_token_pool(last_hidden_state, inputs_batch['attention_mask']) + flag = True + except RuntimeError as e: + batch_size = batch_size * 3 // 4 + except torch.OutofMemoryError as e: + batch_size = batch_size * 3 // 4 + + # encode + all_embeddings = [] + for start_index in tqdm(range(0, len(sentences), batch_size), desc="Inference Embeddings", + disable=len(sentences) < 256): + inputs_batch = all_inputs_sorted[start_index:start_index + batch_size] + inputs_batch = self.tokenizer.pad( + inputs_batch, + padding=True, + return_tensors='pt', + **kwargs + ).to(device) + last_hidden_state = self.model(**inputs_batch, return_dict=True).last_hidden_state + embeddings = last_token_pool(last_hidden_state, inputs_batch['attention_mask']) + if self.normalize_embeddings: + embeddings = torch.nn.functional.normalize(embeddings, dim=-1) + embeddings = cast(torch.Tensor, embeddings) + + if convert_to_numpy: + embeddings = embeddings.cpu().numpy() + all_embeddings.append(embeddings) + + if convert_to_numpy: + all_embeddings = np.concatenate(all_embeddings, axis=0) + else: + all_embeddings = torch.cat(all_embeddings, dim=0) + + # adjust the order of embeddings + all_embeddings = all_embeddings[np.argsort(length_sorted_idx)] + + # return the embeddings + if input_was_string: + return all_embeddings[0] + return all_embeddings
+
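# Illustrative usage sketch (not part of the module above). "BAAI/bge-en-icl" is an example
# checkpoint for this in-context-learning embedder, and the few-shot example below is invented.
examples = [{
    "instruct": "Given a web search query, retrieve relevant passages that answer the query.",
    "query": "what is a panda?",
    "response": "The giant panda is a bear species endemic to China.",
}]
embedder = ICLLLMEmbedder(
    "BAAI/bge-en-icl",
    query_instruction_for_retrieval="Given a web search query, retrieve relevant passages that answer the query.",
    examples_for_task=examples,  # rendered into the prompt prefix via set_examples()
    use_fp16=True,
)
q_emb = embedder.encode_queries(["what is a panda?"])       # prefix + instruction + query + suffix
p_emb = embedder.encode_corpus(["The giant panda is a bear species endemic to China."])
print(q_emb @ p_emb.T)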
\ No newline at end of file diff --git a/_modules/FlagEmbedding/inference/embedder/encoder_only/base.html b/_modules/FlagEmbedding/inference/embedder/encoder_only/base.html new file mode 100644 index 00000000..2e1cfe8b --- /dev/null +++ b/_modules/FlagEmbedding/inference/embedder/encoder_only/base.html @@ -0,0 +1,806 @@

Source code for FlagEmbedding.inference.embedder.encoder_only.base

+from tqdm import tqdm, trange
+from typing import cast, Any, List, Union, Optional
+
+import torch
+import numpy as np
+from transformers import AutoModel, AutoTokenizer
+
+from FlagEmbedding.abc.inference import AbsEmbedder
+
+
+
+[docs] +class BaseEmbedder(AbsEmbedder): + """ + Base embedder for encoder only models. + + Args: + model_name_or_path (str): If it's a path to a local model, it loads the model from the path. Otherwise tries to download and + load a model from HuggingFace Hub with the name. + normalize_embeddings (bool, optional): If True, normalize the embedding vector. Defaults to :data:`True`. + use_fp16 (bool, optional): If true, use half-precision floating-point to speed up computation with a slight performance + degradation. Defaults to :data:`True`. + query_instruction_for_retrieval (Optional[str], optional): Query instruction for retrieval tasks, which will be used with + with :attr:`query_instruction_format`. Defaults to :data:`None`. + query_instruction_format (str, optional): The template for :attr:`query_instruction_for_retrieval`. Defaults to :data:`"{}{}"`. + devices (Optional[Union[str, int, List[str], List[int]]], optional): Devices to use for model inference. Defaults to :data:`None`. + pooling_method (str, optional): Pooling method to get embedding vector from the last hidden state. Defaults to :data:`"cls"`. + trust_remote_code (bool, optional): trust_remote_code for HF datasets or models. Defaults to :data:`False`. + cache_dir (Optional[str], optional): Cache directory for the model. Defaults to :data:`None`. + batch_size (int, optional): Batch size for inference. Defaults to :data:`256`. + query_max_length (int, optional): Maximum length for query. Defaults to :data:`512`. + passage_max_length (int, optional): Maximum length for passage. Defaults to :data:`512`. + convert_to_numpy (bool, optional): If True, the output embedding will be a Numpy array. Otherwise, it will be a Torch Tensor. + Defaults to :data:`True`. + + Attributes: + DEFAULT_POOLING_METHOD: The default pooling method when running the model. + """ + + DEFAULT_POOLING_METHOD = "cls" + + def __init__( + self, + model_name_or_path: str, + normalize_embeddings: bool = True, + use_fp16: bool = True, + query_instruction_for_retrieval: Optional[str] = None, + query_instruction_format: str = "{}{}", # specify the format of query_instruction_for_retrieval + devices: Optional[Union[str, List[str]]] = None, # specify devices, such as "cuda:0" or ["cuda:0", "cuda:1"] + # Additional parameters for BaseEmbedder + pooling_method: str = "cls", + trust_remote_code: bool = False, + cache_dir: Optional[str] = None, + # inference + batch_size: int = 256, + query_max_length: int = 512, + passage_max_length: int = 512, + convert_to_numpy: bool = True, + **kwargs: Any, + ): + super().__init__( + model_name_or_path, + normalize_embeddings=normalize_embeddings, + use_fp16=use_fp16, + query_instruction_for_retrieval=query_instruction_for_retrieval, + query_instruction_format=query_instruction_format, + devices=devices, + batch_size=batch_size, + query_max_length=query_max_length, + passage_max_length=passage_max_length, + convert_to_numpy=convert_to_numpy, + **kwargs + ) + self.pooling_method = pooling_method + + self.tokenizer = AutoTokenizer.from_pretrained( + model_name_or_path, + trust_remote_code=trust_remote_code, + cache_dir=cache_dir + ) + self.model = AutoModel.from_pretrained( + model_name_or_path, + trust_remote_code=trust_remote_code, + cache_dir=cache_dir + ) + +
+[docs] + def encode_queries( + self, + queries: Union[List[str], str], + batch_size: Optional[int] = None, + max_length: Optional[int] = None, + convert_to_numpy: Optional[bool] = None, + **kwargs: Any + ) -> Union[np.ndarray, torch.Tensor]: + """Encode the queries. + + Args: + queries (Union[List[str], str]): Input queries to encode. + batch_size (Optional[int], optional): Number of sentences for each iter. Defaults to :data:`None`. + max_length (Optional[int], optional): Maximum length of tokens. Defaults to :data:`None`. + convert_to_numpy (Optional[bool], optional): If True, the output embedding will be a Numpy array. Otherwise, it will + be a Torch Tensor. Defaults to :data:`None`. + + Returns: + Union[torch.Tensor, np.ndarray]: Return the embedding vectors in a numpy array or tensor. + """ + return super().encode_queries( + queries, + batch_size=batch_size, + max_length=max_length, + convert_to_numpy=convert_to_numpy, + **kwargs + )
+ + +
+[docs] + def encode_corpus( + self, + corpus: Union[List[str], str], + batch_size: Optional[int] = None, + max_length: Optional[int] = None, + convert_to_numpy: Optional[bool] = None, + **kwargs: Any + ) -> Union[np.ndarray, torch.Tensor]: + """Encode the corpus using the instruction if provided. + + Args: + corpus (Union[List[str], str]): Input corpus to encode. + batch_size (Optional[int], optional): Number of sentences for each iter. Defaults to :data:`None`. + max_length (Optional[int], optional): Maximum length of tokens. Defaults to :data:`None`. + convert_to_numpy (Optional[bool], optional): If True, the output embedding will be a Numpy array. Otherwise, it will + be a Torch Tensor. Defaults to :data:`None`. + + Returns: + Union[torch.Tensor, np.ndarray]: Return the embedding vectors in a numpy array or tensor. + """ + return super().encode_corpus( + corpus, + batch_size=batch_size, + max_length=max_length, + convert_to_numpy=convert_to_numpy, + **kwargs + )
+ + +
+[docs] + def encode( + self, + sentences: Union[List[str], str], + batch_size: Optional[int] = None, + max_length: Optional[int] = None, + convert_to_numpy: Optional[bool] = None, + **kwargs: Any + ) -> Union[np.ndarray, torch.Tensor]: + """Encode the input sentences with the embedding model. + + Args: + sentences (Union[List[str], str]): Input sentences to encode. + batch_size (Optional[int], optional): Number of sentences for each iter. Defaults to :data:`None`. + max_length (Optional[int], optional): Maximum length of tokens. Defaults to :data:`None`. + convert_to_numpy (Optional[bool], optional): If True, the output embedding will be a Numpy array. Otherwise, it will + be a Torch Tensor. Defaults to :data:`None`. + + Returns: + Union[torch.Tensor, np.ndarray]: return the embedding vectors in a numpy array or tensor. + """ + return super().encode( + sentences, + batch_size=batch_size, + max_length=max_length, + convert_to_numpy=convert_to_numpy, + **kwargs + )
+ + +
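# Illustrative usage sketch (not part of the module above). "BAAI/bge-base-en-v1.5" is only
# an example of an encoder-only checkpoint that uses CLS pooling; the instruction string is
# the one commonly paired with that family of models.
embedder = BaseEmbedder(
    "BAAI/bge-base-en-v1.5",
    query_instruction_for_retrieval="Represent this sentence for searching relevant passages: ",
    pooling_method="cls",
    use_fp16=True,
)
q_emb = embedder.encode_queries(["what is a panda?"])
p_emb = embedder.encode_corpus(["The giant panda is a bear species endemic to China."])
print(q_emb @ p_emb.T)  # cosine similarity, since embeddings are normalized by default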
+[docs] + @torch.no_grad() + def encode_single_device( + self, + sentences: Union[List[str], str], + batch_size: int = 256, + max_length: int = 512, + convert_to_numpy: bool = True, + device: Optional[str] = None, + **kwargs: Any + ): + """Encode input sentences by a single device. + + Args: + sentences (Union[List[str], str]): Input sentences to encode. + batch_size (int, optional): Number of sentences for each iter. Defaults to :data:`256`. + max_length (int, optional): Maximum length of tokens. Defaults to :data:`512`. + convert_to_numpy (bool, optional): If True, the output embedding will be a Numpy array. Otherwise, it will + be a Torch Tensor. Defaults to :data:`True`. + device (Optional[str], optional): Device to use for encoding. Defaults to None. + + Returns: + Union[torch.Tensor, np.ndarray]: return the embedding vectors in a numpy array or tensor. + """ + if device is None: + device = self.target_devices[0] + + if device == "cpu": self.use_fp16 = False + if self.use_fp16: self.model.half() + + self.model.to(device) + self.model.eval() + + input_was_string = False + if isinstance(sentences, str): + sentences = [sentences] + input_was_string = True + + # tokenize without padding to get the correct length + all_inputs = [] + for start_index in trange(0, len(sentences), batch_size, desc='pre tokenize', + disable=len(sentences) < 256): + sentences_batch = sentences[start_index:start_index + batch_size] + inputs_batch = self.tokenizer( + sentences_batch, + truncation=True, + max_length=max_length, + **kwargs + ) + inputs_batch = [{ + k: inputs_batch[k][i] for k in inputs_batch.keys() + } for i in range(len(sentences_batch))] + all_inputs.extend(inputs_batch) + + # sort by length for less padding + length_sorted_idx = np.argsort([-len(x['input_ids']) for x in all_inputs]) + all_inputs_sorted = [all_inputs[i] for i in length_sorted_idx] + + # adjust batch size + flag = False + while flag is False: + try: + inputs_batch = self.tokenizer.pad( + all_inputs_sorted[: batch_size], + padding=True, + return_tensors='pt', + **kwargs + ).to(device) + last_hidden_state = self.model(**inputs_batch, return_dict=True).last_hidden_state + embeddings = self.pooling(last_hidden_state, inputs_batch['attention_mask']) + flag = True + except RuntimeError as e: + batch_size = batch_size * 3 // 4 + except torch.OutofMemoryError as e: + batch_size = batch_size * 3 // 4 + + # encode + all_embeddings = [] + for start_index in tqdm(range(0, len(sentences), batch_size), desc="Inference Embeddings", + disable=len(sentences) < 256): + inputs_batch = all_inputs_sorted[start_index:start_index + batch_size] + inputs_batch = self.tokenizer.pad( + inputs_batch, + padding=True, + return_tensors='pt', + **kwargs + ).to(device) + last_hidden_state = self.model(**inputs_batch, return_dict=True).last_hidden_state + embeddings = self.pooling(last_hidden_state, inputs_batch['attention_mask']) + if self.normalize_embeddings: + embeddings = torch.nn.functional.normalize(embeddings, dim=-1) + embeddings = cast(torch.Tensor, embeddings) + + if convert_to_numpy: + embeddings = embeddings.cpu().numpy() + all_embeddings.append(embeddings) + + if convert_to_numpy: + all_embeddings = np.concatenate(all_embeddings, axis=0) + else: + all_embeddings = torch.cat(all_embeddings, dim=0) + + # adjust the order of embeddings + all_embeddings = all_embeddings[np.argsort(length_sorted_idx)] + + # return the embeddings + if input_was_string: + return all_embeddings[0] + return all_embeddings
+ + +
+[docs] + def pooling( + self, + last_hidden_state: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None + ): + """The pooling function. + + Args: + last_hidden_state (torch.Tensor): The last hidden state of the model. + attention_mask (Optional[torch.Tensor], optional): Attention mask. Defaults to :data:`None`. + + Raises: + NotImplementedError: pooling method not implemented. + + Returns: + torch.Tensor: The embedding vectors after pooling. + """ + if self.pooling_method == 'cls': + return last_hidden_state[:, 0] + elif self.pooling_method == 'mean': + s = torch.sum(last_hidden_state * attention_mask.unsqueeze(-1).float(), dim=1) + d = attention_mask.sum(dim=1, keepdim=True).float() + return s / d + else: + raise NotImplementedError(f"pooling method {self.pooling_method} not implemented")
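# Toy illustration of the two pooling branches above (invented values, uses the torch import above):
# 'cls' keeps the hidden state at position 0, 'mean' averages the non-padding positions in the mask.
hs = torch.tensor([[[1.0, 1.0], [3.0, 3.0], [5.0, 5.0]]])  # (batch=1, seq_len=3, dim=2)
mask = torch.tensor([[1, 1, 0]])                           # the last position is padding
cls_vec = hs[:, 0]                                                                       # [[1., 1.]]
mean_vec = (hs * mask.unsqueeze(-1).float()).sum(dim=1) / mask.sum(dim=1, keepdim=True)  # [[2., 2.]]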
\ No newline at end of file diff --git a/_modules/FlagEmbedding/inference/embedder/encoder_only/m3.html b/_modules/FlagEmbedding/inference/embedder/encoder_only/m3.html new file mode 100644 index 00000000..c2cc8d70 --- /dev/null +++ b/_modules/FlagEmbedding/inference/embedder/encoder_only/m3.html @@ -0,0 +1,1296 @@

Source code for FlagEmbedding.inference.embedder.encoder_only.m3

+import math
+import torch
+import queue
+import logging
+import numpy as np
+from tqdm import tqdm, trange
+from multiprocessing import Queue
+from collections import defaultdict
+from transformers import AutoTokenizer
+from typing import Any, List, Union, Dict, Literal, Tuple, Optional
+
+from FlagEmbedding.abc.inference import AbsEmbedder
+from FlagEmbedding.finetune.embedder.encoder_only.m3 import (
+    EncoderOnlyEmbedderM3ModelForInference, EncoderOnlyEmbedderM3Runner
+)
+
+logger = logging.getLogger(__name__)
+
+
+
+[docs] +class M3Embedder(AbsEmbedder): + """ + Embedder class for BGE-M3. + + Args: + model_name_or_path (str): If it's a path to a local model, it loads the model from the path. Otherwise tries to download and + load a model from HuggingFace Hub with the name. + normalize_embeddings (bool, optional): If True, normalize the dense embedding vector. Defaults to :data:`True`. + use_fp16 (bool, optional): If true, use half-precision floating-point to speed up computation with a slight performance + degradation. Defaults to :data:`True`. + query_instruction_for_retrieval: (Optional[str], optional): Query instruction for retrieval tasks, which will be used with + with :attr:`query_instruction_format`. Defaults to :data:`None`. + query_instruction_format: (str, optional): The template for :attr:`query_instruction_for_retrieval`. Defaults to :data:`"{}{}"`. + devices (Optional[Union[str, int, List[str], List[int]]], optional): Devices to use for model inference. Defaults to :data:`None`. + pooling_method (str, optional): Pooling method to get embedding vector from the last hidden state. Defaults to :data:`"cls"`. + trust_remote_code (bool, optional): trust_remote_code for HF datasets or models. Defaults to :data:`False`. + cache_dir (Optional[str], optional): Cache directory for the model. Defaults to :data:`None`. + cobert_dim (int, optional): Dimension of colbert linear. Return the hidden_size if -1. Defaults to :data:`-1`. + batch_size (int, optional): Batch size for inference. Defaults to :data:`256`. + query_max_length (int, optional): Maximum length for query. Defaults to :data:`512`. + passage_max_length (int, optional): Maximum length for passage. Defaults to :data:`512`. + return_dense (bool, optional): If true, will return the dense embedding. Defaults to :data:`True`. + return_sparse (bool, optional): If true, will return the sparce embedding. Defaults to :data:`False`. + return_colbert_vecs (bool, optional): If true, will return the colbert vectors. Defaults to :data:`False`. + + Attributes: + DEFAULT_POOLING_METHOD: The default pooling method when running the model. 
+ """ + DEFAULT_POOLING_METHOD = "cls" + + def __init__( + self, + model_name_or_path: str, + normalize_embeddings: bool = True, + use_fp16: bool = True, + query_instruction_for_retrieval: Optional[str] = None, + query_instruction_format: str = "{}{}", # specify the format of query_instruction_for_retrieval + devices: Optional[Union[str, List[str]]] = None, # specify devices, such as "cuda:0" or ["cuda:0", "cuda:1"] + # Additional parameters for M3Embedder + pooling_method: str = "cls", + trust_remote_code: bool = False, + cache_dir: Optional[str] = None, + colbert_dim: int = -1, + # inference + batch_size: int = 256, + query_max_length: int = 512, + passage_max_length: int = 512, + return_dense: bool = True, + return_sparse: bool = False, + return_colbert_vecs: bool = False, + **kwargs: Any, + ): + super().__init__( + model_name_or_path, + normalize_embeddings=normalize_embeddings, + use_fp16=use_fp16, + query_instruction_for_retrieval=query_instruction_for_retrieval, + query_instruction_format=query_instruction_format, + devices=devices, + batch_size=batch_size, + query_max_length=query_max_length, + passage_max_length=passage_max_length, + return_dense=return_dense, + return_sparse=return_sparse, + return_colbert_vecs=return_colbert_vecs, + **kwargs + ) + self.pooling_method = pooling_method + + self.tokenizer = AutoTokenizer.from_pretrained( + model_name_or_path, + trust_remote_code=trust_remote_code, + cache_dir=cache_dir + ) + self.model = EncoderOnlyEmbedderM3ModelForInference( + EncoderOnlyEmbedderM3Runner.get_model( + model_name_or_path, + trust_remote_code=trust_remote_code, + colbert_dim=colbert_dim, + cache_dir=cache_dir + ), + tokenizer=self.tokenizer, + sentence_pooling_method=pooling_method, + normalize_embeddings=normalize_embeddings + ) + +
+[docs] + def convert_id_to_token(self, lexical_weights: List[Dict]): + """Convert the ids back to tokens. + + Args: + lexical_weights (List[Dict]): A list of dictionaries of id & weights. + + Returns: + List[Dict]: A list of dictionaries of tokens & weights. + """ + if isinstance(lexical_weights, dict): + lexical_weights = [lexical_weights] + new_lexical_weights = [] + for item in lexical_weights: + new_item = {} + for id, weight in item.items(): + token = self.tokenizer.decode([int(id)]) + new_item[token] = weight + new_lexical_weights.append(new_item) + + if len(new_lexical_weights) == 1: + new_lexical_weights = new_lexical_weights[0] + return new_lexical_weights
+ + +
+[docs] + def compute_lexical_matching_score( + self, + lexical_weights_1: Union[Dict[str, float], List[Dict[str, float]]], + lexical_weights_2: Union[Dict[str, float], List[Dict[str, float]]] + ) -> Union[np.ndarray, float]: + """Compute the laxical matching score of two given lexical weights. + + Args: + lexical_weights_1 (Union[Dict[str, float], List[Dict[str, float]]]): First array of lexical weights. + lexical_weights_2 (Union[Dict[str, float], List[Dict[str, float]]]): Second array of lexical weights. + + Returns: + Union[np.ndarray, float]: The computed lexical weights across the two arries of lexical weights. + """ + def _compute_single_lexical_matching_score(lw1: Dict[str, float], lw2: Dict[str, float]): + scores = 0 + for token, weight in lw1.items(): + if token in lw2: + scores += weight * lw2[token] + return scores + + if isinstance(lexical_weights_1, dict) and isinstance(lexical_weights_2, dict): + return _compute_single_lexical_matching_score(lexical_weights_1, lexical_weights_2) + elif isinstance(lexical_weights_1, list) and isinstance(lexical_weights_2, list): + scores_array = [] + for lw1 in lexical_weights_1: + scores_array.append([ + _compute_single_lexical_matching_score(lw1, lw2) + for lw2 in lexical_weights_2 + ]) + return np.array(scores_array) + else: + raise ValueError("The input format of lexical_weights is not correct.")
+ + +
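# Worked example of the lexical matching score above (token ids and weights are invented):
# only ids present in both weight maps contribute, each via the product of its two weights.
lw_query = {"100": 0.5, "200": 0.3, "300": 0.2}
lw_passage = {"100": 0.4, "300": 0.1}
# 0.5 * 0.4 + 0.2 * 0.1 == 0.22; id "200" does not appear in the passage and adds nothing,
# so compute_lexical_matching_score(lw_query, lw_passage) returns 0.22 for an M3Embedder instance.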
+[docs] + def colbert_score(self, q_reps, p_reps): + """Compute colbert scores of input queries and passages. + + Args: + q_reps (np.ndarray): Multi-vector embeddings for queries. + p_reps (np.ndarray): Multi-vector embeddings for passages/corpus. + + Returns: + torch.Tensor: Computed colbert scores. + """ + q_reps, p_reps = torch.from_numpy(q_reps), torch.from_numpy(p_reps) + token_scores = torch.einsum('in,jn->ij', q_reps, p_reps) + scores, _ = token_scores.max(-1) + scores = torch.sum(scores) / q_reps.size(0) + return scores
+ + +
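# Toy illustration of the MaxSim scoring used by colbert_score above (vectors are invented,
# uses the numpy import above): each query token vector takes its best dot product over the
# passage token vectors, and the per-token maxima are averaged over the number of query tokens.
q_reps = np.array([[1.0, 0.0], [0.0, 1.0]])                # 2 query token vectors
p_reps = np.array([[0.9, 0.1], [0.2, 0.8], [0.0, 0.0]])    # 3 passage token vectors
# per-token maxima are 0.9 and 0.8, so colbert_score(q_reps, p_reps) -> (0.9 + 0.8) / 2 = 0.85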
+[docs] + def encode_queries( + self, + queries: Union[List[str], str], + batch_size: Optional[int] = None, + max_length: Optional[int] = None, + return_dense: Optional[bool] = None, + return_sparse: Optional[bool] = None, + return_colbert_vecs: Optional[bool] = None, + **kwargs: Any + ) -> Dict[ + Literal["dense_vecs", "lexical_weights", "colbert_vecs"], + Union[np.ndarray, List[Dict[str, float]], List[np.ndarray]] + ]: + """Encode the queries using the specified way. + + Args: + queries (Union[List[str], str]): The input queries to encode. + batch_size (Optional[int], optional): Number of sentences for each iter. Defaults to :data:`None`. + max_length (Optional[int], optional): Maximum length of tokens. Defaults to :data:`None`. + return_dense (Optional[bool], optional): If True, compute and return dense embedding. Defaults to :data:`None`. + return_sparse (Optional[bool], optional): If True, compute and return sparce embedding. Defaults to :data:`None`. + return_colbert_vecs (Optional[bool], optional): If True, compute and return cobert vectors. Defaults to :data:`None`. + + Returns: + Dict[Literal["dense_vecs", "lexical_weights", "colbert_vecs"], Union[np.ndarray, List[Dict[str, float]], List[np.ndarray]] + """ + if batch_size is None: batch_size = self.batch_size + if max_length is None: max_length = self.query_max_length + if return_dense is None: return_dense = self.return_dense + if return_sparse is None: return_sparse = self.return_sparse + if return_colbert_vecs is None: return_colbert_vecs = self.return_colbert_vecs + + return super().encode_queries( + queries, + batch_size=batch_size, + max_length=max_length, + return_dense=return_dense, + return_sparse=return_sparse, + return_colbert_vecs=return_colbert_vecs, + **kwargs + )
+ + +
+[docs] + def encode_corpus( + self, + corpus: Union[List[str], str], + batch_size: Optional[int] = None, + max_length: Optional[int] = None, + return_dense: Optional[bool] = None, + return_sparse: Optional[bool] = None, + return_colbert_vecs: Optional[bool] = None, + **kwargs: Any + ) -> Dict[ + Literal["dense_vecs", "lexical_weights", "colbert_vecs"], + Union[np.ndarray, List[Dict[str, float]], List[np.ndarray]] + ]: + """Encode the corpus using the specified way. + + Args: + corpus (Union[List[str], str]): The input corpus to encode. + batch_size (Optional[int], optional): Number of sentences for each iter. Defaults to :data:`None`. + max_length (Optional[int], optional): Maximum length of tokens. Defaults to :data:`None`. + return_dense (Optional[bool], optional): If True, compute and return dense embedding. Defaults to :data:`None`. + return_sparse (Optional[bool], optional): If True, compute and return sparce embedding. Defaults to :data:`None`. + return_colbert_vecs (Optional[bool], optional): If True, compute and return cobert vectors. Defaults to :data:`None`. + + Returns: + Dict[Literal["dense_vecs", "lexical_weights", "colbert_vecs"], Union[np.ndarray, List[Dict[str, float]], List[np.ndarray]] + """ + if batch_size is None: batch_size = self.batch_size + if max_length is None: max_length = self.passage_max_length + if return_dense is None: return_dense = self.return_dense + if return_sparse is None: return_sparse = self.return_sparse + if return_colbert_vecs is None: return_colbert_vecs = self.return_colbert_vecs + + return super().encode_corpus( + corpus, + batch_size=batch_size, + max_length=max_length, + return_dense=return_dense, + return_sparse=return_sparse, + return_colbert_vecs=return_colbert_vecs, + **kwargs + )
+ + +
+[docs] + def encode( + self, + sentences: Union[List[str], str], + batch_size: Optional[int] = None, + max_length: Optional[int] = None, + return_dense: Optional[bool] = None, + return_sparse: Optional[bool] = None, + return_colbert_vecs: Optional[bool] = None, + **kwargs: Any + ) -> Dict[ + Literal["dense_vecs", "lexical_weights", "colbert_vecs"], + Union[np.ndarray, List[Dict[str, float]], List[np.ndarray]] + ]: + """Encode the sentences using the specified way. + + Args: + sentences (Union[List[str], str]): The input sentences to encode. + batch_size (Optional[int], optional): Number of sentences for each iter. Defaults to :data:`None`. + max_length (Optional[int], optional): Maximum length of tokens. Defaults to :data:`None`. + return_dense (Optional[bool], optional): If True, compute and return dense embedding. Defaults to :data:`None`. + return_sparse (Optional[bool], optional): If True, compute and return sparce embedding. Defaults to :data:`None`. + return_colbert_vecs (Optional[bool], optional): If True, compute and return cobert vectors. Defaults to :data:`None`. + + Returns: + Dict[Literal["dense_vecs", "lexical_weights", "colbert_vecs"], Union[np.ndarray, List[Dict[str, float]], List[np.ndarray]] + """ + if batch_size is None: batch_size = self.batch_size + if max_length is None: max_length = self.passage_max_length + if return_dense is None: return_dense = self.return_dense + if return_sparse is None: return_sparse = self.return_sparse + if return_colbert_vecs is None: return_colbert_vecs = self.return_colbert_vecs + + return super().encode( + sentences, + batch_size=batch_size, + max_length=max_length, + return_dense=return_dense, + return_sparse=return_sparse, + return_colbert_vecs=return_colbert_vecs, + **kwargs + )
+ + +
+[docs] + @torch.no_grad() + def encode_single_device( + self, + sentences: Union[List[str], str], + batch_size: int = 256, + max_length: int = 512, + return_dense: bool = True, + return_sparse: bool = False, + return_colbert_vecs: bool = False, + device: Optional[str] = None, + **kwargs: Any + ): + """Encode the input sentences using a single device. + + Args: + sentences (Union[List[str], str]): The input sentences to encode. + batch_size (int, optional): Number of sentences per iteration. Defaults to :data:`256`. + max_length (int, optional): Maximum length of tokens. Defaults to :data:`512`. + return_dense (bool, optional): If True, compute and return dense embedding. Defaults to :data:`True`. + return_sparse (bool, optional): If True, compute and return sparse embedding. Defaults to :data:`False`. + return_colbert_vecs (bool, optional): If True, compute and return colbert vectors. Defaults to :data:`False`. + device (Optional[str], optional): The device to use for encoding. If :data:`None`, the first target device is used. Defaults to :data:`None`. + + Returns: + Dict[Literal["dense_vecs", "lexical_weights", "colbert_vecs"], Union[np.ndarray, List[Dict[str, float]], List[np.ndarray]]]: The encoding results. + """ + # pop convert_to_numpy from kwargs + kwargs.pop("convert_to_numpy", None) + + if device is None: + device = self.target_devices[0] + + if device == "cpu": self.use_fp16 = False + if self.use_fp16: self.model.half() + + self.model.to(device) + self.model.eval() + + input_was_string = False + if isinstance(sentences, str): + sentences = [sentences] + input_was_string = True + + def _process_token_weights(token_weights: np.ndarray, input_ids: list): + # convert to dict + result = defaultdict(int) + unused_tokens = set() + for _token in ['cls_token', 'eos_token', 'pad_token', 'unk_token']: + if _token in self.tokenizer.special_tokens_map: + _token_id = self.tokenizer.convert_tokens_to_ids(self.tokenizer.special_tokens_map[_token]) + unused_tokens.add(_token_id) + # token_weights = np.ceil(token_weights * 100) + for w, idx in zip(token_weights, input_ids): + if idx not in unused_tokens and w > 0: + idx = str(idx) + # w = int(w) + if w > result[idx]: + result[idx] = w + return result + + def _process_colbert_vecs(colbert_vecs: np.ndarray, attention_mask: list): + # delete the vectors of padding tokens + tokens_num = np.sum(attention_mask) + return colbert_vecs[:tokens_num - 1] # we don't use the embedding of cls, so select tokens_num-1 + + # tokenize without padding to get the correct length + all_inputs = [] + for start_index in trange(0, len(sentences), batch_size, desc='pre tokenize', + disable=len(sentences) < 256): + sentences_batch = sentences[start_index:start_index + batch_size] + inputs_batch = self.tokenizer( + sentences_batch, + truncation=True, + max_length=max_length, + **kwargs + ) + inputs_batch = [{ + k: inputs_batch[k][i] for k in inputs_batch.keys() + } for i in range(len(sentences_batch))] + all_inputs.extend(inputs_batch) + + # sort by length for less padding + length_sorted_idx = np.argsort([-len(x['input_ids']) for x in all_inputs]) + all_inputs_sorted = [all_inputs[i] for i in length_sorted_idx] + + # adjust batch size + flag = False + while flag is False: + try: + inputs_batch = self.tokenizer.pad( + all_inputs_sorted[: batch_size], + padding=True, + return_tensors='pt', + **kwargs + ).to(device) + outputs = self.model( + inputs_batch, + return_dense=return_dense, + return_sparse=return_sparse, + return_colbert_vecs=return_colbert_vecs + ) + flag = True + except RuntimeError as e: + batch_size = 
batch_size * 3 // 4 + except torch.OutofMemoryError as e: + batch_size = batch_size * 3 // 4 + + # encode + all_dense_embeddings, all_lexical_weights, all_colbert_vecs = [], [], [] + for start_index in tqdm(range(0, len(sentences), batch_size), desc="Inference Embeddings", + disable=len(sentences) < 256): + inputs_batch = all_inputs_sorted[start_index:start_index + batch_size] + inputs_batch = self.tokenizer.pad( + inputs_batch, + padding=True, + return_tensors='pt', + **kwargs + ).to(device) + outputs = self.model( + inputs_batch, + return_dense=return_dense, + return_sparse=return_sparse, + return_colbert_vecs=return_colbert_vecs + ) + + if return_dense: + all_dense_embeddings.append(outputs['dense_vecs'].cpu().numpy()) + + if return_sparse: + token_weights = outputs['sparse_vecs'].squeeze(-1) + all_lexical_weights.extend( + list(map( + _process_token_weights, + token_weights.cpu().numpy(), + inputs_batch['input_ids'].cpu().numpy().tolist() + ))) + + if return_colbert_vecs: + all_colbert_vecs.extend( + list(map( + _process_colbert_vecs, + outputs['colbert_vecs'].cpu().numpy(), + inputs_batch['attention_mask'].cpu().numpy() + ))) + + if return_dense: + all_dense_embeddings = np.concatenate(all_dense_embeddings, axis=0) + # adjust the order of embeddings + all_dense_embeddings = all_dense_embeddings[np.argsort(length_sorted_idx)] + if input_was_string: + all_dense_embeddings = all_dense_embeddings[0] + else: + all_dense_embeddings = None + + if return_sparse: + # adjust the order of lexical weights + all_lexical_weights = [all_lexical_weights[i] for i in np.argsort(length_sorted_idx)] + if input_was_string: + all_lexical_weights = all_lexical_weights[0] + else: + all_lexical_weights = None + + if return_colbert_vecs: + # adjust the order of embeddings + all_colbert_vecs = [all_colbert_vecs[i] for i in np.argsort(length_sorted_idx)] + if input_was_string: + all_colbert_vecs = all_colbert_vecs[0] + else: + all_colbert_vecs = None + + # return the embeddings + return { + "dense_vecs": all_dense_embeddings, + "lexical_weights": all_lexical_weights, + "colbert_vecs": all_colbert_vecs + }
+ + +
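+# The single-device path above sorts inputs by length to reduce padding and then
+# restores the original order. A standalone sketch of that np.argsort trick with
+# toy data (not part of the module):
+#
+#     import numpy as np
+#     lengths = [5, 12, 3]                                   # tokens per input
+#     order = np.argsort([-l for l in lengths])              # longest first -> [1, 0, 2]
+#     processed = [f"emb{i}" for i in order]                 # results in sorted order
+#     restored = [processed[i] for i in np.argsort(order)]   # back to input order
+#     assert restored == ["emb0", "emb1", "emb2"]
+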
+[docs] + def compute_score( + self, + sentence_pairs: Union[List[Tuple[str, str]], Tuple[str, str]], + batch_size: Optional[int] = None, + max_query_length: Optional[int] = None, + max_passage_length: Optional[int] = None, + weights_for_different_modes: Optional[List[float]] = None, + **kwargs: Any + ) -> Dict[ + Literal["colbert", "sparse", "dense", "sparse+dense", "colbert+sparse+dense"], + List[float] + ]: + """Compute the relevance scores of the sentence pairs with the different retrieval modes. + + Args: + sentence_pairs (Union[List[Tuple[str, str]], Tuple[str, str]]): Pairs of sentences to compute scores for. + batch_size (Optional[int], optional): Number of sentence pairs per iteration. Defaults to :data:`None`. + max_query_length (Optional[int], optional): Maximum length of tokens of queries. Defaults to :data:`None`. + max_passage_length (Optional[int], optional): Maximum length of tokens of passages. Defaults to :data:`None`. + weights_for_different_modes (Optional[List[float]], optional): The weights for the dense, sparse and colbert scores. Defaults to :data:`None`. + + Returns: + Dict[Literal["colbert", "sparse", "dense", "sparse+dense", "colbert+sparse+dense"], List[float]]: The per-mode scores and their weighted combinations. + """ + if batch_size is None: batch_size = self.batch_size + if max_query_length is None: max_query_length = self.query_max_length + if max_passage_length is None: max_passage_length = self.passage_max_length + + if len(self.target_devices) == 1: + return self.compute_score_single_device( + sentence_pairs, + batch_size=batch_size, + max_query_length=max_query_length, + max_passage_length=max_passage_length, + weights_for_different_modes=weights_for_different_modes, + device=self.target_devices[0], + **kwargs + ) + + pool = self.start_multi_process_pool(M3Embedder._compute_score_multi_process_worker) + embeddings = self.compute_score_multi_process( + sentence_pairs, + pool, + batch_size=batch_size, + max_query_length=max_query_length, + max_passage_length=max_passage_length, + weights_for_different_modes=weights_for_different_modes, + **kwargs + ) + self.stop_multi_process_pool(pool) + return embeddings
+ + + # adapted from https://github.com/UKPLab/sentence-transformers/blob/1802076d4eae42ff0a5629e1b04e75785d4e193b/sentence_transformers/SentenceTransformer.py#L877 +
+[docs] + def compute_score_multi_process( + self, + sentence_pairs: List[Tuple[str, str]], + pool: Dict[Literal["input", "output", "processes"], Any], + **kwargs + ): + chunk_size = math.ceil(len(sentence_pairs) / len(pool["processes"])) + + input_queue = pool["input"] + last_chunk_id = 0 + chunk = [] + + for sentence_pair in sentence_pairs: + chunk.append(sentence_pair) + if len(chunk) >= chunk_size: + input_queue.put( + [last_chunk_id, chunk, kwargs] + ) + last_chunk_id += 1 + chunk = [] + + if len(chunk) > 0: + input_queue.put([last_chunk_id, chunk, kwargs]) + last_chunk_id += 1 + + output_queue = pool["output"] + results_list = sorted( + [output_queue.get() for _ in trange(last_chunk_id, desc="Chunks")], + key=lambda x: x[0], + ) + + scores_dict = self._concatenate_compute_score_results_from_multi_process([result[1] for result in results_list]) + return scores_dict
+ + + # adapted from https://github.com/UKPLab/sentence-transformers/blob/1802076d4eae42ff0a5629e1b04e75785d4e193b/sentence_transformers/SentenceTransformer.py#L976 + @staticmethod + def _compute_score_multi_process_worker( + target_device: str, model: 'M3Embedder', input_queue: Queue, results_queue: Queue + ) -> None: + """ + Internal working process to encode sentences in multi-process setup + """ + while True: + try: + chunk_id, sentences, kwargs = ( + input_queue.get() + ) + embeddings = model.compute_score_single_device( + sentences, + device=target_device, + **kwargs + ) + + results_queue.put([chunk_id, embeddings]) + except queue.Empty: + break + +
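+# Sketch of the chunk bookkeeping used by compute_score_multi_process and the
+# worker above (toy numbers, not part of the module): each chunk is queued with
+# an id, and results are sorted by that id, so worker finishing order does not
+# affect the final score order.
+#
+#     import math
+#     pairs, num_procs = list(range(10)), 4
+#     chunk_size = math.ceil(len(pairs) / num_procs)                      # -> 3
+#     chunks = [pairs[i:i + chunk_size] for i in range(0, len(pairs), chunk_size)]
+#     # -> [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
+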
+[docs] + @torch.no_grad() + def compute_score_single_device( + self, + sentence_pairs: Union[List[Tuple[str, str]], Tuple[str, str]], + batch_size: int = 256, + max_query_length: int = 512, + max_passage_length: int = 512, + weights_for_different_modes: Optional[List[float]] = None, + device: Optional[str] = None, + **kwargs: Any + ) -> Dict[ + Literal["colbert", "sparse", "dense", "sparse+dense", "colbert+sparse+dense"], + List[float] + ]: + """Compute the relevance score of different attributes. + + Args: + sentence_pairs (Union[List[Tuple[str, str]], Tuple[str, str]]): Pairs of sentences to compute the score. + batch_size (Optional[int], optional): _description_. Defaults to :data:`None`. + max_query_length (Optional[int], optional): _description_. Defaults to :data:`None`. + max_passage_length (Optional[int], optional): _description_. Defaults to :data:`None`. + weights_for_different_modes (Optional[List[float]], optional): The weights for different methods. Defaults to :data:`None`. + device (Optional[str], optional): The device to use. Defaults to :data:`None`. + + Returns: + Dict[Literal["colbert", "sparse", "dense", "sparse+dense", "colbert+sparse+dense"], List[float]] + """ + def _tokenize(texts: list, max_length: int): + return self.tokenizer( + texts, + max_length=max_length, + padding=True, + return_token_type_ids=False, + truncation=True, + return_tensors='pt', + **kwargs + ) + + if device is None: + device = self.target_devices[0] + + if device == "cpu": self.use_fp16 = False + if self.use_fp16: self.model.half() + + self.model.to(device) + self.model.eval() + + if isinstance(sentence_pairs, list) and len(sentence_pairs) == 0: + return [] + if isinstance(sentence_pairs[0], str): + one_input_pair = True + sentence_pairs = [sentence_pairs] + else: + one_input_pair = False + + all_scores = { + 'colbert': [], + 'sparse': [], + 'dense': [], + 'sparse+dense': [], + 'colbert+sparse+dense': [] + } + for start_index in tqdm(range(0, len(sentence_pairs), batch_size), desc="Compute Scores", + disable=len(sentence_pairs) < 128): + sentences_batch = sentence_pairs[start_index:start_index + batch_size] + + queries_batch = [pair[0] for pair in sentences_batch] + corpus_batch = [pair[1] for pair in sentences_batch] + + queries_inputs = _tokenize(queries_batch, max_length=max_query_length).to(device) + corpus_inputs = _tokenize(corpus_batch, max_length=max_passage_length).to(device) + + queries_output = self.model( + queries_inputs, + return_dense=True, return_sparse=True, return_colbert_vecs=True, + return_sparse_embedding=True + ) + corpus_output = self.model( + corpus_inputs, + return_dense=True, return_sparse=True, return_colbert_vecs=True, + return_sparse_embedding=True + ) + + q_dense_vecs, q_sparse_vecs, q_colbert_vecs = queries_output['dense_vecs'], queries_output['sparse_vecs'], \ + queries_output['colbert_vecs'] + p_dense_vecs, p_sparse_vecs, p_colbert_vecs = corpus_output['dense_vecs'], corpus_output['sparse_vecs'], \ + corpus_output['colbert_vecs'] + + dense_scores = self.model.compute_dense_score(q_dense_vecs, p_dense_vecs) + sparse_scores = self.model.compute_sparse_score(q_sparse_vecs, p_sparse_vecs) + colbert_scores = self.model.compute_colbert_score( + q_colbert_vecs, p_colbert_vecs, + q_mask=queries_inputs['attention_mask'] + ) + + if weights_for_different_modes is None: + weights_for_different_modes = [1., 1., 1.] 
+ weight_sum = 3 + logger.info("default weights for dense, sparse, colbert are [1.0, 1.0, 1.0] ") + else: + assert len(weights_for_different_modes) == 3 + weight_sum = sum(weights_for_different_modes) + + inx = torch.arange(0, len(sentences_batch)) + dense_scores, sparse_scores, colbert_scores = dense_scores[inx, inx].float(), sparse_scores[ + inx, inx].float(), colbert_scores[inx, inx].float() + + all_scores['colbert'].extend( + colbert_scores.cpu().numpy().tolist() + ) + all_scores['sparse'].extend( + sparse_scores.cpu().numpy().tolist() + ) + all_scores['dense'].extend( + dense_scores.cpu().numpy().tolist() + ) + all_scores['sparse+dense'].extend( + ((sparse_scores * weights_for_different_modes[1] + dense_scores * weights_for_different_modes[0])/(weights_for_different_modes[1]+weights_for_different_modes[0])).cpu().numpy().tolist() + ) + all_scores['colbert+sparse+dense'].extend( + ((colbert_scores * weights_for_different_modes[2] + sparse_scores * weights_for_different_modes[1] + dense_scores * weights_for_different_modes[0])/weight_sum).cpu().numpy().tolist() + ) + + if one_input_pair: + return {k: v[0] for k, v in all_scores.items()} + return all_scores
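+# The hybrid scores above are plain weighted averages of the per-mode scores.
+# A standalone sketch with illustrative weights w = [w_dense, w_sparse, w_colbert]
+# and made-up per-pair scores (not part of the module):
+#
+#     w = [0.4, 0.2, 0.4]
+#     dense, sparse, colbert = 0.71, 0.30, 0.65
+#     sparse_dense = (w[0] * dense + w[1] * sparse) / (w[0] + w[1])                    # ~0.573
+#     colbert_sparse_dense = (w[0] * dense + w[1] * sparse + w[2] * colbert) / sum(w)  # 0.604
+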
+ + + def _concatenate_results_from_multi_process( + self, + results_list: List[Dict[Literal["dense_vecs", "lexical_weights", "colbert_vecs"], Any]] + ): + """Concatenate and return the results from all the processes. + + Args: + results_list (List[Dict[Literal["dense_vecs", "lexical_weights", "colbert_vecs"], Any]]): + A list of results from all the processes. + + Returns: + Dict: The merged encoding results from the multiple processes. + """ + merged_results = { + "dense_vecs": [], + "lexical_weights": [], + "colbert_vecs": [] + } + for key in merged_results.keys(): + for results in results_list: + if results[key] is None: + merged_results[key] = None + break + else: + if key == "dense_vecs": + merged_results[key].append(results[key]) + else: + merged_results[key].extend(results[key]) + + if merged_results["dense_vecs"] is not None: + merged_results["dense_vecs"] = np.concatenate(merged_results["dense_vecs"], axis=0) + + return merged_results + + def _concatenate_compute_score_results_from_multi_process( + self, + results_list: List[Dict[Literal["colbert", "sparse", "dense", "sparse+dense", "colbert+sparse+dense"], List[float]]] + ): + """Concatenate and return the results from all the processes. + + Args: + results_list (List[Dict[Literal["colbert", "sparse", "dense", "sparse+dense", "colbert+sparse+dense"], List[float]]]): + A list of computed scores from all the processes. + + Returns: + Dict: The merged computed scores from the multiple processes. + """ + merged_results = { + "colbert": [], + "sparse": [], + "dense": [], + "sparse+dense": [], + "colbert+sparse+dense": [] + } + for key in merged_results.keys(): + for results in results_list: + merged_results[key].extend(results[key]) + + return merged_results
\ No newline at end of file diff --git a/_modules/FlagEmbedding/inference/reranker/decoder_only/base.html b/_modules/FlagEmbedding/inference/reranker/decoder_only/base.html new file mode 100644 index 00000000..adf033e1 --- /dev/null +++ b/_modules/FlagEmbedding/inference/reranker/decoder_only/base.html @@ -0,0 +1,995 @@

Source code for FlagEmbedding.inference.reranker.decoder_only.base

+import torch
+import warnings
+import numpy as np
+from tqdm import tqdm, trange
+from typing import Any, List, Union, Tuple, Optional
+from peft import PeftModel
+from torch import Tensor
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from torch.utils.data import Dataset, DataLoader
+
+from FlagEmbedding.abc.inference import AbsReranker
+from FlagEmbedding.inference.reranker.encoder_only.base import sigmoid
+
+
+def last_logit_pool(logits: Tensor,
+                    attention_mask: Tensor) -> Tensor:
+    """Pool the last logit.
+
+    Args:
+        logits (torch.Tensor): The output logits of the model.
+        attention_mask (torch.Tensor): Attention mask.
+
+    Returns:
+        torch.Tensor: The tensor after pooling.
+    """
+    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
+    if left_padding:
+        return logits[:, -1, :]
+    else:
+        sequence_lengths = attention_mask.sum(dim=1) - 1
+        batch_size = logits.shape[0]
+        return torch.stack([logits[i, sequence_lengths[i], :] for i in range(batch_size)], dim=0)
+
+
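+# A small worked example of the pooling above (illustrative tensors only): with
+# right padding, attention_mask = [[1, 1, 1, 0]] gives sequence_length - 1 == 2,
+# so the logits of the last real token (position 2) are selected; with left
+# padding every final position is a real token, so logits[:, -1, :] is returned.
+#
+#     logits = torch.randn(1, 4, 32000)                 # (batch, seq_len, vocab)
+#     attention_mask = torch.tensor([[1, 1, 1, 0]])
+#     pooled = last_logit_pool(logits, attention_mask)  # pooled[0] == logits[0, 2, :]
+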
+class DatasetForReranker(Dataset):
+    """Prepare the dataset for dataloader.
+
+    Args:
+        all_queries_inputs (List[dict]): Tokenized inputs of all the queries.
+        all_passages_inputs (List[dict]): Tokenized inputs of all the passages.
+        tokenizer_path (str): Path to the tokenizer to use.
+        max_len (int, optional): Maximum length of tokens. Defaults to :data:`512`.
+        cache_dir (Optional[str], optional): Cache directory for the tokenizer. Defaults to :data:`None`.
+        prompt (Optional[str], optional): Prompt for the specific task, will use the default if not provided.
+            Defaults to `None`.
+    """
+    def __init__(
+        self,
+        all_queries_inputs,
+        all_passages_inputs,
+        tokenizer_path: str,
+        max_len: int = 512,
+        cache_dir: Optional[str] = None,
+        prompt: Optional[str] = None,
+        **kwargs: Any, 
+    ):
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            tokenizer_path,
+            trust_remote_code=True,
+            cache_dir=cache_dir
+        )
+
+        self.all_queries_inputs = all_queries_inputs
+        self.all_passages_inputs = all_passages_inputs
+        self.max_len = max_len
+        self.total_len = len(self.all_queries_inputs)
+        self.kwargs = kwargs
+
+        if prompt is None:
+            prompt = "Given a query A and a passage B, determine whether the passage contains an answer to the query by providing a prediction of either 'Yes' or 'No'."
+        self.prompt_inputs = self.tokenizer(
+            prompt,
+            return_tensors=None,
+            add_special_tokens=False
+        )['input_ids']
+        sep = "\n"
+        self.sep_inputs = self.tokenizer(
+            sep,
+            return_tensors=None,
+            add_special_tokens=False
+        )['input_ids']
+
+        self.encode_max_length = self.max_len + len(self.sep_inputs) + len(self.prompt_inputs)
+
+    def __len__(self):
+        return self.total_len
+
+    def __getitem__(self, item):
+        query_inputs = self.all_queries_inputs[item]
+        passage_inputs = self.all_passages_inputs[item]
+        if self.tokenizer.bos_token_id is not None and self.tokenizer.bos_token_id != self.tokenizer.pad_token_id:
+            item = self.tokenizer.prepare_for_model(
+                [self.tokenizer.bos_token_id] + query_inputs['input_ids'],
+                self.sep_inputs + passage_inputs['input_ids'],
+                truncation='only_second',
+                max_length=self.encode_max_length,
+                padding=False,
+                return_attention_mask=False,
+                return_token_type_ids=False,
+                add_special_tokens=False
+            )
+        else:
+            item = self.tokenizer.prepare_for_model(
+                query_inputs['input_ids'],
+                self.sep_inputs + passage_inputs['input_ids'],
+                truncation='only_second',
+                max_length=self.encode_max_length,
+                padding=False,
+                return_attention_mask=False,
+                return_token_type_ids=False,
+                add_special_tokens=False
+            )
+        item['input_ids'] = item['input_ids'] + self.sep_inputs + self.prompt_inputs
+        item['attention_mask'] = [1] * len(item['input_ids'])
+        item.pop('token_type_ids') if 'token_type_ids' in item.keys() else None
+        if 'position_ids' in item.keys():
+            item['position_ids'] = list(range(len(item['input_ids'])))
+
+        return item
+
+
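+# Layout of one item produced by DatasetForReranker.__getitem__ above (schematic
+# only): when the tokenizer has a usable bos token, the final input_ids are
+#
+#     [bos] <query ids> "\n" <passage ids> "\n" <prompt ids>
+#
+# prepare_for_model pairs the query with "\n" + passage and truncates only the
+# passage side; the "\n" + prompt ids are appended afterwards, and the attention
+# mask is all ones over the resulting length.
+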
+class Collater:
+    """
+    Collator of the reranker.
+    
+    Args:
+        tokenizer (transformers.AutoTokenizer): The tokenizer for reranker.
+        max_len (int): Maximum length of tokens.
+    """
+    def __init__(self, tokenizer, max_len):
+        self.tokenizer = tokenizer
+        self.max_len = max_len
+        self.pad_to_multiple_of = 8
+        self.label_pad_token_id = -100
+        warnings.filterwarnings("ignore",
+                                message="`max_length` is ignored when `padding`=`True` and there is no truncation strategy.")
+
+    def __call__(self, data):
+        labels = [feature["labels"] for feature in data] if "labels" in data[0].keys() else None
+        # We have to pad the labels before calling `tokenizer.pad` as this method won't pad them and needs them of the
+        # same length to return tensors.
+        if labels is not None:
+            max_label_length = max(len(l) for l in labels)
+            if self.pad_to_multiple_of is not None:
+                max_label_length = (
+                        (max_label_length + self.pad_to_multiple_of - 1)
+                        // self.pad_to_multiple_of
+                        * self.pad_to_multiple_of
+                )
+
+            padding_side = self.tokenizer.padding_side
+            for feature in data:
+                remainder = [self.label_pad_token_id] * (max_label_length - len(feature["labels"]))
+                if isinstance(feature["labels"], list):
+                    feature["labels"] = (
+                        feature["labels"] + remainder if padding_side == "right" else remainder + feature["labels"]
+                    )
+                elif padding_side == "right":
+                    feature["labels"] = np.concatenate([feature["labels"], remainder]).astype(np.int64)
+                else:
+                    feature["labels"] = np.concatenate([remainder, feature["labels"]]).astype(np.int64)
+
+        return self.tokenizer.pad(
+            data,
+            padding=True,
+            pad_to_multiple_of=8,
+            return_tensors='pt',
+        )
+
+
+
+[docs] +class BaseLLMReranker(AbsReranker): + """Base reranker class for LLM-like decoder-only models. + + Args: + model_name_or_path (str): If it's a path to a local model, it loads the model from the path. Otherwise tries to download and + load a model from HuggingFace Hub with the name. + peft_path (Optional[str], optional): Path to the PEFT config. Defaults to :data:`None`. + use_fp16 (bool, optional): If True, use half-precision floating-point to speed up computation with a slight performance + degradation. Defaults to :data:`False`. + use_bf16 (bool, optional): Another type of half-precision floating-point; you can use bf16 if the hardware supports it. + Defaults to :data:`False`. + query_instruction_for_rerank (str, optional): Query instruction for retrieval tasks, which will be used with + :attr:`query_instruction_format`. Defaults to :data:`"A: "`. + query_instruction_format (str, optional): The template for :attr:`query_instruction_for_rerank`. Defaults to :data:`"{}{}"`. + passage_instruction_for_rerank (str, optional): Passage instruction for retrieval tasks, which will be used with + :attr:`passage_instruction_format`. Defaults to :data:`"B: "`. + passage_instruction_format (str, optional): The template for the passage instruction. Defaults to :data:`"{}{}"`. + cache_dir (Optional[str], optional): Cache directory for the model. Defaults to :data:`None`. + trust_remote_code (bool, optional): Whether to trust remote code when loading the model. Defaults to :data:`False`. + devices (Union[str, List[str], List[int]], optional): Devices to use for model inference, such as ["cuda:0"] or ["0"]. + Defaults to :data:`None`. + prompt (Optional[str], optional): Prompt for the specific task. Defaults to :data:`None`. + batch_size (int, optional): Batch size for inference. Defaults to :data:`128`. + query_max_length (int, optional): Maximum length for queries. If not specified, will be 3/4 of :attr:`max_length`. + Defaults to :data:`None`. + max_length (int, optional): Maximum length of passages. Defaults to :data:`512`. + normalize (bool, optional): If True, use sigmoid to normalize the results. Defaults to :data:`False`. 
+ """ + def __init__( + self, + model_name_or_path: str, + peft_path: Optional[str] = None, + use_fp16: bool = False, + use_bf16: bool = False, + query_instruction_for_rerank: str = "A: ", + query_instruction_format: str = "{}{}", # specify the format of query_instruction_for_rerank + passage_instruction_for_rerank: str = "B: ", + passage_instruction_format: str = "{}{}", # specify the format of passage_instruction_for_rerank + cache_dir: Optional[str] = None, + trust_remote_code: bool = False, + devices: Union[str, List[str], List[int]] = None, # specify devices, such as ["cuda:0"] or ["0"] + # inference + prompt: Optional[str] = None, + batch_size: int = 128, + query_max_length: int = None, + max_length: int = 512, + normalize: bool = False, + **kwargs: Any, + ) -> None: + super().__init__( + model_name_or_path=model_name_or_path, + use_fp16=use_fp16, + query_instruction_for_rerank=query_instruction_for_rerank, + query_instruction_format=query_instruction_format, + passage_instruction_for_rerank=passage_instruction_for_rerank, + passage_instruction_format=passage_instruction_format, + devices=devices, + batch_size=batch_size, + query_max_length=query_max_length, + max_length=max_length, + normalize=normalize, + prompt=prompt, + **kwargs + ) + + self.prompt = prompt + + self.tokenizer = AutoTokenizer.from_pretrained( + model_name_or_path, + cache_dir=cache_dir, + trust_remote_code=trust_remote_code + ) + + self.model = AutoModelForCausalLM.from_pretrained( + model_name_or_path, + cache_dir=cache_dir, + trust_remote_code=trust_remote_code, + torch_dtype=torch.bfloat16 if use_bf16 else torch.float32 + ) + if peft_path: + self.model = PeftModel.from_pretrained(self.model, peft_path) + self.model = self.model.merge_and_unload() + + self.yes_loc = self.tokenizer('Yes', add_special_tokens=False)['input_ids'][0] + + @torch.no_grad() + def compute_score_single_gpu( + self, + sentence_pairs: Union[List[Tuple[str, str]], Tuple[str, str]], + batch_size: Optional[int] = None, + query_max_length: Optional[int] = None, + max_length: Optional[int] = None, + prompt: Optional[str] = None, + normalize: Optional[bool] = None, + use_dataloader: bool = False, + num_workers: int = None, + device: Optional[str] = None, + **kwargs: Any + ) -> List[float]: + """Compute the relevance scores using a single GPU. + + Args: + sentence_pairs (Union[List[Tuple[str, str]], Tuple[str, str]]): Input sentence pairs to compute scores. + batch_size (Optional[int], optional): Number of inputs for each iter. Defaults to :data:`None`. + query_max_length (Optional[int], optional): Maximum length of tokens of queries. Defaults to :data:`None`. + max_length (Optional[int], optional): Maximum length of tokens. Defaults to :data:`None`. + prompt (Optional[str], optional): Prompt for the specific task. Defaults to :data:`None`. + normalize (Optional[bool], optional): If True, use Sigmoid to normalize the results. Defaults to :data:`None`. + use_dataloader (bool, optional): If True, will use the dataloader to load the datasets. Defaults to :data:`False`. + num_workers (int, optional): Number of workers for dataloader. Defaults to :data:`None`. + device (Optional[str], optional): Device to use for computation. Defaults to :data:`None`. + + Returns: + List[float]: The computed scores. 
+ """ + if prompt is None: prompt = self.prompt + if batch_size is None: batch_size = self.batch_size + if max_length is None: max_length = self.max_length + if query_max_length is None: + if self.query_max_length is not None: + query_max_length = self.query_max_length + else: + query_max_length = max_length * 3 // 4 + if normalize is None: normalize = self.normalize + + if device is None: + device = self.target_devices[0] + + if device == "cpu": self.use_fp16 = False + if self.use_fp16: self.model.half() + + self.model.to(device) + self.model.eval() + + assert isinstance(sentence_pairs, list) + if isinstance(sentence_pairs[0], str): + sentence_pairs = [sentence_pairs] + + # tokenize without padding to get the correct length + all_queries_inputs = [] + all_passages_inputs = [] + for start_index in trange(0, len(sentence_pairs), batch_size, desc="pre tokenize", + disable=len(sentence_pairs) < 128): + sentences_batch = sentence_pairs[start_index:start_index + batch_size] + queries = [s[0] for s in sentences_batch] + passages = [s[1] for s in sentences_batch] + queries_inputs_batch = self.tokenizer( + queries, + return_tensors=None, + add_special_tokens=False, + max_length=query_max_length, + truncation=True, + **kwargs + ) + passages_inputs_batch = self.tokenizer( + passages, + return_tensors=None, + add_special_tokens=False, + max_length=max_length, + truncation=True, + **kwargs + ) + queries_inputs_batch = [{ + k: queries_inputs_batch[k][i] for k in queries_inputs_batch.keys() + } for i in range(len(sentences_batch))] + passages_inputs_batch = [{ + k: passages_inputs_batch[k][i] for k in passages_inputs_batch.keys() + } for i in range(len(sentences_batch))] + + all_queries_inputs.extend(queries_inputs_batch) + all_passages_inputs.extend(passages_inputs_batch) + + # sort by length for less padding + length_sorted_idx = np.argsort([-len(x['input_ids']) - len(y['input_ids']) for (x, y) in zip(all_queries_inputs, all_passages_inputs)]) + all_queries_inputs_sorted = [all_queries_inputs[i] for i in length_sorted_idx] + all_passages_inputs_sorted = [all_passages_inputs[i] for i in length_sorted_idx] + + # other inputs + if prompt is None: + prompt = "Given a query A and a passage B, determine whether the passage contains an answer to the query by providing a prediction of either 'Yes' or 'No'." 
+ prompt_inputs = self.tokenizer( + prompt, + return_tensors=None, + add_special_tokens=False + )['input_ids'] + sep = "\n" + sep_inputs = self.tokenizer( + sep, + return_tensors=None, + add_special_tokens=False + )['input_ids'] + encode_max_length = max_length + len(sep_inputs) + len(prompt_inputs) + + # adjust batch size + flag = False + while flag is False: + try: + batch_inputs = [] + for query_inputs, passage_inputs in zip( + all_queries_inputs_sorted[:min(len(all_queries_inputs_sorted), batch_size)], + all_passages_inputs_sorted[:min(len(all_passages_inputs_sorted), batch_size)] + ): + if self.tokenizer.bos_token_id is not None and self.tokenizer.bos_token_id != self.tokenizer.pad_token_id: + item = self.tokenizer.prepare_for_model( + [self.tokenizer.bos_token_id] + query_inputs['input_ids'], + sep_inputs + passage_inputs['input_ids'], + truncation='only_second', + max_length=encode_max_length, + padding=False, + return_attention_mask=False, + return_token_type_ids=False, + add_special_tokens=False + ) + else: + item = self.tokenizer.prepare_for_model( + query_inputs['input_ids'], + sep_inputs + passage_inputs['input_ids'], + truncation='only_second', + max_length=encode_max_length, + padding=False, + return_attention_mask=False, + return_token_type_ids=False, + add_special_tokens=False + ) + item['input_ids'] = item['input_ids'] + sep_inputs + prompt_inputs + item['attention_mask'] = [1] * len(item['input_ids']) + item.pop('token_type_ids') if 'token_type_ids' in item.keys() else None + if 'position_ids' in item.keys(): + item['position_ids'] = list(range(len(item['input_ids']))) + batch_inputs.append(item) + + collater_instance = Collater(self.tokenizer, encode_max_length) + batch_inputs = collater_instance([{ + 'input_ids': item['input_ids'], + 'attention_mask': item['attention_mask'] + } for item in batch_inputs] + ) + + batch_inputs = {key: val.to(device) for key, val in batch_inputs.items()} + + self.model(**batch_inputs, output_hidden_states=True) + flag = True + except RuntimeError as e: + batch_size = batch_size * 3 // 4 + except torch.OutofMemoryError as e: + batch_size = batch_size * 3 // 4 + + dataset, dataloader = None, None + if use_dataloader: + if num_workers is None: + num_workers = min(batch_size, 16) + dataset = DatasetForReranker( + all_queries_inputs_sorted, + all_passages_inputs_sorted, + self.model_name_or_path, + max_length, + cache_dir=self.cache_dir, + prompt=prompt, + **kwargs + ) + dataloader = DataLoader( + dataset, shuffle=False, batch_size=batch_size, drop_last=False, + num_workers=num_workers, + collate_fn=Collater(self.tokenizer, encode_max_length) + ) + + all_scores = [] + if dataloader is not None: + for inputs in tqdm(dataloader): + inputs = inputs.to(device) + + outputs = self.model(**inputs, output_hidden_states=True) + logits = outputs.logits + scores = last_logit_pool(logits, inputs['attention_mask']) + scores = scores[:, self.yes_loc] + all_scores.extend(scores.cpu().float().tolist()) + else: + for batch_start in trange(0, len(all_queries_inputs_sorted), batch_size): + queries_inputs = all_queries_inputs_sorted[batch_start:batch_start+batch_size] + passages_inputs = all_passages_inputs_sorted[batch_start:batch_start+batch_size] + + batch_inputs = [] + for query_inputs, passage_inputs in zip(queries_inputs, passages_inputs): + if self.tokenizer.bos_token_id is not None and self.tokenizer.bos_token_id != self.tokenizer.pad_token_id: + item = self.tokenizer.prepare_for_model( + [self.tokenizer.bos_token_id] + query_inputs['input_ids'], + 
sep_inputs + passage_inputs['input_ids'], + truncation='only_second', + max_length=encode_max_length, + padding=False, + return_attention_mask=False, + return_token_type_ids=False, + add_special_tokens=False + ) + else: + item = self.tokenizer.prepare_for_model( + query_inputs['input_ids'], + sep_inputs + passage_inputs['input_ids'], + truncation='only_second', + max_length=encode_max_length, + padding=False, + return_attention_mask=False, + return_token_type_ids=False, + add_special_tokens=False + ) + item['input_ids'] = item['input_ids'] + sep_inputs + prompt_inputs + item['attention_mask'] = [1] * len(item['input_ids']) + item.pop('token_type_ids') if 'token_type_ids' in item.keys() else None + if 'position_ids' in item.keys(): + item['position_ids'] = list(range(len(item['input_ids']))) + batch_inputs.append(item) + + collater_instance = Collater(self.tokenizer, encode_max_length) + batch_inputs = collater_instance([{ + 'input_ids': item['input_ids'], + 'attention_mask': item['attention_mask'] + } for item in batch_inputs] + ) + + batch_inputs = {key: val.to(device) for key, val in batch_inputs.items()} + + outputs = self.model(**batch_inputs, output_hidden_states=True) + logits = outputs.logits + scores = last_logit_pool(logits, batch_inputs['attention_mask']) + scores = scores[:, self.yes_loc] + all_scores.extend(scores.cpu().float().tolist()) + + all_scores = [all_scores[idx] for idx in np.argsort(length_sorted_idx)] + + if normalize: + all_scores = [sigmoid(score) for score in all_scores] + + # if len(all_scores) == 1: + # return all_scores[0] + + return all_scores
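+# A minimal usage sketch for the class above (hedged: the checkpoint name and
+# constructor arguments are illustrative):
+#
+#     reranker = BaseLLMReranker("BAAI/bge-reranker-v2-gemma", use_fp16=True, devices=["cuda:0"])
+#     scores = reranker.compute_score_single_gpu(
+#         [("what is a panda?", "The giant panda is a bear native to China."),
+#          ("what is a panda?", "Paris is the capital of France.")],
+#         normalize=True,   # map raw scores through a sigmoid
+#     )
+#     # scores is a List[float] in the order of the input pairs
+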
\ No newline at end of file diff --git a/_modules/FlagEmbedding/inference/reranker/decoder_only/layerwise.html b/_modules/FlagEmbedding/inference/reranker/decoder_only/layerwise.html new file mode 100644 index 00000000..67938a17 --- /dev/null +++ b/_modules/FlagEmbedding/inference/reranker/decoder_only/layerwise.html @@ -0,0 +1,869 @@

Source code for FlagEmbedding.inference.reranker.decoder_only.layerwise

+import torch
+import warnings
+import numpy as np
+from tqdm import tqdm, trange
+from typing import Any, List, Union, Tuple, Optional
+from peft import PeftModel
+from torch import Tensor
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from torch.utils.data import DataLoader
+
+from FlagEmbedding.abc.inference import AbsReranker
+from FlagEmbedding.inference.reranker.encoder_only.base import sigmoid
+from FlagEmbedding.inference.reranker.decoder_only.base import DatasetForReranker, Collater
+
+from .models.modeling_minicpm_reranker import LayerWiseMiniCPMForCausalLM
+
+
+def last_logit_pool_layerwise(logits: Tensor,
+                              attention_mask: Tensor) -> Tensor:
+    """Pool the last logit.
+
+    Args:
+        logits (torch.Tensor): The output logits of the model.
+        attention_mask (torch.Tensor): Attention mask.
+
+    Returns:
+        torch.Tensor: The tensor after pooling.
+    """
+    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
+    if left_padding:
+        return logits[:, -1]
+    else:
+        sequence_lengths = attention_mask.sum(dim=1) - 1
+        batch_size = logits.shape[0]
+        return logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
+
+
+
+[docs] +class LayerWiseLLMReranker(AbsReranker): + """Base reranker class for layerwise LLM-like decoder-only models. + + Args: + model_name_or_path (str): If it's a path to a local model, it loads the model from the path. Otherwise tries to download and + load a model from HuggingFace Hub with the name. + peft_path (Optional[str], optional): Path to the PEFT config. Defaults to :data:`None`. + use_fp16 (bool, optional): If True, use half-precision floating-point to speed up computation with a slight performance + degradation. Defaults to :data:`False`. + use_bf16 (bool, optional): Another type of half-precision floating-point; you can use bf16 if the hardware supports it. + Defaults to :data:`False`. + query_instruction_for_rerank (str, optional): Query instruction for retrieval tasks, which will be used with + :attr:`query_instruction_format`. Defaults to :data:`"A: "`. + query_instruction_format (str, optional): The template for :attr:`query_instruction_for_rerank`. Defaults to :data:`"{}{}"`. + passage_instruction_for_rerank (str, optional): Passage instruction for retrieval tasks, which will be used with + :attr:`passage_instruction_format`. Defaults to :data:`"B: "`. + passage_instruction_format (str, optional): The template for the passage instruction. Defaults to :data:`"{}{}"`. + cache_dir (Optional[str], optional): Cache directory for the model. Defaults to :data:`None`. + trust_remote_code (bool, optional): Whether to trust remote code when loading the model. Defaults to :data:`False`. + devices (Union[str, List[str], List[int]], optional): Devices to use for model inference, such as ["cuda:0"] or ["0"]. + Defaults to :data:`None`. + cutoff_layers (Optional[List[int]]): Pick which layers are used for computing the score. Defaults to :data:`None`. + prompt (Optional[str], optional): Prompt for the specific task. Defaults to :data:`None`. + batch_size (int, optional): Batch size for inference. Defaults to :data:`128`. + query_max_length (int, optional): Maximum length for queries. If not specified, will be 3/4 of :attr:`max_length`. + Defaults to :data:`None`. + max_length (int, optional): Maximum length of passages. Defaults to :data:`512`. + normalize (bool, optional): If True, use sigmoid to normalize the results. Defaults to :data:`False`. 
+ """ + def __init__( + self, + model_name_or_path: str, + peft_path: Optional[str] = None, + use_fp16: bool = False, + use_bf16: bool = False, + query_instruction_for_rerank: str = "A: ", + query_instruction_format: str = "{}{}", # specify the format of query_instruction_for_rerank + passage_instruction_for_rerank: str = "B: ", + passage_instruction_format: str = "{}{}", # specify the format of passage_instruction_for_rerank + cache_dir: Optional[str] = None, + trust_remote_code: bool = False, + devices: Optional[Union[str, List[str], List[int]]] = None, # specify devices, such as ["cuda:0"] or ["0"] + # inference + cutoff_layers: Optional[List[int]] = None, + prompt: Optional[str] = None, + batch_size: int = 128, + query_max_length: Optional[int] = None, + max_length: int = 512, + normalize: bool = False, + **kwargs: Any, + ) -> None: + super().__init__( + model_name_or_path=model_name_or_path, + use_fp16=use_fp16, + query_instruction_for_rerank=query_instruction_for_rerank, + query_instruction_format=query_instruction_format, + passage_instruction_for_rerank=passage_instruction_for_rerank, + passage_instruction_format=passage_instruction_format, + devices=devices, + batch_size=batch_size, + query_max_length=query_max_length, + max_length=max_length, + normalize=normalize, + **kwargs + ) + + self.cutoff_layers = cutoff_layers + self.prompt = prompt + + self.tokenizer = AutoTokenizer.from_pretrained( + model_name_or_path, + cache_dir=cache_dir, + trust_remote_code=trust_remote_code + ) + + if use_bf16 is False and use_fp16 is False: + warnings.warn("Due to model constraints, `use_bf16` and `use_fp16` cannot both be `False`. Here, `use_fp16` is set to `True` by default.", UserWarning) + self.use_fp16 = True + + try: + self.model = LayerWiseMiniCPMForCausalLM.from_pretrained( + model_name_or_path, + cache_dir=cache_dir, + trust_remote_code=trust_remote_code, + torch_dtype=torch.bfloat16 if use_bf16 else torch.float32 + ) + except: + self.model = AutoModelForCausalLM.from_pretrained( + model_name_or_path, + cache_dir=cache_dir, + trust_remote_code=trust_remote_code, + torch_dtype=torch.bfloat16 if use_bf16 else torch.float32 + ) + if peft_path: + self.model = PeftModel.from_pretrained(self.model,peft_path) + self.model = self.model.merge_and_unload() + + @torch.no_grad() + def compute_score_single_gpu( + self, + sentence_pairs: Union[List[Tuple[str, str]], Tuple[str, str]], + batch_size: Optional[int] = None, + query_max_length: Optional[int] = None, + max_length: Optional[int] = None, + cutoff_layers: Optional[List[int]] = None, + prompt: Optional[str] = None, + normalize: Optional[bool] = None, + use_dataloader: bool = False, + num_workers: Optional[int] = None, + device: Optional[str] = None, + **kwargs: Any + ) -> List[float]: + """Compute the relevance scores using a single GPU. + + Args: + sentence_pairs (Union[List[Tuple[str, str]], Tuple[str, str]]): Input sentence pairs to compute scores. + batch_size (Optional[int], optional): Number of inputs for each iter. Defaults to :data:`None`. + query_max_length (Optional[int], optional): Maximum length of tokens of queries. Defaults to :data:`None`. + max_length (Optional[int], optional): Maximum length of tokens. Defaults to :data:`None`. + cutoff_layers (Optional[List[int]], optional): Pick which layers are used for computing the score. Defaults to :data:`None`. + prompt (Optional[str], optional): Prompt for the specific task. Defaults to :data:`None`. + normalize (Optional[bool], optional): If True, use Sigmoid to normalize the results. 
Defaults to :data:`None`. + use_dataloader (bool, optional): If True, will use the dataloader to load the datasets. Defaults to :data:`False`. + num_workers (int, optional): Number of workers for dataloader. Defaults to :data:`None`. + device (Optional[str], optional): Device to use for computation. Defaults to :data:`None`. + + Returns: + List[float]: The computed scores. + """ + if cutoff_layers is None: cutoff_layers = self.cutoff_layers + if prompt is None: prompt = self.prompt + if batch_size is None: batch_size = self.batch_size + if max_length is None: max_length = self.max_length + if query_max_length is None: + if self.query_max_length is not None: + query_max_length = self.query_max_length + else: + query_max_length = max_length * 3 // 4 + if normalize is None: normalize = self.normalize + + if device is None: + device = self.target_devices[0] + + if device == "cpu": self.use_fp16 = False + if self.use_fp16: self.model.half() + + self.model.to(device) + self.model.eval() + + assert isinstance(sentence_pairs, list) + if isinstance(sentence_pairs[0], str): + sentence_pairs = [sentence_pairs] + + # tokenize without padding to get the correct length + all_queries_inputs = [] + all_passages_inputs = [] + for start_index in trange(0, len(sentence_pairs), batch_size, desc="pre tokenize", + disable=len(sentence_pairs) < 128): + sentences_batch = sentence_pairs[start_index:start_index + batch_size] + queries = [s[0] for s in sentences_batch] + passages = [s[1] for s in sentences_batch] + queries_inputs_batch = self.tokenizer( + queries, + return_tensors=None, + add_special_tokens=False, + max_length=query_max_length, + truncation=True, + **kwargs + ) + passages_inputs_batch = self.tokenizer( + passages, + return_tensors=None, + add_special_tokens=False, + max_length=max_length, + truncation=True, + **kwargs + ) + queries_inputs_batch = [{ + k: queries_inputs_batch[k][i] for k in queries_inputs_batch.keys() + } for i in range(len(sentences_batch))] + passages_inputs_batch = [{ + k: passages_inputs_batch[k][i] for k in passages_inputs_batch.keys() + } for i in range(len(sentences_batch))] + + all_queries_inputs.extend(queries_inputs_batch) + all_passages_inputs.extend(passages_inputs_batch) + + # sort by length for less padding + length_sorted_idx = np.argsort([-len(x['input_ids']) - len(y['input_ids']) for (x, y) in zip(all_queries_inputs, all_passages_inputs)]) + all_queries_inputs_sorted = [all_queries_inputs[i] for i in length_sorted_idx] + all_passages_inputs_sorted = [all_passages_inputs[i] for i in length_sorted_idx] + + # other inputs + if prompt is None: + prompt = "Given a query A and a passage B, determine whether the passage contains an answer to the query by providing a prediction of either 'Yes' or 'No'." 
+ prompt_inputs = self.tokenizer( + prompt, + return_tensors=None, + add_special_tokens=False + )['input_ids'] + sep = "\n" + sep_inputs = self.tokenizer( + sep, + return_tensors=None, + add_special_tokens=False + )['input_ids'] + encode_max_length = max_length + len(sep_inputs) + len(prompt_inputs) + + # adjust batch size + flag = False + while flag is False: + try: + batch_inputs = [] + for query_inputs, passage_inputs in zip( + all_queries_inputs_sorted[:min(len(all_queries_inputs_sorted), batch_size)], + all_passages_inputs_sorted[:min(len(all_passages_inputs_sorted), batch_size)] + ): + item = self.tokenizer.prepare_for_model( + [self.tokenizer.bos_token_id] + query_inputs['input_ids'], + sep_inputs + passage_inputs['input_ids'], + truncation='only_second', + max_length=encode_max_length, + padding=False, + return_attention_mask=False, + return_token_type_ids=False, + add_special_tokens=False + ) + item['input_ids'] = item['input_ids'] + sep_inputs + prompt_inputs + item['attention_mask'] = [1] * len(item['input_ids']) + item.pop('token_type_ids') if 'token_type_ids' in item.keys() else None + if 'position_ids' in item.keys(): + item['position_ids'] = list(range(len(item['input_ids']))) + batch_inputs.append(item) + + collater_instance = Collater(self.tokenizer, encode_max_length) + batch_inputs = collater_instance([{ + 'input_ids': item['input_ids'], + 'attention_mask': item['attention_mask'] + } for item in batch_inputs] + ) + + batch_inputs = {key: val.to(device) for key, val in batch_inputs.items()} + + self.model(**batch_inputs, output_hidden_states=True, cutoff_layers=cutoff_layers) + flag = True + except RuntimeError as e: + batch_size = batch_size * 3 // 4 + except torch.OutofMemoryError as e: + batch_size = batch_size * 3 // 4 + + dataset, dataloader = None, None + if use_dataloader: + if num_workers is None: + num_workers = min(batch_size, 16) + dataset = DatasetForReranker( + all_queries_inputs_sorted, + all_passages_inputs_sorted, + self.model_name_or_path, + max_length, + cache_dir=self.cache_dir, + prompt=prompt, + **kwargs + ) + dataloader = DataLoader( + dataset, shuffle=False, batch_size=batch_size, drop_last=False, + num_workers=num_workers, + collate_fn=Collater(self.tokenizer, encode_max_length) + ) + + all_scores = [] + if dataloader is not None: + for inputs in tqdm(dataloader): + inputs = inputs.to(device) + + outputs = self.model(**inputs, output_hidden_states=True, cutoff_layers=cutoff_layers) + all_logits = outputs.logits + tmp_all_scores = [] + for logits in all_logits: + scores = last_logit_pool_layerwise(logits, inputs['attention_mask']) + tmp_all_scores.append(scores.contiguous()) + + if len(all_scores) == 0: + for _ in range(len(tmp_all_scores)): + all_scores.append([]) + + for i in range(len(tmp_all_scores)): + all_scores[i].extend(tmp_all_scores[i].cpu().float().tolist()) + else: + for batch_start in trange(0, len(all_queries_inputs_sorted), batch_size): + queries_inputs = all_queries_inputs_sorted[batch_start:batch_start+batch_size] + passages_inputs = all_passages_inputs_sorted[batch_start:batch_start+batch_size] + + batch_inputs = [] + for query_inputs, passage_inputs in zip(queries_inputs, passages_inputs): + item = self.tokenizer.prepare_for_model( + [self.tokenizer.bos_token_id] + query_inputs['input_ids'], + sep_inputs + passage_inputs['input_ids'], + truncation='only_second', + max_length=encode_max_length, + padding=False, + return_attention_mask=False, + return_token_type_ids=False, + add_special_tokens=False + ) + item['input_ids'] = 
item['input_ids'] + sep_inputs + prompt_inputs + item['attention_mask'] = [1] * len(item['input_ids']) + item.pop('token_type_ids') if 'token_type_ids' in item.keys() else None + if 'position_ids' in item.keys(): + item['position_ids'] = list(range(len(item['input_ids']))) + batch_inputs.append(item) + + collater_instance = Collater(self.tokenizer, encode_max_length) + batch_inputs = collater_instance([{ + 'input_ids': item['input_ids'], + 'attention_mask': item['attention_mask'] + } for item in batch_inputs] + ) + + batch_inputs = {key: val.to(device) for key, val in batch_inputs.items()} + + outputs = self.model(**batch_inputs, output_hidden_states=True, cutoff_layers=cutoff_layers) + all_logits = outputs.logits + tmp_all_scores = [] + for logits in all_logits: + scores = last_logit_pool_layerwise(logits, batch_inputs['attention_mask']) + tmp_all_scores.append(scores.contiguous()) + + if len(all_scores) == 0: + for _ in range(len(tmp_all_scores)): + all_scores.append([]) + + for i in range(len(tmp_all_scores)): + all_scores[i].extend(tmp_all_scores[i].cpu().float().tolist()) + + for i in range(len(all_scores)): + all_scores[i] = [all_scores[i][idx] for idx in np.argsort(length_sorted_idx)] + if normalize: + all_scores[i] = [sigmoid(score) for score in all_scores[i]] + + if isinstance(all_scores[0], list): + all_scores = all_scores[0] + + return all_scores
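+# A minimal usage sketch for the class above (hedged: the checkpoint name and
+# arguments are illustrative):
+#
+#     reranker = LayerWiseLLMReranker("BAAI/bge-reranker-v2-minicpm-layerwise", use_bf16=True)
+#     scores = reranker.compute_score_single_gpu(
+#         [("query A", "passage B")],
+#         cutoff_layers=[28],   # score using this layer's logits
+#     )
+#     # one score per pair; when several cutoff layers are requested, this method
+#     # returns the scores of the first of them
+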
\ No newline at end of file diff --git a/_modules/FlagEmbedding/inference/reranker/decoder_only/lightweight.html b/_modules/FlagEmbedding/inference/reranker/decoder_only/lightweight.html new file mode 100644 index 00000000..b97d7dd6 --- /dev/null +++ b/_modules/FlagEmbedding/inference/reranker/decoder_only/lightweight.html @@ -0,0 +1,930 @@

Source code for FlagEmbedding.inference.reranker.decoder_only.lightweight

+import torch
+import warnings
+import numpy as np
+from tqdm import trange
+from typing import Any, List, Union, Tuple, Optional
+from peft import PeftModel
+from torch import Tensor
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from FlagEmbedding.abc.inference import AbsReranker
+from FlagEmbedding.inference.reranker.encoder_only.base import sigmoid
+
+from .models.gemma_model import CostWiseGemmaForCausalLM
+
+
+def last_logit_pool_lightweight(logits: Tensor,
+                    attention_mask: Tensor) -> Tensor:
+    """Pool the last logit.
+
+    Args:
+        logits (torch.Tensor): The output logits of the model.
+        attention_mask (torch.Tensor): Attention mask.
+
+    Returns:
+        torch.Tensor: The tensor after pooling.
+    """
+    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
+    if left_padding:
+        return logits[:, -1]
+    else:
+        sequence_lengths = attention_mask.sum(dim=1) - 1
+        batch_size = logits.shape[0]
+        return torch.stack([logits[i, sequence_lengths[i]] for i in range(batch_size)], dim=0)
+
+
+class Collater_for_lightweight:
+    """
+    Collator of the lightweight LLM reranker.
+    
+    Args:
+        tokenizer (transformers.AutoTokenizer): The tokenizer for reranker.
+        max_len (int): Maximum length of tokens.
+    """
+    def __init__(self, tokenizer, max_len):
+        self.tokenizer = tokenizer
+        self.max_len = max_len
+        self.pad_to_multiple_of = 8
+        self.label_pad_token_id = -100
+        warnings.filterwarnings("ignore",
+                                message="`max_length` is ignored when `padding`=`True` and there is no truncation strategy.")
+
+    def __call__(self, data):
+        features = data[0]
+        query_lengths = data[1]
+        prompt_lengths = data[2]
+
+        labels = [feature["labels"] for feature in features] if "labels" in features[0].keys() else None
+        # We have to pad the labels before calling `tokenizer.pad` as this method won't pad them and needs them of the
+        # same length to return tensors.
+        if labels is not None:
+            max_label_length = max(len(l) for l in labels)
+            if self.pad_to_multiple_of is not None:
+                max_label_length = (
+                        (max_label_length + self.pad_to_multiple_of - 1)
+                        // self.pad_to_multiple_of
+                        * self.pad_to_multiple_of
+                )
+
+            padding_side = self.tokenizer.padding_side
+            for feature in features:
+                remainder = [self.label_pad_token_id] * (max_label_length - len(feature["labels"]))
+                if isinstance(feature["labels"], list):
+                    feature["labels"] = (
+                        feature["labels"] + remainder if padding_side == "right" else remainder + feature["labels"]
+                    )
+                elif padding_side == "right":
+                    feature["labels"] = np.concatenate([feature["labels"], remainder]).astype(np.int64)
+                else:
+                    feature["labels"] = np.concatenate([remainder, feature["labels"]]).astype(np.int64)
+
+        collected = self.tokenizer.pad(
+            features,
+            padding=True,
+            pad_to_multiple_of=8,
+            return_tensors='pt',
+        )
+
+        return collected, query_lengths, prompt_lengths
+
+
+
+[docs] +class LightweightLLMReranker(AbsReranker): + """Base reranker class for lightweight LLM-like decoder-only models. + + Args: + model_name_or_path (str): If it's a path to a local model, it loads the model from the path. Otherwise tries to download and + load a model from HuggingFace Hub with the name. + peft_path (Optional[str], optional): Path to the PEFT config. Defaults to :data:`None`. + use_fp16 (bool, optional): If True, use half-precision floating-point to speed up computation with a slight performance + degradation. Defaults to :data:`False`. + use_bf16 (bool, optional): Another type of half-precision floating-point; you can use bf16 if the hardware supports it. + Defaults to :data:`False`. + query_instruction_for_rerank (str, optional): Query instruction for retrieval tasks, which will be used with + :attr:`query_instruction_format`. Defaults to :data:`"A: "`. + query_instruction_format (str, optional): The template for :attr:`query_instruction_for_rerank`. Defaults to :data:`"{}{}"`. + passage_instruction_for_rerank (str, optional): Passage instruction for retrieval tasks, which will be used with + :attr:`passage_instruction_format`. Defaults to :data:`"B: "`. + passage_instruction_format (str, optional): The template for the passage instruction. Defaults to :data:`"{}{}"`. + cache_dir (Optional[str], optional): Cache directory for the model. Defaults to :data:`None`. + trust_remote_code (bool, optional): Whether to trust remote code when loading the model. Defaults to :data:`False`. + devices (Union[str, List[str], List[int]], optional): Devices to use for model inference, such as ["cuda:0"] or ["0"]. + Defaults to :data:`None`. + cutoff_layers (Optional[List[int]]): Pick which layers are used for computing the score. Defaults to :data:`None`. + compress_layers (List[int], optional): Choose the layers to compress. Defaults to :data:`[8]`. + compress_ratio (int, optional): Ratio to compress the selected layers, supported ratios: :data:`[1, 2, 4, 8]`. + Defaults to :data:`1`. + prompt (Optional[str], optional): Prompt for the specific task. Defaults to :data:`None`. + batch_size (int, optional): Batch size for inference. Defaults to :data:`128`. + query_max_length (int, optional): Maximum length for queries. If not specified, will be 3/4 of :attr:`max_length`. + Defaults to :data:`None`. + max_length (int, optional): Maximum length of passages. Defaults to :data:`512`. + normalize (bool, optional): If True, use sigmoid to normalize the results. Defaults to :data:`False`. 
+ """ + def __init__( + self, + model_name_or_path: str, + peft_path: Optional[str] = None, + use_fp16: bool = False, + use_bf16: bool = False, + query_instruction_for_rerank: str = "A: ", + query_instruction_format: str = "{}{}", # specify the format of query_instruction_for_rerank + passage_instruction_for_rerank: str = "B: ", + passage_instruction_format: str = "{}{}", # specify the format of passage_instruction_for_rerank + cache_dir: Optional[str] = None, + trust_remote_code: bool = False, + devices: Union[str, List[str], List[int]] = None, # specify devices, such as ["cuda:0"] or ["0"] + # inference + cutoff_layers: Optional[List[int]] = None, + compress_layers: List[int] = [8], + compress_ratio: int = 1, + prompt: Optional[str] = None, + batch_size: int = 128, + query_max_length: Optional[int] = None, + max_length: int = 512, + normalize: bool = False, + **kwargs: Any, + ) -> None: + + super().__init__( + model_name_or_path=model_name_or_path, + use_fp16=use_fp16, + query_instruction_for_rerank=query_instruction_for_rerank, + query_instruction_format=query_instruction_format, + passage_instruction_for_rerank=passage_instruction_for_rerank, + passage_instruction_format=passage_instruction_format, + devices=devices, + batch_size=batch_size, + query_max_length=query_max_length, + max_length=max_length, + normalize=normalize, + **kwargs + ) + + self.cutoff_layers = cutoff_layers + self.compress_layers = compress_layers + self.compress_ratio = compress_ratio + self.prompt = prompt + + self.tokenizer = AutoTokenizer.from_pretrained( + model_name_or_path, + cache_dir=cache_dir, + trust_remote_code=trust_remote_code + ) + self.tokenizer.padding_side = 'right' + + if use_bf16 is False and use_fp16 is False: + warnings.warn("Due to model constraints, `use_bf16` and `use_fp16` cannot both be `False`. Here, `use_fp16` is set to `True` by default.", UserWarning) + use_fp16 = True + + try: + self.model = CostWiseGemmaForCausalLM.from_pretrained( + model_name_or_path, + cache_dir=cache_dir, + trust_remote_code=trust_remote_code, + torch_dtype=torch.bfloat16 if use_bf16 else torch.float32 + ) + except: + self.model = AutoModelForCausalLM.from_pretrained( + model_name_or_path, + cache_dir=cache_dir, + trust_remote_code=trust_remote_code, + torch_dtype=torch.bfloat16 if use_bf16 else torch.float32 + ) + if peft_path: + self.model = PeftModel.from_pretrained(self.model,peft_path) + self.model = self.model.merge_and_unload() + + @torch.no_grad() + def compute_score_single_gpu( + self, + sentence_pairs: Union[List[Tuple[str, str]], Tuple[str, str]], + batch_size: Optional[int] = None, + query_max_length: Optional[int] = None, + max_length: Optional[int] = None, + cutoff_layers: Optional[List[int]] = None, + compress_layer: Optional[List[int]] = None, + compress_layers: Optional[List[int]] = None, + compress_ratio: Optional[int] = None, + prompt: Optional[str] = None, + normalize: Optional[bool] = None, + device: Optional[str] = None, + **kwargs: Any + ) -> List[float]: + """Compute the relevance scores using a single GPU. + + Args: + sentence_pairs (Union[List[Tuple[str, str]], Tuple[str, str]]): Input sentence pairs to compute scores. + batch_size (Optional[int], optional): Number of inputs for each iter. Defaults to :data:`None`. + query_max_length (Optional[int], optional): Maximum length of tokens of queries. Defaults to :data:`None`. + max_length (Optional[int], optional): Maximum length of tokens. Defaults to :data:`None`. 
+ cutoff_layers (Optional[List[int]], optional): Pick which layers are used for computing the score. Defaults to :data:`None`. + compress_layer (Optional[List[int]]): Deprecated, use :attr:`compress_layers` instead. Defaults to :data:`None`. + compress_layers (Optional[List[int]]): Selected layers to compress. Defaults to :data:`None`. + compress_ratio (Optional[int]): Ratio to compress the selected layers, supported ratios: :data:`[1, 2, 4, 8]`. + Defaults to :data:`None`. + prompt (Optional[str], optional): Prompt for the specific task. Defaults to :data:`None`. + normalize (Optional[bool], optional): If True, use Sigmoid to normalize the results. Defaults to :data:`None`. + device (Optional[str], optional): Device to use for computation. Defaults to :data:`None`. + + Returns: + List[float]: The computed scores. + """ + + if cutoff_layers is None: cutoff_layers = self.cutoff_layers + if compress_layers is None: compress_layers = self.compress_layers + if compress_layer is not None: + print('Try not to use the parameter `compress_layer`; use `compress_layers` instead.') + compress_layers = compress_layer + if compress_ratio is None: compress_ratio = self.compress_ratio + if prompt is None: prompt = self.prompt + if batch_size is None: batch_size = self.batch_size + if max_length is None: max_length = self.max_length + if query_max_length is None: + if self.query_max_length is not None: + query_max_length = self.query_max_length + else: + query_max_length = max_length * 3 // 4 + if normalize is None: normalize = self.normalize + + if device is None: + device = self.target_devices[0] + + if device == "cpu": self.use_fp16 = False + if self.use_fp16: self.model.half() + + self.model.to(device) + self.model.eval() + + assert isinstance(sentence_pairs, list) + if isinstance(sentence_pairs[0], str): + sentence_pairs = [sentence_pairs] + + # tokenize without padding to get the correct length + all_queries_inputs = [] + all_passages_inputs = [] + for start_index in trange(0, len(sentence_pairs), batch_size, desc="pre tokenize", + disable=len(sentence_pairs) < 128): + sentences_batch = sentence_pairs[start_index:start_index + batch_size] + queries = [s[0] for s in sentences_batch] + passages = [s[1] for s in sentences_batch] + queries_inputs_batch = self.tokenizer( + queries, + return_tensors=None, + add_special_tokens=False, + max_length=query_max_length, + truncation=True, + **kwargs + ) + passages_inputs_batch = self.tokenizer( + passages, + return_tensors=None, + add_special_tokens=False, + max_length=max_length, + truncation=True, + **kwargs + ) + queries_inputs_batch = [{ + k: queries_inputs_batch[k][i] for k in queries_inputs_batch.keys() + } for i in range(len(sentences_batch))] + passages_inputs_batch = [{ + k: passages_inputs_batch[k][i] for k in passages_inputs_batch.keys() + } for i in range(len(sentences_batch))] + + all_queries_inputs.extend(queries_inputs_batch) + all_passages_inputs.extend(passages_inputs_batch) + + # sort by length for less padding + length_sorted_idx = np.argsort([-len(x['input_ids']) - len(y['input_ids']) for (x, y) in zip(all_queries_inputs, all_passages_inputs)]) + all_queries_inputs_sorted = [all_queries_inputs[i] for i in length_sorted_idx] + all_passages_inputs_sorted = [all_passages_inputs[i] for i in length_sorted_idx] + + # other inputs + if prompt is None: + prompt = "Predict whether passage B contains an answer to query A." 
+ prompt_inputs = self.tokenizer( + prompt, + return_tensors=None, + add_special_tokens=False + )['input_ids'] + sep = "\n" + sep_inputs = self.tokenizer( + sep, + return_tensors=None, + add_special_tokens=False + )['input_ids'] + encode_max_length = max_length + len(sep_inputs) + len(prompt_inputs) + + # adjust batch size + flag = False + while flag is False: + try: + batch_inputs = [] + query_lengths = [] + prompt_lengths = [] + for query_inputs, passage_inputs in zip( + all_queries_inputs_sorted[:min(len(all_queries_inputs_sorted), batch_size)], + all_passages_inputs_sorted[:min(len(all_passages_inputs_sorted), batch_size)] + ): + item = self.tokenizer.prepare_for_model( + [self.tokenizer.bos_token_id] + query_inputs['input_ids'], + sep_inputs + passage_inputs['input_ids'], + truncation='only_second', + max_length=encode_max_length, + padding=False, + return_attention_mask=False, + return_token_type_ids=False, + add_special_tokens=False + ) + item['input_ids'] = item['input_ids'] + sep_inputs + prompt_inputs + item['attention_mask'] = [1] * len(item['input_ids']) + item.pop('token_type_ids') if 'token_type_ids' in item.keys() else None + if 'position_ids' in item.keys(): + item['position_ids'] = list(range(len(item['input_ids']))) + batch_inputs.append(item) + query_lengths.append(len([self.tokenizer.bos_token_id] + query_inputs['input_ids'] + sep_inputs)) + prompt_lengths.append(len(sep_inputs + prompt_inputs)) + + collater_instance = Collater_for_lightweight(self.tokenizer, max_length) + batch_inputs = collater_instance([ + [{ + 'input_ids': item['input_ids'], + 'attention_mask': item['attention_mask'] + } for item in batch_inputs], + query_lengths, + prompt_lengths + ])[0] + + batch_inputs = {key: val.to(device) for key, val in batch_inputs.items()} + + self.model( + **batch_inputs, + output_hidden_states=True, + compress_layer=compress_layers, + compress_ratio=compress_ratio, + query_lengths=query_lengths, + prompt_lengths=prompt_lengths, + cutoff_layers=cutoff_layers + ) + flag = True + except RuntimeError as e: + batch_size = batch_size * 3 // 4 + except torch.OutofMemoryError as e: + batch_size = batch_size * 3 // 4 + + all_scores = [] + for batch_start in trange(0, len(all_queries_inputs_sorted), batch_size): + queries_inputs = all_queries_inputs_sorted[batch_start:batch_start+batch_size] + passages_inputs = all_passages_inputs_sorted[batch_start:batch_start+batch_size] + + batch_inputs = [] + query_lengths = [] + prompt_lengths = [] + for query_inputs, passage_inputs in zip(queries_inputs, passages_inputs): + item = self.tokenizer.prepare_for_model( + [self.tokenizer.bos_token_id] + query_inputs['input_ids'], + sep_inputs + passage_inputs['input_ids'], + truncation='only_second', + max_length=encode_max_length, + padding=False, + return_attention_mask=False, + return_token_type_ids=False, + add_special_tokens=False + ) + item['input_ids'] = item['input_ids'] + sep_inputs + prompt_inputs + item['attention_mask'] = [1] * len(item['input_ids']) + item.pop('token_type_ids') if 'token_type_ids' in item.keys() else None + if 'position_ids' in item.keys(): + item['position_ids'] = list(range(len(item['input_ids']))) + batch_inputs.append(item) + query_lengths.append(len([self.tokenizer.bos_token_id] + query_inputs['input_ids'] + sep_inputs)) + prompt_lengths.append(len(sep_inputs + prompt_inputs)) + + collater_instance = Collater_for_lightweight(self.tokenizer, max_length) + batch_inputs = collater_instance([ + [{ + 'input_ids': item['input_ids'], + 'attention_mask': 
item['attention_mask'] + } for item in batch_inputs], + query_lengths, + prompt_lengths + ])[0] + + batch_inputs = {key: val.to(device) for key, val in batch_inputs.items()} + + outputs = self.model( + **batch_inputs, + output_hidden_states=True, + compress_layer=compress_layers, + compress_ratio=compress_ratio, + query_lengths=query_lengths, + prompt_lengths=prompt_lengths, + cutoff_layers=cutoff_layers + ) + scores = [] + for i in range(len(outputs.logits)): + logits = last_logit_pool_lightweight(outputs.logits[i], outputs.attention_masks[i]) + scores.append(logits.cpu().float().tolist()) + if len(all_scores) == 0: + for i in range(len(scores)): + all_scores.append([]) + for i in range(len(scores)): + all_scores[i].extend(scores[i]) + + for i in range(len(all_scores)): + all_scores[i] = [all_scores[i][idx] for idx in np.argsort(length_sorted_idx)] + if normalize: + all_scores[i] = [sigmoid(score) for score in all_scores[i]] + + if isinstance(all_scores[0], list): + all_scores = all_scores[0] + + return all_scores
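
A minimal usage sketch for the class above. The import path, checkpoint name, and layer settings here are illustrative assumptions, not something this page confirms:

    # Hedged example: import path and model id are assumptions for illustration.
    from FlagEmbedding.inference.reranker.decoder_only.lightweight import LightweightLLMReranker  # hypothetical path

    reranker = LightweightLLMReranker(
        "BAAI/bge-reranker-v2.5-gemma2-lightweight",  # example checkpoint (assumed)
        use_bf16=True,            # fp16/bf16: one of them must be enabled (see the __init__ warning)
        cutoff_layers=[28],       # score from this layer only (illustrative choice)
        compress_ratio=2,         # compress the selected layers 2x (supported: 1, 2, 4, 8)
        devices=["cuda:0"],
    )

    pairs = [
        ("what is panda?", "The giant panda is a bear species endemic to China."),
        ("what is panda?", "Paris is the capital of France."),
    ]
    scores = reranker.compute_score_single_gpu(pairs, normalize=True)  # sigmoid-normalized relevance scores
    print(scores)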
\ No newline at end of file diff --git a/_modules/FlagEmbedding/inference/reranker/encoder_only/base.html new file mode 100644 index 00000000..bc7be921 --- /dev/null +++ b/_modules/FlagEmbedding/inference/reranker/encoder_only/base.html @@ -0,0 +1,683 @@ + FlagEmbedding.inference.reranker.encoder_only.base - FlagEmbedding

Source code for FlagEmbedding.inference.reranker.encoder_only.base

+import torch
+import numpy as np
+from tqdm import tqdm, trange
+from typing import Any, List, Union, Tuple, Optional
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+
+from FlagEmbedding.abc.inference import AbsReranker
+
+
+def sigmoid(x):
+    return float(1 / (1 + np.exp(-x)))
+
+
+
+[docs] +class BaseReranker(AbsReranker): + """Base reranker class for encoder only models. + + Args: + model_name_or_path (str): If it's a path to a local model, it loads the model from the path. Otherwise tries to download and + load a model from HuggingFace Hub with the name. + use_fp16 (bool, optional): If true, use half-precision floating-point to speed up computation with a slight performance + degradation. Defaults to :data:`False`. + query_instruction_for_rerank (Optional[str], optional): Query instruction for retrieval tasks, which will be used with + with :attr:`query_instruction_format`. Defaults to :data:`None`. + query_instruction_format (str, optional): The template for :attr:`query_instruction_for_rerank`. Defaults to :data:`"{}{}"`. + passage_instruction_format (str, optional): The template for passage. Defaults to "{}{}". + cache_dir (Optional[str], optional): Cache directory for the model. Defaults to :data:`None`. + devices (Optional[Union[str, List[str], List[int]]], optional): Devices to use for model inference. Defaults to :data:`None`. + batch_size (int, optional): Batch size for inference. Defaults to :data:`128`. + query_max_length (Optional[int], optional): Maximum length for queries. If not specified, will be 3/4 of :attr:`max_length`. + Defaults to :data:`None`. + max_length (int, optional): Maximum length of passages. Defaults to :data`512`. + normalize (bool, optional): If True, use Sigmoid to normalize the results. Defaults to :data:`False`. + """ + def __init__( + self, + model_name_or_path: str, + use_fp16: bool = False, + query_instruction_for_rerank: Optional[str] = None, + query_instruction_format: str = "{}{}", # specify the format of query_instruction_for_rerank + passage_instruction_for_rerank: Optional[str] = None, + passage_instruction_format: str = "{}{}", # specify the format of passage_instruction_for_rerank + trust_remote_code: bool = False, + cache_dir: Optional[str] = None, + devices: Optional[Union[str, List[str], List[int]]] = None, # specify devices, such as ["cuda:0"] or ["0"] + # inference + batch_size: int = 128, + query_max_length: Optional[int] = None, + max_length: int = 512, + normalize: bool = False, + **kwargs: Any, + ): + super().__init__( + model_name_or_path=model_name_or_path, + use_fp16=use_fp16, + query_instruction_for_rerank=query_instruction_for_rerank, + query_instruction_format=query_instruction_format, + passage_instruction_for_rerank=passage_instruction_for_rerank, + passage_instruction_format=passage_instruction_format, + devices=devices, + batch_size=batch_size, + query_max_length=query_max_length, + max_length=max_length, + normalize=normalize, + **kwargs + ) + self.tokenizer = AutoTokenizer.from_pretrained( + model_name_or_path, + cache_dir=cache_dir + ) + self.model = AutoModelForSequenceClassification.from_pretrained( + model_name_or_path, + trust_remote_code=trust_remote_code, + cache_dir=cache_dir + ) + + @torch.no_grad() + def compute_score_single_gpu( + self, + sentence_pairs: Union[List[Tuple[str, str]], Tuple[str, str]], + batch_size: Optional[int] = None, + query_max_length: Optional[int] = None, + max_length: Optional[int] = None, + normalize: Optional[bool] = None, + device: Optional[str] = None, + **kwargs: Any + ) -> List[float]: + """_summary_ + + Args: + sentence_pairs (Union[List[Tuple[str, str]], Tuple[str, str]]): Input sentence pairs to compute scores. + batch_size (Optional[int], optional): Number of inputs for each iter. Defaults to :data:`None`. 
+ query_max_length (Optional[int], optional): Maximum length of tokens of queries. Defaults to :data:`None`. + max_length (Optional[int], optional): Maximum length of tokens. Defaults to :data:`None`. + normalize (Optional[bool], optional): If True, use Sigmoid to normalize the results. Defaults to :data:`None`. + device (Optional[str], optional): Device to use for computation. Defaults to :data:`None`. + + Returns: + List[float]: Computed scores of queries and passages. + """ + if batch_size is None: batch_size = self.batch_size + if max_length is None: max_length = self.max_length + if query_max_length is None: + if self.query_max_length is not None: + query_max_length = self.query_max_length + else: + query_max_length = max_length * 3 // 4 + if normalize is None: normalize = self.normalize + + if device is None: + device = self.target_devices[0] + + if device == "cpu": self.use_fp16 = False + if self.use_fp16: self.model.half() + + self.model.to(device) + self.model.eval() + + assert isinstance(sentence_pairs, list) + if isinstance(sentence_pairs[0], str): + sentence_pairs = [sentence_pairs] + + # tokenize without padding to get the correct length + all_inputs = [] + for start_index in trange(0, len(sentence_pairs), batch_size, desc="pre tokenize", + disable=len(sentence_pairs) < 128): + sentences_batch = sentence_pairs[start_index:start_index + batch_size] + queries = [s[0] for s in sentences_batch] + passages = [s[1] for s in sentences_batch] + queries_inputs_batch = self.tokenizer( + queries, + return_tensors=None, + add_special_tokens=False, + max_length=query_max_length, + truncation=True, + **kwargs + )['input_ids'] + passages_inputs_batch = self.tokenizer( + passages, + return_tensors=None, + add_special_tokens=False, + max_length=max_length, + truncation=True, + **kwargs + )['input_ids'] + for q_inp, d_inp in zip(queries_inputs_batch, passages_inputs_batch): + item = self.tokenizer.prepare_for_model( + q_inp, + d_inp, + truncation='only_second', + max_length=max_length, + padding=False, + ) + all_inputs.append(item) + # sort by length for less padding + length_sorted_idx = np.argsort([-len(x['input_ids']) for x in all_inputs]) + all_inputs_sorted = [all_inputs[i] for i in length_sorted_idx] + + # adjust batch size + flag = False + while flag is False: + try: + test_inputs_batch = self.tokenizer.pad( + all_inputs_sorted[:min(len(all_inputs_sorted), batch_size)], + padding=True, + return_tensors='pt', + **kwargs + ).to(device) + scores = self.model(**test_inputs_batch, return_dict=True).logits.view(-1, ).float() + flag = True + except RuntimeError as e: + batch_size = batch_size * 3 // 4 + except torch.OutofMemoryError as e: + batch_size = batch_size * 3 // 4 + + all_scores = [] + for start_index in tqdm(range(0, len(all_inputs_sorted), batch_size), desc="Compute Scores", + disable=len(all_inputs_sorted) < 128): + sentences_batch = all_inputs_sorted[start_index:start_index + batch_size] + inputs = self.tokenizer.pad( + sentences_batch, + padding=True, + return_tensors='pt', + **kwargs + ).to(device) + + scores = self.model(**inputs, return_dict=True).logits.view(-1, ).float() + all_scores.extend(scores.cpu().numpy().tolist()) + + all_scores = [all_scores[idx] for idx in np.argsort(length_sorted_idx)] + + if normalize: + all_scores = [sigmoid(score) for score in all_scores] + + return all_scores
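
A minimal usage sketch for the encoder-only reranker above, using the module path shown on this page; the checkpoint name is an assumed example:

    # Hedged example: the checkpoint is an assumed example; the import path follows this module's name.
    from FlagEmbedding.inference.reranker.encoder_only.base import BaseReranker

    reranker = BaseReranker(
        "BAAI/bge-reranker-base",  # example cross-encoder checkpoint (assumed)
        use_fp16=True,
        devices=["cuda:0"],
    )

    pairs = [
        ("what is panda?", "The giant panda is a bear species endemic to China."),
        ("what is panda?", "Paris is the capital of France."),
    ]
    scores = reranker.compute_score_single_gpu(pairs, normalize=True)  # sigmoid maps raw logits into [0, 1]
    print(scores)  # the first pair should score higher than the second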
\ No newline at end of file diff --git a/_modules/index.html new file mode 100644 index 00000000..3cb5f3b9 --- /dev/null +++ b/_modules/index.html @@ -0,0 +1,535 @@ + Overview: module code - FlagEmbedding

All modules for which code is available

+ +
+ + + + + \ No newline at end of file diff --git a/_sources/API/abc.rst.txt b/_sources/API/abc.rst.txt new file mode 100644 index 00000000..4da03d98 --- /dev/null +++ b/_sources/API/abc.rst.txt @@ -0,0 +1,6 @@ +Abstract Class +============== + +.. toctree:: + abc/inference + abc/finetune \ No newline at end of file diff --git a/_sources/API/abc/evaluation.rst.txt b/_sources/API/abc/evaluation.rst.txt new file mode 100644 index 00000000..40896911 --- /dev/null +++ b/_sources/API/abc/evaluation.rst.txt @@ -0,0 +1,9 @@ +Evaluation +========== + +.. toctree:: + evaluation/arguments + evaluation/data_loader + evaluation/searcher + evaluation/evaluator + evaluation/runner \ No newline at end of file diff --git a/_sources/API/abc/evaluation/arguments.rst.txt b/_sources/API/abc/evaluation/arguments.rst.txt new file mode 100644 index 00000000..287e601e --- /dev/null +++ b/_sources/API/abc/evaluation/arguments.rst.txt @@ -0,0 +1,7 @@ +Arguments +========= + +.. autoclass:: FlagEmbedding.abc.evaluation.AbsEvalArgs + + +.. autoclass:: FlagEmbedding.abc.evaluation.AbsEvalModelArgs \ No newline at end of file diff --git a/_sources/API/abc/evaluation/data_loader.rst.txt b/_sources/API/abc/evaluation/data_loader.rst.txt new file mode 100644 index 00000000..38c3d8b2 --- /dev/null +++ b/_sources/API/abc/evaluation/data_loader.rst.txt @@ -0,0 +1,25 @@ +dataset loader +============== + +.. autoclass:: FlagEmbedding.abc.evaluation.AbsEvalDataLoader + +Methods +------- + +.. automethod:: FlagEmbedding.abc.evaluation.AbsEvalDataLoader.available_dataset_names +.. automethod:: FlagEmbedding.abc.evaluation.AbsEvalDataLoader.available_splits +.. automethod:: FlagEmbedding.abc.evaluation.AbsEvalDataLoader.check_dataset_names +.. automethod:: FlagEmbedding.abc.evaluation.AbsEvalDataLoader.check_splits +.. automethod:: FlagEmbedding.abc.evaluation.AbsEvalDataLoader.load_corpus +.. automethod:: FlagEmbedding.abc.evaluation.AbsEvalDataLoader.load_qrels +.. automethod:: FlagEmbedding.abc.evaluation.AbsEvalDataLoader.load_queries +.. automethod:: FlagEmbedding.abc.evaluation.AbsEvalDataLoader._load_remote_corpus +.. automethod:: FlagEmbedding.abc.evaluation.AbsEvalDataLoader._load_remote_qrels +.. automethod:: FlagEmbedding.abc.evaluation.AbsEvalDataLoader._load_remote_queries +.. automethod:: FlagEmbedding.abc.evaluation.AbsEvalDataLoader._load_local_corpus +.. automethod:: FlagEmbedding.abc.evaluation.AbsEvalDataLoader._load_local_qrels +.. automethod:: FlagEmbedding.abc.evaluation.AbsEvalDataLoader._load_local_queries +.. automethod:: FlagEmbedding.abc.evaluation.AbsEvalDataLoader._download_file +.. automethod:: FlagEmbedding.abc.evaluation.AbsEvalDataLoader._get_fpath_size +.. automethod:: FlagEmbedding.abc.evaluation.AbsEvalDataLoader._download_gz_file +.. automethod:: FlagEmbedding.abc.evaluation.AbsEvalDataLoader._download_zip_file \ No newline at end of file diff --git a/_sources/API/abc/evaluation/evaluator.rst.txt b/_sources/API/abc/evaluation/evaluator.rst.txt new file mode 100644 index 00000000..af3317ce --- /dev/null +++ b/_sources/API/abc/evaluation/evaluator.rst.txt @@ -0,0 +1,4 @@ +Evaluator +========= + +.. autoclass:: FlagEmbedding.abc.evaluation.AbsEvaluator \ No newline at end of file diff --git a/_sources/API/abc/evaluation/runner.rst.txt b/_sources/API/abc/evaluation/runner.rst.txt new file mode 100644 index 00000000..b38f0b59 --- /dev/null +++ b/_sources/API/abc/evaluation/runner.rst.txt @@ -0,0 +1,4 @@ +runner +====== + +.. 
autoclass:: FlagEmbedding.abc.evaluation.AbsEvalRunner \ No newline at end of file diff --git a/_sources/API/abc/evaluation/searcher.rst.txt b/_sources/API/abc/evaluation/searcher.rst.txt new file mode 100644 index 00000000..f713ca4d --- /dev/null +++ b/_sources/API/abc/evaluation/searcher.rst.txt @@ -0,0 +1,18 @@ +======== +searcher +======== + +EvalRetriever +============= + +.. autoclass:: FlagEmbedding.abc.evaluation.EvalRetriever + +EvalDenseRetriever +================== + +.. autoclass:: FlagEmbedding.abc.evaluation.EvalDenseRetriever + +EvalReranker +============ + +.. autoclass:: FlagEmbedding.abc.evaluation.EvalReranker \ No newline at end of file diff --git a/_sources/API/abc/finetune.rst.txt b/_sources/API/abc/finetune.rst.txt new file mode 100644 index 00000000..96d46c77 --- /dev/null +++ b/_sources/API/abc/finetune.rst.txt @@ -0,0 +1,6 @@ +Finetune +======== + +.. toctree:: + finetune/embedder + finetune/reranker \ No newline at end of file diff --git a/_sources/API/abc/finetune/embedder.rst.txt b/_sources/API/abc/finetune/embedder.rst.txt new file mode 100644 index 00000000..4948c6ff --- /dev/null +++ b/_sources/API/abc/finetune/embedder.rst.txt @@ -0,0 +1,9 @@ +Embedder +======== + +.. toctree:: + embedder/AbsArguments + embedder/AbsDataset + embedder/AbsModeling + embedder/AbsTrainer + embedder/AbsRunner \ No newline at end of file diff --git a/_sources/API/abc/finetune/embedder/AbsArguments.rst.txt b/_sources/API/abc/finetune/embedder/AbsArguments.rst.txt new file mode 100644 index 00000000..6c41ffd9 --- /dev/null +++ b/_sources/API/abc/finetune/embedder/AbsArguments.rst.txt @@ -0,0 +1,6 @@ +AbsArguments +============ + +.. autoclass:: FlagEmbedding.abc.finetune.reranker.AbsRerankerModelArguments + +.. autoclass:: FlagEmbedding.abc.finetune.reranker.AbsRerankerDataArguments diff --git a/_sources/API/abc/finetune/embedder/AbsDataset.rst.txt b/_sources/API/abc/finetune/embedder/AbsDataset.rst.txt new file mode 100644 index 00000000..9c61f763 --- /dev/null +++ b/_sources/API/abc/finetune/embedder/AbsDataset.rst.txt @@ -0,0 +1,53 @@ +========== +AbsDataset +========== + +AbsEmbedderTrainDataset +======================= + +.. autoclass:: FlagEmbedding.abc.finetune.embedder.AbsEmbedderTrainDataset + +Methods +------- + +.. automethod:: FlagEmbedding.abc.finetune.embedder.AbsEmbedderTrainDataset._load_dataset + +.. automethod:: FlagEmbedding.abc.finetune.embedder.AbsEmbedderTrainDataset._shuffle_text + +AbsEmbedderCollator +=================== + +.. autoclass:: FlagEmbedding.abc.finetune.embedder.AbsEmbedderCollator + +AbsEmbedderSameDatasetTrainDataset +================================== + +.. autoclass:: FlagEmbedding.abc.finetune.embedder.AbsEmbedderSameDatasetTrainDataset + +Methods +------- + +.. automethod:: FlagEmbedding.abc.finetune.embedder.AbsEmbedderSameDatasetTrainDataset.refresh_epoch + +.. automethod:: FlagEmbedding.abc.finetune.embedder.AbsEmbedderSameDatasetTrainDataset._load_dataset + +.. automethod:: FlagEmbedding.abc.finetune.embedder.AbsEmbedderSameDatasetTrainDataset._get_file_batch_size + +.. automethod:: FlagEmbedding.abc.finetune.embedder.AbsEmbedderSameDatasetTrainDataset._get_train_group_size + +.. automethod:: FlagEmbedding.abc.finetune.embedder.AbsEmbedderSameDatasetTrainDataset._create_batch_data + +AbsEmbedderSameDatasetCollator +============================== + +.. autoclass:: FlagEmbedding.abc.finetune.embedder.AbsEmbedderSameDatasetCollator + +EmbedderTrainerCallbackForDataRefresh +===================================== + +.. 
autoclass:: FlagEmbedding.abc.finetune.embedder.EmbedderTrainerCallbackForDataRefresh + +Methods +------- + +.. automethod:: FlagEmbedding.abc.finetune.embedder.EmbedderTrainerCallbackForDataRefresh.on_epoch_end \ No newline at end of file diff --git a/_sources/API/abc/finetune/embedder/AbsModeling.rst.txt b/_sources/API/abc/finetune/embedder/AbsModeling.rst.txt new file mode 100644 index 00000000..fa150ffa --- /dev/null +++ b/_sources/API/abc/finetune/embedder/AbsModeling.rst.txt @@ -0,0 +1,41 @@ +=========== +AbsModeling +=========== + +AbsEmbedderModel +================ + +.. autoclass:: FlagEmbedding.abc.finetune.embedder.AbsEmbedderModel + +Methods +------- + +.. automethod:: FlagEmbedding.abc.finetune.embedder.AbsEmbedderModel.encode + +.. automethod:: FlagEmbedding.abc.finetune.embedder.AbsEmbedderModel.compute_loss + +.. automethod:: FlagEmbedding.abc.finetune.embedder.AbsEmbedderModel.compute_score + +.. automethod:: FlagEmbedding.abc.finetune.embedder.AbsEmbedderModel.save + +.. automethod:: FlagEmbedding.abc.finetune.embedder.AbsEmbedderModel.get_local_score + +.. automethod:: FlagEmbedding.abc.finetune.embedder.AbsEmbedderModel.compute_local_score + +.. automethod:: FlagEmbedding.abc.finetune.embedder.AbsEmbedderModel.forward + +.. automethod:: FlagEmbedding.abc.finetune.embedder.AbsEmbedderModel.distill_loss + +.. automethod:: FlagEmbedding.abc.finetune.embedder.AbsEmbedderModel._compute_no_in_batch_neg_loss + +.. automethod:: FlagEmbedding.abc.finetune.embedder.AbsEmbedderModel._compute_in_batch_neg_loss + +.. automethod:: FlagEmbedding.abc.finetune.embedder.AbsEmbedderModel._compute_cross_device_neg_loss + +.. automethod:: FlagEmbedding.abc.finetune.embedder.AbsEmbedderModel._dist_gather_tensor + + +EmbedderOutput +============== + +.. autoclass:: FlagEmbedding.abc.finetune.embedder.EmbedderOutput \ No newline at end of file diff --git a/_sources/API/abc/finetune/embedder/AbsRunner.rst.txt b/_sources/API/abc/finetune/embedder/AbsRunner.rst.txt new file mode 100644 index 00000000..60a20fa4 --- /dev/null +++ b/_sources/API/abc/finetune/embedder/AbsRunner.rst.txt @@ -0,0 +1,21 @@ +========= +AbsRunner +========= + +AbsEmbedderTrainer +================== + +.. autoclass:: FlagEmbedding.abc.finetune.embedder.AbsEmbedderRunner + +Methods +------- + +.. automethod:: FlagEmbedding.abc.finetune.embedder.AbsEmbedderRunner.load_tokenizer_and_model + +.. automethod:: FlagEmbedding.abc.finetune.embedder.AbsEmbedderRunner.load_trainer + +.. automethod:: FlagEmbedding.abc.finetune.embedder.AbsEmbedderRunner.load_train_dataset + +.. automethod:: FlagEmbedding.abc.finetune.embedder.AbsEmbedderRunner.load_data_collator + +.. automethod:: FlagEmbedding.abc.finetune.embedder.AbsEmbedderRunner.run \ No newline at end of file diff --git a/_sources/API/abc/finetune/embedder/AbsTrainer.rst.txt b/_sources/API/abc/finetune/embedder/AbsTrainer.rst.txt new file mode 100644 index 00000000..f170c89b --- /dev/null +++ b/_sources/API/abc/finetune/embedder/AbsTrainer.rst.txt @@ -0,0 +1,13 @@ +========== +AbsTrainer +========== + +AbsEmbedderTrainer +================== + +.. autoclass:: FlagEmbedding.abc.finetune.embedder.AbsEmbedderTrainer + +Methods +------- + +.. 
automethod:: FlagEmbedding.abc.finetune.embedder.AbsEmbedderTrainer.compute_loss \ No newline at end of file diff --git a/_sources/API/abc/finetune/reranker.rst.txt b/_sources/API/abc/finetune/reranker.rst.txt new file mode 100644 index 00000000..e0579d3b --- /dev/null +++ b/_sources/API/abc/finetune/reranker.rst.txt @@ -0,0 +1,9 @@ +Reranker +======== + +.. toctree:: + reranker/AbsArguments + reranker/AbsDataset + reranker/AbsModeling + reranker/AbsTrainer + reranker/AbsRunner \ No newline at end of file diff --git a/_sources/API/abc/finetune/reranker/AbsArguments.rst.txt b/_sources/API/abc/finetune/reranker/AbsArguments.rst.txt new file mode 100644 index 00000000..4de1266b --- /dev/null +++ b/_sources/API/abc/finetune/reranker/AbsArguments.rst.txt @@ -0,0 +1,6 @@ +AbsArguments +============ + +.. autoclass:: FlagEmbedding.abc.finetune.embedder.AbsEmbedderModelArguments + +.. autoclass:: FlagEmbedding.abc.finetune.embedder.AbsEmbedderDataArguments diff --git a/_sources/API/abc/finetune/reranker/AbsDataset.rst.txt b/_sources/API/abc/finetune/reranker/AbsDataset.rst.txt new file mode 100644 index 00000000..5b57d2e5 --- /dev/null +++ b/_sources/API/abc/finetune/reranker/AbsDataset.rst.txt @@ -0,0 +1,32 @@ +========== +AbsDataset +========== + +AbsRerankerTrainDataset +======================= + +.. autoclass:: FlagEmbedding.abc.finetune.reranker.AbsRerankerTrainDataset + +Methods +------- + +.. automethod:: FlagEmbedding.abc.finetune.reranker.AbsRerankerTrainDataset.create_one_example + +.. automethod:: FlagEmbedding.abc.finetune.reranker.AbsRerankerTrainDataset._load_dataset + +.. automethod:: FlagEmbedding.abc.finetune.reranker.AbsRerankerTrainDataset._shuffle_text + +AbsRerankerCollator +=================== + +.. autoclass:: FlagEmbedding.abc.finetune.reranker.AbsRerankerCollator + +AbsLLMRerankerTrainDataset +========================== + +.. autoclass:: FlagEmbedding.abc.finetune.reranker.AbsLLMRerankerTrainDataset + +AbsLLMRerankerCollator +====================== + +.. autoclass:: FlagEmbedding.abc.finetune.reranker.AbsLLMRerankerCollator diff --git a/_sources/API/abc/finetune/reranker/AbsModeling.rst.txt b/_sources/API/abc/finetune/reranker/AbsModeling.rst.txt new file mode 100644 index 00000000..36f1ab29 --- /dev/null +++ b/_sources/API/abc/finetune/reranker/AbsModeling.rst.txt @@ -0,0 +1,31 @@ +=========== +AbsModeling +=========== + +AbsRerankerModel +================ + +.. autoclass:: FlagEmbedding.abc.finetune.reranker.AbsRerankerModel + +Methods +------- + +.. automethod:: FlagEmbedding.abc.finetune.reranker.AbsRerankerModel.encode + +.. automethod:: FlagEmbedding.abc.finetune.reranker.AbsRerankerModel.gradient_checkpointing_enable + +.. automethod:: FlagEmbedding.abc.finetune.reranker.AbsRerankerModel.enable_input_require_grads + +.. automethod:: FlagEmbedding.abc.finetune.reranker.AbsRerankerModel.forward + +.. automethod:: FlagEmbedding.abc.finetune.reranker.AbsRerankerModel.compute_loss + +.. automethod:: FlagEmbedding.abc.finetune.reranker.AbsRerankerModel.save + +.. automethod:: FlagEmbedding.abc.finetune.reranker.AbsRerankerModel.save_pretrained + + +RerankerOutput +============== + +.. 
autoclass:: FlagEmbedding.abc.finetune.reranker.RerankerOutput \ No newline at end of file diff --git a/_sources/API/abc/finetune/reranker/AbsRunner.rst.txt b/_sources/API/abc/finetune/reranker/AbsRunner.rst.txt new file mode 100644 index 00000000..962db7d3 --- /dev/null +++ b/_sources/API/abc/finetune/reranker/AbsRunner.rst.txt @@ -0,0 +1,21 @@ +========= +AbsRunner +========= + +AbsRerankerTrainer +================== + +.. autoclass:: FlagEmbedding.abc.finetune.reranker.AbsRerankerRunner + +Methods +------- + +.. automethod:: FlagEmbedding.abc.finetune.reranker.AbsRerankerRunner.load_tokenizer_and_model + +.. automethod:: FlagEmbedding.abc.finetune.reranker.AbsRerankerRunner.load_trainer + +.. automethod:: FlagEmbedding.abc.finetune.reranker.AbsRerankerRunner.load_train_dataset + +.. automethod:: FlagEmbedding.abc.finetune.reranker.AbsRerankerRunner.load_data_collator + +.. automethod:: FlagEmbedding.abc.finetune.reranker.AbsRerankerRunner.run \ No newline at end of file diff --git a/_sources/API/abc/finetune/reranker/AbsTrainer.rst.txt b/_sources/API/abc/finetune/reranker/AbsTrainer.rst.txt new file mode 100644 index 00000000..a0aa757f --- /dev/null +++ b/_sources/API/abc/finetune/reranker/AbsTrainer.rst.txt @@ -0,0 +1,13 @@ +========== +AbsTrainer +========== + +AbsRerankerTrainer +================== + +.. autoclass:: FlagEmbedding.abc.finetune.reranker.AbsRerankerTrainer + +Methods +------- + +.. automethod:: FlagEmbedding.abc.finetune.reranker.AbsRerankerTrainer.compute_loss \ No newline at end of file diff --git a/_sources/API/abc/inference.rst.txt b/_sources/API/abc/inference.rst.txt new file mode 100644 index 00000000..e118b20c --- /dev/null +++ b/_sources/API/abc/inference.rst.txt @@ -0,0 +1,6 @@ +Inference +========= + +.. toctree:: + inference/AbsEmbedder + inference/AbsReranker \ No newline at end of file diff --git a/_sources/API/abc/inference/AbsEmbedder.rst.txt b/_sources/API/abc/inference/AbsEmbedder.rst.txt new file mode 100644 index 00000000..2068f87e --- /dev/null +++ b/_sources/API/abc/inference/AbsEmbedder.rst.txt @@ -0,0 +1,29 @@ +AbsEmbedder +=========== + +.. autoclass:: FlagEmbedding.abc.inference.AbsEmbedder + +Methods +------- + +.. automethod:: FlagEmbedding.abc.inference.AbsEmbedder.get_target_devices + +.. automethod:: FlagEmbedding.abc.inference.AbsEmbedder.get_detailed_instruct + +.. automethod:: FlagEmbedding.abc.inference.AbsEmbedder.encode_queries + +.. automethod:: FlagEmbedding.abc.inference.AbsEmbedder.encode_corpus + +.. automethod:: FlagEmbedding.abc.inference.AbsEmbedder.encode + +.. automethod:: FlagEmbedding.abc.inference.AbsEmbedder.encode_single_device + +.. automethod:: FlagEmbedding.abc.inference.AbsEmbedder.start_multi_process_pool + +.. automethod:: FlagEmbedding.abc.inference.AbsEmbedder._encode_multi_process_worker + +.. automethod:: FlagEmbedding.abc.inference.AbsEmbedder.stop_multi_process_pool + +.. automethod:: FlagEmbedding.abc.inference.AbsEmbedder.encode_multi_process + +.. automethod:: FlagEmbedding.abc.inference.AbsEmbedder._concatenate_results_from_multi_process \ No newline at end of file diff --git a/_sources/API/abc/inference/AbsReranker.rst.txt b/_sources/API/abc/inference/AbsReranker.rst.txt new file mode 100644 index 00000000..6ab0b50b --- /dev/null +++ b/_sources/API/abc/inference/AbsReranker.rst.txt @@ -0,0 +1,25 @@ +AbsReranker +=========== + +.. autoclass:: FlagEmbedding.abc.inference.AbsReranker + +Methods +------- + +.. automethod:: FlagEmbedding.abc.inference.AbsReranker.get_target_devices + +.. 
automethod:: FlagEmbedding.abc.inference.AbsReranker.get_detailed_instruct + +.. automethod:: FlagEmbedding.abc.inference.AbsReranker.get_detailed_inputs + +.. automethod:: FlagEmbedding.abc.inference.AbsReranker.compute_score + +.. automethod:: FlagEmbedding.abc.inference.AbsReranker.compute_score_single_gpu + +.. automethod:: FlagEmbedding.abc.inference.AbsReranker.start_multi_process_pool + +.. automethod:: FlagEmbedding.abc.inference.AbsReranker.encode_multi_process + +.. automethod:: FlagEmbedding.abc.inference.AbsReranker._encode_multi_process_worker + +.. automethod:: FlagEmbedding.abc.inference.AbsReranker.stop_multi_process_pool \ No newline at end of file diff --git a/_sources/API/evaluation.rst.txt b/_sources/API/evaluation.rst.txt new file mode 100644 index 00000000..d7a68c38 --- /dev/null +++ b/_sources/API/evaluation.rst.txt @@ -0,0 +1,11 @@ +Evaluation +========== + +.. toctree:: + evaluation/mteb + evaluation/airbench + evaluation/msmarco + evaluation/beir + evaluation/miracl + evaluation/mkqa + evaluation/mldr \ No newline at end of file diff --git a/_sources/API/evaluation/airbench.rst.txt b/_sources/API/evaluation/airbench.rst.txt new file mode 100644 index 00000000..6080554f --- /dev/null +++ b/_sources/API/evaluation/airbench.rst.txt @@ -0,0 +1,42 @@ +AIR-Bench +========= + +`AIR-Bench `_ (Automated heterogeneous Information Retrieval Benchmark) is a dynamic (actively being updated) benchmark for information retrieval. +Now the benchmark contains two versions. Notice that the testing data is generated by LLMs with out human intervention. +This helps the evaluation of new domains easier and faster to be updated. It also makes it impossible for any models to have the test data covered in their training sets. + +You can evaluate model's performance on AIR-Bench by running our provided shell script: + +.. code:: bash + + chmod +x /examples/evaluation/air_bench/eval_air_bench.sh + ./examples/evaluation/air_bench/eval_air_bench.sh + +Or by running: + +.. code:: bash + + python -m FlagEmbedding.evaluation.air_bench \ + --benchmark_version AIR-Bench_24.05 \ + --task_types qa long-doc \ + --domains arxiv \ + --languages en \ + --splits dev test \ + --output_dir ./air_bench/search_results \ + --search_top_k 1000 \ + --rerank_top_k 100 \ + --cache_dir /root/.cache/huggingface/hub \ + --overwrite False \ + --embedder_name_or_path BAAI/bge-m3 \ + --reranker_name_or_path BAAI/bge-reranker-v2-m3 \ + --devices cuda:0 cuda:1 \ + --model_cache_dir /root/.cache/huggingface/hub \ + --reranker_max_length 1024 + +change the embedder, reranker, devices and cache directory to your preference. + +.. toctree:: + :hidden: + + airbench/arguments + airbench/runner \ No newline at end of file diff --git a/_sources/API/evaluation/airbench/arguments.rst.txt b/_sources/API/evaluation/airbench/arguments.rst.txt new file mode 100644 index 00000000..48c2d61c --- /dev/null +++ b/_sources/API/evaluation/airbench/arguments.rst.txt @@ -0,0 +1,4 @@ +arguments +========= + +.. autoclass:: FlagEmbedding.evaluation.air_bench.AIRBenchEvalModelArgs \ No newline at end of file diff --git a/_sources/API/evaluation/airbench/runner.rst.txt b/_sources/API/evaluation/airbench/runner.rst.txt new file mode 100644 index 00000000..0f3b36d8 --- /dev/null +++ b/_sources/API/evaluation/airbench/runner.rst.txt @@ -0,0 +1,4 @@ +runner +====== + +.. 
autoclass:: FlagEmbedding.evaluation.air_bench.AIRBenchEvalRunner \ No newline at end of file diff --git a/_sources/API/evaluation/beir.rst.txt b/_sources/API/evaluation/beir.rst.txt new file mode 100644 index 00000000..ba584ca1 --- /dev/null +++ b/_sources/API/evaluation/beir.rst.txt @@ -0,0 +1,47 @@ +BEIR +==== + +`BEIR `_ (Benchmarking-IR) is a heterogeneous evaluation benchmark for information retrieval. +It is designed for evaluating the performance of NLP-based retrieval models and widely used by research of modern embedding models. + +You can evaluate model's performance on the BEIR benchmark by running our provided shell script: + +.. code:: bash + + chmod +x /examples/evaluation/beir/eval_beir.sh + ./examples/evaluation/beir/eval_beir.sh + +Or by running: + +.. code:: bash + + python -m FlagEmbedding.evaluation.beir \ + --eval_name beir \ + --dataset_dir ./beir/data \ + --dataset_names fiqa arguana cqadupstack \ + --splits test dev \ + --corpus_embd_save_dir ./beir/corpus_embd \ + --output_dir ./beir/search_results \ + --search_top_k 1000 \ + --rerank_top_k 100 \ + --cache_path /root/.cache/huggingface/hub \ + --overwrite False \ + --k_values 10 100 \ + --eval_output_method markdown \ + --eval_output_path ./beir/beir_eval_results.md \ + --eval_metrics ndcg_at_10 recall_at_100 \ + --ignore_identical_ids True \ + --embedder_name_or_path BAAI/bge-large-en-v1.5 \ + --reranker_name_or_path BAAI/bge-reranker-v2-m3 \ + --devices cuda:0 cuda:1 \ + --reranker_max_length 1024 \ + +change the embedder, devices and cache directory to your preference. + +.. toctree:: + :hidden: + + beir/arguments + beir/data_loader + beir/evaluator + beir/runner \ No newline at end of file diff --git a/_sources/API/evaluation/beir/arguments.rst.txt b/_sources/API/evaluation/beir/arguments.rst.txt new file mode 100644 index 00000000..71593837 --- /dev/null +++ b/_sources/API/evaluation/beir/arguments.rst.txt @@ -0,0 +1,4 @@ +arguments +========= + +.. autoclass:: FlagEmbedding.evaluation.bier.BEIREvalArgs \ No newline at end of file diff --git a/_sources/API/evaluation/beir/data_loader.rst.txt b/_sources/API/evaluation/beir/data_loader.rst.txt new file mode 100644 index 00000000..de224fa1 --- /dev/null +++ b/_sources/API/evaluation/beir/data_loader.rst.txt @@ -0,0 +1,4 @@ +data loader +=========== + +.. autoclass:: FlagEmbedding.abc.evaluation.BEIREvalDataLoader \ No newline at end of file diff --git a/_sources/API/evaluation/beir/evaluator.rst.txt b/_sources/API/evaluation/beir/evaluator.rst.txt new file mode 100644 index 00000000..cc752f6a --- /dev/null +++ b/_sources/API/evaluation/beir/evaluator.rst.txt @@ -0,0 +1,4 @@ +evaluator +========= + +.. autoclass:: FlagEmbedding.evaluation.beir.BEIREvaluator \ No newline at end of file diff --git a/_sources/API/evaluation/beir/runner.rst.txt b/_sources/API/evaluation/beir/runner.rst.txt new file mode 100644 index 00000000..e2866159 --- /dev/null +++ b/_sources/API/evaluation/beir/runner.rst.txt @@ -0,0 +1,4 @@ +runner +====== + +.. autoclass:: FlagEmbedding.evaluation.beir.BEIREvalRunner \ No newline at end of file diff --git a/_sources/API/evaluation/miracl.rst.txt b/_sources/API/evaluation/miracl.rst.txt new file mode 100644 index 00000000..132bcf7c --- /dev/null +++ b/_sources/API/evaluation/miracl.rst.txt @@ -0,0 +1,48 @@ +MIRACL +====== + +`MIRACL `_ (Multilingual Information Retrieval Across a Continuum of Languages) +is an WSDM 2023 Cup challenge that focuses on search across 18 different languages. 
+They release a multilingual retrieval dataset containing the train and dev set for 16 "known languages" and only dev set for 2 "surprise languages". +The topics are generated by native speakers of each language, who also label the relevance between the topics and a given document list. +You can found the `dataset `_ on HuggingFace. + +You can evaluate model's performance on MIRACL simply by running our provided shell script: + +.. code:: bash + + chmod +x /examples/evaluation/miracl/eval_miracl.sh + ./examples/evaluation/miracl/eval_miracl.sh + +Or by running: + +.. code:: bash + + python -m FlagEmbedding.evaluation.miracl \ + --eval_name miracl \ + --dataset_dir ./miracl/data \ + --dataset_names bn hi sw te th yo \ + --splits dev \ + --corpus_embd_save_dir ./miracl/corpus_embd \ + --output_dir ./miracl/search_results \ + --search_top_k 1000 \ + --rerank_top_k 100 \ + --cache_path /root/.cache/huggingface/hub \ + --overwrite False \ + --k_values 10 100 \ + --eval_output_method markdown \ + --eval_output_path ./miracl/miracl_eval_results.md \ + --eval_metrics ndcg_at_10 recall_at_100 \ + --embedder_name_or_path BAAI/bge-m3 \ + --reranker_name_or_path BAAI/bge-reranker-v2-m3 \ + --devices cuda:0 cuda:1 \ + --cache_dir /root/.cache/huggingface/hub \ + --reranker_max_length 1024 + +change the embedder, reranker, devices and cache directory to your preference. + +.. toctree:: + :hidden: + + miracl/data_loader + miracl/runner \ No newline at end of file diff --git a/_sources/API/evaluation/miracl/data_loader.rst.txt b/_sources/API/evaluation/miracl/data_loader.rst.txt new file mode 100644 index 00000000..7dbcfced --- /dev/null +++ b/_sources/API/evaluation/miracl/data_loader.rst.txt @@ -0,0 +1,13 @@ +data_loader +=========== + +.. autoclass:: FlagEmbedding.evaluation.miracl.MIRACLEvalDataLoader + +Methods +------- + +.. automethod:: FlagEmbedding.evaluation.miracl.MIRACLEvalDataLoader.available_dataset_names +.. automethod:: FlagEmbedding.evaluation.miracl.MIRACLEvalDataLoader.available_splits +.. automethod:: FlagEmbedding.evaluation.miracl.MIRACLEvalDataLoader._load_remote_corpus +.. automethod:: FlagEmbedding.evaluation.miracl.MIRACLEvalDataLoader._load_remote_qrels +.. automethod:: FlagEmbedding.evaluation.miracl.MIRACLEvalDataLoader._load_remote_queries \ No newline at end of file diff --git a/_sources/API/evaluation/miracl/runner.rst.txt b/_sources/API/evaluation/miracl/runner.rst.txt new file mode 100644 index 00000000..b77da139 --- /dev/null +++ b/_sources/API/evaluation/miracl/runner.rst.txt @@ -0,0 +1,5 @@ +runner +====== + +.. autoclass:: FlagEmbedding.evaluation.miracl.MIRACLEvalRunner + :members: \ No newline at end of file diff --git a/_sources/API/evaluation/mkqa.rst.txt b/_sources/API/evaluation/mkqa.rst.txt new file mode 100644 index 00000000..283eac22 --- /dev/null +++ b/_sources/API/evaluation/mkqa.rst.txt @@ -0,0 +1,89 @@ +MKQA +==== + +`MKQA `_ is an open-domain question answering evaluation set comprising 10k question-answer pairs aligned across 26 typologically diverse languages. +The queries are sampled from the [Google Natural Questions Dataset](https://github.com/google-research-datasets/natural-questions). + +Each example in the dataset has the following structure: + +.. 
code:: bash + + { + 'example_id': 563260143484355911, + 'queries': { + 'en': "who sings i hear you knocking but you can't come in", + 'ru': "кто поет i hear you knocking but you can't come in", + 'ja': '「 I hear you knocking」は誰が歌っていますか', + 'zh_cn': "《i hear you knocking but you can't come in》是谁演唱的", + ... + }, + 'query': "who sings i hear you knocking but you can't come in", + 'answers': { + 'en': [{ + 'type': 'entity', + 'entity': 'Q545186', + 'text': 'Dave Edmunds', + 'aliases': [], + }], + 'ru': [{ + 'type': 'entity', + 'entity': 'Q545186', + 'text': 'Эдмундс, Дэйв', + 'aliases': ['Эдмундс', 'Дэйв Эдмундс', 'Эдмундс Дэйв', 'Dave Edmunds'], + }], + 'ja': [{ + 'type': 'entity', + 'entity': 'Q545186', + 'text': 'デイヴ・エドモンズ', + 'aliases': ['デーブ・エドモンズ', 'デイブ・エドモンズ'], + }], + 'zh_cn': [{ + 'type': 'entity', + 'text': '戴维·埃德蒙兹 ', + 'entity': 'Q545186', + }], + ... + }, + } + + +You can evaluate model's performance on MKQA simply by running our provided shell script: + +.. code:: bash + + chmod +x /examples/evaluation/mkqa/eval_mkqa.sh + ./examples/evaluation/mkqa/eval_mkqa.sh + +Or by running: + +.. code:: bash + + python -m FlagEmbedding.evaluation.mkqa \ + --eval_name mkqa \ + --dataset_dir ./mkqa/data \ + --dataset_names en zh_cn \ + --splits test \ + --corpus_embd_save_dir ./mkqa/corpus_embd \ + --output_dir ./mkqa/search_results \ + --search_top_k 1000 \ + --rerank_top_k 100 \ + --cache_path /root/.cache/huggingface/hub \ + --overwrite False \ + --k_values 20 \ + --eval_output_method markdown \ + --eval_output_path ./mkqa/mkqa_eval_results.md \ + --eval_metrics qa_recall_at_20 \ + --embedder_name_or_path BAAI/bge-m3 \ + --reranker_name_or_path BAAI/bge-reranker-v2-m3 \ + --devices cuda:0 cuda:1 \ + --cache_dir /root/.cache/huggingface/hub \ + --reranker_max_length 1024 + +change the embedder, reranker, devices and cache directory to your preference. + +.. toctree:: + :hidden: + + mkqa/data_loader + mkqa/evaluator + mkqa/runner \ No newline at end of file diff --git a/_sources/API/evaluation/mkqa/data_loader.rst.txt b/_sources/API/evaluation/mkqa/data_loader.rst.txt new file mode 100644 index 00000000..94c62b22 --- /dev/null +++ b/_sources/API/evaluation/mkqa/data_loader.rst.txt @@ -0,0 +1,15 @@ +data_loader +=========== + +.. autoclass:: FlagEmbedding.evaluation.mkqa.MKQAEvalDataLoader + +Methods +------- + +.. automethod:: FlagEmbedding.evaluation.mkqa.MKQAEvalDataLoader.available_dataset_names +.. automethod:: FlagEmbedding.evaluation.mkqa.MKQAEvalDataLoader.available_splits +.. automethod:: FlagEmbedding.evaluation.mkqa.MKQAEvalDataLoader.load_corpus +.. automethod:: FlagEmbedding.evaluation.mkqa.MKQAEvalDataLoader._load_local_qrels +.. automethod:: FlagEmbedding.evaluation.mkqa.MKQAEvalDataLoader._load_remote_corpus +.. automethod:: FlagEmbedding.evaluation.mkqa.MKQAEvalDataLoader._load_remote_qrels +.. automethod:: FlagEmbedding.evaluation.mkqa.MKQAEvalDataLoader._load_remote_queries \ No newline at end of file diff --git a/_sources/API/evaluation/mkqa/evaluator.rst.txt b/_sources/API/evaluation/mkqa/evaluator.rst.txt new file mode 100644 index 00000000..c46fc2f9 --- /dev/null +++ b/_sources/API/evaluation/mkqa/evaluator.rst.txt @@ -0,0 +1,5 @@ +evaluator +========= + +.. 
autoclass:: FlagEmbedding.evaluation.mkqa.MKQAEvaluator + :members: \ No newline at end of file diff --git a/_sources/API/evaluation/mkqa/runner.rst.txt b/_sources/API/evaluation/mkqa/runner.rst.txt new file mode 100644 index 00000000..bddedfcb --- /dev/null +++ b/_sources/API/evaluation/mkqa/runner.rst.txt @@ -0,0 +1,4 @@ +runner +====== +.. autoclass:: FlagEmbedding.evaluation.mkqa.MKQAEvalRunner + :members: \ No newline at end of file diff --git a/_sources/API/evaluation/mldr.rst.txt b/_sources/API/evaluation/mldr.rst.txt new file mode 100644 index 00000000..7865536c --- /dev/null +++ b/_sources/API/evaluation/mldr.rst.txt @@ -0,0 +1,95 @@ +MLDR +==== + +`MLDR `_ is a Multilingual Long-Document Retrieval dataset built on Wikipeida, Wudao and mC4, covering 13 typologically diverse languages. +Specifically, we sample lengthy articles from Wikipedia, Wudao and mC4 datasets and randomly choose paragraphs from them. +Then we use GPT-3.5 to generate questions based on these paragraphs. +The generated question and the sampled article constitute a new text pair to the dataset. + +An example of ``train`` set looks like: + +.. code:: bash + + { + 'query_id': 'q-zh-<...>', + 'query': '...', + 'positive_passages': [ + { + 'docid': 'doc-zh-<...>', + 'text': '...' + } + ], + 'negative_passages': [ + { + 'docid': 'doc-zh-<...>', + 'text': '...' + }, + ... + ] + } + +An example of ``dev`` and ``test`` set looks like: + +.. code:: bash + + { + 'query_id': 'q-zh-<...>', + 'query': '...', + 'positive_passages': [ + { + 'docid': 'doc-zh-<...>', + 'text': '...' + } + ], + 'negative_passages': [] + } + +An example of ``corpus`` looks like: + +.. code:: bash + + { + 'docid': 'doc-zh-<...>', + 'text': '...' + } + +You can evaluate model's performance on MLDR simply by running our provided shell script: + +.. code:: bash + + chmod +x /examples/evaluation/mldr/eval_mldr.sh + ./examples/evaluation/mldr/eval_mldr.sh + +Or by running: + +.. code:: bash + + python -m FlagEmbedding.evaluation.mldr \ + --eval_name mldr \ + --dataset_dir ./mldr/data \ + --dataset_names hi \ + --splits test \ + --corpus_embd_save_dir ./mldr/corpus_embd \ + --output_dir ./mldr/search_results \ + --search_top_k 1000 \ + --rerank_top_k 100 \ + --cache_path /root/.cache/huggingface/hub \ + --overwrite False \ + --k_values 10 100 \ + --eval_output_method markdown \ + --eval_output_path ./mldr/mldr_eval_results.md \ + --eval_metrics ndcg_at_10 \ + --embedder_name_or_path BAAI/bge-m3 \ + --reranker_name_or_path BAAI/bge-reranker-v2-m3 \ + --devices cuda:0 cuda:1 \ + --cache_dir /root/.cache/huggingface/hub \ + --embedder_passage_max_length 8192 \ + --reranker_max_length 8192 + +change the args of embedder, reranker, devices and cache directory to your preference. + +.. toctree:: + :hidden: + + mldr/data_loader + mldr/runner \ No newline at end of file diff --git a/_sources/API/evaluation/mldr/data_loader.rst.txt b/_sources/API/evaluation/mldr/data_loader.rst.txt new file mode 100644 index 00000000..f0fe313e --- /dev/null +++ b/_sources/API/evaluation/mldr/data_loader.rst.txt @@ -0,0 +1,13 @@ +data_loader +=========== + +.. autoclass:: FlagEmbedding.evaluation.mldr.MLDREvalDataLoader + +Methods +------- + +.. automethod:: FlagEmbedding.evaluation.mldr.MLDREvalDataLoader.available_dataset_names +.. automethod:: FlagEmbedding.evaluation.mldr.MLDREvalDataLoader.available_splits +.. automethod:: FlagEmbedding.evaluation.mldr.MLDREvalDataLoader._load_remote_corpus +.. 
automethod:: FlagEmbedding.evaluation.mldr.MLDREvalDataLoader._load_remote_qrels +.. automethod:: FlagEmbedding.evaluation.mldr.MLDREvalDataLoader._load_remote_queries \ No newline at end of file diff --git a/_sources/API/evaluation/mldr/runner.rst.txt b/_sources/API/evaluation/mldr/runner.rst.txt new file mode 100644 index 00000000..d6c1ee6d --- /dev/null +++ b/_sources/API/evaluation/mldr/runner.rst.txt @@ -0,0 +1,5 @@ +runner +====== + +.. autoclass:: FlagEmbedding.evaluation.mldr.MLDREvalRunner + :members: \ No newline at end of file diff --git a/_sources/API/evaluation/msmarco.rst.txt b/_sources/API/evaluation/msmarco.rst.txt new file mode 100644 index 00000000..db8ff200 --- /dev/null +++ b/_sources/API/evaluation/msmarco.rst.txt @@ -0,0 +1,46 @@ +MSMARCO +======= + +`MS Marco `_ (Microsoft MAchine Reading Comprehension) is a large scale real-world reading comprehension dataset. +It is widely used in information retrieval, question answering, and natural language processing research. + + +You can evaluate model's performance on MS MARCO simply by running our provided shell script: + +.. code:: bash + + chmod +x /examples/evaluation/msmarco/eval_msmarco.sh + ./examples/evaluation/msmarco/eval_msmarco.sh + +Or by running: + +.. code:: bash + + python -m FlagEmbedding.evaluation.msmarco \ + --eval_name msmarco \ + --dataset_dir ./msmarco/data \ + --dataset_names passage \ + --splits dev \ + --corpus_embd_save_dir ./msmarco/corpus_embd \ + --output_dir ./msmarco/search_results \ + --search_top_k 1000 \ + --rerank_top_k 100 \ + --cache_path /root/.cache/huggingface/hub \ + --overwrite True \ + --k_values 10 100 \ + --eval_output_method markdown \ + --eval_output_path ./msmarco/msmarco_eval_results.md \ + --eval_metrics ndcg_at_10 recall_at_100 \ + --embedder_name_or_path BAAI/bge-large-en-v1.5 \ + --reranker_name_or_path BAAI/bge-reranker-v2-m3 \ + --devices cuda:0 cuda:1 cuda:2 cuda:3 cuda:4 cuda:5 cuda:6 cuda:7 \ + --cache_dir /root/.cache/huggingface/hub \ + --reranker_max_length 1024 + +change the embedder, reranker, devices and cache directory to your preference. + +.. toctree:: + :hidden: + + msmarco/data_loader + msmarco/runner \ No newline at end of file diff --git a/_sources/API/evaluation/msmarco/data_loader.rst.txt b/_sources/API/evaluation/msmarco/data_loader.rst.txt new file mode 100644 index 00000000..f886eda5 --- /dev/null +++ b/_sources/API/evaluation/msmarco/data_loader.rst.txt @@ -0,0 +1,13 @@ +data_loader +=========== + +.. autoclass:: FlagEmbedding.evaluation.msmarco.MSMARCOEvalDataLoader + +Methods +------- + +.. automethod:: FlagEmbedding.evaluation.msmarco.MSMARCOEvalDataLoader.available_dataset_names +.. automethod:: FlagEmbedding.evaluation.msmarco.MSMARCOEvalDataLoader.available_splits +.. automethod:: FlagEmbedding.evaluation.msmarco.MSMARCOEvalDataLoader._load_remote_corpus +.. automethod:: FlagEmbedding.evaluation.msmarco.MSMARCOEvalDataLoader._load_remote_qrels +.. automethod:: FlagEmbedding.evaluation.msmarco.MSMARCOEvalDataLoader._load_remote_queries \ No newline at end of file diff --git a/_sources/API/evaluation/msmarco/runner.rst.txt b/_sources/API/evaluation/msmarco/runner.rst.txt new file mode 100644 index 00000000..ae56a455 --- /dev/null +++ b/_sources/API/evaluation/msmarco/runner.rst.txt @@ -0,0 +1,5 @@ +runner +====== + +.. 
autoclass:: FlagEmbedding.evaluation.msmarco.MSMARCOEvalRunner + :members: \ No newline at end of file diff --git a/_sources/API/evaluation/mteb.rst.txt b/_sources/API/evaluation/mteb.rst.txt new file mode 100644 index 00000000..044b5e68 --- /dev/null +++ b/_sources/API/evaluation/mteb.rst.txt @@ -0,0 +1,37 @@ +MTEB +==== + +`MTEB `_ (The Massive Text Embedding Benchmark) is a large-scale evaluation framework designed to assess the performance of text embedding models across a wide variety of NLP tasks. +Introduced to standardize and improve the evaluation of text embeddings, MTEB is crucial for assessing how well these models generalize across various real-world applications. +It contains a wide range of datasets in eight main NLP tasks and different languages, and provides an easy pipeline for evaluation. +It also holds the well known MTEB `leaderboard `_, which contains a ranking of the latest first-class embedding models. + +You can evaluate model's performance on the whole MTEB benchmark by running our provided shell script: + +.. code:: bash + + chmod +x /examples/evaluation/mteb/eval_mteb.sh + ./examples/evaluation/mteb/eval_mteb.sh + +Or by running: + +.. code:: bash + + python -m FlagEmbedding.evaluation.mteb \ + --eval_name mteb \ + --output_dir ./mteb/search_results \ + --languages eng \ + --tasks NFCorpus BiorxivClusteringS2S SciDocsRR \ + --eval_output_path ./mteb/mteb_eval_results.json \ + --embedder_name_or_path BAAI/bge-large-en-v1.5 \ + --devices cuda:7 \ + --cache_dir /root/.cache/huggingface/hub + +change the embedder, devices and cache directory to your preference. + +.. toctree:: + :hidden: + + mteb/arguments + mteb/searcher + mteb/runner \ No newline at end of file diff --git a/_sources/API/evaluation/mteb/arguments.rst.txt b/_sources/API/evaluation/mteb/arguments.rst.txt new file mode 100644 index 00000000..b07f3a97 --- /dev/null +++ b/_sources/API/evaluation/mteb/arguments.rst.txt @@ -0,0 +1,4 @@ +arguments +========= + +.. autoclass:: FlagEmbedding.evaluation.mteb.MTEBEvalArgs \ No newline at end of file diff --git a/_sources/API/evaluation/mteb/runner.rst.txt b/_sources/API/evaluation/mteb/runner.rst.txt new file mode 100644 index 00000000..495a929c --- /dev/null +++ b/_sources/API/evaluation/mteb/runner.rst.txt @@ -0,0 +1,4 @@ +runner +====== + +.. autoclass:: FlagEmbedding.evaluation.mteb.MTEBEvalRunner \ No newline at end of file diff --git a/_sources/API/evaluation/mteb/searcher.rst.txt b/_sources/API/evaluation/mteb/searcher.rst.txt new file mode 100644 index 00000000..f51873a3 --- /dev/null +++ b/_sources/API/evaluation/mteb/searcher.rst.txt @@ -0,0 +1,6 @@ +searcher +======== + +.. autoclass:: FlagEmbedding.evaluation.mteb.MTEBEvalDenseRetriever + +.. autoclass:: FlagEmbedding.evaluation.mteb.MTEBEvalReranker \ No newline at end of file diff --git a/_sources/API/finetune.rst.txt b/_sources/API/finetune.rst.txt new file mode 100644 index 00000000..96d46c77 --- /dev/null +++ b/_sources/API/finetune.rst.txt @@ -0,0 +1,6 @@ +Finetune +======== + +.. toctree:: + finetune/embedder + finetune/reranker \ No newline at end of file diff --git a/_sources/API/finetune/embedder.rst.txt b/_sources/API/finetune/embedder.rst.txt new file mode 100644 index 00000000..a2a64972 --- /dev/null +++ b/_sources/API/finetune/embedder.rst.txt @@ -0,0 +1,6 @@ +Embedder +======== + +.. 
toctree:: + embedder/encoder_only + embedder/decoder_only \ No newline at end of file diff --git a/_sources/API/finetune/embedder/decoder_only.rst.txt b/_sources/API/finetune/embedder/decoder_only.rst.txt new file mode 100644 index 00000000..29dbc50a --- /dev/null +++ b/_sources/API/finetune/embedder/decoder_only.rst.txt @@ -0,0 +1,6 @@ +Decoder Only +============ + +.. toctree:: + decoder_only/base + decoder_only/icl \ No newline at end of file diff --git a/_sources/API/finetune/embedder/decoder_only/base.rst.txt b/_sources/API/finetune/embedder/decoder_only/base.rst.txt new file mode 100644 index 00000000..840078b7 --- /dev/null +++ b/_sources/API/finetune/embedder/decoder_only/base.rst.txt @@ -0,0 +1,8 @@ +Base +==== + +.. toctree:: + base/arguments + base/modeling + base/runner + base/trainer \ No newline at end of file diff --git a/_sources/API/finetune/embedder/decoder_only/base/arguments.rst.txt b/_sources/API/finetune/embedder/decoder_only/base/arguments.rst.txt new file mode 100644 index 00000000..f224e1a9 --- /dev/null +++ b/_sources/API/finetune/embedder/decoder_only/base/arguments.rst.txt @@ -0,0 +1,4 @@ +Arguments +========= + +.. autoclass:: FlagEmbedding.finetune.embedder.decoder_only.base.DecoderOnlyEmbedderModelArguments diff --git a/_sources/API/finetune/embedder/decoder_only/base/modeling.rst.txt b/_sources/API/finetune/embedder/decoder_only/base/modeling.rst.txt new file mode 100644 index 00000000..9b02895b --- /dev/null +++ b/_sources/API/finetune/embedder/decoder_only/base/modeling.rst.txt @@ -0,0 +1,10 @@ +======== +Modeling +======== + +.. autoclass:: FlagEmbedding.finetune.embedder.decoder_only.base.BiDecoderOnlyEmbedderModel + +Methods +======= + +.. automethod:: FlagEmbedding.finetune.embedder.decoder_only.base.BiDecoderOnlyEmbedderModel.encode diff --git a/_sources/API/finetune/embedder/decoder_only/base/runner.rst.txt b/_sources/API/finetune/embedder/decoder_only/base/runner.rst.txt new file mode 100644 index 00000000..6a839e3d --- /dev/null +++ b/_sources/API/finetune/embedder/decoder_only/base/runner.rst.txt @@ -0,0 +1,5 @@ +Runner +====== + +.. autoclass:: FlagEmbedding.finetune.embedder.decoder_only.base.DecoderOnlyEmbedderRunner + :members: \ No newline at end of file diff --git a/_sources/API/finetune/embedder/decoder_only/base/trainer.rst.txt b/_sources/API/finetune/embedder/decoder_only/base/trainer.rst.txt new file mode 100644 index 00000000..02681eb6 --- /dev/null +++ b/_sources/API/finetune/embedder/decoder_only/base/trainer.rst.txt @@ -0,0 +1,5 @@ +Trainer +======= + +.. autoclass:: FlagEmbedding.finetune.embedder.decoder_only.base.DecoderOnlyEmbedderTrainer + :members: \ No newline at end of file diff --git a/_sources/API/finetune/embedder/decoder_only/icl.rst.txt b/_sources/API/finetune/embedder/decoder_only/icl.rst.txt new file mode 100644 index 00000000..65198260 --- /dev/null +++ b/_sources/API/finetune/embedder/decoder_only/icl.rst.txt @@ -0,0 +1,9 @@ +ICL +=== + +.. toctree:: + icl/arguments + icl/dataset + icl/modeling + icl/runner + icl/trainer \ No newline at end of file diff --git a/_sources/API/finetune/embedder/decoder_only/icl/arguments.rst.txt b/_sources/API/finetune/embedder/decoder_only/icl/arguments.rst.txt new file mode 100644 index 00000000..d2bd89a5 --- /dev/null +++ b/_sources/API/finetune/embedder/decoder_only/icl/arguments.rst.txt @@ -0,0 +1,6 @@ +Arguments +========= + +.. autoclass:: FlagEmbedding.finetune.embedder.decoder_only.icl.DecoderOnlyEmbedderICLModelArguments + +.. 
autoclass:: FlagEmbedding.finetune.embedder.decoder_only.icl.DecoderOnlyEmbedderICLDataArguments \ No newline at end of file diff --git a/_sources/API/finetune/embedder/decoder_only/icl/dataset.rst.txt b/_sources/API/finetune/embedder/decoder_only/icl/dataset.rst.txt new file mode 100644 index 00000000..87d5796e --- /dev/null +++ b/_sources/API/finetune/embedder/decoder_only/icl/dataset.rst.txt @@ -0,0 +1,18 @@ +======= +Dataset +======= + +DecoderOnlyEmbedderICLSameDatasetTrainDataset +============================================= + +.. autoclass:: FlagEmbedding.finetune.embedder.decoder_only.icl.DecoderOnlyEmbedderICLSameDatasetTrainDataset + +Methods +------- + +.. automethod:: FlagEmbedding.finetune.embedder.decoder_only.icl.DecoderOnlyEmbedderICLSameDatasetTrainDataset._create_batch_data + +AbsEmbedderSameDatasetCollator +============================== + +.. autoclass:: FlagEmbedding.finetune.embedder.decoder_only.icl.AbsEmbedderSameDatasetCollator \ No newline at end of file diff --git a/_sources/API/finetune/embedder/decoder_only/icl/modeling.rst.txt b/_sources/API/finetune/embedder/decoder_only/icl/modeling.rst.txt new file mode 100644 index 00000000..6bdd1ee1 --- /dev/null +++ b/_sources/API/finetune/embedder/decoder_only/icl/modeling.rst.txt @@ -0,0 +1,18 @@ +======== +Modeling +======== + +.. autoclass:: FlagEmbedding.finetune.embedder.decoder_only.icl.BiDecoderOnlyEmbedderICLModel + +Methods +======= + +.. automethod:: FlagEmbedding.finetune.embedder.decoder_only.icl.BiDecoderOnlyEmbedderICLModel.encode +.. automethod:: FlagEmbedding.finetune.embedder.decoder_only.icl.BiDecoderOnlyEmbedderICLModel.compute_score +.. automethod:: FlagEmbedding.finetune.embedder.decoder_only.icl.BiDecoderOnlyEmbedderICLModel.compute_loss +.. automethod:: FlagEmbedding.finetune.embedder.decoder_only.icl.BiDecoderOnlyEmbedderICLModel.gradient_checkpointing_enable +.. automethod:: FlagEmbedding.finetune.embedder.decoder_only.icl.BiDecoderOnlyEmbedderICLModel.enable_input_require_grads +.. automethod:: FlagEmbedding.finetune.embedder.decoder_only.icl.BiDecoderOnlyEmbedderICLModel.save + +.. automethod:: FlagEmbedding.finetune.embedder.decoder_only.icl.BiDecoderOnlyEmbedderICLModel._sentence_embedding +.. automethod:: FlagEmbedding.finetune.embedder.decoder_only.icl.BiDecoderOnlyEmbedderICLModel._compute_similarity diff --git a/_sources/API/finetune/embedder/decoder_only/icl/runner.rst.txt b/_sources/API/finetune/embedder/decoder_only/icl/runner.rst.txt new file mode 100644 index 00000000..eae8a796 --- /dev/null +++ b/_sources/API/finetune/embedder/decoder_only/icl/runner.rst.txt @@ -0,0 +1,5 @@ +Runner +====== + +.. autoclass:: FlagEmbedding.finetune.embedder.decoder_only.icl.DecoderOnlyEmbedderICLRunner + :members: \ No newline at end of file diff --git a/_sources/API/finetune/embedder/decoder_only/icl/trainer.rst.txt b/_sources/API/finetune/embedder/decoder_only/icl/trainer.rst.txt new file mode 100644 index 00000000..c25d594f --- /dev/null +++ b/_sources/API/finetune/embedder/decoder_only/icl/trainer.rst.txt @@ -0,0 +1,5 @@ +Trainer +======= + +.. autoclass:: FlagEmbedding.finetune.embedder.decoder_only.icl.DecoderOnlyEmbedderICLTrainer + :members: \ No newline at end of file diff --git a/_sources/API/finetune/embedder/encoder_only.rst.txt b/_sources/API/finetune/embedder/encoder_only.rst.txt new file mode 100644 index 00000000..b756a88d --- /dev/null +++ b/_sources/API/finetune/embedder/encoder_only.rst.txt @@ -0,0 +1,6 @@ +Encoder Only +============ + +.. 
toctree:: + encoder_only/base + encoder_only/m3 \ No newline at end of file diff --git a/_sources/API/finetune/embedder/encoder_only/base.rst.txt b/_sources/API/finetune/embedder/encoder_only/base.rst.txt new file mode 100644 index 00000000..14070e7e --- /dev/null +++ b/_sources/API/finetune/embedder/encoder_only/base.rst.txt @@ -0,0 +1,7 @@ +Base +==== + +.. toctree:: + base/modeling + base/runner + base/trainer \ No newline at end of file diff --git a/_sources/API/finetune/embedder/encoder_only/base/modeling.rst.txt b/_sources/API/finetune/embedder/encoder_only/base/modeling.rst.txt new file mode 100644 index 00000000..d03c21c1 --- /dev/null +++ b/_sources/API/finetune/embedder/encoder_only/base/modeling.rst.txt @@ -0,0 +1,17 @@ +Modeling +======== + +.. autoclass:: FlagEmbedding.finetune.embedder.encoder_only.base.BiEncoderOnlyEmbedderModel + +Methods +------- + +.. automethod:: FlagEmbedding.finetune.embedder.encoder_only.base.BiEncoderOnlyEmbedderModel.encode + +.. automethod:: FlagEmbedding.finetune.embedder.encoder_only.base.BiEncoderOnlyEmbedderModel.compute_score +.. automethod:: FlagEmbedding.finetune.embedder.encoder_only.base.BiEncoderOnlyEmbedderModel.compute_loss +.. automethod:: FlagEmbedding.finetune.embedder.encoder_only.base.BiEncoderOnlyEmbedderModel.gradient_checkpointing_enable +.. automethod:: FlagEmbedding.finetune.embedder.encoder_only.base.BiEncoderOnlyEmbedderModel.enable_input_require_grads +.. automethod:: FlagEmbedding.finetune.embedder.encoder_only.base.BiEncoderOnlyEmbedderModel.save +.. automethod:: FlagEmbedding.finetune.embedder.encoder_only.base.BiEncoderOnlyEmbedderModel._sentence_embedding +.. automethod:: FlagEmbedding.finetune.embedder.encoder_only.base.BiEncoderOnlyEmbedderModel._compute_similarity \ No newline at end of file diff --git a/_sources/API/finetune/embedder/encoder_only/base/runner.rst.txt b/_sources/API/finetune/embedder/encoder_only/base/runner.rst.txt new file mode 100644 index 00000000..a9212a68 --- /dev/null +++ b/_sources/API/finetune/embedder/encoder_only/base/runner.rst.txt @@ -0,0 +1,5 @@ +Runner +====== + +.. autoclass:: FlagEmbedding.finetune.embedder.encoder_only.base.EncoderOnlyEmbedderRunner + :members: \ No newline at end of file diff --git a/_sources/API/finetune/embedder/encoder_only/base/trainer.rst.txt b/_sources/API/finetune/embedder/encoder_only/base/trainer.rst.txt new file mode 100644 index 00000000..6b135de5 --- /dev/null +++ b/_sources/API/finetune/embedder/encoder_only/base/trainer.rst.txt @@ -0,0 +1,5 @@ +Trainer +======= + +.. autoclass:: FlagEmbedding.finetune.embedder.encoder_only.base.EncoderOnlyEmbedderTrainer + :members: \ No newline at end of file diff --git a/_sources/API/finetune/embedder/encoder_only/m3.rst.txt b/_sources/API/finetune/embedder/encoder_only/m3.rst.txt new file mode 100644 index 00000000..174c9302 --- /dev/null +++ b/_sources/API/finetune/embedder/encoder_only/m3.rst.txt @@ -0,0 +1,8 @@ +M3 +== + +.. toctree:: + m3/arguments + m3/modeling + m3/runner + m3/trainer \ No newline at end of file diff --git a/_sources/API/finetune/embedder/encoder_only/m3/arguments.rst.txt b/_sources/API/finetune/embedder/encoder_only/m3/arguments.rst.txt new file mode 100644 index 00000000..f89292a3 --- /dev/null +++ b/_sources/API/finetune/embedder/encoder_only/m3/arguments.rst.txt @@ -0,0 +1,6 @@ +Arguments +========= + +.. autoclass:: FlagEmbedding.finetune.embedder.encoder_only.m3.EncoderOnlyEmbedderM3ModelArguments + +.. 
autoclass:: FlagEmbedding.finetune.embedder.encoder_only.m3.EncoderOnlyEmbedderM3TrainingArguments \ No newline at end of file diff --git a/_sources/API/finetune/embedder/encoder_only/m3/modeling.rst.txt b/_sources/API/finetune/embedder/encoder_only/m3/modeling.rst.txt new file mode 100644 index 00000000..3784b4f2 --- /dev/null +++ b/_sources/API/finetune/embedder/encoder_only/m3/modeling.rst.txt @@ -0,0 +1,35 @@ +======== +Modeling +======== + +EncoderOnlyEmbedderM3Model +============================ + +.. autoclass:: FlagEmbedding.finetune.embedder.encoder_only.m3.EncoderOnlyEmbedderM3Model + +Methods +------- + +.. automethod:: FlagEmbedding.finetune.embedder.encoder_only.m3.EncoderOnlyEmbedderM3Model.encode +.. automethod:: FlagEmbedding.finetune.embedder.encoder_only.m3.EncoderOnlyEmbedderM3Model.compute_score +.. automethod:: FlagEmbedding.finetune.embedder.encoder_only.m3.EncoderOnlyEmbedderM3Model.compute_dense_score +.. automethod:: FlagEmbedding.finetune.embedder.encoder_only.m3.EncoderOnlyEmbedderM3Model.compute_sparse_score +.. automethod:: FlagEmbedding.finetune.embedder.encoder_only.m3.EncoderOnlyEmbedderM3Model.compute_colbert_score +.. automethod:: FlagEmbedding.finetune.embedder.encoder_only.m3.EncoderOnlyEmbedderM3Model.ensemble_score +.. automethod:: FlagEmbedding.finetune.embedder.encoder_only.m3.EncoderOnlyEmbedderM3Model.forward +.. automethod:: FlagEmbedding.finetune.embedder.encoder_only.m3.EncoderOnlyEmbedderM3Model.compute_loss +.. automethod:: FlagEmbedding.finetune.embedder.encoder_only.m3.EncoderOnlyEmbedderM3Model.gradient_checkpointing_enable +.. automethod:: FlagEmbedding.finetune.embedder.encoder_only.m3.EncoderOnlyEmbedderM3Model.enable_input_require_grads +.. automethod:: FlagEmbedding.finetune.embedder.encoder_only.m3.EncoderOnlyEmbedderM3Model.save +.. automethod:: FlagEmbedding.finetune.embedder.encoder_only.m3.EncoderOnlyEmbedderM3Model._dense_embedding +.. automethod:: FlagEmbedding.finetune.embedder.encoder_only.m3.EncoderOnlyEmbedderM3Model._sparse_embedding +.. automethod:: FlagEmbedding.finetune.embedder.encoder_only.m3.EncoderOnlyEmbedderM3Model._colbert_embedding +.. automethod:: FlagEmbedding.finetune.embedder.encoder_only.m3.EncoderOnlyEmbedderM3Model._encode +.. automethod:: FlagEmbedding.finetune.embedder.encoder_only.m3.EncoderOnlyEmbedderM3Model._compute_similarity +.. automethod:: FlagEmbedding.finetune.embedder.encoder_only.m3.EncoderOnlyEmbedderM3Model._get_queries_attention_mask + +EncoderOnlyEmbedderM3ModelForInference +====================================== + +.. autoclass:: FlagEmbedding.finetune.embedder.encoder_only.m3.EncoderOnlyEmbedderM3ModelForInference + :members: \ No newline at end of file diff --git a/_sources/API/finetune/embedder/encoder_only/m3/runner.rst.txt b/_sources/API/finetune/embedder/encoder_only/m3/runner.rst.txt new file mode 100644 index 00000000..e5c6d424 --- /dev/null +++ b/_sources/API/finetune/embedder/encoder_only/m3/runner.rst.txt @@ -0,0 +1,5 @@ +Runner +====== + +.. autoclass:: FlagEmbedding.finetune.embedder.encoder_only.m3.EncoderOnlyEmbedderM3Runner + :members: \ No newline at end of file diff --git a/_sources/API/finetune/embedder/encoder_only/m3/trainer.rst.txt b/_sources/API/finetune/embedder/encoder_only/m3/trainer.rst.txt new file mode 100644 index 00000000..2c68a989 --- /dev/null +++ b/_sources/API/finetune/embedder/encoder_only/m3/trainer.rst.txt @@ -0,0 +1,5 @@ +Trainer +======= + +.. 
autoclass:: FlagEmbedding.finetune.embedder.encoder_only.m3.EncoderOnlyEmbedderM3Trainer + :members: \ No newline at end of file diff --git a/_sources/API/finetune/reranker.rst.txt b/_sources/API/finetune/reranker.rst.txt new file mode 100644 index 00000000..e6a47f2f --- /dev/null +++ b/_sources/API/finetune/reranker.rst.txt @@ -0,0 +1,6 @@ +Reranker +======== + +.. toctree:: + reranker/encoder_only + reranker/decoder_only \ No newline at end of file diff --git a/_sources/API/finetune/reranker/decoder_only.rst.txt b/_sources/API/finetune/reranker/decoder_only.rst.txt new file mode 100644 index 00000000..bc5f39fa --- /dev/null +++ b/_sources/API/finetune/reranker/decoder_only.rst.txt @@ -0,0 +1,6 @@ +Decoder Only +============ + +.. toctree:: + decoder_only/base + decoder_only/layerwise \ No newline at end of file diff --git a/_sources/API/finetune/reranker/decoder_only/base.rst.txt b/_sources/API/finetune/reranker/decoder_only/base.rst.txt new file mode 100644 index 00000000..840078b7 --- /dev/null +++ b/_sources/API/finetune/reranker/decoder_only/base.rst.txt @@ -0,0 +1,8 @@ +Base +==== + +.. toctree:: + base/arguments + base/modeling + base/runner + base/trainer \ No newline at end of file diff --git a/_sources/API/finetune/reranker/decoder_only/base/arguments.rst.txt b/_sources/API/finetune/reranker/decoder_only/base/arguments.rst.txt new file mode 100644 index 00000000..d725e3e1 --- /dev/null +++ b/_sources/API/finetune/reranker/decoder_only/base/arguments.rst.txt @@ -0,0 +1,4 @@ +Arguments +========= + +.. autoclass:: FlagEmbedding.finetune.reranker.decoder_only.base.RerankerModelArguments diff --git a/_sources/API/finetune/reranker/decoder_only/base/modeling.rst.txt b/_sources/API/finetune/reranker/decoder_only/base/modeling.rst.txt new file mode 100644 index 00000000..ce83f82b --- /dev/null +++ b/_sources/API/finetune/reranker/decoder_only/base/modeling.rst.txt @@ -0,0 +1,18 @@ +======== +Modeling +======== + +.. autoclass:: FlagEmbedding.finetune.embedder.decoder_only.base.BiDecoderOnlyEmbedderModel + +Methods +======= + +.. automethod:: FlagEmbedding.finetune.embedder.decoder_only.base.BiDecoderOnlyEmbedderModel.encode +.. automethod:: FlagEmbedding.finetune.embedder.decoder_only.base.BiDecoderOnlyEmbedderModel.compute_score +.. automethod:: FlagEmbedding.finetune.embedder.decoder_only.base.BiDecoderOnlyEmbedderModel.compute_loss +.. automethod:: FlagEmbedding.finetune.embedder.decoder_only.base.BiDecoderOnlyEmbedderModel.gradient_checkpointing_enable +.. automethod:: FlagEmbedding.finetune.embedder.decoder_only.base.BiDecoderOnlyEmbedderModel.enable_input_require_grads +.. automethod:: FlagEmbedding.finetune.embedder.decoder_only.base.BiDecoderOnlyEmbedderModel.save + +.. automethod:: FlagEmbedding.finetune.embedder.decoder_only.base.BiDecoderOnlyEmbedderModel._sentence_embedding +.. automethod:: FlagEmbedding.finetune.embedder.decoder_only.base.BiDecoderOnlyEmbedderModel._compute_similarity diff --git a/_sources/API/finetune/reranker/decoder_only/base/runner.rst.txt b/_sources/API/finetune/reranker/decoder_only/base/runner.rst.txt new file mode 100644 index 00000000..6d6cba72 --- /dev/null +++ b/_sources/API/finetune/reranker/decoder_only/base/runner.rst.txt @@ -0,0 +1,5 @@ +Runner +====== + +.. 
autoclass:: FlagEmbedding.finetune.reranker.decoder_only.base.DecoderOnlyRerankerRunner + :members: \ No newline at end of file diff --git a/_sources/API/finetune/reranker/decoder_only/base/trainer.rst.txt b/_sources/API/finetune/reranker/decoder_only/base/trainer.rst.txt new file mode 100644 index 00000000..1b6de8be --- /dev/null +++ b/_sources/API/finetune/reranker/decoder_only/base/trainer.rst.txt @@ -0,0 +1,5 @@ +Trainer +======= + +.. autoclass:: FlagEmbedding.finetune.reranker.decoder_only.base.DecoderOnlyRerankerTrainer + :members: \ No newline at end of file diff --git a/_sources/API/finetune/reranker/decoder_only/layerwise.rst.txt b/_sources/API/finetune/reranker/decoder_only/layerwise.rst.txt new file mode 100644 index 00000000..2473432e --- /dev/null +++ b/_sources/API/finetune/reranker/decoder_only/layerwise.rst.txt @@ -0,0 +1,8 @@ +Layerwise +========= + +.. toctree:: + layerwise/arguments + layerwise/modeling + layerwise/runner + layerwise/trainer \ No newline at end of file diff --git a/_sources/API/finetune/reranker/decoder_only/layerwise/arguments.rst.txt b/_sources/API/finetune/reranker/decoder_only/layerwise/arguments.rst.txt new file mode 100644 index 00000000..462ef342 --- /dev/null +++ b/_sources/API/finetune/reranker/decoder_only/layerwise/arguments.rst.txt @@ -0,0 +1,4 @@ +Arguments +========= + +.. autoclass:: FlagEmbedding.finetune.reranker.decoder_only.layerwise.RerankerModelArguments diff --git a/_sources/API/finetune/reranker/decoder_only/layerwise/modeling.rst.txt b/_sources/API/finetune/reranker/decoder_only/layerwise/modeling.rst.txt new file mode 100644 index 00000000..ff1e0da7 --- /dev/null +++ b/_sources/API/finetune/reranker/decoder_only/layerwise/modeling.rst.txt @@ -0,0 +1,11 @@ +======== +Modeling +======== + +.. autoclass:: FlagEmbedding.finetune.reranker.decoder_only.layerwise.CrossDecoderModel + +Methods +======= + +.. automethod:: FlagEmbedding.finetune.reranker.decoder_only.layerwise.CrossDecoderModel.encode +.. automethod:: FlagEmbedding.finetune.reranker.decoder_only.layerwise.CrossDecoderModel.forward diff --git a/_sources/API/finetune/reranker/decoder_only/layerwise/runner.rst.txt b/_sources/API/finetune/reranker/decoder_only/layerwise/runner.rst.txt new file mode 100644 index 00000000..6bac60a3 --- /dev/null +++ b/_sources/API/finetune/reranker/decoder_only/layerwise/runner.rst.txt @@ -0,0 +1,5 @@ +Runner +====== + +.. autoclass:: FlagEmbedding.finetune.reranker.decoder_only.layerwise.DecoderOnlyRerankerRunner + :members: \ No newline at end of file diff --git a/_sources/API/finetune/reranker/decoder_only/layerwise/trainer.rst.txt b/_sources/API/finetune/reranker/decoder_only/layerwise/trainer.rst.txt new file mode 100644 index 00000000..7b979cfd --- /dev/null +++ b/_sources/API/finetune/reranker/decoder_only/layerwise/trainer.rst.txt @@ -0,0 +1,5 @@ +Trainer +======= + +.. autoclass:: FlagEmbedding.finetune.reranker.decoder_only.layerwise.DecoderOnlyRerankerTrainer + :members: \ No newline at end of file diff --git a/_sources/API/finetune/reranker/encoder_only.rst.txt b/_sources/API/finetune/reranker/encoder_only.rst.txt new file mode 100644 index 00000000..e34aa5eb --- /dev/null +++ b/_sources/API/finetune/reranker/encoder_only.rst.txt @@ -0,0 +1,5 @@ +Encoder Only +============ + +.. 
toctree:: + encoder_only/base \ No newline at end of file diff --git a/_sources/API/finetune/reranker/encoder_only/base.rst.txt b/_sources/API/finetune/reranker/encoder_only/base.rst.txt new file mode 100644 index 00000000..14070e7e --- /dev/null +++ b/_sources/API/finetune/reranker/encoder_only/base.rst.txt @@ -0,0 +1,7 @@ +Base +==== + +.. toctree:: + base/modeling + base/runner + base/trainer \ No newline at end of file diff --git a/_sources/API/finetune/reranker/encoder_only/base/modeling.rst.txt b/_sources/API/finetune/reranker/encoder_only/base/modeling.rst.txt new file mode 100644 index 00000000..2a26078a --- /dev/null +++ b/_sources/API/finetune/reranker/encoder_only/base/modeling.rst.txt @@ -0,0 +1,9 @@ +Modeling +======== + +.. autoclass:: FlagEmbedding.finetune.reranker.encoder_only.base.CrossEncoderModel + +Methods +------- + +.. automethod:: FlagEmbedding.finetune.reranker.encoder_only.base.CrossEncoderModel.encode diff --git a/_sources/API/finetune/reranker/encoder_only/base/runner.rst.txt b/_sources/API/finetune/reranker/encoder_only/base/runner.rst.txt new file mode 100644 index 00000000..c465793e --- /dev/null +++ b/_sources/API/finetune/reranker/encoder_only/base/runner.rst.txt @@ -0,0 +1,5 @@ +Runner +====== + +.. autoclass:: FlagEmbedding.finetune.reranker.encoder_only.base.EncoderOnlyRerankerRunner + :members: \ No newline at end of file diff --git a/_sources/API/finetune/reranker/encoder_only/base/trainer.rst.txt b/_sources/API/finetune/reranker/encoder_only/base/trainer.rst.txt new file mode 100644 index 00000000..d3aeac86 --- /dev/null +++ b/_sources/API/finetune/reranker/encoder_only/base/trainer.rst.txt @@ -0,0 +1,5 @@ +Trainer +======= + +.. autoclass:: FlagEmbedding.finetune.reranker.encoder_only.base.EncoderOnlyRerankerTrainer + :members: \ No newline at end of file diff --git a/_sources/API/inference.rst.txt b/_sources/API/inference.rst.txt new file mode 100644 index 00000000..047e28fb --- /dev/null +++ b/_sources/API/inference.rst.txt @@ -0,0 +1,8 @@ +Inference +========= + +.. toctree:: + inference/FlagAutoModel + inference/FlagAutoReranker + inference/embedder/embedder + inference/reranker/reranker \ No newline at end of file diff --git a/_sources/API/inference/FlagAutoModel.rst.txt b/_sources/API/inference/FlagAutoModel.rst.txt new file mode 100644 index 00000000..c9bfec11 --- /dev/null +++ b/_sources/API/inference/FlagAutoModel.rst.txt @@ -0,0 +1,9 @@ +FlagAutoModel +============= + +.. autoclass:: FlagEmbedding.inference.FlagAutoModel + +Methods +------- + +.. automethod:: FlagEmbedding.inference.FlagAutoModel.from_finetuned \ No newline at end of file diff --git a/_sources/API/inference/FlagAutoReranker.rst.txt b/_sources/API/inference/FlagAutoReranker.rst.txt new file mode 100644 index 00000000..9285e74f --- /dev/null +++ b/_sources/API/inference/FlagAutoReranker.rst.txt @@ -0,0 +1,9 @@ +FlagAutoReranker +================ + +.. autoclass:: FlagEmbedding.inference.FlagAutoReranker + +Methods +------- + +.. automethod:: FlagEmbedding.inference.FlagAutoReranker.from_finetuned \ No newline at end of file diff --git a/_sources/API/inference/embedder/decoder_only/BaseLLMEmbedder.rst.txt b/_sources/API/inference/embedder/decoder_only/BaseLLMEmbedder.rst.txt new file mode 100644 index 00000000..29c7bae7 --- /dev/null +++ b/_sources/API/inference/embedder/decoder_only/BaseLLMEmbedder.rst.txt @@ -0,0 +1,15 @@ +BaseEmbedder +============ + +.. autoclass:: FlagEmbedding.inference.embedder.decoder_only.base.BaseLLMEmbedder + +Methods +------- + +.. 
automethod:: FlagEmbedding.inference.embedder.decoder_only.base.BaseLLMEmbedder.encode_queries + +.. automethod:: FlagEmbedding.inference.embedder.decoder_only.base.BaseLLMEmbedder.encode_corpus + +.. automethod:: FlagEmbedding.inference.embedder.decoder_only.base.BaseLLMEmbedder.encode + +.. automethod:: FlagEmbedding.inference.embedder.decoder_only.base.BaseLLMEmbedder.encode_single_device \ No newline at end of file diff --git a/_sources/API/inference/embedder/decoder_only/ICLLLMEmbedder.rst.txt b/_sources/API/inference/embedder/decoder_only/ICLLLMEmbedder.rst.txt new file mode 100644 index 00000000..728a2ccc --- /dev/null +++ b/_sources/API/inference/embedder/decoder_only/ICLLLMEmbedder.rst.txt @@ -0,0 +1,21 @@ +ICLLLMEmbedder +============== + +.. autoclass:: FlagEmbedding.inference.embedder.decoder_only.icl.ICLLLMEmbedder + +Methods +------- + +.. automethod:: FlagEmbedding.inference.embedder.decoder_only.icl.ICLLLMEmbedder.encode_queries + +.. automethod:: FlagEmbedding.inference.embedder.decoder_only.icl.ICLLLMEmbedder.encode_corpus + +.. automethod:: FlagEmbedding.inference.embedder.decoder_only.icl.ICLLLMEmbedder.encode + +.. automethod:: FlagEmbedding.inference.embedder.decoder_only.icl.ICLLLMEmbedder.set_examples + +.. automethod:: FlagEmbedding.inference.embedder.decoder_only.icl.ICLLLMEmbedder.get_detailed_example + +.. automethod:: FlagEmbedding.inference.embedder.decoder_only.icl.ICLLLMEmbedder.encode_queries_single_device + +.. automethod:: FlagEmbedding.inference.embedder.decoder_only.icl.ICLLLMEmbedder.encode_single_device \ No newline at end of file diff --git a/_sources/API/inference/embedder/embedder.rst.txt b/_sources/API/inference/embedder/embedder.rst.txt new file mode 100644 index 00000000..cdb45d8d --- /dev/null +++ b/_sources/API/inference/embedder/embedder.rst.txt @@ -0,0 +1,8 @@ +Embedder +======== + +.. toctree:: + encoder_only/BaseEmbedder + encoder_only/M3Embedder + decoder_only/BaseLLMEmbedder + decoder_only/ICLLLMEmbedder \ No newline at end of file diff --git a/_sources/API/inference/embedder/encoder_only/BaseEmbedder.rst.txt b/_sources/API/inference/embedder/encoder_only/BaseEmbedder.rst.txt new file mode 100644 index 00000000..f57a4727 --- /dev/null +++ b/_sources/API/inference/embedder/encoder_only/BaseEmbedder.rst.txt @@ -0,0 +1,18 @@ +BaseEmbedder +============ + +.. autoclass:: FlagEmbedding.inference.embedder.encoder_only.base.BaseEmbedder + +Methods +------- + +.. automethod:: FlagEmbedding.inference.embedder.encoder_only.base.BaseEmbedder.encode_queries + :no-index: + +.. automethod:: FlagEmbedding.inference.embedder.encoder_only.base.BaseEmbedder.encode_corpus + +.. automethod:: FlagEmbedding.inference.embedder.encoder_only.base.BaseEmbedder.encode + +.. automethod:: FlagEmbedding.inference.embedder.encoder_only.base.BaseEmbedder.encode_single_device + +.. automethod:: FlagEmbedding.inference.embedder.encoder_only.base.BaseEmbedder.pooling \ No newline at end of file diff --git a/_sources/API/inference/embedder/encoder_only/M3Embedder.rst.txt b/_sources/API/inference/embedder/encoder_only/M3Embedder.rst.txt new file mode 100644 index 00000000..18ce3f74 --- /dev/null +++ b/_sources/API/inference/embedder/encoder_only/M3Embedder.rst.txt @@ -0,0 +1,27 @@ +M3Embedder +============ + +.. autoclass:: FlagEmbedding.inference.embedder.encoder_only.m3.M3Embedder + +Methods +------- + +.. automethod:: FlagEmbedding.inference.embedder.encoder_only.m3.M3Embedder.encode_queries + +.. 
automethod:: FlagEmbedding.inference.embedder.encoder_only.m3.M3Embedder.encode_corpus + +.. automethod:: FlagEmbedding.inference.embedder.encoder_only.m3.M3Embedder.encode + +.. automethod:: FlagEmbedding.inference.embedder.encoder_only.m3.M3Embedder.convert_id_to_token + +.. automethod:: FlagEmbedding.inference.embedder.encoder_only.m3.M3Embedder.compute_lexical_matching_score + +.. automethod:: FlagEmbedding.inference.embedder.encoder_only.m3.M3Embedder.colbert_score + +.. automethod:: FlagEmbedding.inference.embedder.encoder_only.m3.M3Embedder.encode_single_device + +.. automethod:: FlagEmbedding.inference.embedder.encoder_only.m3.M3Embedder.compute_score + +.. automethod:: FlagEmbedding.inference.embedder.encoder_only.m3.M3Embedder.compute_score_multi_process + +.. automethod:: FlagEmbedding.inference.embedder.encoder_only.m3.M3Embedder.compute_score_single_device \ No newline at end of file diff --git a/_sources/API/inference/reranker/decoder_only/BaseLLMReranker.rst.txt b/_sources/API/inference/reranker/decoder_only/BaseLLMReranker.rst.txt new file mode 100644 index 00000000..1b21c86b --- /dev/null +++ b/_sources/API/inference/reranker/decoder_only/BaseLLMReranker.rst.txt @@ -0,0 +1,9 @@ +BaseLLMReranker +=============== + +.. autoclass:: FlagEmbedding.inference.reranker.decoder_only.base.BaseLLMReranker + +Methods +------- + +.. autoclass:: FlagEmbedding.inference.reranker.decoder_only.base.BaseLLMReranker.compute_score_single_gpu diff --git a/_sources/API/inference/reranker/decoder_only/LayerWiseLLMReranker.rst.txt b/_sources/API/inference/reranker/decoder_only/LayerWiseLLMReranker.rst.txt new file mode 100644 index 00000000..7f0ce702 --- /dev/null +++ b/_sources/API/inference/reranker/decoder_only/LayerWiseLLMReranker.rst.txt @@ -0,0 +1,9 @@ +LayerWiseLLMReranker +==================== + +.. autoclass:: FlagEmbedding.inference.reranker.decoder_only.layerwise.LayerWiseLLMReranker + +Methods +------- + +.. autoclass:: FlagEmbedding.inference.reranker.decoder_only.layerwise.LayerWiseLLMReranker.compute_score_single_gpu diff --git a/_sources/API/inference/reranker/decoder_only/LightweightLLMReranker.rst.txt b/_sources/API/inference/reranker/decoder_only/LightweightLLMReranker.rst.txt new file mode 100644 index 00000000..28ce1e52 --- /dev/null +++ b/_sources/API/inference/reranker/decoder_only/LightweightLLMReranker.rst.txt @@ -0,0 +1,9 @@ +LightweightLLMReranker +====================== + +.. autoclass:: FlagEmbedding.inference.reranker.decoder_only.lightweight.LightweightLLMReranker + +Methods +------- + +.. autoclass:: FlagEmbedding.inference.reranker.decoder_only.lightweight.LightweightLLMReranker.compute_score_single_gpu diff --git a/_sources/API/inference/reranker/encoder_only/BaseReranker.rst.txt b/_sources/API/inference/reranker/encoder_only/BaseReranker.rst.txt new file mode 100644 index 00000000..24e1585f --- /dev/null +++ b/_sources/API/inference/reranker/encoder_only/BaseReranker.rst.txt @@ -0,0 +1,9 @@ +BaseReranker +============ + +.. autoclass:: FlagEmbedding.inference.reranker.encoder_only.base.BaseReranker + +Methods +------- + +.. autoclass:: FlagEmbedding.inference.reranker.encoder_only.base.BaseReranker.compute_score_single_gpu diff --git a/_sources/API/inference/reranker/reranker.rst.txt b/_sources/API/inference/reranker/reranker.rst.txt new file mode 100644 index 00000000..608e7175 --- /dev/null +++ b/_sources/API/inference/reranker/reranker.rst.txt @@ -0,0 +1,8 @@ +Reranker +======== + +.. 
toctree:: + encoder_only/BaseReranker + decoder_only/BaseLLMReranker + decoder_only/LayerWiseLLMReranker + decoder_only/LightweightLLMReranker \ No newline at end of file diff --git a/_sources/C-MTEB.rst.txt b/_sources/C-MTEB.rst.txt new file mode 100644 index 00000000..fac2ec33 --- /dev/null +++ b/_sources/C-MTEB.rst.txt @@ -0,0 +1,47 @@ +.. C-MTEB +.. ====== + +.. Introduction +.. ------------ + +.. `C-MTEB `_ is a benchmark for chinese text embedding. It contains 35 +.. datasets in 6 different tasks, providing a comprehensive evaluation to the quality of an embedding model on Chinese. + + +.. .. image:: ../_static/img/C_MTEB.png +.. :width: 700 +.. :align: center + + +.. Installation +.. ------------ + +.. C-MTEB is developed based on MTEB, you can install C-MTEB by: + +.. .. code:: bash + +.. pip install -U C_MTEB + +.. or install by FlagEmbedding's repo: + +.. .. code:: bash + +.. git clone https://github.com/FlagOpen/FlagEmbedding.git +.. cd FlagEmbedding/C_MTEB +.. pip install -e . + +.. Citing the Work +.. --------------- + +.. There are more details in our publication. If you find C-MTEB useful, you can cite it by: + +.. .. code:: + +.. @misc{c-pack, +.. title={C-Pack: Packaged Resources To Advance General Chinese Embedding}, +.. author={Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff}, +.. year={2023}, +.. eprint={2309.07597}, +.. archivePrefix={arXiv}, +.. primaryClass={cs.CL} +.. } \ No newline at end of file diff --git a/_sources/Introduction/installation.rst.txt b/_sources/Introduction/installation.rst.txt new file mode 100644 index 00000000..4aacbb27 --- /dev/null +++ b/_sources/Introduction/installation.rst.txt @@ -0,0 +1,43 @@ +:github_url: https://github.com/AI4Finance-Foundation/FinRL + +Installation +============ + +Using pip: +---------- + +If you do not want to finetune the models, you can install the package without the finetune dependency: + +.. code:: bash + + pip install -U FlagEmbedding + +If you want to finetune the models, you can install the package with the finetune dependency: + +.. code:: bash + + pip install -U FlagEmbedding[finetune] + + +Install from sources: +--------------------- + +Clone the repository and install + +.. code:: bash + + git clone https://github.com/FlagOpen/FlagEmbedding.git + cd FlagEmbedding + # If you do not want to finetune the models, you can install the package without the finetune dependency: + pip install . + # If you want to finetune the models, you can install the package with the finetune dependency: + pip install .[finetune] + +For development in editable mode: + +.. code:: bash + + # If you do not want to finetune the models, you can install the package without the finetune dependency: + pip install -e . + # If you want to finetune the models, you can install the package with the finetune dependency: + pip install -e .[finetune] \ No newline at end of file diff --git a/_sources/Introduction/quick_start.ipynb.txt b/_sources/Introduction/quick_start.ipynb.txt new file mode 100644 index 00000000..4eed81cb --- /dev/null +++ b/_sources/Introduction/quick_start.ipynb.txt @@ -0,0 +1,472 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Quick Start" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this tutorial, we will show how to use BGE models on a text retrieval task in 5 minutes." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 0: Preparation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, install FlagEmbedding in the environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -U FlagEmbedding" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Below is a super tiny courpus with only 10 sentences, which will be the dataset we use.\n", + "\n", + "Each sentence is a concise discription of a famous people in specific domain." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "corpus = [\n", + " \"Michael Jackson was a legendary pop icon known for his record-breaking music and dance innovations.\",\n", + " \"Fei-Fei Li is a professor in Stanford University, revolutionized computer vision with the ImageNet project.\",\n", + " \"Brad Pitt is a versatile actor and producer known for his roles in films like 'Fight Club' and 'Once Upon a Time in Hollywood.'\",\n", + " \"Geoffrey Hinton, as a foundational figure in AI, received Turing Award for his contribution in deep learning.\",\n", + " \"Eminem is a renowned rapper and one of the best-selling music artists of all time.\",\n", + " \"Taylor Swift is a Grammy-winning singer-songwriter known for her narrative-driven music.\",\n", + " \"Sam Altman leads OpenAI as its CEO, with astonishing works of GPT series and pursuing safe and beneficial AI.\",\n", + " \"Morgan Freeman is an acclaimed actor famous for his distinctive voice and diverse roles.\",\n", + " \"Andrew Ng spread AI knowledge globally via public courses on Coursera and Stanford University.\",\n", + " \"Robert Downey Jr. is an iconic actor best known for playing Iron Man in the Marvel Cinematic Universe.\",\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We want to know which one of these people could be an expert of neural network and who he/she is. \n", + "\n", + "Thus we generate the following query:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "query = \"Who could be an expert of neural network?\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 1: Text -> Embedding" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, let's use a [BGE embedding model](https://huggingface.co/BAAI/bge-base-en-v1.5) to create sentence embedding for the corpus." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "from FlagEmbedding import FlagModel\n", + "\n", + "# get the BGE embedding model\n", + "model = FlagModel('BAAI/bge-base-en-v1.5',\n", + " query_instruction_for_retrieval=\"Represent this sentence for searching relevant passages:\",\n", + " use_fp16=True)\n", + "\n", + "# get the embedding of the query and corpus\n", + "corpus_embeddings = model.encode(corpus)\n", + "query_embedding = model.encode(query)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The embedding of each sentence is a vector with length 768. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "shape of the query embedding: (768,)\n", + "shape of the corpus embeddings: (10, 768)\n" + ] + } + ], + "source": [ + "print(\"shape of the query embedding: \", query_embedding.shape)\n", + "print(\"shape of the corpus embeddings:\", corpus_embeddings.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Run the following print line to take a look at the first 10 elements of the query embedding vector." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[-0.00790005 -0.00683443 -0.00806659 0.00756918 0.04374858 0.02838556\n", + " 0.02357143 -0.02270943 -0.03611493 -0.03038301]\n" + ] + } + ], + "source": [ + "print(query_embedding[:10])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 2: Calculate Similarity" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, we have the embeddings of the query and the corpus. The next step is to calculate the similarity between the query and each sentence in the corpus. Here we use the dot product/inner product as our similarity metric." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0.39290053 0.6031525 0.32672375 0.6082418 0.39446455 0.35350388\n", + " 0.4626108 0.40196604 0.5284606 0.36792332]\n" + ] + } + ], + "source": [ + "sim_scores = query_embedding @ corpus_embeddings.T\n", + "print(sim_scores)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The result is a list of score representing the query's similarity to: [sentence 0, sentence 1, sentence 2, ...]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 3: Ranking" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After we have the similarity score of the query to each sentence in the corpus, we can rank them from large to small." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[3, 1, 8, 6, 7, 4, 0, 9, 5, 2]\n" + ] + } + ], + "source": [ + "# get the indices in sorted order\n", + "sorted_indices = sorted(range(len(sim_scores)), key=lambda k: sim_scores[k], reverse=True)\n", + "print(sorted_indices)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now from the ranking, the sentence with index 3 is the best answer to our query \"Who could be an expert of neural network?\"\n", + "\n", + "And that person is Geoffrey Hinton!" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Geoffrey Hinton, as a foundational figure in AI, received Turing Award for his contribution in deep learning.\n" + ] + } + ], + "source": [ + "print(corpus[3])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "According to the order of indecies, we can print out the ranking of people that our little retriever got." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Score of 0.608: \"Geoffrey Hinton, as a foundational figure in AI, received Turing Award for his contribution in deep learning.\"\n", + "Score of 0.603: \"Fei-Fei Li is a professor in Stanford University, revolutionized computer vision with the ImageNet project.\"\n", + "Score of 0.528: \"Andrew Ng spread AI knowledge globally via public courses on Coursera and Stanford University.\"\n", + "Score of 0.463: \"Sam Altman leads OpenAI as its CEO, with astonishing works of GPT series and pursuing safe and beneficial AI.\"\n", + "Score of 0.402: \"Morgan Freeman is an acclaimed actor famous for his distinctive voice and diverse roles.\"\n", + "Score of 0.394: \"Eminem is a renowned rapper and one of the best-selling music artists of all time.\"\n", + "Score of 0.393: \"Michael Jackson was a legendary pop icon known for his record-breaking music and dance innovations.\"\n", + "Score of 0.368: \"Robert Downey Jr. is an iconic actor best known for playing Iron Man in the Marvel Cinematic Universe.\"\n", + "Score of 0.354: \"Taylor Swift is a Grammy-winning singer-songwriter known for her narrative-driven music.\"\n", + "Score of 0.327: \"Brad Pitt is a versatile actor and producer known for his roles in films like 'Fight Club' and 'Once Upon a Time in Hollywood.'\"\n" + ] + } + ], + "source": [ + "# iteratively print the score and corresponding sentences in descending order\n", + "\n", + "for i in sorted_indices:\n", + " print(f\"Score of {sim_scores[i]:.3f}: \\\"{corpus[i]}\\\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "From the ranking, not surprisingly, the similarity scores of the query and the discriptions of Geoffrey Hinton and Fei-Fei Li is way higher than others, following by those of Andrew Ng and Sam Altman. \n", + "\n", + "While the key phrase \"neural network\" in the query does not appear in any of those discriptions, the BGE embedding model is still powerful enough to get the semantic meaning of query and corpus well." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 4: Evaluate" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We've seen the embedding model performed pretty well on the \"neural network\" query. What about the more general quality?\n", + "\n", + "Let's generate a very small dataset of queries and corresponding ground truth answers. Note that the ground truth answers are the indices of sentences in the corpus." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "queries = [\n", + " \"Who could be an expert of neural network?\",\n", + " \"Who might had won Grammy?\",\n", + " \"Won Academy Awards\",\n", + " \"One of the most famous female singers.\",\n", + " \"Inventor of AlexNet\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "ground_truth = [\n", + " [1, 3],\n", + " [0, 4, 5],\n", + " [2, 7, 9],\n", + " [5],\n", + " [3],\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here we repeat the steps we covered above to get the predicted ranking of each query." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[[3, 1, 8, 6, 7, 4, 0, 9, 5, 2],\n", + " [5, 0, 3, 4, 1, 9, 7, 2, 6, 8],\n", + " [3, 2, 7, 5, 9, 0, 1, 4, 6, 8],\n", + " [5, 0, 4, 7, 1, 9, 2, 3, 6, 8],\n", + " [3, 1, 8, 6, 0, 7, 5, 9, 4, 2]]" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# use bge model to generate embeddings for all the queries\n", + "queries_embedding = model.encode(queries)\n", + "# compute similarity scores\n", + "scores = queries_embedding @ corpus_embeddings.T\n", + "# get he final rankings\n", + "rankings = [sorted(range(len(sim_scores)), key=lambda k: sim_scores[k], reverse=True) for sim_scores in scores]\n", + "rankings" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Mean Reciprocal Rank ([MRR](https://en.wikipedia.org/wiki/Mean_reciprocal_rank)) is a widely used metric in information retrieval to evaluate the effectiveness of a system. Here we use that to have a very rough idea how our system performs." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "def MRR(preds, labels, cutoffs):\n", + " mrr = [0 for _ in range(len(cutoffs))]\n", + " for pred, label in zip(preds, labels):\n", + " for i, c in enumerate(cutoffs):\n", + " for j, index in enumerate(pred):\n", + " if j < c and index in label:\n", + " mrr[i] += 1/(j+1)\n", + " break\n", + " mrr = [k/len(preds) for k in mrr]\n", + " return mrr" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We choose to use 1 and 5 as our cutoffs, with the result of 0.8 and 0.9 respectively." + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MRR@1: 0.8\n", + "MRR@5: 0.9\n" + ] + } + ], + "source": [ + "cutoffs = [1, 5]\n", + "mrrs = MRR(rankings, ground_truth, cutoffs)\n", + "for i, c in enumerate(cutoffs):\n", + " print(f\"MRR@{c}: {mrrs[i]}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/_sources/bge/bge_icl.rst.txt b/_sources/bge/bge_icl.rst.txt new file mode 100644 index 00000000..5c91e04f --- /dev/null +++ b/_sources/bge/bge_icl.rst.txt @@ -0,0 +1,2 @@ +BGE-en-icl +========== \ No newline at end of file diff --git a/_sources/bge/bge_m3.rst.txt b/_sources/bge/bge_m3.rst.txt new file mode 100644 index 00000000..77609d50 --- /dev/null +++ b/_sources/bge/bge_m3.rst.txt @@ -0,0 +1,2 @@ +BGE-M3 +====== \ No newline at end of file diff --git a/_sources/bge/bge_reranker.rst.txt b/_sources/bge/bge_reranker.rst.txt new file mode 100644 index 00000000..47545c37 --- /dev/null +++ b/_sources/bge/bge_reranker.rst.txt @@ -0,0 +1,2 @@ +BGE-Reranker +============ \ No newline at end of file diff --git a/_sources/bge/bge_v1.rst.txt b/_sources/bge/bge_v1.rst.txt new file mode 100644 index 00000000..6c141ca4 --- /dev/null +++ b/_sources/bge/bge_v1.rst.txt @@ -0,0 +1,49 @@ +BGE-v1 +====== + +BGE +--- + +The first group of BGE models was released in Aug 2023. 
The :code:`bge-large-en` and :code:`bge-large-zh` ranked 1st on MTEB and +C-MTEB benchmarks at the time released. + ++-------------------------------------------------------------------+-----------+------------+--------------+-----------------------------------------------------------------------+ +| Model | Language | Parameters | Model Size | Description | ++===================================================================+===========+============+==============+=======================================================================+ +| `BAAI/bge-large-en `_ | English | 335M | 1.34 GB | Embedding Model which map text into vector | ++-------------------------------------------------------------------+-----------+------------+--------------+-----------------------------------------------------------------------+ +| `BAAI/bge-base-en `_ | English | 109M | 438 MB | a base-scale model but with similar ability to `BAAI/bge-large-en` | ++-------------------------------------------------------------------+-----------+------------+--------------+-----------------------------------------------------------------------+ +| `BAAI/bge-small-en `_ | English | 33.4M | 133 MB | a small-scale model but with competitive performance | ++-------------------------------------------------------------------+-----------+------------+--------------+-----------------------------------------------------------------------+ +| `BAAI/bge-large-zh `_ | Chinese | 326M | 1.3 GB | Embedding Model which map text into vector | ++-------------------------------------------------------------------+-----------+------------+--------------+-----------------------------------------------------------------------+ +| `BAAI/bge-base-zh `_ | Chinese | 102M | 409 MB | a base-scale model but with similar ability to `BAAI/bge-large-zh` | ++-------------------------------------------------------------------+-----------+------------+--------------+-----------------------------------------------------------------------+ +| `BAAI/bge-small-zh `_ | Chinese | 24M | 95.8 MB | a small-scale model but with competitive performance | ++-------------------------------------------------------------------+-----------+------------+--------------+-----------------------------------------------------------------------+ + +BGE-v1.5 +-------- + +Then to enhance its retrieval ability without instruction and alleviate the issue of the similarity distribution, :code:`bge-*-1.5` models +were released in Sep 2023. They are still the most popular embedding models that balanced well between embedding quality and model sizes. 
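+All v1.5 models are used the same way at inference time. Below is a minimal sketch that mirrors the
+Quick Start; the model name is just one example taken from the table below, and any of the v1.5
+checkpoints can be substituted:
+
+.. code:: python
+
+    from FlagEmbedding import FlagModel
+
+    # load a v1.5 embedder; use_fp16=True speeds up encoding with little loss in precision
+    model = FlagModel('BAAI/bge-base-en-v1.5',
+                      query_instruction_for_retrieval="Represent this sentence for searching relevant passages:",
+                      use_fp16=True)
+
+    # embed a query and a passage, then score the pair with the inner product
+    query_embedding = model.encode("what does bge stand for?")
+    passage_embedding = model.encode("BGE stands for BAAI General Embeddings.")
+    print(query_embedding @ passage_embedding)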
+ ++-----------------------------------------------------------------------------+-----------+------------+--------------+--------------+ +| Model | Language | Parameters | Model Size | Description | ++=============================================================================+===========+============+==============+==============+ +| `BAAI/bge-large-en-v1.5 `_ | English | 335M | 1.34 GB | version 1.5 | ++-----------------------------------------------------------------------------+-----------+------------+--------------+ with more + +| `BAAI/bge-base-en-v1.5 `_ | English | 109M | 438 MB | reasonable | ++-----------------------------------------------------------------------------+-----------+------------+--------------+ similarity + +| `BAAI/bge-small-en-v1.5 `_ | English | 33.4M | 133 MB | distribution | ++-----------------------------------------------------------------------------+-----------+------------+--------------+ + +| `BAAI/bge-large-zh-v1.5 `_ | Chinese | 326M | 1.3 GB | | ++-----------------------------------------------------------------------------+-----------+------------+--------------+ + +| `BAAI/bge-base-zh-v1.5 `_ | Chinese | 102M | 409 MB | | ++-----------------------------------------------------------------------------+-----------+------------+--------------+ + +| `BAAI/bge-small-zh-v1.5 `_ | Chinese | 24M | 95.8 MB | | ++-----------------------------------------------------------------------------+-----------+------------+--------------+--------------+ + + + diff --git a/_sources/bge/introduction.rst.txt b/_sources/bge/introduction.rst.txt new file mode 100644 index 00000000..86cb3ce7 --- /dev/null +++ b/_sources/bge/introduction.rst.txt @@ -0,0 +1,5 @@ +Introduction +============ + +**BGE** stands for **BAAI General Embeddings**, which is a series of embedding models released by BAAI. + diff --git a/_sources/index.rst.txt b/_sources/index.rst.txt new file mode 100644 index 00000000..6c4ba5e7 --- /dev/null +++ b/_sources/index.rst.txt @@ -0,0 +1,75 @@ +.. FlagEmbedding documentation master file, created by + sphinx-quickstart on Sat Oct 12 13:27:49 2024. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +BAAI General Embedding +====================== + +| +| + +.. image:: _static/img/bge_logo.jpg + :target: https://github.com/FlagOpen/FlagEmbedding + :width: 500 + :align: center + +| +| + +Welcome to BGE documentation! + +We aim for building one-stop retrieval toolkit for search and RAG. + +Besides the resources we provide here in this documentation, please visit our `GitHub repo `_ for more contents including: + +- Want to get familiar with BGE quickly? There are hands-on `examples `_ to run for embedder and reranker's inference, evaluation, and finetuning. +- Unfamiliar with some area, keywords or techniques of retrieval and RAG? We provide `tutorials `_ to teach you basic knowledge and coding tips. +- Interested in research topics that expanding from BGE and retrieval? Our `research `_ folder contains many exciting topics for you to explore. + +BGE is developed by Beijing Academy of Artificial Intelligence (BAAI). + +| + +.. image:: _static/img/BAAI_logo.png + :target: https://github.com/FlagOpen/FlagEmbedding + :width: 300 + :align: center + + +.. toctree:: + :maxdepth: 1 + :hidden: + + Home + + +.. toctree:: + :hidden: + :maxdepth: 1 + :caption: Introduction + + Introduction/installation + Introduction/quick_start + +.. 
toctree:: + :hidden: + :maxdepth: 5 + :caption: API + + API/abc + API/inference + API/evaluation + API/finetune + +.. toctree:: + :hidden: + :maxdepth: 2 + :caption: Tutorials + + tutorial/1_Embedding + tutorial/2_Metrics + tutorial/3_Indexing + tutorial/4_Evaluation + tutorial/5_Reranking + tutorial/6_RAG \ No newline at end of file diff --git a/_sources/tutorial/1_Embedding.rst.txt b/_sources/tutorial/1_Embedding.rst.txt new file mode 100644 index 00000000..f68ea30f --- /dev/null +++ b/_sources/tutorial/1_Embedding.rst.txt @@ -0,0 +1,12 @@ +1. Embedding +============ + +.. toctree:: + :hidden: + :maxdepth: 1 + :caption: Embedding + + 1_Embedding/1.1.1 + 1_Embedding/1.2.1 + 1_Embedding/1.2.2 + 1_Embedding/1.2.3 \ No newline at end of file diff --git a/_sources/tutorial/1_Embedding/1.1.1.ipynb.txt b/_sources/tutorial/1_Embedding/1.1.1.ipynb.txt new file mode 100644 index 00000000..a3de317e --- /dev/null +++ b/_sources/tutorial/1_Embedding/1.1.1.ipynb.txt @@ -0,0 +1,395 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Intro to Embedding" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For text retrieval, pattern matching is the most intuitive way. People would use certain characters, words, phrases, or sentence patterns. However, pattern matching between a query and a collection of text files is not only tedious for humans, but also extremely inefficient for computers. \n", + "\n", + "For images and acoustic waves, there are RGB pixels and digital signals. Similarly, in order to accomplish more sophisticated natural language tasks such as retrieval, classification, clustering, or semantic search, we need a way to represent text data. That's where text embedding comes onto the stage." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Background" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Traditional text embedding methods like one-hot encoding and bag-of-words (BoW) represent words and sentences as sparse vectors based on their statistical features, such as word appearance and frequency within a document. More advanced methods like TF-IDF and BM25 improve on these by considering a word's importance across an entire corpus, while n-gram techniques capture word order in small groups. However, these approaches suffer from the \"curse of dimensionality\" and fail to capture semantic similarity, such as between \"cat\" and \"kitty\", or differences, such as between \"play the watch\" and \"watch the play\"." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# example of bag-of-words\n", + "sentence1 = \"I love basketball\"\n", + "sentence2 = \"I have a basketball match\"\n", + "\n", + "words = ['I', 'love', 'basketball', 'have', 'a', 'match']\n", + "sen1_vec = [1, 1, 1, 0, 0, 0]\n", + "sen2_vec = [1, 0, 1, 1, 1, 1]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To overcome these limitations, dense word embeddings were developed, mapping words to vectors in a low-dimensional space that captures semantic and relational information. Early models like Word2Vec demonstrated the power of dense embeddings using neural networks. Subsequent advancements with neural network architectures like RNNs, LSTMs, and Transformers have enabled more sophisticated models such as BERT, RoBERTa, and GPT to excel in capturing complex word relationships and contexts. 
**BAAI General Embedding (BGE)** provides a series of open-source models that could satisfy all kinds of demands." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Get Embedding" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The first step of modern text retrieval is embedding the text. So let's take a look at how to use the embedding models." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install the packages:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "%pip install -U FlagEmbedding sentence_transformers openai cohere" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We'll use the following three sentences as the inputs:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "sentences = [\n", + " \"That is a happy dog\",\n", + " \"That is a very happy person\",\n", + " \"Today is a sunny day\",\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Open-source Models" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A huge portion of embedding models come from the open-source community. The advantages of open-source models include:\n", + "- Free, no extra cost. But make sure to check the license and your use case before using.\n", + "- No frequency limit; you can accelerate a lot if you have enough GPUs to parallelize.\n", + "- Transparent and might be reproducible.\n", + "\n", + "Let's take a look at two representatives:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### BGE" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "BGE is a series of embedding models and rerankers published by BAAI. Several of them reached SOTA at the time they were released." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Embeddings:\n", + "(3, 768)\n", + "Similarity scores:\n", + "[[1. 0.7900386 0.57525384]\n", + " [0.7900386 0.9999998 0.59190154]\n", + " [0.57525384 0.59190154 0.99999994]]\n" + ] + } + ], + "source": [ + "from FlagEmbedding import FlagModel\n", + "\n", + "# Load BGE model\n", + "model = FlagModel('BAAI/bge-base-en-v1.5')\n", + "\n", + "# encode the sentences\n", + "embeddings = model.encode(sentences)\n", + "print(f\"Embeddings:\\n{embeddings.shape}\")\n", + "\n", + "scores = embeddings @ embeddings.T\n", + "print(f\"Similarity scores:\\n{scores}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Sentence Transformers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Sentence Transformers is a library for sentence embeddings with a huge number of embedding models and datasets for related tasks."
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Embeddings:\n", + "(3, 384)\n", + "Similarity scores:\n", + "[[0.99999976 0.6210502 0.24906276]\n", + " [0.6210502 0.9999997 0.21061528]\n", + " [0.24906276 0.21061528 0.9999999 ]]\n" + ] + } + ], + "source": [ + "from sentence_transformers import SentenceTransformer\n", + "\n", + "model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n", + "\n", + "embeddings = model.encode(sentences, normalize_embeddings=True)\n", + "print(f\"Embeddings:\\n{embeddings.shape}\")\n", + "\n", + "scores = embeddings @ embeddings.T\n", + "print(f\"Similarity scores:\\n{scores}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Commercial Models" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There are also plenty of choices of commercial models. They have the advantages of:\n", + "- Efficient memory usage and fast inference with no need for GPUs.\n", + "- Systematic support: commercial models have closer connections with the providers' other products.\n", + "- Better training data: commercial models might be trained on larger, higher-quality datasets than some open-source models." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### OpenAI" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Along with the GPT series, OpenAI has its own embedding models. Make sure to fill in your own API key in the field `\"YOUR_API_KEY\"`." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import numpy as np\n", + "\n", + "os.environ[\"OPENAI_API_KEY\"] = \"YOUR_API_KEY\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then run the following cells to get the embeddings. Check their official [documentation](https://platform.openai.com/docs/guides/embeddings) for more details." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "from openai import OpenAI\n", + "\n", + "client = OpenAI()\n", + "\n", + "response = client.embeddings.create(input = sentences, model=\"text-embedding-3-small\")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Embeddings:\n", + "(3, 1536)\n", + "Similarity scores:\n", + "[[1.00000004 0.697673 0.34739798]\n", + " [0.697673 1.00000005 0.31969923]\n", + " [0.34739798 0.31969923 0.99999998]]\n" + ] + } + ], + "source": [ + "embeddings = np.asarray([response.data[i].embedding for i in range(len(sentences))])\n", + "print(f\"Embeddings:\\n{embeddings.shape}\")\n", + "\n", + "scores = embeddings @ embeddings.T\n", + "print(f\"Similarity scores:\\n{scores}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Voyage AI" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Voyage AI provides embedding models and rerankers for different purposes and in various fields. Their API can be used for free within certain rate and token limits."
+ ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "os.environ[\"VOYAGE_API_KEY\"] = \"YOUR_API_KEY\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check their official [documentation](https://docs.voyageai.com/docs/api-key-and-installation) for more details." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "import voyageai\n", + "\n", + "vo = voyageai.Client()\n", + "\n", + "result = vo.embed(sentences, model=\"voyage-large-2-instruct\")" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Embeddings:\n", + "(3, 1024)\n", + "Similarity scores:\n", + "[[0.99999997 0.87282517 0.63276503]\n", + " [0.87282517 0.99999998 0.64720015]\n", + " [0.63276503 0.64720015 0.99999999]]\n" + ] + } + ], + "source": [ + "embeddings = np.asarray(result.embeddings)\n", + "print(f\"Embeddings:\\n{embeddings.shape}\")\n", + "\n", + "scores = embeddings @ embeddings.T\n", + "print(f\"Similarity scores:\\n{scores}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/_sources/tutorial/1_Embedding/1.2.1.ipynb.txt b/_sources/tutorial/1_Embedding/1.2.1.ipynb.txt new file mode 100644 index 00000000..39d5cf07 --- /dev/null +++ b/_sources/tutorial/1_Embedding/1.2.1.ipynb.txt @@ -0,0 +1,486 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "06cff9e4", + "metadata": {}, + "source": [ + "# BGE Series" + ] + }, + { + "cell_type": "markdown", + "id": "880e229d", + "metadata": {}, + "source": [ + "In this part, we will walk through the BGE series and introduce how to use the BGE embedding models." + ] + }, + { + "cell_type": "markdown", + "id": "2516fd49", + "metadata": {}, + "source": [ + "## 1. BAAI General Embedding" + ] + }, + { + "cell_type": "markdown", + "id": "2113ee71", + "metadata": {}, + "source": [ + "BGE stands for BAAI General Embedding. It is a series of embedding models developed and published by the Beijing Academy of Artificial Intelligence (BAAI)." + ] + }, + { + "cell_type": "markdown", + "id": "16515b99", + "metadata": {}, + "source": [ + "Full support for the APIs and related usage of BGE is maintained in [FlagEmbedding](https://github.com/FlagOpen/FlagEmbedding) on GitHub.\n", + "\n", + "Run the following cell to install FlagEmbedding in your environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88095fd0", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "%pip install -U FlagEmbedding" + ] + }, + { + "cell_type": "markdown", + "id": "bc6e30a0", + "metadata": {}, + "source": [ + "The collection of BGE models can be found in the [Hugging Face collection](https://huggingface.co/collections/BAAI/bge-66797a74476eb1f085c7446d)." + ] + }, + { + "cell_type": "markdown", + "id": "67a16ccf", + "metadata": {}, + "source": [ + "## 2. 
BGE Series Models" + ] + }, + { + "cell_type": "markdown", + "id": "2e10034a", + "metadata": {}, + "source": [ + "### 2.1 BGE" + ] + }, + { + "cell_type": "markdown", + "id": "0cdc6702", + "metadata": {}, + "source": [ + "The very first version of BGE has 6 models, with 'large', 'base', and 'small' for English and Chinese. " + ] + }, + { + "cell_type": "markdown", + "id": "04b75f72", + "metadata": {}, + "source": [ + "| Model | Language | Parameters | Model Size | Description | Base Model |\n", + "|:-------|:--------:|:--------------:|:--------------:|:-----------------:|:----------------:|\n", + "| [BAAI/bge-large-en](https://huggingface.co/BAAI/bge-large-en) | English | 335M | 1.34 GB | Embedding model which maps text into vectors | BERT |\n", + "| [BAAI/bge-base-en](https://huggingface.co/BAAI/bge-base-en) | English | 109M | 438 MB | a base-scale model but with similar ability to `bge-large-en` | BERT |\n", + "| [BAAI/bge-small-en](https://huggingface.co/BAAI/bge-small-en) | English | 33.4M | 133 MB | a small-scale model but with competitive performance | BERT |\n", + "| [BAAI/bge-large-zh](https://huggingface.co/BAAI/bge-large-zh) | Chinese | 326M | 1.3 GB | Embedding model which maps text into vectors | BERT |\n", + "| [BAAI/bge-base-zh](https://huggingface.co/BAAI/bge-base-zh) | Chinese | 102M | 409 MB | a base-scale model but with similar ability to `bge-large-zh` | BERT |\n", + "| [BAAI/bge-small-zh](https://huggingface.co/BAAI/bge-small-zh) | Chinese | 24M | 95.8 MB | a small-scale model but with competitive performance | BERT |" + ] + }, + { + "cell_type": "markdown", + "id": "c9c45d17", + "metadata": {}, + "source": [ + "For inference, import FlagModel from FlagEmbedding and initialize the model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "89e07751", + "metadata": {}, + "outputs": [], + "source": [ + "from FlagEmbedding import FlagModel\n", + "\n", + "# Load BGE model\n", + "model = FlagModel('BAAI/bge-base-en',\n", + " query_instruction_for_retrieval=\"Represent this sentence for searching relevant passages:\",\n", + " use_fp16=True)\n", + "\n", + "queries = [\"query 1\", \"query 2\"]\n", + "corpus = [\"passage 1\", \"passage 2\"]\n", + "\n", + "# encode the queries and corpus\n", + "q_embeddings = model.encode(queries)\n", + "p_embeddings = model.encode(corpus)\n", + "\n", + "# compute the similarity scores\n", + "scores = q_embeddings @ p_embeddings.T\n", + "print(scores)" + ] + }, + { + "cell_type": "markdown", + "id": "6c8e69ed", + "metadata": {}, + "source": [ + "To use `FlagModel`:\n", + "```\n", + "FlagModel.encode(sentences, batch_size=256, max_length=512, convert_to_numpy=True)\n", + "```\n", + "The *encode()* function directly encodes the input sentences into embedding vectors.\n", + "```\n", + "FlagModel.encode_queries(sentences, batch_size=256, max_length=512, convert_to_numpy=True)\n", + "```\n", + "The *encode_queries()* function concatenates the `query_instruction_for_retrieval` with each input query, and then calls `encode()`." + ] + }, + { + "cell_type": "markdown", + "id": "2c86a5a3", + "metadata": {}, + "source": [ + "### 2.2 BGE v1.5" + ] + }, + { + "cell_type": "markdown", + "id": "454ff7aa", + "metadata": {}, + "source": [ + "BGE 1.5 alleviates the issue of the similarity distribution and enhances retrieval ability without instruction, as sketched in the next cells."
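+ ] + }, + { + "cell_type": "markdown", + "id": "v15-no-instruction-note", + "metadata": {}, + "source": [ + "The following cell is an illustrative sketch added for this point (not part of the original notebook): since v1.5 enhances retrieval ability without instruction, you can encode a short query with the same `FlagModel` API without setting `query_instruction_for_retrieval`. The query and passage strings below are made-up examples." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "v15-no-instruction-demo", + "metadata": {}, + "outputs": [], + "source": [ + "from FlagEmbedding import FlagModel\n", + "\n", + "# hypothetical quick check: no query instruction is configured here\n", + "model_no_inst = FlagModel('BAAI/bge-base-en-v1.5', use_fp16=True)\n", + "\n", + "queries = [\"how to bake a cake\"]\n", + "passages = [\"Preheat the oven, mix flour, sugar and eggs, then bake for 30 minutes.\"]\n", + "\n", + "q_emb = model_no_inst.encode(queries)\n", + "p_emb = model_no_inst.encode(passages)\n", + "\n", + "# embeddings are normalized, so the inner product is the cosine similarity\n", + "print(q_emb @ p_emb.T)"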
+ ] + }, + { + "cell_type": "markdown", + "id": "30b1f897", + "metadata": {}, + "source": [ + "| Model | Language | Parameters | Model Size | Description | Base Model |\n", + "|:-------|:--------:|:--------------:|:--------------:|:-----------------:|:----------------:|\n", + "| [BAAI/bge-large-en-v1.5](https://huggingface.co/BAAI/bge-large-en-v1.5) | English | 335M | 1.34 GB | version 1.5 with more reasonable similarity distribution | BERT |\n", + "| [BAAI/bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5) | English | 109M | 438 MB | version 1.5 with more reasonable similarity distribution | BERT |\n", + "| [BAAI/bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5) | English | 33.4M | 133 MB | version 1.5 with more reasonable similarity distribution | BERT |\n", + "| [BAAI/bge-large-zh-v1.5](https://huggingface.co/BAAI/bge-large-zh-v1.5) | Chinese | 326M | 1.3 GB | version 1.5 with more reasonable similarity distribution | BERT |\n", + "| [BAAI/bge-base-zh-v1.5](https://huggingface.co/BAAI/bge-base-zh-v1.5) | Chinese | 102M | 409 MB | version 1.5 with more reasonable similarity distribution | BERT |\n", + "| [BAAI/bge-small-zh-v1.5](https://huggingface.co/BAAI/bge-small-zh-v1.5) | Chinese | 24M | 95.8 MB | version 1.5 with more reasonable similarity distribution | BERT |" + ] + }, + { + "cell_type": "markdown", + "id": "ed00c504", + "metadata": {}, + "source": [ + "BGE 1.5 models share the same `FlagModel` API as the BGE models." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "9b17afcc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[0.736794 0.5989914]\n", + " [0.5684842 0.7461165]]\n" + ] + } + ], + "source": [ + "model = FlagModel('BAAI/bge-base-en-v1.5',\n", + " query_instruction_for_retrieval=\"Represent this sentence for searching relevant passages:\",\n", + " use_fp16=True)\n", + "\n", + "queries = [\"query 1\", \"query 2\"]\n", + "corpus = [\"passage 1\", \"passage 2\"]\n", + "\n", + "# encode the queries and corpus\n", + "q_embeddings = model.encode(queries)\n", + "p_embeddings = model.encode(corpus)\n", + "\n", + "# compute the similarity scores\n", + "scores = q_embeddings @ p_embeddings.T\n", + "print(scores)" + ] + }, + { + "cell_type": "markdown", + "id": "38c3ce1c", + "metadata": {}, + "source": [ + "### 2.3 LLM-Embedder" + ] + }, + { + "cell_type": "markdown", + "id": "1bc3fee0", + "metadata": {}, + "source": [ + "LLM-Embedder is a unified embedding model supporting diverse retrieval augmentation needs for LLMs. 
It is fine-tuned over 6 tasks:\n", + "- Question Answering (qa)\n", + "- Conversational Search (convsearch)\n", + "- Long Conversation (chat)\n", + "- Long-Range Language Modeling (lrlm)\n", + "- In-Context Learning (icl)\n", + "- Tool Learning (tool)" + ] + }, + { + "cell_type": "markdown", + "id": "13b926e9", + "metadata": {}, + "source": [ + "| Model | Language | Parameters | Model Size | Description | Base Model |\n", + "|:-------|:--------:|:--------------:|:--------------:|:-----------------:|:----------------:|\n", + "| [BAAI/llm-embedder](https://huggingface.co/BAAI/llm-embedder) | English | 109M | 438 MB | a unified embedding model to support diverse retrieval augmentation needs for LLMs | BERT |" + ] + }, + { + "cell_type": "markdown", + "id": "a7b3f109", + "metadata": {}, + "source": [ + "To use `LLMEmbedder`:\n", + "```python\n", + "LLMEmbedder.encode_queries(\n", + " queries, \n", + " batch_size=256, \n", + " max_length=256, \n", + " task='qa'\n", + ")\n", + "```\n", + "The *encode_queries()* function calls the internal *_encode()* function (similar to *encode()* in `FlagModel`) and adds the corresponding query instruction of the given *task* in front of each of the input *queries*.\n", + "```python\n", + "LLMEmbedder.encode_keys(\n", + " keys, \n", + " batch_size=256, \n", + " max_length=512, \n", + " task='qa'\n", + ")\n", + "```\n", + "Similarly, *encode_keys()* also calls *_encode()* and automatically adds instructions according to the given task." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "5f077420", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[0.89705944 0.85341793]\n", + " [0.8462474 0.90914035]]\n" + ] + } + ], + "source": [ + "from FlagEmbedding import LLMEmbedder\n", + "\n", + "# load the LLMEmbedder model\n", + "model = LLMEmbedder('BAAI/llm-embedder', use_fp16=False)\n", + "\n", + "# Define queries and keys\n", + "queries = [\"test query 1\", \"test query 2\"]\n", + "keys = [\"test key 1\", \"test key 2\"]\n", + "\n", + "# Encode for a specific task (qa, icl, chat, lrlm, tool, convsearch)\n", + "task = \"qa\"\n", + "query_embeddings = model.encode_queries(queries, task=task)\n", + "key_embeddings = model.encode_keys(keys, task=task)\n", + "\n", + "# compute the similarity scores\n", + "similarity = query_embeddings @ key_embeddings.T\n", + "print(similarity)" + ] + }, + { + "cell_type": "markdown", + "id": "dcf2a82b", + "metadata": {}, + "source": [ + "### 2.4 BGE M3" + ] + }, + { + "cell_type": "markdown", + "id": "cc5b5a5e", + "metadata": {}, + "source": [ + "BGE-M3 is the new version of BGE models that is distinguished by its versatility in:\n", + "- Multi-Functionality: Simultaneously performs the three common retrieval functionalities of embedding models: dense retrieval, multi-vector retrieval, and sparse retrieval.\n", + "- Multi-Linguality: Supports more than 100 working languages.\n", + "- Multi-Granularity: Can process inputs with different granularities, spanning from short sentences to long documents of up to 8192 tokens.\n", + "\n", + "For more details, feel free to check out the [paper](https://arxiv.org/pdf/2402.03216)."
+ ] + }, + { + "cell_type": "markdown", + "id": "41348e03", + "metadata": {}, + "source": [ + "| Model | Language | Parameters | Model Size | Description | Base Model |\n", + "|:-------|:--------:|:--------------:|:--------------:|:-----------------:|:----------------:|\n", + "| [BAAI/bge-m3](https://huggingface.co/BAAI/bge-m3) | Multilingual | 568M | 2.27 GB | Multi-Functionality (dense retrieval, sparse retrieval, multi-vector (ColBERT)), Multi-Linguality, and Multi-Granularity (8192 tokens) | XLM-RoBERTa |" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "d4647625", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 228780.22it/s]\n" + ] + } + ], + "source": [ + "from FlagEmbedding import BGEM3FlagModel\n", + "\n", + "model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)\n", + "\n", + "sentences = [\"What is BGE M3?\", \"Defination of BM25\"]" + ] + }, + { + "cell_type": "markdown", + "id": "1f89f1a9", + "metadata": {}, + "source": [ + "```python\n", + "BGEM3FlagModel.encode(\n", + " sentences, \n", + " batch_size=12, \n", + " max_length=8192, \n", + " return_dense=True, \n", + " return_sparse=False, \n", + " return_colbert_vecs=False\n", + ")\n", + "```\n", + "It returns a dictionary like:\n", + "```python\n", + "{\n", + " 'dense_vecs': 'array of dense embeddings of inputs if return_dense=True, otherwise None,'\n", + " 'lexical_weights': 'array of dictionaries whose keys are token ids and whose values are the corresponding weights if return_sparse=True, otherwise None,'\n", + " 'colbert_vecs': 'array of multi-vector embeddings of inputs if return_colbert_vecs=True, otherwise None,'\n", + "}\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "f0b11cf0", + "metadata": {}, + "outputs": [], + "source": [ + "# If you don't need such a long length of 8192 input tokens, you can set max_length to a smaller value to speed up encoding.\n", + "embeddings = model.encode(\n", + " sentences, \n", + " max_length=10,\n", + " return_dense=True, \n", + " return_sparse=True, \n", + " return_colbert_vecs=True\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "72cba126", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "dense embedding:\n", + "[[-0.03411707 -0.04707828 -0.00089447 ... 0.04828531 0.00755427\n", + " -0.02961654]\n", + " [-0.01041734 -0.04479263 -0.02429199 ... 
-0.00819298 0.01503995\n", + " 0.01113793]]\n", + "sparse embedding:\n", + "[defaultdict(, {'4865': 0.08362077, '83': 0.081469566, '335': 0.12964639, '11679': 0.25186998, '276': 0.17001738, '363': 0.26957875, '32': 0.040755156}), defaultdict(, {'262': 0.050144322, '5983': 0.13689369, '2320': 0.045134712, '111': 0.06342201, '90017': 0.25167602, '2588': 0.33353207})]\n", + "multi-vector:\n", + "[array([[-8.6726490e-03, -4.8921868e-02, -3.0449261e-03, ...,\n", + " -2.2082448e-02, 5.7268854e-02, 1.2811369e-02],\n", + " [-8.8765034e-03, -4.6860173e-02, -9.5845405e-03, ...,\n", + " -3.1404708e-02, 5.3911421e-02, 6.8714428e-03],\n", + " [ 1.8445771e-02, -4.2359587e-02, 8.6754939e-04, ...,\n", + " -1.9803897e-02, 3.8384371e-02, 7.6852231e-03],\n", + " ...,\n", + " [-2.5543230e-02, -1.6561864e-02, -4.2125367e-02, ...,\n", + " -4.5030322e-02, 4.4091221e-02, -1.0043185e-02],\n", + " [ 4.9905590e-05, -5.5475257e-02, 8.4884483e-03, ...,\n", + " -2.2911752e-02, 6.0379632e-02, 9.3577225e-03],\n", + " [ 2.5895271e-03, -2.9331330e-02, -1.8961012e-02, ...,\n", + " -8.0389353e-03, 3.2842189e-02, 4.3894034e-02]], dtype=float32), array([[ 0.01715658, 0.03835309, -0.02311821, ..., 0.00146474,\n", + " 0.02993429, -0.05985384],\n", + " [ 0.00996143, 0.039217 , -0.03855301, ..., 0.00599566,\n", + " 0.02722942, -0.06509776],\n", + " [ 0.01777726, 0.03919311, -0.01709837, ..., 0.00805702,\n", + " 0.03988946, -0.05069073],\n", + " ...,\n", + " [ 0.05474931, 0.0075684 , 0.00329455, ..., -0.01651684,\n", + " 0.02397249, 0.00368039],\n", + " [ 0.0093503 , 0.05022853, -0.02385841, ..., 0.02575599,\n", + " 0.00786822, -0.03260205],\n", + " [ 0.01805054, 0.01337725, 0.00016697, ..., 0.01843987,\n", + " 0.01374448, 0.00310114]], dtype=float32)]\n" + ] + } + ], + "source": [ + "print(f\"dense embedding:\\n{embeddings['dense_vecs']}\")\n", + "print(f\"sparse embedding:\\n{embeddings['lexical_weights']}\")\n", + "print(f\"multi-vector:\\n{embeddings['colbert_vecs']}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/_sources/tutorial/1_Embedding/1.2.2.ipynb.txt b/_sources/tutorial/1_Embedding/1.2.2.ipynb.txt new file mode 100644 index 00000000..dbe94b89 --- /dev/null +++ b/_sources/tutorial/1_Embedding/1.2.2.ipynb.txt @@ -0,0 +1,419 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# BGE Explanation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this section, we will go through BGE and BGE-v1.5's structure and how they generate embeddings." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 0. Installation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install the required packages in your environment." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "%pip install -U transformers FlagEmbedding" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. 
Encode sentences" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To know how exactly a sentence is encoded, let's first load the tokenizer and model from HF transformers instead of FlagEmbedding" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import AutoTokenizer, AutoModel\n", + "import torch\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(\"BAAI/bge-base-en-v1.5\")\n", + "model = AutoModel.from_pretrained(\"BAAI/bge-base-en-v1.5\")\n", + "\n", + "sentences = [\"embedding\", \"I love machine learning and nlp\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Run the following cell to check the model of bge-base-en-v1.5. It has the exactly same structure of BERT-base, 12 encoder layers and hidden dimension of 768.\n", + "\n", + "Note that the corresponding models of BGE and BGE-v1.5 have same structures. For example, bge-base-en and bge-base-en-v1.5 have the same structure." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "BertModel(\n", + " (embeddings): BertEmbeddings(\n", + " (word_embeddings): Embedding(30522, 768, padding_idx=0)\n", + " (position_embeddings): Embedding(512, 768)\n", + " (token_type_embeddings): Embedding(2, 768)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (encoder): BertEncoder(\n", + " (layer): ModuleList(\n", + " (0-11): 12 x BertLayer(\n", + " (attention): BertAttention(\n", + " (self): BertSelfAttention(\n", + " (query): Linear(in_features=768, out_features=768, bias=True)\n", + " (key): Linear(in_features=768, out_features=768, bias=True)\n", + " (value): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (output): BertSelfOutput(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (intermediate): BertIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", + " (intermediate_act_fn): GELUActivation()\n", + " )\n", + " (output): BertOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " )\n", + " )\n", + " (pooler): BertPooler(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (activation): Tanh()\n", + " )\n", + ")" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.eval()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, let's tokenize the sentences." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'input_ids': tensor([[ 101, 7861, 8270, 4667, 102, 0, 0, 0, 0],\n", + " [ 101, 1045, 2293, 3698, 4083, 1998, 17953, 2361, 102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0],\n", + " [0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 0, 0, 0, 0],\n", + " [1, 1, 1, 1, 1, 1, 1, 1, 1]])}" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inputs = tokenizer(\n", + " sentences, \n", + " padding=True, \n", + " truncation=True, \n", + " return_tensors='pt', \n", + " max_length=512\n", + ")\n", + "inputs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "From the results, we can see that each sentence begins with token 101 and ends with 102, they are the `[CLS]` and `[SEP]` special token used in BERT." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([2, 9, 768])" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "last_hidden_state = model(**inputs, return_dict=True).last_hidden_state\n", + "last_hidden_state.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here we implement the pooling function, with two choices of using `[CLS]`'s last hidden state, or the mean pooling of the whole last hidden state." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "def pooling(last_hidden_state: torch.Tensor, pooling_method='cls', attention_mask: torch.Tensor = None):\n", + " if pooling_method == 'cls':\n", + " return last_hidden_state[:, 0]\n", + " elif pooling_method == 'mean':\n", + " s = torch.sum(last_hidden_state * attention_mask.unsqueeze(-1).float(), dim=1)\n", + " d = attention_mask.sum(dim=1, keepdim=True).float()\n", + " return s / d" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Different from more commonly used mean pooling, BGE is trained to use the last hidden state of `[CLS]` as the sentence embedding: \n", + "\n", + "`sentence_embeddings = model_output[0][:, 0]`\n", + "\n", + "If you use mean pooling, there will be a significant decrease in performance. Therefore, make sure to use the correct method to obtain sentence vectors." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([2, 768])" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "embeddings = pooling(\n", + " last_hidden_state, \n", + " pooling_method='cls', \n", + " attention_mask=inputs['attention_mask']\n", + ")\n", + "embeddings.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Assembling them together, we get the whole encoding function:" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "def _encode(sentences, max_length=512, convert_to_numpy=True):\n", + "\n", + " # handle the case of single sentence and a list of sentences\n", + " input_was_string = False\n", + " if isinstance(sentences, str):\n", + " sentences = [sentences]\n", + " input_was_string = True\n", + "\n", + " inputs = tokenizer(\n", + " sentences, \n", + " padding=True, \n", + " truncation=True, \n", + " return_tensors='pt', \n", + " max_length=max_length\n", + " )\n", + "\n", + " last_hidden_state = model(**inputs, return_dict=True).last_hidden_state\n", + " \n", + " embeddings = pooling(\n", + " last_hidden_state, \n", + " pooling_method='cls', \n", + " attention_mask=inputs['attention_mask']\n", + " )\n", + "\n", + " # normalize the embedding vectors\n", + " embeddings = torch.nn.functional.normalize(embeddings, dim=-1)\n", + "\n", + " # convert to numpy if needed\n", + " if convert_to_numpy:\n", + " embeddings = embeddings.detach().numpy()\n", + "\n", + " return embeddings[0] if input_was_string else embeddings" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Comparison" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's run the function we wrote to get the embeddings of the two sentences:" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Embeddings:\n", + "[[ 1.4549762e-02 -9.6840411e-03 3.7761475e-03 ... -8.5092714e-04\n", + " 2.8417887e-02 6.3214332e-02]\n", + " [ 3.3924331e-05 -3.2998275e-03 1.7206438e-02 ... 3.5703944e-03\n", + " 1.8721525e-02 -2.0371782e-02]]\n", + "Similarity scores:\n", + "[[0.9999997 0.6077381]\n", + " [0.6077381 0.9999999]]\n" + ] + } + ], + "source": [ + "embeddings = _encode(sentences)\n", + "print(f\"Embeddings:\\n{embeddings}\")\n", + "\n", + "scores = embeddings @ embeddings.T\n", + "print(f\"Similarity scores:\\n{scores}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then, run the API provided in FlagEmbedding:" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Embeddings:\n", + "[[ 1.4549762e-02 -9.6840411e-03 3.7761475e-03 ... -8.5092714e-04\n", + " 2.8417887e-02 6.3214332e-02]\n", + " [ 3.3924331e-05 -3.2998275e-03 1.7206438e-02 ... 
3.5703944e-03\n", + " 1.8721525e-02 -2.0371782e-02]]\n", + "Similarity scores:\n", + "[[0.9999997 0.6077381]\n", + " [0.6077381 0.9999999]]\n" + ] + } + ], + "source": [ + "from FlagEmbedding import FlagModel\n", + "\n", + "model = FlagModel('BAAI/bge-base-en-v1.5')\n", + "\n", + "embeddings = model.encode(sentences)\n", + "print(f\"Embeddings:\\n{embeddings}\")\n", + "\n", + "scores = embeddings @ embeddings.T\n", + "print(f\"Similarity scores:\\n{scores}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As we expect, the two encoding functions return exactly the same results. The full implementation in FlagEmbedding handles large datasets by batching and contains GPU support and parallelization. Feel free to check the [source code](https://github.com/FlagOpen/FlagEmbedding/blob/master/FlagEmbedding/flag_models.py#L370) for more details." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/_sources/tutorial/1_Embedding/1.2.3.ipynb.txt b/_sources/tutorial/1_Embedding/1.2.3.ipynb.txt new file mode 100644 index 00000000..b691f499 --- /dev/null +++ b/_sources/tutorial/1_Embedding/1.2.3.ipynb.txt @@ -0,0 +1,414 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# BGE-M3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 0. Installation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install the required packages in your environment." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "%pip install -U transformers FlagEmbedding accelerate" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. BGE-M3 structure" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import AutoTokenizer, AutoModel\n", + "import torch, os\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(\"BAAI/bge-m3\")\n", + "raw_model = AutoModel.from_pretrained(\"BAAI/bge-m3\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The base model of BGE-M3 is [XLM-RoBERTa-large](https://huggingface.co/FacebookAI/xlm-roberta-large), which is a multilingual version of RoBERTa." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "XLMRobertaModel(\n", + " (embeddings): XLMRobertaEmbeddings(\n", + " (word_embeddings): Embedding(250002, 1024, padding_idx=1)\n", + " (position_embeddings): Embedding(8194, 1024, padding_idx=1)\n", + " (token_type_embeddings): Embedding(1, 1024)\n", + " (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (encoder): XLMRobertaEncoder(\n", + " (layer): ModuleList(\n", + " (0-23): 24 x XLMRobertaLayer(\n", + " (attention): XLMRobertaAttention(\n", + " (self): XLMRobertaSelfAttention(\n", + " (query): Linear(in_features=1024, out_features=1024, bias=True)\n", + " (key): Linear(in_features=1024, out_features=1024, bias=True)\n", + " (value): Linear(in_features=1024, out_features=1024, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (output): XLMRobertaSelfOutput(\n", + " (dense): Linear(in_features=1024, out_features=1024, bias=True)\n", + " (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (intermediate): XLMRobertaIntermediate(\n", + " (dense): Linear(in_features=1024, out_features=4096, bias=True)\n", + " (intermediate_act_fn): GELUActivation()\n", + " )\n", + " (output): XLMRobertaOutput(\n", + " (dense): Linear(in_features=4096, out_features=1024, bias=True)\n", + " (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " )\n", + " )\n", + " (pooler): XLMRobertaPooler(\n", + " (dense): Linear(in_features=1024, out_features=1024, bias=True)\n", + " (activation): Tanh()\n", + " )\n", + ")" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "raw_model.eval()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Multi-Functionality" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 240131.91it/s]\n" + ] + } + ], + "source": [ + "from FlagEmbedding import BGEM3FlagModel\n", + "\n", + "model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)\n", + "\n", + "sentences_1 = [\"What is BGE M3?\", \"Defination of BM25\"]\n", + "sentences_2 = [\"BGE M3 is an embedding model supporting dense retrieval, lexical matching and multi-vector interaction.\", \n", + " \"BM25 is a bag-of-words retrieval function that ranks a set of documents based on the query terms appearing in each document\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.1 Dense Retrieval" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using BGE M3 for dense embedding has similar steps to BGE or BGE 1.5 models.\n", + "\n", + "Use the normalized hidden state of the special token [CLS] as the embedding:\n", + "\n", + "$$e_q = norm(H_q[0])$$\n", + "\n", + "Then compute the relevance score between the query and passage:\n", + "\n", + "$$s_{dense}=f_{sim}(e_p, e_q)$$\n", + "\n", + "where $e_p, e_q$ are the embedding vectors of passage and query, respectively.\n", + "\n", + "$f_{sim}$ is the score function (such as inner product and L2 distance) for comupting two embeddings' similarity." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[0.6259035 0.34749585]\n", + " [0.349868 0.6782462 ]]\n" + ] + } + ], + "source": [ + "# If you don't need such a long length of 8192 input tokens, you can set max_length to a smaller value to speed up encoding.\n", + "embeddings_1 = model.encode(sentences_1, max_length=10)['dense_vecs']\n", + "embeddings_2 = model.encode(sentences_2, max_length=100)['dense_vecs']\n", + "\n", + "# compute the similarity scores\n", + "s_dense = embeddings_1 @ embeddings_2.T\n", + "print(s_dense)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.2 Sparse Retrieval" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set `return_sparse` to true to make the model return sparse vectors. If a term token appears multiple times in the sentence, we only retain its max weight.\n", + "\n", + "BGE-M3 generates sparse embeddings by adding a linear layer and a ReLU activation function on top of the hidden states:\n", + "\n", + "$$w_{qt} = \\text{Relu}(W_{lex}^T H_q [i])$$\n", + "\n", + "where $W_{lex}$ represents the weights of the linear layer and $H_q[i]$ is the encoder's output of the $i^{th}$ token." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[{'What': 0.08362077, 'is': 0.081469566, 'B': 0.12964639, 'GE': 0.25186998, 'M': 0.17001738, '3': 0.26957875, '?': 0.040755156}, {'De': 0.050144322, 'fin': 0.13689369, 'ation': 0.045134712, 'of': 0.06342201, 'BM': 0.25167602, '25': 0.33353207}]\n" + ] + } + ], + "source": [ + "output_1 = model.encode(sentences_1, return_sparse=True)\n", + "output_2 = model.encode(sentences_2, return_sparse=True)\n", + "\n", + "# you can see the weight for each token:\n", + "print(model.convert_id_to_token(output_1['lexical_weights']))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Based on the token weights of the query and passage, the relevance score between them is computed by the joint importance of the co-existing terms within the query and passage:\n", + "\n", + "$$s_{lex} = \\sum_{t\\in q\\cap p}(w_{qt} * w_{pt})$$\n", + "\n", + "where $w_{qt}, w_{pt}$ are the importance weights of each co-existing term $t$ in the query and passage, respectively." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.19554448500275612\n", + "0.00880391988903284\n" + ] + } + ], + "source": [ + "# compute the scores via lexical matching\n", + "s_lex_10_20 = model.compute_lexical_matching_score(output_1['lexical_weights'][0], output_2['lexical_weights'][0])\n", + "s_lex_10_21 = model.compute_lexical_matching_score(output_1['lexical_weights'][0], output_2['lexical_weights'][1])\n", + "\n", + "print(s_lex_10_20)\n", + "print(s_lex_10_21)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.3 Multi-Vector" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The multi-vector method utilizes the entire output embeddings for the representation of query $E_q$ and passage $E_p$:\n", + "\n", + "$$E_q = norm(W_{mul}^T H_q)$$\n", + "$$E_p = norm(W_{mul}^T H_p)$$\n", + "\n", + "where $W_{mul}$ is the learnable projection matrix."
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(8, 1024)\n", + "(30, 1024)\n" + ] + } + ], + "source": [ + "output_1 = model.encode(sentences_1, return_dense=True, return_sparse=True, return_colbert_vecs=True)\n", + "output_2 = model.encode(sentences_2, return_dense=True, return_sparse=True, return_colbert_vecs=True)\n", + "\n", + "print(f\"({len(output_1['colbert_vecs'][0])}, {len(output_1['colbert_vecs'][0][0])})\")\n", + "print(f\"({len(output_2['colbert_vecs'][0])}, {len(output_2['colbert_vecs'][0][0])})\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Following ColBERT, we use late interaction to compute the fine-grained relevance score:\n", + "\n", + "$$s_{mul}=\\frac{1}{N}\\sum_{i=1}^N\\max_{j=1}^M E_q[i]\\cdot E_p^T[j]$$\n", + "\n", + "where $E_q, E_p$ are the entire output embeddings of the query and passage, respectively.\n", + "\n", + "This is the average, over each $v\\in E_q$, of its maximum similarity with the vectors in $E_p$." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.7796662449836731\n", + "0.4621177911758423\n" + ] + } + ], + "source": [ + "s_mul_10_20 = model.colbert_score(output_1['colbert_vecs'][0], output_2['colbert_vecs'][0]).item()\n", + "s_mul_10_21 = model.colbert_score(output_1['colbert_vecs'][0], output_2['colbert_vecs'][1]).item()\n", + "\n", + "print(s_mul_10_20)\n", + "print(s_mul_10_21)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.4 Hybrid Ranking" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "BGE-M3's multi-functionality gives the possibility of hybrid ranking to improve retrieval. Firstly, due to the heavy cost of the multi-vector method, we can retrieve the candidate results with either the dense or the sparse method. Then, to get the final result, we can rerank the candidates based on the integrated relevance score:\n", + "\n", + "$$s_{rank} = w_1\\cdot s_{dense}+w_2\\cdot s_{lex} + w_3\\cdot s_{mul}$$\n", + "\n", + "where the values chosen for $w_1, w_2$ and $w_3$ vary depending on the downstream scenario (here 1/3 is just for demonstration)." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.5337047390639782\n", + "0.27280585498859483\n" + ] + } + ], + "source": [ + "s_rank_10_20 = 1/3 * s_dense[0][0] + 1/3 * s_lex_10_20 + 1/3 * s_mul_10_20\n", + "s_rank_10_21 = 1/3 * s_dense[0][1] + 1/3 * s_lex_10_21 + 1/3 * s_mul_10_21\n", + "\n", + "print(s_rank_10_20)\n", + "print(s_rank_10_21)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/_sources/tutorial/2_Metrics.rst.txt b/_sources/tutorial/2_Metrics.rst.txt new file mode 100644 index 00000000..af97b3de --- /dev/null +++ b/_sources/tutorial/2_Metrics.rst.txt @@ -0,0 +1,10 @@ +2. Metrics +========== + +.. 
toctree:: + :hidden: + :maxdepth: 1 + :caption: Metrics + + 2_Metrics/2.1 + 2_Metrics/2.2 \ No newline at end of file diff --git a/_sources/tutorial/2_Metrics/2.1.ipynb.txt b/_sources/tutorial/2_Metrics/2.1.ipynb.txt new file mode 100644 index 00000000..da3ec56c --- /dev/null +++ b/_sources/tutorial/2_Metrics/2.1.ipynb.txt @@ -0,0 +1,798 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0d0f87e9-657d-46b9-a3f0-ebc1bf0656bd", + "metadata": {}, + "source": [ + "# Similarity" + ] + }, + { + "cell_type": "markdown", + "id": "00c817d5", + "metadata": {}, + "source": [ + "In this section, we will introduce several different ways to measure similarity." + ] + }, + { + "cell_type": "markdown", + "id": "dae49384-2450-425c-b050-c27d3c07d8e7", + "metadata": { + "tags": [] + }, + "source": [ + "## 1. Jaccard Similarity" + ] + }, + { + "cell_type": "markdown", + "id": "03266267-2d6d-4124-9702-f61e0510586c", + "metadata": {}, + "source": [ + "Before directly calculating the similarity between embedding vectors, let's first take a look at a primitive method for measuring how similar two sentences are: Jaccard similarity.\n", + "\n", + "**Definition:** For sets $A$ and $B$, the Jaccard index, or the Jaccard similarity coefficient between them, is the size of their intersection divided by the size of their union:\n", + "$$J(A,B)=\\frac{|A\\cap B|}{|A\\cup B|}$$\n", + "\n", + "The value of $J(A,B)$ falls in the range of $[0, 1]$." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "bed533e1-a17c-4595-bdff-7f4a29e4deb3", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-17T03:12:47.091346Z", + "iopub.status.busy": "2024-07-17T03:12:47.091019Z", + "iopub.status.idle": "2024-07-17T03:12:47.094401Z", + "shell.execute_reply": "2024-07-17T03:12:47.093967Z", + "shell.execute_reply.started": "2024-07-17T03:12:47.091327Z" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def jaccard_similarity(sentence1, sentence2):\n", + " set1 = set(sentence1.split(\" \"))\n", + " set2 = set(sentence2.split(\" \"))\n", + " intersection = set1.intersection(set2)\n", + " union = set1.union(set2)\n", + " return len(intersection)/len(union)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "ea766de8-572d-4eca-91f7-284a121e8edb", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "execution": { + "iopub.execute_input": "2024-07-17T03:14:06.133012Z", + "iopub.status.busy": "2024-07-17T03:14:06.132502Z", + "iopub.status.idle": "2024-07-17T03:14:06.135483Z", + "shell.execute_reply": "2024-07-17T03:14:06.135044Z", + "shell.execute_reply.started": "2024-07-17T03:14:06.132992Z" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "s1 = \"Hawaii is a wonderful place for holiday\"\n", + "s2 = \"Peter's favorite place to spend his holiday is Hawaii\"\n", + "s3 = \"Anna enjoys baking during her holiday\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "b359ff4e-21a1-489a-ad46-ba53e974dc48", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-17T03:13:34.646320Z", + "iopub.status.busy": "2024-07-17T03:13:34.645942Z", + "iopub.status.idle": "2024-07-17T03:13:34.649389Z", + "shell.execute_reply": "2024-07-17T03:13:34.648998Z", + "shell.execute_reply.started": "2024-07-17T03:13:34.646302Z" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.3333333333333333" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "jaccard_similarity(s1, s2)" + ] + }, + { + 
"cell_type": "code", + "execution_count": 4, + "id": "069868a9-d379-4d55-8a23-835a2972d079", + "metadata": { + "execution": { + "iopub.execute_input": "2024-07-17T03:14:13.727400Z", + "iopub.status.busy": "2024-07-17T03:14:13.726949Z", + "iopub.status.idle": "2024-07-17T03:14:13.730545Z", + "shell.execute_reply": "2024-07-17T03:14:13.730121Z", + "shell.execute_reply.started": "2024-07-17T03:14:13.727381Z" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.08333333333333333" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "jaccard_similarity(s1, s3)" + ] + }, + { + "cell_type": "markdown", + "id": "b0323128", + "metadata": {}, + "source": [ + "We can see that sentence 1 and 2 are sharing 'Hawaii', 'place', and 'holiday'. Thus getting a larger score of similarity (0.333) than that (0.083) of the sentence 1 and 3 that only share 'holiday'." + ] + }, + { + "cell_type": "markdown", + "id": "b509fa6c-87ac-4c59-b40e-fda95fd036d9", + "metadata": { + "tags": [] + }, + "source": [ + "## 2. Euclidean Distance" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "9da366b8-427f-4e8f-b3e6-b453050f0591", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "execution": { + "iopub.execute_input": "2024-07-17T02:30:37.643857Z", + "iopub.status.busy": "2024-07-17T02:30:37.643302Z", + "iopub.status.idle": "2024-07-17T02:30:37.647921Z", + "shell.execute_reply": "2024-07-17T02:30:37.647513Z", + "shell.execute_reply.started": "2024-07-17T02:30:37.643840Z" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([[5., 2., 2., 6.]]) tensor([[4., 6., 6., 4.]])\n" + ] + } + ], + "source": [ + "import torch\n", + "\n", + "A = torch.randint(1, 7, (1, 4), dtype=torch.float32)\n", + "B = torch.randint(1, 7, (1, 4), dtype=torch.float32)\n", + "print(A, B)" + ] + }, + { + "cell_type": "markdown", + "id": "6c068bb3-90ce-4266-8335-e3fb2ad3e996", + "metadata": {}, + "source": [ + "**Definition:** For vectors $A$ and $B$, the Euclidean distance or L2 distance between them is defined as:\n", + "$$d(A, B) = \\|A-B\\|_2 = \\sqrt{\\sum_{i=1}^n (A_i-B_i)^2}$$\n", + "\n", + "The value of $d(A, B)$ falls in the range of [0, $+\\infty$). Since this is the measurement of distance, the closer the value is to 0, the more similar the two vector is. And the larger the value is, the two vectors are more dissimilar." 
+ ] + }, + { + "cell_type": "markdown", + "id": "1d6c734d-cc03-4dd1-bb9e-3243006dcff4", + "metadata": {}, + "source": [ + "You can calculate the Euclidean distance step by step or directly call *torch.cdist()*:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "0773acf4-eb53-4058-85da-af82af20c469", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "execution": { + "iopub.execute_input": "2024-07-17T02:32:45.240684Z", + "iopub.status.busy": "2024-07-17T02:32:45.240216Z", + "iopub.status.idle": "2024-07-17T02:32:45.244248Z", + "shell.execute_reply": "2024-07-17T02:32:45.243843Z", + "shell.execute_reply.started": "2024-07-17T02:32:45.240665Z" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "6.082762718200684" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dist = torch.sqrt(torch.sum(torch.pow(torch.subtract(A, B), 2), dim=-1))\n", + "dist.item()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "1dd45446-f7d6-4aab-b078-1d34f0a949e4", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "execution": { + "iopub.execute_input": "2024-07-17T02:32:57.551560Z", + "iopub.status.busy": "2024-07-17T02:32:57.550896Z", + "iopub.status.idle": "2024-07-17T02:32:57.555031Z", + "shell.execute_reply": "2024-07-17T02:32:57.554638Z", + "shell.execute_reply.started": "2024-07-17T02:32:57.551536Z" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "6.082762718200684" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "torch.cdist(A, B, p=2).item()" + ] + }, + { + "cell_type": "markdown", + "id": "da4435c0-98da-4397-8a45-c954dd3ada56", + "metadata": {}, + "source": [ + "### (Maximum inner-product search)" + ] + }, + { + "cell_type": "markdown", + "id": "0e0fa5c2-e619-4a0f-a785-9cc209f1503b", + "metadata": { + "tags": [] + }, + "source": [ + "## 3. Cosine Similarity" + ] + }, + { + "cell_type": "markdown", + "id": "790e1ce3-1468-4819-a956-fc8eac690d89", + "metadata": {}, + "source": [ + "For vectors $A$ and $B$, their cosine similarity is defined as:\n", + "$$\\cos(\\theta)=\\frac{A\\cdot B}{\\|A\\|\\|B\\|}$$\n", + "\n", + "The value of $\\cos(\\theta)$ falls in the range of $[-1, 1]$. Different from Euclidean distance, a value close to -1 denotes not similar at all, and a value close to +1 means very similar."
+ ] + }, + { + "cell_type": "markdown", + "id": "d0a64b4b-5caf-4bee-be0f-2e26b1c7ed6e", + "metadata": { + "tags": [] + }, + "source": [ + "### 3.1 Naive Approach" + ] + }, + { + "cell_type": "markdown", + "id": "350cc48d-6e73-4e20-86dd-c05d1238ef60", + "metadata": {}, + "source": [ + "The naive approach is just expanding the expression:\n", + "$$\\frac{A\\cdot B}{\\|A\\|\\|B\\|}=\\frac{\\sum_{i=1}^{i=n}A_i B_i}{\\sqrt{\\sum_{i=1}^{n}A_i^2}\\cdot\\sqrt{\\sum_{i=1}^{n}B_i^2}}$$" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "20c7cff0-55a7-4222-9e5a-f5450171fb00", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "execution": { + "iopub.execute_input": "2024-07-17T02:24:35.239550Z", + "iopub.status.busy": "2024-07-17T02:24:35.239073Z", + "iopub.status.idle": "2024-07-17T02:24:35.242844Z", + "shell.execute_reply": "2024-07-17T02:24:35.242417Z", + "shell.execute_reply.started": "2024-07-17T02:24:35.239531Z" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Compute the dot product of A and B\n", + "dot_prod = sum(a*b for a, b in zip(A[0], B[0]))\n", + "\n", + "# Compute the magnitude of A and B\n", + "A_norm = torch.sqrt(sum(a*a for a in A[0]))\n", + "B_norm = torch.sqrt(sum(b*b for b in B[0]))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "f4dce1fb-9cff-4a0d-bc7f-a503be6a37ae", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "execution": { + "iopub.execute_input": "2024-07-17T02:24:36.533667Z", + "iopub.status.busy": "2024-07-17T02:24:36.533224Z", + "iopub.status.idle": "2024-07-17T02:24:36.536611Z", + "shell.execute_reply": "2024-07-17T02:24:36.536181Z", + "shell.execute_reply.started": "2024-07-17T02:24:36.533650Z" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.802726686000824\n" + ] + } + ], + "source": [ + "cos_1 = dot_prod / (A_norm * B_norm)\n", + "print(cos_1.item())" + ] + }, + { + "cell_type": "markdown", + "id": "4665f38f-c1f1-42dd-914d-d1d69c038e88", + "metadata": { + "tags": [] + }, + "source": [ + "### 3.2 PyTorch Implementation" + ] + }, + { + "cell_type": "markdown", + "id": "6154391d-1dea-4673-8502-b496cf87d4b0", + "metadata": {}, + "source": [ + "The naive approach has few issues:\n", + "- There are chances of losing precision in the numerator and the denominator\n", + "- Losing precision may cause the computed cosine similarity > 1.0\n", + "\n", + "Thus PyTorch uses the following way:\n", + "\n", + "$$\n", + "\\frac{A\\cdot B}{\\|A\\|\\|B\\|}=\\frac{A}{\\|A\\|}\\cdot\\frac{B}{\\|B\\|}\n", + "$$" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "b8be02be-3ac3-4e5f-a450-c53f05781ab4", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "execution": { + "iopub.execute_input": "2024-07-17T02:24:38.945105Z", + "iopub.status.busy": "2024-07-17T02:24:38.944403Z", + "iopub.status.idle": "2024-07-17T02:24:38.948117Z", + "shell.execute_reply": "2024-07-17T02:24:38.947698Z", + "shell.execute_reply.started": "2024-07-17T02:24:38.945085Z" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.802726686000824\n" + ] + } + ], + "source": [ + "res = torch.mm(A / A.norm(dim=1), B.T / B.norm(dim=1))\n", + "print(res.item())" + ] + }, + { + "cell_type": "markdown", + "id": "988acff0-e6b5-41db-92d6-8f175dd3e272", + "metadata": { + "tags": [] + }, + "source": [ + "### 3.3 PyTorch Function Call" + ] + }, + { + "cell_type": "markdown", + "id": 
"a61b4871-4039-4c6e-b5ee-f66a12156be9", + "metadata": {}, + "source": [ + "In practice, the most convinient way is directly use *cosine_similarity()* in torch.nn.functional:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "1ac4012e-b90a-4e60-97b8-e42636fde1c9", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "execution": { + "iopub.execute_input": "2024-07-17T02:24:55.804298Z", + "iopub.status.busy": "2024-07-17T02:24:55.803810Z", + "iopub.status.idle": "2024-07-17T02:24:55.807551Z", + "shell.execute_reply": "2024-07-17T02:24:55.807146Z", + "shell.execute_reply.started": "2024-07-17T02:24:55.804278Z" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.802726686000824" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import torch.nn.functional as F\n", + "\n", + "F.cosine_similarity(A, B).item()" + ] + }, + { + "cell_type": "markdown", + "id": "f4ab87cc", + "metadata": {}, + "source": [ + "## 4. Inner Product/Dot Product" + ] + }, + { + "cell_type": "markdown", + "id": "e3c025ab", + "metadata": {}, + "source": [ + "Coordinate definition:\n", + "$$A\\cdot B = \\sum_{i=1}^{i=n}A_i B_i$$\n", + "\n", + "Geometric definition:\n", + "$$A\\cdot B = \\|A\\|\\|B\\|\\cos(\\theta)$$" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "f0291d42", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "68.0" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dot_prod = A @ B.T\n", + "dot_prod.item()" + ] + }, + { + "cell_type": "markdown", + "id": "33099a2e", + "metadata": {}, + "source": [ + "### Relationship with Cosine similarity" + ] + }, + { + "cell_type": "markdown", + "id": "2790e183", + "metadata": {}, + "source": [ + "For computing the distance/similarity between two vectors, dot product and Cos similarity are closely related. Cos similarity only cares about the angle difference (because it is normalized by the product of two vectors' magnitude), while dot product takes both magnitude and angle into consideration. So the two metrics are preferred in different use cases.\n", + "\n", + "The BGE series models already normalized the output embedding vector to have the magnitude of 1. Thus using dot product and cos similarity will have the same result." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "e0f40534", + "metadata": {}, + "outputs": [], + "source": [ + "from FlagEmbedding import FlagModel\n", + "\n", + "model = FlagModel('BAAI/bge-large-en-v1.5',\n", + " query_instruction_for_retrieval=\"Represent this sentence for searching relevant passages:\",\n", + " use_fp16=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "78445a86", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1.0" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sentence = \"I am very interested in natural language processing\"\n", + "embedding = torch.tensor(model.encode(sentence))\n", + "torch.norm(embedding).item()" + ] + }, + { + "cell_type": "markdown", + "id": "9e1822ee", + "metadata": {}, + "source": [ + "## 5. Examples" + ] + }, + { + "cell_type": "markdown", + "id": "6c665e3a", + "metadata": {}, + "source": [ + "Now we've learned the mechanism of different types of similarity. Let's look at a real example." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "73012cbb", + "metadata": {}, + "outputs": [], + "source": [ + "sentence_1 = \"I will watch a show tonight\"\n", + "sentence_2 = \"I will show you my watch tonight\"\n", + "sentence_3 = \"I'm going to enjoy a performance this evening\"" + ] + }, + { + "cell_type": "markdown", + "id": "3cb79a47", + "metadata": {}, + "source": [ + "It's clear to us that in sentence 1, 'watch' is a verb and 'show' is a noun. \n", + "\n", + "But in sentence 2, 'show' is a verb and 'watch' is a noun, which gives the two sentences different meanings.\n", + "\n", + "Sentence 3, meanwhile, has a very similar meaning to sentence 1." + ] + }, + { + "cell_type": "markdown", + "id": "dc44dee9", + "metadata": {}, + "source": [ + "Now let's see how the different similarity metrics describe the relationship between the sentences." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "98bfcc6d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.625\n", + "0.07692307692307693\n" + ] + } + ], + "source": [ + "print(jaccard_similarity(sentence_1, sentence_2))\n", + "print(jaccard_similarity(sentence_1, sentence_3))" + ] + }, + { + "cell_type": "markdown", + "id": "b7e4cd15", + "metadata": {}, + "source": [ + "The results show that sentences 1 and 2 (0.625) appear far more similar than sentences 1 and 3 (0.077), which is the opposite of the conclusion we drew above." + ] + }, + { + "cell_type": "markdown", + "id": "cff73692", + "metadata": {}, + "source": [ + "Now let's first get the embeddings of these sentences." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "426c0b42", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([1, 1024])\n" + ] + } + ], + "source": [ + "embeddings = torch.from_numpy(model.encode([sentence_1, sentence_2, sentence_3]))\n", + "embedding_1 = embeddings[0].view(1, -1)\n", + "embedding_2 = embeddings[1].view(1, -1)\n", + "embedding_3 = embeddings[2].view(1, -1)\n", + "\n", + "print(embedding_1.shape)" + ] + }, + { + "cell_type": "markdown", + "id": "63fe1b31", + "metadata": {}, + "source": [ + "Then let's compute the Euclidean distance:" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "d9bb35cf", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.714613139629364\n", + "0.5931472182273865\n" + ] + } + ], + "source": [ + "euc_dist1_2 = torch.cdist(embedding_1, embedding_2, p=2).item()\n", + "euc_dist1_3 = torch.cdist(embedding_1, embedding_3, p=2).item()\n", + "print(euc_dist1_2)\n", + "print(euc_dist1_3)" + ] + }, + { + "cell_type": "markdown", + "id": "402e6ea8", + "metadata": {}, + "source": [ + "Then, let's see the cosine similarity:" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "29e70bbc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.7446640729904175\n", + "0.8240882158279419\n" + ] + } + ], + "source": [ + "cos_dist1_2 = F.cosine_similarity(embedding_1, embedding_2).item()\n", + "cos_dist1_3 = F.cosine_similarity(embedding_1, embedding_3).item()\n", + "print(cos_dist1_2)\n", + "print(cos_dist1_3)" + ] + }, + { + "cell_type": "markdown", + "id": "c353d8cc", + "metadata": {}, + "source": [ + "Using embeddings, we get the correct result: unlike Jaccard similarity, sentence 1 and sentence 3 are judged more similar than sentence 1 and sentence 2 
using either Euclidean distance or cos similarity as the metric." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/_sources/tutorial/2_Metrics/2.2.ipynb.txt b/_sources/tutorial/2_Metrics/2.2.ipynb.txt new file mode 100644 index 00000000..6fdc09f4 --- /dev/null +++ b/_sources/tutorial/2_Metrics/2.2.ipynb.txt @@ -0,0 +1,472 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Evaluation Metrics" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this tutorial, we'll cover a list of metrics that are widely used for evaluating embedding model's performance." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 0. Preparation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install numpy scikit-learn" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Suppose we have a corpus with document ids from 0 - 30. \n", + "- `ground_truth` contains the actual relevant document ids to each query.\n", + "- `results` contains the search results of each query by some retrieval system." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "ground_truth = [\n", + " [11, 1, 7, 17, 21],\n", + " [ 4, 16, 1],\n", + " [26, 10, 22, 8],\n", + "]\n", + "\n", + "results = [\n", + " [11, 1, 17, 7, 21, 8, 0, 28, 9, 20],\n", + " [16, 1, 6, 18, 3, 4, 25, 19, 8, 14],\n", + " [24, 10, 26, 2, 8, 28, 4, 23, 13, 21],\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 13, 14, 16, 17, 18, 19,\n", + " 21, 22, 24, 25, 26, 28])" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.intersect1d(ground_truth, results)" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0],\n", + " [1, 1, 1, 1, 1, 1, 1, 1, 1, 0],\n", + " [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.isin(ground_truth, results).astype(int)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And we are interested in the following cutoffs:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "cutoffs = [1, 5, 10]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this tutorial, we will use the above small example to show how different metrics evaluate the retrieval system's quality." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. 
Recall" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Recall represents the model's capability of correctly predicting positive instances from all the actual positive samples in the dataset.\n", + "\n", + "$$\\textbf{Recall}=\\frac{\\text{True Positives}}{\\text{True Positives}+\\text{False Negatives}}$$\n", + "\n", + "to write it in the form of information retrieval, which is the ratio of relevant documents retrieved to the total number of relevant documents in the corpus. In practice, we usually make the denominator to be the minimum between the current cutoff (usually 1, 5, 10, 100, etc) and the total number of relevant documents in the corpus:\n", + "\n", + "$$\\textbf{Recall}=\\frac{|\\text{\\{Relevant docs\\}}\\cap\\text{\\{Retrieved docs\\}}|}{\\text{min}(|\\text{\\{Retrieved docs\\}}|, |\\text{\\{Relevant docs\\}}|)}$$" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def calc_recall(preds, truths, cutoffs):\n", + " recalls = np.zeros(len(cutoffs))\n", + " for text, truth in zip(preds, truths):\n", + " for i, c in enumerate(cutoffs):\n", + " hits = np.intersect1d(truth, text[:c])\n", + " recalls[i] += len(hits) / max(min(c, len(truth)), 1)\n", + " recalls /= len(preds)\n", + " return recalls" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "recall@1: 0.6666666666666666\n", + "recall@5: 0.8055555555555555\n", + "recall@10: 0.9166666666666666\n" + ] + } + ], + "source": [ + "recalls = calc_recall(results, ground_truth, cutoffs)\n", + "for i, c in enumerate(cutoffs):\n", + " print(f\"recall@{c}: {recalls[i]}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. MRR" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Mean Reciprocal Rank ([MRR](https://en.wikipedia.org/wiki/Mean_reciprocal_rank)) is a widely used metric in information retrieval to evaluate the effectiveness of a system. It measures the rank position of the first relevant result in a list of search results.\n", + "\n", + "$$MRR=\\frac{1}{|Q|}\\sum_{i=1}^{|Q|}\\frac{1}{rank_i}$$\n", + "\n", + "where \n", + "- $|Q|$ is the total number of queries.\n", + "- $rank_i$ is the rank position of the first relevant document of the i-th query." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def calc_MRR(preds, truth, cutoffs):\n", + " mrr = [0 for _ in range(len(cutoffs))]\n", + " for pred, t in zip(preds, truth):\n", + " for i, c in enumerate(cutoffs):\n", + " for j, p in enumerate(pred):\n", + " if j < c and p in t:\n", + " mrr[i] += 1/(j+1)\n", + " break\n", + " mrr = [k/len(preds) for k in mrr]\n", + " return mrr" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MRR@1: 0.6666666666666666\n", + "MRR@5: 0.8333333333333334\n", + "MRR@10: 0.8333333333333334\n" + ] + } + ], + "source": [ + "mrr = calc_MRR(results, ground_truth, cutoffs)\n", + "for i, c in enumerate(cutoffs):\n", + " print(f\"MRR@{c}: {mrr[i]}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. 
nDCG" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Normalized Discounted Cumulative Gain (nDCG) measures the quality of a ranked list of search results by considering both the position of the relevant documents and their graded relevance scores. The calculation of nDCG involves two main steps:\n", + "\n", + "1. Discounted cumulative gain (DCG) measures the ranking quality in retrieval tasks.\n", + "\n", + "$$DCG_p=\\sum_{i=1}^p\\frac{2^{rel_i}-1}{\\log_2(i+1)}$$\n", + "\n", + "2. Normalized by ideal DCG to make it comparable across queries.\n", + "$$nDCG_p=\\frac{DCG_p}{IDCG_p}$$\n", + "where $IDCG$ is the maximum possible DCG for a given set of documents, assuming they are perfectly ranked in order of relevance." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "pred_hard_encodings = []\n", + "for pred, label in zip(results, ground_truth):\n", + " pred_hard_encoding = list(np.isin(pred, label).astype(int))\n", + " pred_hard_encodings.append(pred_hard_encoding)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "nDCG@1: 0.0\n", + "nDCG@5: 0.3298163165186628\n", + "nDCG@10: 0.5955665344840209\n" + ] + } + ], + "source": [ + "from sklearn.metrics import ndcg_score\n", + "\n", + "for i, c in enumerate(cutoffs):\n", + " nDCG = ndcg_score(pred_hard_encodings, results, k=c)\n", + " print(f\"nDCG@{c}: {nDCG}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Precision" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Precision \n", + "\n", + "$$\\textbf{Recall}=\\frac{\\text{True Positives}}{\\text{True Positives}+\\text{False Positive}}$$\n", + "\n", + "in information retrieval, it's the ratio of relevant documents retrieved to the totoal number of documents retrieved:\n", + "\n", + "$$\\textbf{Recall}=\\frac{|\\text{\\{Relevant docs\\}}\\cap\\text{\\{Retrieved docs\\}}|}{|\\text{\\{Retrieved docs\\}}|}$$" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "def calc_precision(preds, truths, cutoffs):\n", + " prec = np.zeros(len(cutoffs))\n", + " for text, truth in zip(preds, truths):\n", + " for i, c in enumerate(cutoffs):\n", + " hits = np.intersect1d(truth, text[:c])\n", + " prec[i] += len(hits) / c\n", + " prec /= len(preds)\n", + " return prec" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "precision@1: 0.6666666666666666\n", + "precision@5: 0.6666666666666666\n", + "precision@10: 0.3666666666666667\n" + ] + } + ], + "source": [ + "precisions = calc_precision(results, ground_truth, cutoffs)\n", + "for i, c in enumerate(cutoffs):\n", + " print(f\"precision@{c}: {precisions[i]}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. MAP" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Mean Average Precision (MAP) measures the effectiveness of a system at returning relevant documents across multiple queries. \n", + "\n", + "First, Average Precision (AP) evals how well relevant documents are ranked within the retrieved documents. 
It's computed by averaging the precision values for each position of relevant document in the ranking of all the retrieved documents:\n", + "\n", + "$$\\textbf{AP}=\\frac{\\sum_{k=1}^{M}\\text{Relevance}(k) \\times \\text{Precision}(k)}{|\\{\\text{Relevant Docs}\\}|}$$\n", + "\n", + "where \n", + "- $M$ is the total number of documents retrieved.\n", + "- $\\text{Relevance}(k)$ is a binary value, indicating whether document at position $k$ is relevant (=1) or not (=0).\n", + "- $\\text{Precision}(k)$ is the precision when considering only top $k$ retrieved items." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then calculate the average AP across multiple queries to get the MAP:\n", + "\n", + "$$\\textbf{MAP}=\\frac{1}{N}\\sum_{i=1}^{N}\\text{AP}_i$$\n", + "\n", + "where\n", + "- $N$ is the total number of queries.\n", + "- $\\text{AP}_i$ is the average precision of the $i^{th}$ query." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "def calc_AP(encoding):\n", + " rel = 0\n", + " precs = 0.0\n", + " for k, hit in enumerate(encoding, start=1):\n", + " if hit == 1:\n", + " rel += 1\n", + " precs += rel/k\n", + "\n", + " return 0 if rel == 0 else precs/rel" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "def calc_MAP(encodings, cutoffs):\n", + " res = []\n", + " for c in cutoffs:\n", + " ap_sum = 0.0\n", + " for encoding in encodings:\n", + " ap_sum += calc_AP(encoding[:c])\n", + " res.append(ap_sum/len(encodings))\n", + " \n", + " return res" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MAP@1: 0.6666666666666666\n", + "MAP@5: 0.862962962962963\n", + "MAP@10: 0.8074074074074075\n" + ] + } + ], + "source": [ + "maps = calc_MAP(pred_hard_encodings, cutoffs)\n", + "for i, c in enumerate(cutoffs):\n", + " print(f\"MAP@{c}: {maps[i]}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "test", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/_sources/tutorial/3_Indexing.rst.txt b/_sources/tutorial/3_Indexing.rst.txt new file mode 100644 index 00000000..f4eddca4 --- /dev/null +++ b/_sources/tutorial/3_Indexing.rst.txt @@ -0,0 +1,13 @@ +3. Indexing +=========== + +.. toctree:: + :hidden: + :maxdepth: 1 + :caption: Indexing + + 3_Indexing/3.1.1 + 3_Indexing/3.1.2 + 3_Indexing/3.1.3 + 3_Indexing/3.1.4 + 3_Indexing/3.1.5 \ No newline at end of file diff --git a/_sources/tutorial/3_Indexing/3.1.1.ipynb.txt b/_sources/tutorial/3_Indexing/3.1.1.ipynb.txt new file mode 100644 index 00000000..46a157d2 --- /dev/null +++ b/_sources/tutorial/3_Indexing/3.1.1.ipynb.txt @@ -0,0 +1,411 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Indexing Using Faiss" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In practical cases, datasets contain thousands or millions of rows. Looping through the whole corpus to find the best answer to a query is very time and space consuming. 
In this tutorial, we'll introduce how to use indexing to make our retrieval fast and neat." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 0: Setup" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install the dependencies in the environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -U FlagEmbedding" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### faiss-gpu on Linux (x86_64)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Faiss maintains its latest releases on conda. So if you have GPUs on Linux x86_64, create a conda virtual environment and run:\n", + "\n", + "```conda install -c pytorch -c nvidia faiss-gpu=1.8.0```\n", + "\n", + "and make sure you select that conda env as the kernel for this notebook." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### faiss-cpu\n", + "\n", + "Otherwise, simply run the following cell to install `faiss-cpu`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -U faiss-cpu" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 1: Dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Below is a tiny corpus with only 10 sentences, which will be the dataset we use.\n", + "\n", + "Each sentence is a concise description of a famous person in a specific domain." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "corpus = [\n", + " \"Michael Jackson was a legendary pop icon known for his record-breaking music and dance innovations.\",\n", + " \"Fei-Fei Li is a professor in Stanford University, revolutionized computer vision with the ImageNet project.\",\n", + " \"Brad Pitt is a versatile actor and producer known for his roles in films like 'Fight Club' and 'Once Upon a Time in Hollywood.'\",\n", + " \"Geoffrey Hinton, as a foundational figure in AI, received Turing Award for his contribution in deep learning.\",\n", + " \"Eminem is a renowned rapper and one of the best-selling music artists of all time.\",\n", + " \"Taylor Swift is a Grammy-winning singer-songwriter known for her narrative-driven music.\",\n", + " \"Sam Altman leads OpenAI as its CEO, with astonishing works of GPT series and pursuing safe and beneficial AI.\",\n", + " \"Morgan Freeman is an acclaimed actor famous for his distinctive voice and diverse roles.\",\n", + " \"Andrew Ng spread AI knowledge globally via public courses on Coursera and Stanford University.\",\n", + " \"Robert Downey Jr. is an iconic actor best known for playing Iron Man in the Marvel Cinematic Universe.\",\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And a few queries (add your own queries and check the result!): " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "queries = [\n", + " \"Who is Robert Downey Jr.?\",\n", + " \"An expert of neural network\",\n", + " \"A famous female singer\",\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 2: Text Embedding" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's embed the whole corpus (just 10 documents here) with a BGE embedding model and check the shape and data type of the embeddings."
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "shape of the corpus embeddings: (10, 768)\n", + "data type of the embeddings: float32\n" + ] + } + ], + "source": [ + "from FlagEmbedding import FlagModel\n", + "\n", + "# get the BGE embedding model\n", + "model = FlagModel('BAAI/bge-base-en-v1.5',\n", + " query_instruction_for_retrieval=\"Represent this sentence for searching relevant passages:\",\n", + " use_fp16=True)\n", + "\n", + "# get the embedding of the corpus\n", + "corpus_embeddings = model.encode(corpus)\n", + "\n", + "print(\"shape of the corpus embeddings:\", corpus_embeddings.shape)\n", + "print(\"data type of the embeddings: \", corpus_embeddings.dtype)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Faiss only accepts float32 inputs.\n", + "\n", + "So make sure the dtype of corpus_embeddings is float32 before adding them to the index." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "corpus_embeddings = corpus_embeddings.astype(np.float32)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 3: Indexing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this step, we build an index and add the embedding vectors to it." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "import faiss\n", + "\n", + "# get the length of our embedding vectors, vectors by bge-base-en-v1.5 have length 768\n", + "dim = corpus_embeddings.shape[-1]\n", + "\n", + "# create the faiss index and store the corpus embeddings into the vector space\n", + "index = faiss.index_factory(dim, 'Flat', faiss.METRIC_INNER_PRODUCT)\n", + "\n", + "# if you installed faiss-gpu, uncomment the following lines to make the index on your GPUs.\n", + "\n", + "# co = faiss.GpuMultipleClonerOptions()\n", + "# index = faiss.index_cpu_to_all_gpus(index, co)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "No need to train if we use \"Flat\" quantizer and METRIC_INNER_PRODUCT as metric. Some other indices that using quantization might need training." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "True\n", + "total number of vectors: 10\n" + ] + } + ], + "source": [ + "# check if the index is trained\n", + "print(index.is_trained) \n", + "# index.train(corpus_embeddings)\n", + "\n", + "# add all the vectors to the index\n", + "index.add(corpus_embeddings)\n", + "\n", + "print(f\"total number of vectors: {index.ntotal}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 3.5 (Optional): Saving Faiss index" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once you have your index with the embedding vectors, you can save it locally for future usage." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "# change the path to where you want to save the index\n", + "path = \"./index.bin\"\n", + "faiss.write_index(index, path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you already have stored index in your local directory, you can load it by:" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "index = faiss.read_index(\"./index.bin\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 4: Find answers to the query" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, get the embeddings of all the queries:" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "query_embeddings = model.encode_queries(queries)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then, use the Faiss index to do a knn search in the vector space:" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[0.6686779 0.37858668 0.3767978 ]\n", + " [0.6062041 0.59364545 0.527691 ]\n", + " [0.5409331 0.5097007 0.42427146]]\n", + "[[9 7 2]\n", + " [3 1 8]\n", + " [5 0 4]]\n" + ] + } + ], + "source": [ + "dists, ids = index.search(query_embeddings, k=3)\n", + "print(dists)\n", + "print(ids)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's see the result:" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query:\tWho is Robert Downey Jr.?\n", + "answer:\tRobert Downey Jr. is an iconic actor best known for playing Iron Man in the Marvel Cinematic Universe.\n", + "\n", + "query:\tAn expert of neural network\n", + "answer:\tGeoffrey Hinton, as a foundational figure in AI, received Turing Award for his contribution in deep learning.\n", + "\n", + "query:\tA famous female singer\n", + "answer:\tTaylor Swift is a Grammy-winning singer-songwriter known for her narrative-driven music.\n", + "\n" + ] + } + ], + "source": [ + "for i, q in enumerate(queries):\n", + " print(f\"query:\\t{q}\\nanswer:\\t{corpus[ids[i][0]]}\\n\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/_sources/tutorial/3_Indexing/3.1.2.ipynb.txt b/_sources/tutorial/3_Indexing/3.1.2.ipynb.txt new file mode 100644 index 00000000..b75cb5ed --- /dev/null +++ b/_sources/tutorial/3_Indexing/3.1.2.ipynb.txt @@ -0,0 +1,373 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Faiss GPU" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the last tutorial, we went through the basics of indexing using faiss-cpu. While for the use cases in research and industry. The size of dataset for indexing will be extremely large, the frequency of searching might also be very high. 
In this tutorial we'll see how to combine Faiss and GPU almost seamlessly." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Installation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Faiss maintains its latest releases on conda, and its GPU version only supports Linux x86_64.\n", + "\n", + "Create a conda virtual environment and run:\n", + "\n", + "```conda install -c pytorch -c nvidia faiss-gpu=1.8.0```\n", + "\n", + "Make sure you select that conda env as the kernel for this notebook. After installation, restart the kernel." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If your system does not satisfy the requirement, install faiss-cpu and just skip the GPU-related steps." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Data Preparation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, let's create a dataset of \"fake embeddings\" to act as the corpus:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import faiss\n", + "import numpy as np\n", + "\n", + "dim = 768\n", + "corpus_size = 1000\n", + "# np.random.seed(111)\n", + "\n", + "corpus = np.random.random((corpus_size, dim)).astype('float32')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Create Index on CPU" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Option 1:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Faiss provides a large number of index types that can be initialized directly:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# first build a flat index (on CPU)\n", + "index = faiss.IndexFlatIP(dim)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Option 2:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Besides the basic index classes, we can also use the index_factory function to produce a composite Faiss index." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "index = faiss.index_factory(dim, \"Flat\", faiss.METRIC_L2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Build GPU Index and Search" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "All the GPU indexes are built with a `StandardGpuResources` object, which contains all the needed resources for each GPU in use. By default it will allocate 18% of the total VRAM as temporary scratch space." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `GpuClonerOptions` and `GpuMultipleClonerOptions` objects are optional when cloning an index from CPU to GPU. They are used to adjust the way the GPUs store the index."
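+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As an optional illustration (not part of the original flow), the sketch below shows how these objects can be tuned before cloning, assuming your faiss-gpu build exposes `setTempMemory` and `useFloat16`; the numbers are purely illustrative." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# illustrative sketch: tune the GPU resources and cloner options before cloning\n", + "rs = faiss.StandardGpuResources()\n", + "rs.setTempMemory(256 * 1024 * 1024)   # cap the scratch space at 256 MB instead of the default 18%\n", + "\n", + "co = faiss.GpuClonerOptions()\n", + "co.useFloat16 = True                  # store the index in float16 on the GPU to save memory" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "These objects are then passed to `index_cpu_to_gpu` exactly as in the cells below."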
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Single GPU:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# use a single GPU\n", + "rs = faiss.StandardGpuResources()\n", + "co = faiss.GpuClonerOptions()\n", + "\n", + "# then make it to gpu index\n", + "index_gpu = faiss.index_cpu_to_gpu(provider=rs, device=0, index=index, options=co)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 5.31 ms, sys: 6.26 ms, total: 11.6 ms\n", + "Wall time: 8.94 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "index_gpu.add(corpus)\n", + "D, I = index_gpu.search(corpus, 4)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### All Available GPUs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If your system contains multiple GPUs, Faiss provides the option to deploy al available GPUs. You can control their usages through `GpuMultipleClonerOptions`, e.g. whether to shard or replicate the index acrross GPUs." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# cloner options for multiple GPUs\n", + "co = faiss.GpuMultipleClonerOptions()\n", + "\n", + "index_gpu = faiss.index_cpu_to_all_gpus(index=index, co=co)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 29.8 ms, sys: 26.8 ms, total: 56.6 ms\n", + "Wall time: 33.9 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "index_gpu.add(corpus)\n", + "D, I = index_gpu.search(corpus, 4)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Multiple GPUs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There's also option that use multiple GPUs but not all:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "ngpu = 4\n", + "resources = [faiss.StandardGpuResources() for _ in range(ngpu)]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create vectors for the GpuResources and divices, then pass them to the index_cpu_to_gpu_multiple() function." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "vres = faiss.GpuResourcesVector()\n", + "vdev = faiss.Int32Vector()\n", + "for i, res in zip(range(ngpu), resources):\n", + " vdev.push_back(i)\n", + " vres.push_back(res)\n", + "index_gpu = faiss.index_cpu_to_gpu_multiple(vres, vdev, index)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 3.49 ms, sys: 13.4 ms, total: 16.9 ms\n", + "Wall time: 9.03 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "index_gpu.add(corpus)\n", + "D, I = index_gpu.search(corpus, 4)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "All the three approaches should lead to identical result. 
Now let's do a quick sanity check:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# The nearest neighbor of each vector in the corpus is itself\n", + "assert np.all(corpus[:] == corpus[I[:, 0]])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And the corresponding distance should be 0." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[ 0. 111.30057 113.2251 113.342316]\n", + " [ 0. 111.158875 111.742325 112.09038 ]\n", + " [ 0. 116.44429 116.849915 117.30502 ]]\n" + ] + } + ], + "source": [ + "print(D[:3])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "faiss", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/_sources/tutorial/3_Indexing/3.1.3.ipynb.txt b/_sources/tutorial/3_Indexing/3.1.3.ipynb.txt new file mode 100644 index 00000000..4444d8fc --- /dev/null +++ b/_sources/tutorial/3_Indexing/3.1.3.ipynb.txt @@ -0,0 +1,417 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Faiss Indexes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This tutorial will go through several widely used indexes in Faiss that fits different requirements, and how to use them." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preparation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For CPU usage, use:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install faiss-cpu" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For GPU on Linux x86_64 system, use Conda:\n", + "\n", + "```conda install -c pytorch -c nvidia faiss-gpu=1.8.0```" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import faiss\n", + "import numpy as np\n", + "\n", + "np.random.seed(768)\n", + "\n", + "data = np.random.random((1000, 128))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. `IndexFlat*`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Flat index is the very fundamental index structure. It does not do any preprocess for the incoming vectors. All the vectors are stored directly without compression or quantization. Thus no training is need for flat indexes.\n", + "\n", + "When searching, Flat index will decode all the vectors sequentially and compute the similarity score to the query vectors. Thus, Flat Index guarantees the global optimum of results." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Flat index family is small: just `IndexFlatL2` and `IndexFlatIP`, which are just different by the similarity metrics of Euclidean distance and inner product." 
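+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As a quick aside (an illustrative sketch, not part of the original example), `IndexFlatIP` is used in exactly the same way; if the vectors are L2-normalized first with `faiss.normalize_L2`, the inner product becomes equivalent to cosine similarity. The variable names below are made up for this illustration." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# illustrative sketch: inner-product flat index on L2-normalized copies of the data\n", + "data_ip = data.astype('float32')\n", + "faiss.normalize_L2(data_ip)   # in-place normalization, so inner product == cosine similarity\n", + "\n", + "index_ip = faiss.IndexFlatIP(data_ip.shape[1])\n", + "index_ip.add(data_ip)\n", + "\n", + "# the best match of a vector is itself, with similarity close to 1.0\n", + "D_ip, I_ip = index_ip.search(data_ip[:1], 3)\n", + "print(I_ip, D_ip)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The main example below sticks with `IndexFlatL2`."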
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Usage:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "d = 128 # dimension of the vector\n", + "k = 3 # number of nearest neighbors to search\n", + "\n", + "# just simply create the index and add all the data\n", + "index = faiss.IndexFlatL2(d)\n", + "index.add(data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Sanity check:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "closest elements: [[ 0 471 188]]\n", + "distance: [[ 0. 16.257435 16.658928]]\n" + ] + } + ], + "source": [ + "# search for the k nearest neighbor for the first element in data\n", + "D, I = index.search(data[:1], k)\n", + "\n", + "print(f\"closest elements: {I}\")\n", + "print(f\"distance: {D}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Flat indexes guarantee perfect quality, but at the cost of speed. They work well on small datasets or in cases where speed is not a crucial factor. \n", + "\n", + "But what about the cases where speed is important? There's no way to have it all, so we want indexes that sacrifice as little quality as possible to gain speed. That's why approximate nearest-neighbor (ANN) algorithms are widely accepted. Now we will go through a few popular ANN methods used in vector searching." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. `IndexIVF*`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Intro\n", + "\n", + "The Inverted File Flat (IVF) index is a widely accepted technique to speed up searching by using k-means or a Voronoi diagram to create a number of cells (or clusters) in the whole space. Given a query, only a number of the closest cells are searched, and the `k` closest elements to the query are retrieved from those cells.\n", + "\n", + "- `quantizer` is another index/quantizer to assign vectors to inverted lists.\n", + "- `nlist` is the number of cells the space is partitioned into.\n", + "- `nprobe` is the number of closest cells to visit at query time." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Tradeoff\n", + "\n", + "Increasing `nlist` shrinks the size of each cell, which speeds up the search process. But the smaller coverage sacrifices accuracy and increases the possibility of the edge/surface problem described in the Shortage section below.\n", + "\n", + "Increasing `nprobe` widens the search scope, preferring search quality at the cost of slower speed." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Shortage\n", + "\n", + "There could be a problem when the query vector lands on the edge/surface of a cell. It is possible that the closest element falls into a neighboring cell, which may not be considered if `nprobe` is not large enough."
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Example" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "nlist = 5\n", + "nprob = 2\n", + "\n", + "# the quantizer defines how to store and compare the vectors\n", + "quantizer = faiss.IndexFlatL2(d)\n", + "index = faiss.IndexIVFFlat(quantizer, d, nlist)\n", + "\n", + "# note different from flat index, IVF index first needs training to create the cells\n", + "index.train(data)\n", + "index.add(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "closest elements: [[ 0 471 188]]\n", + "distance: [[ 0. 16.257435 16.658928]]\n" + ] + } + ], + "source": [ + "# set nprob before searching\n", + "index.nprobe = 8\n", + "D, I = index.search(data[:1], k)\n", + "\n", + "print(f\"closest elements: {I}\")\n", + "print(f\"distance: {D}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. `IndexHNSW*`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Intro\n", + "\n", + "Hierarchical Navigable Small World (HNSW) indexing is a graph based method, which is an extension of navigable small world (NSW). It builds a multi-layered graph where nodes (vectors) are connected based on their proximity, forming \"small-world\" structures that allow efficient navigation through the space.\n", + "\n", + "- `M` is the number of neighbors each vector has in the graph.\n", + "- `efConstruction` is the number of entry points to explore when building the index.\n", + "- `efSearch` is the number of entry points to explore when searching." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Tradeoff\n", + "\n", + "Increasing `M` or `efSearch` will make greater fidelity with reasonable longer time. Larger `efConstruction` mainly increases the index construction time.\n", + "\n", + "HNSW has great searching quality and speed. But it is memory-consuming due to the graph structure. Scaling up `M` will cause a linear increase of memory usage.\n", + "\n", + "Note that HNSW index does not support vector's removal because removing nodes will distroy graph structure.\n", + "\n", + "Thus HNSW is a great index to choose when RAM is not a limiting factor." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Example" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "M = 32\n", + "ef_search = 16\n", + "ef_construction = 32\n", + "\n", + "index = faiss.IndexHNSWFlat(d, M)\n", + "# set the two parameters before adding data\n", + "index.hnsw.efConstruction = ef_construction\n", + "index.hnsw.efSearch = ef_search\n", + "\n", + "index.add(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "closest elements: [[ 0 471 188]]\n", + "distance: [[ 0. 16.257435 16.658928]]\n" + ] + } + ], + "source": [ + "D, I = index.search(data[:1], k)\n", + "\n", + "print(f\"closest elements: {I}\")\n", + "print(f\"distance: {D}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4. `IndexLSH`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Intro\n", + "\n", + "Locality Sensitive Hashing (LSH) is an ANN method that hashing data points into buckets. 
While well known use cases of hash function such as dictionary/hashtabel are trying to avoid hashing collisions, LSH trys to maximize hashing collisions. Similar vectors will be grouped into same hash bucket.\n", + "\n", + "In Faiss, `IndexLSH` is a Flat index with binary codes. Vectors are hashed into binary codes and compared by Hamming distances.\n", + "\n", + "- `nbits` can be seen as the \"resolution\" of hashed vectors." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Tradeoff\n", + "\n", + "Increasing `nbits` can get higher fidelity with the cost of more memory and longer searching time.\n", + "\n", + "LSH suffers the curse of dimensionality when using a larger `d`. In order to get similar search quality, the `nbits` value needs to be scaled up to maintain the search quality." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Shortage\n", + "\n", + "LSH speeds up searching time with a reasonable sacrifice of quality. But that only applies to small dimension `d`. Even 128 is already too large for LSH. Thus for vectors generated by transformer based embedding models, LSH index is not a common choice." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Example" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "nbits = d * 8\n", + "\n", + "index = faiss.IndexLSH(d, nbits)\n", + "index.train(data)\n", + "index.add(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "closest elements: [[ 0 471 392]]\n", + "distance: [[ 0. 197. 199.]]\n" + ] + } + ], + "source": [ + "D, I = index.search(data[:1], k)\n", + "\n", + "print(f\"closest elements: {I}\")\n", + "print(f\"distance: {D}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "faiss", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/_sources/tutorial/3_Indexing/3.1.4.ipynb.txt b/_sources/tutorial/3_Indexing/3.1.4.ipynb.txt new file mode 100644 index 00000000..f45fee2e --- /dev/null +++ b/_sources/tutorial/3_Indexing/3.1.4.ipynb.txt @@ -0,0 +1,354 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Faiss Quantizers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this notebook, we will introduce the quantizer object in Faiss and how to use them." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preparation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For CPU usage, run:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install faiss-cpu" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For GPU on Linux x86_64 system, use Conda:\n", + "\n", + "```conda install -c pytorch -c nvidia faiss-gpu=1.8.0```" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import faiss\n", + "import numpy as np\n", + "\n", + "np.random.seed(768)\n", + "\n", + "data = np.random.random((1000, 128))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Scalar Quantizer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Normal data type of vector embeedings is usually 32 bit floats. Scalar quantization is transforming the 32 float representation to, for example, 8 bit interger. Thus with a 4x reduction in size. In this way, it can be seen as we distribute each dimension into 256 buckets." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "| Name | Class | Parameters |\n", + "|:------------:|:--------:|:-----------|\n", + "| `ScalarQuantizer` | Quantizer class | `d`: dimension of vectors
`qtype`: map dimension into $2^\\text{qtype}$ clusters |\n", + "| `IndexScalarQuantizer` | Flat index class | `d`: dimension of vectors
`qtype`: map dimension into $2^\\text{qtype}$ clusters
`metric`: similarity metric (L2 or IP) |\n", + "| `IndexIVFScalarQuantizer` | IVF index class | `d`: dimension of vectors
`nlist`: number of cells/clusters to partition the inverted file space
`qtype`: map dimension into $2^\\text{qtype}$ clusters
`metric`: similarity metric (L2 or IP)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Quantizer class objects are used to compress the data before adding it into indexes. Flat index class objects and IVF index class objects can be used directly as indexes, in which case quantization is done automatically." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Scalar Quantizer" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[156 180 46 226 13 130 41 187 63 251 16 199 205 166 117 122 214 2\n", + " 206 137 71 186 20 131 59 57 68 114 35 45 28 210 27 93 74 245\n", + " 167 5 32 42 44 128 10 189 10 13 42 162 179 221 241 104 205 21\n", + " 70 87 52 219 172 138 193 0 228 175 144 34 59 88 170 1 233 220\n", + " 20 64 245 241 5 161 41 55 30 247 107 8 229 90 201 10 43 158\n", + " 238 184 187 114 232 90 116 205 14 214 135 158 237 192 205 141 232 176\n", + " 124 176 163 68 49 91 125 70 6 170 55 44 215 84 46 48 218 56\n", + " 107 176]\n" + ] + } + ], + "source": [ + "d = 128\n", + "qtype = faiss.ScalarQuantizer.QT_8bit\n", + "\n", + "quantizer = faiss.ScalarQuantizer(d, qtype)\n", + "\n", + "quantizer.train(data)\n", + "new_data = quantizer.compute_codes(data)\n", + "\n", + "print(new_data[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Scalar Quantizer Index" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "d = 128\n", + "k = 3\n", + "qtype = faiss.ScalarQuantizer.QT_8bit\n", + "# nlist = 5\n", + "\n", + "index = faiss.IndexScalarQuantizer(d, qtype, faiss.METRIC_L2)\n", + "# index = faiss.IndexIVFScalarQuantizer(d, nlist, faiss.ScalarQuantizer.QT_8bit, faiss.METRIC_L2)\n", + "\n", + "index.train(data)\n", + "index.add(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "closest elements: [[ 0 471 188]]\n", + "distance: [[1.6511828e-04 1.6252808e+01 1.6658131e+01]]\n" + ] + } + ], + "source": [ + "D, I = index.search(data[:1], k)\n", + "\n", + "print(f\"closest elements: {I}\")\n", + "print(f\"distance: {D}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Product Quantizer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When speed and memory are crucial factors in searching, the product quantizer (PQ) becomes a top choice. It is one of the most effective quantizers for reducing memory size. \n", + "\n", + "The first step of PQ is dividing the original vectors with dimension `d` into smaller, low-dimensional sub-vectors with dimension `d/m`. Here `m` is the number of sub-vectors.\n", + "\n", + "Then clustering algorithms are used to create a codebook with a fixed number of centroids for each group of sub-vectors.\n", + "\n", + "Next, each sub-vector of a vector is replaced by the index of the closest centroid from its corresponding codebook. Now each vector is stored with only these indices instead of the full vector.\n", + "\n", + "When computing the distance between a query vector and the stored vectors, only the distances to the centroids in the codebooks are calculated, which enables quick approximate nearest neighbor searches."
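+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As a rough, illustrative calculation (using the same settings as the example further below, `d = 128`, `M = 8`, `nbits = 4`): a raw float32 vector takes $128 \\times 4 = 512$ bytes, while its PQ code takes only $8 \\times 4 = 32$ bits $= 4$ bytes of centroid indices, roughly a $128\\times$ reduction, ignoring the size of the codebooks themselves."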
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "| Name | Class | Parameters |\n", + "|:------------:|:--------:|:-----------|\n", + "| `ProductQuantizer` | Quantizer class | `d`: dimension of vectors<br>`M`: number of sub-vectors; `d % M` must be 0<br>`nbits`: number of bits per sub-quantizer, so each codebook contains $2^\text{nbits}$ centroids |\n", + "| `IndexPQ` | Flat index class | `d`: dimension of vectors<br>`M`: number of sub-vectors; `d % M` must be 0<br>`nbits`: number of bits per sub-quantizer, so each codebook contains $2^\text{nbits}$ centroids<br>`metric`: similarity metric (L2 or IP) |\n", + "| `IndexIVFPQ` | IVF index class | `quantizer`: the coarse quantizer used to assign vectors to inverted lists<br>`d`: dimension of vectors<br>`nlist`: number of cells/clusters to partition the inverted file space<br>`M`: number of sub-vectors; `d % M` must be 0<br>`nbits`: number of bits per sub-quantizer, so each codebook contains $2^\text{nbits}$ centroids<br>
`metric`: similarity metric (L2 or IP) |" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Product Quantizer" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "255\n", + "[[ 90 169 226 45]\n", + " [ 33 51 34 15]]\n" + ] + } + ], + "source": [ + "d = 128\n", + "M = 8\n", + "nbits = 4\n", + "\n", + "quantizer = faiss.ProductQuantizer(d, M, nbits)\n", + "\n", + "quantizer.train(data)\n", + "new_data = quantizer.compute_codes(data)\n", + "\n", + "print(new_data.max())\n", + "print(new_data[:2])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Product Quantizer Index" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "index = faiss.IndexPQ(d, M, nbits, faiss.METRIC_L2)\n", + "\n", + "index.train(data)\n", + "index.add(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "closest elements: [[ 0 946 330]]\n", + "distance: [[ 8.823908 11.602461 11.746731]]\n" + ] + } + ], + "source": [ + "D, I = index.search(data[:1], k)\n", + "\n", + "print(f\"closest elements: {I}\")\n", + "print(f\"distance: {D}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Product Quantizer IVF Index" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "nlist = 5\n", + "\n", + "quantizer = faiss.IndexFlat(d, faiss.METRIC_L2)\n", + "index = faiss.IndexIVFPQ(quantizer, d, nlist, M, nbits, faiss.METRIC_L2)\n", + "\n", + "index.train(data)\n", + "index.add(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "closest elements: [[ 0 899 521]]\n", + "distance: [[ 8.911423 12.088312 12.104569]]\n" + ] + } + ], + "source": [ + "D, I = index.search(data[:1], k)\n", + "\n", + "print(f\"closest elements: {I}\")\n", + "print(f\"distance: {D}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/_sources/tutorial/3_Indexing/3.1.5.ipynb.txt b/_sources/tutorial/3_Indexing/3.1.5.ipynb.txt new file mode 100644 index 00000000..f4b771e2 --- /dev/null +++ b/_sources/tutorial/3_Indexing/3.1.5.ipynb.txt @@ -0,0 +1,624 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Choosing Index" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Give a great amount of indexes and quantizers, how to choose the one in the experiment/application? In this part, we will give a general suggestion on how to choose the one fits your need." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 0. 
Preparation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Packages" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For CPU usage, run:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# %pip install -U faiss-cpu numpy h5py" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For GPU on Linux x86_64 system, use Conda:\n", + "\n", + "```conda install -c pytorch -c nvidia faiss-gpu=1.8.0```" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from urllib.request import urlretrieve\n", + "import h5py\n", + "import faiss\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this tutorial, we'll use [SIFT1M](http://corpus-texmex.irisa.fr/), a very popular dataset for ANN evaluation, as our dataset to demonstrate the comparison.\n", + "\n", + "Run the following cell to download the dataset or you can also manually download from the repo [ann-benchmarks](https://github.com/erikbern/ann-benchmarks?tab=readme-ov-file#data-sets))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data_url = \"http://ann-benchmarks.com/sift-128-euclidean.hdf5\"\n", + "destination = \"data.hdf5\"\n", + "urlretrieve(data_url, destination)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then load the data from the hdf5 file." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(1000000, 128) float32\n", + "(10000, 128) float32\n" + ] + } + ], + "source": [ + "with h5py.File('data.hdf5', 'r') as f:\n", + " corpus = f['train'][:]\n", + " query = f['test'][:]\n", + "\n", + "print(corpus.shape, corpus.dtype)\n", + "print(query.shape, corpus.dtype)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "d = corpus[0].shape[0]\n", + "k = 100" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Helper function" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following is a helper function for computing recall." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# compute recall from the prediction results and ground truth\n", + "def compute_recall(res, truth):\n", + " recall = 0\n", + " for i in range(len(res)):\n", + " intersect = np.intersect1d(res[i], truth[i])\n", + " recall += len(intersect) / len(res[i])\n", + " recall /= len(res)\n", + "\n", + " return recall" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Flat Index" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Flat index use brute force to search neighbors for each query. It guarantees the optimal result with 100% recall. Thus we use the result from it as the ground truth." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 69.2 ms, sys: 80.6 ms, total: 150 ms\n", + "Wall time: 149 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "index = faiss.IndexFlatL2(d)\n", + "index.add(corpus)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 17min 30s, sys: 1.62 s, total: 17min 31s\n", + "Wall time: 2min 1s\n" + ] + } + ], + "source": [ + "%%time\n", + "D, I_truth = index.search(query, k)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. IVF Index" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 10.6 s, sys: 831 ms, total: 11.4 s\n", + "Wall time: 419 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "nlist = 5\n", + "nprob = 3\n", + "\n", + "quantizer = faiss.IndexFlatL2(d)\n", + "index = faiss.IndexIVFFlat(quantizer, d, nlist)\n", + "index.nprobe = nprob\n", + "\n", + "index.train(corpus)\n", + "index.add(corpus)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 9min 15s, sys: 598 ms, total: 9min 16s\n", + "Wall time: 12.5 s\n" + ] + } + ], + "source": [ + "%%time\n", + "D, I = index.search(query, k)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Recall: 0.9999189999999997\n" + ] + } + ], + "source": [ + "recall = compute_recall(I, I_truth)\n", + "print(f\"Recall: {recall}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "From the test we can see that IVFFlatL2 has a pretty good promotion for the searching speed with a very tiny loss of recall." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. 
HNSW Index" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 11min 21s, sys: 595 ms, total: 11min 22s\n", + "Wall time: 17 s\n" + ] + } + ], + "source": [ + "%%time\n", + "M = 64\n", + "ef_search = 32\n", + "ef_construction = 64\n", + "\n", + "index = faiss.IndexHNSWFlat(d, M)\n", + "# set the two parameters before adding data\n", + "index.hnsw.efConstruction = ef_construction\n", + "index.hnsw.efSearch = ef_search\n", + "\n", + "index.add(corpus)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 5.14 s, sys: 3.94 ms, total: 5.14 s\n", + "Wall time: 110 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "D, I = index.search(query, k)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Recall: 0.8963409999999716\n" + ] + } + ], + "source": [ + "recall = compute_recall(I, I_truth)\n", + "print(f\"Recall: {recall}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "From the searching time of less than 1 second, we can see why HNSW is one of the best choice when looking for an extreme speed during searching phase. The reduction of recall is acceptable. But the longer time during creation of index and large memory footprint need to be considered." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. LSH" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 13.7 s, sys: 660 ms, total: 14.4 s\n", + "Wall time: 12.1 s\n" + ] + } + ], + "source": [ + "%%time\n", + "nbits = d * 8\n", + "\n", + "index = faiss.IndexLSH(d, nbits)\n", + "index.train(corpus)\n", + "index.add(corpus)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 3min 20s, sys: 84.2 ms, total: 3min 20s\n", + "Wall time: 5.64 s\n" + ] + } + ], + "source": [ + "%%time\n", + "D, I = index.search(query, k)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Recall: 0.5856720000000037\n" + ] + } + ], + "source": [ + "recall = compute_recall(I, I_truth)\n", + "print(f\"Recall: {recall}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As we covered in the last notebook, LSH is not a good choice when the data dimension is large. Here 128 is already burdened for LSH. As we can see, even we choose a relatively small `nbits` of d * 8, the index creating time and search time are still pretty long. And the recall of about 58.6% is not satisfactory." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. 
Scalar Quantizer Index" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 550 ms, sys: 18 ms, total: 568 ms\n", + "Wall time: 87.4 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "qtype = faiss.ScalarQuantizer.QT_8bit\n", + "metric = faiss.METRIC_L2\n", + "\n", + "index = faiss.IndexScalarQuantizer(d, qtype, metric)\n", + "index.train(corpus)\n", + "index.add(corpus)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 7min 36s, sys: 169 ms, total: 7min 36s\n", + "Wall time: 12.7 s\n" + ] + } + ], + "source": [ + "%%time\n", + "D, I = index.search(query, k)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Recall: 0.990444999999872\n" + ] + } + ], + "source": [ + "recall = compute_recall(I, I_truth)\n", + "print(f\"Recall: {recall}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here scalar quantizer index's performance looks very similar to the Flat index. Because the elements of vectors in the SIFT dataset are integers in the range of [0, 218]. Thus the index does not lose to much information during scalar quantization. For the dataset with more complex distribution in float32. The difference will be more obvious." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Product Quantizer Index" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 46.7 s, sys: 22.3 ms, total: 46.7 s\n", + "Wall time: 1.36 s\n" + ] + } + ], + "source": [ + "%%time\n", + "M = 16\n", + "nbits = 8\n", + "metric = faiss.METRIC_L2\n", + "\n", + "index = faiss.IndexPQ(d, M, nbits, metric)\n", + "\n", + "index.train(corpus)\n", + "index.add(corpus)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 1min 37s, sys: 106 ms, total: 1min 37s\n", + "Wall time: 2.8 s\n" + ] + } + ], + "source": [ + "%%time\n", + "D, I = index.search(query, k)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Recall: 0.630898999999999\n" + ] + } + ], + "source": [ + "recall = compute_recall(I, I_truth)\n", + "print(f\"Recall: {recall}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Product quantizer index is not standout in any one of the aspect. But it somewhat balance the tradeoffs. It is widely used in real applications with the combination of other indexes such as IVF or HNSW." 
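To illustrate that combination (this sketch is not part of the original notebook), the cell below builds an IVF index with PQ-compressed codes on the same SIFT1M corpus; `nlist = 1024`, `M = 16`, `nbits = 8`, and `nprobe = 16` are only placeholder choices, and the snippet reuses `corpus`, `query`, `k`, `d`, `I_truth`, and `compute_recall` from the cells above.

```python
# Hypothetical sketch: IVF coarse quantization combined with PQ codes.
import faiss

quantizer = faiss.IndexFlatL2(d)                               # coarse quantizer for cell assignment
index = faiss.IndexIVFPQ(quantizer, d, 1024, 16, 8, faiss.METRIC_L2)

index.train(corpus)
index.add(corpus)

index.nprobe = 16                                              # number of inverted lists to visit per query
D, I = index.search(query, k)
print(f"Recall: {compute_recall(I, I_truth)}")
```

Compared with `IndexPQ` alone, the IVF layer restricts each query to a few cells, which usually improves search speed further while keeping the small memory footprint.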
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/_sources/tutorial/4_Evaluation.rst.txt b/_sources/tutorial/4_Evaluation.rst.txt new file mode 100644 index 00000000..403b804d --- /dev/null +++ b/_sources/tutorial/4_Evaluation.rst.txt @@ -0,0 +1,12 @@ +4. Evaluation +============= + +.. toctree:: + :hidden: + :maxdepth: 1 + :caption: Evaluation + + 4_Evaluation/4.1.1 + 4_Evaluation/4.2.1 + 4_Evaluation/4.2.2 + 4_Evaluation/4.3.1 diff --git a/_sources/tutorial/4_Evaluation/4.1.1.ipynb.txt b/_sources/tutorial/4_Evaluation/4.1.1.ipynb.txt new file mode 100644 index 00000000..ad24b17a --- /dev/null +++ b/_sources/tutorial/4_Evaluation/4.1.1.ipynb.txt @@ -0,0 +1,509 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Evaluation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Evaluation is a crucial part in all machine learning tasks. In this notebook, we will walk through the whole pipeline of evaluating the performance of an embedding model on [MS Marco](https://microsoft.github.io/msmarco/), and use three metrics to show its performance." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 0: Setup" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install the dependencies in the environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -U FlagEmbedding faiss-cpu" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 1: Load Dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, download the queries and MS Marco from Huggingface Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "import numpy as np\n", + "\n", + "data = load_dataset(\"namespace-Pt/msmarco\", split=\"dev\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Considering time cost, we will use the truncated dataset in this tutorial. `queries` contains the first 100 queries from the dataset. `corpus` is formed by the positives of the the first 5,000 queries." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "queries = np.array(data[:100][\"query\"])\n", + "corpus = sum(data[:5000][\"positive\"], [])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you have GPU and would like to try out the full evaluation of MS Marco, uncomment and run the following cell:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# data = load_dataset(\"namespace-Pt/msmarco\", split=\"dev\")\n", + "# queries = np.array(data[\"query\"])\n", + "\n", + "# corpus = load_dataset(\"namespace-PT/msmarco-corpus\", split=\"train\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 2: Embedding" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Choose the embedding model that we would like to evaluate, and encode the corpus to embeddings." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Inference Embeddings: 100%|██████████| 21/21 [02:10<00:00, 6.22s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "shape of the corpus embeddings: (5331, 768)\n", + "data type of the embeddings: float32\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "from FlagEmbedding import FlagModel\n", + "\n", + "# get the BGE embedding model\n", + "model = FlagModel('BAAI/bge-base-en-v1.5',\n", + " query_instruction_for_retrieval=\"Represent this sentence for searching relevant passages:\",\n", + " use_fp16=True)\n", + "\n", + "# get the embedding of the corpus\n", + "corpus_embeddings = model.encode(corpus)\n", + "\n", + "print(\"shape of the corpus embeddings:\", corpus_embeddings.shape)\n", + "print(\"data type of the embeddings: \", corpus_embeddings.dtype)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 3: Indexing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We use the index_factory() functions to create a Faiss index we want:\n", + "\n", + "- The first argument `dim` is the dimension of the vector space, in this case is 768 if you're using bge-base-en-v1.5.\n", + "\n", + "- The second argument `'Flat'` makes the index do exhaustive search.\n", + "\n", + "- The thrid argument `faiss.METRIC_INNER_PRODUCT` tells the index to use inner product as the distance metric." 
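One sanity check worth running here (not in the original notebook): inner-product scores only behave like cosine similarity when the embeddings are L2-normalized, which `FlagModel` normally does for BGE embeddings. The snippet below simply inspects the norms of the corpus embeddings computed above.

```python
import numpy as np

# If the norms are all ~1.0, inner-product search is equivalent to cosine similarity.
norms = np.linalg.norm(corpus_embeddings, axis=1)
print("min norm:", norms.min(), "max norm:", norms.max())
```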
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total number of vectors: 5331\n" + ] + } + ], + "source": [ + "import faiss\n", + "\n", + "# get the length of our embedding vectors, vectors by bge-base-en-v1.5 have length 768\n", + "dim = corpus_embeddings.shape[-1]\n", + "\n", + "# create the faiss index and store the corpus embeddings into the vector space\n", + "index = faiss.index_factory(dim, 'Flat', faiss.METRIC_INNER_PRODUCT)\n", + "corpus_embeddings = corpus_embeddings.astype(np.float32)\n", + "# train and add the embeddings to the index\n", + "index.train(corpus_embeddings)\n", + "index.add(corpus_embeddings)\n", + "\n", + "print(f\"total number of vectors: {index.ntotal}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Since the embedding process is time consuming, it's a good choice to save the index for reproduction or other experiments.\n", + "\n", + "Uncomment the following lines to save the index." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# path = \"./index.bin\"\n", + "# faiss.write_index(index, path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you already have stored index in your local directory, you can load it by:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# index = faiss.read_index(\"./index.bin\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 4: Retrieval" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get the embeddings of all the queries, and get their corresponding ground truth answers for evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "query_embeddings = model.encode_queries(queries)\n", + "ground_truths = [d[\"positive\"] for d in data]\n", + "corpus = np.asarray(corpus)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Use the faiss index to search top $k$ answers of each query." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Searching: 100%|██████████| 1/1 [00:00<00:00, 20.91it/s]\n" + ] + } + ], + "source": [ + "from tqdm import tqdm\n", + "\n", + "res_scores, res_ids, res_text = [], [], []\n", + "query_size = len(query_embeddings)\n", + "batch_size = 256\n", + "# The cutoffs we will use during evaluation, and set k to be the maximum of the cutoffs.\n", + "cut_offs = [1, 10]\n", + "k = max(cut_offs)\n", + "\n", + "for i in tqdm(range(0, query_size, batch_size), desc=\"Searching\"):\n", + " q_embedding = query_embeddings[i: min(i+batch_size, query_size)].astype(np.float32)\n", + " # search the top k answers for each of the queries\n", + " score, idx = index.search(q_embedding, k=k)\n", + " res_scores += list(score)\n", + " res_ids += list(idx)\n", + " res_text += list(corpus[idx])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 5: Evaluate" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 5.1 Recall" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Recall represents the model's capability of correctly predicting positive instances from all the actual positive samples in the dataset.\n", + "\n", + "$$\\textbf{Recall}=\\frac{\\text{True Positives}}{\\text{True Positives}+\\text{False Negatives}}$$" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Recall is useful when the cost of false negatives is high. In other words, we are trying to find all objects of the positive class, even if this results in some false positives. This attribute makes recall a useful metric for text retrieval tasks." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "recall@1: 0.97\n", + "recall@10: 1.0\n" + ] + } + ], + "source": [ + "def calc_recall(preds, truths, cutoffs):\n", + " recalls = np.zeros(len(cutoffs))\n", + " for text, truth in zip(preds, truths):\n", + " for i, c in enumerate(cutoffs):\n", + " recall = np.intersect1d(truth, text[:c])\n", + " recalls[i] += len(recall) / max(min(c, len(truth)), 1)\n", + " recalls /= len(preds)\n", + " return recalls\n", + "\n", + "recalls = calc_recall(res_text, ground_truths, cut_offs)\n", + "for i, c in enumerate(cut_offs):\n", + " print(f\"recall@{c}: {recalls[i]}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 5.2 MRR" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Mean Reciprocal Rank ([MRR](https://en.wikipedia.org/wiki/Mean_reciprocal_rank)) is a widely used metric in information retrieval to evaluate the effectiveness of a system. It measures the rank position of the first relevant result in a list of search results.\n", + "\n", + "$$MRR=\\frac{1}{|Q|}\\sum_{i=1}^{|Q|}\\frac{1}{rank_i}$$\n", + "\n", + "where \n", + "- $|Q|$ is the total number of queries.\n", + "- $rank_i$ is the rank position of the first relevant document of the i-th query." 
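As a quick illustrative example (not from the notebook): if the first relevant passage for three queries appears at ranks 1, 2, and 4 respectively, then

$$MRR=\frac{1}{3}\left(\frac{1}{1}+\frac{1}{2}+\frac{1}{4}\right)=\frac{7}{12}\approx 0.58$$

The `MRR@k` variant computed in the next cell simply ignores relevant results that appear below the cutoff `k`.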
+ ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "def MRR(preds, truth, cutoffs):\n", + " mrr = [0 for _ in range(len(cutoffs))]\n", + " for pred, t in zip(preds, truth):\n", + " for i, c in enumerate(cutoffs):\n", + " for j, p in enumerate(pred):\n", + " if j < c and p in t:\n", + " mrr[i] += 1/(j+1)\n", + " break\n", + " mrr = [k/len(preds) for k in mrr]\n", + " return mrr" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MRR@1: 0.97\n", + "MRR@10: 0.9825\n" + ] + } + ], + "source": [ + "mrr = MRR(res_text, ground_truths, cut_offs)\n", + "for i, c in enumerate(cut_offs):\n", + " print(f\"MRR@{c}: {mrr[i]}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 5.3 nDCG" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Normalized Discounted cumulative gain (nDCG) measures the quality of a ranked list of search results by considering both the position of the relevant documents and their graded relevance scores. The calculation of nDCG involves two main steps:\n", + "\n", + "1. Discounted cumulative gain (DCG) measures the ranking quality in retrieval tasks.\n", + "\n", + "$$DCG_p=\\sum_{i=1}^p\\frac{2^{rel_i}-1}{\\log_2(i+1)}$$\n", + "\n", + "2. Normalized by ideal DCG to make it comparable across queries.\n", + "$$nDCG_p=\\frac{DCG_p}{IDCG_p}$$\n", + "where $IDCG$ is the maximum possible DCG for a given set of documents, assuming they are perfectly ranked in order of relevance." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "pred_hard_encodings = []\n", + "for pred, label in zip(res_text, ground_truths):\n", + " pred_hard_encoding = list(np.isin(pred, label).astype(int))\n", + " pred_hard_encodings.append(pred_hard_encoding)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "nDCG@1: 0.97\n", + "nDCG@10: 0.9869253606521631\n" + ] + } + ], + "source": [ + "from sklearn.metrics import ndcg_score\n", + "\n", + "for i, c in enumerate(cut_offs):\n", + " nDCG = ndcg_score(pred_hard_encodings, res_scores, k=c)\n", + " print(f\"nDCG@{c}: {nDCG}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Congrats! You have walked through a full pipeline of evaluating an embedding model. Feel free to play with different datasets and models!" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/_sources/tutorial/4_Evaluation/4.2.1.ipynb.txt b/_sources/tutorial/4_Evaluation/4.2.1.ipynb.txt new file mode 100644 index 00000000..3f636f19 --- /dev/null +++ b/_sources/tutorial/4_Evaluation/4.2.1.ipynb.txt @@ -0,0 +1,436 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# MTEB" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For evaluation of embedding models, MTEB is one of the most well-known benchmark. 
In this tutorial, we'll introduce MTEB, its basic usage, and evaluate how your model performs on the MTEB leaderboard." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 0. Installation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install the packages we will use in your environment:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "%pip install sentence_transformers mteb" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Intro" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The [Massive Text Embedding Benchmark (MTEB)](https://github.com/embeddings-benchmark/mteb) is a large-scale evaluation framework designed to assess the performance of text embedding models across a wide variety of natural language processing (NLP) tasks. Introduced to standardize and improve the evaluation of text embeddings, MTEB is crucial for assessing how well these models generalize across various real-world applications. It contains a wide range of datasets in eight main NLP tasks and different languages, and provides an easy pipeline for evaluation.\n", + "\n", + "MTEB is also well known for the MTEB leaderboard, which contains a ranking of the latest first-class embedding models. We'll cover that in the next tutorial. Now let's have a look on how to use MTEB to do evaluation easily." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "import mteb\n", + "from sentence_transformers import SentenceTransformer" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's take a look at how to use MTEB to do a quick evaluation.\n", + "\n", + "First we load the model that we would like to evaluate on:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "model_name = \"BAAI/bge-base-en-v1.5\"\n", + "model = SentenceTransformer(model_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Below is the list of datasets of retrieval used by MTEB's English leaderboard.\n", + "\n", + "MTEB directly use the open source benchmark BEIR in its retrieval part, which contains 15 datasets (note there are 12 subsets of CQADupstack)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "retrieval_tasks = [\n", + " \"ArguAna\",\n", + " \"ClimateFEVER\",\n", + " \"CQADupstackAndroidRetrieval\",\n", + " \"CQADupstackEnglishRetrieval\",\n", + " \"CQADupstackGamingRetrieval\",\n", + " \"CQADupstackGisRetrieval\",\n", + " \"CQADupstackMathematicaRetrieval\",\n", + " \"CQADupstackPhysicsRetrieval\",\n", + " \"CQADupstackProgrammersRetrieval\",\n", + " \"CQADupstackStatsRetrieval\",\n", + " \"CQADupstackTexRetrieval\",\n", + " \"CQADupstackUnixRetrieval\",\n", + " \"CQADupstackWebmastersRetrieval\",\n", + " \"CQADupstackWordpressRetrieval\",\n", + " \"DBPedia\",\n", + " \"FEVER\",\n", + " \"FiQA2018\",\n", + " \"HotpotQA\",\n", + " \"MSMARCO\",\n", + " \"NFCorpus\",\n", + " \"NQ\",\n", + " \"QuoraRetrieval\",\n", + " \"SCIDOCS\",\n", + " \"SciFact\",\n", + " \"Touche2020\",\n", + " \"TRECCOVID\",\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For demonstration, let's just run the first one, \"ArguAna\".\n", + "\n", + "For a full list of tasks and languages that MTEB supports, check the [page](https://github.com/embeddings-benchmark/mteb/blob/18662380f0f476db3d170d0926892045aa9f74ee/docs/tasks.md)." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "tasks = mteb.get_tasks(tasks=retrieval_tasks[:1])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then, create and initialize an MTEB instance with our chosen tasks, and run the evaluation process." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
───────────────────────────────────────────────── Selected tasks  ─────────────────────────────────────────────────\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[38;5;235m───────────────────────────────────────────────── \u001b[0m\u001b[1mSelected tasks \u001b[0m\u001b[38;5;235m ─────────────────────────────────────────────────\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Retrieval\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1mRetrieval\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
    - ArguAna, s2p\n",
+       "
\n" + ], + "text/plain": [ + " - ArguAna, \u001b[3;38;5;241ms2p\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n",
+       "\n",
+       "
\n" + ], + "text/plain": [ + "\n", + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Batches: 100%|██████████| 44/44 [00:41<00:00, 1.06it/s]\n", + "Batches: 100%|██████████| 272/272 [03:36<00:00, 1.26it/s]\n" + ] + } + ], + "source": [ + "# use the tasks we chose to initialize the MTEB instance\n", + "evaluation = mteb.MTEB(tasks=tasks)\n", + "\n", + "# call run() with the model and output_folder\n", + "results = evaluation.run(model, output_folder=\"results\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The results should be stored in `{output_folder}/{model_name}/{model_revision}/{task_name}.json`.\n", + "\n", + "Openning the json file you should see contents as below, which are the evaluation results on \"ArguAna\" with different metrics on cutoffs from 1 to 1000." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + "{\n", + " \"dataset_revision\": \"c22ab2a51041ffd869aaddef7af8d8215647e41a\",\n", + " \"evaluation_time\": 260.14976954460144,\n", + " \"kg_co2_emissions\": null,\n", + " \"mteb_version\": \"1.14.17\",\n", + " \"scores\": {\n", + " \"test\": [\n", + " {\n", + " \"hf_subset\": \"default\",\n", + " \"languages\": [\n", + " \"eng-Latn\"\n", + " ],\n", + " \"main_score\": 0.63616,\n", + " \"map_at_1\": 0.40754,\n", + " \"map_at_10\": 0.55773,\n", + " \"map_at_100\": 0.56344,\n", + " \"map_at_1000\": 0.56347,\n", + " \"map_at_20\": 0.56202,\n", + " \"map_at_3\": 0.51932,\n", + " \"map_at_5\": 0.54023,\n", + " \"mrr_at_1\": 0.4139402560455192,\n", + " \"mrr_at_10\": 0.5603739077423295,\n", + " \"mrr_at_100\": 0.5660817425350153,\n", + " \"mrr_at_1000\": 0.5661121884705748,\n", + " \"mrr_at_20\": 0.564661930998293,\n", + " \"mrr_at_3\": 0.5208629682313899,\n", + " \"mrr_at_5\": 0.5429113323850182,\n", + " \"nauc_map_at_1000_diff1\": 0.15930478114759905,\n", + " \"nauc_map_at_1000_max\": -0.06396189194646361,\n", + " \"nauc_map_at_1000_std\": -0.13168797291549253,\n", + " \"nauc_map_at_100_diff1\": 0.15934819555197366,\n", + " \"nauc_map_at_100_max\": -0.06389635013430676,\n", + " \"nauc_map_at_100_std\": -0.13164524259533786,\n", + " \"nauc_map_at_10_diff1\": 0.16057318234658585,\n", + " \"nauc_map_at_10_max\": -0.060962623117325254,\n", + " \"nauc_map_at_10_std\": -0.1300413865104607,\n", + " \"nauc_map_at_1_diff1\": 0.17346152653542332,\n", + " \"nauc_map_at_1_max\": -0.09705499215630589,\n", + " \"nauc_map_at_1_std\": -0.14726476953035533,\n", + " \"nauc_map_at_20_diff1\": 0.15956349246366208,\n", + " \"nauc_map_at_20_max\": -0.06259296677860492,\n", + " \"nauc_map_at_20_std\": -0.13097093150054095,\n", + " \"nauc_map_at_3_diff1\": 0.15620049317363813,\n", + " \"nauc_map_at_3_max\": -0.06690213479396273,\n", + " \"nauc_map_at_3_std\": -0.13440904793529648,\n", + " \"nauc_map_at_5_diff1\": 0.1557795701081579,\n", + " \"nauc_map_at_5_max\": -0.06255283252590663,\n", + " \"nauc_map_at_5_std\": -0.1355361594910923,\n", + " \"nauc_mrr_at_1000_diff1\": 0.1378988612808882,\n", + " \"nauc_mrr_at_1000_max\": -0.07507962333910836,\n", + " \"nauc_mrr_at_1000_std\": -0.12969109830101241,\n", + " \"nauc_mrr_at_100_diff1\": 0.13794450668758515,\n", + " \"nauc_mrr_at_100_max\": -0.07501290390362861,\n", + " \"nauc_mrr_at_100_std\": -0.12964855554504057,\n", + " \"nauc_mrr_at_10_diff1\": 0.1396047981645623,\n", + " \"nauc_mrr_at_10_max\": -0.07185174301688693,\n", + " \"nauc_mrr_at_10_std\": -0.12807325096717753,\n", + " 
\"nauc_mrr_at_1_diff1\": 0.15610387932529113,\n", + " \"nauc_mrr_at_1_max\": -0.09824591983546396,\n", + " \"nauc_mrr_at_1_std\": -0.13914318784294258,\n", + " \"nauc_mrr_at_20_diff1\": 0.1382786098284509,\n", + " \"nauc_mrr_at_20_max\": -0.07364476417961506,\n", + " \"nauc_mrr_at_20_std\": -0.12898192060943495,\n", + " \"nauc_mrr_at_3_diff1\": 0.13118224861025093,\n", + " \"nauc_mrr_at_3_max\": -0.08164985279853691,\n", + " \"nauc_mrr_at_3_std\": -0.13241573571401533,\n", + " \"nauc_mrr_at_5_diff1\": 0.1346130730317385,\n", + " \"nauc_mrr_at_5_max\": -0.07404093236468848,\n", + " \"nauc_mrr_at_5_std\": -0.1340775377068567,\n", + " \"nauc_ndcg_at_1000_diff1\": 0.15919987960292029,\n", + " \"nauc_ndcg_at_1000_max\": -0.05457945565481172,\n", + " \"nauc_ndcg_at_1000_std\": -0.12457339152558143,\n", + " \"nauc_ndcg_at_100_diff1\": 0.1604091882521101,\n", + " \"nauc_ndcg_at_100_max\": -0.05281549383775287,\n", + " \"nauc_ndcg_at_100_std\": -0.12347288098914058,\n", + " \"nauc_ndcg_at_10_diff1\": 0.1657018523692905,\n", + " \"nauc_ndcg_at_10_max\": -0.036222943297402846,\n", + " \"nauc_ndcg_at_10_std\": -0.11284619565817842,\n", + " \"nauc_ndcg_at_1_diff1\": 0.17346152653542332,\n", + " \"nauc_ndcg_at_1_max\": -0.09705499215630589,\n", + " \"nauc_ndcg_at_1_std\": -0.14726476953035533,\n", + " \"nauc_ndcg_at_20_diff1\": 0.16231721725673165,\n", + " \"nauc_ndcg_at_20_max\": -0.04147115653921931,\n", + " \"nauc_ndcg_at_20_std\": -0.11598700704312062,\n", + " \"nauc_ndcg_at_3_diff1\": 0.15256475371124711,\n", + " \"nauc_ndcg_at_3_max\": -0.05432154580979357,\n", + " \"nauc_ndcg_at_3_std\": -0.12841084787822227,\n", + " \"nauc_ndcg_at_5_diff1\": 0.15236205846534961,\n", + " \"nauc_ndcg_at_5_max\": -0.04356123278888682,\n", + " \"nauc_ndcg_at_5_std\": -0.12942556865700913,\n", + " \"nauc_precision_at_1000_diff1\": -0.038790629929866066,\n", + " \"nauc_precision_at_1000_max\": 0.3630826341915611,\n", + " \"nauc_precision_at_1000_std\": 0.4772189839676386,\n", + " \"nauc_precision_at_100_diff1\": 0.32118609204433185,\n", + " \"nauc_precision_at_100_max\": 0.4740132817600036,\n", + " \"nauc_precision_at_100_std\": 0.3456396169952022,\n", + " \"nauc_precision_at_10_diff1\": 0.22279659689895104,\n", + " \"nauc_precision_at_10_max\": 0.16823918613191954,\n", + " \"nauc_precision_at_10_std\": 0.0377209694331257,\n", + " \"nauc_precision_at_1_diff1\": 0.17346152653542332,\n", + " \"nauc_precision_at_1_max\": -0.09705499215630589,\n", + " \"nauc_precision_at_1_std\": -0.14726476953035533,\n", + " \"nauc_precision_at_20_diff1\": 0.23025740175221762,\n", + " \"nauc_precision_at_20_max\": 0.2892313928157665,\n", + " \"nauc_precision_at_20_std\": 0.13522755012490692,\n", + " \"nauc_precision_at_3_diff1\": 0.1410889527057097,\n", + " \"nauc_precision_at_3_max\": -0.010771302313530132,\n", + " \"nauc_precision_at_3_std\": -0.10744937823276193,\n", + " \"nauc_precision_at_5_diff1\": 0.14012953903010988,\n", + " \"nauc_precision_at_5_max\": 0.03977485677045894,\n", + " \"nauc_precision_at_5_std\": -0.10292184602358977,\n", + " \"nauc_recall_at_1000_diff1\": -0.03879062992990034,\n", + " \"nauc_recall_at_1000_max\": 0.36308263419153386,\n", + " \"nauc_recall_at_1000_std\": 0.47721898396760526,\n", + " \"nauc_recall_at_100_diff1\": 0.3211860920443005,\n", + " \"nauc_recall_at_100_max\": 0.4740132817599919,\n", + " \"nauc_recall_at_100_std\": 0.345639616995194,\n", + " \"nauc_recall_at_10_diff1\": 0.22279659689895054,\n", + " \"nauc_recall_at_10_max\": 0.16823918613192046,\n", + " \"nauc_recall_at_10_std\": 
0.037720969433127145,\n", + " \"nauc_recall_at_1_diff1\": 0.17346152653542332,\n", + " \"nauc_recall_at_1_max\": -0.09705499215630589,\n", + " \"nauc_recall_at_1_std\": -0.14726476953035533,\n", + " \"nauc_recall_at_20_diff1\": 0.23025740175221865,\n", + " \"nauc_recall_at_20_max\": 0.2892313928157675,\n", + " \"nauc_recall_at_20_std\": 0.13522755012490456,\n", + " \"nauc_recall_at_3_diff1\": 0.14108895270570979,\n", + " \"nauc_recall_at_3_max\": -0.010771302313529425,\n", + " \"nauc_recall_at_3_std\": -0.10744937823276134,\n", + " \"nauc_recall_at_5_diff1\": 0.14012953903010958,\n", + " \"nauc_recall_at_5_max\": 0.039774856770459645,\n", + " \"nauc_recall_at_5_std\": -0.10292184602358935,\n", + " \"ndcg_at_1\": 0.40754,\n", + " \"ndcg_at_10\": 0.63616,\n", + " \"ndcg_at_100\": 0.66063,\n", + " \"ndcg_at_1000\": 0.6613,\n", + " \"ndcg_at_20\": 0.65131,\n", + " \"ndcg_at_3\": 0.55717,\n", + " \"ndcg_at_5\": 0.59461,\n", + " \"precision_at_1\": 0.40754,\n", + " \"precision_at_10\": 0.08841,\n", + " \"precision_at_100\": 0.00991,\n", + " \"precision_at_1000\": 0.001,\n", + " \"precision_at_20\": 0.04716,\n", + " \"precision_at_3\": 0.22238,\n", + " \"precision_at_5\": 0.15149,\n", + " \"recall_at_1\": 0.40754,\n", + " \"recall_at_10\": 0.88407,\n", + " \"recall_at_100\": 0.99147,\n", + " \"recall_at_1000\": 0.99644,\n", + " \"recall_at_20\": 0.9431,\n", + " \"recall_at_3\": 0.66714,\n", + " \"recall_at_5\": 0.75747\n", + " }\n", + " ]\n", + " },\n", + " \"task_name\": \"ArguAna\"\n", + "}\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we've successfully run the evaluation using mteb! In the next tutorial, we'll show how to evaluate your model on the whole 56 tasks of English MTEB and compete with models on the leaderboard." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/_sources/tutorial/4_Evaluation/4.2.2.ipynb.txt b/_sources/tutorial/4_Evaluation/4.2.2.ipynb.txt new file mode 100644 index 00000000..aa71df61 --- /dev/null +++ b/_sources/tutorial/4_Evaluation/4.2.2.ipynb.txt @@ -0,0 +1,302 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# MTEB Leaderboard" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the last tutorial we show how to evaluate an embedding model on an dataset supported by MTEB. In this tutorial, we will go through how to do a full evaluation and compare the results with MTEB English leaderboard.\n", + "\n", + "Caution: Evaluation on the full Eng MTEB is very time consuming even with GPU. So we encourage you to go through the notebook to have an idea. And run the experiment when you have enough computing resource and time." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 0. 
Installation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install the packages we will use in your environment:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "%pip install sentence_transformers mteb" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Run the Evaluation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The MTEB English leaderboard contains 56 datasets on 7 tasks:\n", + "1. **Classification**: Use the embeddings to train a logistic regression on the train set and is scored on the test set. F1 is the main metric.\n", + "2. **Clustering**: Train a mini-batch k-means model with batch size 32 and k equals to the number of different labels. Then score using v-measure.\n", + "3. **Pair Classification**: A pair of text inputs is provided and a label which is a binary variable needs to be assigned. The main metric is average precision score.\n", + "4. **Reranking**: Rank a list of relevant and irrelevant reference texts according to a query. Metrics are mean MRR@k and MAP.\n", + "5. **Retrieval**: Each dataset comprises corpus, queries, and a mapping that links each query to its relevant documents within the corpus. The goal is to retrieve relevant documents for each query. The main metric is nDCG@k. MTEB directly adopts BEIR for the retrieval task.\n", + "6. **Semantic Textual Similarity (STS)**: Determine the similarity between each sentence pair. Spearman correlation based on cosine\n", + "similarity serves as the main metric.\n", + "7. **Summarization**: Only 1 dataset is used in this task. Score the machine-generated summaries to human-written summaries by computing distances of their embeddings. The main metric is also Spearman correlation based on cosine similarity.\n", + "\n", + "The benchmark is widely accepted by researchers and engineers to fairly evaluate and compare the performance of the models they train. Now let's take a look at the whole evaluation pipeline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Import the `MTEB_MAIN_EN` to check the all 56 datasets." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['AmazonCounterfactualClassification', 'AmazonPolarityClassification', 'AmazonReviewsClassification', 'ArguAna', 'ArxivClusteringP2P', 'ArxivClusteringS2S', 'AskUbuntuDupQuestions', 'BIOSSES', 'Banking77Classification', 'BiorxivClusteringP2P', 'BiorxivClusteringS2S', 'CQADupstackAndroidRetrieval', 'CQADupstackEnglishRetrieval', 'CQADupstackGamingRetrieval', 'CQADupstackGisRetrieval', 'CQADupstackMathematicaRetrieval', 'CQADupstackPhysicsRetrieval', 'CQADupstackProgrammersRetrieval', 'CQADupstackStatsRetrieval', 'CQADupstackTexRetrieval', 'CQADupstackUnixRetrieval', 'CQADupstackWebmastersRetrieval', 'CQADupstackWordpressRetrieval', 'ClimateFEVER', 'DBPedia', 'EmotionClassification', 'FEVER', 'FiQA2018', 'HotpotQA', 'ImdbClassification', 'MSMARCO', 'MTOPDomainClassification', 'MTOPIntentClassification', 'MassiveIntentClassification', 'MassiveScenarioClassification', 'MedrxivClusteringP2P', 'MedrxivClusteringS2S', 'MindSmallReranking', 'NFCorpus', 'NQ', 'QuoraRetrieval', 'RedditClustering', 'RedditClusteringP2P', 'SCIDOCS', 'SICK-R', 'STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'STS17', 'STS22', 'STSBenchmark', 'SciDocsRR', 'SciFact', 'SprintDuplicateQuestions', 'StackExchangeClustering', 'StackExchangeClusteringP2P', 'StackOverflowDupQuestions', 'SummEval', 'TRECCOVID', 'Touche2020', 'ToxicConversationsClassification', 'TweetSentimentExtractionClassification', 'TwentyNewsgroupsClustering', 'TwitterSemEval2015', 'TwitterURLCorpus']\n" + ] + } + ], + "source": [ + "import mteb\n", + "from mteb.benchmarks import MTEB_MAIN_EN\n", + "\n", + "print(MTEB_MAIN_EN.tasks)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load the model we want to evaluate:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sentence_transformers import SentenceTransformer\n", + "\n", + "model_name = \"BAAI/bge-base-en-v1.5\"\n", + "model = SentenceTransformer(model_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Alternatively, MTEB provides popular models on their leaderboard in order to reproduce their results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_name = \"BAAI/bge-base-en-v1.5\"\n", + "model = mteb.get_model(model_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then start to evaluate on each dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for task in MTEB_MAIN_EN.tasks:\n", + " # get the test set to evaluate on\n", + " eval_splits = [\"dev\"] if task == \"MSMARCO\" else [\"test\"]\n", + " evaluation = mteb.MTEB(\n", + " tasks=[task], task_langs=[\"en\"]\n", + " ) # Remove \"en\" to run all available languages\n", + " evaluation.run(\n", + " model, output_folder=\"results\", eval_splits=eval_splits\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Submit to MTEB Leaderboard" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After the evaluation is done, all the evaluation results should be stored in `results/{model_name}/{model_revision}`.\n", + "\n", + "Then run the following shell command to create the model_card.md. Change {model_name} and {model_revision} to your path." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!mteb create_meta --results_folder results/{model_name}/{model_revision} --output_path model_card.md" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For the case that the readme of that model already exists:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# !mteb create_meta --results_folder results/{model_name}/{model_revision} --output_path model_card.md --from_existing your_existing_readme.md " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copy and paste the contents of model_card.md to the top of README.md of your model on HF Hub. Now relax and wait for the daily refresh of leaderboard. Your model will show up soon!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Partially Evaluate" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that you don't need to finish all the tasks to get on to the leaderboard.\n", + "\n", + "For example you fine-tune a model's ability on clustering. And you only care about how your model performs with respoect to clustering, but not the other tasks. Then you can just test its performance on the clustering tasks of MTEB and submit to the leaderboard." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "TASK_LIST_CLUSTERING = [\n", + " \"ArxivClusteringP2P\",\n", + " \"ArxivClusteringS2S\",\n", + " \"BiorxivClusteringP2P\",\n", + " \"BiorxivClusteringS2S\",\n", + " \"MedrxivClusteringP2P\",\n", + " \"MedrxivClusteringS2S\",\n", + " \"RedditClustering\",\n", + " \"RedditClusteringP2P\",\n", + " \"StackExchangeClustering\",\n", + " \"StackExchangeClusteringP2P\",\n", + " \"TwentyNewsgroupsClustering\",\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Run the evaluation with only clustering tasks:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "evaluation = mteb.MTEB(tasks=TASK_LIST_CLUSTERING)\n", + "\n", + "results = evaluation.run(model, output_folder=\"results\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then repeat Step 2 to submit your model. After the leaderboard refresh, you can find your model in the \"Clustering\" section of the leaderboard." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Future Work" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "MTEB is working on a new version of English benchmark. It contains updated and concise tasks and will make the evaluation process faster.\n", + "\n", + "Please check out their [GitHub](https://github.com/embeddings-benchmark/mteb) page for future updates and releases." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/_sources/tutorial/4_Evaluation/4.3.1.ipynb.txt b/_sources/tutorial/4_Evaluation/4.3.1.ipynb.txt new file mode 100644 index 00000000..5832680f --- /dev/null +++ b/_sources/tutorial/4_Evaluation/4.3.1.ipynb.txt @@ -0,0 +1,240 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# C-MTEB" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "C-MTEB is the largest benchmark for Chinese text embeddings, similar to MTEB. In this tutorial, we will go through how to evaluate an embedding model's ability on Chinese tasks in C-MTEB." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 0. Installation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First install dependent packages:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install FlagEmbedding mteb" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Datasets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "C-MTEB uses similar task splits and metrics as English MTEB. It contains 35 datasets in 6 different tasks: Classification, Clustering, Pair Classification, Reranking, Retrieval, and Semantic Textual Similarity (STS). \n", + "\n", + "1. **Classification**: Use the embeddings to train a logistic regression on the train set and is scored on the test set. F1 is the main metric.\n", + "2. **Clustering**: Train a mini-batch k-means model with batch size 32 and k equals to the number of different labels. Then score using v-measure.\n", + "3. **Pair Classification**: A pair of text inputs is provided and a label which is a binary variable needs to be assigned. The main metric is average precision score.\n", + "4. **Reranking**: Rank a list of relevant and irrelevant reference texts according to a query. Metrics are mean MRR@k and MAP.\n", + "5. **Retrieval**: Each dataset comprises corpus, queries, and a mapping that links each query to its relevant documents within the corpus. The goal is to retrieve relevant documents for each query. The main metric is nDCG@k. MTEB directly adopts BEIR for the retrieval task.\n", + "6. **Semantic Textual Similarity (STS)**: Determine the similarity between each sentence pair. Spearman correlation based on cosine\n", + "similarity serves as the main metric.\n", + "\n", + "\n", + "Check the [HF page](https://huggingface.co/C-MTEB) for the details of each dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ChineseTaskList = [\n", + " 'TNews', 'IFlyTek', 'MultilingualSentiment', 'JDReview', 'OnlineShopping', 'Waimai',\n", + " 'CLSClusteringS2S.v2', 'CLSClusteringP2P.v2', 'ThuNewsClusteringS2S.v2', 'ThuNewsClusteringP2P.v2',\n", + " 'Ocnli', 'Cmnli',\n", + " 'T2Reranking', 'MMarcoReranking', 'CMedQAv1-reranking', 'CMedQAv2-reranking',\n", + " 'T2Retrieval', 'MMarcoRetrieval', 'DuRetrieval', 'CovidRetrieval', 'CmedqaRetrieval', 'EcomRetrieval', 'MedicalRetrieval', 'VideoRetrieval',\n", + " 'ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STSB', 'AFQMC', 'QBQTC'\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, load the model for evaluation. Note that the instruction here is used for retreival tasks." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from ...C_MTEB.flag_dres_model import FlagDRESModel\n", + "\n", + "instruction = \"为这个句子生成表示以用于检索相关文章:\"\n", + "model_name = \"BAAI/bge-base-zh-v1.5\"\n", + "\n", + "model = FlagDRESModel(model_name_or_path=\"BAAI/bge-base-zh-v1.5\",\n", + " query_instruction_for_retrieval=instruction,\n", + " pooling_method=\"cls\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Otherwise, you can load a model using sentence_transformers:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sentence_transformers import SentenceTransformer\n", + "\n", + "model = SentenceTransformer(\"PATH_TO_MODEL\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Or implement a class following the structure below:\n", + "\n", + "```python\n", + "class MyModel():\n", + " def __init__(self):\n", + " \"\"\"initialize the tokenizer and model\"\"\"\n", + " pass\n", + "\n", + " def encode(self, sentences, batch_size=32, **kwargs):\n", + " \"\"\" Returns a list of embeddings for the given sentences.\n", + " Args:\n", + " sentences (`List[str]`): List of sentences to encode\n", + " batch_size (`int`): Batch size for the encoding\n", + "\n", + " Returns:\n", + " `List[np.ndarray]` or `List[tensor]`: List of embeddings for the given sentences\n", + " \"\"\"\n", + " pass\n", + "\n", + "model = MyModel()\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Evaluate" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After we've prepared the dataset and model, we can start the evaluation. For time efficiency, we highly recommend to use GPU for evaluation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import mteb\n", + "from mteb import MTEB\n", + "\n", + "tasks = mteb.get_tasks(ChineseTaskList)\n", + "\n", + "for task in tasks:\n", + " evaluation = MTEB(tasks=[task])\n", + " evaluation.run(model, output_folder=f\"zh_results/{model_name.split('/')[-1]}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Submit to MTEB Leaderboard" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After the evaluation is done, all the evaluation results should be stored in `zh_results/{model_name}/`.\n", + "\n", + "Then run the following shell command to create the model_card.md. 
Change `{model_name}` and the rest of the path to match where your results are actually stored." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!!mteb create_meta --results_folder zh_results/{model_name}/ --output_path model_card.md" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copy and paste the contents of model_card.md to the top of the README.md of your model on the HF Hub. Then go to the [MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard) and choose the Chinese leaderboard to find your model! It will appear soon after the website's daily refresh." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/_sources/tutorial/5_Reranking.rst.txt b/_sources/tutorial/5_Reranking.rst.txt new file mode 100644 index 00000000..9f9a8cbc --- /dev/null +++ b/_sources/tutorial/5_Reranking.rst.txt @@ -0,0 +1,9 @@ +5. Reranking +============ + +.. toctree:: + :hidden: + :maxdepth: 1 + :caption: Reranking + + 5_Reranking/5.1 \ No newline at end of file diff --git a/_sources/tutorial/5_Reranking/5.1.ipynb.txt b/_sources/tutorial/5_Reranking/5.1.ipynb.txt new file mode 100644 index 00000000..b87c70ff --- /dev/null +++ b/_sources/tutorial/5_Reranking/5.1.ipynb.txt @@ -0,0 +1,574 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Reranker" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A reranker is designed with a cross-encoder architecture: it takes the query and the text at the same time and directly outputs their similarity score. It is better at scoring query-text relevance, but at the cost of slower speed. Thus, a complete retrieval system usually uses retrievers in the first stage for large-scale candidate retrieval, followed by rerankers that rerank the results more precisely.\n", + "\n", + "In this tutorial, we will go through a text retrieval pipeline with a reranker and evaluate the results before and after reranking.\n", + "\n", + "Note: Steps 1-4 are identical to the [evaluation](https://github.com/FlagOpen/FlagEmbedding/tree/master/Tutorials/4_Evaluation) tutorial. We suggest going through that first if you are not familiar with retrieval." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 0. Setup" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install the dependencies in the environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -U FlagEmbedding faiss-cpu" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. 
Dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Download and preprocess the MS Marco dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "import numpy as np\n", + "\n", + "data = load_dataset(\"namespace-Pt/msmarco\", split=\"dev\")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "queries = np.array(data[:100][\"query\"])\n", + "corpus = sum(data[:5000][\"positive\"], [])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Embedding" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Inference Embeddings: 100%|██████████| 21/21 [01:59<00:00, 5.68s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "shape of the corpus embeddings: (5331, 768)\n", + "data type of the embeddings: float32\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "from FlagEmbedding import FlagModel\n", + "\n", + "# get the BGE embedding model\n", + "model = FlagModel('BAAI/bge-base-en-v1.5',\n", + " query_instruction_for_retrieval=\"Represent this sentence for searching relevant passages:\",\n", + " use_fp16=True)\n", + "\n", + "# get the embedding of the corpus\n", + "corpus_embeddings = model.encode(corpus)\n", + "\n", + "print(\"shape of the corpus embeddings:\", corpus_embeddings.shape)\n", + "print(\"data type of the embeddings: \", corpus_embeddings.dtype)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Indexing" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total number of vectors: 5331\n" + ] + } + ], + "source": [ + "import faiss\n", + "\n", + "# get the length of our embedding vectors, vectors by bge-base-en-v1.5 have length 768\n", + "dim = corpus_embeddings.shape[-1]\n", + "\n", + "# create the faiss index and store the corpus embeddings into the vector space\n", + "index = faiss.index_factory(dim, 'Flat', faiss.METRIC_INNER_PRODUCT)\n", + "corpus_embeddings = corpus_embeddings.astype(np.float32)\n", + "index.train(corpus_embeddings)\n", + "index.add(corpus_embeddings)\n", + "\n", + "print(f\"total number of vectors: {index.ntotal}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. 
Retrieval" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "query_embeddings = model.encode_queries(queries)\n", + "ground_truths = [d[\"positive\"] for d in data]\n", + "corpus = np.asarray(corpus)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Searching: 100%|██████████| 1/1 [00:00<00:00, 22.35it/s]\n" + ] + } + ], + "source": [ + "from tqdm import tqdm\n", + "\n", + "res_scores, res_ids, res_text = [], [], []\n", + "query_size = len(query_embeddings)\n", + "batch_size = 256\n", + "# The cutoffs we will use during evaluation, and set k to be the maximum of the cutoffs.\n", + "cut_offs = [1, 10]\n", + "k = max(cut_offs)\n", + "\n", + "for i in tqdm(range(0, query_size, batch_size), desc=\"Searching\"):\n", + " q_embedding = query_embeddings[i: min(i+batch_size, query_size)].astype(np.float32)\n", + " # search the top k answers for each of the queries\n", + " score, idx = index.search(q_embedding, k=k)\n", + " res_scores += list(score)\n", + " res_ids += list(idx)\n", + " res_text += list(corpus[idx])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Reranking" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we will use a reranker to rerank the list of answers we retrieved using our index. Hopefully, this will lead to better results." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following table lists the available BGE rerankers. Feel free to try out to see their differences!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "| Model | Language | Parameters | Description | Base Model |\n", + "|:-------|:--------:|:----:|:-----------------:|:--------------------------------------:|\n", + "| [BAAI/bge-reranker-v2-m3](https://huggingface.co/BAAI/bge-reranker-v2-m3) | Multilingual | 568M | a lightweight cross-encoder model, possesses strong multilingual capabilities, easy to deploy, with fast inference. | XLM-RoBERTa-Large |\n", + "| [BAAI/bge-reranker-v2-gemma](https://huggingface.co/BAAI/bge-reranker-v2-gemma) | Multilingual | 2.51B | a cross-encoder model which is suitable for multilingual contexts, performs well in both English proficiency and multilingual capabilities. | Gemma2-2B |\n", + "| [BAAI/bge-reranker-v2-minicpm-layerwise](https://huggingface.co/BAAI/bge-reranker-v2-minicpm-layerwise) | Multilingual | 2.72B | a cross-encoder model which is suitable for multilingual contexts, performs well in both English and Chinese proficiency, allows freedom to select layers for output, facilitating accelerated inference. | MiniCPM |\n", + "| [BAAI/bge-reranker-v2.5-gemma2-lightweight](https://huggingface.co/BAAI/bge-reranker-v2.5-gemma2-lightweight) | Multilingual | 9.24B | a cross-encoder model which is suitable for multilingual contexts, performs well in both English and Chinese proficiency, allows freedom to select layers, compress ratio and compress layers for output, facilitating accelerated inference. 
| Gemma2-9B |\n", + "| [BAAI/bge-reranker-large](https://huggingface.co/BAAI/bge-reranker-large) | Chinese and English | 560M | a cross-encoder model which is more accurate but less efficient | XLM-RoBERTa-Large |\n", + "| [BAAI/bge-reranker-base](https://huggingface.co/BAAI/bge-reranker-base) | Chinese and English | 278M | a cross-encoder model which is more accurate but less efficient | XLM-RoBERTa-Base |" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, let's use a small example to see how reranker works:" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[-9.474676132202148, -2.823843240737915, 5.76226806640625]\n" + ] + } + ], + "source": [ + "from FlagEmbedding import FlagReranker\n", + "\n", + "reranker = FlagReranker('BAAI/bge-reranker-large', use_fp16=True) \n", + "# Setting use_fp16 to True speeds up computation with a slight performance degradation\n", + "\n", + "# use the compute_score() function to calculate scores for each input sentence pair\n", + "scores = reranker.compute_score([\n", + " ['what is panda?', 'Today is a sunny day'], \n", + " ['what is panda?', 'The tiger (Panthera tigris) is a member of the genus Panthera and the largest living cat species native to Asia.'],\n", + " ['what is panda?', 'The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.']\n", + " ])\n", + "print(scores)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let's use the reranker to rerank our previously retrieved results:" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "new_ids, new_scores, new_text = [], [], []\n", + "for i in range(len(queries)):\n", + " # get the new scores of the previously retrieved results\n", + " new_score = reranker.compute_score([[queries[i], text] for text in res_text[i]])\n", + " # sort the lists of ids and scores by the new scores\n", + " new_id = [tup[1] for tup in sorted(list(zip(new_score, res_ids[i])), reverse=True)]\n", + " new_scores.append(sorted(new_score, reverse=True))\n", + " new_ids.append(new_id)\n", + " new_text.append(corpus[new_id])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Evaluate" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For details of these metrics, please checkout the tutorial of [evaluation](https://github.com/FlagOpen/FlagEmbedding/tree/master/Tutorials/4_Evaluation)." 
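+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For quick reference, these are the standard definitions the sections below follow (a compact summary; the implementations in the following cells may differ slightly in how edge cases are handled):\n", + "\n", + "- **Recall@k**: how much of a query's ground truth is recovered within the top $k$ retrieved results, averaged over all queries.\n", + "- **MRR@k**: $\\mathrm{MRR}@k = \\frac{1}{|Q|}\\sum_{i=1}^{|Q|} \\frac{1}{\\mathrm{rank}_i}$, where $\\mathrm{rank}_i$ is the position of the first relevant passage in the top $k$ results for query $i$ (the term is $0$ if none appears).\n", + "- **nDCG@k**: $\\mathrm{nDCG}@k = \\mathrm{DCG}@k / \\mathrm{IDCG}@k$, with $\\mathrm{DCG}@k = \\sum_{j=1}^{k} \\frac{\\mathrm{rel}_j}{\\log_2(j+1)}$ and $\\mathrm{IDCG}@k$ the $\\mathrm{DCG}@k$ of the ideal ranking."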
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 6.1 Recall" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "def calc_recall(preds, truths, cutoffs):\n", + " recalls = np.zeros(len(cutoffs))\n", + " for text, truth in zip(preds, truths):\n", + " for i, c in enumerate(cutoffs):\n", + " recall = np.intersect1d(truth, text[:c])\n", + " recalls[i] += len(recall) / max(min(len(recall), len(truth)), 1)\n", + " recalls /= len(preds)\n", + " return recalls" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Before reranking:" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "recall@1:\t0.97\n", + "recall@10:\t1.0\n" + ] + } + ], + "source": [ + "recalls_init = calc_recall(res_text, ground_truths, cut_offs)\n", + "for i, c in enumerate(cut_offs):\n", + " print(f\"recall@{c}:\\t{recalls_init[i]}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After reranking:" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "recall@1:\t0.99\n", + "recall@10:\t1.0\n" + ] + } + ], + "source": [ + "recalls_rerank = calc_recall(new_text, ground_truths, cut_offs)\n", + "for i, c in enumerate(cut_offs):\n", + " print(f\"recall@{c}:\\t{recalls_rerank[i]}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 6.2 MRR" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "def MRR(preds, truth, cutoffs):\n", + " mrr = [0 for _ in range(len(cutoffs))]\n", + " for pred, t in zip(preds, truth):\n", + " for i, c in enumerate(cutoffs):\n", + " for j, p in enumerate(pred):\n", + " if j < c and p in t:\n", + " mrr[i] += 1/(j+1)\n", + " break\n", + " mrr = [k/len(preds) for k in mrr]\n", + " return mrr" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Before reranking:" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MRR@1:\t0.97\n", + "MRR@10:\t0.9825\n" + ] + } + ], + "source": [ + "mrr_init = MRR(res_text, ground_truths, cut_offs)\n", + "for i, c in enumerate(cut_offs):\n", + " print(f\"MRR@{c}:\\t{mrr_init[i]}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After reranking:" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MRR@1:\t0.99\n", + "MRR@10:\t0.995\n" + ] + } + ], + "source": [ + "mrr_rerank = MRR(new_text, ground_truths, cut_offs)\n", + "for i, c in enumerate(cut_offs):\n", + " print(f\"MRR@{c}:\\t{mrr_rerank[i]}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 6.3 nDCG" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Before reranking:" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "nDCG@1: 0.97\n", + "nDCG@10: 0.9869253606521631\n" + ] + } + ], + "source": [ + "from sklearn.metrics import ndcg_score\n", + "\n", + "pred_hard_encodings = []\n", + "for pred, label in zip(res_text, ground_truths):\n", + " pred_hard_encoding = 
list(np.isin(pred, label).astype(int))\n", + " pred_hard_encodings.append(pred_hard_encoding)\n", + "\n", + "for i, c in enumerate(cut_offs):\n", + " nDCG = ndcg_score(pred_hard_encodings, res_scores, k=c)\n", + " print(f\"nDCG@{c}: {nDCG}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After reranking:" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "nDCG@1: 0.99\n", + "nDCG@10: 0.9963092975357145\n" + ] + } + ], + "source": [ + "pred_hard_encodings_rerank = []\n", + "for pred, label in zip(new_text, ground_truths):\n", + " pred_hard_encoding = list(np.isin(pred, label).astype(int))\n", + " pred_hard_encodings_rerank.append(pred_hard_encoding)\n", + "\n", + "for i, c in enumerate(cut_offs):\n", + " nDCG = ndcg_score(pred_hard_encodings_rerank, new_scores, k=c)\n", + " print(f\"nDCG@{c}: {nDCG}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/_sources/tutorial/6_RAG.rst.txt b/_sources/tutorial/6_RAG.rst.txt new file mode 100644 index 00000000..2f3896df --- /dev/null +++ b/_sources/tutorial/6_RAG.rst.txt @@ -0,0 +1,11 @@ +6. RAG +====== + +.. toctree:: + :hidden: + :maxdepth: 1 + :caption: RAG + + 6_RAG/6.1 + 6_RAG/6.2 + 6_RAG/6.3 \ No newline at end of file diff --git a/_sources/tutorial/6_RAG/6.1.ipynb.txt b/_sources/tutorial/6_RAG/6.1.ipynb.txt new file mode 100644 index 00000000..bfa1afea --- /dev/null +++ b/_sources/tutorial/6_RAG/6.1.ipynb.txt @@ -0,0 +1,327 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Simple RAG From Scratch" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this tutorial, we will use BGE, Faiss, and OpenAI's GPT-4o-mini to build a simple RAG system from scratch." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 0. Preparation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install the required packages in the environment:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -U numpy faiss-cpu FlagEmbedding openai" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Suppose I'm a resident of Manhattan, New York, and I want the AI bot to suggest where I should go for dinner. It's not reliable to let it recommend some random restaurant, so let's provide a bunch of our favorite restaurants." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "corpus = [\n", + " \"Cheli: A downtown Chinese restaurant presents a distinctive dining experience with authentic and sophisticated flavors of Shanghai cuisine. Avg cost: $40-50\",\n", + " \"Masa: Midtown Japanese restaurant with exquisite sushi and omakase experiences crafted by renowned chef Masayoshi Takayama. 
The restaurant offers a luxurious dining atmosphere with a focus on the freshest ingredients and exceptional culinary artistry. Avg cost: $500-600\",\n", + " \"Per Se: A midtown restaurant features daily nine-course tasting menu and a nine-course vegetable tasting menu using classic French technique and the finest quality ingredients available. Avg cost: $300-400\",\n", + " \"Ortomare: A casual, earthy Italian restaurant locates uptown, offering wood-fired pizza, delicious pasta, wine & spirits & outdoor seating. Avg cost: $30-50\",\n", + " \"Banh: Relaxed, narrow restaurant in uptown, offering Vietnamese cuisine & sandwiches, famous for its pho and Vietnam sandwich. Avg cost: $20-30\",\n", + " \"Living Thai: An uptown typical Thai cuisine with different kinds of curry, Tom Yum, fried rice, Thai ice tea, etc. Avg cost: $20-30\",\n", + " \"Chick-fil-A: A Fast food restaurant with great chicken sandwich, fried chicken, fries, and salad, which can be found everywhere in New York. Avg cost: 10-20\",\n", + " \"Joe's Pizza: Most famous New York pizza locates midtown, serving different flavors including classic pepperoni, cheese, spinach, and also innovative pizza. Avg cost: $15-25\",\n", + " \"Red Lobster: In midtown, Red Lobster is a lively chain restaurant serving American seafood standards amid New England-themed decor, with fair price lobsters, shrips and crabs. Avg cost: $30-50\",\n", + " \"Bourbon Steak: It accomplishes all the traditions expected from a steakhouse, offering the finest cuts of premium beef and seafood complimented by wine and spirits program. Avg cost: $100-150\",\n", + " \"Da Long Yi: Locates in downtown, Da Long Yi is a Chinese Szechuan spicy hotpot restaurant that serves good quality meats. Avg cost: $30-50\",\n", + " \"Mitr Thai: An exquisite midtown Thai restaurant with traditional dishes as well as creative dishes, with a wonderful bar serving cocktails. Avg cost: $40-60\",\n", + " \"Yichiran Ramen: Famous Japenese ramen restaurant in both midtown and downtown, serving ramen that can be designed by customers themselves. Avg cost: $20-40\",\n", + " \"BCD Tofu House: Located in midtown, it's famous for its comforting and flavorful soondubu jjigae (soft tofu stew) and a variety of authentic Korean dishes. Avg cost: $30-50\",\n", + "]\n", + "\n", + "user_input = \"I want some Chinese food\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Indexing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we need to figure out a fast but powerful enough method to retrieve docs in the corpus that are most closely related to our questions. Indexing is a good choice for us.\n", + "\n", + "The first step is embed each document into a vector. We use bge-base-en-v1.5 as our embedding model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "from FlagEmbedding import FlagModel\n", + "\n", + "model = FlagModel('BAAI/bge-base-en-v1.5',\n", + " query_instruction_for_retrieval=\"Represent this sentence for searching relevant passages:\",\n", + " use_fp16=True)\n", + "\n", + "embeddings = model.encode(corpus, convert_to_numpy=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(14, 768)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "embeddings.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then, let's create a Faiss index and add all the vectors into it.\n", + "\n", + "If you want to know more about Faiss, refer to the tutorial of [Faiss and indexing](https://github.com/FlagOpen/FlagEmbedding/tree/master/Tutorials/3_Indexing)." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "import faiss\n", + "import numpy as np\n", + "\n", + "index = faiss.IndexFlatIP(embeddings.shape[1])\n", + "\n", + "index.add(embeddings)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "14" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "index.ntotal" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Retrieve and Generate" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we come to the most exciting part. Let's first embed our query and retrieve 3 most relevant document from it:" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([['Cheli: A downtown Chinese restaurant presents a distinctive dining experience with authentic and sophisticated flavors of Shanghai cuisine. Avg cost: $40-50',\n", + " 'Da Long Yi: Locates in downtown, Da Long Yi is a Chinese Szechuan spicy hotpot restaurant that serves good quality meats. Avg cost: $30-50',\n", + " 'Yichiran Ramen: Famous Japenese ramen restaurant in both midtown and downtown, serving ramen that can be designed by customers themselves. Avg cost: $20-40']],\n", + " dtype='\n", + "{context}\n", + "\n", + "\n", + "Question: {input}\n", + "\"\"\"\n", + "\n", + "# Create a prompt template\n", + "prompt = ChatPromptTemplate.from_template(template)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now everything is ready. Assemble them to a chain and let the magic happen!" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.chains.combine_documents import create_stuff_documents_chain\n", + "from langchain.chains import create_retrieval_chain\n", + "\n", + "doc_chain = create_stuff_documents_chain(llm, prompt)\n", + "chain = create_retrieval_chain(retriever, doc_chain)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Run the following cell, we can see that the chatbot can answer the question correctly!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "M3-Embedding stands for a new embedding model that is distinguished for its versatility in multi-linguality, multi-functionality, and multi-granularity.\n" + ] + } + ], + "source": [ + "response = chain.invoke({\"input\": \"What does M3-Embedding stands for?\"})\n", + "\n", + "# print the answer only\n", + "print(response['answer'])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/_sources/tutorial/6_RAG/6.3.ipynb.txt b/_sources/tutorial/6_RAG/6.3.ipynb.txt new file mode 100644 index 00000000..2defb7c3 --- /dev/null +++ b/_sources/tutorial/6_RAG/6.3.ipynb.txt @@ -0,0 +1,384 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# RAG with LlamaIndex" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "LlamaIndex is a very popular framework for building connections between data sources and LLMs. It is also a top choice when people want to build a RAG application. In this tutorial, we will go through how to use LlamaIndex to combine bge-base-en-v1.5 and GPT-4o-mini into a RAG application." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 0. Preparation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, install the required packages in the environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install llama-index-llms-openai llama-index-embeddings-huggingface llama-index-vector-stores-faiss\n", + "%pip install llama_index " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then fill in your OpenAI API key below:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# For openai key\n", + "import os\n", + "os.environ[\"OPENAI_API_KEY\"] = \"YOUR_API_KEY\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "BGE-M3 is a very powerful embedding model. We would like to know what that 'M3' stands for.\n", + "\n", + "Let's first ask GPT the question:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "M3-Embedding stands for Multimodal Multiscale Embedding. It is a technique used in machine learning and data analysis to embed high-dimensional data into a lower-dimensional space while preserving the structure and relationships within the data. 
This technique is particularly useful for analyzing complex datasets that contain multiple modalities or scales of information.\n" + ] + } + ], + "source": [ + "from llama_index.llms.openai import OpenAI\n", + "\n", + "# non-streaming\n", + "response = OpenAI().complete(\"What does M3-Embedding stands for?\")\n", + "print(response)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "By checking the description in the GitHub [repo](https://github.com/FlagOpen/FlagEmbedding/tree/master/FlagEmbedding/BGE_M3) of BGE-M3, we are pretty sure that GPT is hallucinating. Let's build a RAG pipeline to solve the problem!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, download the BGE-M3 [paper](https://arxiv.org/pdf/2402.03216) to a directory, and load it through `SimpleDirectoryReader`. \n", + "\n", + "Note that `SimpleDirectoryReader` can read all the documents under that directory and supports a lot of commonly used [file types](https://docs.llamaindex.ai/en/stable/module_guides/loading/simpledirectoryreader/#supported-file-types)." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index.core import SimpleDirectoryReader\n", + "\n", + "reader = SimpleDirectoryReader(\"data\")\n", + "# reader = SimpleDirectoryReader(\"DIR_TO_FILE\")\n", + "documents = reader.load_data()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `Settings` object holds the global settings for the RAG pipeline. Its attributes have defaults (OpenAI's GPT and embedding model) and can be modified by users. Large attributes like models are only loaded when they are used.\n", + "\n", + "Here, we set the `node_parser` to `SentenceSplitter()` with our chosen parameters, use the open-source `bge-base-en-v1.5` as our embedding model, and `gpt-4o-mini` as our LLM." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index.core import Settings\n", + "from llama_index.core.node_parser import SentenceSplitter\n", + "from llama_index.embeddings.huggingface import HuggingFaceEmbedding\n", + "from llama_index.llms.openai import OpenAI\n", + "\n", + "# set the parser with parameters\n", + "Settings.node_parser = SentenceSplitter(\n", + " chunk_size=1000, # Maximum size of chunks to return\n", + " chunk_overlap=150, # number of overlap characters between chunks\n", + ")\n", + "\n", + "# set the specific embedding model\n", + "Settings.embed_model = HuggingFaceEmbedding(model_name=\"BAAI/bge-base-en-v1.5\")\n", + "\n", + "# set the llm we want to use\n", + "Settings.llm = OpenAI(model=\"gpt-4o-mini\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Indexing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Indexing is one of the most important parts of RAG. LlamaIndex integrates a great number of vector databases. Here we will use Faiss as an example." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, check the dimension of the embeddings, which we will need to initialize the Faiss index."
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "768\n" + ] + } + ], + "source": [ + "embedding = Settings.embed_model.get_text_embedding(\"Hello world\")\n", + "dim = len(embedding)\n", + "print(dim)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then create the index with Faiss and our documents. Here LlamaIndex helps encapsulate the Faiss function calls. If you would like to know more about Faiss, refer to the tutorial of [Faiss and indexing](https://github.com/FlagOpen/FlagEmbedding/tree/master/Tutorials/3_Indexing)." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "import faiss\n", + "from llama_index.vector_stores.faiss import FaissVectorStore\n", + "from llama_index.core import StorageContext, VectorStoreIndex\n", + "\n", + "# init Faiss and create a vector store\n", + "faiss_index = faiss.IndexFlatL2(dim)\n", + "vector_store = FaissVectorStore(faiss_index=faiss_index)\n", + "\n", + "# customize the storage context using our vector store\n", + "storage_context = StorageContext.from_defaults(\n", + " vector_store=vector_store\n", + ")\n", + "\n", + "# use the loaded documents to build the index\n", + "index = VectorStoreIndex.from_documents(\n", + " documents, storage_context=storage_context\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Retrieve and Generate" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "With a well-constructed index, we can now build the query engine to accomplish our task:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "query_engine = index.as_query_engine()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following cell displays the default prompt template for Q&A in our pipeline:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Context information is below.\n", + "---------------------\n", + "{context_str}\n", + "---------------------\n", + "Given the context information and not prior knowledge, answer the query.\n", + "Query: {query_str}\n", + "Answer: \n" + ] + } + ], + "source": [ + "# check the default prompt template\n", + "prompt_template = query_engine.get_prompts()['response_synthesizer:text_qa_template']\n", + "print(prompt_template.get_template())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "(Optional) You could modify the prompt to match your use cases:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "You are a Q&A chat bot.\n", + "Use the given context only, answer the question.\n", + "\n", + "\n", + "{context_str}\n", + "\n", + "\n", + "Question: {query_str}\n", + "\n" + ] + } + ], + "source": [ + "from llama_index.core import PromptTemplate\n", + "\n", + "template = \"\"\"\n", + "You are a Q&A chat bot.\n", + "Use the given context only, answer the question.\n", + "\n", + "\n", + "{context_str}\n", + "\n", + "\n", + "Question: {query_str}\n", + "\"\"\"\n", + "\n", + "new_template = PromptTemplate(template)\n", + "query_engine.update_prompts(\n", + " {\"response_synthesizer:text_qa_template\": new_template}\n", + ")\n", + "\n", + 
"prompt_template = query_engine.get_prompts()['response_synthesizer:text_qa_template']\n", + "print(prompt_template.get_template())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, let's see how does the RAG application performs on our query!" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "M3-Embedding stands for Multi-Linguality, Multi-Functionality, and Multi-Granularity.\n" + ] + } + ], + "source": [ + "response = query_engine.query(\"What does M3-Embedding stands for?\")\n", + "print(response)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "test", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/_static/basic.css b/_static/basic.css new file mode 100644 index 00000000..7ebbd6d0 --- /dev/null +++ b/_static/basic.css @@ -0,0 +1,914 @@ +/* + * Sphinx stylesheet -- basic theme. + */ + +/* -- main layout ----------------------------------------------------------- */ + +div.clearer { + clear: both; +} + +div.section::after { + display: block; + content: ''; + clear: left; +} + +/* -- relbar ---------------------------------------------------------------- */ + +div.related { + width: 100%; + font-size: 90%; +} + +div.related h3 { + display: none; +} + +div.related ul { + margin: 0; + padding: 0 0 0 10px; + list-style: none; +} + +div.related li { + display: inline; +} + +div.related li.right { + float: right; + margin-right: 5px; +} + +/* -- sidebar --------------------------------------------------------------- */ + +div.sphinxsidebarwrapper { + padding: 10px 5px 0 10px; +} + +div.sphinxsidebar { + float: left; + width: 230px; + margin-left: -100%; + font-size: 90%; + word-wrap: break-word; + overflow-wrap : break-word; +} + +div.sphinxsidebar ul { + list-style: none; +} + +div.sphinxsidebar ul ul, +div.sphinxsidebar ul.want-points { + margin-left: 20px; + list-style: square; +} + +div.sphinxsidebar ul ul { + margin-top: 0; + margin-bottom: 0; +} + +div.sphinxsidebar form { + margin-top: 10px; +} + +div.sphinxsidebar input { + border: 1px solid #98dbcc; + font-family: sans-serif; + font-size: 1em; +} + +div.sphinxsidebar #searchbox form.search { + overflow: hidden; +} + +div.sphinxsidebar #searchbox input[type="text"] { + float: left; + width: 80%; + padding: 0.25em; + box-sizing: border-box; +} + +div.sphinxsidebar #searchbox input[type="submit"] { + float: left; + width: 20%; + border-left: none; + padding: 0.25em; + box-sizing: border-box; +} + + +img { + border: 0; + max-width: 100%; +} + +/* -- search page ----------------------------------------------------------- */ + +ul.search { + margin-top: 10px; +} + +ul.search li { + padding: 5px 0; +} + +ul.search li a { + font-weight: bold; +} + +ul.search li p.context { + color: #888; + margin: 2px 0 0 30px; + text-align: left; +} + +ul.keywordmatches li.goodmatch a { + font-weight: bold; +} + +/* -- index page ------------------------------------------------------------ */ + +table.contentstable { + width: 90%; + margin-left: auto; + margin-right: auto; +} + +table.contentstable p.biglink { + line-height: 150%; +} + +a.biglink { + font-size: 1.3em; +} + 
+span.linkdescr { + font-style: italic; + padding-top: 5px; + font-size: 90%; +} + +/* -- general index --------------------------------------------------------- */ + +table.indextable { + width: 100%; +} + +table.indextable td { + text-align: left; + vertical-align: top; +} + +table.indextable ul { + margin-top: 0; + margin-bottom: 0; + list-style-type: none; +} + +table.indextable > tbody > tr > td > ul { + padding-left: 0em; +} + +table.indextable tr.pcap { + height: 10px; +} + +table.indextable tr.cap { + margin-top: 10px; + background-color: #f2f2f2; +} + +img.toggler { + margin-right: 3px; + margin-top: 3px; + cursor: pointer; +} + +div.modindex-jumpbox { + border-top: 1px solid #ddd; + border-bottom: 1px solid #ddd; + margin: 1em 0 1em 0; + padding: 0.4em; +} + +div.genindex-jumpbox { + border-top: 1px solid #ddd; + border-bottom: 1px solid #ddd; + margin: 1em 0 1em 0; + padding: 0.4em; +} + +/* -- domain module index --------------------------------------------------- */ + +table.modindextable td { + padding: 2px; + border-collapse: collapse; +} + +/* -- general body styles --------------------------------------------------- */ + +div.body { + min-width: 360px; + max-width: 800px; +} + +div.body p, div.body dd, div.body li, div.body blockquote { + -moz-hyphens: auto; + -ms-hyphens: auto; + -webkit-hyphens: auto; + hyphens: auto; +} + +a.headerlink { + visibility: hidden; +} + +a:visited { + color: #551A8B; +} + +h1:hover > a.headerlink, +h2:hover > a.headerlink, +h3:hover > a.headerlink, +h4:hover > a.headerlink, +h5:hover > a.headerlink, +h6:hover > a.headerlink, +dt:hover > a.headerlink, +caption:hover > a.headerlink, +p.caption:hover > a.headerlink, +div.code-block-caption:hover > a.headerlink { + visibility: visible; +} + +div.body p.caption { + text-align: inherit; +} + +div.body td { + text-align: left; +} + +.first { + margin-top: 0 !important; +} + +p.rubric { + margin-top: 30px; + font-weight: bold; +} + +img.align-left, figure.align-left, .figure.align-left, object.align-left { + clear: left; + float: left; + margin-right: 1em; +} + +img.align-right, figure.align-right, .figure.align-right, object.align-right { + clear: right; + float: right; + margin-left: 1em; +} + +img.align-center, figure.align-center, .figure.align-center, object.align-center { + display: block; + margin-left: auto; + margin-right: auto; +} + +img.align-default, figure.align-default, .figure.align-default { + display: block; + margin-left: auto; + margin-right: auto; +} + +.align-left { + text-align: left; +} + +.align-center { + text-align: center; +} + +.align-default { + text-align: center; +} + +.align-right { + text-align: right; +} + +/* -- sidebars -------------------------------------------------------------- */ + +div.sidebar, +aside.sidebar { + margin: 0 0 0.5em 1em; + border: 1px solid #ddb; + padding: 7px; + background-color: #ffe; + width: 40%; + float: right; + clear: right; + overflow-x: auto; +} + +p.sidebar-title { + font-weight: bold; +} + +nav.contents, +aside.topic, +div.admonition, div.topic, blockquote { + clear: left; +} + +/* -- topics ---------------------------------------------------------------- */ + +nav.contents, +aside.topic, +div.topic { + border: 1px solid #ccc; + padding: 7px; + margin: 10px 0 10px 0; +} + +p.topic-title { + font-size: 1.1em; + font-weight: bold; + margin-top: 10px; +} + +/* -- admonitions ----------------------------------------------------------- */ + +div.admonition { + margin-top: 10px; + margin-bottom: 10px; + padding: 7px; +} + +div.admonition 
dt { + font-weight: bold; +} + +p.admonition-title { + margin: 0px 10px 5px 0px; + font-weight: bold; +} + +div.body p.centered { + text-align: center; + margin-top: 25px; +} + +/* -- content of sidebars/topics/admonitions -------------------------------- */ + +div.sidebar > :last-child, +aside.sidebar > :last-child, +nav.contents > :last-child, +aside.topic > :last-child, +div.topic > :last-child, +div.admonition > :last-child { + margin-bottom: 0; +} + +div.sidebar::after, +aside.sidebar::after, +nav.contents::after, +aside.topic::after, +div.topic::after, +div.admonition::after, +blockquote::after { + display: block; + content: ''; + clear: both; +} + +/* -- tables ---------------------------------------------------------------- */ + +table.docutils { + margin-top: 10px; + margin-bottom: 10px; + border: 0; + border-collapse: collapse; +} + +table.align-center { + margin-left: auto; + margin-right: auto; +} + +table.align-default { + margin-left: auto; + margin-right: auto; +} + +table caption span.caption-number { + font-style: italic; +} + +table caption span.caption-text { +} + +table.docutils td, table.docutils th { + padding: 1px 8px 1px 5px; + border-top: 0; + border-left: 0; + border-right: 0; + border-bottom: 1px solid #aaa; +} + +th { + text-align: left; + padding-right: 5px; +} + +table.citation { + border-left: solid 1px gray; + margin-left: 1px; +} + +table.citation td { + border-bottom: none; +} + +th > :first-child, +td > :first-child { + margin-top: 0px; +} + +th > :last-child, +td > :last-child { + margin-bottom: 0px; +} + +/* -- figures --------------------------------------------------------------- */ + +div.figure, figure { + margin: 0.5em; + padding: 0.5em; +} + +div.figure p.caption, figcaption { + padding: 0.3em; +} + +div.figure p.caption span.caption-number, +figcaption span.caption-number { + font-style: italic; +} + +div.figure p.caption span.caption-text, +figcaption span.caption-text { +} + +/* -- field list styles ----------------------------------------------------- */ + +table.field-list td, table.field-list th { + border: 0 !important; +} + +.field-list ul { + margin: 0; + padding-left: 1em; +} + +.field-list p { + margin: 0; +} + +.field-name { + -moz-hyphens: manual; + -ms-hyphens: manual; + -webkit-hyphens: manual; + hyphens: manual; +} + +/* -- hlist styles ---------------------------------------------------------- */ + +table.hlist { + margin: 1em 0; +} + +table.hlist td { + vertical-align: top; +} + +/* -- object description styles --------------------------------------------- */ + +.sig { + font-family: 'Consolas', 'Menlo', 'DejaVu Sans Mono', 'Bitstream Vera Sans Mono', monospace; +} + +.sig-name, code.descname { + background-color: transparent; + font-weight: bold; +} + +.sig-name { + font-size: 1.1em; +} + +code.descname { + font-size: 1.2em; +} + +.sig-prename, code.descclassname { + background-color: transparent; +} + +.optional { + font-size: 1.3em; +} + +.sig-paren { + font-size: larger; +} + +.sig-param.n { + font-style: italic; +} + +/* C++ specific styling */ + +.sig-inline.c-texpr, +.sig-inline.cpp-texpr { + font-family: unset; +} + +.sig.c .k, .sig.c .kt, +.sig.cpp .k, .sig.cpp .kt { + color: #0033B3; +} + +.sig.c .m, +.sig.cpp .m { + color: #1750EB; +} + +.sig.c .s, .sig.c .sc, +.sig.cpp .s, .sig.cpp .sc { + color: #067D17; +} + + +/* -- other body styles ----------------------------------------------------- */ + +ol.arabic { + list-style: decimal; +} + +ol.loweralpha { + list-style: lower-alpha; +} + +ol.upperalpha { + list-style: 
upper-alpha; +} + +ol.lowerroman { + list-style: lower-roman; +} + +ol.upperroman { + list-style: upper-roman; +} + +:not(li) > ol > li:first-child > :first-child, +:not(li) > ul > li:first-child > :first-child { + margin-top: 0px; +} + +:not(li) > ol > li:last-child > :last-child, +:not(li) > ul > li:last-child > :last-child { + margin-bottom: 0px; +} + +ol.simple ol p, +ol.simple ul p, +ul.simple ol p, +ul.simple ul p { + margin-top: 0; +} + +ol.simple > li:not(:first-child) > p, +ul.simple > li:not(:first-child) > p { + margin-top: 0; +} + +ol.simple p, +ul.simple p { + margin-bottom: 0; +} + +aside.footnote > span, +div.citation > span { + float: left; +} +aside.footnote > span:last-of-type, +div.citation > span:last-of-type { + padding-right: 0.5em; +} +aside.footnote > p { + margin-left: 2em; +} +div.citation > p { + margin-left: 4em; +} +aside.footnote > p:last-of-type, +div.citation > p:last-of-type { + margin-bottom: 0em; +} +aside.footnote > p:last-of-type:after, +div.citation > p:last-of-type:after { + content: ""; + clear: both; +} + +dl.field-list { + display: grid; + grid-template-columns: fit-content(30%) auto; +} + +dl.field-list > dt { + font-weight: bold; + word-break: break-word; + padding-left: 0.5em; + padding-right: 5px; +} + +dl.field-list > dd { + padding-left: 0.5em; + margin-top: 0em; + margin-left: 0em; + margin-bottom: 0em; +} + +dl { + margin-bottom: 15px; +} + +dd > :first-child { + margin-top: 0px; +} + +dd ul, dd table { + margin-bottom: 10px; +} + +dd { + margin-top: 3px; + margin-bottom: 10px; + margin-left: 30px; +} + +.sig dd { + margin-top: 0px; + margin-bottom: 0px; +} + +.sig dl { + margin-top: 0px; + margin-bottom: 0px; +} + +dl > dd:last-child, +dl > dd:last-child > :last-child { + margin-bottom: 0; +} + +dt:target, span.highlighted { + background-color: #fbe54e; +} + +rect.highlighted { + fill: #fbe54e; +} + +dl.glossary dt { + font-weight: bold; + font-size: 1.1em; +} + +.versionmodified { + font-style: italic; +} + +.system-message { + background-color: #fda; + padding: 5px; + border: 3px solid red; +} + +.footnote:target { + background-color: #ffa; +} + +.line-block { + display: block; + margin-top: 1em; + margin-bottom: 1em; +} + +.line-block .line-block { + margin-top: 0; + margin-bottom: 0; + margin-left: 1.5em; +} + +.guilabel, .menuselection { + font-family: sans-serif; +} + +.accelerator { + text-decoration: underline; +} + +.classifier { + font-style: oblique; +} + +.classifier:before { + font-style: normal; + margin: 0 0.5em; + content: ":"; + display: inline-block; +} + +abbr, acronym { + border-bottom: dotted 1px; + cursor: help; +} + +.translated { + background-color: rgba(207, 255, 207, 0.2) +} + +.untranslated { + background-color: rgba(255, 207, 207, 0.2) +} + +/* -- code displays --------------------------------------------------------- */ + +pre { + overflow: auto; + overflow-y: hidden; /* fixes display issues on Chrome browsers */ +} + +pre, div[class*="highlight-"] { + clear: both; +} + +span.pre { + -moz-hyphens: none; + -ms-hyphens: none; + -webkit-hyphens: none; + hyphens: none; + white-space: nowrap; +} + +div[class*="highlight-"] { + margin: 1em 0; +} + +td.linenos pre { + border: 0; + background-color: transparent; + color: #aaa; +} + +table.highlighttable { + display: block; +} + +table.highlighttable tbody { + display: block; +} + +table.highlighttable tr { + display: flex; +} + +table.highlighttable td { + margin: 0; + padding: 0; +} + +table.highlighttable td.linenos { + padding-right: 0.5em; +} + 
+table.highlighttable td.code { + flex: 1; + overflow: hidden; +} + +.highlight .hll { + display: block; +} + +div.highlight pre, +table.highlighttable pre { + margin: 0; +} + +div.code-block-caption + div { + margin-top: 0; +} + +div.code-block-caption { + margin-top: 1em; + padding: 2px 5px; + font-size: small; +} + +div.code-block-caption code { + background-color: transparent; +} + +table.highlighttable td.linenos, +span.linenos, +div.highlight span.gp { /* gp: Generic.Prompt */ + user-select: none; + -webkit-user-select: text; /* Safari fallback only */ + -webkit-user-select: none; /* Chrome/Safari */ + -moz-user-select: none; /* Firefox */ + -ms-user-select: none; /* IE10+ */ +} + +div.code-block-caption span.caption-number { + padding: 0.1em 0.3em; + font-style: italic; +} + +div.code-block-caption span.caption-text { +} + +div.literal-block-wrapper { + margin: 1em 0; +} + +code.xref, a code { + background-color: transparent; + font-weight: bold; +} + +h1 code, h2 code, h3 code, h4 code, h5 code, h6 code { + background-color: transparent; +} + +.viewcode-link { + float: right; +} + +.viewcode-back { + float: right; + font-family: sans-serif; +} + +div.viewcode-block:target { + margin: -1px -10px; + padding: 0 10px; +} + +/* -- math display ---------------------------------------------------------- */ + +img.math { + vertical-align: middle; +} + +div.body div.math p { + text-align: center; +} + +span.eqno { + float: right; +} + +span.eqno a.headerlink { + position: absolute; + z-index: 1; +} + +div.math:hover a.headerlink { + visibility: visible; +} + +/* -- printout stylesheet --------------------------------------------------- */ + +@media print { + div.document, + div.documentwrapper, + div.bodywrapper { + margin: 0 !important; + width: 100%; + } + + div.sphinxsidebar, + div.related, + div.footer, + #top-link { + display: none; + } +} \ No newline at end of file diff --git a/_static/debug.css b/_static/debug.css new file mode 100644 index 00000000..74d4aec3 --- /dev/null +++ b/_static/debug.css @@ -0,0 +1,69 @@ +/* + This CSS file should be overridden by the theme authors. It's + meant for debugging and developing the skeleton that this theme provides. 
+*/ +body { + font-family: -apple-system, "Segoe UI", Roboto, Helvetica, Arial, sans-serif, + "Apple Color Emoji", "Segoe UI Emoji"; + background: lavender; +} +.sb-announcement { + background: rgb(131, 131, 131); +} +.sb-announcement__inner { + background: black; + color: white; +} +.sb-header { + background: lightskyblue; +} +.sb-header__inner { + background: royalblue; + color: white; +} +.sb-header-secondary { + background: lightcyan; +} +.sb-header-secondary__inner { + background: cornflowerblue; + color: white; +} +.sb-sidebar-primary { + background: lightgreen; +} +.sb-main { + background: blanchedalmond; +} +.sb-main__inner { + background: antiquewhite; +} +.sb-header-article { + background: lightsteelblue; +} +.sb-article-container { + background: snow; +} +.sb-article-main { + background: white; +} +.sb-footer-article { + background: lightpink; +} +.sb-sidebar-secondary { + background: lightgoldenrodyellow; +} +.sb-footer-content { + background: plum; +} +.sb-footer-content__inner { + background: palevioletred; +} +.sb-footer { + background: pink; +} +.sb-footer__inner { + background: salmon; +} +.sb-article { + background: white; +} diff --git a/_static/doctools.js b/_static/doctools.js new file mode 100644 index 00000000..0398ebb9 --- /dev/null +++ b/_static/doctools.js @@ -0,0 +1,149 @@ +/* + * Base JavaScript utilities for all Sphinx HTML documentation. + */ +"use strict"; + +const BLACKLISTED_KEY_CONTROL_ELEMENTS = new Set([ + "TEXTAREA", + "INPUT", + "SELECT", + "BUTTON", +]); + +const _ready = (callback) => { + if (document.readyState !== "loading") { + callback(); + } else { + document.addEventListener("DOMContentLoaded", callback); + } +}; + +/** + * Small JavaScript module for the documentation. + */ +const Documentation = { + init: () => { + Documentation.initDomainIndexTable(); + Documentation.initOnKeyListeners(); + }, + + /** + * i18n support + */ + TRANSLATIONS: {}, + PLURAL_EXPR: (n) => (n === 1 ? 0 : 1), + LOCALE: "unknown", + + // gettext and ngettext don't access this so that the functions + // can safely bound to a different name (_ = Documentation.gettext) + gettext: (string) => { + const translated = Documentation.TRANSLATIONS[string]; + switch (typeof translated) { + case "undefined": + return string; // no translation + case "string": + return translated; // translation exists + default: + return translated[0]; // (singular, plural) translation tuple exists + } + }, + + ngettext: (singular, plural, n) => { + const translated = Documentation.TRANSLATIONS[singular]; + if (typeof translated !== "undefined") + return translated[Documentation.PLURAL_EXPR(n)]; + return n === 1 ? 
singular : plural; + }, + + addTranslations: (catalog) => { + Object.assign(Documentation.TRANSLATIONS, catalog.messages); + Documentation.PLURAL_EXPR = new Function( + "n", + `return (${catalog.plural_expr})` + ); + Documentation.LOCALE = catalog.locale; + }, + + /** + * helper function to focus on search bar + */ + focusSearchBar: () => { + document.querySelectorAll("input[name=q]")[0]?.focus(); + }, + + /** + * Initialise the domain index toggle buttons + */ + initDomainIndexTable: () => { + const toggler = (el) => { + const idNumber = el.id.substr(7); + const toggledRows = document.querySelectorAll(`tr.cg-${idNumber}`); + if (el.src.substr(-9) === "minus.png") { + el.src = `${el.src.substr(0, el.src.length - 9)}plus.png`; + toggledRows.forEach((el) => (el.style.display = "none")); + } else { + el.src = `${el.src.substr(0, el.src.length - 8)}minus.png`; + toggledRows.forEach((el) => (el.style.display = "")); + } + }; + + const togglerElements = document.querySelectorAll("img.toggler"); + togglerElements.forEach((el) => + el.addEventListener("click", (event) => toggler(event.currentTarget)) + ); + togglerElements.forEach((el) => (el.style.display = "")); + if (DOCUMENTATION_OPTIONS.COLLAPSE_INDEX) togglerElements.forEach(toggler); + }, + + initOnKeyListeners: () => { + // only install a listener if it is really needed + if ( + !DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS && + !DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS + ) + return; + + document.addEventListener("keydown", (event) => { + // bail for input elements + if (BLACKLISTED_KEY_CONTROL_ELEMENTS.has(document.activeElement.tagName)) return; + // bail with special keys + if (event.altKey || event.ctrlKey || event.metaKey) return; + + if (!event.shiftKey) { + switch (event.key) { + case "ArrowLeft": + if (!DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS) break; + + const prevLink = document.querySelector('link[rel="prev"]'); + if (prevLink && prevLink.href) { + window.location.href = prevLink.href; + event.preventDefault(); + } + break; + case "ArrowRight": + if (!DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS) break; + + const nextLink = document.querySelector('link[rel="next"]'); + if (nextLink && nextLink.href) { + window.location.href = nextLink.href; + event.preventDefault(); + } + break; + } + } + + // some keyboard layouts may need Shift to get / + switch (event.key) { + case "/": + if (!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) break; + Documentation.focusSearchBar(); + event.preventDefault(); + } + }); + }, +}; + +// quick alias for translations +const _ = Documentation.gettext; + +_ready(Documentation.init); diff --git a/_static/documentation_options.js b/_static/documentation_options.js new file mode 100644 index 00000000..7e4c114f --- /dev/null +++ b/_static/documentation_options.js @@ -0,0 +1,13 @@ +const DOCUMENTATION_OPTIONS = { + VERSION: '', + LANGUAGE: 'en', + COLLAPSE_INDEX: false, + BUILDER: 'html', + FILE_SUFFIX: '.html', + LINK_SUFFIX: '.html', + HAS_SOURCE: true, + SOURCELINK_SUFFIX: '.txt', + NAVIGATION_WITH_KEYS: false, + SHOW_SEARCH_SUMMARY: true, + ENABLE_SEARCH_SHORTCUTS: true, +}; \ No newline at end of file diff --git a/_static/file.png b/_static/file.png new file mode 100644 index 00000000..a858a410 Binary files /dev/null and b/_static/file.png differ diff --git a/_static/img/BAAI_logo.png b/_static/img/BAAI_logo.png new file mode 100644 index 00000000..c39cc6fd Binary files /dev/null and b/_static/img/BAAI_logo.png differ diff --git a/_static/img/C_MTEB.png b/_static/img/C_MTEB.png new file mode 100644 
index 00000000..0b0f0941 Binary files /dev/null and b/_static/img/C_MTEB.png differ diff --git a/_static/img/bge_logo.jpg b/_static/img/bge_logo.jpg new file mode 100644 index 00000000..e9560649 Binary files /dev/null and b/_static/img/bge_logo.jpg differ diff --git a/_static/img/projects.png b/_static/img/projects.png new file mode 100644 index 00000000..69104541 Binary files /dev/null and b/_static/img/projects.png differ diff --git a/_static/language_data.js b/_static/language_data.js new file mode 100644 index 00000000..c7fe6c6f --- /dev/null +++ b/_static/language_data.js @@ -0,0 +1,192 @@ +/* + * This script contains the language-specific data used by searchtools.js, + * namely the list of stopwords, stemmer, scorer and splitter. + */ + +var stopwords = ["a", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "near", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"]; + + +/* Non-minified version is copied as a separate JS file, if available */ + +/** + * Porter Stemmer + */ +var Stemmer = function() { + + var step2list = { + ational: 'ate', + tional: 'tion', + enci: 'ence', + anci: 'ance', + izer: 'ize', + bli: 'ble', + alli: 'al', + entli: 'ent', + eli: 'e', + ousli: 'ous', + ization: 'ize', + ation: 'ate', + ator: 'ate', + alism: 'al', + iveness: 'ive', + fulness: 'ful', + ousness: 'ous', + aliti: 'al', + iviti: 'ive', + biliti: 'ble', + logi: 'log' + }; + + var step3list = { + icate: 'ic', + ative: '', + alize: 'al', + iciti: 'ic', + ical: 'ic', + ful: '', + ness: '' + }; + + var c = "[^aeiou]"; // consonant + var v = "[aeiouy]"; // vowel + var C = c + "[^aeiouy]*"; // consonant sequence + var V = v + "[aeiou]*"; // vowel sequence + + var mgr0 = "^(" + C + ")?" + V + C; // [C]VC... is m>0 + var meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$"; // [C]VC[V] is m=1 + var mgr1 = "^(" + C + ")?" + V + C + V + C; // [C]VCVC... is m>1 + var s_v = "^(" + C + ")?" 
+ v; // vowel in stem + + this.stemWord = function (w) { + var stem; + var suffix; + var firstch; + var origword = w; + + if (w.length < 3) + return w; + + var re; + var re2; + var re3; + var re4; + + firstch = w.substr(0,1); + if (firstch == "y") + w = firstch.toUpperCase() + w.substr(1); + + // Step 1a + re = /^(.+?)(ss|i)es$/; + re2 = /^(.+?)([^s])s$/; + + if (re.test(w)) + w = w.replace(re,"$1$2"); + else if (re2.test(w)) + w = w.replace(re2,"$1$2"); + + // Step 1b + re = /^(.+?)eed$/; + re2 = /^(.+?)(ed|ing)$/; + if (re.test(w)) { + var fp = re.exec(w); + re = new RegExp(mgr0); + if (re.test(fp[1])) { + re = /.$/; + w = w.replace(re,""); + } + } + else if (re2.test(w)) { + var fp = re2.exec(w); + stem = fp[1]; + re2 = new RegExp(s_v); + if (re2.test(stem)) { + w = stem; + re2 = /(at|bl|iz)$/; + re3 = new RegExp("([^aeiouylsz])\\1$"); + re4 = new RegExp("^" + C + v + "[^aeiouwxy]$"); + if (re2.test(w)) + w = w + "e"; + else if (re3.test(w)) { + re = /.$/; + w = w.replace(re,""); + } + else if (re4.test(w)) + w = w + "e"; + } + } + + // Step 1c + re = /^(.+?)y$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + re = new RegExp(s_v); + if (re.test(stem)) + w = stem + "i"; + } + + // Step 2 + re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + suffix = fp[2]; + re = new RegExp(mgr0); + if (re.test(stem)) + w = stem + step2list[suffix]; + } + + // Step 3 + re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + suffix = fp[2]; + re = new RegExp(mgr0); + if (re.test(stem)) + w = stem + step3list[suffix]; + } + + // Step 4 + re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/; + re2 = /^(.+?)(s|t)(ion)$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + re = new RegExp(mgr1); + if (re.test(stem)) + w = stem; + } + else if (re2.test(w)) { + var fp = re2.exec(w); + stem = fp[1] + fp[2]; + re2 = new RegExp(mgr1); + if (re2.test(stem)) + w = stem; + } + + // Step 5 + re = /^(.+?)e$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + re = new RegExp(mgr1); + re2 = new RegExp(meq1); + re3 = new RegExp("^" + C + v + "[^aeiouwxy]$"); + if (re.test(stem) || (re2.test(stem) && !(re3.test(stem)))) + w = stem; + } + re = /ll$/; + re2 = new RegExp(mgr1); + if (re.test(w) && re2.test(w)) { + re = /.$/; + w = w.replace(re,""); + } + + // and turn initial Y back to y + if (firstch == "y") + w = firstch.toLowerCase() + w.substr(1); + return w; + } +} + diff --git a/_static/minus.png b/_static/minus.png new file mode 100644 index 00000000..d96755fd Binary files /dev/null and b/_static/minus.png differ diff --git a/_static/mystnb.4510f1fc1dee50b3e5859aac5469c37c29e427902b24a333a5f9fcb2f0b3ac41.css b/_static/mystnb.4510f1fc1dee50b3e5859aac5469c37c29e427902b24a333a5f9fcb2f0b3ac41.css new file mode 100644 index 00000000..33566310 --- /dev/null +++ b/_static/mystnb.4510f1fc1dee50b3e5859aac5469c37c29e427902b24a333a5f9fcb2f0b3ac41.css @@ -0,0 +1,2342 @@ +/* Variables */ +:root { + --mystnb-source-bg-color: #f7f7f7; + --mystnb-stdout-bg-color: #fcfcfc; + --mystnb-stderr-bg-color: #fdd; + --mystnb-traceback-bg-color: #fcfcfc; + --mystnb-source-border-color: #ccc; + --mystnb-source-margin-color: green; + --mystnb-stdout-border-color: #f7f7f7; + --mystnb-stderr-border-color: #f7f7f7; + --mystnb-traceback-border-color: #ffd6d6; + 
--mystnb-hide-prompt-opacity: 70%; + --mystnb-source-border-radius: .4em; + --mystnb-source-border-width: 1px; +} + +/* Whole cell */ +div.container.cell { + padding-left: 0; + margin-bottom: 1em; +} + +/* Removing all background formatting so we can control at the div level */ +.cell_input div.highlight, +.cell_output pre, +.cell_input pre, +.cell_output .output { + border: none; + box-shadow: none; +} + +.cell_output .output pre, +.cell_input pre { + margin: 0px; +} + +/* Input cells */ +div.cell div.cell_input, +div.cell details.above-input>summary { + padding-left: 0em; + padding-right: 0em; + border: var(--mystnb-source-border-width) var(--mystnb-source-border-color) solid; + background-color: var(--mystnb-source-bg-color); + border-left-color: var(--mystnb-source-margin-color); + border-left-width: medium; + border-radius: var(--mystnb-source-border-radius); +} + +div.cell_input>div, +div.cell_output div.output>div.highlight { + margin: 0em !important; + border: none !important; +} + +/* All cell outputs */ +.cell_output { + padding-left: 1em; + padding-right: 0em; + margin-top: 1em; +} + +/* Text outputs from cells */ +.cell_output .output.text_plain, +.cell_output .output.traceback, +.cell_output .output.stream, +.cell_output .output.stderr { + margin-top: 1em; + margin-bottom: 0em; + box-shadow: none; +} + +.cell_output .output.text_plain, +.cell_output .output.stream { + background: var(--mystnb-stdout-bg-color); + border: 1px solid var(--mystnb-stdout-border-color); +} + +.cell_output .output.stderr { + background: var(--mystnb-stderr-bg-color); + border: 1px solid var(--mystnb-stderr-border-color); +} + +.cell_output .output.traceback { + background: var(--mystnb-traceback-bg-color); + border: 1px solid var(--mystnb-traceback-border-color); +} + +/* Collapsible cell content */ +div.cell details.above-input div.cell_input { + border-top-left-radius: 0; + border-top-right-radius: 0; + border-top: var(--mystnb-source-border-width) var(--mystnb-source-border-color) dashed; +} + +div.cell div.cell_input.above-output-prompt { + border-bottom-left-radius: 0; + border-bottom-right-radius: 0; +} + +div.cell details.above-input>summary { + border-bottom-left-radius: 0; + border-bottom-right-radius: 0; + border-bottom: var(--mystnb-source-border-width) var(--mystnb-source-border-color) dashed; + padding-left: 1em; + margin-bottom: 0; +} + +div.cell details.above-output>summary { + background-color: var(--mystnb-source-bg-color); + padding-left: 1em; + padding-right: 0em; + border: var(--mystnb-source-border-width) var(--mystnb-source-border-color) solid; + border-radius: var(--mystnb-source-border-radius); + border-left-color: var(--mystnb-source-margin-color); + border-left-width: medium; +} + +div.cell details.below-input>summary { + background-color: var(--mystnb-source-bg-color); + padding-left: 1em; + padding-right: 0em; + border: var(--mystnb-source-border-width) var(--mystnb-source-border-color) solid; + border-top: none; + border-bottom-left-radius: var(--mystnb-source-border-radius); + border-bottom-right-radius: var(--mystnb-source-border-radius); + border-left-color: var(--mystnb-source-margin-color); + border-left-width: medium; +} + +div.cell details.hide>summary>span { + opacity: var(--mystnb-hide-prompt-opacity); +} + +div.cell details.hide[open]>summary>span.collapsed { + display: none; +} + +div.cell details.hide:not([open])>summary>span.expanded { + display: none; +} + +@keyframes collapsed-fade-in { + 0% { + opacity: 0; + } + + 100% { + opacity: 1; + } +} +div.cell 
details.hide[open]>summary~* { + -moz-animation: collapsed-fade-in 0.3s ease-in-out; + -webkit-animation: collapsed-fade-in 0.3s ease-in-out; + animation: collapsed-fade-in 0.3s ease-in-out; +} + +/* Math align to the left */ +.cell_output .MathJax_Display { + text-align: left !important; +} + +/* Pandas tables. Pulled from the Jupyter / nbsphinx CSS */ +div.cell_output table { + border: none; + border-collapse: collapse; + border-spacing: 0; + color: black; + font-size: 1em; + table-layout: fixed; +} + +div.cell_output thead { + border-bottom: 1px solid black; + vertical-align: bottom; +} + +div.cell_output tr, +div.cell_output th, +div.cell_output td { + text-align: right; + vertical-align: middle; + padding: 0.5em 0.5em; + line-height: normal; + white-space: normal; + max-width: none; + border: none; +} + +div.cell_output th { + font-weight: bold; +} + +div.cell_output tbody tr:nth-child(odd) { + background: #f5f5f5; +} + +div.cell_output tbody tr:hover { + background: rgba(66, 165, 245, 0.2); +} + +/** source code line numbers **/ +span.linenos { + opacity: 0.5; +} + +/* Inline text from `paste` operation */ + +span.pasted-text { + font-weight: bold; +} + +span.pasted-inline img { + max-height: 2em; +} + +tbody span.pasted-inline img { + max-height: none; +} + +/* Font colors for translated ANSI escape sequences +Color values are copied from Jupyter Notebook +https://github.com/jupyter/notebook/blob/52581f8eda9b319eb0390ac77fe5903c38f81e3e/notebook/static/notebook/less/ansicolors.less#L14-L21 +Background colors from +https://nbsphinx.readthedocs.io/en/latest/code-cells.html#ANSI-Colors +*/ +div.highlight .-Color-Bold { + font-weight: bold; +} + +div.highlight .-Color[class*=-Black] { + color: #3E424D +} + +div.highlight .-Color[class*=-Red] { + color: #E75C58 +} + +div.highlight .-Color[class*=-Green] { + color: #00A250 +} + +div.highlight .-Color[class*=-Yellow] { + color: #DDB62B +} + +div.highlight .-Color[class*=-Blue] { + color: #208FFB +} + +div.highlight .-Color[class*=-Magenta] { + color: #D160C4 +} + +div.highlight .-Color[class*=-Cyan] { + color: #60C6C8 +} + +div.highlight .-Color[class*=-White] { + color: #C5C1B4 +} + +div.highlight .-Color[class*=-BGBlack] { + background-color: #3E424D +} + +div.highlight .-Color[class*=-BGRed] { + background-color: #E75C58 +} + +div.highlight .-Color[class*=-BGGreen] { + background-color: #00A250 +} + +div.highlight .-Color[class*=-BGYellow] { + background-color: #DDB62B +} + +div.highlight .-Color[class*=-BGBlue] { + background-color: #208FFB +} + +div.highlight .-Color[class*=-BGMagenta] { + background-color: #D160C4 +} + +div.highlight .-Color[class*=-BGCyan] { + background-color: #60C6C8 +} + +div.highlight .-Color[class*=-BGWhite] { + background-color: #C5C1B4 +} + +/* Font colors for 8-bit ANSI */ + +div.highlight .-Color[class*=-C0] { + color: #000000 +} + +div.highlight .-Color[class*=-BGC0] { + background-color: #000000 +} + +div.highlight .-Color[class*=-C1] { + color: #800000 +} + +div.highlight .-Color[class*=-BGC1] { + background-color: #800000 +} + +div.highlight .-Color[class*=-C2] { + color: #008000 +} + +div.highlight .-Color[class*=-BGC2] { + background-color: #008000 +} + +div.highlight .-Color[class*=-C3] { + color: #808000 +} + +div.highlight .-Color[class*=-BGC3] { + background-color: #808000 +} + +div.highlight .-Color[class*=-C4] { + color: #000080 +} + +div.highlight .-Color[class*=-BGC4] { + background-color: #000080 +} + +div.highlight .-Color[class*=-C5] { + color: #800080 +} + +div.highlight 
.-Color[class*=-BGC5] { + background-color: #800080 +} + +div.highlight .-Color[class*=-C6] { + color: #008080 +} + +div.highlight .-Color[class*=-BGC6] { + background-color: #008080 +} + +div.highlight .-Color[class*=-C7] { + color: #C0C0C0 +} + +div.highlight .-Color[class*=-BGC7] { + background-color: #C0C0C0 +} + +div.highlight .-Color[class*=-C8] { + color: #808080 +} + +div.highlight .-Color[class*=-BGC8] { + background-color: #808080 +} + +div.highlight .-Color[class*=-C9] { + color: #FF0000 +} + +div.highlight .-Color[class*=-BGC9] { + background-color: #FF0000 +} + +div.highlight .-Color[class*=-C10] { + color: #00FF00 +} + +div.highlight .-Color[class*=-BGC10] { + background-color: #00FF00 +} + +div.highlight .-Color[class*=-C11] { + color: #FFFF00 +} + +div.highlight .-Color[class*=-BGC11] { + background-color: #FFFF00 +} + +div.highlight .-Color[class*=-C12] { + color: #0000FF +} + +div.highlight .-Color[class*=-BGC12] { + background-color: #0000FF +} + +div.highlight .-Color[class*=-C13] { + color: #FF00FF +} + +div.highlight .-Color[class*=-BGC13] { + background-color: #FF00FF +} + +div.highlight .-Color[class*=-C14] { + color: #00FFFF +} + +div.highlight .-Color[class*=-BGC14] { + background-color: #00FFFF +} + +div.highlight .-Color[class*=-C15] { + color: #FFFFFF +} + +div.highlight .-Color[class*=-BGC15] { + background-color: #FFFFFF +} + +div.highlight .-Color[class*=-C16] { + color: #000000 +} + +div.highlight .-Color[class*=-BGC16] { + background-color: #000000 +} + +div.highlight .-Color[class*=-C17] { + color: #00005F +} + +div.highlight .-Color[class*=-BGC17] { + background-color: #00005F +} + +div.highlight .-Color[class*=-C18] { + color: #000087 +} + +div.highlight .-Color[class*=-BGC18] { + background-color: #000087 +} + +div.highlight .-Color[class*=-C19] { + color: #0000AF +} + +div.highlight .-Color[class*=-BGC19] { + background-color: #0000AF +} + +div.highlight .-Color[class*=-C20] { + color: #0000D7 +} + +div.highlight .-Color[class*=-BGC20] { + background-color: #0000D7 +} + +div.highlight .-Color[class*=-C21] { + color: #0000FF +} + +div.highlight .-Color[class*=-BGC21] { + background-color: #0000FF +} + +div.highlight .-Color[class*=-C22] { + color: #005F00 +} + +div.highlight .-Color[class*=-BGC22] { + background-color: #005F00 +} + +div.highlight .-Color[class*=-C23] { + color: #005F5F +} + +div.highlight .-Color[class*=-BGC23] { + background-color: #005F5F +} + +div.highlight .-Color[class*=-C24] { + color: #005F87 +} + +div.highlight .-Color[class*=-BGC24] { + background-color: #005F87 +} + +div.highlight .-Color[class*=-C25] { + color: #005FAF +} + +div.highlight .-Color[class*=-BGC25] { + background-color: #005FAF +} + +div.highlight .-Color[class*=-C26] { + color: #005FD7 +} + +div.highlight .-Color[class*=-BGC26] { + background-color: #005FD7 +} + +div.highlight .-Color[class*=-C27] { + color: #005FFF +} + +div.highlight .-Color[class*=-BGC27] { + background-color: #005FFF +} + +div.highlight .-Color[class*=-C28] { + color: #008700 +} + +div.highlight .-Color[class*=-BGC28] { + background-color: #008700 +} + +div.highlight .-Color[class*=-C29] { + color: #00875F +} + +div.highlight .-Color[class*=-BGC29] { + background-color: #00875F +} + +div.highlight .-Color[class*=-C30] { + color: #008787 +} + +div.highlight .-Color[class*=-BGC30] { + background-color: #008787 +} + +div.highlight .-Color[class*=-C31] { + color: #0087AF +} + +div.highlight .-Color[class*=-BGC31] { + background-color: #0087AF +} + +div.highlight .-Color[class*=-C32] { + color: 
#0087D7 +} + +div.highlight .-Color[class*=-BGC32] { + background-color: #0087D7 +} + +div.highlight .-Color[class*=-C33] { + color: #0087FF +} + +div.highlight .-Color[class*=-BGC33] { + background-color: #0087FF +} + +div.highlight .-Color[class*=-C34] { + color: #00AF00 +} + +div.highlight .-Color[class*=-BGC34] { + background-color: #00AF00 +} + +div.highlight .-Color[class*=-C35] { + color: #00AF5F +} + +div.highlight .-Color[class*=-BGC35] { + background-color: #00AF5F +} + +div.highlight .-Color[class*=-C36] { + color: #00AF87 +} + +div.highlight .-Color[class*=-BGC36] { + background-color: #00AF87 +} + +div.highlight .-Color[class*=-C37] { + color: #00AFAF +} + +div.highlight .-Color[class*=-BGC37] { + background-color: #00AFAF +} + +div.highlight .-Color[class*=-C38] { + color: #00AFD7 +} + +div.highlight .-Color[class*=-BGC38] { + background-color: #00AFD7 +} + +div.highlight .-Color[class*=-C39] { + color: #00AFFF +} + +div.highlight .-Color[class*=-BGC39] { + background-color: #00AFFF +} + +div.highlight .-Color[class*=-C40] { + color: #00D700 +} + +div.highlight .-Color[class*=-BGC40] { + background-color: #00D700 +} + +div.highlight .-Color[class*=-C41] { + color: #00D75F +} + +div.highlight .-Color[class*=-BGC41] { + background-color: #00D75F +} + +div.highlight .-Color[class*=-C42] { + color: #00D787 +} + +div.highlight .-Color[class*=-BGC42] { + background-color: #00D787 +} + +div.highlight .-Color[class*=-C43] { + color: #00D7AF +} + +div.highlight .-Color[class*=-BGC43] { + background-color: #00D7AF +} + +div.highlight .-Color[class*=-C44] { + color: #00D7D7 +} + +div.highlight .-Color[class*=-BGC44] { + background-color: #00D7D7 +} + +div.highlight .-Color[class*=-C45] { + color: #00D7FF +} + +div.highlight .-Color[class*=-BGC45] { + background-color: #00D7FF +} + +div.highlight .-Color[class*=-C46] { + color: #00FF00 +} + +div.highlight .-Color[class*=-BGC46] { + background-color: #00FF00 +} + +div.highlight .-Color[class*=-C47] { + color: #00FF5F +} + +div.highlight .-Color[class*=-BGC47] { + background-color: #00FF5F +} + +div.highlight .-Color[class*=-C48] { + color: #00FF87 +} + +div.highlight .-Color[class*=-BGC48] { + background-color: #00FF87 +} + +div.highlight .-Color[class*=-C49] { + color: #00FFAF +} + +div.highlight .-Color[class*=-BGC49] { + background-color: #00FFAF +} + +div.highlight .-Color[class*=-C50] { + color: #00FFD7 +} + +div.highlight .-Color[class*=-BGC50] { + background-color: #00FFD7 +} + +div.highlight .-Color[class*=-C51] { + color: #00FFFF +} + +div.highlight .-Color[class*=-BGC51] { + background-color: #00FFFF +} + +div.highlight .-Color[class*=-C52] { + color: #5F0000 +} + +div.highlight .-Color[class*=-BGC52] { + background-color: #5F0000 +} + +div.highlight .-Color[class*=-C53] { + color: #5F005F +} + +div.highlight .-Color[class*=-BGC53] { + background-color: #5F005F +} + +div.highlight .-Color[class*=-C54] { + color: #5F0087 +} + +div.highlight .-Color[class*=-BGC54] { + background-color: #5F0087 +} + +div.highlight .-Color[class*=-C55] { + color: #5F00AF +} + +div.highlight .-Color[class*=-BGC55] { + background-color: #5F00AF +} + +div.highlight .-Color[class*=-C56] { + color: #5F00D7 +} + +div.highlight .-Color[class*=-BGC56] { + background-color: #5F00D7 +} + +div.highlight .-Color[class*=-C57] { + color: #5F00FF +} + +div.highlight .-Color[class*=-BGC57] { + background-color: #5F00FF +} + +div.highlight .-Color[class*=-C58] { + color: #5F5F00 +} + +div.highlight .-Color[class*=-BGC58] { + background-color: #5F5F00 +} + 
+div.highlight .-Color[class*=-C59] { + color: #5F5F5F +} + +div.highlight .-Color[class*=-BGC59] { + background-color: #5F5F5F +} + +div.highlight .-Color[class*=-C60] { + color: #5F5F87 +} + +div.highlight .-Color[class*=-BGC60] { + background-color: #5F5F87 +} + +div.highlight .-Color[class*=-C61] { + color: #5F5FAF +} + +div.highlight .-Color[class*=-BGC61] { + background-color: #5F5FAF +} + +div.highlight .-Color[class*=-C62] { + color: #5F5FD7 +} + +div.highlight .-Color[class*=-BGC62] { + background-color: #5F5FD7 +} + +div.highlight .-Color[class*=-C63] { + color: #5F5FFF +} + +div.highlight .-Color[class*=-BGC63] { + background-color: #5F5FFF +} + +div.highlight .-Color[class*=-C64] { + color: #5F8700 +} + +div.highlight .-Color[class*=-BGC64] { + background-color: #5F8700 +} + +div.highlight .-Color[class*=-C65] { + color: #5F875F +} + +div.highlight .-Color[class*=-BGC65] { + background-color: #5F875F +} + +div.highlight .-Color[class*=-C66] { + color: #5F8787 +} + +div.highlight .-Color[class*=-BGC66] { + background-color: #5F8787 +} + +div.highlight .-Color[class*=-C67] { + color: #5F87AF +} + +div.highlight .-Color[class*=-BGC67] { + background-color: #5F87AF +} + +div.highlight .-Color[class*=-C68] { + color: #5F87D7 +} + +div.highlight .-Color[class*=-BGC68] { + background-color: #5F87D7 +} + +div.highlight .-Color[class*=-C69] { + color: #5F87FF +} + +div.highlight .-Color[class*=-BGC69] { + background-color: #5F87FF +} + +div.highlight .-Color[class*=-C70] { + color: #5FAF00 +} + +div.highlight .-Color[class*=-BGC70] { + background-color: #5FAF00 +} + +div.highlight .-Color[class*=-C71] { + color: #5FAF5F +} + +div.highlight .-Color[class*=-BGC71] { + background-color: #5FAF5F +} + +div.highlight .-Color[class*=-C72] { + color: #5FAF87 +} + +div.highlight .-Color[class*=-BGC72] { + background-color: #5FAF87 +} + +div.highlight .-Color[class*=-C73] { + color: #5FAFAF +} + +div.highlight .-Color[class*=-BGC73] { + background-color: #5FAFAF +} + +div.highlight .-Color[class*=-C74] { + color: #5FAFD7 +} + +div.highlight .-Color[class*=-BGC74] { + background-color: #5FAFD7 +} + +div.highlight .-Color[class*=-C75] { + color: #5FAFFF +} + +div.highlight .-Color[class*=-BGC75] { + background-color: #5FAFFF +} + +div.highlight .-Color[class*=-C76] { + color: #5FD700 +} + +div.highlight .-Color[class*=-BGC76] { + background-color: #5FD700 +} + +div.highlight .-Color[class*=-C77] { + color: #5FD75F +} + +div.highlight .-Color[class*=-BGC77] { + background-color: #5FD75F +} + +div.highlight .-Color[class*=-C78] { + color: #5FD787 +} + +div.highlight .-Color[class*=-BGC78] { + background-color: #5FD787 +} + +div.highlight .-Color[class*=-C79] { + color: #5FD7AF +} + +div.highlight .-Color[class*=-BGC79] { + background-color: #5FD7AF +} + +div.highlight .-Color[class*=-C80] { + color: #5FD7D7 +} + +div.highlight .-Color[class*=-BGC80] { + background-color: #5FD7D7 +} + +div.highlight .-Color[class*=-C81] { + color: #5FD7FF +} + +div.highlight .-Color[class*=-BGC81] { + background-color: #5FD7FF +} + +div.highlight .-Color[class*=-C82] { + color: #5FFF00 +} + +div.highlight .-Color[class*=-BGC82] { + background-color: #5FFF00 +} + +div.highlight .-Color[class*=-C83] { + color: #5FFF5F +} + +div.highlight .-Color[class*=-BGC83] { + background-color: #5FFF5F +} + +div.highlight .-Color[class*=-C84] { + color: #5FFF87 +} + +div.highlight .-Color[class*=-BGC84] { + background-color: #5FFF87 +} + +div.highlight .-Color[class*=-C85] { + color: #5FFFAF +} + +div.highlight 
.-Color[class*=-BGC85] { + background-color: #5FFFAF +} + +div.highlight .-Color[class*=-C86] { + color: #5FFFD7 +} + +div.highlight .-Color[class*=-BGC86] { + background-color: #5FFFD7 +} + +div.highlight .-Color[class*=-C87] { + color: #5FFFFF +} + +div.highlight .-Color[class*=-BGC87] { + background-color: #5FFFFF +} + +div.highlight .-Color[class*=-C88] { + color: #870000 +} + +div.highlight .-Color[class*=-BGC88] { + background-color: #870000 +} + +div.highlight .-Color[class*=-C89] { + color: #87005F +} + +div.highlight .-Color[class*=-BGC89] { + background-color: #87005F +} + +div.highlight .-Color[class*=-C90] { + color: #870087 +} + +div.highlight .-Color[class*=-BGC90] { + background-color: #870087 +} + +div.highlight .-Color[class*=-C91] { + color: #8700AF +} + +div.highlight .-Color[class*=-BGC91] { + background-color: #8700AF +} + +div.highlight .-Color[class*=-C92] { + color: #8700D7 +} + +div.highlight .-Color[class*=-BGC92] { + background-color: #8700D7 +} + +div.highlight .-Color[class*=-C93] { + color: #8700FF +} + +div.highlight .-Color[class*=-BGC93] { + background-color: #8700FF +} + +div.highlight .-Color[class*=-C94] { + color: #875F00 +} + +div.highlight .-Color[class*=-BGC94] { + background-color: #875F00 +} + +div.highlight .-Color[class*=-C95] { + color: #875F5F +} + +div.highlight .-Color[class*=-BGC95] { + background-color: #875F5F +} + +div.highlight .-Color[class*=-C96] { + color: #875F87 +} + +div.highlight .-Color[class*=-BGC96] { + background-color: #875F87 +} + +div.highlight .-Color[class*=-C97] { + color: #875FAF +} + +div.highlight .-Color[class*=-BGC97] { + background-color: #875FAF +} + +div.highlight .-Color[class*=-C98] { + color: #875FD7 +} + +div.highlight .-Color[class*=-BGC98] { + background-color: #875FD7 +} + +div.highlight .-Color[class*=-C99] { + color: #875FFF +} + +div.highlight .-Color[class*=-BGC99] { + background-color: #875FFF +} + +div.highlight .-Color[class*=-C100] { + color: #878700 +} + +div.highlight .-Color[class*=-BGC100] { + background-color: #878700 +} + +div.highlight .-Color[class*=-C101] { + color: #87875F +} + +div.highlight .-Color[class*=-BGC101] { + background-color: #87875F +} + +div.highlight .-Color[class*=-C102] { + color: #878787 +} + +div.highlight .-Color[class*=-BGC102] { + background-color: #878787 +} + +div.highlight .-Color[class*=-C103] { + color: #8787AF +} + +div.highlight .-Color[class*=-BGC103] { + background-color: #8787AF +} + +div.highlight .-Color[class*=-C104] { + color: #8787D7 +} + +div.highlight .-Color[class*=-BGC104] { + background-color: #8787D7 +} + +div.highlight .-Color[class*=-C105] { + color: #8787FF +} + +div.highlight .-Color[class*=-BGC105] { + background-color: #8787FF +} + +div.highlight .-Color[class*=-C106] { + color: #87AF00 +} + +div.highlight .-Color[class*=-BGC106] { + background-color: #87AF00 +} + +div.highlight .-Color[class*=-C107] { + color: #87AF5F +} + +div.highlight .-Color[class*=-BGC107] { + background-color: #87AF5F +} + +div.highlight .-Color[class*=-C108] { + color: #87AF87 +} + +div.highlight .-Color[class*=-BGC108] { + background-color: #87AF87 +} + +div.highlight .-Color[class*=-C109] { + color: #87AFAF +} + +div.highlight .-Color[class*=-BGC109] { + background-color: #87AFAF +} + +div.highlight .-Color[class*=-C110] { + color: #87AFD7 +} + +div.highlight .-Color[class*=-BGC110] { + background-color: #87AFD7 +} + +div.highlight .-Color[class*=-C111] { + color: #87AFFF +} + +div.highlight .-Color[class*=-BGC111] { + background-color: #87AFFF +} + +div.highlight 
.-Color[class*=-C112] { + color: #87D700 +} + +div.highlight .-Color[class*=-BGC112] { + background-color: #87D700 +} + +div.highlight .-Color[class*=-C113] { + color: #87D75F +} + +div.highlight .-Color[class*=-BGC113] { + background-color: #87D75F +} + +div.highlight .-Color[class*=-C114] { + color: #87D787 +} + +div.highlight .-Color[class*=-BGC114] { + background-color: #87D787 +} + +div.highlight .-Color[class*=-C115] { + color: #87D7AF +} + +div.highlight .-Color[class*=-BGC115] { + background-color: #87D7AF +} + +div.highlight .-Color[class*=-C116] { + color: #87D7D7 +} + +div.highlight .-Color[class*=-BGC116] { + background-color: #87D7D7 +} + +div.highlight .-Color[class*=-C117] { + color: #87D7FF +} + +div.highlight .-Color[class*=-BGC117] { + background-color: #87D7FF +} + +div.highlight .-Color[class*=-C118] { + color: #87FF00 +} + +div.highlight .-Color[class*=-BGC118] { + background-color: #87FF00 +} + +div.highlight .-Color[class*=-C119] { + color: #87FF5F +} + +div.highlight .-Color[class*=-BGC119] { + background-color: #87FF5F +} + +div.highlight .-Color[class*=-C120] { + color: #87FF87 +} + +div.highlight .-Color[class*=-BGC120] { + background-color: #87FF87 +} + +div.highlight .-Color[class*=-C121] { + color: #87FFAF +} + +div.highlight .-Color[class*=-BGC121] { + background-color: #87FFAF +} + +div.highlight .-Color[class*=-C122] { + color: #87FFD7 +} + +div.highlight .-Color[class*=-BGC122] { + background-color: #87FFD7 +} + +div.highlight .-Color[class*=-C123] { + color: #87FFFF +} + +div.highlight .-Color[class*=-BGC123] { + background-color: #87FFFF +} + +div.highlight .-Color[class*=-C124] { + color: #AF0000 +} + +div.highlight .-Color[class*=-BGC124] { + background-color: #AF0000 +} + +div.highlight .-Color[class*=-C125] { + color: #AF005F +} + +div.highlight .-Color[class*=-BGC125] { + background-color: #AF005F +} + +div.highlight .-Color[class*=-C126] { + color: #AF0087 +} + +div.highlight .-Color[class*=-BGC126] { + background-color: #AF0087 +} + +div.highlight .-Color[class*=-C127] { + color: #AF00AF +} + +div.highlight .-Color[class*=-BGC127] { + background-color: #AF00AF +} + +div.highlight .-Color[class*=-C128] { + color: #AF00D7 +} + +div.highlight .-Color[class*=-BGC128] { + background-color: #AF00D7 +} + +div.highlight .-Color[class*=-C129] { + color: #AF00FF +} + +div.highlight .-Color[class*=-BGC129] { + background-color: #AF00FF +} + +div.highlight .-Color[class*=-C130] { + color: #AF5F00 +} + +div.highlight .-Color[class*=-BGC130] { + background-color: #AF5F00 +} + +div.highlight .-Color[class*=-C131] { + color: #AF5F5F +} + +div.highlight .-Color[class*=-BGC131] { + background-color: #AF5F5F +} + +div.highlight .-Color[class*=-C132] { + color: #AF5F87 +} + +div.highlight .-Color[class*=-BGC132] { + background-color: #AF5F87 +} + +div.highlight .-Color[class*=-C133] { + color: #AF5FAF +} + +div.highlight .-Color[class*=-BGC133] { + background-color: #AF5FAF +} + +div.highlight .-Color[class*=-C134] { + color: #AF5FD7 +} + +div.highlight .-Color[class*=-BGC134] { + background-color: #AF5FD7 +} + +div.highlight .-Color[class*=-C135] { + color: #AF5FFF +} + +div.highlight .-Color[class*=-BGC135] { + background-color: #AF5FFF +} + +div.highlight .-Color[class*=-C136] { + color: #AF8700 +} + +div.highlight .-Color[class*=-BGC136] { + background-color: #AF8700 +} + +div.highlight .-Color[class*=-C137] { + color: #AF875F +} + +div.highlight .-Color[class*=-BGC137] { + background-color: #AF875F +} + +div.highlight .-Color[class*=-C138] { + color: #AF8787 +} 
+ +div.highlight .-Color[class*=-BGC138] { + background-color: #AF8787 +} + +div.highlight .-Color[class*=-C139] { + color: #AF87AF +} + +div.highlight .-Color[class*=-BGC139] { + background-color: #AF87AF +} + +div.highlight .-Color[class*=-C140] { + color: #AF87D7 +} + +div.highlight .-Color[class*=-BGC140] { + background-color: #AF87D7 +} + +div.highlight .-Color[class*=-C141] { + color: #AF87FF +} + +div.highlight .-Color[class*=-BGC141] { + background-color: #AF87FF +} + +div.highlight .-Color[class*=-C142] { + color: #AFAF00 +} + +div.highlight .-Color[class*=-BGC142] { + background-color: #AFAF00 +} + +div.highlight .-Color[class*=-C143] { + color: #AFAF5F +} + +div.highlight .-Color[class*=-BGC143] { + background-color: #AFAF5F +} + +div.highlight .-Color[class*=-C144] { + color: #AFAF87 +} + +div.highlight .-Color[class*=-BGC144] { + background-color: #AFAF87 +} + +div.highlight .-Color[class*=-C145] { + color: #AFAFAF +} + +div.highlight .-Color[class*=-BGC145] { + background-color: #AFAFAF +} + +div.highlight .-Color[class*=-C146] { + color: #AFAFD7 +} + +div.highlight .-Color[class*=-BGC146] { + background-color: #AFAFD7 +} + +div.highlight .-Color[class*=-C147] { + color: #AFAFFF +} + +div.highlight .-Color[class*=-BGC147] { + background-color: #AFAFFF +} + +div.highlight .-Color[class*=-C148] { + color: #AFD700 +} + +div.highlight .-Color[class*=-BGC148] { + background-color: #AFD700 +} + +div.highlight .-Color[class*=-C149] { + color: #AFD75F +} + +div.highlight .-Color[class*=-BGC149] { + background-color: #AFD75F +} + +div.highlight .-Color[class*=-C150] { + color: #AFD787 +} + +div.highlight .-Color[class*=-BGC150] { + background-color: #AFD787 +} + +div.highlight .-Color[class*=-C151] { + color: #AFD7AF +} + +div.highlight .-Color[class*=-BGC151] { + background-color: #AFD7AF +} + +div.highlight .-Color[class*=-C152] { + color: #AFD7D7 +} + +div.highlight .-Color[class*=-BGC152] { + background-color: #AFD7D7 +} + +div.highlight .-Color[class*=-C153] { + color: #AFD7FF +} + +div.highlight .-Color[class*=-BGC153] { + background-color: #AFD7FF +} + +div.highlight .-Color[class*=-C154] { + color: #AFFF00 +} + +div.highlight .-Color[class*=-BGC154] { + background-color: #AFFF00 +} + +div.highlight .-Color[class*=-C155] { + color: #AFFF5F +} + +div.highlight .-Color[class*=-BGC155] { + background-color: #AFFF5F +} + +div.highlight .-Color[class*=-C156] { + color: #AFFF87 +} + +div.highlight .-Color[class*=-BGC156] { + background-color: #AFFF87 +} + +div.highlight .-Color[class*=-C157] { + color: #AFFFAF +} + +div.highlight .-Color[class*=-BGC157] { + background-color: #AFFFAF +} + +div.highlight .-Color[class*=-C158] { + color: #AFFFD7 +} + +div.highlight .-Color[class*=-BGC158] { + background-color: #AFFFD7 +} + +div.highlight .-Color[class*=-C159] { + color: #AFFFFF +} + +div.highlight .-Color[class*=-BGC159] { + background-color: #AFFFFF +} + +div.highlight .-Color[class*=-C160] { + color: #D70000 +} + +div.highlight .-Color[class*=-BGC160] { + background-color: #D70000 +} + +div.highlight .-Color[class*=-C161] { + color: #D7005F +} + +div.highlight .-Color[class*=-BGC161] { + background-color: #D7005F +} + +div.highlight .-Color[class*=-C162] { + color: #D70087 +} + +div.highlight .-Color[class*=-BGC162] { + background-color: #D70087 +} + +div.highlight .-Color[class*=-C163] { + color: #D700AF +} + +div.highlight .-Color[class*=-BGC163] { + background-color: #D700AF +} + +div.highlight .-Color[class*=-C164] { + color: #D700D7 +} + +div.highlight .-Color[class*=-BGC164] { + 
background-color: #D700D7 +} + +div.highlight .-Color[class*=-C165] { + color: #D700FF +} + +div.highlight .-Color[class*=-BGC165] { + background-color: #D700FF +} + +div.highlight .-Color[class*=-C166] { + color: #D75F00 +} + +div.highlight .-Color[class*=-BGC166] { + background-color: #D75F00 +} + +div.highlight .-Color[class*=-C167] { + color: #D75F5F +} + +div.highlight .-Color[class*=-BGC167] { + background-color: #D75F5F +} + +div.highlight .-Color[class*=-C168] { + color: #D75F87 +} + +div.highlight .-Color[class*=-BGC168] { + background-color: #D75F87 +} + +div.highlight .-Color[class*=-C169] { + color: #D75FAF +} + +div.highlight .-Color[class*=-BGC169] { + background-color: #D75FAF +} + +div.highlight .-Color[class*=-C170] { + color: #D75FD7 +} + +div.highlight .-Color[class*=-BGC170] { + background-color: #D75FD7 +} + +div.highlight .-Color[class*=-C171] { + color: #D75FFF +} + +div.highlight .-Color[class*=-BGC171] { + background-color: #D75FFF +} + +div.highlight .-Color[class*=-C172] { + color: #D78700 +} + +div.highlight .-Color[class*=-BGC172] { + background-color: #D78700 +} + +div.highlight .-Color[class*=-C173] { + color: #D7875F +} + +div.highlight .-Color[class*=-BGC173] { + background-color: #D7875F +} + +div.highlight .-Color[class*=-C174] { + color: #D78787 +} + +div.highlight .-Color[class*=-BGC174] { + background-color: #D78787 +} + +div.highlight .-Color[class*=-C175] { + color: #D787AF +} + +div.highlight .-Color[class*=-BGC175] { + background-color: #D787AF +} + +div.highlight .-Color[class*=-C176] { + color: #D787D7 +} + +div.highlight .-Color[class*=-BGC176] { + background-color: #D787D7 +} + +div.highlight .-Color[class*=-C177] { + color: #D787FF +} + +div.highlight .-Color[class*=-BGC177] { + background-color: #D787FF +} + +div.highlight .-Color[class*=-C178] { + color: #D7AF00 +} + +div.highlight .-Color[class*=-BGC178] { + background-color: #D7AF00 +} + +div.highlight .-Color[class*=-C179] { + color: #D7AF5F +} + +div.highlight .-Color[class*=-BGC179] { + background-color: #D7AF5F +} + +div.highlight .-Color[class*=-C180] { + color: #D7AF87 +} + +div.highlight .-Color[class*=-BGC180] { + background-color: #D7AF87 +} + +div.highlight .-Color[class*=-C181] { + color: #D7AFAF +} + +div.highlight .-Color[class*=-BGC181] { + background-color: #D7AFAF +} + +div.highlight .-Color[class*=-C182] { + color: #D7AFD7 +} + +div.highlight .-Color[class*=-BGC182] { + background-color: #D7AFD7 +} + +div.highlight .-Color[class*=-C183] { + color: #D7AFFF +} + +div.highlight .-Color[class*=-BGC183] { + background-color: #D7AFFF +} + +div.highlight .-Color[class*=-C184] { + color: #D7D700 +} + +div.highlight .-Color[class*=-BGC184] { + background-color: #D7D700 +} + +div.highlight .-Color[class*=-C185] { + color: #D7D75F +} + +div.highlight .-Color[class*=-BGC185] { + background-color: #D7D75F +} + +div.highlight .-Color[class*=-C186] { + color: #D7D787 +} + +div.highlight .-Color[class*=-BGC186] { + background-color: #D7D787 +} + +div.highlight .-Color[class*=-C187] { + color: #D7D7AF +} + +div.highlight .-Color[class*=-BGC187] { + background-color: #D7D7AF +} + +div.highlight .-Color[class*=-C188] { + color: #D7D7D7 +} + +div.highlight .-Color[class*=-BGC188] { + background-color: #D7D7D7 +} + +div.highlight .-Color[class*=-C189] { + color: #D7D7FF +} + +div.highlight .-Color[class*=-BGC189] { + background-color: #D7D7FF +} + +div.highlight .-Color[class*=-C190] { + color: #D7FF00 +} + +div.highlight .-Color[class*=-BGC190] { + background-color: #D7FF00 +} + 
+div.highlight .-Color[class*=-C191] { + color: #D7FF5F +} + +div.highlight .-Color[class*=-BGC191] { + background-color: #D7FF5F +} + +div.highlight .-Color[class*=-C192] { + color: #D7FF87 +} + +div.highlight .-Color[class*=-BGC192] { + background-color: #D7FF87 +} + +div.highlight .-Color[class*=-C193] { + color: #D7FFAF +} + +div.highlight .-Color[class*=-BGC193] { + background-color: #D7FFAF +} + +div.highlight .-Color[class*=-C194] { + color: #D7FFD7 +} + +div.highlight .-Color[class*=-BGC194] { + background-color: #D7FFD7 +} + +div.highlight .-Color[class*=-C195] { + color: #D7FFFF +} + +div.highlight .-Color[class*=-BGC195] { + background-color: #D7FFFF +} + +div.highlight .-Color[class*=-C196] { + color: #FF0000 +} + +div.highlight .-Color[class*=-BGC196] { + background-color: #FF0000 +} + +div.highlight .-Color[class*=-C197] { + color: #FF005F +} + +div.highlight .-Color[class*=-BGC197] { + background-color: #FF005F +} + +div.highlight .-Color[class*=-C198] { + color: #FF0087 +} + +div.highlight .-Color[class*=-BGC198] { + background-color: #FF0087 +} + +div.highlight .-Color[class*=-C199] { + color: #FF00AF +} + +div.highlight .-Color[class*=-BGC199] { + background-color: #FF00AF +} + +div.highlight .-Color[class*=-C200] { + color: #FF00D7 +} + +div.highlight .-Color[class*=-BGC200] { + background-color: #FF00D7 +} + +div.highlight .-Color[class*=-C201] { + color: #FF00FF +} + +div.highlight .-Color[class*=-BGC201] { + background-color: #FF00FF +} + +div.highlight .-Color[class*=-C202] { + color: #FF5F00 +} + +div.highlight .-Color[class*=-BGC202] { + background-color: #FF5F00 +} + +div.highlight .-Color[class*=-C203] { + color: #FF5F5F +} + +div.highlight .-Color[class*=-BGC203] { + background-color: #FF5F5F +} + +div.highlight .-Color[class*=-C204] { + color: #FF5F87 +} + +div.highlight .-Color[class*=-BGC204] { + background-color: #FF5F87 +} + +div.highlight .-Color[class*=-C205] { + color: #FF5FAF +} + +div.highlight .-Color[class*=-BGC205] { + background-color: #FF5FAF +} + +div.highlight .-Color[class*=-C206] { + color: #FF5FD7 +} + +div.highlight .-Color[class*=-BGC206] { + background-color: #FF5FD7 +} + +div.highlight .-Color[class*=-C207] { + color: #FF5FFF +} + +div.highlight .-Color[class*=-BGC207] { + background-color: #FF5FFF +} + +div.highlight .-Color[class*=-C208] { + color: #FF8700 +} + +div.highlight .-Color[class*=-BGC208] { + background-color: #FF8700 +} + +div.highlight .-Color[class*=-C209] { + color: #FF875F +} + +div.highlight .-Color[class*=-BGC209] { + background-color: #FF875F +} + +div.highlight .-Color[class*=-C210] { + color: #FF8787 +} + +div.highlight .-Color[class*=-BGC210] { + background-color: #FF8787 +} + +div.highlight .-Color[class*=-C211] { + color: #FF87AF +} + +div.highlight .-Color[class*=-BGC211] { + background-color: #FF87AF +} + +div.highlight .-Color[class*=-C212] { + color: #FF87D7 +} + +div.highlight .-Color[class*=-BGC212] { + background-color: #FF87D7 +} + +div.highlight .-Color[class*=-C213] { + color: #FF87FF +} + +div.highlight .-Color[class*=-BGC213] { + background-color: #FF87FF +} + +div.highlight .-Color[class*=-C214] { + color: #FFAF00 +} + +div.highlight .-Color[class*=-BGC214] { + background-color: #FFAF00 +} + +div.highlight .-Color[class*=-C215] { + color: #FFAF5F +} + +div.highlight .-Color[class*=-BGC215] { + background-color: #FFAF5F +} + +div.highlight .-Color[class*=-C216] { + color: #FFAF87 +} + +div.highlight .-Color[class*=-BGC216] { + background-color: #FFAF87 +} + +div.highlight .-Color[class*=-C217] { + 
color: #FFAFAF +} + +div.highlight .-Color[class*=-BGC217] { + background-color: #FFAFAF +} + +div.highlight .-Color[class*=-C218] { + color: #FFAFD7 +} + +div.highlight .-Color[class*=-BGC218] { + background-color: #FFAFD7 +} + +div.highlight .-Color[class*=-C219] { + color: #FFAFFF +} + +div.highlight .-Color[class*=-BGC219] { + background-color: #FFAFFF +} + +div.highlight .-Color[class*=-C220] { + color: #FFD700 +} + +div.highlight .-Color[class*=-BGC220] { + background-color: #FFD700 +} + +div.highlight .-Color[class*=-C221] { + color: #FFD75F +} + +div.highlight .-Color[class*=-BGC221] { + background-color: #FFD75F +} + +div.highlight .-Color[class*=-C222] { + color: #FFD787 +} + +div.highlight .-Color[class*=-BGC222] { + background-color: #FFD787 +} + +div.highlight .-Color[class*=-C223] { + color: #FFD7AF +} + +div.highlight .-Color[class*=-BGC223] { + background-color: #FFD7AF +} + +div.highlight .-Color[class*=-C224] { + color: #FFD7D7 +} + +div.highlight .-Color[class*=-BGC224] { + background-color: #FFD7D7 +} + +div.highlight .-Color[class*=-C225] { + color: #FFD7FF +} + +div.highlight .-Color[class*=-BGC225] { + background-color: #FFD7FF +} + +div.highlight .-Color[class*=-C226] { + color: #FFFF00 +} + +div.highlight .-Color[class*=-BGC226] { + background-color: #FFFF00 +} + +div.highlight .-Color[class*=-C227] { + color: #FFFF5F +} + +div.highlight .-Color[class*=-BGC227] { + background-color: #FFFF5F +} + +div.highlight .-Color[class*=-C228] { + color: #FFFF87 +} + +div.highlight .-Color[class*=-BGC228] { + background-color: #FFFF87 +} + +div.highlight .-Color[class*=-C229] { + color: #FFFFAF +} + +div.highlight .-Color[class*=-BGC229] { + background-color: #FFFFAF +} + +div.highlight .-Color[class*=-C230] { + color: #FFFFD7 +} + +div.highlight .-Color[class*=-BGC230] { + background-color: #FFFFD7 +} + +div.highlight .-Color[class*=-C231] { + color: #FFFFFF +} + +div.highlight .-Color[class*=-BGC231] { + background-color: #FFFFFF +} + +div.highlight .-Color[class*=-C232] { + color: #080808 +} + +div.highlight .-Color[class*=-BGC232] { + background-color: #080808 +} + +div.highlight .-Color[class*=-C233] { + color: #121212 +} + +div.highlight .-Color[class*=-BGC233] { + background-color: #121212 +} + +div.highlight .-Color[class*=-C234] { + color: #1C1C1C +} + +div.highlight .-Color[class*=-BGC234] { + background-color: #1C1C1C +} + +div.highlight .-Color[class*=-C235] { + color: #262626 +} + +div.highlight .-Color[class*=-BGC235] { + background-color: #262626 +} + +div.highlight .-Color[class*=-C236] { + color: #303030 +} + +div.highlight .-Color[class*=-BGC236] { + background-color: #303030 +} + +div.highlight .-Color[class*=-C237] { + color: #3A3A3A +} + +div.highlight .-Color[class*=-BGC237] { + background-color: #3A3A3A +} + +div.highlight .-Color[class*=-C238] { + color: #444444 +} + +div.highlight .-Color[class*=-BGC238] { + background-color: #444444 +} + +div.highlight .-Color[class*=-C239] { + color: #4E4E4E +} + +div.highlight .-Color[class*=-BGC239] { + background-color: #4E4E4E +} + +div.highlight .-Color[class*=-C240] { + color: #585858 +} + +div.highlight .-Color[class*=-BGC240] { + background-color: #585858 +} + +div.highlight .-Color[class*=-C241] { + color: #626262 +} + +div.highlight .-Color[class*=-BGC241] { + background-color: #626262 +} + +div.highlight .-Color[class*=-C242] { + color: #6C6C6C +} + +div.highlight .-Color[class*=-BGC242] { + background-color: #6C6C6C +} + +div.highlight .-Color[class*=-C243] { + color: #767676 +} + +div.highlight 
.-Color[class*=-BGC243] { + background-color: #767676 +} + +div.highlight .-Color[class*=-C244] { + color: #808080 +} + +div.highlight .-Color[class*=-BGC244] { + background-color: #808080 +} + +div.highlight .-Color[class*=-C245] { + color: #8A8A8A +} + +div.highlight .-Color[class*=-BGC245] { + background-color: #8A8A8A +} + +div.highlight .-Color[class*=-C246] { + color: #949494 +} + +div.highlight .-Color[class*=-BGC246] { + background-color: #949494 +} + +div.highlight .-Color[class*=-C247] { + color: #9E9E9E +} + +div.highlight .-Color[class*=-BGC247] { + background-color: #9E9E9E +} + +div.highlight .-Color[class*=-C248] { + color: #A8A8A8 +} + +div.highlight .-Color[class*=-BGC248] { + background-color: #A8A8A8 +} + +div.highlight .-Color[class*=-C249] { + color: #B2B2B2 +} + +div.highlight .-Color[class*=-BGC249] { + background-color: #B2B2B2 +} + +div.highlight .-Color[class*=-C250] { + color: #BCBCBC +} + +div.highlight .-Color[class*=-BGC250] { + background-color: #BCBCBC +} + +div.highlight .-Color[class*=-C251] { + color: #C6C6C6 +} + +div.highlight .-Color[class*=-BGC251] { + background-color: #C6C6C6 +} + +div.highlight .-Color[class*=-C252] { + color: #D0D0D0 +} + +div.highlight .-Color[class*=-BGC252] { + background-color: #D0D0D0 +} + +div.highlight .-Color[class*=-C253] { + color: #DADADA +} + +div.highlight .-Color[class*=-BGC253] { + background-color: #DADADA +} + +div.highlight .-Color[class*=-C254] { + color: #E4E4E4 +} + +div.highlight .-Color[class*=-BGC254] { + background-color: #E4E4E4 +} + +div.highlight .-Color[class*=-C255] { + color: #EEEEEE +} + +div.highlight .-Color[class*=-BGC255] { + background-color: #EEEEEE +} diff --git a/_static/plus.png b/_static/plus.png new file mode 100644 index 00000000..7107cec9 Binary files /dev/null and b/_static/plus.png differ diff --git a/_static/pygments.css b/_static/pygments.css new file mode 100644 index 00000000..02b4b128 --- /dev/null +++ b/_static/pygments.css @@ -0,0 +1,258 @@ +.highlight pre { line-height: 125%; } +.highlight td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +.highlight span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +.highlight td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +.highlight span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +.highlight .hll { background-color: #ffffcc } +.highlight { background: #f8f8f8; } +.highlight .c { color: #8f5902; font-style: italic } /* Comment */ +.highlight .err { color: #a40000; border: 1px solid #ef2929 } /* Error */ +.highlight .g { color: #000000 } /* Generic */ +.highlight .k { color: #204a87; font-weight: bold } /* Keyword */ +.highlight .l { color: #000000 } /* Literal */ +.highlight .n { color: #000000 } /* Name */ +.highlight .o { color: #ce5c00; font-weight: bold } /* Operator */ +.highlight .x { color: #000000 } /* Other */ +.highlight .p { color: #000000; font-weight: bold } /* Punctuation */ +.highlight .ch { color: #8f5902; font-style: italic } /* Comment.Hashbang */ +.highlight .cm { color: #8f5902; font-style: italic } /* Comment.Multiline */ +.highlight .cp { color: #8f5902; font-style: italic } /* Comment.Preproc */ +.highlight .cpf { color: #8f5902; font-style: italic } /* Comment.PreprocFile */ +.highlight .c1 { color: #8f5902; font-style: italic } /* Comment.Single */ +.highlight .cs { color: #8f5902; 
font-style: italic } /* Comment.Special */ +.highlight .gd { color: #a40000 } /* Generic.Deleted */ +.highlight .ge { color: #000000; font-style: italic } /* Generic.Emph */ +.highlight .ges { color: #000000; font-weight: bold; font-style: italic } /* Generic.EmphStrong */ +.highlight .gr { color: #ef2929 } /* Generic.Error */ +.highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */ +.highlight .gi { color: #00A000 } /* Generic.Inserted */ +.highlight .go { color: #000000; font-style: italic } /* Generic.Output */ +.highlight .gp { color: #8f5902 } /* Generic.Prompt */ +.highlight .gs { color: #000000; font-weight: bold } /* Generic.Strong */ +.highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */ +.highlight .gt { color: #a40000; font-weight: bold } /* Generic.Traceback */ +.highlight .kc { color: #204a87; font-weight: bold } /* Keyword.Constant */ +.highlight .kd { color: #204a87; font-weight: bold } /* Keyword.Declaration */ +.highlight .kn { color: #204a87; font-weight: bold } /* Keyword.Namespace */ +.highlight .kp { color: #204a87; font-weight: bold } /* Keyword.Pseudo */ +.highlight .kr { color: #204a87; font-weight: bold } /* Keyword.Reserved */ +.highlight .kt { color: #204a87; font-weight: bold } /* Keyword.Type */ +.highlight .ld { color: #000000 } /* Literal.Date */ +.highlight .m { color: #0000cf; font-weight: bold } /* Literal.Number */ +.highlight .s { color: #4e9a06 } /* Literal.String */ +.highlight .na { color: #c4a000 } /* Name.Attribute */ +.highlight .nb { color: #204a87 } /* Name.Builtin */ +.highlight .nc { color: #000000 } /* Name.Class */ +.highlight .no { color: #000000 } /* Name.Constant */ +.highlight .nd { color: #5c35cc; font-weight: bold } /* Name.Decorator */ +.highlight .ni { color: #ce5c00 } /* Name.Entity */ +.highlight .ne { color: #cc0000; font-weight: bold } /* Name.Exception */ +.highlight .nf { color: #000000 } /* Name.Function */ +.highlight .nl { color: #f57900 } /* Name.Label */ +.highlight .nn { color: #000000 } /* Name.Namespace */ +.highlight .nx { color: #000000 } /* Name.Other */ +.highlight .py { color: #000000 } /* Name.Property */ +.highlight .nt { color: #204a87; font-weight: bold } /* Name.Tag */ +.highlight .nv { color: #000000 } /* Name.Variable */ +.highlight .ow { color: #204a87; font-weight: bold } /* Operator.Word */ +.highlight .pm { color: #000000; font-weight: bold } /* Punctuation.Marker */ +.highlight .w { color: #f8f8f8 } /* Text.Whitespace */ +.highlight .mb { color: #0000cf; font-weight: bold } /* Literal.Number.Bin */ +.highlight .mf { color: #0000cf; font-weight: bold } /* Literal.Number.Float */ +.highlight .mh { color: #0000cf; font-weight: bold } /* Literal.Number.Hex */ +.highlight .mi { color: #0000cf; font-weight: bold } /* Literal.Number.Integer */ +.highlight .mo { color: #0000cf; font-weight: bold } /* Literal.Number.Oct */ +.highlight .sa { color: #4e9a06 } /* Literal.String.Affix */ +.highlight .sb { color: #4e9a06 } /* Literal.String.Backtick */ +.highlight .sc { color: #4e9a06 } /* Literal.String.Char */ +.highlight .dl { color: #4e9a06 } /* Literal.String.Delimiter */ +.highlight .sd { color: #8f5902; font-style: italic } /* Literal.String.Doc */ +.highlight .s2 { color: #4e9a06 } /* Literal.String.Double */ +.highlight .se { color: #4e9a06 } /* Literal.String.Escape */ +.highlight .sh { color: #4e9a06 } /* Literal.String.Heredoc */ +.highlight .si { color: #4e9a06 } /* Literal.String.Interpol */ +.highlight .sx { color: #4e9a06 } /* Literal.String.Other */ 
+.highlight .sr { color: #4e9a06 } /* Literal.String.Regex */ +.highlight .s1 { color: #4e9a06 } /* Literal.String.Single */ +.highlight .ss { color: #4e9a06 } /* Literal.String.Symbol */ +.highlight .bp { color: #3465a4 } /* Name.Builtin.Pseudo */ +.highlight .fm { color: #000000 } /* Name.Function.Magic */ +.highlight .vc { color: #000000 } /* Name.Variable.Class */ +.highlight .vg { color: #000000 } /* Name.Variable.Global */ +.highlight .vi { color: #000000 } /* Name.Variable.Instance */ +.highlight .vm { color: #000000 } /* Name.Variable.Magic */ +.highlight .il { color: #0000cf; font-weight: bold } /* Literal.Number.Integer.Long */ +@media not print { +body[data-theme="dark"] .highlight pre { line-height: 125%; } +body[data-theme="dark"] .highlight td.linenos .normal { color: #aaaaaa; background-color: transparent; padding-left: 5px; padding-right: 5px; } +body[data-theme="dark"] .highlight span.linenos { color: #aaaaaa; background-color: transparent; padding-left: 5px; padding-right: 5px; } +body[data-theme="dark"] .highlight td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +body[data-theme="dark"] .highlight span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +body[data-theme="dark"] .highlight .hll { background-color: #404040 } +body[data-theme="dark"] .highlight { background: #202020; color: #d0d0d0 } +body[data-theme="dark"] .highlight .c { color: #ababab; font-style: italic } /* Comment */ +body[data-theme="dark"] .highlight .err { color: #a61717; background-color: #e3d2d2 } /* Error */ +body[data-theme="dark"] .highlight .esc { color: #d0d0d0 } /* Escape */ +body[data-theme="dark"] .highlight .g { color: #d0d0d0 } /* Generic */ +body[data-theme="dark"] .highlight .k { color: #6ebf26; font-weight: bold } /* Keyword */ +body[data-theme="dark"] .highlight .l { color: #d0d0d0 } /* Literal */ +body[data-theme="dark"] .highlight .n { color: #d0d0d0 } /* Name */ +body[data-theme="dark"] .highlight .o { color: #d0d0d0 } /* Operator */ +body[data-theme="dark"] .highlight .x { color: #d0d0d0 } /* Other */ +body[data-theme="dark"] .highlight .p { color: #d0d0d0 } /* Punctuation */ +body[data-theme="dark"] .highlight .ch { color: #ababab; font-style: italic } /* Comment.Hashbang */ +body[data-theme="dark"] .highlight .cm { color: #ababab; font-style: italic } /* Comment.Multiline */ +body[data-theme="dark"] .highlight .cp { color: #ff3a3a; font-weight: bold } /* Comment.Preproc */ +body[data-theme="dark"] .highlight .cpf { color: #ababab; font-style: italic } /* Comment.PreprocFile */ +body[data-theme="dark"] .highlight .c1 { color: #ababab; font-style: italic } /* Comment.Single */ +body[data-theme="dark"] .highlight .cs { color: #e50808; font-weight: bold; background-color: #520000 } /* Comment.Special */ +body[data-theme="dark"] .highlight .gd { color: #ff3a3a } /* Generic.Deleted */ +body[data-theme="dark"] .highlight .ge { color: #d0d0d0; font-style: italic } /* Generic.Emph */ +body[data-theme="dark"] .highlight .ges { color: #d0d0d0; font-weight: bold; font-style: italic } /* Generic.EmphStrong */ +body[data-theme="dark"] .highlight .gr { color: #ff3a3a } /* Generic.Error */ +body[data-theme="dark"] .highlight .gh { color: #ffffff; font-weight: bold } /* Generic.Heading */ +body[data-theme="dark"] .highlight .gi { color: #589819 } /* Generic.Inserted */ +body[data-theme="dark"] .highlight .go { color: #cccccc } /* Generic.Output */ +body[data-theme="dark"] .highlight .gp 
{ color: #aaaaaa } /* Generic.Prompt */ +body[data-theme="dark"] .highlight .gs { color: #d0d0d0; font-weight: bold } /* Generic.Strong */ +body[data-theme="dark"] .highlight .gu { color: #ffffff; text-decoration: underline } /* Generic.Subheading */ +body[data-theme="dark"] .highlight .gt { color: #ff3a3a } /* Generic.Traceback */ +body[data-theme="dark"] .highlight .kc { color: #6ebf26; font-weight: bold } /* Keyword.Constant */ +body[data-theme="dark"] .highlight .kd { color: #6ebf26; font-weight: bold } /* Keyword.Declaration */ +body[data-theme="dark"] .highlight .kn { color: #6ebf26; font-weight: bold } /* Keyword.Namespace */ +body[data-theme="dark"] .highlight .kp { color: #6ebf26 } /* Keyword.Pseudo */ +body[data-theme="dark"] .highlight .kr { color: #6ebf26; font-weight: bold } /* Keyword.Reserved */ +body[data-theme="dark"] .highlight .kt { color: #6ebf26; font-weight: bold } /* Keyword.Type */ +body[data-theme="dark"] .highlight .ld { color: #d0d0d0 } /* Literal.Date */ +body[data-theme="dark"] .highlight .m { color: #51b2fd } /* Literal.Number */ +body[data-theme="dark"] .highlight .s { color: #ed9d13 } /* Literal.String */ +body[data-theme="dark"] .highlight .na { color: #bbbbbb } /* Name.Attribute */ +body[data-theme="dark"] .highlight .nb { color: #2fbccd } /* Name.Builtin */ +body[data-theme="dark"] .highlight .nc { color: #71adff; text-decoration: underline } /* Name.Class */ +body[data-theme="dark"] .highlight .no { color: #40ffff } /* Name.Constant */ +body[data-theme="dark"] .highlight .nd { color: #ffa500 } /* Name.Decorator */ +body[data-theme="dark"] .highlight .ni { color: #d0d0d0 } /* Name.Entity */ +body[data-theme="dark"] .highlight .ne { color: #bbbbbb } /* Name.Exception */ +body[data-theme="dark"] .highlight .nf { color: #71adff } /* Name.Function */ +body[data-theme="dark"] .highlight .nl { color: #d0d0d0 } /* Name.Label */ +body[data-theme="dark"] .highlight .nn { color: #71adff; text-decoration: underline } /* Name.Namespace */ +body[data-theme="dark"] .highlight .nx { color: #d0d0d0 } /* Name.Other */ +body[data-theme="dark"] .highlight .py { color: #d0d0d0 } /* Name.Property */ +body[data-theme="dark"] .highlight .nt { color: #6ebf26; font-weight: bold } /* Name.Tag */ +body[data-theme="dark"] .highlight .nv { color: #40ffff } /* Name.Variable */ +body[data-theme="dark"] .highlight .ow { color: #6ebf26; font-weight: bold } /* Operator.Word */ +body[data-theme="dark"] .highlight .pm { color: #d0d0d0 } /* Punctuation.Marker */ +body[data-theme="dark"] .highlight .w { color: #666666 } /* Text.Whitespace */ +body[data-theme="dark"] .highlight .mb { color: #51b2fd } /* Literal.Number.Bin */ +body[data-theme="dark"] .highlight .mf { color: #51b2fd } /* Literal.Number.Float */ +body[data-theme="dark"] .highlight .mh { color: #51b2fd } /* Literal.Number.Hex */ +body[data-theme="dark"] .highlight .mi { color: #51b2fd } /* Literal.Number.Integer */ +body[data-theme="dark"] .highlight .mo { color: #51b2fd } /* Literal.Number.Oct */ +body[data-theme="dark"] .highlight .sa { color: #ed9d13 } /* Literal.String.Affix */ +body[data-theme="dark"] .highlight .sb { color: #ed9d13 } /* Literal.String.Backtick */ +body[data-theme="dark"] .highlight .sc { color: #ed9d13 } /* Literal.String.Char */ +body[data-theme="dark"] .highlight .dl { color: #ed9d13 } /* Literal.String.Delimiter */ +body[data-theme="dark"] .highlight .sd { color: #ed9d13 } /* Literal.String.Doc */ +body[data-theme="dark"] .highlight .s2 { color: #ed9d13 } /* Literal.String.Double */ 
+body[data-theme="dark"] .highlight .se { color: #ed9d13 } /* Literal.String.Escape */ +body[data-theme="dark"] .highlight .sh { color: #ed9d13 } /* Literal.String.Heredoc */ +body[data-theme="dark"] .highlight .si { color: #ed9d13 } /* Literal.String.Interpol */ +body[data-theme="dark"] .highlight .sx { color: #ffa500 } /* Literal.String.Other */ +body[data-theme="dark"] .highlight .sr { color: #ed9d13 } /* Literal.String.Regex */ +body[data-theme="dark"] .highlight .s1 { color: #ed9d13 } /* Literal.String.Single */ +body[data-theme="dark"] .highlight .ss { color: #ed9d13 } /* Literal.String.Symbol */ +body[data-theme="dark"] .highlight .bp { color: #2fbccd } /* Name.Builtin.Pseudo */ +body[data-theme="dark"] .highlight .fm { color: #71adff } /* Name.Function.Magic */ +body[data-theme="dark"] .highlight .vc { color: #40ffff } /* Name.Variable.Class */ +body[data-theme="dark"] .highlight .vg { color: #40ffff } /* Name.Variable.Global */ +body[data-theme="dark"] .highlight .vi { color: #40ffff } /* Name.Variable.Instance */ +body[data-theme="dark"] .highlight .vm { color: #40ffff } /* Name.Variable.Magic */ +body[data-theme="dark"] .highlight .il { color: #51b2fd } /* Literal.Number.Integer.Long */ +@media (prefers-color-scheme: dark) { +body:not([data-theme="light"]) .highlight pre { line-height: 125%; } +body:not([data-theme="light"]) .highlight td.linenos .normal { color: #aaaaaa; background-color: transparent; padding-left: 5px; padding-right: 5px; } +body:not([data-theme="light"]) .highlight span.linenos { color: #aaaaaa; background-color: transparent; padding-left: 5px; padding-right: 5px; } +body:not([data-theme="light"]) .highlight td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +body:not([data-theme="light"]) .highlight span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +body:not([data-theme="light"]) .highlight .hll { background-color: #404040 } +body:not([data-theme="light"]) .highlight { background: #202020; color: #d0d0d0 } +body:not([data-theme="light"]) .highlight .c { color: #ababab; font-style: italic } /* Comment */ +body:not([data-theme="light"]) .highlight .err { color: #a61717; background-color: #e3d2d2 } /* Error */ +body:not([data-theme="light"]) .highlight .esc { color: #d0d0d0 } /* Escape */ +body:not([data-theme="light"]) .highlight .g { color: #d0d0d0 } /* Generic */ +body:not([data-theme="light"]) .highlight .k { color: #6ebf26; font-weight: bold } /* Keyword */ +body:not([data-theme="light"]) .highlight .l { color: #d0d0d0 } /* Literal */ +body:not([data-theme="light"]) .highlight .n { color: #d0d0d0 } /* Name */ +body:not([data-theme="light"]) .highlight .o { color: #d0d0d0 } /* Operator */ +body:not([data-theme="light"]) .highlight .x { color: #d0d0d0 } /* Other */ +body:not([data-theme="light"]) .highlight .p { color: #d0d0d0 } /* Punctuation */ +body:not([data-theme="light"]) .highlight .ch { color: #ababab; font-style: italic } /* Comment.Hashbang */ +body:not([data-theme="light"]) .highlight .cm { color: #ababab; font-style: italic } /* Comment.Multiline */ +body:not([data-theme="light"]) .highlight .cp { color: #ff3a3a; font-weight: bold } /* Comment.Preproc */ +body:not([data-theme="light"]) .highlight .cpf { color: #ababab; font-style: italic } /* Comment.PreprocFile */ +body:not([data-theme="light"]) .highlight .c1 { color: #ababab; font-style: italic } /* Comment.Single */ +body:not([data-theme="light"]) .highlight .cs { color: #e50808; 
font-weight: bold; background-color: #520000 } /* Comment.Special */ +body:not([data-theme="light"]) .highlight .gd { color: #ff3a3a } /* Generic.Deleted */ +body:not([data-theme="light"]) .highlight .ge { color: #d0d0d0; font-style: italic } /* Generic.Emph */ +body:not([data-theme="light"]) .highlight .ges { color: #d0d0d0; font-weight: bold; font-style: italic } /* Generic.EmphStrong */ +body:not([data-theme="light"]) .highlight .gr { color: #ff3a3a } /* Generic.Error */ +body:not([data-theme="light"]) .highlight .gh { color: #ffffff; font-weight: bold } /* Generic.Heading */ +body:not([data-theme="light"]) .highlight .gi { color: #589819 } /* Generic.Inserted */ +body:not([data-theme="light"]) .highlight .go { color: #cccccc } /* Generic.Output */ +body:not([data-theme="light"]) .highlight .gp { color: #aaaaaa } /* Generic.Prompt */ +body:not([data-theme="light"]) .highlight .gs { color: #d0d0d0; font-weight: bold } /* Generic.Strong */ +body:not([data-theme="light"]) .highlight .gu { color: #ffffff; text-decoration: underline } /* Generic.Subheading */ +body:not([data-theme="light"]) .highlight .gt { color: #ff3a3a } /* Generic.Traceback */ +body:not([data-theme="light"]) .highlight .kc { color: #6ebf26; font-weight: bold } /* Keyword.Constant */ +body:not([data-theme="light"]) .highlight .kd { color: #6ebf26; font-weight: bold } /* Keyword.Declaration */ +body:not([data-theme="light"]) .highlight .kn { color: #6ebf26; font-weight: bold } /* Keyword.Namespace */ +body:not([data-theme="light"]) .highlight .kp { color: #6ebf26 } /* Keyword.Pseudo */ +body:not([data-theme="light"]) .highlight .kr { color: #6ebf26; font-weight: bold } /* Keyword.Reserved */ +body:not([data-theme="light"]) .highlight .kt { color: #6ebf26; font-weight: bold } /* Keyword.Type */ +body:not([data-theme="light"]) .highlight .ld { color: #d0d0d0 } /* Literal.Date */ +body:not([data-theme="light"]) .highlight .m { color: #51b2fd } /* Literal.Number */ +body:not([data-theme="light"]) .highlight .s { color: #ed9d13 } /* Literal.String */ +body:not([data-theme="light"]) .highlight .na { color: #bbbbbb } /* Name.Attribute */ +body:not([data-theme="light"]) .highlight .nb { color: #2fbccd } /* Name.Builtin */ +body:not([data-theme="light"]) .highlight .nc { color: #71adff; text-decoration: underline } /* Name.Class */ +body:not([data-theme="light"]) .highlight .no { color: #40ffff } /* Name.Constant */ +body:not([data-theme="light"]) .highlight .nd { color: #ffa500 } /* Name.Decorator */ +body:not([data-theme="light"]) .highlight .ni { color: #d0d0d0 } /* Name.Entity */ +body:not([data-theme="light"]) .highlight .ne { color: #bbbbbb } /* Name.Exception */ +body:not([data-theme="light"]) .highlight .nf { color: #71adff } /* Name.Function */ +body:not([data-theme="light"]) .highlight .nl { color: #d0d0d0 } /* Name.Label */ +body:not([data-theme="light"]) .highlight .nn { color: #71adff; text-decoration: underline } /* Name.Namespace */ +body:not([data-theme="light"]) .highlight .nx { color: #d0d0d0 } /* Name.Other */ +body:not([data-theme="light"]) .highlight .py { color: #d0d0d0 } /* Name.Property */ +body:not([data-theme="light"]) .highlight .nt { color: #6ebf26; font-weight: bold } /* Name.Tag */ +body:not([data-theme="light"]) .highlight .nv { color: #40ffff } /* Name.Variable */ +body:not([data-theme="light"]) .highlight .ow { color: #6ebf26; font-weight: bold } /* Operator.Word */ +body:not([data-theme="light"]) .highlight .pm { color: #d0d0d0 } /* Punctuation.Marker */ +body:not([data-theme="light"]) .highlight 
.w { color: #666666 } /* Text.Whitespace */ +body:not([data-theme="light"]) .highlight .mb { color: #51b2fd } /* Literal.Number.Bin */ +body:not([data-theme="light"]) .highlight .mf { color: #51b2fd } /* Literal.Number.Float */ +body:not([data-theme="light"]) .highlight .mh { color: #51b2fd } /* Literal.Number.Hex */ +body:not([data-theme="light"]) .highlight .mi { color: #51b2fd } /* Literal.Number.Integer */ +body:not([data-theme="light"]) .highlight .mo { color: #51b2fd } /* Literal.Number.Oct */ +body:not([data-theme="light"]) .highlight .sa { color: #ed9d13 } /* Literal.String.Affix */ +body:not([data-theme="light"]) .highlight .sb { color: #ed9d13 } /* Literal.String.Backtick */ +body:not([data-theme="light"]) .highlight .sc { color: #ed9d13 } /* Literal.String.Char */ +body:not([data-theme="light"]) .highlight .dl { color: #ed9d13 } /* Literal.String.Delimiter */ +body:not([data-theme="light"]) .highlight .sd { color: #ed9d13 } /* Literal.String.Doc */ +body:not([data-theme="light"]) .highlight .s2 { color: #ed9d13 } /* Literal.String.Double */ +body:not([data-theme="light"]) .highlight .se { color: #ed9d13 } /* Literal.String.Escape */ +body:not([data-theme="light"]) .highlight .sh { color: #ed9d13 } /* Literal.String.Heredoc */ +body:not([data-theme="light"]) .highlight .si { color: #ed9d13 } /* Literal.String.Interpol */ +body:not([data-theme="light"]) .highlight .sx { color: #ffa500 } /* Literal.String.Other */ +body:not([data-theme="light"]) .highlight .sr { color: #ed9d13 } /* Literal.String.Regex */ +body:not([data-theme="light"]) .highlight .s1 { color: #ed9d13 } /* Literal.String.Single */ +body:not([data-theme="light"]) .highlight .ss { color: #ed9d13 } /* Literal.String.Symbol */ +body:not([data-theme="light"]) .highlight .bp { color: #2fbccd } /* Name.Builtin.Pseudo */ +body:not([data-theme="light"]) .highlight .fm { color: #71adff } /* Name.Function.Magic */ +body:not([data-theme="light"]) .highlight .vc { color: #40ffff } /* Name.Variable.Class */ +body:not([data-theme="light"]) .highlight .vg { color: #40ffff } /* Name.Variable.Global */ +body:not([data-theme="light"]) .highlight .vi { color: #40ffff } /* Name.Variable.Instance */ +body:not([data-theme="light"]) .highlight .vm { color: #40ffff } /* Name.Variable.Magic */ +body:not([data-theme="light"]) .highlight .il { color: #51b2fd } /* Literal.Number.Integer.Long */ +} +} \ No newline at end of file diff --git a/_static/scripts/furo-extensions.js b/_static/scripts/furo-extensions.js new file mode 100644 index 00000000..e69de29b diff --git a/_static/scripts/furo.js b/_static/scripts/furo.js new file mode 100644 index 00000000..0abb2afa --- /dev/null +++ b/_static/scripts/furo.js @@ -0,0 +1,3 @@ +/*! 
For license information please see furo.js.LICENSE.txt */ +(()=>{var t={856:function(t,e,n){var o,r;r=void 0!==n.g?n.g:"undefined"!=typeof window?window:this,o=function(){return function(t){"use strict";var e={navClass:"active",contentClass:"active",nested:!1,nestedClass:"active",offset:0,reflow:!1,events:!0},n=function(t,e,n){if(n.settings.events){var o=new CustomEvent(t,{bubbles:!0,cancelable:!0,detail:n});e.dispatchEvent(o)}},o=function(t){var e=0;if(t.offsetParent)for(;t;)e+=t.offsetTop,t=t.offsetParent;return e>=0?e:0},r=function(t){t&&t.sort((function(t,e){return o(t.content)=Math.max(document.body.scrollHeight,document.documentElement.scrollHeight,document.body.offsetHeight,document.documentElement.offsetHeight,document.body.clientHeight,document.documentElement.clientHeight)},l=function(t,e){var n=t[t.length-1];if(function(t,e){return!(!s()||!c(t.content,e,!0))}(n,e))return n;for(var o=t.length-1;o>=0;o--)if(c(t[o].content,e))return t[o]},a=function(t,e){if(e.nested&&t.parentNode){var n=t.parentNode.closest("li");n&&(n.classList.remove(e.nestedClass),a(n,e))}},i=function(t,e){if(t){var o=t.nav.closest("li");o&&(o.classList.remove(e.navClass),t.content.classList.remove(e.contentClass),a(o,e),n("gumshoeDeactivate",o,{link:t.nav,content:t.content,settings:e}))}},u=function(t,e){if(e.nested){var n=t.parentNode.closest("li");n&&(n.classList.add(e.nestedClass),u(n,e))}};return function(o,c){var s,a,d,f,m,v={setup:function(){s=document.querySelectorAll(o),a=[],Array.prototype.forEach.call(s,(function(t){var e=document.getElementById(decodeURIComponent(t.hash.substr(1)));e&&a.push({nav:t,content:e})})),r(a)},detect:function(){var t=l(a,m);t?d&&t.content===d.content||(i(d,m),function(t,e){if(t){var o=t.nav.closest("li");o&&(o.classList.add(e.navClass),t.content.classList.add(e.contentClass),u(o,e),n("gumshoeActivate",o,{link:t.nav,content:t.content,settings:e}))}}(t,m),d=t):d&&(i(d,m),d=null)}},h=function(e){f&&t.cancelAnimationFrame(f),f=t.requestAnimationFrame(v.detect)},g=function(e){f&&t.cancelAnimationFrame(f),f=t.requestAnimationFrame((function(){r(a),v.detect()}))};return v.destroy=function(){d&&i(d,m),t.removeEventListener("scroll",h,!1),m.reflow&&t.removeEventListener("resize",g,!1),a=null,s=null,d=null,f=null,m=null},m=function(){var t={};return Array.prototype.forEach.call(arguments,(function(e){for(var n in e){if(!e.hasOwnProperty(n))return;t[n]=e[n]}})),t}(e,c||{}),v.setup(),v.detect(),t.addEventListener("scroll",h,!1),m.reflow&&t.addEventListener("resize",g,!1),v}}(r)}.apply(e,[]),void 0===o||(t.exports=o)}},e={};function n(o){var r=e[o];if(void 0!==r)return r.exports;var c=e[o]={exports:{}};return t[o].call(c.exports,c,c.exports,n),c.exports}n.n=t=>{var e=t&&t.__esModule?()=>t.default:()=>t;return n.d(e,{a:e}),e},n.d=(t,e)=>{for(var o in e)n.o(e,o)&&!n.o(t,o)&&Object.defineProperty(t,o,{enumerable:!0,get:e[o]})},n.g=function(){if("object"==typeof globalThis)return globalThis;try{return this||new Function("return this")()}catch(t){if("object"==typeof window)return window}}(),n.o=(t,e)=>Object.prototype.hasOwnProperty.call(t,e),(()=>{"use strict";var t=n(856),e=n.n(t),o=null,r=null,c=document.documentElement.scrollTop;const s=64;function l(){const t=localStorage.getItem("theme")||"auto";var e;"light"!==(e=window.matchMedia("(prefers-color-scheme: dark)").matches?"auto"===t?"light":"light"==t?"dark":"auto":"auto"===t?"dark":"dark"==t?"light":"auto")&&"dark"!==e&&"auto"!==e&&(console.error(`Got invalid theme mode: ${e}. 
Resetting to auto.`),e="auto"),document.body.dataset.theme=e,localStorage.setItem("theme",e),console.log(`Changed to ${e} mode.`)}function a(){!function(){const t=document.getElementsByClassName("theme-toggle");Array.from(t).forEach((t=>{t.addEventListener("click",l)}))}(),function(){let t=0,e=!1;window.addEventListener("scroll",(function(n){t=window.scrollY,e||(window.requestAnimationFrame((function(){var n;(function(t){const e=Math.floor(r.getBoundingClientRect().top);console.log(`headerTop: ${e}`),0==e&&t!=e?r.classList.add("scrolled"):r.classList.remove("scrolled")})(n=t),function(t){tc&&document.documentElement.classList.remove("show-back-to-top"),c=t}(n),function(t){null!==o&&(0==t?o.scrollTo(0,0):Math.ceil(t)>=Math.floor(document.documentElement.scrollHeight-window.innerHeight)?o.scrollTo(0,o.scrollHeight):document.querySelector(".scroll-current"))}(n),e=!1})),e=!0)})),window.scroll()}(),null!==o&&new(e())(".toc-tree a",{reflow:!0,recursive:!0,navClass:"scroll-current",offset:()=>{let t=parseFloat(getComputedStyle(document.documentElement).fontSize);return r.getBoundingClientRect().height+2.5*t+1}})}document.addEventListener("DOMContentLoaded",(function(){document.body.parentNode.classList.remove("no-js"),r=document.querySelector("header"),o=document.querySelector(".toc-scroll"),a()}))})()})(); +//# sourceMappingURL=furo.js.map \ No newline at end of file diff --git a/_static/scripts/furo.js.LICENSE.txt b/_static/scripts/furo.js.LICENSE.txt new file mode 100644 index 00000000..1632189c --- /dev/null +++ b/_static/scripts/furo.js.LICENSE.txt @@ -0,0 +1,7 @@ +/*! + * gumshoejs v5.1.2 (patched by @pradyunsg) + * A simple, framework-agnostic scrollspy script. + * (c) 2019 Chris Ferdinandi + * MIT License + * http://github.com/cferdinandi/gumshoe + */ diff --git a/_static/scripts/furo.js.map b/_static/scripts/furo.js.map new file mode 100644 index 00000000..80ea12b8 --- /dev/null +++ b/_static/scripts/furo.js.map @@ -0,0 +1 @@ 
+{"version":3,"file":"scripts/furo.js","mappings":";iCAAA,MAQWA,SAWS,IAAX,EAAAC,EACH,EAAAA,EACkB,oBAAXC,OACLA,OACAC,KAbO,EAAF,WACP,OAaJ,SAAUD,GACR,aAMA,IAAIE,EAAW,CAEbC,SAAU,SACVC,aAAc,SAGdC,QAAQ,EACRC,YAAa,SAGbC,OAAQ,EACRC,QAAQ,EAGRC,QAAQ,GA6BNC,EAAY,SAAUC,EAAMC,EAAMC,GAEpC,GAAKA,EAAOC,SAASL,OAArB,CAGA,IAAIM,EAAQ,IAAIC,YAAYL,EAAM,CAChCM,SAAS,EACTC,YAAY,EACZL,OAAQA,IAIVD,EAAKO,cAAcJ,EAVgB,CAWrC,EAOIK,EAAe,SAAUR,GAC3B,IAAIS,EAAW,EACf,GAAIT,EAAKU,aACP,KAAOV,GACLS,GAAYT,EAAKW,UACjBX,EAAOA,EAAKU,aAGhB,OAAOD,GAAY,EAAIA,EAAW,CACpC,EAMIG,EAAe,SAAUC,GACvBA,GACFA,EAASC,MAAK,SAAUC,EAAOC,GAG7B,OAFcR,EAAaO,EAAME,SACnBT,EAAaQ,EAAMC,UACF,EACxB,CACT,GAEJ,EAwCIC,EAAW,SAAUlB,EAAME,EAAUiB,GACvC,IAAIC,EAASpB,EAAKqB,wBACd1B,EAnCU,SAAUO,GAExB,MAA+B,mBAApBA,EAASP,OACX2B,WAAWpB,EAASP,UAItB2B,WAAWpB,EAASP,OAC7B,CA2Be4B,CAAUrB,GACvB,OAAIiB,EAEAK,SAASJ,EAAOD,OAAQ,KACvB/B,EAAOqC,aAAeC,SAASC,gBAAgBC,cAG7CJ,SAASJ,EAAOS,IAAK,KAAOlC,CACrC,EAMImC,EAAa,WACf,OACEC,KAAKC,KAAK5C,EAAOqC,YAAcrC,EAAO6C,cAnCjCF,KAAKG,IACVR,SAASS,KAAKC,aACdV,SAASC,gBAAgBS,aACzBV,SAASS,KAAKE,aACdX,SAASC,gBAAgBU,aACzBX,SAASS,KAAKP,aACdF,SAASC,gBAAgBC,aAkC7B,EAmBIU,EAAY,SAAUzB,EAAUX,GAClC,IAAIqC,EAAO1B,EAASA,EAAS2B,OAAS,GACtC,GAbgB,SAAUC,EAAMvC,GAChC,SAAI4B,MAAgBZ,EAASuB,EAAKxB,QAASf,GAAU,GAEvD,CAUMwC,CAAYH,EAAMrC,GAAW,OAAOqC,EACxC,IAAK,IAAII,EAAI9B,EAAS2B,OAAS,EAAGG,GAAK,EAAGA,IACxC,GAAIzB,EAASL,EAAS8B,GAAG1B,QAASf,GAAW,OAAOW,EAAS8B,EAEjE,EAOIC,EAAmB,SAAUC,EAAK3C,GAEpC,GAAKA,EAAST,QAAWoD,EAAIC,WAA7B,CAGA,IAAIC,EAAKF,EAAIC,WAAWE,QAAQ,MAC3BD,IAGLA,EAAGE,UAAUC,OAAOhD,EAASR,aAG7BkD,EAAiBG,EAAI7C,GAV0B,CAWjD,EAOIiD,EAAa,SAAUC,EAAOlD,GAEhC,GAAKkD,EAAL,CAGA,IAAIL,EAAKK,EAAMP,IAAIG,QAAQ,MACtBD,IAGLA,EAAGE,UAAUC,OAAOhD,EAASX,UAC7B6D,EAAMnC,QAAQgC,UAAUC,OAAOhD,EAASV,cAGxCoD,EAAiBG,EAAI7C,GAGrBJ,EAAU,oBAAqBiD,EAAI,CACjCM,KAAMD,EAAMP,IACZ5B,QAASmC,EAAMnC,QACff,SAAUA,IAjBM,CAmBpB,EAOIoD,EAAiB,SAAUT,EAAK3C,GAElC,GAAKA,EAAST,OAAd,CAGA,IAAIsD,EAAKF,EAAIC,WAAWE,QAAQ,MAC3BD,IAGLA,EAAGE,UAAUM,IAAIrD,EAASR,aAG1B4D,EAAeP,EAAI7C,GAVS,CAW9B,EA6LA,OA1JkB,SAAUsD,EAAUC,GAKpC,IACIC,EAAU7C,EAAU8C,EAASC,EAAS1D,EADtC2D,EAAa,CAUjBA,MAAmB,WAEjBH,EAAWhC,SAASoC,iBAAiBN,GAGrC3C,EAAW,GAGXkD,MAAMC,UAAUC,QAAQC,KAAKR,GAAU,SAAUjB,GAE/C,IAAIxB,EAAUS,SAASyC,eACrBC,mBAAmB3B,EAAK4B,KAAKC,OAAO,KAEjCrD,GAGLJ,EAAS0D,KAAK,CACZ1B,IAAKJ,EACLxB,QAASA,GAEb,IAGAL,EAAaC,EACf,EAKAgD,OAAoB,WAElB,IAAIW,EAASlC,EAAUzB,EAAUX,GAG5BsE,EASDb,GAAWa,EAAOvD,UAAY0C,EAAQ1C,UAG1CkC,EAAWQ,EAASzD,GAzFT,SAAUkD,EAAOlD,GAE9B,GAAKkD,EAAL,CAGA,IAAIL,EAAKK,EAAMP,IAAIG,QAAQ,MACtBD,IAGLA,EAAGE,UAAUM,IAAIrD,EAASX,UAC1B6D,EAAMnC,QAAQgC,UAAUM,IAAIrD,EAASV,cAGrC8D,EAAeP,EAAI7C,GAGnBJ,EAAU,kBAAmBiD,EAAI,CAC/BM,KAAMD,EAAMP,IACZ5B,QAASmC,EAAMnC,QACff,SAAUA,IAjBM,CAmBpB,CAqEIuE,CAASD,EAAQtE,GAGjByD,EAAUa,GAfJb,IACFR,EAAWQ,EAASzD,GACpByD,EAAU,KAchB,GAMIe,EAAgB,SAAUvE,GAExByD,GACFxE,EAAOuF,qBAAqBf,GAI9BA,EAAUxE,EAAOwF,sBAAsBf,EAAWgB,OACpD,EAMIC,EAAgB,SAAU3E,GAExByD,GACFxE,EAAOuF,qBAAqBf,GAI9BA,EAAUxE,EAAOwF,uBAAsB,WACrChE,EAAaC,GACbgD,EAAWgB,QACb,GACF,EAkDA,OA7CAhB,EAAWkB,QAAU,WAEfpB,GACFR,EAAWQ,EAASzD,GAItBd,EAAO4F,oBAAoB,SAAUN,GAAe,GAChDxE,EAASN,QACXR,EAAO4F,oBAAoB,SAAUF,GAAe,GAItDjE,EAAW,KACX6C,EAAW,KACXC,EAAU,KACVC,EAAU,KACV1D,EAAW,IACb,EAOEA,EA3XS,WACX,IAAI+E,EAAS,CAAC,EAOd,OANAlB,MAAMC,UAAUC,QAAQC,KAAKgB,WAAW,SAAUC,GAChD,IAAK,IAAIC,KAAOD,EAAK,CACnB,IAAKA,EAAIE,eAAeD,GAAM,OAC9BH,EAAOG,GAAOD,EAAIC,EACpB,CACF,IACOH,CACT,CAkXeK,CAAOhG,EAAUmE,GAAW,CAAC,GAGxCI,EAAW0B,QAGX1B,EAAWgB,SAGXzF,EAAOoG,iBAAiB,SAAUd,GAAe,GAC7CxE,EAASN,QACXR,EAAOoG,iBAAiB,SAAUV,GAAe,GAS9CjB,CACT,CAOF,CArcW4B,CAAQvG,EAChB,UAFM,SAEN,uBCXDwG,EAA2B,CAAC,EAGhC,SAASC,E
AAoBC,GAE5B,IAAIC,EAAeH,EAAyBE,GAC5C,QAAqBE,IAAjBD,EACH,OAAOA,EAAaE,QAGrB,IAAIC,EAASN,EAAyBE,GAAY,CAGjDG,QAAS,CAAC,GAOX,OAHAE,EAAoBL,GAAU1B,KAAK8B,EAAOD,QAASC,EAAQA,EAAOD,QAASJ,GAGpEK,EAAOD,OACf,CCrBAJ,EAAoBO,EAAKF,IACxB,IAAIG,EAASH,GAAUA,EAAOI,WAC7B,IAAOJ,EAAiB,QACxB,IAAM,EAEP,OADAL,EAAoBU,EAAEF,EAAQ,CAAEG,EAAGH,IAC5BA,CAAM,ECLdR,EAAoBU,EAAI,CAACN,EAASQ,KACjC,IAAI,IAAInB,KAAOmB,EACXZ,EAAoBa,EAAED,EAAYnB,KAASO,EAAoBa,EAAET,EAASX,IAC5EqB,OAAOC,eAAeX,EAASX,EAAK,CAAEuB,YAAY,EAAMC,IAAKL,EAAWnB,IAE1E,ECNDO,EAAoBxG,EAAI,WACvB,GAA0B,iBAAf0H,WAAyB,OAAOA,WAC3C,IACC,OAAOxH,MAAQ,IAAIyH,SAAS,cAAb,EAChB,CAAE,MAAOC,GACR,GAAsB,iBAAX3H,OAAqB,OAAOA,MACxC,CACA,CAPuB,GCAxBuG,EAAoBa,EAAI,CAACrB,EAAK6B,IAAUP,OAAOzC,UAAUqB,eAAenB,KAAKiB,EAAK6B,4CCK9EC,EAAY,KACZC,EAAS,KACTC,EAAgBzF,SAASC,gBAAgByF,UAC7C,MAAMC,EAAmB,GA8EzB,SAASC,IACP,MAAMC,EAAeC,aAAaC,QAAQ,UAAY,OAZxD,IAAkBC,EACH,WADGA,EAaItI,OAAOuI,WAAW,gCAAgCC,QAI/C,SAAjBL,EACO,QACgB,SAAhBA,EACA,OAEA,OAIU,SAAjBA,EACO,OACgB,QAAhBA,EACA,QAEA,SA9BoB,SAATG,GAA4B,SAATA,IACzCG,QAAQC,MAAM,2BAA2BJ,yBACzCA,EAAO,QAGThG,SAASS,KAAK4F,QAAQC,MAAQN,EAC9BF,aAAaS,QAAQ,QAASP,GAC9BG,QAAQK,IAAI,cAAcR,UA0B5B,CAkDA,SAASnC,KART,WAEE,MAAM4C,EAAUzG,SAAS0G,uBAAuB,gBAChDrE,MAAMsE,KAAKF,GAASlE,SAASqE,IAC3BA,EAAI9C,iBAAiB,QAAS8B,EAAe,GAEjD,CAGEiB,GA9CF,WAEE,IAAIC,EAA6B,EAC7BC,GAAU,EAEdrJ,OAAOoG,iBAAiB,UAAU,SAAUuB,GAC1CyB,EAA6BpJ,OAAOsJ,QAE/BD,IACHrJ,OAAOwF,uBAAsB,WAzDnC,IAAuB+D,GAxDvB,SAAgCA,GAC9B,MAAMC,EAAY7G,KAAK8G,MAAM3B,EAAO7F,wBAAwBQ,KAE5DgG,QAAQK,IAAI,cAAcU,KACT,GAAbA,GAAkBD,GAAaC,EACjC1B,EAAOjE,UAAUM,IAAI,YAErB2D,EAAOjE,UAAUC,OAAO,WAE5B,EAgDE4F,CADqBH,EA0DDH,GAvGtB,SAAmCG,GAC7BA,EAAYtB,EACd3F,SAASC,gBAAgBsB,UAAUC,OAAO,oBAEtCyF,EAAYxB,EACdzF,SAASC,gBAAgBsB,UAAUM,IAAI,oBAC9BoF,EAAYxB,GACrBzF,SAASC,gBAAgBsB,UAAUC,OAAO,oBAG9CiE,EAAgBwB,CAClB,CAoCEI,CAA0BJ,GAlC5B,SAA6BA,GACT,OAAd1B,IAKa,GAAb0B,EACF1B,EAAU+B,SAAS,EAAG,GAGtBjH,KAAKC,KAAK2G,IACV5G,KAAK8G,MAAMnH,SAASC,gBAAgBS,aAAehD,OAAOqC,aAE1DwF,EAAU+B,SAAS,EAAG/B,EAAU7E,cAGhBV,SAASuH,cAAc,mBAc3C,CAKEC,CAAoBP,GAwDdF,GAAU,CACZ,IAEAA,GAAU,EAEd,IACArJ,OAAO+J,QACT,CA6BEC,GA1BkB,OAAdnC,GAKJ,IAAI,IAAJ,CAAY,cAAe,CACzBrH,QAAQ,EACRyJ,WAAW,EACX9J,SAAU,iBACVI,OAAQ,KACN,IAAI2J,EAAMhI,WAAWiI,iBAAiB7H,SAASC,iBAAiB6H,UAChE,OAAOtC,EAAO7F,wBAAwBoI,OAAS,IAAMH,EAAM,CAAC,GAiBlE,CAcA5H,SAAS8D,iBAAiB,oBAT1B,WACE9D,SAASS,KAAKW,WAAWG,UAAUC,OAAO,SAE1CgE,EAASxF,SAASuH,cAAc,UAChChC,EAAYvF,SAASuH,cAAc,eAEnC1D,GACF","sources":["webpack:///./src/furo/assets/scripts/gumshoe-patched.js","webpack:///webpack/bootstrap","webpack:///webpack/runtime/compat get default export","webpack:///webpack/runtime/define property getters","webpack:///webpack/runtime/global","webpack:///webpack/runtime/hasOwnProperty shorthand","webpack:///./src/furo/assets/scripts/furo.js"],"sourcesContent":["/*!\n * gumshoejs v5.1.2 (patched by @pradyunsg)\n * A simple, framework-agnostic scrollspy script.\n * (c) 2019 Chris Ferdinandi\n * MIT License\n * http://github.com/cferdinandi/gumshoe\n */\n\n(function (root, factory) {\n if (typeof define === \"function\" && define.amd) {\n define([], function () {\n return factory(root);\n });\n } else if (typeof exports === \"object\") {\n module.exports = factory(root);\n } else {\n root.Gumshoe = factory(root);\n }\n})(\n typeof global !== \"undefined\"\n ? global\n : typeof window !== \"undefined\"\n ? 
window\n : this,\n function (window) {\n \"use strict\";\n\n //\n // Defaults\n //\n\n var defaults = {\n // Active classes\n navClass: \"active\",\n contentClass: \"active\",\n\n // Nested navigation\n nested: false,\n nestedClass: \"active\",\n\n // Offset & reflow\n offset: 0,\n reflow: false,\n\n // Event support\n events: true,\n };\n\n //\n // Methods\n //\n\n /**\n * Merge two or more objects together.\n * @param {Object} objects The objects to merge together\n * @returns {Object} Merged values of defaults and options\n */\n var extend = function () {\n var merged = {};\n Array.prototype.forEach.call(arguments, function (obj) {\n for (var key in obj) {\n if (!obj.hasOwnProperty(key)) return;\n merged[key] = obj[key];\n }\n });\n return merged;\n };\n\n /**\n * Emit a custom event\n * @param {String} type The event type\n * @param {Node} elem The element to attach the event to\n * @param {Object} detail Any details to pass along with the event\n */\n var emitEvent = function (type, elem, detail) {\n // Make sure events are enabled\n if (!detail.settings.events) return;\n\n // Create a new event\n var event = new CustomEvent(type, {\n bubbles: true,\n cancelable: true,\n detail: detail,\n });\n\n // Dispatch the event\n elem.dispatchEvent(event);\n };\n\n /**\n * Get an element's distance from the top of the Document.\n * @param {Node} elem The element\n * @return {Number} Distance from the top in pixels\n */\n var getOffsetTop = function (elem) {\n var location = 0;\n if (elem.offsetParent) {\n while (elem) {\n location += elem.offsetTop;\n elem = elem.offsetParent;\n }\n }\n return location >= 0 ? location : 0;\n };\n\n /**\n * Sort content from first to last in the DOM\n * @param {Array} contents The content areas\n */\n var sortContents = function (contents) {\n if (contents) {\n contents.sort(function (item1, item2) {\n var offset1 = getOffsetTop(item1.content);\n var offset2 = getOffsetTop(item2.content);\n if (offset1 < offset2) return -1;\n return 1;\n });\n }\n };\n\n /**\n * Get the offset to use for calculating position\n * @param {Object} settings The settings for this instantiation\n * @return {Float} The number of pixels to offset the calculations\n */\n var getOffset = function (settings) {\n // if the offset is a function run it\n if (typeof settings.offset === \"function\") {\n return parseFloat(settings.offset());\n }\n\n // Otherwise, return it as-is\n return parseFloat(settings.offset);\n };\n\n /**\n * Get the document element's height\n * @private\n * @returns {Number}\n */\n var getDocumentHeight = function () {\n return Math.max(\n document.body.scrollHeight,\n document.documentElement.scrollHeight,\n document.body.offsetHeight,\n document.documentElement.offsetHeight,\n document.body.clientHeight,\n document.documentElement.clientHeight,\n );\n };\n\n /**\n * Determine if an element is in view\n * @param {Node} elem The element\n * @param {Object} settings The settings for this instantiation\n * @param {Boolean} bottom If true, check if element is above bottom of viewport instead\n * @return {Boolean} Returns true if element is in the viewport\n */\n var isInView = function (elem, settings, bottom) {\n var bounds = elem.getBoundingClientRect();\n var offset = getOffset(settings);\n if (bottom) {\n return (\n parseInt(bounds.bottom, 10) <\n (window.innerHeight || document.documentElement.clientHeight)\n );\n }\n return parseInt(bounds.top, 10) <= offset;\n };\n\n /**\n * Check if at the bottom of the viewport\n * @return {Boolean} If true, page is at the bottom 
of the viewport\n */\n var isAtBottom = function () {\n if (\n Math.ceil(window.innerHeight + window.pageYOffset) >=\n getDocumentHeight()\n )\n return true;\n return false;\n };\n\n /**\n * Check if the last item should be used (even if not at the top of the page)\n * @param {Object} item The last item\n * @param {Object} settings The settings for this instantiation\n * @return {Boolean} If true, use the last item\n */\n var useLastItem = function (item, settings) {\n if (isAtBottom() && isInView(item.content, settings, true)) return true;\n return false;\n };\n\n /**\n * Get the active content\n * @param {Array} contents The content areas\n * @param {Object} settings The settings for this instantiation\n * @return {Object} The content area and matching navigation link\n */\n var getActive = function (contents, settings) {\n var last = contents[contents.length - 1];\n if (useLastItem(last, settings)) return last;\n for (var i = contents.length - 1; i >= 0; i--) {\n if (isInView(contents[i].content, settings)) return contents[i];\n }\n };\n\n /**\n * Deactivate parent navs in a nested navigation\n * @param {Node} nav The starting navigation element\n * @param {Object} settings The settings for this instantiation\n */\n var deactivateNested = function (nav, settings) {\n // If nesting isn't activated, bail\n if (!settings.nested || !nav.parentNode) return;\n\n // Get the parent navigation\n var li = nav.parentNode.closest(\"li\");\n if (!li) return;\n\n // Remove the active class\n li.classList.remove(settings.nestedClass);\n\n // Apply recursively to any parent navigation elements\n deactivateNested(li, settings);\n };\n\n /**\n * Deactivate a nav and content area\n * @param {Object} items The nav item and content to deactivate\n * @param {Object} settings The settings for this instantiation\n */\n var deactivate = function (items, settings) {\n // Make sure there are items to deactivate\n if (!items) return;\n\n // Get the parent list item\n var li = items.nav.closest(\"li\");\n if (!li) return;\n\n // Remove the active class from the nav and content\n li.classList.remove(settings.navClass);\n items.content.classList.remove(settings.contentClass);\n\n // Deactivate any parent navs in a nested navigation\n deactivateNested(li, settings);\n\n // Emit a custom event\n emitEvent(\"gumshoeDeactivate\", li, {\n link: items.nav,\n content: items.content,\n settings: settings,\n });\n };\n\n /**\n * Activate parent navs in a nested navigation\n * @param {Node} nav The starting navigation element\n * @param {Object} settings The settings for this instantiation\n */\n var activateNested = function (nav, settings) {\n // If nesting isn't activated, bail\n if (!settings.nested) return;\n\n // Get the parent navigation\n var li = nav.parentNode.closest(\"li\");\n if (!li) return;\n\n // Add the active class\n li.classList.add(settings.nestedClass);\n\n // Apply recursively to any parent navigation elements\n activateNested(li, settings);\n };\n\n /**\n * Activate a nav and content area\n * @param {Object} items The nav item and content to activate\n * @param {Object} settings The settings for this instantiation\n */\n var activate = function (items, settings) {\n // Make sure there are items to activate\n if (!items) return;\n\n // Get the parent list item\n var li = items.nav.closest(\"li\");\n if (!li) return;\n\n // Add the active class to the nav and content\n li.classList.add(settings.navClass);\n items.content.classList.add(settings.contentClass);\n\n // Activate any parent navs in a nested 
navigation\n activateNested(li, settings);\n\n // Emit a custom event\n emitEvent(\"gumshoeActivate\", li, {\n link: items.nav,\n content: items.content,\n settings: settings,\n });\n };\n\n /**\n * Create the Constructor object\n * @param {String} selector The selector to use for navigation items\n * @param {Object} options User options and settings\n */\n var Constructor = function (selector, options) {\n //\n // Variables\n //\n\n var publicAPIs = {};\n var navItems, contents, current, timeout, settings;\n\n //\n // Methods\n //\n\n /**\n * Set variables from DOM elements\n */\n publicAPIs.setup = function () {\n // Get all nav items\n navItems = document.querySelectorAll(selector);\n\n // Create contents array\n contents = [];\n\n // Loop through each item, get it's matching content, and push to the array\n Array.prototype.forEach.call(navItems, function (item) {\n // Get the content for the nav item\n var content = document.getElementById(\n decodeURIComponent(item.hash.substr(1)),\n );\n if (!content) return;\n\n // Push to the contents array\n contents.push({\n nav: item,\n content: content,\n });\n });\n\n // Sort contents by the order they appear in the DOM\n sortContents(contents);\n };\n\n /**\n * Detect which content is currently active\n */\n publicAPIs.detect = function () {\n // Get the active content\n var active = getActive(contents, settings);\n\n // if there's no active content, deactivate and bail\n if (!active) {\n if (current) {\n deactivate(current, settings);\n current = null;\n }\n return;\n }\n\n // If the active content is the one currently active, do nothing\n if (current && active.content === current.content) return;\n\n // Deactivate the current content and activate the new content\n deactivate(current, settings);\n activate(active, settings);\n\n // Update the currently active content\n current = active;\n };\n\n /**\n * Detect the active content on scroll\n * Debounced for performance\n */\n var scrollHandler = function (event) {\n // If there's a timer, cancel it\n if (timeout) {\n window.cancelAnimationFrame(timeout);\n }\n\n // Setup debounce callback\n timeout = window.requestAnimationFrame(publicAPIs.detect);\n };\n\n /**\n * Update content sorting on resize\n * Debounced for performance\n */\n var resizeHandler = function (event) {\n // If there's a timer, cancel it\n if (timeout) {\n window.cancelAnimationFrame(timeout);\n }\n\n // Setup debounce callback\n timeout = window.requestAnimationFrame(function () {\n sortContents(contents);\n publicAPIs.detect();\n });\n };\n\n /**\n * Destroy the current instantiation\n */\n publicAPIs.destroy = function () {\n // Undo DOM changes\n if (current) {\n deactivate(current, settings);\n }\n\n // Remove event listeners\n window.removeEventListener(\"scroll\", scrollHandler, false);\n if (settings.reflow) {\n window.removeEventListener(\"resize\", resizeHandler, false);\n }\n\n // Reset variables\n contents = null;\n navItems = null;\n current = null;\n timeout = null;\n settings = null;\n };\n\n /**\n * Initialize the current instantiation\n */\n var init = function () {\n // Merge user options into defaults\n settings = extend(defaults, options || {});\n\n // Setup variables based on the current DOM\n publicAPIs.setup();\n\n // Find the currently active content\n publicAPIs.detect();\n\n // Setup event listeners\n window.addEventListener(\"scroll\", scrollHandler, false);\n if (settings.reflow) {\n window.addEventListener(\"resize\", resizeHandler, false);\n }\n };\n\n //\n // Initialize and return the public 
APIs\n //\n\n init();\n return publicAPIs;\n };\n\n //\n // Return the Constructor\n //\n\n return Constructor;\n },\n);\n","// The module cache\nvar __webpack_module_cache__ = {};\n\n// The require function\nfunction __webpack_require__(moduleId) {\n\t// Check if module is in cache\n\tvar cachedModule = __webpack_module_cache__[moduleId];\n\tif (cachedModule !== undefined) {\n\t\treturn cachedModule.exports;\n\t}\n\t// Create a new module (and put it into the cache)\n\tvar module = __webpack_module_cache__[moduleId] = {\n\t\t// no module.id needed\n\t\t// no module.loaded needed\n\t\texports: {}\n\t};\n\n\t// Execute the module function\n\t__webpack_modules__[moduleId].call(module.exports, module, module.exports, __webpack_require__);\n\n\t// Return the exports of the module\n\treturn module.exports;\n}\n\n","// getDefaultExport function for compatibility with non-harmony modules\n__webpack_require__.n = (module) => {\n\tvar getter = module && module.__esModule ?\n\t\t() => (module['default']) :\n\t\t() => (module);\n\t__webpack_require__.d(getter, { a: getter });\n\treturn getter;\n};","// define getter functions for harmony exports\n__webpack_require__.d = (exports, definition) => {\n\tfor(var key in definition) {\n\t\tif(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n\t\t\tObject.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n\t\t}\n\t}\n};","__webpack_require__.g = (function() {\n\tif (typeof globalThis === 'object') return globalThis;\n\ttry {\n\t\treturn this || new Function('return this')();\n\t} catch (e) {\n\t\tif (typeof window === 'object') return window;\n\t}\n})();","__webpack_require__.o = (obj, prop) => (Object.prototype.hasOwnProperty.call(obj, prop))","import Gumshoe from \"./gumshoe-patched.js\";\n\n////////////////////////////////////////////////////////////////////////////////\n// Scroll Handling\n////////////////////////////////////////////////////////////////////////////////\nvar tocScroll = null;\nvar header = null;\nvar lastScrollTop = document.documentElement.scrollTop;\nconst GO_TO_TOP_OFFSET = 64;\n\nfunction scrollHandlerForHeader(positionY) {\n const headerTop = Math.floor(header.getBoundingClientRect().top);\n\n console.log(`headerTop: ${headerTop}`);\n if (headerTop == 0 && positionY != headerTop) {\n header.classList.add(\"scrolled\");\n } else {\n header.classList.remove(\"scrolled\");\n }\n}\n\nfunction scrollHandlerForBackToTop(positionY) {\n if (positionY < GO_TO_TOP_OFFSET) {\n document.documentElement.classList.remove(\"show-back-to-top\");\n } else {\n if (positionY < lastScrollTop) {\n document.documentElement.classList.add(\"show-back-to-top\");\n } else if (positionY > lastScrollTop) {\n document.documentElement.classList.remove(\"show-back-to-top\");\n }\n }\n lastScrollTop = positionY;\n}\n\nfunction scrollHandlerForTOC(positionY) {\n if (tocScroll === null) {\n return;\n }\n\n // top of page.\n if (positionY == 0) {\n tocScroll.scrollTo(0, 0);\n } else if (\n // bottom of page.\n Math.ceil(positionY) >=\n Math.floor(document.documentElement.scrollHeight - window.innerHeight)\n ) {\n tocScroll.scrollTo(0, tocScroll.scrollHeight);\n } else {\n // somewhere in the middle.\n const current = document.querySelector(\".scroll-current\");\n if (current == null) {\n return;\n }\n\n // https://github.com/pypa/pip/issues/9159 This breaks scroll behaviours.\n // // scroll the currently \"active\" heading in toc, into view.\n // const rect = current.getBoundingClientRect();\n // if (0 > 
rect.top) {\n // current.scrollIntoView(true); // the argument is \"alignTop\"\n // } else if (rect.bottom > window.innerHeight) {\n // current.scrollIntoView(false);\n // }\n }\n}\n\nfunction scrollHandler(positionY) {\n scrollHandlerForHeader(positionY);\n scrollHandlerForBackToTop(positionY);\n scrollHandlerForTOC(positionY);\n}\n\n////////////////////////////////////////////////////////////////////////////////\n// Theme Toggle\n////////////////////////////////////////////////////////////////////////////////\nfunction setTheme(mode) {\n if (mode !== \"light\" && mode !== \"dark\" && mode !== \"auto\") {\n console.error(`Got invalid theme mode: ${mode}. Resetting to auto.`);\n mode = \"auto\";\n }\n\n document.body.dataset.theme = mode;\n localStorage.setItem(\"theme\", mode);\n console.log(`Changed to ${mode} mode.`);\n}\n\nfunction cycleThemeOnce() {\n const currentTheme = localStorage.getItem(\"theme\") || \"auto\";\n const prefersDark = window.matchMedia(\"(prefers-color-scheme: dark)\").matches;\n\n if (prefersDark) {\n // Auto (dark) -> Light -> Dark\n if (currentTheme === \"auto\") {\n setTheme(\"light\");\n } else if (currentTheme == \"light\") {\n setTheme(\"dark\");\n } else {\n setTheme(\"auto\");\n }\n } else {\n // Auto (light) -> Dark -> Light\n if (currentTheme === \"auto\") {\n setTheme(\"dark\");\n } else if (currentTheme == \"dark\") {\n setTheme(\"light\");\n } else {\n setTheme(\"auto\");\n }\n }\n}\n\n////////////////////////////////////////////////////////////////////////////////\n// Setup\n////////////////////////////////////////////////////////////////////////////////\nfunction setupScrollHandler() {\n // Taken from https://developer.mozilla.org/en-US/docs/Web/API/Document/scroll_event\n let last_known_scroll_position = 0;\n let ticking = false;\n\n window.addEventListener(\"scroll\", function (e) {\n last_known_scroll_position = window.scrollY;\n\n if (!ticking) {\n window.requestAnimationFrame(function () {\n scrollHandler(last_known_scroll_position);\n ticking = false;\n });\n\n ticking = true;\n }\n });\n window.scroll();\n}\n\nfunction setupScrollSpy() {\n if (tocScroll === null) {\n return;\n }\n\n // Scrollspy -- highlight table on contents, based on scroll\n new Gumshoe(\".toc-tree a\", {\n reflow: true,\n recursive: true,\n navClass: \"scroll-current\",\n offset: () => {\n let rem = parseFloat(getComputedStyle(document.documentElement).fontSize);\n return header.getBoundingClientRect().height + 2.5 * rem + 1;\n },\n });\n}\n\nfunction setupTheme() {\n // Attach event handlers for toggling themes\n const buttons = document.getElementsByClassName(\"theme-toggle\");\n Array.from(buttons).forEach((btn) => {\n btn.addEventListener(\"click\", cycleThemeOnce);\n });\n}\n\nfunction setup() {\n setupTheme();\n setupScrollHandler();\n setupScrollSpy();\n}\n\n////////////////////////////////////////////////////////////////////////////////\n// Main entrypoint\n////////////////////////////////////////////////////////////////////////////////\nfunction main() {\n document.body.parentNode.classList.remove(\"no-js\");\n\n header = document.querySelector(\"header\");\n tocScroll = document.querySelector(\".toc-scroll\");\n\n setup();\n}\n\ndocument.addEventListener(\"DOMContentLoaded\", 
main);\n"],"names":["root","g","window","this","defaults","navClass","contentClass","nested","nestedClass","offset","reflow","events","emitEvent","type","elem","detail","settings","event","CustomEvent","bubbles","cancelable","dispatchEvent","getOffsetTop","location","offsetParent","offsetTop","sortContents","contents","sort","item1","item2","content","isInView","bottom","bounds","getBoundingClientRect","parseFloat","getOffset","parseInt","innerHeight","document","documentElement","clientHeight","top","isAtBottom","Math","ceil","pageYOffset","max","body","scrollHeight","offsetHeight","getActive","last","length","item","useLastItem","i","deactivateNested","nav","parentNode","li","closest","classList","remove","deactivate","items","link","activateNested","add","selector","options","navItems","current","timeout","publicAPIs","querySelectorAll","Array","prototype","forEach","call","getElementById","decodeURIComponent","hash","substr","push","active","activate","scrollHandler","cancelAnimationFrame","requestAnimationFrame","detect","resizeHandler","destroy","removeEventListener","merged","arguments","obj","key","hasOwnProperty","extend","setup","addEventListener","factory","__webpack_module_cache__","__webpack_require__","moduleId","cachedModule","undefined","exports","module","__webpack_modules__","n","getter","__esModule","d","a","definition","o","Object","defineProperty","enumerable","get","globalThis","Function","e","prop","tocScroll","header","lastScrollTop","scrollTop","GO_TO_TOP_OFFSET","cycleThemeOnce","currentTheme","localStorage","getItem","mode","matchMedia","matches","console","error","dataset","theme","setItem","log","buttons","getElementsByClassName","from","btn","setupTheme","last_known_scroll_position","ticking","scrollY","positionY","headerTop","floor","scrollHandlerForHeader","scrollHandlerForBackToTop","scrollTo","querySelector","scrollHandlerForTOC","scroll","setupScrollHandler","recursive","rem","getComputedStyle","fontSize","height"],"sourceRoot":""} \ No newline at end of file diff --git a/_static/searchtools.js b/_static/searchtools.js new file mode 100644 index 00000000..2c774d17 --- /dev/null +++ b/_static/searchtools.js @@ -0,0 +1,632 @@ +/* + * Sphinx JavaScript utilities for the full-text search. + */ +"use strict"; + +/** + * Simple result scoring code. + */ +if (typeof Scorer === "undefined") { + var Scorer = { + // Implement the following function to further tweak the score for each result + // The function takes a result array [docname, title, anchor, descr, score, filename] + // and returns the new score. + /* + score: result => { + const [docname, title, anchor, descr, score, filename, kind] = result + return score + }, + */ + + // query matches the full name of an object + objNameMatch: 11, + // or matches in the last dotted part of the object name + objPartialMatch: 6, + // Additive scores depending on the priority of the object + objPrio: { + 0: 15, // used to be importantResults + 1: 5, // used to be objectResults + 2: -5, // used to be unimportantResults + }, + // Used when the priority is not in the mapping. + objPrioDefault: 0, + + // query found in title + title: 15, + partialTitle: 7, + // query found in terms + term: 5, + partialTerm: 2, + }; +} + +// Global search result kind enum, used by themes to style search results. 
+class SearchResultKind { + static get index() { return "index"; } + static get object() { return "object"; } + static get text() { return "text"; } + static get title() { return "title"; } +} + +const _removeChildren = (element) => { + while (element && element.lastChild) element.removeChild(element.lastChild); +}; + +/** + * See https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions#escaping + */ +const _escapeRegExp = (string) => + string.replace(/[.*+\-?^${}()|[\]\\]/g, "\\$&"); // $& means the whole matched string + +const _displayItem = (item, searchTerms, highlightTerms) => { + const docBuilder = DOCUMENTATION_OPTIONS.BUILDER; + const docFileSuffix = DOCUMENTATION_OPTIONS.FILE_SUFFIX; + const docLinkSuffix = DOCUMENTATION_OPTIONS.LINK_SUFFIX; + const showSearchSummary = DOCUMENTATION_OPTIONS.SHOW_SEARCH_SUMMARY; + const contentRoot = document.documentElement.dataset.content_root; + + const [docName, title, anchor, descr, score, _filename, kind] = item; + + let listItem = document.createElement("li"); + // Add a class representing the item's type: + // can be used by a theme's CSS selector for styling + // See SearchResultKind for the class names. + listItem.classList.add(`kind-${kind}`); + let requestUrl; + let linkUrl; + if (docBuilder === "dirhtml") { + // dirhtml builder + let dirname = docName + "/"; + if (dirname.match(/\/index\/$/)) + dirname = dirname.substring(0, dirname.length - 6); + else if (dirname === "index/") dirname = ""; + requestUrl = contentRoot + dirname; + linkUrl = requestUrl; + } else { + // normal html builders + requestUrl = contentRoot + docName + docFileSuffix; + linkUrl = docName + docLinkSuffix; + } + let linkEl = listItem.appendChild(document.createElement("a")); + linkEl.href = linkUrl + anchor; + linkEl.dataset.score = score; + linkEl.innerHTML = title; + if (descr) { + listItem.appendChild(document.createElement("span")).innerHTML = + " (" + descr + ")"; + // highlight search terms in the description + if (SPHINX_HIGHLIGHT_ENABLED) // set in sphinx_highlight.js + highlightTerms.forEach((term) => _highlightText(listItem, term, "highlighted")); + } + else if (showSearchSummary) + fetch(requestUrl) + .then((responseData) => responseData.text()) + .then((data) => { + if (data) + listItem.appendChild( + Search.makeSearchSummary(data, searchTerms, anchor) + ); + // highlight search terms in the summary + if (SPHINX_HIGHLIGHT_ENABLED) // set in sphinx_highlight.js + highlightTerms.forEach((term) => _highlightText(listItem, term, "highlighted")); + }); + Search.output.appendChild(listItem); +}; +const _finishSearch = (resultCount) => { + Search.stopPulse(); + Search.title.innerText = _("Search Results"); + if (!resultCount) + Search.status.innerText = Documentation.gettext( + "Your search did not match any documents. Please make sure that all words are spelled correctly and that you've selected enough categories." 
+ ); + else + Search.status.innerText = Documentation.ngettext( + "Search finished, found one page matching the search query.", + "Search finished, found ${resultCount} pages matching the search query.", + resultCount, + ).replace('${resultCount}', resultCount); +}; +const _displayNextItem = ( + results, + resultCount, + searchTerms, + highlightTerms, +) => { + // results left, load the summary and display it + // this is intended to be dynamic (don't sub resultsCount) + if (results.length) { + _displayItem(results.pop(), searchTerms, highlightTerms); + setTimeout( + () => _displayNextItem(results, resultCount, searchTerms, highlightTerms), + 5 + ); + } + // search finished, update title and status message + else _finishSearch(resultCount); +}; +// Helper function used by query() to order search results. +// Each input is an array of [docname, title, anchor, descr, score, filename, kind]. +// Order the results by score (in opposite order of appearance, since the +// `_displayNextItem` function uses pop() to retrieve items) and then alphabetically. +const _orderResultsByScoreThenName = (a, b) => { + const leftScore = a[4]; + const rightScore = b[4]; + if (leftScore === rightScore) { + // same score: sort alphabetically + const leftTitle = a[1].toLowerCase(); + const rightTitle = b[1].toLowerCase(); + if (leftTitle === rightTitle) return 0; + return leftTitle > rightTitle ? -1 : 1; // inverted is intentional + } + return leftScore > rightScore ? 1 : -1; +}; + +/** + * Default splitQuery function. Can be overridden in ``sphinx.search`` with a + * custom function per language. + * + * The regular expression works by splitting the string on consecutive characters + * that are not Unicode letters, numbers, underscores, or emoji characters. + * This is the same as ``\W+`` in Python, preserving the surrogate pair area. + */ +if (typeof splitQuery === "undefined") { + var splitQuery = (query) => query + .split(/[^\p{Letter}\p{Number}_\p{Emoji_Presentation}]+/gu) + .filter(term => term) // remove remaining empty strings +} + +/** + * Search Module + */ +const Search = { + _index: null, + _queued_query: null, + _pulse_status: -1, + + htmlToText: (htmlString, anchor) => { + const htmlElement = new DOMParser().parseFromString(htmlString, 'text/html'); + for (const removalQuery of [".headerlink", "script", "style"]) { + htmlElement.querySelectorAll(removalQuery).forEach((el) => { el.remove() }); + } + if (anchor) { + const anchorContent = htmlElement.querySelector(`[role="main"] ${anchor}`); + if (anchorContent) return anchorContent.textContent; + + console.warn( + `Anchored content block not found. Sphinx search tries to obtain it via DOM query '[role=main] ${anchor}'. Check your theme or template.` + ); + } + + // if anchor not specified or not found, fall back to main content + const docContent = htmlElement.querySelector('[role="main"]'); + if (docContent) return docContent.textContent; + + console.warn( + "Content block not found. Sphinx search tries to obtain it via DOM query '[role=main]'. Check your theme or template." 
+ ); + return ""; + }, + + init: () => { + const query = new URLSearchParams(window.location.search).get("q"); + document + .querySelectorAll('input[name="q"]') + .forEach((el) => (el.value = query)); + if (query) Search.performSearch(query); + }, + + loadIndex: (url) => + (document.body.appendChild(document.createElement("script")).src = url), + + setIndex: (index) => { + Search._index = index; + if (Search._queued_query !== null) { + const query = Search._queued_query; + Search._queued_query = null; + Search.query(query); + } + }, + + hasIndex: () => Search._index !== null, + + deferQuery: (query) => (Search._queued_query = query), + + stopPulse: () => (Search._pulse_status = -1), + + startPulse: () => { + if (Search._pulse_status >= 0) return; + + const pulse = () => { + Search._pulse_status = (Search._pulse_status + 1) % 4; + Search.dots.innerText = ".".repeat(Search._pulse_status); + if (Search._pulse_status >= 0) window.setTimeout(pulse, 500); + }; + pulse(); + }, + + /** + * perform a search for something (or wait until index is loaded) + */ + performSearch: (query) => { + // create the required interface elements + const searchText = document.createElement("h2"); + searchText.textContent = _("Searching"); + const searchSummary = document.createElement("p"); + searchSummary.classList.add("search-summary"); + searchSummary.innerText = ""; + const searchList = document.createElement("ul"); + searchList.setAttribute("role", "list"); + searchList.classList.add("search"); + + const out = document.getElementById("search-results"); + Search.title = out.appendChild(searchText); + Search.dots = Search.title.appendChild(document.createElement("span")); + Search.status = out.appendChild(searchSummary); + Search.output = out.appendChild(searchList); + + const searchProgress = document.getElementById("search-progress"); + // Some themes don't use the search progress node + if (searchProgress) { + searchProgress.innerText = _("Preparing search..."); + } + Search.startPulse(); + + // index already loaded, the browser was quick! 
+ if (Search.hasIndex()) Search.query(query); + else Search.deferQuery(query); + }, + + _parseQuery: (query) => { + // stem the search terms and add them to the correct list + const stemmer = new Stemmer(); + const searchTerms = new Set(); + const excludedTerms = new Set(); + const highlightTerms = new Set(); + const objectTerms = new Set(splitQuery(query.toLowerCase().trim())); + splitQuery(query.trim()).forEach((queryTerm) => { + const queryTermLower = queryTerm.toLowerCase(); + + // maybe skip this "word" + // stopwords array is from language_data.js + if ( + stopwords.indexOf(queryTermLower) !== -1 || + queryTerm.match(/^\d+$/) + ) + return; + + // stem the word + let word = stemmer.stemWord(queryTermLower); + // select the correct list + if (word[0] === "-") excludedTerms.add(word.substr(1)); + else { + searchTerms.add(word); + highlightTerms.add(queryTermLower); + } + }); + + if (SPHINX_HIGHLIGHT_ENABLED) { // set in sphinx_highlight.js + localStorage.setItem("sphinx_highlight_terms", [...highlightTerms].join(" ")) + } + + // console.debug("SEARCH: searching for:"); + // console.info("required: ", [...searchTerms]); + // console.info("excluded: ", [...excludedTerms]); + + return [query, searchTerms, excludedTerms, highlightTerms, objectTerms]; + }, + + /** + * execute search (requires search index to be loaded) + */ + _performSearch: (query, searchTerms, excludedTerms, highlightTerms, objectTerms) => { + const filenames = Search._index.filenames; + const docNames = Search._index.docnames; + const titles = Search._index.titles; + const allTitles = Search._index.alltitles; + const indexEntries = Search._index.indexentries; + + // Collect multiple result groups to be sorted separately and then ordered. + // Each is an array of [docname, title, anchor, descr, score, filename, kind]. + const normalResults = []; + const nonMainIndexResults = []; + + _removeChildren(document.getElementById("search-progress")); + + const queryLower = query.toLowerCase().trim(); + for (const [title, foundTitles] of Object.entries(allTitles)) { + if (title.toLowerCase().trim().includes(queryLower) && (queryLower.length >= title.length/2)) { + for (const [file, id] of foundTitles) { + const score = Math.round(Scorer.title * queryLower.length / title.length); + const boost = titles[file] === title ? 1 : 0; // add a boost for document titles + normalResults.push([ + docNames[file], + titles[file] !== title ? `${titles[file]} > ${title}` : title, + id !== null ? "#" + id : "", + null, + score + boost, + filenames[file], + SearchResultKind.title, + ]); + } + } + } + + // search for explicit entries in index directives + for (const [entry, foundEntries] of Object.entries(indexEntries)) { + if (entry.includes(queryLower) && (queryLower.length >= entry.length/2)) { + for (const [file, id, isMain] of foundEntries) { + const score = Math.round(100 * queryLower.length / entry.length); + const result = [ + docNames[file], + titles[file], + id ? 
"#" + id : "", + null, + score, + filenames[file], + SearchResultKind.index, + ]; + if (isMain) { + normalResults.push(result); + } else { + nonMainIndexResults.push(result); + } + } + } + } + + // lookup as object + objectTerms.forEach((term) => + normalResults.push(...Search.performObjectSearch(term, objectTerms)) + ); + + // lookup as search terms in fulltext + normalResults.push(...Search.performTermsSearch(searchTerms, excludedTerms)); + + // let the scorer override scores with a custom scoring function + if (Scorer.score) { + normalResults.forEach((item) => (item[4] = Scorer.score(item))); + nonMainIndexResults.forEach((item) => (item[4] = Scorer.score(item))); + } + + // Sort each group of results by score and then alphabetically by name. + normalResults.sort(_orderResultsByScoreThenName); + nonMainIndexResults.sort(_orderResultsByScoreThenName); + + // Combine the result groups in (reverse) order. + // Non-main index entries are typically arbitrary cross-references, + // so display them after other results. + let results = [...nonMainIndexResults, ...normalResults]; + + // remove duplicate search results + // note the reversing of results, so that in the case of duplicates, the highest-scoring entry is kept + let seen = new Set(); + results = results.reverse().reduce((acc, result) => { + let resultStr = result.slice(0, 4).concat([result[5]]).map(v => String(v)).join(','); + if (!seen.has(resultStr)) { + acc.push(result); + seen.add(resultStr); + } + return acc; + }, []); + + return results.reverse(); + }, + + query: (query) => { + const [searchQuery, searchTerms, excludedTerms, highlightTerms, objectTerms] = Search._parseQuery(query); + const results = Search._performSearch(searchQuery, searchTerms, excludedTerms, highlightTerms, objectTerms); + + // for debugging + //Search.lastresults = results.slice(); // a copy + // console.info("search results:", Search.lastresults); + + // print the results + _displayNextItem(results, results.length, searchTerms, highlightTerms); + }, + + /** + * search for object names + */ + performObjectSearch: (object, objectTerms) => { + const filenames = Search._index.filenames; + const docNames = Search._index.docnames; + const objects = Search._index.objects; + const objNames = Search._index.objnames; + const titles = Search._index.titles; + + const results = []; + + const objectSearchCallback = (prefix, match) => { + const name = match[4] + const fullname = (prefix ? prefix + "." : "") + name; + const fullnameLower = fullname.toLowerCase(); + if (fullnameLower.indexOf(object) < 0) return; + + let score = 0; + const parts = fullnameLower.split("."); + + // check for different match types: exact matches of full name or + // "last name" (i.e. 
last dotted part) + if (fullnameLower === object || parts.slice(-1)[0] === object) + score += Scorer.objNameMatch; + else if (parts.slice(-1)[0].indexOf(object) > -1) + score += Scorer.objPartialMatch; // matches in last name + + const objName = objNames[match[1]][2]; + const title = titles[match[0]]; + + // If more than one term searched for, we require other words to be + // found in the name/title/description + const otherTerms = new Set(objectTerms); + otherTerms.delete(object); + if (otherTerms.size > 0) { + const haystack = `${prefix} ${name} ${objName} ${title}`.toLowerCase(); + if ( + [...otherTerms].some((otherTerm) => haystack.indexOf(otherTerm) < 0) + ) + return; + } + + let anchor = match[3]; + if (anchor === "") anchor = fullname; + else if (anchor === "-") anchor = objNames[match[1]][1] + "-" + fullname; + + const descr = objName + _(", in ") + title; + + // add custom score for some objects according to scorer + if (Scorer.objPrio.hasOwnProperty(match[2])) + score += Scorer.objPrio[match[2]]; + else score += Scorer.objPrioDefault; + + results.push([ + docNames[match[0]], + fullname, + "#" + anchor, + descr, + score, + filenames[match[0]], + SearchResultKind.object, + ]); + }; + Object.keys(objects).forEach((prefix) => + objects[prefix].forEach((array) => + objectSearchCallback(prefix, array) + ) + ); + return results; + }, + + /** + * search for full-text terms in the index + */ + performTermsSearch: (searchTerms, excludedTerms) => { + // prepare search + const terms = Search._index.terms; + const titleTerms = Search._index.titleterms; + const filenames = Search._index.filenames; + const docNames = Search._index.docnames; + const titles = Search._index.titles; + + const scoreMap = new Map(); + const fileMap = new Map(); + + // perform the search on the required terms + searchTerms.forEach((word) => { + const files = []; + const arr = [ + { files: terms[word], score: Scorer.term }, + { files: titleTerms[word], score: Scorer.title }, + ]; + // add support for partial matches + if (word.length > 2) { + const escapedWord = _escapeRegExp(word); + if (!terms.hasOwnProperty(word)) { + Object.keys(terms).forEach((term) => { + if (term.match(escapedWord)) + arr.push({ files: terms[term], score: Scorer.partialTerm }); + }); + } + if (!titleTerms.hasOwnProperty(word)) { + Object.keys(titleTerms).forEach((term) => { + if (term.match(escapedWord)) + arr.push({ files: titleTerms[term], score: Scorer.partialTitle }); + }); + } + } + + // no match but word was a required one + if (arr.every((record) => record.files === undefined)) return; + + // found search word in contents + arr.forEach((record) => { + if (record.files === undefined) return; + + let recordFiles = record.files; + if (recordFiles.length === undefined) recordFiles = [recordFiles]; + files.push(...recordFiles); + + // set score for the word in each file + recordFiles.forEach((file) => { + if (!scoreMap.has(file)) scoreMap.set(file, {}); + scoreMap.get(file)[word] = record.score; + }); + }); + + // create the mapping + files.forEach((file) => { + if (!fileMap.has(file)) fileMap.set(file, [word]); + else if (fileMap.get(file).indexOf(word) === -1) fileMap.get(file).push(word); + }); + }); + + // now check if the files don't contain excluded terms + const results = []; + for (const [file, wordList] of fileMap) { + // check if all requirements are matched + + // as search terms with length < 3 are discarded + const filteredTermCount = [...searchTerms].filter( + (term) => term.length > 2 + ).length; + if ( + wordList.length !== 
searchTerms.size && + wordList.length !== filteredTermCount + ) + continue; + + // ensure that none of the excluded terms is in the search result + if ( + [...excludedTerms].some( + (term) => + terms[term] === file || + titleTerms[term] === file || + (terms[term] || []).includes(file) || + (titleTerms[term] || []).includes(file) + ) + ) + break; + + // select one (max) score for the file. + const score = Math.max(...wordList.map((w) => scoreMap.get(file)[w])); + // add result to the result list + results.push([ + docNames[file], + titles[file], + "", + null, + score, + filenames[file], + SearchResultKind.text, + ]); + } + return results; + }, + + /** + * helper function to return a node containing the + * search summary for a given text. keywords is a list + * of stemmed words. + */ + makeSearchSummary: (htmlText, keywords, anchor) => { + const text = Search.htmlToText(htmlText, anchor); + if (text === "") return null; + + const textLower = text.toLowerCase(); + const actualStartPosition = [...keywords] + .map((k) => textLower.indexOf(k.toLowerCase())) + .filter((i) => i > -1) + .slice(-1)[0]; + const startWithContext = Math.max(actualStartPosition - 120, 0); + + const top = startWithContext === 0 ? "" : "..."; + const tail = startWithContext + 240 < text.length ? "..." : ""; + + let summary = document.createElement("p"); + summary.classList.add("context"); + summary.textContent = top + text.substr(startWithContext, 240).trim() + tail; + + return summary; + }, +}; + +_ready(Search.init); diff --git a/_static/skeleton.css b/_static/skeleton.css new file mode 100644 index 00000000..467c878c --- /dev/null +++ b/_static/skeleton.css @@ -0,0 +1,296 @@ +/* Some sane resets. */ +html { + height: 100%; +} + +body { + margin: 0; + min-height: 100%; +} + +/* All the flexbox magic! */ +body, +.sb-announcement, +.sb-content, +.sb-main, +.sb-container, +.sb-container__inner, +.sb-article-container, +.sb-footer-content, +.sb-header, +.sb-header-secondary, +.sb-footer { + display: flex; +} + +/* These order things vertically */ +body, +.sb-main, +.sb-article-container { + flex-direction: column; +} + +/* Put elements in the center */ +.sb-header, +.sb-header-secondary, +.sb-container, +.sb-content, +.sb-footer, +.sb-footer-content { + justify-content: center; +} +/* Put elements at the ends */ +.sb-article-container { + justify-content: space-between; +} + +/* These elements grow. */ +.sb-main, +.sb-content, +.sb-container, +article { + flex-grow: 1; +} + +/* Because padding making this wider is not fun */ +article { + box-sizing: border-box; +} + +/* The announcements element should never be wider than the page. 
*/ +.sb-announcement { + max-width: 100%; +} + +.sb-sidebar-primary, +.sb-sidebar-secondary { + flex-shrink: 0; + width: 17rem; +} + +.sb-announcement__inner { + justify-content: center; + + box-sizing: border-box; + height: 3rem; + + overflow-x: auto; + white-space: nowrap; +} + +/* Sidebars, with checkbox-based toggle */ +.sb-sidebar-primary, +.sb-sidebar-secondary { + position: fixed; + height: 100%; + top: 0; +} + +.sb-sidebar-primary { + left: -17rem; + transition: left 250ms ease-in-out; +} +.sb-sidebar-secondary { + right: -17rem; + transition: right 250ms ease-in-out; +} + +.sb-sidebar-toggle { + display: none; +} +.sb-sidebar-overlay { + position: fixed; + top: 0; + width: 0; + height: 0; + + transition: width 0ms ease 250ms, height 0ms ease 250ms, opacity 250ms ease; + + opacity: 0; + background-color: rgba(0, 0, 0, 0.54); +} + +#sb-sidebar-toggle--primary:checked + ~ .sb-sidebar-overlay[for="sb-sidebar-toggle--primary"], +#sb-sidebar-toggle--secondary:checked + ~ .sb-sidebar-overlay[for="sb-sidebar-toggle--secondary"] { + width: 100%; + height: 100%; + opacity: 1; + transition: width 0ms ease, height 0ms ease, opacity 250ms ease; +} + +#sb-sidebar-toggle--primary:checked ~ .sb-container .sb-sidebar-primary { + left: 0; +} +#sb-sidebar-toggle--secondary:checked ~ .sb-container .sb-sidebar-secondary { + right: 0; +} + +/* Full-width mode */ +.drop-secondary-sidebar-for-full-width-content + .hide-when-secondary-sidebar-shown { + display: none !important; +} +.drop-secondary-sidebar-for-full-width-content .sb-sidebar-secondary { + display: none !important; +} + +/* Mobile views */ +.sb-page-width { + width: 100%; +} + +.sb-article-container, +.sb-footer-content__inner, +.drop-secondary-sidebar-for-full-width-content .sb-article, +.drop-secondary-sidebar-for-full-width-content .match-content-width { + width: 100vw; +} + +.sb-article, +.match-content-width { + padding: 0 1rem; + box-sizing: border-box; +} + +@media (min-width: 32rem) { + .sb-article, + .match-content-width { + padding: 0 2rem; + } +} + +/* Tablet views */ +@media (min-width: 42rem) { + .sb-article-container { + width: auto; + } + .sb-footer-content__inner, + .drop-secondary-sidebar-for-full-width-content .sb-article, + .drop-secondary-sidebar-for-full-width-content .match-content-width { + width: 42rem; + } + .sb-article, + .match-content-width { + width: 42rem; + } +} +@media (min-width: 46rem) { + .sb-footer-content__inner, + .drop-secondary-sidebar-for-full-width-content .sb-article, + .drop-secondary-sidebar-for-full-width-content .match-content-width { + width: 46rem; + } + .sb-article, + .match-content-width { + width: 46rem; + } +} +@media (min-width: 50rem) { + .sb-footer-content__inner, + .drop-secondary-sidebar-for-full-width-content .sb-article, + .drop-secondary-sidebar-for-full-width-content .match-content-width { + width: 50rem; + } + .sb-article, + .match-content-width { + width: 50rem; + } +} + +/* Tablet views */ +@media (min-width: 59rem) { + .sb-sidebar-secondary { + position: static; + } + .hide-when-secondary-sidebar-shown { + display: none !important; + } + .sb-footer-content__inner, + .drop-secondary-sidebar-for-full-width-content .sb-article, + .drop-secondary-sidebar-for-full-width-content .match-content-width { + width: 59rem; + } + .sb-article, + .match-content-width { + width: 42rem; + } +} +@media (min-width: 63rem) { + .sb-footer-content__inner, + .drop-secondary-sidebar-for-full-width-content .sb-article, + .drop-secondary-sidebar-for-full-width-content .match-content-width { + width: 
63rem; + } + .sb-article, + .match-content-width { + width: 46rem; + } +} +@media (min-width: 67rem) { + .sb-footer-content__inner, + .drop-secondary-sidebar-for-full-width-content .sb-article, + .drop-secondary-sidebar-for-full-width-content .match-content-width { + width: 67rem; + } + .sb-article, + .match-content-width { + width: 50rem; + } +} + +/* Desktop views */ +@media (min-width: 76rem) { + .sb-sidebar-primary { + position: static; + } + .hide-when-primary-sidebar-shown { + display: none !important; + } + .sb-footer-content__inner, + .drop-secondary-sidebar-for-full-width-content .sb-article, + .drop-secondary-sidebar-for-full-width-content .match-content-width { + width: 59rem; + } + .sb-article, + .match-content-width { + width: 42rem; + } +} + +/* Full desktop views */ +@media (min-width: 80rem) { + .sb-article, + .match-content-width { + width: 46rem; + } + .sb-footer-content__inner, + .drop-secondary-sidebar-for-full-width-content .sb-article, + .drop-secondary-sidebar-for-full-width-content .match-content-width { + width: 63rem; + } +} + +@media (min-width: 84rem) { + .sb-article, + .match-content-width { + width: 50rem; + } + .sb-footer-content__inner, + .drop-secondary-sidebar-for-full-width-content .sb-article, + .drop-secondary-sidebar-for-full-width-content .match-content-width { + width: 67rem; + } +} + +@media (min-width: 88rem) { + .sb-footer-content__inner, + .drop-secondary-sidebar-for-full-width-content .sb-article, + .drop-secondary-sidebar-for-full-width-content .match-content-width { + width: 67rem; + } + .sb-page-width { + width: 88rem; + } +} diff --git a/_static/sphinx_highlight.js b/_static/sphinx_highlight.js new file mode 100644 index 00000000..8a96c69a --- /dev/null +++ b/_static/sphinx_highlight.js @@ -0,0 +1,154 @@ +/* Highlighting utilities for Sphinx HTML documentation. */ +"use strict"; + +const SPHINX_HIGHLIGHT_ENABLED = true + +/** + * highlight a given string on a node by wrapping it in + * span elements with the given class name. + */ +const _highlight = (node, addItems, text, className) => { + if (node.nodeType === Node.TEXT_NODE) { + const val = node.nodeValue; + const parent = node.parentNode; + const pos = val.toLowerCase().indexOf(text); + if ( + pos >= 0 && + !parent.classList.contains(className) && + !parent.classList.contains("nohighlight") + ) { + let span; + + const closestNode = parent.closest("body, svg, foreignObject"); + const isInSVG = closestNode && closestNode.matches("svg"); + if (isInSVG) { + span = document.createElementNS("http://www.w3.org/2000/svg", "tspan"); + } else { + span = document.createElement("span"); + span.classList.add(className); + } + + span.appendChild(document.createTextNode(val.substr(pos, text.length))); + const rest = document.createTextNode(val.substr(pos + text.length)); + parent.insertBefore( + span, + parent.insertBefore( + rest, + node.nextSibling + ) + ); + node.nodeValue = val.substr(0, pos); + /* There may be more occurrences of search term in this node. So call this + * function recursively on the remaining fragment. 
+ */ + _highlight(rest, addItems, text, className); + + if (isInSVG) { + const rect = document.createElementNS( + "http://www.w3.org/2000/svg", + "rect" + ); + const bbox = parent.getBBox(); + rect.x.baseVal.value = bbox.x; + rect.y.baseVal.value = bbox.y; + rect.width.baseVal.value = bbox.width; + rect.height.baseVal.value = bbox.height; + rect.setAttribute("class", className); + addItems.push({ parent: parent, target: rect }); + } + } + } else if (node.matches && !node.matches("button, select, textarea")) { + node.childNodes.forEach((el) => _highlight(el, addItems, text, className)); + } +}; +const _highlightText = (thisNode, text, className) => { + let addItems = []; + _highlight(thisNode, addItems, text, className); + addItems.forEach((obj) => + obj.parent.insertAdjacentElement("beforebegin", obj.target) + ); +}; + +/** + * Small JavaScript module for the documentation. + */ +const SphinxHighlight = { + + /** + * highlight the search words provided in localstorage in the text + */ + highlightSearchWords: () => { + if (!SPHINX_HIGHLIGHT_ENABLED) return; // bail if no highlight + + // get and clear terms from localstorage + const url = new URL(window.location); + const highlight = + localStorage.getItem("sphinx_highlight_terms") + || url.searchParams.get("highlight") + || ""; + localStorage.removeItem("sphinx_highlight_terms") + url.searchParams.delete("highlight"); + window.history.replaceState({}, "", url); + + // get individual terms from highlight string + const terms = highlight.toLowerCase().split(/\s+/).filter(x => x); + if (terms.length === 0) return; // nothing to do + + // There should never be more than one element matching "div.body" + const divBody = document.querySelectorAll("div.body"); + const body = divBody.length ? divBody[0] : document.querySelector("body"); + window.setTimeout(() => { + terms.forEach((term) => _highlightText(body, term, "highlighted")); + }, 10); + + const searchBox = document.getElementById("searchbox"); + if (searchBox === null) return; + searchBox.appendChild( + document + .createRange() + .createContextualFragment( + '<p class="highlight-link">' + + '<a href="javascript:SphinxHighlight.hideSearchWords()">' + + _("Hide Search Matches") + + "</a></p>" + ) + ); + }, + + /** + * helper function to hide the search marks again + */ + hideSearchWords: () => { + document + .querySelectorAll("#searchbox .highlight-link") + .forEach((el) => el.remove()); + document + .querySelectorAll("span.highlighted") + .forEach((el) => el.classList.remove("highlighted")); + localStorage.removeItem("sphinx_highlight_terms") + }, + + initEscapeListener: () => { + // only install a listener if it is really needed + if (!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) return; + + document.addEventListener("keydown", (event) => { + // bail for input elements + if (BLACKLISTED_KEY_CONTROL_ELEMENTS.has(document.activeElement.tagName)) return; + // bail with special keys + if (event.shiftKey || event.altKey || event.ctrlKey || event.metaKey) return; + if (DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS && (event.key === "Escape")) { + SphinxHighlight.hideSearchWords(); + event.preventDefault(); + } + }); + }, +}; + +_ready(() => { + /* Do not call highlightSearchWords() when we are on the search page. + * It will highlight words from the *previous* search query.
+ */ + if (typeof Search === "undefined") SphinxHighlight.highlightSearchWords(); + SphinxHighlight.initEscapeListener(); +}); diff --git a/_static/styles/furo-extensions.css b/_static/styles/furo-extensions.css new file mode 100644 index 00000000..82295876 --- /dev/null +++ b/_static/styles/furo-extensions.css @@ -0,0 +1,2 @@ +#furo-sidebar-ad-placement{padding:var(--sidebar-item-spacing-vertical) var(--sidebar-item-spacing-horizontal)}#furo-sidebar-ad-placement .ethical-sidebar{background:var(--color-background-secondary);border:none;box-shadow:none}#furo-sidebar-ad-placement .ethical-sidebar:hover{background:var(--color-background-hover)}#furo-sidebar-ad-placement .ethical-sidebar a{color:var(--color-foreground-primary)}#furo-sidebar-ad-placement .ethical-callout a{color:var(--color-foreground-secondary)!important}#furo-readthedocs-versions{background:transparent;display:block;position:static;width:100%}#furo-readthedocs-versions .rst-versions{background:#1a1c1e}#furo-readthedocs-versions .rst-current-version{background:var(--color-sidebar-item-background);cursor:unset}#furo-readthedocs-versions .rst-current-version:hover{background:var(--color-sidebar-item-background)}#furo-readthedocs-versions .rst-current-version .fa-book{color:var(--color-foreground-primary)}#furo-readthedocs-versions>.rst-other-versions{padding:0}#furo-readthedocs-versions>.rst-other-versions small{opacity:1}#furo-readthedocs-versions .injected .rst-versions{position:unset}#furo-readthedocs-versions:focus-within,#furo-readthedocs-versions:hover{box-shadow:0 0 0 1px var(--color-sidebar-background-border)}#furo-readthedocs-versions:focus-within .rst-current-version,#furo-readthedocs-versions:hover .rst-current-version{background:#1a1c1e;font-size:inherit;height:auto;line-height:inherit;padding:12px;text-align:right}#furo-readthedocs-versions:focus-within .rst-current-version .fa-book,#furo-readthedocs-versions:hover .rst-current-version .fa-book{color:#fff;float:left}#furo-readthedocs-versions:focus-within .fa-caret-down,#furo-readthedocs-versions:hover .fa-caret-down{display:none}#furo-readthedocs-versions:focus-within .injected,#furo-readthedocs-versions:focus-within .rst-current-version,#furo-readthedocs-versions:focus-within .rst-other-versions,#furo-readthedocs-versions:hover .injected,#furo-readthedocs-versions:hover .rst-current-version,#furo-readthedocs-versions:hover .rst-other-versions{display:block}#furo-readthedocs-versions:focus-within>.rst-current-version,#furo-readthedocs-versions:hover>.rst-current-version{display:none}.highlight:hover button.copybtn{color:var(--color-code-foreground)}.highlight button.copybtn{align-items:center;background-color:var(--color-code-background);border:none;color:var(--color-background-item);cursor:pointer;height:1.25em;right:.5rem;top:.625rem;transition:color .3s,opacity .3s;width:1.25em}.highlight button.copybtn:hover{background-color:var(--color-code-background);color:var(--color-brand-content)}.highlight button.copybtn:after{background-color:transparent;color:var(--color-code-foreground);display:none}.highlight button.copybtn.success{color:#22863a;transition:color 0ms}.highlight button.copybtn.success:after{display:block}.highlight button.copybtn 
svg{padding:0}body{--sd-color-primary:var(--color-brand-primary);--sd-color-primary-highlight:var(--color-brand-content);--sd-color-primary-text:var(--color-background-primary);--sd-color-shadow:rgba(0,0,0,.05);--sd-color-card-border:var(--color-card-border);--sd-color-card-border-hover:var(--color-brand-content);--sd-color-card-background:var(--color-card-background);--sd-color-card-text:var(--color-foreground-primary);--sd-color-card-header:var(--color-card-marginals-background);--sd-color-card-footer:var(--color-card-marginals-background);--sd-color-tabs-label-active:var(--color-brand-content);--sd-color-tabs-label-hover:var(--color-foreground-muted);--sd-color-tabs-label-inactive:var(--color-foreground-muted);--sd-color-tabs-underline-active:var(--color-brand-content);--sd-color-tabs-underline-hover:var(--color-foreground-border);--sd-color-tabs-underline-inactive:var(--color-background-border);--sd-color-tabs-overline:var(--color-background-border);--sd-color-tabs-underline:var(--color-background-border)}.sd-tab-content{box-shadow:0 -2px var(--sd-color-tabs-overline),0 1px var(--sd-color-tabs-underline)}.sd-card{box-shadow:0 .1rem .25rem var(--sd-color-shadow),0 0 .0625rem rgba(0,0,0,.1)}.sd-shadow-sm{box-shadow:0 .1rem .25rem var(--sd-color-shadow),0 0 .0625rem rgba(0,0,0,.1)!important}.sd-shadow-md{box-shadow:0 .3rem .75rem var(--sd-color-shadow),0 0 .0625rem rgba(0,0,0,.1)!important}.sd-shadow-lg{box-shadow:0 .6rem 1.5rem var(--sd-color-shadow),0 0 .0625rem rgba(0,0,0,.1)!important}.sd-card-hover:hover{transform:none}.sd-cards-carousel{gap:.25rem;padding:.25rem}body{--tabs--label-text:var(--color-foreground-muted);--tabs--label-text--hover:var(--color-foreground-muted);--tabs--label-text--active:var(--color-brand-content);--tabs--label-text--active--hover:var(--color-brand-content);--tabs--label-background:transparent;--tabs--label-background--hover:transparent;--tabs--label-background--active:transparent;--tabs--label-background--active--hover:transparent;--tabs--padding-x:0.25em;--tabs--margin-x:1em;--tabs--border:var(--color-background-border);--tabs--label-border:transparent;--tabs--label-border--hover:var(--color-foreground-muted);--tabs--label-border--active:var(--color-brand-content);--tabs--label-border--active--hover:var(--color-brand-content)}[role=main] .container{max-width:none;padding-left:0;padding-right:0}.shadow.docutils{border:none;box-shadow:0 .2rem .5rem rgba(0,0,0,.05),0 0 .0625rem rgba(0,0,0,.1)!important}.sphinx-bs .card{background-color:var(--color-background-secondary);color:var(--color-foreground)} +/*# sourceMappingURL=furo-extensions.css.map*/ \ No newline at end of file diff --git a/_static/styles/furo-extensions.css.map b/_static/styles/furo-extensions.css.map new file mode 100644 index 00000000..c26eac7f --- /dev/null +++ b/_static/styles/furo-extensions.css.map @@ -0,0 +1 @@ 
+{"version":3,"file":"styles/furo-extensions.css","mappings":"AAGA,2BACE,oFACA,4CAKE,6CAHA,YACA,eAEA,CACA,kDACE,yCAEF,8CACE,sCAEJ,8CACE,kDAEJ,2BAGE,uBACA,cAHA,gBACA,UAEA,CAGA,yCACE,mBAEF,gDAEE,gDADA,YACA,CACA,sDACE,gDACF,yDACE,sCAEJ,+CACE,UACA,qDACE,UAGF,mDACE,eAEJ,yEAEE,4DAEA,mHASE,mBAPA,kBAEA,YADA,oBAGA,aADA,gBAIA,CAEA,qIAEE,WADA,UACA,CAEJ,uGACE,aAEF,iUAGE,cAEF,mHACE,aC1EJ,gCACE,mCAEF,0BAEE,mBAUA,8CACA,YAFA,mCAKA,eAZA,cAIA,YADA,YAYA,iCAdA,YAcA,CAEA,gCAEE,8CADA,gCACA,CAEF,gCAGE,6BADA,mCADA,YAEA,CAEF,kCAEE,cADA,oBACA,CACA,wCACE,cAEJ,8BACE,UCzCN,KAEE,6CAA8C,CAC9C,uDAAwD,CACxD,uDAAwD,CAGxD,iCAAsC,CAGtC,+CAAgD,CAChD,uDAAwD,CACxD,uDAAwD,CACxD,oDAAqD,CACrD,6DAA8D,CAC9D,6DAA8D,CAG9D,uDAAwD,CACxD,yDAA0D,CAC1D,4DAA6D,CAC7D,2DAA4D,CAC5D,8DAA+D,CAC/D,iEAAkE,CAClE,uDAAwD,CACxD,wDAAyD,CAG3D,gBACE,qFAGF,SACE,6EAEF,cACE,uFAEF,cACE,uFAEF,cACE,uFAGF,qBACE,eAEF,mBACE,WACA,eChDF,KACE,gDAAiD,CACjD,uDAAwD,CACxD,qDAAsD,CACtD,4DAA6D,CAC7D,oCAAqC,CACrC,2CAA4C,CAC5C,4CAA6C,CAC7C,mDAAoD,CACpD,wBAAyB,CACzB,oBAAqB,CACrB,6CAA8C,CAC9C,gCAAiC,CACjC,yDAA0D,CAC1D,uDAAwD,CACxD,8DAA+D,CCbjE,uBACE,eACA,eACA,gBAGF,iBACE,YACA,+EAGF,iBACE,mDACA","sources":["webpack:///./src/furo/assets/styles/extensions/_readthedocs.sass","webpack:///./src/furo/assets/styles/extensions/_copybutton.sass","webpack:///./src/furo/assets/styles/extensions/_sphinx-design.sass","webpack:///./src/furo/assets/styles/extensions/_sphinx-inline-tabs.sass","webpack:///./src/furo/assets/styles/extensions/_sphinx-panels.sass"],"sourcesContent":["// This file contains the styles used for tweaking how ReadTheDoc's embedded\n// contents would show up inside the theme.\n\n#furo-sidebar-ad-placement\n padding: var(--sidebar-item-spacing-vertical) var(--sidebar-item-spacing-horizontal)\n .ethical-sidebar\n // Remove the border and box-shadow.\n border: none\n box-shadow: none\n // Manage the background colors.\n background: var(--color-background-secondary)\n &:hover\n background: var(--color-background-hover)\n // Ensure the text is legible.\n a\n color: var(--color-foreground-primary)\n\n .ethical-callout a\n color: var(--color-foreground-secondary) !important\n\n#furo-readthedocs-versions\n position: static\n width: 100%\n background: transparent\n display: block\n\n // Make the background color fit with the theme's aesthetic.\n .rst-versions\n background: rgb(26, 28, 30)\n\n .rst-current-version\n cursor: unset\n background: var(--color-sidebar-item-background)\n &:hover\n background: var(--color-sidebar-item-background)\n .fa-book\n color: var(--color-foreground-primary)\n\n > .rst-other-versions\n padding: 0\n small\n opacity: 1\n\n .injected\n .rst-versions\n position: unset\n\n &:hover,\n &:focus-within\n box-shadow: 0 0 0 1px var(--color-sidebar-background-border)\n\n .rst-current-version\n // Undo the tweaks done in RTD's CSS\n font-size: inherit\n line-height: inherit\n height: auto\n text-align: right\n padding: 12px\n\n // Match the rest of the body\n background: #1a1c1e\n\n .fa-book\n float: left\n color: white\n\n .fa-caret-down\n display: none\n\n .rst-current-version,\n .rst-other-versions,\n .injected\n display: block\n\n > .rst-current-version\n display: none\n",".highlight\n &:hover button.copybtn\n color: var(--color-code-foreground)\n\n button.copybtn\n // Align things correctly\n align-items: center\n\n height: 1.25em\n width: 1.25em\n\n top: 0.625rem // $code-spacing-vertical\n right: 0.5rem\n\n // Make it look better\n color: var(--color-background-item)\n background-color: var(--color-code-background)\n border: none\n\n // Change to 
cursor to make it obvious that you can click on it\n cursor: pointer\n\n // Transition smoothly, for aesthetics\n transition: color 300ms, opacity 300ms\n\n &:hover\n color: var(--color-brand-content)\n background-color: var(--color-code-background)\n\n &::after\n display: none\n color: var(--color-code-foreground)\n background-color: transparent\n\n &.success\n transition: color 0ms\n color: #22863a\n &::after\n display: block\n\n svg\n padding: 0\n","body\n // Colors\n --sd-color-primary: var(--color-brand-primary)\n --sd-color-primary-highlight: var(--color-brand-content)\n --sd-color-primary-text: var(--color-background-primary)\n\n // Shadows\n --sd-color-shadow: rgba(0, 0, 0, 0.05)\n\n // Cards\n --sd-color-card-border: var(--color-card-border)\n --sd-color-card-border-hover: var(--color-brand-content)\n --sd-color-card-background: var(--color-card-background)\n --sd-color-card-text: var(--color-foreground-primary)\n --sd-color-card-header: var(--color-card-marginals-background)\n --sd-color-card-footer: var(--color-card-marginals-background)\n\n // Tabs\n --sd-color-tabs-label-active: var(--color-brand-content)\n --sd-color-tabs-label-hover: var(--color-foreground-muted)\n --sd-color-tabs-label-inactive: var(--color-foreground-muted)\n --sd-color-tabs-underline-active: var(--color-brand-content)\n --sd-color-tabs-underline-hover: var(--color-foreground-border)\n --sd-color-tabs-underline-inactive: var(--color-background-border)\n --sd-color-tabs-overline: var(--color-background-border)\n --sd-color-tabs-underline: var(--color-background-border)\n\n// Tabs\n.sd-tab-content\n box-shadow: 0 -2px var(--sd-color-tabs-overline), 0 1px var(--sd-color-tabs-underline)\n\n// Shadows\n.sd-card // Have a shadow by default\n box-shadow: 0 0.1rem 0.25rem var(--sd-color-shadow), 0 0 0.0625rem rgba(0, 0, 0, 0.1)\n\n.sd-shadow-sm\n box-shadow: 0 0.1rem 0.25rem var(--sd-color-shadow), 0 0 0.0625rem rgba(0, 0, 0, 0.1) !important\n\n.sd-shadow-md\n box-shadow: 0 0.3rem 0.75rem var(--sd-color-shadow), 0 0 0.0625rem rgba(0, 0, 0, 0.1) !important\n\n.sd-shadow-lg\n box-shadow: 0 0.6rem 1.5rem var(--sd-color-shadow), 0 0 0.0625rem rgba(0, 0, 0, 0.1) !important\n\n// Cards\n.sd-card-hover:hover // Don't change scale on hover\n transform: none\n\n.sd-cards-carousel // Have a bit of gap in the carousel by default\n gap: 0.25rem\n padding: 0.25rem\n","// This file contains styles to tweak sphinx-inline-tabs to work well with Furo.\n\nbody\n --tabs--label-text: var(--color-foreground-muted)\n --tabs--label-text--hover: var(--color-foreground-muted)\n --tabs--label-text--active: var(--color-brand-content)\n --tabs--label-text--active--hover: var(--color-brand-content)\n --tabs--label-background: transparent\n --tabs--label-background--hover: transparent\n --tabs--label-background--active: transparent\n --tabs--label-background--active--hover: transparent\n --tabs--padding-x: 0.25em\n --tabs--margin-x: 1em\n --tabs--border: var(--color-background-border)\n --tabs--label-border: transparent\n --tabs--label-border--hover: var(--color-foreground-muted)\n --tabs--label-border--active: var(--color-brand-content)\n --tabs--label-border--active--hover: var(--color-brand-content)\n","// This file contains styles to tweak sphinx-panels to work well with Furo.\n\n// sphinx-panels includes Bootstrap 4, which uses .container which can conflict\n// with docutils' `.. 
container::` directive.\n[role=\"main\"] .container\n max-width: initial\n padding-left: initial\n padding-right: initial\n\n// Make the panels look nicer!\n.shadow.docutils\n border: none\n box-shadow: 0 0.2rem 0.5rem rgba(0, 0, 0, 0.05), 0 0 0.0625rem rgba(0, 0, 0, 0.1) !important\n\n// Make panel colors respond to dark mode\n.sphinx-bs .card\n background-color: var(--color-background-secondary)\n color: var(--color-foreground)\n"],"names":[],"sourceRoot":""} \ No newline at end of file diff --git a/_static/styles/furo.css b/_static/styles/furo.css new file mode 100644 index 00000000..05a56b17 --- /dev/null +++ b/_static/styles/furo.css @@ -0,0 +1,2 @@ +/*! normalize.css v8.0.1 | MIT License | github.com/necolas/normalize.css */html{line-height:1.15;-webkit-text-size-adjust:100%}body{margin:0}main{display:block}h1{font-size:2em;margin:.67em 0}hr{box-sizing:content-box;height:0;overflow:visible}pre{font-family:monospace,monospace;font-size:1em}a{background-color:transparent}abbr[title]{border-bottom:none;text-decoration:underline;text-decoration:underline dotted}b,strong{font-weight:bolder}code,kbd,samp{font-family:monospace,monospace;font-size:1em}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline}sub{bottom:-.25em}sup{top:-.5em}img{border-style:none}button,input,optgroup,select,textarea{font-family:inherit;font-size:100%;line-height:1.15;margin:0}button,input{overflow:visible}button,select{text-transform:none}[type=button],[type=reset],[type=submit],button{-webkit-appearance:button}[type=button]::-moz-focus-inner,[type=reset]::-moz-focus-inner,[type=submit]::-moz-focus-inner,button::-moz-focus-inner{border-style:none;padding:0}[type=button]:-moz-focusring,[type=reset]:-moz-focusring,[type=submit]:-moz-focusring,button:-moz-focusring{outline:1px dotted ButtonText}fieldset{padding:.35em .75em .625em}legend{box-sizing:border-box;color:inherit;display:table;max-width:100%;padding:0;white-space:normal}progress{vertical-align:baseline}textarea{overflow:auto}[type=checkbox],[type=radio]{box-sizing:border-box;padding:0}[type=number]::-webkit-inner-spin-button,[type=number]::-webkit-outer-spin-button{height:auto}[type=search]{-webkit-appearance:textfield;outline-offset:-2px}[type=search]::-webkit-search-decoration{-webkit-appearance:none}::-webkit-file-upload-button{-webkit-appearance:button;font:inherit}details{display:block}summary{display:list-item}[hidden],template{display:none}@media print{.content-icon-container,.headerlink,.mobile-header,.related-pages{display:none!important}.highlight{border:.1pt solid var(--color-foreground-border)}a,blockquote,dl,ol,p,pre,table,ul{page-break-inside:avoid}caption,figure,h1,h2,h3,h4,h5,h6,img{page-break-after:avoid;page-break-inside:avoid}dl,ol,ul{page-break-before:avoid}}.visually-hidden{height:1px!important;margin:-1px!important;overflow:hidden!important;padding:0!important;position:absolute!important;width:1px!important;clip:rect(0,0,0,0)!important;background:var(--color-background-primary);border:0!important;color:var(--color-foreground-primary);white-space:nowrap!important}:-moz-focusring{outline:auto}body{--font-stack:-apple-system,BlinkMacSystemFont,Segoe UI,Helvetica,Arial,sans-serif,Apple Color Emoji,Segoe UI Emoji;--font-stack--monospace:"SFMono-Regular",Menlo,Consolas,Monaco,Liberation Mono,Lucida 
Console,monospace;--font-stack--headings:var(--font-stack);--font-size--normal:100%;--font-size--small:87.5%;--font-size--small--2:81.25%;--font-size--small--3:75%;--font-size--small--4:62.5%;--sidebar-caption-font-size:var(--font-size--small--2);--sidebar-item-font-size:var(--font-size--small);--sidebar-search-input-font-size:var(--font-size--small);--toc-font-size:var(--font-size--small--3);--toc-font-size--mobile:var(--font-size--normal);--toc-title-font-size:var(--font-size--small--4);--admonition-font-size:0.8125rem;--admonition-title-font-size:0.8125rem;--code-font-size:var(--font-size--small--2);--api-font-size:var(--font-size--small);--header-height:calc(var(--sidebar-item-line-height) + var(--sidebar-item-spacing-vertical)*4);--header-padding:0.5rem;--sidebar-tree-space-above:1.5rem;--sidebar-caption-space-above:1rem;--sidebar-item-line-height:1rem;--sidebar-item-spacing-vertical:0.5rem;--sidebar-item-spacing-horizontal:1rem;--sidebar-item-height:calc(var(--sidebar-item-line-height) + var(--sidebar-item-spacing-vertical)*2);--sidebar-expander-width:var(--sidebar-item-height);--sidebar-search-space-above:0.5rem;--sidebar-search-input-spacing-vertical:0.5rem;--sidebar-search-input-spacing-horizontal:0.5rem;--sidebar-search-input-height:1rem;--sidebar-search-icon-size:var(--sidebar-search-input-height);--toc-title-padding:0.25rem 0;--toc-spacing-vertical:1.5rem;--toc-spacing-horizontal:1.5rem;--toc-item-spacing-vertical:0.4rem;--toc-item-spacing-horizontal:1rem;--icon-search:url('data:image/svg+xml;charset=utf-8,');--icon-pencil:url('data:image/svg+xml;charset=utf-8,');--icon-abstract:url('data:image/svg+xml;charset=utf-8,');--icon-info:url('data:image/svg+xml;charset=utf-8,');--icon-flame:url('data:image/svg+xml;charset=utf-8,');--icon-question:url('data:image/svg+xml;charset=utf-8,');--icon-warning:url('data:image/svg+xml;charset=utf-8,');--icon-failure:url('data:image/svg+xml;charset=utf-8,');--icon-spark:url('data:image/svg+xml;charset=utf-8,');--color-admonition-title--caution:#ff9100;--color-admonition-title-background--caution:rgba(255,145,0,.2);--color-admonition-title--warning:#ff9100;--color-admonition-title-background--warning:rgba(255,145,0,.2);--color-admonition-title--danger:#ff5252;--color-admonition-title-background--danger:rgba(255,82,82,.2);--color-admonition-title--attention:#ff5252;--color-admonition-title-background--attention:rgba(255,82,82,.2);--color-admonition-title--error:#ff5252;--color-admonition-title-background--error:rgba(255,82,82,.2);--color-admonition-title--hint:#00c852;--color-admonition-title-background--hint:rgba(0,200,82,.2);--color-admonition-title--tip:#00c852;--color-admonition-title-background--tip:rgba(0,200,82,.2);--color-admonition-title--important:#00bfa5;--color-admonition-title-background--important:rgba(0,191,165,.2);--color-admonition-title--note:#00b0ff;--color-admonition-title-background--note:rgba(0,176,255,.2);--color-admonition-title--seealso:#448aff;--color-admonition-title-background--seealso:rgba(68,138,255,.2);--color-admonition-title--admonition-todo:grey;--color-admonition-title-background--admonition-todo:hsla(0,0%,50%,.2);--color-admonition-title:#651fff;--color-admonition-title-background:rgba(101,31,255,.2);--icon-admonition-default:var(--icon-abstract);--color-topic-title:#14b8a6;--color-topic-title-background:rgba(20,184,166,.2);--icon-topic-default:var(--icon-pencil);--color-problematic:#b30000;--color-foreground-primary:#000;--color-foreground-secondary:#5a5c63;--color-foreground-muted:#6b6f76;--color-foreground-bor
der:#878787;--color-background-primary:#fff;--color-background-secondary:#f8f9fb;--color-background-hover:#efeff4;--color-background-hover--transparent:#efeff400;--color-background-border:#eeebee;--color-background-item:#ccc;--color-announcement-background:#000000dd;--color-announcement-text:#eeebee;--color-brand-primary:#0a4bff;--color-brand-content:#2757dd;--color-brand-visited:#872ee0;--color-api-background:var(--color-background-hover--transparent);--color-api-background-hover:var(--color-background-hover);--color-api-overall:var(--color-foreground-secondary);--color-api-name:var(--color-problematic);--color-api-pre-name:var(--color-problematic);--color-api-paren:var(--color-foreground-secondary);--color-api-keyword:var(--color-foreground-primary);--color-api-added:#21632c;--color-api-added-border:#38a84d;--color-api-changed:#046172;--color-api-changed-border:#06a1bc;--color-api-deprecated:#605706;--color-api-deprecated-border:#f0d90f;--color-api-removed:#b30000;--color-api-removed-border:#ff5c5c;--color-highlight-on-target:#ffc;--color-inline-code-background:var(--color-background-secondary);--color-highlighted-background:#def;--color-highlighted-text:var(--color-foreground-primary);--color-guilabel-background:#ddeeff80;--color-guilabel-border:#bedaf580;--color-guilabel-text:var(--color-foreground-primary);--color-admonition-background:transparent;--color-table-header-background:var(--color-background-secondary);--color-table-border:var(--color-background-border);--color-card-border:var(--color-background-secondary);--color-card-background:transparent;--color-card-marginals-background:var(--color-background-secondary);--color-header-background:var(--color-background-primary);--color-header-border:var(--color-background-border);--color-header-text:var(--color-foreground-primary);--color-sidebar-background:var(--color-background-secondary);--color-sidebar-background-border:var(--color-background-border);--color-sidebar-brand-text:var(--color-foreground-primary);--color-sidebar-caption-text:var(--color-foreground-muted);--color-sidebar-link-text:var(--color-foreground-secondary);--color-sidebar-link-text--top-level:var(--color-brand-primary);--color-sidebar-item-background:var(--color-sidebar-background);--color-sidebar-item-background--current:var( --color-sidebar-item-background );--color-sidebar-item-background--hover:linear-gradient(90deg,var(--color-background-hover--transparent) 0%,var(--color-background-hover) var(--sidebar-item-spacing-horizontal),var(--color-background-hover) 100%);--color-sidebar-item-expander-background:transparent;--color-sidebar-item-expander-background--hover:var( --color-background-hover 
);--color-sidebar-search-text:var(--color-foreground-primary);--color-sidebar-search-background:var(--color-background-secondary);--color-sidebar-search-background--focus:var(--color-background-primary);--color-sidebar-search-border:var(--color-background-border);--color-sidebar-search-icon:var(--color-foreground-muted);--color-toc-background:var(--color-background-primary);--color-toc-title-text:var(--color-foreground-muted);--color-toc-item-text:var(--color-foreground-secondary);--color-toc-item-text--hover:var(--color-foreground-primary);--color-toc-item-text--active:var(--color-brand-primary);--color-content-foreground:var(--color-foreground-primary);--color-content-background:transparent;--color-link:var(--color-brand-content);--color-link-underline:var(--color-background-border);--color-link--hover:var(--color-brand-content);--color-link-underline--hover:var(--color-foreground-border);--color-link--visited:var(--color-brand-visited);--color-link-underline--visited:var(--color-background-border);--color-link--visited--hover:var(--color-brand-visited);--color-link-underline--visited--hover:var(--color-foreground-border)}.only-light{display:block!important}html body .only-dark{display:none!important}@media not print{body[data-theme=dark]{--color-problematic:#ee5151;--color-foreground-primary:#cfd0d0;--color-foreground-secondary:#9ca0a5;--color-foreground-muted:#81868d;--color-foreground-border:#666;--color-background-primary:#131416;--color-background-secondary:#1a1c1e;--color-background-hover:#1e2124;--color-background-hover--transparent:#1e212400;--color-background-border:#303335;--color-background-item:#444;--color-announcement-background:#000000dd;--color-announcement-text:#eeebee;--color-brand-primary:#3d94ff;--color-brand-content:#5ca5ff;--color-brand-visited:#b27aeb;--color-highlighted-background:#083563;--color-guilabel-background:#08356380;--color-guilabel-border:#13395f80;--color-api-keyword:var(--color-foreground-secondary);--color-highlight-on-target:#330;--color-api-added:#3db854;--color-api-added-border:#267334;--color-api-changed:#09b0ce;--color-api-changed-border:#056d80;--color-api-deprecated:#b1a10b;--color-api-deprecated-border:#6e6407;--color-api-removed:#ff7575;--color-api-removed-border:#b03b3b;--color-admonition-background:#18181a;--color-card-border:var(--color-background-secondary);--color-card-background:#18181a;--color-card-marginals-background:var(--color-background-hover)}html body[data-theme=dark] .only-light{display:none!important}body[data-theme=dark] 
.only-dark{display:block!important}@media(prefers-color-scheme:dark){body:not([data-theme=light]){--color-problematic:#ee5151;--color-foreground-primary:#cfd0d0;--color-foreground-secondary:#9ca0a5;--color-foreground-muted:#81868d;--color-foreground-border:#666;--color-background-primary:#131416;--color-background-secondary:#1a1c1e;--color-background-hover:#1e2124;--color-background-hover--transparent:#1e212400;--color-background-border:#303335;--color-background-item:#444;--color-announcement-background:#000000dd;--color-announcement-text:#eeebee;--color-brand-primary:#3d94ff;--color-brand-content:#5ca5ff;--color-brand-visited:#b27aeb;--color-highlighted-background:#083563;--color-guilabel-background:#08356380;--color-guilabel-border:#13395f80;--color-api-keyword:var(--color-foreground-secondary);--color-highlight-on-target:#330;--color-api-added:#3db854;--color-api-added-border:#267334;--color-api-changed:#09b0ce;--color-api-changed-border:#056d80;--color-api-deprecated:#b1a10b;--color-api-deprecated-border:#6e6407;--color-api-removed:#ff7575;--color-api-removed-border:#b03b3b;--color-admonition-background:#18181a;--color-card-border:var(--color-background-secondary);--color-card-background:#18181a;--color-card-marginals-background:var(--color-background-hover)}html body:not([data-theme=light]) .only-light{display:none!important}body:not([data-theme=light]) .only-dark{display:block!important}}}body[data-theme=auto] .theme-toggle svg.theme-icon-when-auto-light{display:block}@media(prefers-color-scheme:dark){body[data-theme=auto] .theme-toggle svg.theme-icon-when-auto-dark{display:block}body[data-theme=auto] .theme-toggle svg.theme-icon-when-auto-light{display:none}}body[data-theme=dark] .theme-toggle svg.theme-icon-when-dark,body[data-theme=light] .theme-toggle svg.theme-icon-when-light{display:block}body{font-family:var(--font-stack)}code,kbd,pre,samp{font-family:var(--font-stack--monospace)}body{-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale}article{line-height:1.5}h1,h2,h3,h4,h5,h6{border-radius:.5rem;font-family:var(--font-stack--headings);font-weight:700;line-height:1.25;margin:.5rem -.5rem;padding-left:.5rem;padding-right:.5rem}h1+p,h2+p,h3+p,h4+p,h5+p,h6+p{margin-top:0}h1{font-size:2.5em;margin-bottom:1rem}h1,h2{margin-top:1.75rem}h2{font-size:2em}h3{font-size:1.5em}h4{font-size:1.25em}h5{font-size:1.125em}h6{font-size:1em}small{font-size:80%;opacity:75%}p{margin-bottom:.75rem;margin-top:.5rem}hr.docutils{background-color:var(--color-background-border);border:0;height:1px;margin:2rem 0;padding:0}.centered{text-align:center}a{color:var(--color-link);text-decoration:underline;text-decoration-color:var(--color-link-underline)}a:visited{color:var(--color-link--visited);text-decoration-color:var(--color-link-underline--visited)}a:visited:hover{color:var(--color-link--visited--hover);text-decoration-color:var(--color-link-underline--visited--hover)}a:hover{color:var(--color-link--hover);text-decoration-color:var(--color-link-underline--hover)}a.muted-link{color:inherit}a.muted-link:hover{color:var(--color-link--hover);text-decoration-color:var(--color-link-underline--hover)}a.muted-link:hover:visited{color:var(--color-link--visited--hover);text-decoration-color:var(--color-link-underline--visited--hover)}html{overflow-x:hidden;overflow-y:scroll;scroll-behavior:smooth}.sidebar-scroll,.toc-scroll,article[role=main] *{scrollbar-color:var(--color-foreground-border) 
transparent;scrollbar-width:thin}.sidebar-scroll::-webkit-scrollbar,.toc-scroll::-webkit-scrollbar,article[role=main] ::-webkit-scrollbar{height:.25rem;width:.25rem}.sidebar-scroll::-webkit-scrollbar-thumb,.toc-scroll::-webkit-scrollbar-thumb,article[role=main] ::-webkit-scrollbar-thumb{background-color:var(--color-foreground-border);border-radius:.125rem}body,html{height:100%}.skip-to-content,body,html{background:var(--color-background-primary);color:var(--color-foreground-primary)}.skip-to-content{border-radius:1rem;left:.25rem;padding:1rem;position:fixed;top:.25rem;transform:translateY(-200%);transition:transform .3s ease-in-out;z-index:40}.skip-to-content:focus-within{transform:translateY(0)}article{background:var(--color-content-background);color:var(--color-content-foreground);overflow-wrap:break-word}.page{display:flex;min-height:100%}.mobile-header{background-color:var(--color-header-background);border-bottom:1px solid var(--color-header-border);color:var(--color-header-text);display:none;height:var(--header-height);width:100%;z-index:10}.mobile-header.scrolled{border-bottom:none;box-shadow:0 0 .2rem rgba(0,0,0,.1),0 .2rem .4rem rgba(0,0,0,.2)}.mobile-header .header-center a{color:var(--color-header-text);text-decoration:none}.main{display:flex;flex:1}.sidebar-drawer{background:var(--color-sidebar-background);border-right:1px solid var(--color-sidebar-background-border);box-sizing:border-box;display:flex;justify-content:flex-end;min-width:15em;width:calc(50% - 26em)}.sidebar-container,.toc-drawer{box-sizing:border-box;width:15em}.toc-drawer{background:var(--color-toc-background);padding-right:1rem}.sidebar-sticky,.toc-sticky{display:flex;flex-direction:column;height:min(100%,100vh);height:100vh;position:sticky;top:0}.sidebar-scroll,.toc-scroll{flex-grow:1;flex-shrink:1;overflow:auto;scroll-behavior:smooth}.content{display:flex;flex-direction:column;justify-content:space-between;padding:0 3em;width:46em}.icon{display:inline-block;height:1rem;width:1rem}.icon svg{height:100%;width:100%}.announcement{align-items:center;background-color:var(--color-announcement-background);color:var(--color-announcement-text);display:flex;height:var(--header-height);overflow-x:auto}.announcement+.page{min-height:calc(100% - var(--header-height))}.announcement-content{box-sizing:border-box;min-width:100%;padding:.5rem;text-align:center;white-space:nowrap}.announcement-content a{color:var(--color-announcement-text);text-decoration-color:var(--color-announcement-text)}.announcement-content a:hover{color:var(--color-announcement-text);text-decoration-color:var(--color-link--hover)}.no-js .theme-toggle-container{display:none}.theme-toggle-container{display:flex}.theme-toggle{background:transparent;border:none;cursor:pointer;display:flex;padding:0}.theme-toggle svg{color:var(--color-foreground-primary);display:none;height:1.25rem;width:1.25rem}.theme-toggle-header{align-items:center;display:flex;justify-content:center}.nav-overlay-icon,.toc-overlay-icon{cursor:pointer;display:none}.nav-overlay-icon .icon,.toc-overlay-icon .icon{color:var(--color-foreground-secondary);height:1.5rem;width:1.5rem}.nav-overlay-icon,.toc-header-icon{align-items:center;justify-content:center}.toc-content-icon{height:1.5rem;width:1.5rem}.content-icon-container{display:flex;float:right;gap:.5rem;margin-bottom:1rem;margin-left:1rem;margin-top:1.5rem}.content-icon-container .edit-this-page svg,.content-icon-container .view-this-page 
svg{color:inherit;height:1.25rem;width:1.25rem}.sidebar-toggle{display:none;position:absolute}.sidebar-toggle[name=__toc]{left:20px}.sidebar-toggle:checked{left:40px}.overlay{background-color:rgba(0,0,0,.54);height:0;opacity:0;position:fixed;top:0;transition:width 0ms,height 0ms,opacity .25s ease-out;width:0}.sidebar-overlay{z-index:20}.toc-overlay{z-index:40}.sidebar-drawer{transition:left .25s ease-in-out;z-index:30}.toc-drawer{transition:right .25s ease-in-out;z-index:50}#__navigation:checked~.sidebar-overlay{height:100%;opacity:1;width:100%}#__navigation:checked~.page .sidebar-drawer{left:0;top:0}#__toc:checked~.toc-overlay{height:100%;opacity:1;width:100%}#__toc:checked~.page .toc-drawer{right:0;top:0}.back-to-top{background:var(--color-background-primary);border-radius:1rem;box-shadow:0 .2rem .5rem rgba(0,0,0,.05),0 0 1px 0 hsla(220,9%,46%,.502);display:none;font-size:.8125rem;left:0;margin-left:50%;padding:.5rem .75rem .5rem .5rem;position:fixed;text-decoration:none;top:1rem;transform:translateX(-50%);z-index:10}.back-to-top svg{height:1rem;width:1rem;fill:currentColor;display:inline-block}.back-to-top span{margin-left:.25rem}.show-back-to-top .back-to-top{align-items:center;display:flex}@media(min-width:97em){html{font-size:110%}}@media(max-width:82em){.toc-content-icon{display:flex}.toc-drawer{border-left:1px solid var(--color-background-muted);height:100vh;position:fixed;right:-15em;top:0}.toc-tree{border-left:none;font-size:var(--toc-font-size--mobile)}.sidebar-drawer{width:calc(50% - 18.5em)}}@media(max-width:67em){.content{margin-left:auto;margin-right:auto;padding:0 1em}}@media(max-width:63em){.nav-overlay-icon{display:flex}.sidebar-drawer{height:100vh;left:-15em;position:fixed;top:0;width:15em}.theme-toggle-header,.toc-header-icon{display:flex}.theme-toggle-content,.toc-content-icon{display:none}.mobile-header{align-items:center;display:flex;justify-content:space-between;position:sticky;top:0}.mobile-header .header-left,.mobile-header .header-right{display:flex;height:var(--header-height);padding:0 var(--header-padding)}.mobile-header .header-left label,.mobile-header .header-right label{height:100%;-webkit-user-select:none;-moz-user-select:none;user-select:none;width:100%}.nav-overlay-icon .icon,.theme-toggle svg{height:1.5rem;width:1.5rem}:target{scroll-margin-top:calc(var(--header-height) + 2.5rem)}.back-to-top{top:calc(var(--header-height) + .5rem)}.page{flex-direction:column;justify-content:center}}@media(max-width:48em){.content{overflow-x:auto;width:100%}}@media(max-width:46em){article[role=main] aside.sidebar{float:none;margin:1rem 0;width:100%}}.admonition,.topic{background:var(--color-admonition-background);border-radius:.2rem;box-shadow:0 .2rem .5rem rgba(0,0,0,.05),0 0 .0625rem rgba(0,0,0,.1);font-size:var(--admonition-font-size);margin:1rem auto;overflow:hidden;padding:0 .5rem .5rem;page-break-inside:avoid}.admonition>:nth-child(2),.topic>:nth-child(2){margin-top:0}.admonition>:last-child,.topic>:last-child{margin-bottom:0}.admonition p.admonition-title,p.topic-title{font-size:var(--admonition-title-font-size);font-weight:500;line-height:1.3;margin:0 -.5rem .5rem;padding:.4rem .5rem .4rem 2rem;position:relative}.admonition 
p.admonition-title:before,p.topic-title:before{content:"";height:1rem;left:.5rem;position:absolute;width:1rem}p.admonition-title{background-color:var(--color-admonition-title-background)}p.admonition-title:before{background-color:var(--color-admonition-title);-webkit-mask-image:var(--icon-admonition-default);mask-image:var(--icon-admonition-default);-webkit-mask-repeat:no-repeat;mask-repeat:no-repeat}p.topic-title{background-color:var(--color-topic-title-background)}p.topic-title:before{background-color:var(--color-topic-title);-webkit-mask-image:var(--icon-topic-default);mask-image:var(--icon-topic-default);-webkit-mask-repeat:no-repeat;mask-repeat:no-repeat}.admonition{border-left:.2rem solid var(--color-admonition-title)}.admonition.caution{border-left-color:var(--color-admonition-title--caution)}.admonition.caution>.admonition-title{background-color:var(--color-admonition-title-background--caution)}.admonition.caution>.admonition-title:before{background-color:var(--color-admonition-title--caution);-webkit-mask-image:var(--icon-spark);mask-image:var(--icon-spark)}.admonition.warning{border-left-color:var(--color-admonition-title--warning)}.admonition.warning>.admonition-title{background-color:var(--color-admonition-title-background--warning)}.admonition.warning>.admonition-title:before{background-color:var(--color-admonition-title--warning);-webkit-mask-image:var(--icon-warning);mask-image:var(--icon-warning)}.admonition.danger{border-left-color:var(--color-admonition-title--danger)}.admonition.danger>.admonition-title{background-color:var(--color-admonition-title-background--danger)}.admonition.danger>.admonition-title:before{background-color:var(--color-admonition-title--danger);-webkit-mask-image:var(--icon-spark);mask-image:var(--icon-spark)}.admonition.attention{border-left-color:var(--color-admonition-title--attention)}.admonition.attention>.admonition-title{background-color:var(--color-admonition-title-background--attention)}.admonition.attention>.admonition-title:before{background-color:var(--color-admonition-title--attention);-webkit-mask-image:var(--icon-warning);mask-image:var(--icon-warning)}.admonition.error{border-left-color:var(--color-admonition-title--error)}.admonition.error>.admonition-title{background-color:var(--color-admonition-title-background--error)}.admonition.error>.admonition-title:before{background-color:var(--color-admonition-title--error);-webkit-mask-image:var(--icon-failure);mask-image:var(--icon-failure)}.admonition.hint{border-left-color:var(--color-admonition-title--hint)}.admonition.hint>.admonition-title{background-color:var(--color-admonition-title-background--hint)}.admonition.hint>.admonition-title:before{background-color:var(--color-admonition-title--hint);-webkit-mask-image:var(--icon-question);mask-image:var(--icon-question)}.admonition.tip{border-left-color:var(--color-admonition-title--tip)}.admonition.tip>.admonition-title{background-color:var(--color-admonition-title-background--tip)}.admonition.tip>.admonition-title:before{background-color:var(--color-admonition-title--tip);-webkit-mask-image:var(--icon-info);mask-image:var(--icon-info)}.admonition.important{border-left-color:var(--color-admonition-title--important)}.admonition.important>.admonition-title{background-color:var(--color-admonition-title-background--important)}.admonition.important>.admonition-title:before{background-color:var(--color-admonition-title--important);-webkit-mask-image:var(--icon-flame);mask-image:var(--icon-flame)}.admonition.note{border-left-color:var(--color-ad
monition-title--note)}.admonition.note>.admonition-title{background-color:var(--color-admonition-title-background--note)}.admonition.note>.admonition-title:before{background-color:var(--color-admonition-title--note);-webkit-mask-image:var(--icon-pencil);mask-image:var(--icon-pencil)}.admonition.seealso{border-left-color:var(--color-admonition-title--seealso)}.admonition.seealso>.admonition-title{background-color:var(--color-admonition-title-background--seealso)}.admonition.seealso>.admonition-title:before{background-color:var(--color-admonition-title--seealso);-webkit-mask-image:var(--icon-info);mask-image:var(--icon-info)}.admonition.admonition-todo{border-left-color:var(--color-admonition-title--admonition-todo)}.admonition.admonition-todo>.admonition-title{background-color:var(--color-admonition-title-background--admonition-todo)}.admonition.admonition-todo>.admonition-title:before{background-color:var(--color-admonition-title--admonition-todo);-webkit-mask-image:var(--icon-pencil);mask-image:var(--icon-pencil)}.admonition-todo>.admonition-title{text-transform:uppercase}dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple) dd{margin-left:2rem}dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple) dd>:first-child{margin-top:.125rem}dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple) .field-list,dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple) dd>:last-child{margin-bottom:.75rem}dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple) .field-list>dt{font-size:var(--font-size--small);text-transform:uppercase}dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple) .field-list dd:empty{margin-bottom:.5rem}dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple) .field-list dd>ul{margin-left:-1.2rem}dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple) .field-list dd>ul>li>p:nth-child(2){margin-top:0}dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple) .field-list dd>ul>li>p+p:last-child:empty{margin-bottom:0;margin-top:0}dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple)>dt{color:var(--color-api-overall)}.sig:not(.sig-inline){background:var(--color-api-background);border-radius:.25rem;font-family:var(--font-stack--monospace);font-size:var(--api-font-size);font-weight:700;margin-left:-.25rem;margin-right:-.25rem;padding:.25rem .5rem .25rem 3em;text-indent:-2.5em;transition:background .1s ease-out}.sig:not(.sig-inline):hover{background:var(--color-api-background-hover)}.sig:not(.sig-inline) a.reference .viewcode-link{font-weight:400;width:4.25rem}em.property{font-style:normal}em.property:first-child{color:var(--color-api-keyword)}.sig-name{color:var(--color-api-name)}.sig-prename{color:var(--color-api-pre-name);font-weight:400}.sig-paren{color:var(--color-api-paren)}.sig-param{font-style:normal}div.deprecated,div.versionadded,div.versionchanged,div.versionremoved{border-left:.1875rem solid;border-radius:.125rem;padding-left:.75rem}div.deprecated p,div.versionadded p,div.versionchanged p,div.versionremoved p{margin-bottom:.125rem;margin-top:.125rem}div.versionadded{border-color:var(--color-api-added-border)}div.versionadded .versionmodified{color:var(--color-api-added)}div.versionchanged{border-color:var(--color-api-changed-border)}div.versionchanged 
.versionmodified{color:var(--color-api-changed)}div.deprecated{border-color:var(--color-api-deprecated-border)}div.deprecated .versionmodified{color:var(--color-api-deprecated)}div.versionremoved{border-color:var(--color-api-removed-border)}div.versionremoved .versionmodified{color:var(--color-api-removed)}.viewcode-back,.viewcode-link{float:right;text-align:right}.line-block{margin-bottom:.75rem;margin-top:.5rem}.line-block .line-block{margin-bottom:0;margin-top:0;padding-left:1rem}.code-block-caption,article p.caption,table>caption{font-size:var(--font-size--small);text-align:center}.toctree-wrapper.compound .caption,.toctree-wrapper.compound :not(.caption)>.caption-text{font-size:var(--font-size--small);margin-bottom:0;text-align:initial;text-transform:uppercase}.toctree-wrapper.compound>ul{margin-bottom:0;margin-top:0}.sig-inline,code.literal{background:var(--color-inline-code-background);border-radius:.2em;font-size:var(--font-size--small--2);padding:.1em .2em}pre.literal-block .sig-inline,pre.literal-block code.literal{font-size:inherit;padding:0}p .sig-inline,p code.literal{border:1px solid var(--color-background-border)}.sig-inline{font-family:var(--font-stack--monospace)}div[class*=" highlight-"],div[class^=highlight-]{display:flex;margin:1em 0}div[class*=" highlight-"] .table-wrapper,div[class^=highlight-] .table-wrapper,pre{margin:0;padding:0}pre{overflow:auto}article[role=main] .highlight pre{line-height:1.5}.highlight pre,pre.literal-block{font-size:var(--code-font-size);padding:.625rem .875rem}pre.literal-block{background-color:var(--color-code-background);border-radius:.2rem;color:var(--color-code-foreground);margin-bottom:1rem;margin-top:1rem}.highlight{border-radius:.2rem;width:100%}.highlight .gp,.highlight span.linenos{pointer-events:none;-webkit-user-select:none;-moz-user-select:none;user-select:none}.highlight .hll{display:block;margin-left:-.875rem;margin-right:-.875rem;padding-left:.875rem;padding-right:.875rem}.code-block-caption{background-color:var(--color-code-background);border-bottom:1px solid;border-radius:.25rem;border-bottom-left-radius:0;border-bottom-right-radius:0;border-color:var(--color-background-border);color:var(--color-code-foreground);display:flex;font-weight:300;padding:.625rem .875rem}.code-block-caption+div[class]{margin-top:0}.code-block-caption+div[class] pre{border-top-left-radius:0;border-top-right-radius:0}.highlighttable{display:block;width:100%}.highlighttable tbody{display:block}.highlighttable tr{display:flex}.highlighttable td.linenos{background-color:var(--color-code-background);border-bottom-left-radius:.2rem;border-top-left-radius:.2rem;color:var(--color-code-foreground);padding:.625rem 0 .625rem .875rem}.highlighttable .linenodiv{box-shadow:-.0625rem 0 var(--color-foreground-border) inset;font-size:var(--code-font-size);padding-right:.875rem}.highlighttable td.code{display:block;flex:1;overflow:hidden;padding:0}.highlighttable td.code .highlight{border-bottom-left-radius:0;border-top-left-radius:0}.highlight span.linenos{box-shadow:-.0625rem 0 var(--color-foreground-border) inset;display:inline-block;margin-right:.875rem;padding-left:0;padding-right:.875rem}.footnote-reference{font-size:var(--font-size--small--4);vertical-align:super}dl.footnote.brackets{color:var(--color-foreground-secondary);display:grid;font-size:var(--font-size--small);grid-template-columns:max-content auto}dl.footnote.brackets dt{margin:0}dl.footnote.brackets dt>.fn-backref{margin-left:.25rem}dl.footnote.brackets dt:after{content:":"}dl.footnote.brackets dt 
.brackets:before{content:"["}dl.footnote.brackets dt .brackets:after{content:"]"}dl.footnote.brackets dd{margin:0;padding:0 1rem}aside.footnote{color:var(--color-foreground-secondary);font-size:var(--font-size--small)}aside.footnote>span,div.citation>span{float:left;font-weight:500;padding-right:.25rem}aside.footnote>:not(span),div.citation>p{margin-left:2rem}img{box-sizing:border-box;height:auto;max-width:100%}article .figure,article figure{border-radius:.2rem;margin:0}article .figure :last-child,article figure :last-child{margin-bottom:0}article .align-left{clear:left;float:left;margin:0 1rem 1rem}article .align-right{clear:right;float:right;margin:0 1rem 1rem}article .align-center,article .align-default{display:block;margin-left:auto;margin-right:auto;text-align:center}article table.align-default{display:table;text-align:initial}.domainindex-jumpbox,.genindex-jumpbox{border-bottom:1px solid var(--color-background-border);border-top:1px solid var(--color-background-border);padding:.25rem}.domainindex-section h2,.genindex-section h2{margin-bottom:.5rem;margin-top:.75rem}.domainindex-section ul,.genindex-section ul{margin-bottom:0;margin-top:0}ol,ul{margin-bottom:1rem;margin-top:1rem;padding-left:1.2rem}ol li>p:first-child,ul li>p:first-child{margin-bottom:.25rem;margin-top:.25rem}ol li>p:last-child,ul li>p:last-child{margin-top:.25rem}ol li>ol,ol li>ul,ul li>ol,ul li>ul{margin-bottom:.5rem;margin-top:.5rem}ol.arabic{list-style:decimal}ol.loweralpha{list-style:lower-alpha}ol.upperalpha{list-style:upper-alpha}ol.lowerroman{list-style:lower-roman}ol.upperroman{list-style:upper-roman}.simple li>ol,.simple li>ul,.toctree-wrapper li>ol,.toctree-wrapper li>ul{margin-bottom:0;margin-top:0}.field-list dt,.option-list dt,dl.footnote dt,dl.glossary dt,dl.simple dt,dl:not([class]) dt{font-weight:500;margin-top:.25rem}.field-list dt+dt,.option-list dt+dt,dl.footnote dt+dt,dl.glossary dt+dt,dl.simple dt+dt,dl:not([class]) dt+dt{margin-top:0}.field-list dt .classifier:before,.option-list dt .classifier:before,dl.footnote dt .classifier:before,dl.glossary dt .classifier:before,dl.simple dt .classifier:before,dl:not([class]) dt .classifier:before{content:":";margin-left:.2rem;margin-right:.2rem}.field-list dd ul,.field-list dd>p:first-child,.option-list dd ul,.option-list dd>p:first-child,dl.footnote dd ul,dl.footnote dd>p:first-child,dl.glossary dd ul,dl.glossary dd>p:first-child,dl.simple dd ul,dl.simple dd>p:first-child,dl:not([class]) dd ul,dl:not([class]) dd>p:first-child{margin-top:.125rem}.field-list dd ul,.option-list dd ul,dl.footnote dd ul,dl.glossary dd ul,dl.simple dd ul,dl:not([class]) dd ul{margin-bottom:.125rem}.math-wrapper{overflow-x:auto;width:100%}div.math{position:relative;text-align:center}div.math .headerlink,div.math:focus .headerlink{display:none}div.math:hover .headerlink{display:inline-block}div.math span.eqno{position:absolute;right:.5rem;top:50%;transform:translateY(-50%);z-index:1}abbr[title]{cursor:help}.problematic{color:var(--color-problematic)}kbd:not(.compound){background-color:var(--color-background-secondary);border:1px solid var(--color-foreground-border);border-radius:.2rem;box-shadow:0 .0625rem 0 rgba(0,0,0,.2),inset 0 0 0 .125rem var(--color-background-primary);color:var(--color-foreground-primary);display:inline-block;font-size:var(--font-size--small--3);margin:0 .2rem;padding:0 .2rem;vertical-align:text-bottom}blockquote{background:var(--color-background-secondary);border-left:4px solid var(--color-background-border);margin-left:0;margin-right:0;padding:.5rem 
1rem}blockquote .attribution{font-weight:600;text-align:right}blockquote.highlights,blockquote.pull-quote{font-size:1.25em}blockquote.epigraph,blockquote.pull-quote{border-left-width:0;border-radius:.5rem}blockquote.highlights{background:transparent;border-left-width:0}p .reference img{vertical-align:middle}p.rubric{font-size:1.125em;font-weight:700;line-height:1.25}dd p.rubric{font-size:var(--font-size--small);font-weight:inherit;line-height:inherit;text-transform:uppercase}article .sidebar{background-color:var(--color-background-secondary);border:1px solid var(--color-background-border);border-radius:.2rem;clear:right;float:right;margin-left:1rem;margin-right:0;width:30%}article .sidebar>*{padding-left:1rem;padding-right:1rem}article .sidebar>ol,article .sidebar>ul{padding-left:2.2rem}article .sidebar .sidebar-title{border-bottom:1px solid var(--color-background-border);font-weight:500;margin:0;padding:.5rem 1rem}[role=main] .table-wrapper.container{margin-bottom:.5rem;margin-top:1rem;overflow-x:auto;padding:.2rem .2rem .75rem;width:100%}table.docutils{border-collapse:collapse;border-radius:.2rem;border-spacing:0;box-shadow:0 .2rem .5rem rgba(0,0,0,.05),0 0 .0625rem rgba(0,0,0,.1)}table.docutils th{background:var(--color-table-header-background)}table.docutils td,table.docutils th{border-bottom:1px solid var(--color-table-border);border-left:1px solid var(--color-table-border);border-right:1px solid var(--color-table-border);padding:0 .25rem}table.docutils td p,table.docutils th p{margin:.25rem}table.docutils td:first-child,table.docutils th:first-child{border-left:none}table.docutils td:last-child,table.docutils th:last-child{border-right:none}table.docutils td.text-left,table.docutils th.text-left{text-align:left}table.docutils td.text-right,table.docutils th.text-right{text-align:right}table.docutils td.text-center,table.docutils th.text-center{text-align:center}:target{scroll-margin-top:2.5rem}@media(max-width:67em){:target{scroll-margin-top:calc(2.5rem + var(--header-height))}section>span:target{scroll-margin-top:calc(2.8rem + var(--header-height))}}.headerlink{font-weight:100;-webkit-user-select:none;-moz-user-select:none;user-select:none}.code-block-caption>.headerlink,dl dt>.headerlink,figcaption p>.headerlink,h1>.headerlink,h2>.headerlink,h3>.headerlink,h4>.headerlink,h5>.headerlink,h6>.headerlink,p.caption>.headerlink,table>caption>.headerlink{margin-left:.5rem;visibility:hidden}.code-block-caption:hover>.headerlink,dl dt:hover>.headerlink,figcaption p:hover>.headerlink,h1:hover>.headerlink,h2:hover>.headerlink,h3:hover>.headerlink,h4:hover>.headerlink,h5:hover>.headerlink,h6:hover>.headerlink,p.caption:hover>.headerlink,table>caption:hover>.headerlink{visibility:visible}.code-block-caption>.toc-backref,dl dt>.toc-backref,figcaption p>.toc-backref,h1>.toc-backref,h2>.toc-backref,h3>.toc-backref,h4>.toc-backref,h5>.toc-backref,h6>.toc-backref,p.caption>.toc-backref,table>caption>.toc-backref{color:inherit;text-decoration-line:none}figure:hover>figcaption>p>.headerlink,table:hover>caption>.headerlink{visibility:visible}:target>h1:first-of-type,:target>h2:first-of-type,:target>h3:first-of-type,:target>h4:first-of-type,:target>h5:first-of-type,:target>h6:first-of-type,span:target~h1:first-of-type,span:target~h2:first-of-type,span:target~h3:first-of-type,span:target~h4:first-of-type,span:target~h5:first-of-type,span:target~h6:first-of-type{background-color:var(--color-highlight-on-target)}:target>h1:first-of-type code.literal,:target>h2:first-of-type 
code.literal,:target>h3:first-of-type code.literal,:target>h4:first-of-type code.literal,:target>h5:first-of-type code.literal,:target>h6:first-of-type code.literal,span:target~h1:first-of-type code.literal,span:target~h2:first-of-type code.literal,span:target~h3:first-of-type code.literal,span:target~h4:first-of-type code.literal,span:target~h5:first-of-type code.literal,span:target~h6:first-of-type code.literal{background-color:transparent}.literal-block-wrapper:target .code-block-caption,.this-will-duplicate-information-and-it-is-still-useful-here li :target,figure:target,table:target>caption{background-color:var(--color-highlight-on-target)}dt:target{background-color:var(--color-highlight-on-target)!important}.footnote-reference:target,.footnote>dt:target+dd{background-color:var(--color-highlight-on-target)}.guilabel{background-color:var(--color-guilabel-background);border:1px solid var(--color-guilabel-border);border-radius:.5em;color:var(--color-guilabel-text);font-size:.9em;padding:0 .3em}footer{display:flex;flex-direction:column;font-size:var(--font-size--small);margin-top:2rem}.bottom-of-page{align-items:center;border-top:1px solid var(--color-background-border);color:var(--color-foreground-secondary);display:flex;justify-content:space-between;line-height:1.5;margin-top:1rem;padding-bottom:1rem;padding-top:1rem}@media(max-width:46em){.bottom-of-page{flex-direction:column-reverse;gap:.25rem;text-align:center}}.bottom-of-page .left-details{font-size:var(--font-size--small)}.bottom-of-page .right-details{display:flex;flex-direction:column;gap:.25rem;text-align:right}.bottom-of-page .icons{display:flex;font-size:1rem;gap:.25rem;justify-content:flex-end}.bottom-of-page .icons a{text-decoration:none}.bottom-of-page .icons img,.bottom-of-page .icons svg{font-size:1.125rem;height:1em;width:1em}.related-pages a{align-items:center;display:flex;text-decoration:none}.related-pages a:hover .page-info .title{color:var(--color-link);text-decoration:underline;text-decoration-color:var(--color-link-underline)}.related-pages a svg.furo-related-icon,.related-pages a svg.furo-related-icon>use{color:var(--color-foreground-border);flex-shrink:0;height:.75rem;margin:0 .5rem;width:.75rem}.related-pages a.next-page{clear:right;float:right;max-width:50%;text-align:right}.related-pages a.prev-page{clear:left;float:left;max-width:50%}.related-pages a.prev-page svg{transform:rotate(180deg)}.page-info{display:flex;flex-direction:column;overflow-wrap:anywhere}.next-page .page-info{align-items:flex-end}.page-info .context{align-items:center;color:var(--color-foreground-muted);display:flex;font-size:var(--font-size--small);padding-bottom:.1rem;text-decoration:none}ul.search{list-style:none;padding-left:0}ul.search li{border-bottom:1px solid var(--color-background-border);padding:1rem 0}[role=main] .highlighted{background-color:var(--color-highlighted-background);color:var(--color-highlighted-text)}.sidebar-brand{display:flex;flex-direction:column;flex-shrink:0;padding:var(--sidebar-item-spacing-vertical) var(--sidebar-item-spacing-horizontal);text-decoration:none}.sidebar-brand-text{color:var(--color-sidebar-brand-text);font-size:1.5rem;overflow-wrap:break-word}.sidebar-brand-text,.sidebar-logo-container{margin:var(--sidebar-item-spacing-vertical) 0}.sidebar-logo{display:block;margin:0 
auto;max-width:100%}.sidebar-search-container{align-items:center;background:var(--color-sidebar-search-background);display:flex;margin-top:var(--sidebar-search-space-above);position:relative}.sidebar-search-container:focus-within,.sidebar-search-container:hover{background:var(--color-sidebar-search-background--focus)}.sidebar-search-container:before{background-color:var(--color-sidebar-search-icon);content:"";height:var(--sidebar-search-icon-size);left:var(--sidebar-item-spacing-horizontal);-webkit-mask-image:var(--icon-search);mask-image:var(--icon-search);position:absolute;width:var(--sidebar-search-icon-size)}.sidebar-search{background:transparent;border:none;border-bottom:1px solid var(--color-sidebar-search-border);border-top:1px solid var(--color-sidebar-search-border);box-sizing:border-box;color:var(--color-sidebar-search-foreground);padding:var(--sidebar-search-input-spacing-vertical) var(--sidebar-search-input-spacing-horizontal) var(--sidebar-search-input-spacing-vertical) calc(var(--sidebar-item-spacing-horizontal) + var(--sidebar-search-input-spacing-horizontal) + var(--sidebar-search-icon-size));width:100%;z-index:10}.sidebar-search:focus{outline:none}.sidebar-search::-moz-placeholder{font-size:var(--sidebar-search-input-font-size)}.sidebar-search::placeholder{font-size:var(--sidebar-search-input-font-size)}#searchbox .highlight-link{margin:0;padding:var(--sidebar-item-spacing-vertical) var(--sidebar-item-spacing-horizontal) 0;text-align:center}#searchbox .highlight-link a{color:var(--color-sidebar-search-icon);font-size:var(--font-size--small--2)}.sidebar-tree{font-size:var(--sidebar-item-font-size);margin-bottom:var(--sidebar-item-spacing-vertical);margin-top:var(--sidebar-tree-space-above)}.sidebar-tree ul{display:flex;flex-direction:column;list-style:none;margin-bottom:0;margin-top:0;padding:0}.sidebar-tree li{margin:0;position:relative}.sidebar-tree li>ul{margin-left:var(--sidebar-item-spacing-horizontal)}.sidebar-tree .icon,.sidebar-tree .reference{color:var(--color-sidebar-link-text)}.sidebar-tree .reference{box-sizing:border-box;display:inline-block;height:100%;line-height:var(--sidebar-item-line-height);overflow-wrap:anywhere;padding:var(--sidebar-item-spacing-vertical) var(--sidebar-item-spacing-horizontal);text-decoration:none;width:100%}.sidebar-tree .reference:hover{background:var(--color-sidebar-item-background--hover);color:var(--color-sidebar-link-text)}.sidebar-tree .reference.external:after{color:var(--color-sidebar-link-text);content:url("data:image/svg+xml;charset=utf-8,%3Csvg xmlns='http://www.w3.org/2000/svg' width='12' height='12' fill='none' stroke='%23607D8B' stroke-linecap='round' stroke-linejoin='round' stroke-width='1.5' viewBox='0 0 24 24'%3E%3Cpath stroke='none' d='M0 0h24v24H0z'/%3E%3Cpath d='M11 7H6a2 2 0 0 0-2 2v9a2 2 0 0 0 2 2h9a2 2 0 0 0 2-2v-5M10 14 20 4M15 4h5v5'/%3E%3C/svg%3E");margin:0 .25rem;vertical-align:middle}.sidebar-tree .current-page>.reference{font-weight:700}.sidebar-tree label{align-items:center;cursor:pointer;display:flex;height:var(--sidebar-item-height);justify-content:center;position:absolute;right:0;top:0;-webkit-user-select:none;-moz-user-select:none;user-select:none;width:var(--sidebar-expander-width)}.sidebar-tree .caption,.sidebar-tree :not(.caption)>.caption-text{color:var(--color-sidebar-caption-text);font-size:var(--sidebar-caption-font-size);font-weight:700;margin:var(--sidebar-caption-space-above) 0 0 0;padding:var(--sidebar-item-spacing-vertical) 
var(--sidebar-item-spacing-horizontal);text-transform:uppercase}.sidebar-tree li.has-children>.reference{padding-right:var(--sidebar-expander-width)}.sidebar-tree .toctree-l1>.reference,.sidebar-tree .toctree-l1>label .icon{color:var(--color-sidebar-link-text--top-level)}.sidebar-tree label{background:var(--color-sidebar-item-expander-background)}.sidebar-tree label:hover{background:var(--color-sidebar-item-expander-background--hover)}.sidebar-tree .current>.reference{background:var(--color-sidebar-item-background--current)}.sidebar-tree .current>.reference:hover{background:var(--color-sidebar-item-background--hover)}.toctree-checkbox{display:none;position:absolute}.toctree-checkbox~ul{display:none}.toctree-checkbox~label .icon svg{transform:rotate(90deg)}.toctree-checkbox:checked~ul{display:block}.toctree-checkbox:checked~label .icon svg{transform:rotate(-90deg)}.toc-title-container{padding:var(--toc-title-padding);padding-top:var(--toc-spacing-vertical)}.toc-title{color:var(--color-toc-title-text);font-size:var(--toc-title-font-size);padding-left:var(--toc-spacing-horizontal);text-transform:uppercase}.no-toc{display:none}.toc-tree-container{padding-bottom:var(--toc-spacing-vertical)}.toc-tree{border-left:1px solid var(--color-background-border);font-size:var(--toc-font-size);line-height:1.3;padding-left:calc(var(--toc-spacing-horizontal) - var(--toc-item-spacing-horizontal))}.toc-tree>ul>li:first-child{padding-top:0}.toc-tree>ul>li:first-child>ul{padding-left:0}.toc-tree>ul>li:first-child>a{display:none}.toc-tree ul{list-style-type:none;margin-bottom:0;margin-top:0;padding-left:var(--toc-item-spacing-horizontal)}.toc-tree li{padding-top:var(--toc-item-spacing-vertical)}.toc-tree li.scroll-current>.reference{color:var(--color-toc-item-text--active);font-weight:700}.toc-tree a.reference{color:var(--color-toc-item-text);overflow-wrap:anywhere;text-decoration:none}.toc-scroll{max-height:100vh;overflow-y:scroll}.contents:not(.this-will-duplicate-information-and-it-is-still-useful-here){background:rgba(255,0,0,.25);color:var(--color-problematic)}.contents:not(.this-will-duplicate-information-and-it-is-still-useful-here):before{content:"ERROR: Adding a table of contents in Furo-based documentation is unnecessary, and does not work well with existing styling. 
Add a 'this-will-duplicate-information-and-it-is-still-useful-here' class, if you want an escape hatch."}.text-align\:left>p{text-align:left}.text-align\:center>p{text-align:center}.text-align\:right>p{text-align:right} +/*# sourceMappingURL=furo.css.map*/ \ No newline at end of file diff --git a/_static/styles/furo.css.map b/_static/styles/furo.css.map new file mode 100644 index 00000000..3ecc3715 --- /dev/null +++ b/_static/styles/furo.css.map @@ -0,0 +1 @@ +{"version":3,"file":"styles/furo.css","mappings":"AAAA,2EAA2E,CAU3E,KACE,gBAAiB,CACjB,6BACF,CASA,KACE,QACF,CAMA,KACE,aACF,CAOA,GACE,aAAc,CACd,cACF,CAUA,GACE,sBAAuB,CACvB,QAAS,CACT,gBACF,CAOA,IACE,+BAAiC,CACjC,aACF,CASA,EACE,4BACF,CAOA,YACE,kBAAmB,CACnB,yBAA0B,CAC1B,gCACF,CAMA,SAEE,kBACF,CAOA,cAGE,+BAAiC,CACjC,aACF,CAeA,QAEE,aAAc,CACd,aAAc,CACd,iBAAkB,CAClB,uBACF,CAEA,IACE,aACF,CAEA,IACE,SACF,CASA,IACE,iBACF,CAUA,sCAKE,mBAAoB,CACpB,cAAe,CACf,gBAAiB,CACjB,QACF,CAOA,aAEE,gBACF,CAOA,cAEE,mBACF,CAMA,gDAIE,yBACF,CAMA,wHAIE,iBAAkB,CAClB,SACF,CAMA,4GAIE,6BACF,CAMA,SACE,0BACF,CASA,OACE,qBAAsB,CACtB,aAAc,CACd,aAAc,CACd,cAAe,CACf,SAAU,CACV,kBACF,CAMA,SACE,uBACF,CAMA,SACE,aACF,CAOA,6BAEE,qBAAsB,CACtB,SACF,CAMA,kFAEE,WACF,CAOA,cACE,4BAA6B,CAC7B,mBACF,CAMA,yCACE,uBACF,CAOA,6BACE,yBAA0B,CAC1B,YACF,CASA,QACE,aACF,CAMA,QACE,iBACF,CAiBA,kBACE,YACF,CCvVA,aAcE,kEACE,uBAOF,WACE,iDAMF,kCACE,wBAEF,qCAEE,uBADA,uBACA,CAEF,SACE,wBAtBA,CCpBJ,iBAGE,qBAEA,sBACA,0BAFA,oBAHA,4BACA,oBAKA,6BAIA,2CAFA,mBACA,sCAFA,4BAGA,CAEF,gBACE,aCTF,KCGE,mHAEA,wGAEA,wCAAyC,CAEzC,wBAAyB,CACzB,wBAAyB,CACzB,4BAA6B,CAC7B,yBAA0B,CAC1B,2BAA4B,CAG5B,sDAAuD,CACvD,gDAAiD,CACjD,wDAAyD,CAGzD,0CAA2C,CAC3C,gDAAiD,CACjD,gDAAiD,CAKjD,gCAAiC,CACjC,sCAAuC,CAGvC,2CAA4C,CAG5C,uCAAwC,CCjCxC,+FAGA,uBAAwB,CAGxB,iCAAkC,CAClC,kCAAmC,CAEnC,+BAAgC,CAChC,sCAAuC,CACvC,sCAAuC,CACvC,qGAIA,mDAAoD,CAEpD,mCAAoC,CACpC,8CAA+C,CAC/C,gDAAiD,CACjD,kCAAmC,CACnC,6DAA8D,CAG9D,6BAA8B,CAC9B,6BAA8B,CAC9B,+BAAgC,CAChC,kCAAmC,CACnC,kCAAmC,CCPjC,+jBCYA,iqCAZF,iaCVA,8KAOA,4SAWA,4SAUA,0CACA,gEAGA,0CAGA,gEAGA,yCACA,+DAIA,4CACA,kEAGA,wCAUA,8DACA,uCAGA,4DACA,sCACA,2DAGA,4CACA,kEACA,uCAGA,6DACA,2GAGA,sHAEA,yFAEA,+CACA,+EAGA,4MAOA,gCACA,sHAIA,kCACA,uEACA,gEACA,4DACA,kEAGA,2DACA,sDACA,0CACA,8CACA,wGAGA,0BACA,iCAGA,+DACA,+BACA,sCACA,+DAEA,kGACA,oCACA,yDACA,sCL7HF,kCAEA,sDAIA,0CK2HE,kEAIA,oDACA,sDAGA,oCACA,oEAEA,0DACA,qDAIA,oDACA,6DAIA,iEAIA,2DAIA,2DAGA,4DACA,gEAIA,gEAEA,gFAEA,oNASA,qDLxKE,gFAGE,4DAIF,oEKkHF,yEAEA,6DAGA,0DAEA,uDACA,qDACA,wDAIA,6DAIA,yDACA,2DAIA,uCAGA,wCACA,sDAGA,+CAGA,6DAEA,iDACA,+DAEA,wDAEA,sEAMA,0DACA,sBACA,mEL9JI,wEAEA,iCACE,+BAMN,wEAGA,iCACE,kFAEA,uEAIF,gEACE,8BAGF,qEMvDA,sCAKA,wFAKA,iCAIA,0BAWA,iCACA,4BACA,mCAGA,+BAEA,sCACA,4BAEA,mCAEA,sCAKA,sDAIA,gCAEA,gEAQF,wCAME,sBACA,kCAKA,uBAEA,gEAIA,2BAIA,mCAEA,qCACA,iCAGE,+BACA,wEAEE,iCACA,kFAGF,6BACA,0CACF,kCAEE,8BACE,8BACA,qEAEE,sCACA,wFCnFN,iCAGF,2DAEE,4BACA,oCAGA,mIAGA,4HACE,gEAMJ,+CAGE,sBACA,yCAEF,uBAEE,sEAKA,gDACA,kEAGA,iFAGE,YAGF,EACA,4HAQF,mBACE,6BACA,mBACA,wCACA,wCACA,2CAIA,eAGA,mBAKE,mBAGA,CAJA,uCACA,iBAFF,gBACE,CAKE,mBACA,mBAGJ,oBAIF,+BAGE,kDACA,OADA,kBAGA,CAFA,gBAEA,mBACA,oBAEA,sCACA,OAGF,cAHE,WAGF,GAEE,oBACA,CAHF,gBAGE,CC9Gc,YDiHd,+CAIF,SAEE,CAPF,UACE,wBAMA,4BAEA,GAGA,uBACA,CAJA,yBAGA,CACA,iDAKA,2CAGA,2DAQA,iBACA,uCAGA,kEAKE,SAKJ,8BACE,yDACA,2BAEA,oBACA,8BAEA,yDAEE,4BAEJ,uCACE,CACA,iEAGA,CAEA,wCACE,uBACA,kDAEA,0DAEE,CAJF,oBAIE,0GAWN,aACE,CAHA,YAGA,4HASA,+CAGF,sBACE,WACA,WAQA,4BAFF,0CAEE,CARA,qCAsBA,CAdA,iBAEA,kBACE,aADF,4BACE,WAMF,2BAGF,qCAEE,CAXE,UAWF,+BAGA,uBAEA,SAEA,0CAIE,CANF,qCAEA,CAIE,2DACE,gBAIN,+CAIA,CAEA,kDAKE,CAPF,8BAEA,CAOE,YACA,CAjBI,2BAGN,CAHM,WAcJ,UA
GA,CAEA,2GAIF,iCAGE,8BAIA,qBACA,oBACF,uBAOI,0CAIA,CATF,6DAKE,CALF,sBASE,qCAKF,CACE,cACA,CAFF,sBAEE,CACA,+BAEA,qBAEE,WAKN,aACE,sCAGA,mBAEA,6BAMA,kCACA,CAJA,sBACA,aAEA,CAJA,eACA,MAIA,2FAEA,UAGA,YACA,sBACE,8BAEA,CALF,aACA,WAIE,OACA,oBAEF,uBACE,WAEF,YAFE,UAEF,eAgBA,kBACE,CAhBA,qDAQF,qCAGF,CAGI,YACF,CAJF,2BAGI,CAEA,eACA,qBAGA,mEAEA,qBACA,8BAIA,kBADF,kBACE,yBAEJ,oCAGI,qDAIJ,+BAGI,oCAEA,+CAQF,4CACE,yBACF,2BAOE,sBACA,CAHA,WACA,CAFF,cACE,CAJA,YAGF,CAEE,SAEA,mBAGA,kDAEE,CAJF,cAEA,cAEE,sBAEA,mBADA,YACA,uBACA,mDACE,CADF,YACE,iDAEA,uCAEN,+DAOE,mBADF,sBACE,mBAGF,aACE,sCAIA,aADF,WACE,CAKF,SACE,CAHJ,kBAEE,CAJE,gBAEJ,CAHI,iBAMA,yFAKA,aACA,eACA,cElbJ,iBAEE,aADA,iBACA,6BAEA,kCAEA,SACA,UAIA,gCACA,CALA,SAEA,SAEA,CAJA,0EAEA,CAFA,OAKA,CAGA,mDACE,iBAGF,gCACE,CADF,UACE,aAEJ,iCAEE,CAFF,UAEE,wCAEA,WACA,WADA,UACA,CACA,4CAGA,MACA,CADA,KACA,wCACA,UAGA,CAJA,UAIA,6DAUA,0CACE,CAFF,mBAEE,wEACA,CAVA,YACA,CAMF,mBAJE,OAOA,gBAJJ,gCACE,CANE,cACA,CAHA,oBACA,CAGA,QAGJ,CAII,0BACA,CADA,UACA,wCAEJ,kBACE,0DACA,gCACE,kBACA,CADA,YACA,oEACA,2CAMF,mDAII,CALN,YACE,CANE,cAKJ,CACE,iBAII,kEACA,yCACE,kDACA,yDACE,+CACA,uBANN,CAMM,+BANN,uCACE,qDACA,4BAEE,mBADA,0CACA,CADA,qBACA,0DACE,wCACA,sGALJ,oCACA,sBACE,kBAFF,UAEE,2CACA,wFACE,cACA,kEANN,uBACE,iDACA,CADA,UACA,0DACE,wDAEE,iEACA,qEANN,sCACE,CAGE,iBAHF,gBAGE,qBACE,CAJJ,uBACA,gDACE,wDACA,6DAHF,2CACA,CADA,gBACA,eACE,CAGE,sBANN,8BACE,CAII,iBAFF,4DACA,WACE,YADF,uCACE,6EACA,2BANN,8CACE,kDACA,0CACE,8BACA,yFACE,sBACA,sFALJ,mEACA,sBACE,kEACA,6EACE,uCACA,kEALJ,qGAEE,kEACA,6EACE,uCACA,kEALJ,8CACA,uDACE,sEACA,2EACE,sCACA,iEALJ,mGACA,qCACE,oDACA,0DACE,6GACA,gDAGR,yDCrEA,sEACE,CACA,6GACE,gEACF,iGAIF,wFACE,qDAGA,mGAEE,2CAEF,4FACE,gCACF,wGACE,8DAEE,6FAIA,iJAKN,6GACE,gDAKF,yDACA,qCAGA,6BACA,kBACA,qDAKA,oCAEA,+DAGA,2CAGE,oDAIA,oEAEE,qBAGJ,wDAEE,uCAEF,kEAGA,8CAEA,uDAIF,gEAIE,6BACA,gEAIA,+CACE,0EAIF,sDAEE,+DAGF,sCACA,8BACE,oCAEJ,wBACE,4FAEE,gBAEJ,yGAGI,kBAGJ,CCnHE,2MCFF,oBAGE,wGAKA,iCACE,CADF,wBACE,8GAQA,mBCjBJ,2GAIE,mBACA,6HAMA,YACE,mIAYF,eACA,CAHF,YAGE,4FAGE,8BAKF,uBAkBE,sCACA,CADA,qBAbA,wCAIA,CALF,8BACE,CADF,gBAKE,wCACA,CAOA,kDACA,CACA,kCAKF,6BAGA,4CACE,kDACA,eAGF,cACE,aACA,iBACA,yBACA,8BACA,WAGJ,2BACE,cAGA,+BACA,CAHA,eAGA,wCACA,YACA,iBACA,uEAGA,0BACA,2CAEA,8EAGI,qBACA,CAFF,kBAEE,kBAGN,0CAGE,mCAGA,4BAIA,gEACE,qCACA,8BAEA,gBACA,+CACA,iCAEF,iCAEE,gEACA,qCAGF,8BAEE,+BAIA,yCAEE,qBADA,gBACA,yBAKF,eACA,CAFF,YACE,CACA,iBACA,qDAEA,mDCvIJ,2FAOE,iCACA,CAEA,eACA,CAHA,kBAEA,CAFA,wBAGA,8BACA,eACE,CAFF,YAEE,0BACA,8CAGA,oBACE,oCAGA,kBACE,8DAEA,iBAEN,UACE,8BAIJ,+CAEE,qDAEF,kDAIE,YAEF,CAFE,YAEF,CCpCE,mFADA,kBAKE,CAJF,IAGA,aACE,mCAGA,iDACE,+BAEJ,wBAEE,mBAMA,6CAEF,CAJE,mBAEA,CAEF,kCAGE,CARF,kBACE,CAHA,eAUA,YACA,mBACA,CADA,UACA,wCC9BF,oBDkCE,wBCnCJ,uCACE,+BACA,+DACA,sBAGA,qBCDA,6CAIE,CAPF,uBAGA,CDGE,oBACF,yDAEE,CCDE,2CAGF,CAJA,kCACE,CDJJ,YACE,CAIA,eCTF,CDKE,uBCMA,gCACE,YAEF,oCAEE,wBACA,0BAIF,iBAEA,cADF,UACE,uBAEA,iCAEA,wCAEA,6CAMA,CAYF,gCATI,4BASJ,CAZE,mCAEE,iCAUJ,4BAGE,4DADA,+BACA,CAHF,qBAGE,sCACE,OAEF,iBAHA,SAGA,iHACE,2DAKF,CANA,8EAMA,uSAEE,kBAEF,+FACE,yCCjEJ,WACA,yBAGA,uBACA,gBAEA,uCAIA,CAJA,iCAIA,uCAGA,UACE,gBACA,qBAEA,0CClBJ,gBACE,KAGF,qBACE,YAGF,CAHE,cAGF,gCAEE,mBACA,iEAEA,oCACA,wCAEA,sBACA,WAEA,CAFA,YAEA,8EAEA,mCAFA,iBAEA,6BAIA,wEAKA,sDAIE,CARF,mDAIA,CAIE,cAEF,8CAIA,oBAFE,iBAEF,8CAGE,eAEF,CAFE,YAEF,OAEE,kBAGJ,CAJI,eACA,CAFF,mBAKF,yCCjDE,oBACA,CAFA,iBAEA,uCAKE,iBACA,qCAGA,mBCZJ,CDWI,gBCXJ,6BAEE,eACA,sBAGA,eAEA,sBACA,oDACA,iGAMA,gBAFE,YAEF,8FAME,iJCnBF,YACA,gNAWE,gDAEF,iSAaE,kBACE,gHAKF,oCACE,eACF,CADE,UACF,8CACE,gDACF,wCACE,oBCxCJ,oBAEF,6BACE,QACE,kDAGF,yBACE,kDAmBA,kDAEF,CAhBA,+CAaA,CAbA,oBAaA,0FACE,CADF,gGAfF,cACE,gBACA,CAa
A,0BAGA,mQACE,gBAGF,oMACE,iBACA,CAFF,eACE,CADF,gBAEE,aAGJ,iCAEE,CAFF,wCAEE,wBAUE,+VAIE,uEAHA,2BAGA,wXAKJ,iDAGF,CARM,+CACE,iDAIN,CALI,gBAQN,mHACE,gBAGF,2DACE,0EAOA,0EAGF,gBAEE,6DC/EA,kDACA,gCACA,qDAGA,qBACA,qDCFA,cACA,eAEA,yBAGF,sBAEE,iBACA,sNAWA,iBACE,kBACA,wRAgBA,kBAEA,iOAgBA,uCACE,uEAEA,kBAEF,qUAuBE,iDAIJ,CACA,geCxFF,4BAEE,CAQA,6JACA,iDAIA,sEAGA,mDAOF,iDAGE,4DAIA,8CACA,qDAEE,eAFF,cAEE,oBAEF,uBAFE,kCAGA,eACA,iBACA,mBAIA,mDACA,CAHA,uCAEA,CAJA,0CACA,CAIA,gBAJA,gBACA,oBADA,gBAIA,wBAEJ,gBAGE,6BACA,YAHA,iBAGA,gCACA,iEAEA,6CACA,sDACA,0BADA,wBACA,0BACA,oIAIA,mBAFA,YAEA,qBACA,0CAIE,uBAEF,CAHA,yBACE,CAEF,iDACE,mFAKJ,oCACE,CANE,aAKJ,CACE,qEAIA,YAFA,WAEA,CAHA,aACA,CAEA,gBACE,4BACA,sBADA,aACA,gCAMF,oCACA,yDACA,2CAEA,qBAGE,kBAEA,CACA,mCAIF,CARE,YACA,CAOF,iCAEE,CAPA,oBACA,CAQA,oBACE,uDAEJ,sDAGA,CAHA,cAGA,0BACE,oDAIA,oCACA,4BACA,sBAGA,cAEA,oFAGA,sBAEA,yDACE,CAIF,iBAJE,wBAIF,6CAHE,6CAKA,eACA,aACA,CADA,cACA,yCAGJ,kBACE,CAKA,iDAEA,CARF,aACE,4CAGA,kBAIA,wEAGA,wDAGA,kCAOA,iDAGA,CAPF,WAEE,sCAEA,CAJF,2CACE,CAMA,qCACA,+BARF,kBACE,qCAOA,iBAsBA,sBACE,CAvBF,WAKA,CACE,0DAIF,CALA,uDACE,CANF,sBAqBA,4CACA,CALA,gRAIA,YAEE,6CAEN,mCAEE,+CASA,6EAIA,4BChNA,SDmNA,qFCnNA,gDACA,sCAGA,qCACA,sDACA,CAKA,kDAGA,CARA,0CAQA,kBAGA,YACA,sBACA,iBAFA,gBADF,YACE,CAHA,SAKA,kBAEA,SAFA,iBAEA,uEAGA,CAEE,6CAFF,oCAgBI,CAdF,yBACE,qBACF,CAGF,oBACE,CAIF,WACE,CALA,2CAGA,uBACF,CACE,mFAGE,CALF,qBAEA,UAGE,gCAIF,sDAEA,CALE,oCAKF,yCC7CJ,oCACE,CD+CA,yXAQE,sCCrDJ,wCAGA,oCACE","sources":["webpack:///./node_modules/normalize.css/normalize.css","webpack:///./src/furo/assets/styles/base/_print.sass","webpack:///./src/furo/assets/styles/base/_screen-readers.sass","webpack:///./src/furo/assets/styles/base/_theme.sass","webpack:///./src/furo/assets/styles/variables/_fonts.scss","webpack:///./src/furo/assets/styles/variables/_spacing.scss","webpack:///./src/furo/assets/styles/variables/_icons.scss","webpack:///./src/furo/assets/styles/variables/_admonitions.scss","webpack:///./src/furo/assets/styles/variables/_colors.scss","webpack:///./src/furo/assets/styles/base/_typography.sass","webpack:///./src/furo/assets/styles/_scaffold.sass","webpack:///./src/furo/assets/styles/variables/_layout.scss","webpack:///./src/furo/assets/styles/content/_admonitions.sass","webpack:///./src/furo/assets/styles/content/_api.sass","webpack:///./src/furo/assets/styles/content/_blocks.sass","webpack:///./src/furo/assets/styles/content/_captions.sass","webpack:///./src/furo/assets/styles/content/_code.sass","webpack:///./src/furo/assets/styles/content/_footnotes.sass","webpack:///./src/furo/assets/styles/content/_images.sass","webpack:///./src/furo/assets/styles/content/_indexes.sass","webpack:///./src/furo/assets/styles/content/_lists.sass","webpack:///./src/furo/assets/styles/content/_math.sass","webpack:///./src/furo/assets/styles/content/_misc.sass","webpack:///./src/furo/assets/styles/content/_rubrics.sass","webpack:///./src/furo/assets/styles/content/_sidebar.sass","webpack:///./src/furo/assets/styles/content/_tables.sass","webpack:///./src/furo/assets/styles/content/_target.sass","webpack:///./src/furo/assets/styles/content/_gui-labels.sass","webpack:///./src/furo/assets/styles/components/_footer.sass","webpack:///./src/furo/assets/styles/components/_sidebar.sass","webpack:///./src/furo/assets/styles/components/_table_of_contents.sass","webpack:///./src/furo/assets/styles/_shame.sass"],"sourcesContent":["/*! 
normalize.css v8.0.1 | MIT License | github.com/necolas/normalize.css */\n\n/* Document\n ========================================================================== */\n\n/**\n * 1. Correct the line height in all browsers.\n * 2. Prevent adjustments of font size after orientation changes in iOS.\n */\n\nhtml {\n line-height: 1.15; /* 1 */\n -webkit-text-size-adjust: 100%; /* 2 */\n}\n\n/* Sections\n ========================================================================== */\n\n/**\n * Remove the margin in all browsers.\n */\n\nbody {\n margin: 0;\n}\n\n/**\n * Render the `main` element consistently in IE.\n */\n\nmain {\n display: block;\n}\n\n/**\n * Correct the font size and margin on `h1` elements within `section` and\n * `article` contexts in Chrome, Firefox, and Safari.\n */\n\nh1 {\n font-size: 2em;\n margin: 0.67em 0;\n}\n\n/* Grouping content\n ========================================================================== */\n\n/**\n * 1. Add the correct box sizing in Firefox.\n * 2. Show the overflow in Edge and IE.\n */\n\nhr {\n box-sizing: content-box; /* 1 */\n height: 0; /* 1 */\n overflow: visible; /* 2 */\n}\n\n/**\n * 1. Correct the inheritance and scaling of font size in all browsers.\n * 2. Correct the odd `em` font sizing in all browsers.\n */\n\npre {\n font-family: monospace, monospace; /* 1 */\n font-size: 1em; /* 2 */\n}\n\n/* Text-level semantics\n ========================================================================== */\n\n/**\n * Remove the gray background on active links in IE 10.\n */\n\na {\n background-color: transparent;\n}\n\n/**\n * 1. Remove the bottom border in Chrome 57-\n * 2. Add the correct text decoration in Chrome, Edge, IE, Opera, and Safari.\n */\n\nabbr[title] {\n border-bottom: none; /* 1 */\n text-decoration: underline; /* 2 */\n text-decoration: underline dotted; /* 2 */\n}\n\n/**\n * Add the correct font weight in Chrome, Edge, and Safari.\n */\n\nb,\nstrong {\n font-weight: bolder;\n}\n\n/**\n * 1. Correct the inheritance and scaling of font size in all browsers.\n * 2. Correct the odd `em` font sizing in all browsers.\n */\n\ncode,\nkbd,\nsamp {\n font-family: monospace, monospace; /* 1 */\n font-size: 1em; /* 2 */\n}\n\n/**\n * Add the correct font size in all browsers.\n */\n\nsmall {\n font-size: 80%;\n}\n\n/**\n * Prevent `sub` and `sup` elements from affecting the line height in\n * all browsers.\n */\n\nsub,\nsup {\n font-size: 75%;\n line-height: 0;\n position: relative;\n vertical-align: baseline;\n}\n\nsub {\n bottom: -0.25em;\n}\n\nsup {\n top: -0.5em;\n}\n\n/* Embedded content\n ========================================================================== */\n\n/**\n * Remove the border on images inside links in IE 10.\n */\n\nimg {\n border-style: none;\n}\n\n/* Forms\n ========================================================================== */\n\n/**\n * 1. Change the font styles in all browsers.\n * 2. Remove the margin in Firefox and Safari.\n */\n\nbutton,\ninput,\noptgroup,\nselect,\ntextarea {\n font-family: inherit; /* 1 */\n font-size: 100%; /* 1 */\n line-height: 1.15; /* 1 */\n margin: 0; /* 2 */\n}\n\n/**\n * Show the overflow in IE.\n * 1. Show the overflow in Edge.\n */\n\nbutton,\ninput { /* 1 */\n overflow: visible;\n}\n\n/**\n * Remove the inheritance of text transform in Edge, Firefox, and IE.\n * 1. 
Remove the inheritance of text transform in Firefox.\n */\n\nbutton,\nselect { /* 1 */\n text-transform: none;\n}\n\n/**\n * Correct the inability to style clickable types in iOS and Safari.\n */\n\nbutton,\n[type=\"button\"],\n[type=\"reset\"],\n[type=\"submit\"] {\n -webkit-appearance: button;\n}\n\n/**\n * Remove the inner border and padding in Firefox.\n */\n\nbutton::-moz-focus-inner,\n[type=\"button\"]::-moz-focus-inner,\n[type=\"reset\"]::-moz-focus-inner,\n[type=\"submit\"]::-moz-focus-inner {\n border-style: none;\n padding: 0;\n}\n\n/**\n * Restore the focus styles unset by the previous rule.\n */\n\nbutton:-moz-focusring,\n[type=\"button\"]:-moz-focusring,\n[type=\"reset\"]:-moz-focusring,\n[type=\"submit\"]:-moz-focusring {\n outline: 1px dotted ButtonText;\n}\n\n/**\n * Correct the padding in Firefox.\n */\n\nfieldset {\n padding: 0.35em 0.75em 0.625em;\n}\n\n/**\n * 1. Correct the text wrapping in Edge and IE.\n * 2. Correct the color inheritance from `fieldset` elements in IE.\n * 3. Remove the padding so developers are not caught out when they zero out\n * `fieldset` elements in all browsers.\n */\n\nlegend {\n box-sizing: border-box; /* 1 */\n color: inherit; /* 2 */\n display: table; /* 1 */\n max-width: 100%; /* 1 */\n padding: 0; /* 3 */\n white-space: normal; /* 1 */\n}\n\n/**\n * Add the correct vertical alignment in Chrome, Firefox, and Opera.\n */\n\nprogress {\n vertical-align: baseline;\n}\n\n/**\n * Remove the default vertical scrollbar in IE 10+.\n */\n\ntextarea {\n overflow: auto;\n}\n\n/**\n * 1. Add the correct box sizing in IE 10.\n * 2. Remove the padding in IE 10.\n */\n\n[type=\"checkbox\"],\n[type=\"radio\"] {\n box-sizing: border-box; /* 1 */\n padding: 0; /* 2 */\n}\n\n/**\n * Correct the cursor style of increment and decrement buttons in Chrome.\n */\n\n[type=\"number\"]::-webkit-inner-spin-button,\n[type=\"number\"]::-webkit-outer-spin-button {\n height: auto;\n}\n\n/**\n * 1. Correct the odd appearance in Chrome and Safari.\n * 2. Correct the outline style in Safari.\n */\n\n[type=\"search\"] {\n -webkit-appearance: textfield; /* 1 */\n outline-offset: -2px; /* 2 */\n}\n\n/**\n * Remove the inner padding in Chrome and Safari on macOS.\n */\n\n[type=\"search\"]::-webkit-search-decoration {\n -webkit-appearance: none;\n}\n\n/**\n * 1. Correct the inability to style clickable types in iOS and Safari.\n * 2. 
Change font properties to `inherit` in Safari.\n */\n\n::-webkit-file-upload-button {\n -webkit-appearance: button; /* 1 */\n font: inherit; /* 2 */\n}\n\n/* Interactive\n ========================================================================== */\n\n/*\n * Add the correct display in Edge, IE 10+, and Firefox.\n */\n\ndetails {\n display: block;\n}\n\n/*\n * Add the correct display in all browsers.\n */\n\nsummary {\n display: list-item;\n}\n\n/* Misc\n ========================================================================== */\n\n/**\n * Add the correct display in IE 10+.\n */\n\ntemplate {\n display: none;\n}\n\n/**\n * Add the correct display in IE 10.\n */\n\n[hidden] {\n display: none;\n}\n","// This file contains styles for managing print media.\n\n////////////////////////////////////////////////////////////////////////////////\n// Hide elements not relevant to print media.\n////////////////////////////////////////////////////////////////////////////////\n@media print\n // Hide icon container.\n .content-icon-container\n display: none !important\n\n // Hide showing header links if hovering over when printing.\n .headerlink\n display: none !important\n\n // Hide mobile header.\n .mobile-header\n display: none !important\n\n // Hide navigation links.\n .related-pages\n display: none !important\n\n////////////////////////////////////////////////////////////////////////////////\n// Tweaks related to decolorization.\n////////////////////////////////////////////////////////////////////////////////\n@media print\n // Apply a border around code which no longer have a color background.\n .highlight\n border: 0.1pt solid var(--color-foreground-border)\n\n////////////////////////////////////////////////////////////////////////////////\n// Avoid page break in some relevant cases.\n////////////////////////////////////////////////////////////////////////////////\n@media print\n ul, ol, dl, a, table, pre, blockquote, p\n page-break-inside: avoid\n\n h1, h2, h3, h4, h5, h6, img, figure, caption\n page-break-inside: avoid\n page-break-after: avoid\n\n ul, ol, dl\n page-break-before: avoid\n",".visually-hidden\n position: absolute !important\n width: 1px !important\n height: 1px !important\n padding: 0 !important\n margin: -1px !important\n overflow: hidden !important\n clip: rect(0,0,0,0) !important\n white-space: nowrap !important\n border: 0 !important\n color: var(--color-foreground-primary)\n background: var(--color-background-primary)\n\n:-moz-focusring\n outline: auto\n","// This file serves as the \"skeleton\" of the theming logic.\n//\n// This contains the bulk of the logic for handling dark mode, color scheme\n// toggling and the handling of color-scheme-specific hiding of elements.\n\nbody\n @include fonts\n @include spacing\n @include icons\n @include admonitions\n @include default-admonition(#651fff, \"abstract\")\n @include default-topic(#14B8A6, \"pencil\")\n\n @include colors\n\n.only-light\n display: block !important\nhtml body .only-dark\n display: none !important\n\n// Ignore dark-mode hints if print media.\n@media not print\n // Enable dark-mode, if requested.\n body[data-theme=\"dark\"]\n @include colors-dark\n\n html & .only-light\n display: none !important\n .only-dark\n display: block !important\n\n // Enable dark mode, unless explicitly told to avoid.\n @media (prefers-color-scheme: dark)\n body:not([data-theme=\"light\"])\n @include colors-dark\n\n html & .only-light\n display: none !important\n .only-dark\n display: block !important\n\n//\n// Theme toggle 
presentation\n//\nbody[data-theme=\"auto\"]\n .theme-toggle svg.theme-icon-when-auto-light\n display: block\n\n @media (prefers-color-scheme: dark)\n .theme-toggle svg.theme-icon-when-auto-dark\n display: block\n .theme-toggle svg.theme-icon-when-auto-light\n display: none\n\nbody[data-theme=\"dark\"]\n .theme-toggle svg.theme-icon-when-dark\n display: block\n\nbody[data-theme=\"light\"]\n .theme-toggle svg.theme-icon-when-light\n display: block\n","// Fonts used by this theme.\n//\n// There are basically two things here -- using the system font stack and\n// defining sizes for various elements in %ages. We could have also used `em`\n// but %age is easier to reason about for me.\n\n@mixin fonts {\n // These are adapted from https://systemfontstack.com/\n --font-stack: -apple-system, BlinkMacSystemFont, Segoe UI, Helvetica, Arial,\n sans-serif, Apple Color Emoji, Segoe UI Emoji;\n --font-stack--monospace: \"SFMono-Regular\", Menlo, Consolas, Monaco,\n Liberation Mono, Lucida Console, monospace;\n --font-stack--headings: var(--font-stack);\n\n --font-size--normal: 100%;\n --font-size--small: 87.5%;\n --font-size--small--2: 81.25%;\n --font-size--small--3: 75%;\n --font-size--small--4: 62.5%;\n\n // Sidebar\n --sidebar-caption-font-size: var(--font-size--small--2);\n --sidebar-item-font-size: var(--font-size--small);\n --sidebar-search-input-font-size: var(--font-size--small);\n\n // Table of Contents\n --toc-font-size: var(--font-size--small--3);\n --toc-font-size--mobile: var(--font-size--normal);\n --toc-title-font-size: var(--font-size--small--4);\n\n // Admonitions\n //\n // These aren't defined in terms of %ages, since nesting these is permitted.\n --admonition-font-size: 0.8125rem;\n --admonition-title-font-size: 0.8125rem;\n\n // Code\n --code-font-size: var(--font-size--small--2);\n\n // API\n --api-font-size: var(--font-size--small);\n}\n","// Spacing for various elements on the page\n//\n// If the user wants to tweak things in a certain way, they are permitted to.\n// They also have to deal with the consequences though!\n\n@mixin spacing {\n // Header!\n --header-height: calc(\n var(--sidebar-item-line-height) + 4 * #{var(--sidebar-item-spacing-vertical)}\n );\n --header-padding: 0.5rem;\n\n // Sidebar\n --sidebar-tree-space-above: 1.5rem;\n --sidebar-caption-space-above: 1rem;\n\n --sidebar-item-line-height: 1rem;\n --sidebar-item-spacing-vertical: 0.5rem;\n --sidebar-item-spacing-horizontal: 1rem;\n --sidebar-item-height: calc(\n var(--sidebar-item-line-height) + 2 *#{var(--sidebar-item-spacing-vertical)}\n );\n\n --sidebar-expander-width: var(--sidebar-item-height); // be square\n\n --sidebar-search-space-above: 0.5rem;\n --sidebar-search-input-spacing-vertical: 0.5rem;\n --sidebar-search-input-spacing-horizontal: 0.5rem;\n --sidebar-search-input-height: 1rem;\n --sidebar-search-icon-size: var(--sidebar-search-input-height);\n\n // Table of Contents\n --toc-title-padding: 0.25rem 0;\n --toc-spacing-vertical: 1.5rem;\n --toc-spacing-horizontal: 1.5rem;\n --toc-item-spacing-vertical: 0.4rem;\n --toc-item-spacing-horizontal: 1rem;\n}\n","// Expose theme icons as CSS variables.\n\n$icons: (\n // Adapted from tabler-icons\n // url: https://tablericons.com/\n \"search\":\n url('data:image/svg+xml;charset=utf-8,'),\n // Factored out from mkdocs-material on 24-Aug-2020.\n // url: https://squidfunk.github.io/mkdocs-material/reference/admonitions/\n \"pencil\":\n url('data:image/svg+xml;charset=utf-8,'),\n \"abstract\":\n url('data:image/svg+xml;charset=utf-8,'),\n \"info\":\n 
url('data:image/svg+xml;charset=utf-8,'),\n \"flame\":\n url('data:image/svg+xml;charset=utf-8,'),\n \"question\":\n url('data:image/svg+xml;charset=utf-8,'),\n \"warning\":\n url('data:image/svg+xml;charset=utf-8,'),\n \"failure\":\n url('data:image/svg+xml;charset=utf-8,'),\n \"spark\":\n url('data:image/svg+xml;charset=utf-8,')\n);\n\n@mixin icons {\n @each $name, $glyph in $icons {\n --icon-#{$name}: #{$glyph};\n }\n}\n","// Admonitions\n\n// Structure of these is:\n// admonition-class: color \"icon-name\";\n//\n// The colors are translated into CSS variables below. The icons are\n// used directly in the main declarations to set the `mask-image` in\n// the title.\n\n// prettier-ignore\n$admonitions: (\n // Each of these has an reST directives for it.\n \"caution\": #ff9100 \"spark\",\n \"warning\": #ff9100 \"warning\",\n \"danger\": #ff5252 \"spark\",\n \"attention\": #ff5252 \"warning\",\n \"error\": #ff5252 \"failure\",\n \"hint\": #00c852 \"question\",\n \"tip\": #00c852 \"info\",\n \"important\": #00bfa5 \"flame\",\n \"note\": #00b0ff \"pencil\",\n \"seealso\": #448aff \"info\",\n \"admonition-todo\": #808080 \"pencil\"\n);\n\n@mixin default-admonition($color, $icon-name) {\n --color-admonition-title: #{$color};\n --color-admonition-title-background: #{rgba($color, 0.2)};\n\n --icon-admonition-default: var(--icon-#{$icon-name});\n}\n\n@mixin default-topic($color, $icon-name) {\n --color-topic-title: #{$color};\n --color-topic-title-background: #{rgba($color, 0.2)};\n\n --icon-topic-default: var(--icon-#{$icon-name});\n}\n\n@mixin admonitions {\n @each $name, $values in $admonitions {\n --color-admonition-title--#{$name}: #{nth($values, 1)};\n --color-admonition-title-background--#{$name}: #{rgba(\n nth($values, 1),\n 0.2\n )};\n }\n}\n","// Colors used throughout this theme.\n//\n// The aim is to give the user more control. 
Thus, instead of hard-coding colors\n// in various parts of the stylesheet, the approach taken is to define all\n// colors as CSS variables and reusing them in all the places.\n//\n// `colors-dark` depends on `colors` being included at a lower specificity.\n\n@mixin colors {\n --color-problematic: #b30000;\n\n // Base Colors\n --color-foreground-primary: black; // for main text and headings\n --color-foreground-secondary: #5a5c63; // for secondary text\n --color-foreground-muted: #6b6f76; // for muted text\n --color-foreground-border: #878787; // for content borders\n\n --color-background-primary: white; // for content\n --color-background-secondary: #f8f9fb; // for navigation + ToC\n --color-background-hover: #efeff4ff; // for navigation-item hover\n --color-background-hover--transparent: #efeff400;\n --color-background-border: #eeebee; // for UI borders\n --color-background-item: #ccc; // for \"background\" items (eg: copybutton)\n\n // Announcements\n --color-announcement-background: #000000dd;\n --color-announcement-text: #eeebee;\n\n // Brand colors\n --color-brand-primary: #0a4bff;\n --color-brand-content: #2757dd;\n --color-brand-visited: #872ee0;\n\n // API documentation\n --color-api-background: var(--color-background-hover--transparent);\n --color-api-background-hover: var(--color-background-hover);\n --color-api-overall: var(--color-foreground-secondary);\n --color-api-name: var(--color-problematic);\n --color-api-pre-name: var(--color-problematic);\n --color-api-paren: var(--color-foreground-secondary);\n --color-api-keyword: var(--color-foreground-primary);\n\n --color-api-added: #21632c;\n --color-api-added-border: #38a84d;\n --color-api-changed: #046172;\n --color-api-changed-border: #06a1bc;\n --color-api-deprecated: #605706;\n --color-api-deprecated-border: #f0d90f;\n --color-api-removed: #b30000;\n --color-api-removed-border: #ff5c5c;\n\n --color-highlight-on-target: #ffffcc;\n\n // Inline code background\n --color-inline-code-background: var(--color-background-secondary);\n\n // Highlighted text (search)\n --color-highlighted-background: #ddeeff;\n --color-highlighted-text: var(--color-foreground-primary);\n\n // GUI Labels\n --color-guilabel-background: #ddeeff80;\n --color-guilabel-border: #bedaf580;\n --color-guilabel-text: var(--color-foreground-primary);\n\n // Admonitions!\n --color-admonition-background: transparent;\n\n //////////////////////////////////////////////////////////////////////////////\n // Everything below this should be one of:\n // - var(...)\n // - *-gradient(...)\n // - special literal values (eg: transparent, none)\n //////////////////////////////////////////////////////////////////////////////\n\n // Tables\n --color-table-header-background: var(--color-background-secondary);\n --color-table-border: var(--color-background-border);\n\n // Cards\n --color-card-border: var(--color-background-secondary);\n --color-card-background: transparent;\n --color-card-marginals-background: var(--color-background-secondary);\n\n // Header\n --color-header-background: var(--color-background-primary);\n --color-header-border: var(--color-background-border);\n --color-header-text: var(--color-foreground-primary);\n\n // Sidebar (left)\n --color-sidebar-background: var(--color-background-secondary);\n --color-sidebar-background-border: var(--color-background-border);\n\n --color-sidebar-brand-text: var(--color-foreground-primary);\n --color-sidebar-caption-text: var(--color-foreground-muted);\n --color-sidebar-link-text: var(--color-foreground-secondary);\n 
--color-sidebar-link-text--top-level: var(--color-brand-primary);\n\n --color-sidebar-item-background: var(--color-sidebar-background);\n --color-sidebar-item-background--current: var(\n --color-sidebar-item-background\n );\n --color-sidebar-item-background--hover: linear-gradient(\n 90deg,\n var(--color-background-hover--transparent) 0%,\n var(--color-background-hover) var(--sidebar-item-spacing-horizontal),\n var(--color-background-hover) 100%\n );\n\n --color-sidebar-item-expander-background: transparent;\n --color-sidebar-item-expander-background--hover: var(\n --color-background-hover\n );\n\n --color-sidebar-search-text: var(--color-foreground-primary);\n --color-sidebar-search-background: var(--color-background-secondary);\n --color-sidebar-search-background--focus: var(--color-background-primary);\n --color-sidebar-search-border: var(--color-background-border);\n --color-sidebar-search-icon: var(--color-foreground-muted);\n\n // Table of Contents (right)\n --color-toc-background: var(--color-background-primary);\n --color-toc-title-text: var(--color-foreground-muted);\n --color-toc-item-text: var(--color-foreground-secondary);\n --color-toc-item-text--hover: var(--color-foreground-primary);\n --color-toc-item-text--active: var(--color-brand-primary);\n\n // Actual page contents\n --color-content-foreground: var(--color-foreground-primary);\n --color-content-background: transparent;\n\n // Links\n --color-link: var(--color-brand-content);\n --color-link-underline: var(--color-background-border);\n --color-link--hover: var(--color-brand-content);\n --color-link-underline--hover: var(--color-foreground-border);\n\n --color-link--visited: var(--color-brand-visited);\n --color-link-underline--visited: var(--color-background-border);\n --color-link--visited--hover: var(--color-brand-visited);\n --color-link-underline--visited--hover: var(--color-foreground-border);\n}\n\n@mixin colors-dark {\n --color-problematic: #ee5151;\n\n // Base Colors\n --color-foreground-primary: #cfd0d0; // for main text and headings\n --color-foreground-secondary: #9ca0a5; // for secondary text\n --color-foreground-muted: #81868d; // for muted text\n --color-foreground-border: #666666; // for content borders\n\n --color-background-primary: #131416; // for content\n --color-background-secondary: #1a1c1e; // for navigation + ToC\n --color-background-hover: #1e2124ff; // for navigation-item hover\n --color-background-hover--transparent: #1e212400;\n --color-background-border: #303335; // for UI borders\n --color-background-item: #444; // for \"background\" items (eg: copybutton)\n\n // Announcements\n --color-announcement-background: #000000dd;\n --color-announcement-text: #eeebee;\n\n // Brand colors\n --color-brand-primary: #3d94ff;\n --color-brand-content: #5ca5ff;\n --color-brand-visited: #b27aeb;\n\n // Highlighted text (search)\n --color-highlighted-background: #083563;\n\n // GUI Labels\n --color-guilabel-background: #08356380;\n --color-guilabel-border: #13395f80;\n\n // API documentation\n --color-api-keyword: var(--color-foreground-secondary);\n --color-highlight-on-target: #333300;\n\n --color-api-added: #3db854;\n --color-api-added-border: #267334;\n --color-api-changed: #09b0ce;\n --color-api-changed-border: #056d80;\n --color-api-deprecated: #b1a10b;\n --color-api-deprecated-border: #6e6407;\n --color-api-removed: #ff7575;\n --color-api-removed-border: #b03b3b;\n\n // Admonitions\n --color-admonition-background: #18181a;\n\n // Cards\n --color-card-border: var(--color-background-secondary);\n 
--color-card-background: #18181a;\n --color-card-marginals-background: var(--color-background-hover);\n}\n","// This file contains the styling for making the content throughout the page,\n// including fonts, paragraphs, headings and spacing among these elements.\n\nbody\n font-family: var(--font-stack)\npre,\ncode,\nkbd,\nsamp\n font-family: var(--font-stack--monospace)\n\n// Make fonts look slightly nicer.\nbody\n -webkit-font-smoothing: antialiased\n -moz-osx-font-smoothing: grayscale\n\n// Line height from Bootstrap 4.1\narticle\n line-height: 1.5\n\n//\n// Headings\n//\nh1,\nh2,\nh3,\nh4,\nh5,\nh6\n line-height: 1.25\n font-family: var(--font-stack--headings)\n font-weight: bold\n\n border-radius: 0.5rem\n margin-top: 0.5rem\n margin-bottom: 0.5rem\n margin-left: -0.5rem\n margin-right: -0.5rem\n padding-left: 0.5rem\n padding-right: 0.5rem\n\n + p\n margin-top: 0\n\nh1\n font-size: 2.5em\n margin-top: 1.75rem\n margin-bottom: 1rem\nh2\n font-size: 2em\n margin-top: 1.75rem\nh3\n font-size: 1.5em\nh4\n font-size: 1.25em\nh5\n font-size: 1.125em\nh6\n font-size: 1em\n\nsmall\n opacity: 75%\n font-size: 80%\n\n// Paragraph\np\n margin-top: 0.5rem\n margin-bottom: 0.75rem\n\n// Horizontal rules\nhr.docutils\n height: 1px\n padding: 0\n margin: 2rem 0\n background-color: var(--color-background-border)\n border: 0\n\n.centered\n text-align: center\n\n// Links\na\n text-decoration: underline\n\n color: var(--color-link)\n text-decoration-color: var(--color-link-underline)\n\n &:visited\n color: var(--color-link--visited)\n text-decoration-color: var(--color-link-underline--visited)\n &:hover\n color: var(--color-link--visited--hover)\n text-decoration-color: var(--color-link-underline--visited--hover)\n\n &:hover\n color: var(--color-link--hover)\n text-decoration-color: var(--color-link-underline--hover)\n &.muted-link\n color: inherit\n &:hover\n color: var(--color-link--hover)\n text-decoration-color: var(--color-link-underline--hover)\n &:visited\n color: var(--color-link--visited--hover)\n text-decoration-color: var(--color-link-underline--visited--hover)\n","// This file contains the styles for the overall layouting of the documentation\n// skeleton, including the responsive changes as well as sidebar toggles.\n//\n// This is implemented as a mobile-last design, which isn't ideal, but it is\n// reasonably good-enough and I got pretty tired by the time I'd finished this\n// to move the rules around to fix this. Shouldn't take more than 3-4 hours,\n// if you know what you're doing tho.\n\n// HACK: Not all browsers account for the scrollbar width in media queries.\n// This results in horizontal scrollbars in the breakpoint where we go\n// from displaying everything to hiding the ToC. 
We accomodate for this by\n// adding a bit of padding to the TOC drawer, disabling the horizontal\n// scrollbar and allowing the scrollbars to cover the padding.\n// https://www.456bereastreet.com/archive/201301/media_query_width_and_vertical_scrollbars/\n\n// HACK: Always having the scrollbar visible, prevents certain browsers from\n// causing the content to stutter horizontally between taller-than-viewport and\n// not-taller-than-viewport pages.\n\nhtml\n overflow-x: hidden\n overflow-y: scroll\n scroll-behavior: smooth\n\n.sidebar-scroll, .toc-scroll, article[role=main] *\n // Override Firefox scrollbar style\n scrollbar-width: thin\n scrollbar-color: var(--color-foreground-border) transparent\n\n // Override Chrome scrollbar styles\n &::-webkit-scrollbar\n width: 0.25rem\n height: 0.25rem\n &::-webkit-scrollbar-thumb\n background-color: var(--color-foreground-border)\n border-radius: 0.125rem\n\n//\n// Overalls\n//\nhtml,\nbody\n height: 100%\n color: var(--color-foreground-primary)\n background: var(--color-background-primary)\n\n.skip-to-content\n position: fixed\n padding: 1rem\n border-radius: 1rem\n left: 0.25rem\n top: 0.25rem\n z-index: 40\n background: var(--color-background-primary)\n color: var(--color-foreground-primary)\n\n transform: translateY(-200%)\n transition: transform 300ms ease-in-out\n\n &:focus-within\n transform: translateY(0%)\n\narticle\n color: var(--color-content-foreground)\n background: var(--color-content-background)\n overflow-wrap: break-word\n\n.page\n display: flex\n // fill the viewport for pages with little content.\n min-height: 100%\n\n.mobile-header\n width: 100%\n height: var(--header-height)\n background-color: var(--color-header-background)\n color: var(--color-header-text)\n border-bottom: 1px solid var(--color-header-border)\n\n // Looks like sub-script/super-script have this, and we need this to\n // be \"on top\" of those.\n z-index: 10\n\n // We don't show the header on large screens.\n display: none\n\n // Add shadow when scrolled\n &.scrolled\n border-bottom: none\n box-shadow: 0 0 0.2rem rgba(0, 0, 0, 0.1), 0 0.2rem 0.4rem rgba(0, 0, 0, 0.2)\n\n .header-center\n a\n color: var(--color-header-text)\n text-decoration: none\n\n.main\n display: flex\n flex: 1\n\n// Sidebar (left) also covers the entire left portion of screen.\n.sidebar-drawer\n box-sizing: border-box\n\n border-right: 1px solid var(--color-sidebar-background-border)\n background: var(--color-sidebar-background)\n\n display: flex\n justify-content: flex-end\n // These next two lines took me two days to figure out.\n width: calc((100% - #{$full-width}) / 2 + #{$sidebar-width})\n min-width: $sidebar-width\n\n// Scroll-along sidebars\n.sidebar-container,\n.toc-drawer\n box-sizing: border-box\n width: $sidebar-width\n\n.toc-drawer\n background: var(--color-toc-background)\n // See HACK described on top of this document\n padding-right: 1rem\n\n.sidebar-sticky,\n.toc-sticky\n position: sticky\n top: 0\n height: min(100%, 100vh)\n height: 100vh\n\n display: flex\n flex-direction: column\n\n.sidebar-scroll,\n.toc-scroll\n flex-grow: 1\n flex-shrink: 1\n\n overflow: auto\n scroll-behavior: smooth\n\n// Central items.\n.content\n padding: 0 $content-padding\n width: $content-width\n\n display: flex\n flex-direction: column\n justify-content: space-between\n\n.icon\n display: inline-block\n height: 1rem\n width: 1rem\n svg\n width: 100%\n height: 100%\n\n//\n// Accommodate announcement banner\n//\n.announcement\n background-color: var(--color-announcement-background)\n color: 
var(--color-announcement-text)\n\n height: var(--header-height)\n display: flex\n align-items: center\n overflow-x: auto\n & + .page\n min-height: calc(100% - var(--header-height))\n\n.announcement-content\n box-sizing: border-box\n padding: 0.5rem\n min-width: 100%\n white-space: nowrap\n text-align: center\n\n a\n color: var(--color-announcement-text)\n text-decoration-color: var(--color-announcement-text)\n\n &:hover\n color: var(--color-announcement-text)\n text-decoration-color: var(--color-link--hover)\n\n////////////////////////////////////////////////////////////////////////////////\n// Toggles for theme\n////////////////////////////////////////////////////////////////////////////////\n.no-js .theme-toggle-container // don't show theme toggle if there's no JS\n display: none\n\n.theme-toggle-container\n display: flex\n\n.theme-toggle\n display: flex\n cursor: pointer\n border: none\n padding: 0\n background: transparent\n\n.theme-toggle svg\n height: 1.25rem\n width: 1.25rem\n color: var(--color-foreground-primary)\n display: none\n\n.theme-toggle-header\n display: flex\n align-items: center\n justify-content: center\n\n////////////////////////////////////////////////////////////////////////////////\n// Toggles for elements\n////////////////////////////////////////////////////////////////////////////////\n.toc-overlay-icon, .nav-overlay-icon\n display: none\n cursor: pointer\n\n .icon\n color: var(--color-foreground-secondary)\n height: 1.5rem\n width: 1.5rem\n\n.toc-header-icon, .nav-overlay-icon\n // for when we set display: flex\n justify-content: center\n align-items: center\n\n.toc-content-icon\n height: 1.5rem\n width: 1.5rem\n\n.content-icon-container\n float: right\n display: flex\n margin-top: 1.5rem\n margin-left: 1rem\n margin-bottom: 1rem\n gap: 0.5rem\n\n .edit-this-page, .view-this-page\n svg\n color: inherit\n height: 1.25rem\n width: 1.25rem\n\n.sidebar-toggle\n position: absolute\n display: none\n// \n.sidebar-toggle[name=\"__toc\"]\n left: 20px\n.sidebar-toggle:checked\n left: 40px\n// \n\n.overlay\n position: fixed\n top: 0\n width: 0\n height: 0\n\n transition: width 0ms, height 0ms, opacity 250ms ease-out\n\n opacity: 0\n background-color: rgba(0, 0, 0, 0.54)\n.sidebar-overlay\n z-index: 20\n.toc-overlay\n z-index: 40\n\n// Keep things on top and smooth.\n.sidebar-drawer\n z-index: 30\n transition: left 250ms ease-in-out\n.toc-drawer\n z-index: 50\n transition: right 250ms ease-in-out\n\n// Show the Sidebar\n#__navigation:checked\n & ~ .sidebar-overlay\n width: 100%\n height: 100%\n opacity: 1\n & ~ .page\n .sidebar-drawer\n top: 0\n left: 0\n // Show the toc sidebar\n#__toc:checked\n & ~ .toc-overlay\n width: 100%\n height: 100%\n opacity: 1\n & ~ .page\n .toc-drawer\n top: 0\n right: 0\n\n////////////////////////////////////////////////////////////////////////////////\n// Back to top\n////////////////////////////////////////////////////////////////////////////////\n.back-to-top\n text-decoration: none\n\n display: none\n position: fixed\n left: 0\n top: 1rem\n padding: 0.5rem\n padding-right: 0.75rem\n border-radius: 1rem\n font-size: 0.8125rem\n\n background: var(--color-background-primary)\n box-shadow: 0 0.2rem 0.5rem rgba(0, 0, 0, 0.05), #6b728080 0px 0px 1px 0px\n\n z-index: 10\n\n margin-left: 50%\n transform: translateX(-50%)\n svg\n height: 1rem\n width: 1rem\n fill: currentColor\n display: inline-block\n\n span\n margin-left: 0.25rem\n\n .show-back-to-top &\n display: flex\n align-items: 
center\n\n////////////////////////////////////////////////////////////////////////////////\n// Responsive layouting\n////////////////////////////////////////////////////////////////////////////////\n// Make things a bit bigger on bigger screens.\n@media (min-width: $full-width + $sidebar-width)\n html\n font-size: 110%\n\n@media (max-width: $full-width)\n // Collapse \"toc\" into the icon.\n .toc-content-icon\n display: flex\n .toc-drawer\n position: fixed\n height: 100vh\n top: 0\n right: -$sidebar-width\n border-left: 1px solid var(--color-background-muted)\n .toc-tree\n border-left: none\n font-size: var(--toc-font-size--mobile)\n\n // Accommodate for a changed content width.\n .sidebar-drawer\n width: calc((100% - #{$full-width - $sidebar-width}) / 2 + #{$sidebar-width})\n\n@media (max-width: $content-padded-width + $sidebar-width)\n // Center the page\n .content\n margin-left: auto\n margin-right: auto\n padding: 0 $content-padding--small\n\n@media (max-width: $content-padded-width--small + $sidebar-width)\n // Collapse \"navigation\".\n .nav-overlay-icon\n display: flex\n .sidebar-drawer\n position: fixed\n height: 100vh\n width: $sidebar-width\n\n top: 0\n left: -$sidebar-width\n\n // Swap which icon is visible.\n .toc-header-icon, .theme-toggle-header\n display: flex\n .toc-content-icon, .theme-toggle-content\n display: none\n\n // Show the header.\n .mobile-header\n position: sticky\n top: 0\n display: flex\n justify-content: space-between\n align-items: center\n\n .header-left,\n .header-right\n display: flex\n height: var(--header-height)\n padding: 0 var(--header-padding)\n label\n height: 100%\n width: 100%\n user-select: none\n\n .nav-overlay-icon .icon,\n .theme-toggle svg\n height: 1.5rem\n width: 1.5rem\n\n // Add a scroll margin for the content\n :target\n scroll-margin-top: calc(var(--header-height) + 2.5rem)\n\n // Show back-to-top below the header\n .back-to-top\n top: calc(var(--header-height) + 0.5rem)\n\n // Accommodate for the header.\n .page\n flex-direction: column\n justify-content: center\n\n@media (max-width: $content-width + 2* $content-padding--small)\n // Content should respect window limits.\n .content\n width: 100%\n overflow-x: auto\n\n@media (max-width: $content-width)\n article[role=main] aside.sidebar\n float: none\n width: 100%\n margin: 1rem 0\n","// Overall Layout Variables\n//\n// Because CSS variables can't be used in media queries. 
The fact that this\n// makes the layout non-user-configurable is a good thing.\n$content-padding: 3em;\n$content-padding--small: 1em;\n$content-width: 46em;\n$sidebar-width: 15em;\n$content-padded-width: $content-width + 2 * $content-padding;\n$content-padded-width--small: $content-width + 2 * $content-padding--small;\n$full-width: $content-padded-width + 2 * $sidebar-width;\n","//\n// The design here is strongly inspired by mkdocs-material.\n.admonition, .topic\n margin: 1rem auto\n padding: 0 0.5rem 0.5rem 0.5rem\n\n background: var(--color-admonition-background)\n\n border-radius: 0.2rem\n box-shadow: 0 0.2rem 0.5rem rgba(0, 0, 0, 0.05), 0 0 0.0625rem rgba(0, 0, 0, 0.1)\n\n font-size: var(--admonition-font-size)\n\n overflow: hidden\n page-break-inside: avoid\n\n // First element should have no margin, since the title has it.\n > :nth-child(2)\n margin-top: 0\n\n // Last item should have no margin, since we'll control that w/ padding\n > :last-child\n margin-bottom: 0\n\n.admonition p.admonition-title,\np.topic-title\n position: relative\n margin: 0 -0.5rem 0.5rem\n padding-left: 2rem\n padding-right: .5rem\n padding-top: .4rem\n padding-bottom: .4rem\n\n font-weight: 500\n font-size: var(--admonition-title-font-size)\n line-height: 1.3\n\n // Our fancy icon\n &::before\n content: \"\"\n position: absolute\n left: 0.5rem\n width: 1rem\n height: 1rem\n\n// Default styles\np.admonition-title\n background-color: var(--color-admonition-title-background)\n &::before\n background-color: var(--color-admonition-title)\n mask-image: var(--icon-admonition-default)\n mask-repeat: no-repeat\n\np.topic-title\n background-color: var(--color-topic-title-background)\n &::before\n background-color: var(--color-topic-title)\n mask-image: var(--icon-topic-default)\n mask-repeat: no-repeat\n\n//\n// Variants\n//\n.admonition\n border-left: 0.2rem solid var(--color-admonition-title)\n\n @each $type, $value in $admonitions\n &.#{$type}\n border-left-color: var(--color-admonition-title--#{$type})\n > .admonition-title\n background-color: var(--color-admonition-title-background--#{$type})\n &::before\n background-color: var(--color-admonition-title--#{$type})\n mask-image: var(--icon-#{nth($value, 2)})\n\n.admonition-todo > .admonition-title\n text-transform: uppercase\n","// This file stylizes the API documentation (stuff generated by autodoc). 
It's\n// deeply nested due to how autodoc structures the HTML without enough classes\n// to select the relevant items.\n\n// API docs!\ndl[class]:not(.option-list):not(.field-list):not(.footnote):not(.glossary):not(.simple)\n // Tweak the spacing of all the things!\n dd\n margin-left: 2rem\n > :first-child\n margin-top: 0.125rem\n > :last-child\n margin-bottom: 0.75rem\n\n // This is used for the arguments\n .field-list\n margin-bottom: 0.75rem\n\n // \"Headings\" (like \"Parameters\" and \"Return\")\n > dt\n text-transform: uppercase\n font-size: var(--font-size--small)\n\n dd:empty\n margin-bottom: 0.5rem\n dd > ul\n margin-left: -1.2rem\n > li\n > p:nth-child(2)\n margin-top: 0\n // When the last-empty-paragraph follows a paragraph, it doesn't need\n // to augment the existing spacing.\n > p + p:last-child:empty\n margin-top: 0\n margin-bottom: 0\n\n // Colorize the elements\n > dt\n color: var(--color-api-overall)\n\n.sig:not(.sig-inline)\n font-weight: bold\n\n font-size: var(--api-font-size)\n font-family: var(--font-stack--monospace)\n\n margin-left: -0.25rem\n margin-right: -0.25rem\n padding-top: 0.25rem\n padding-bottom: 0.25rem\n padding-right: 0.5rem\n\n // These are intentionally em, to properly match the font size.\n padding-left: 3em\n text-indent: -2.5em\n\n border-radius: 0.25rem\n\n background: var(--color-api-background)\n transition: background 100ms ease-out\n\n &:hover\n background: var(--color-api-background-hover)\n\n // adjust the size of the [source] link on the right.\n a.reference\n .viewcode-link\n font-weight: normal\n width: 4.25rem\n\nem.property\n font-style: normal\n &:first-child\n color: var(--color-api-keyword)\n.sig-name\n color: var(--color-api-name)\n.sig-prename\n font-weight: normal\n color: var(--color-api-pre-name)\n.sig-paren\n color: var(--color-api-paren)\n.sig-param\n font-style: normal\n\ndiv.versionadded,\ndiv.versionchanged,\ndiv.deprecated,\ndiv.versionremoved\n border-left: 0.1875rem solid\n border-radius: 0.125rem\n\n padding-left: 0.75rem\n\n p\n margin-top: 0.125rem\n margin-bottom: 0.125rem\n\ndiv.versionadded\n border-color: var(--color-api-added-border)\n .versionmodified\n color: var(--color-api-added)\n\ndiv.versionchanged\n border-color: var(--color-api-changed-border)\n .versionmodified\n color: var(--color-api-changed)\n\ndiv.deprecated\n border-color: var(--color-api-deprecated-border)\n .versionmodified\n color: var(--color-api-deprecated)\n\ndiv.versionremoved\n border-color: var(--color-api-removed-border)\n .versionmodified\n color: var(--color-api-removed)\n\n// Align the [docs] and [source] to the right.\n.viewcode-link, .viewcode-back\n float: right\n text-align: right\n",".line-block\n margin-top: 0.5rem\n margin-bottom: 0.75rem\n .line-block\n margin-top: 0rem\n margin-bottom: 0rem\n padding-left: 1rem\n","// Captions\narticle p.caption,\ntable > caption,\n.code-block-caption\n font-size: var(--font-size--small)\n text-align: center\n\n// Caption above a TOCTree\n.toctree-wrapper.compound\n .caption, :not(.caption) > .caption-text\n font-size: var(--font-size--small)\n text-transform: uppercase\n\n text-align: initial\n margin-bottom: 0\n\n > ul\n margin-top: 0\n margin-bottom: 0\n","// Inline code\ncode.literal, .sig-inline\n background: var(--color-inline-code-background)\n border-radius: 0.2em\n // Make the font smaller, and use padding to recover.\n font-size: var(--font-size--small--2)\n padding: 0.1em 0.2em\n\n pre.literal-block &\n font-size: inherit\n padding: 0\n\n p &\n border: 1px solid 
var(--color-background-border)\n\n.sig-inline\n font-family: var(--font-stack--monospace)\n\n// Code and Literal Blocks\n$code-spacing-vertical: 0.625rem\n$code-spacing-horizontal: 0.875rem\n\n// Wraps every literal block + line numbers.\ndiv[class*=\" highlight-\"],\ndiv[class^=\"highlight-\"]\n margin: 1em 0\n display: flex\n\n .table-wrapper\n margin: 0\n padding: 0\n\npre\n margin: 0\n padding: 0\n overflow: auto\n\n // Needed to have more specificity than pygments' \"pre\" selector. :(\n article[role=\"main\"] .highlight &\n line-height: 1.5\n\n &.literal-block,\n .highlight &\n font-size: var(--code-font-size)\n padding: $code-spacing-vertical $code-spacing-horizontal\n\n // Make it look like all the other blocks.\n &.literal-block\n margin-top: 1rem\n margin-bottom: 1rem\n\n border-radius: 0.2rem\n background-color: var(--color-code-background)\n color: var(--color-code-foreground)\n\n// All code is always contained in this.\n.highlight\n width: 100%\n border-radius: 0.2rem\n\n // Make line numbers and prompts un-selectable.\n .gp, span.linenos\n user-select: none\n pointer-events: none\n\n // Expand the line-highlighting.\n .hll\n display: block\n margin-left: -$code-spacing-horizontal\n margin-right: -$code-spacing-horizontal\n padding-left: $code-spacing-horizontal\n padding-right: $code-spacing-horizontal\n\n/* Make code block captions be nicely integrated */\n.code-block-caption\n display: flex\n padding: $code-spacing-vertical $code-spacing-horizontal\n\n border-radius: 0.25rem\n border-bottom-left-radius: 0\n border-bottom-right-radius: 0\n font-weight: 300\n border-bottom: 1px solid\n\n background-color: var(--color-code-background)\n color: var(--color-code-foreground)\n border-color: var(--color-background-border)\n\n + div[class]\n margin-top: 0\n pre\n border-top-left-radius: 0\n border-top-right-radius: 0\n\n// When `html_codeblock_linenos_style` is table.\n.highlighttable\n width: 100%\n display: block\n tbody\n display: block\n\n tr\n display: flex\n\n // Line numbers\n td.linenos\n background-color: var(--color-code-background)\n color: var(--color-code-foreground)\n padding: $code-spacing-vertical $code-spacing-horizontal\n padding-right: 0\n border-top-left-radius: 0.2rem\n border-bottom-left-radius: 0.2rem\n\n .linenodiv\n padding-right: $code-spacing-horizontal\n font-size: var(--code-font-size)\n box-shadow: -0.0625rem 0 var(--color-foreground-border) inset\n\n // Actual code\n td.code\n padding: 0\n display: block\n flex: 1\n overflow: hidden\n\n .highlight\n border-top-left-radius: 0\n border-bottom-left-radius: 0\n\n// When `html_codeblock_linenos_style` is inline.\n.highlight\n span.linenos\n display: inline-block\n padding-left: 0\n padding-right: $code-spacing-horizontal\n margin-right: $code-spacing-horizontal\n box-shadow: -0.0625rem 0 var(--color-foreground-border) inset\n","// Inline Footnote Reference\n.footnote-reference\n font-size: var(--font-size--small--4)\n vertical-align: super\n\n// Definition list, listing the content of each note.\n// docutils <= 0.17\ndl.footnote.brackets\n font-size: var(--font-size--small)\n color: var(--color-foreground-secondary)\n\n display: grid\n grid-template-columns: max-content auto\n dt\n margin: 0\n > .fn-backref\n margin-left: 0.25rem\n\n &:after\n content: \":\"\n\n .brackets\n &:before\n content: \"[\"\n &:after\n content: \"]\"\n\n dd\n margin: 0\n padding: 0 1rem\n\n// docutils >= 0.18\naside.footnote\n font-size: var(--font-size--small)\n color: var(--color-foreground-secondary)\n\naside.footnote > 
span,\ndiv.citation > span\n float: left\n font-weight: 500\n padding-right: 0.25rem\n\naside.footnote > *:not(span),\ndiv.citation > p\n margin-left: 2rem\n","//\n// Figures\n//\nimg\n box-sizing: border-box\n max-width: 100%\n height: auto\n\narticle\n figure, .figure\n border-radius: 0.2rem\n\n margin: 0\n :last-child\n margin-bottom: 0\n\n .align-left\n float: left\n clear: left\n margin: 0 1rem 1rem\n\n .align-right\n float: right\n clear: right\n margin: 0 1rem 1rem\n\n .align-default,\n .align-center\n display: block\n text-align: center\n margin-left: auto\n margin-right: auto\n\n // WELL, table needs to be stylised like a table.\n table.align-default\n display: table\n text-align: initial\n",".genindex-jumpbox, .domainindex-jumpbox\n border-top: 1px solid var(--color-background-border)\n border-bottom: 1px solid var(--color-background-border)\n padding: 0.25rem\n\n.genindex-section, .domainindex-section\n h2\n margin-top: 0.75rem\n margin-bottom: 0.5rem\n ul\n margin-top: 0\n margin-bottom: 0\n","ul,\nol\n padding-left: 1.2rem\n\n // Space lists out like paragraphs\n margin-top: 1rem\n margin-bottom: 1rem\n // reduce margins within li.\n li\n > p:first-child\n margin-top: 0.25rem\n margin-bottom: 0.25rem\n\n > p:last-child\n margin-top: 0.25rem\n\n > ul,\n > ol\n margin-top: 0.5rem\n margin-bottom: 0.5rem\n\nol\n &.arabic\n list-style: decimal\n &.loweralpha\n list-style: lower-alpha\n &.upperalpha\n list-style: upper-alpha\n &.lowerroman\n list-style: lower-roman\n &.upperroman\n list-style: upper-roman\n\n// Don't space lists out when they're \"simple\" or in a `.. toctree::`\n.simple,\n.toctree-wrapper\n li\n > ul,\n > ol\n margin-top: 0\n margin-bottom: 0\n\n// Definition Lists\n.field-list,\n.option-list,\ndl:not([class]),\ndl.simple,\ndl.footnote,\ndl.glossary\n dt\n font-weight: 500\n margin-top: 0.25rem\n + dt\n margin-top: 0\n\n .classifier::before\n content: \":\"\n margin-left: 0.2rem\n margin-right: 0.2rem\n\n dd\n > p:first-child,\n ul\n margin-top: 0.125rem\n\n ul\n margin-bottom: 0.125rem\n",".math-wrapper\n width: 100%\n overflow-x: auto\n\ndiv.math\n position: relative\n text-align: center\n\n .headerlink,\n &:focus .headerlink\n display: none\n\n &:hover .headerlink\n display: inline-block\n\n span.eqno\n position: absolute\n right: 0.5rem\n top: 50%\n transform: translate(0, -50%)\n z-index: 1\n","// Abbreviations\nabbr[title]\n cursor: help\n\n// \"Problematic\" content, as identified by Sphinx\n.problematic\n color: var(--color-problematic)\n\n// Keyboard / Mouse \"instructions\"\nkbd:not(.compound)\n margin: 0 0.2rem\n padding: 0 0.2rem\n border-radius: 0.2rem\n border: 1px solid var(--color-foreground-border)\n color: var(--color-foreground-primary)\n vertical-align: text-bottom\n\n font-size: var(--font-size--small--3)\n display: inline-block\n\n box-shadow: 0 0.0625rem 0 rgba(0, 0, 0, 0.2), inset 0 0 0 0.125rem var(--color-background-primary)\n\n background-color: var(--color-background-secondary)\n\n// Blockquote\nblockquote\n border-left: 4px solid var(--color-background-border)\n background: var(--color-background-secondary)\n\n margin-left: 0\n margin-right: 0\n padding: 0.5rem 1rem\n\n .attribution\n font-weight: 600\n text-align: right\n\n &.pull-quote,\n &.highlights\n font-size: 1.25em\n\n &.epigraph,\n &.pull-quote\n border-left-width: 0\n border-radius: 0.5rem\n\n &.highlights\n border-left-width: 0\n background: transparent\n\n// Center align embedded-in-text images\np .reference img\n vertical-align: middle\n","p.rubric\n line-height: 1.25\n 
font-weight: bold\n font-size: 1.125em\n\n // For Numpy-style documentation that's got rubrics within it.\n // https://github.com/pradyunsg/furo/discussions/505\n dd &\n line-height: inherit\n font-weight: inherit\n\n font-size: var(--font-size--small)\n text-transform: uppercase\n","article .sidebar\n float: right\n clear: right\n width: 30%\n\n margin-left: 1rem\n margin-right: 0\n\n border-radius: 0.2rem\n background-color: var(--color-background-secondary)\n border: var(--color-background-border) 1px solid\n\n > *\n padding-left: 1rem\n padding-right: 1rem\n\n > ul, > ol // lists need additional padding, because bullets.\n padding-left: 2.2rem\n\n .sidebar-title\n margin: 0\n padding: 0.5rem 1rem\n border-bottom: var(--color-background-border) 1px solid\n\n font-weight: 500\n\n// TODO: subtitle\n// TODO: dedicated variables?\n","[role=main] .table-wrapper.container\n width: 100%\n overflow-x: auto\n margin-top: 1rem\n margin-bottom: 0.5rem\n padding: 0.2rem 0.2rem 0.75rem\n\ntable.docutils\n border-radius: 0.2rem\n border-spacing: 0\n border-collapse: collapse\n\n box-shadow: 0 0.2rem 0.5rem rgba(0, 0, 0, 0.05), 0 0 0.0625rem rgba(0, 0, 0, 0.1)\n\n th\n background: var(--color-table-header-background)\n\n td,\n th\n // Space things out properly\n padding: 0 0.25rem\n\n // Get the borders looking just-right.\n border-left: 1px solid var(--color-table-border)\n border-right: 1px solid var(--color-table-border)\n border-bottom: 1px solid var(--color-table-border)\n\n p\n margin: 0.25rem\n\n &:first-child\n border-left: none\n &:last-child\n border-right: none\n\n // MyST-parser tables set these classes for control of column alignment\n &.text-left\n text-align: left\n &.text-right\n text-align: right\n &.text-center\n text-align: center\n",":target\n scroll-margin-top: 2.5rem\n\n@media (max-width: $full-width - $sidebar-width)\n :target\n scroll-margin-top: calc(2.5rem + var(--header-height))\n\n // When a heading is selected\n section > span:target\n scroll-margin-top: calc(2.8rem + var(--header-height))\n\n// Permalinks\n.headerlink\n font-weight: 100\n user-select: none\n\nh1,\nh2,\nh3,\nh4,\nh5,\nh6,\ndl dt,\np.caption,\nfigcaption p,\ntable > caption,\n.code-block-caption\n > .headerlink\n margin-left: 0.5rem\n visibility: hidden\n &:hover > .headerlink\n visibility: visible\n\n // Don't change to link-like, if someone adds the contents directive.\n > .toc-backref\n color: inherit\n text-decoration-line: none\n\n// Figure and table captions are special.\nfigure:hover > figcaption > p > .headerlink,\ntable:hover > caption > .headerlink\n visibility: visible\n\n:target >, // Regular section[id] style anchors\nspan:target ~ // Non-regular span[id] style \"extra\" anchors\n h1,\n h2,\n h3,\n h4,\n h5,\n h6\n &:nth-of-type(1)\n background-color: var(--color-highlight-on-target)\n // .headerlink\n // visibility: visible\n code.literal\n background-color: transparent\n\ntable:target > caption,\nfigure:target\n background-color: var(--color-highlight-on-target)\n\n// Inline page contents\n.this-will-duplicate-information-and-it-is-still-useful-here li :target\n background-color: var(--color-highlight-on-target)\n\n// Code block permalinks\n.literal-block-wrapper:target .code-block-caption\n background-color: var(--color-highlight-on-target)\n\n// When a definition list item is selected\n//\n// There isn't really an alternative to !important here, due to the\n// high-specificity of API documentation's selector.\ndt:target\n background-color: var(--color-highlight-on-target) !important\n\n// 
When a footnote reference is selected\n.footnote > dt:target + dd,\n.footnote-reference:target\n background-color: var(--color-highlight-on-target)\n",".guilabel\n background-color: var(--color-guilabel-background)\n border: 1px solid var(--color-guilabel-border)\n color: var(--color-guilabel-text)\n\n padding: 0 0.3em\n border-radius: 0.5em\n font-size: 0.9em\n","// This file contains the styles used for stylizing the footer that's shown\n// below the content.\n\nfooter\n font-size: var(--font-size--small)\n display: flex\n flex-direction: column\n\n margin-top: 2rem\n\n// Bottom of page information\n.bottom-of-page\n display: flex\n align-items: center\n justify-content: space-between\n\n margin-top: 1rem\n padding-top: 1rem\n padding-bottom: 1rem\n\n color: var(--color-foreground-secondary)\n border-top: 1px solid var(--color-background-border)\n\n line-height: 1.5\n\n @media (max-width: $content-width)\n text-align: center\n flex-direction: column-reverse\n gap: 0.25rem\n\n .left-details\n font-size: var(--font-size--small)\n\n .right-details\n display: flex\n flex-direction: column\n gap: 0.25rem\n text-align: right\n\n .icons\n display: flex\n justify-content: flex-end\n gap: 0.25rem\n font-size: 1rem\n\n a\n text-decoration: none\n\n svg,\n img\n font-size: 1.125rem\n height: 1em\n width: 1em\n\n// Next/Prev page information\n.related-pages\n a\n display: flex\n align-items: center\n\n text-decoration: none\n &:hover .page-info .title\n text-decoration: underline\n color: var(--color-link)\n text-decoration-color: var(--color-link-underline)\n\n svg.furo-related-icon,\n svg.furo-related-icon > use\n flex-shrink: 0\n\n color: var(--color-foreground-border)\n\n width: 0.75rem\n height: 0.75rem\n margin: 0 0.5rem\n\n &.next-page\n max-width: 50%\n\n float: right\n clear: right\n text-align: right\n\n &.prev-page\n max-width: 50%\n\n float: left\n clear: left\n\n svg\n transform: rotate(180deg)\n\n.page-info\n display: flex\n flex-direction: column\n overflow-wrap: anywhere\n\n .next-page &\n align-items: flex-end\n\n .context\n display: flex\n align-items: center\n\n padding-bottom: 0.1rem\n\n color: var(--color-foreground-muted)\n font-size: var(--font-size--small)\n text-decoration: none\n","// This file contains the styles for the contents of the left sidebar, which\n// contains the navigation tree, logo, search etc.\n\n////////////////////////////////////////////////////////////////////////////////\n// Brand on top of the scrollable tree.\n////////////////////////////////////////////////////////////////////////////////\n.sidebar-brand\n display: flex\n flex-direction: column\n flex-shrink: 0\n\n padding: var(--sidebar-item-spacing-vertical) var(--sidebar-item-spacing-horizontal)\n text-decoration: none\n\n.sidebar-brand-text\n color: var(--color-sidebar-brand-text)\n overflow-wrap: break-word\n margin: var(--sidebar-item-spacing-vertical) 0\n font-size: 1.5rem\n\n.sidebar-logo-container\n margin: var(--sidebar-item-spacing-vertical) 0\n\n.sidebar-logo\n margin: 0 auto\n display: block\n max-width: 100%\n\n////////////////////////////////////////////////////////////////////////////////\n// Search\n////////////////////////////////////////////////////////////////////////////////\n.sidebar-search-container\n display: flex\n align-items: center\n margin-top: var(--sidebar-search-space-above)\n\n position: relative\n\n background: var(--color-sidebar-search-background)\n &:hover,\n &:focus-within\n background: var(--color-sidebar-search-background--focus)\n\n &::before\n content: \"\"\n 
position: absolute\n left: var(--sidebar-item-spacing-horizontal)\n width: var(--sidebar-search-icon-size)\n height: var(--sidebar-search-icon-size)\n\n background-color: var(--color-sidebar-search-icon)\n mask-image: var(--icon-search)\n\n.sidebar-search\n box-sizing: border-box\n\n border: none\n border-top: 1px solid var(--color-sidebar-search-border)\n border-bottom: 1px solid var(--color-sidebar-search-border)\n\n padding-top: var(--sidebar-search-input-spacing-vertical)\n padding-bottom: var(--sidebar-search-input-spacing-vertical)\n padding-right: var(--sidebar-search-input-spacing-horizontal)\n padding-left: calc(var(--sidebar-item-spacing-horizontal) + var(--sidebar-search-input-spacing-horizontal) + var(--sidebar-search-icon-size))\n\n width: 100%\n\n color: var(--color-sidebar-search-foreground)\n background: transparent\n z-index: 10\n\n &:focus\n outline: none\n\n &::placeholder\n font-size: var(--sidebar-search-input-font-size)\n\n//\n// Hide Search Matches link\n//\n#searchbox .highlight-link\n padding: var(--sidebar-item-spacing-vertical) var(--sidebar-item-spacing-horizontal) 0\n margin: 0\n text-align: center\n\n a\n color: var(--color-sidebar-search-icon)\n font-size: var(--font-size--small--2)\n\n////////////////////////////////////////////////////////////////////////////////\n// Structure/Skeleton of the navigation tree (left)\n////////////////////////////////////////////////////////////////////////////////\n.sidebar-tree\n font-size: var(--sidebar-item-font-size)\n margin-top: var(--sidebar-tree-space-above)\n margin-bottom: var(--sidebar-item-spacing-vertical)\n\n ul\n padding: 0\n margin-top: 0\n margin-bottom: 0\n\n display: flex\n flex-direction: column\n\n list-style: none\n\n li\n position: relative\n margin: 0\n\n > ul\n margin-left: var(--sidebar-item-spacing-horizontal)\n\n .icon\n color: var(--color-sidebar-link-text)\n\n .reference\n box-sizing: border-box\n color: var(--color-sidebar-link-text)\n\n // Fill the parent.\n display: inline-block\n line-height: var(--sidebar-item-line-height)\n text-decoration: none\n\n // Don't allow long words to cause wrapping.\n overflow-wrap: anywhere\n\n height: 100%\n width: 100%\n\n padding: var(--sidebar-item-spacing-vertical) var(--sidebar-item-spacing-horizontal)\n\n &:hover\n color: var(--color-sidebar-link-text)\n background: var(--color-sidebar-item-background--hover)\n\n // Add a nice little \"external-link\" arrow here.\n &.external::after\n content: url('data:image/svg+xml,')\n margin: 0 0.25rem\n vertical-align: middle\n color: var(--color-sidebar-link-text)\n\n // Make the current page reference bold.\n .current-page > .reference\n font-weight: bold\n\n label\n position: absolute\n top: 0\n right: 0\n height: var(--sidebar-item-height)\n width: var(--sidebar-expander-width)\n\n cursor: pointer\n user-select: none\n\n display: flex\n justify-content: center\n align-items: center\n\n .caption, :not(.caption) > .caption-text\n font-size: var(--sidebar-caption-font-size)\n color: var(--color-sidebar-caption-text)\n\n font-weight: bold\n text-transform: uppercase\n\n margin: var(--sidebar-caption-space-above) 0 0 0\n padding: var(--sidebar-item-spacing-vertical) var(--sidebar-item-spacing-horizontal)\n\n // If it has children, add a bit more padding to wrap the content to avoid\n // overlapping with the