Commit

update tutorials

ZiyiXia committed Dec 3, 2024
1 parent 1374b98 commit 2bdd0f0
Showing 5 changed files with 263 additions and 15 deletions.
4 changes: 2 additions & 2 deletions Tutorials/1_Embedding/1.2.3_BGE_v1&1.5.ipynb
@@ -71,7 +71,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Run the following cell to check the model of bge-base-en-v1.5. It has the exactly same structure of BERT-base, 12 encoder layers and hidden dimension of 768.\n",
"Run the following cell to check the model of bge-base-en-v1.5. It uses BERT-base as base model, with 12 encoder layers and hidden dimension of 768.\n",
"\n",
"Note that the corresponding models of BGE and BGE-v1.5 have same structures. For example, bge-base-en and bge-base-en-v1.5 have the same structure."
]
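As a hedged illustration of the structure check this cell describes (a sketch, not the notebook's exact code), bge-base-en-v1.5 can be loaded with Hugging Face transformers and its BERT-base configuration inspected:

```python
# Sketch: verify that bge-base-en-v1.5 follows the BERT-base architecture.
from transformers import AutoModel

model = AutoModel.from_pretrained("BAAI/bge-base-en-v1.5")
print(model.config.num_hidden_layers)  # 12 encoder layers
print(model.config.hidden_size)        # hidden dimension of 768
```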
@@ -391,7 +391,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"As we expect, the two encoding functions return exactly the same results. The full implementation in FlagEmbedding handles large datasets by batching and contains GPU support and parallelization. Feel free to check the [source code](https://github.com/FlagOpen/FlagEmbedding/blob/master/FlagEmbedding/flag_models.py#L370) for more details."
"As we expect, the two encoding functions return exactly the same results. The full implementation in FlagEmbedding handles large datasets by batching and contains GPU support and parallelization. Feel free to check the [source code](https://github.com/FlagOpen/FlagEmbedding/blob/master/FlagEmbedding/inference/embedder/encoder_only/base.py) for more details."
]
}
],
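For reference, a minimal sketch of the encode path this cell refers to, assuming the FlagModel interface exported by FlagEmbedding (see the linked source for the actual implementation):

```python
# Sketch: FlagModel.encode handles batching internally; GPU use and
# parallelization are managed by the library when available.
from FlagEmbedding import FlagModel

model = FlagModel("BAAI/bge-base-en-v1.5")
sentences = ["BGE stands for BAAI General Embedding.", "Embeddings map text to vectors."]
embeddings = model.encode(sentences, batch_size=256)
print(embeddings.shape)  # (2, 768)
```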
2 changes: 1 addition & 1 deletion Tutorials/4_Evaluation/4.5.1_MIRACL.ipynb
@@ -568,7 +568,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Evaluate using FlagEmbedding"
"## 3. Evaluate using FlagEmbedding"
]
},
{
15 changes: 3 additions & 12 deletions Tutorials/4_Evaluation/4.5.2_MLDR.ipynb
@@ -34,7 +34,7 @@
"metadata": {},
"outputs": [],
"source": [
"% pip install FlagEmbedding"
"% pip install FlagEmbedding pytrec_eval"
]
},
{
@@ -318,7 +318,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Use the Faiss index to search for each query."
"Use the Faiss index to search answers for each query."
]
},
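A self-contained sketch of the Faiss search step described above, with toy embeddings standing in for the tutorial's encoded corpus and queries (the index type and top-k cutoff here are assumptions, not the notebook's exact settings):

```python
import numpy as np
import faiss  # faiss-cpu or faiss-gpu

dim = 768
corpus_embeddings = np.random.rand(1000, dim).astype(np.float32)
query_embeddings = np.random.rand(5, dim).astype(np.float32)

index = faiss.IndexFlatIP(dim)   # inner-product index (cosine after normalization)
index.add(corpus_embeddings)
scores, indices = index.search(query_embeddings, 100)  # top-100 hits per query
```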
{
@@ -456,7 +456,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Evaluate using FlagEmbedding"
"## 3. Evaluate using FlagEmbedding"
]
},
{
@@ -496,15 +496,6 @@
"sys.argv = arguments.split()"
]
},
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [],
- "source": [
- "os.environ[\"SETUPTOOLS_USE_DISTUTILS\"] = \"\""
- ]
- },
{
"cell_type": "code",
"execution_count": 4,
95 changes: 95 additions & 0 deletions Tutorials/4_Evaluation/utils/compute_metrics.py
@@ -0,0 +1,95 @@
"""
Ref: https://github.com/facebookresearch/contriever
"""
import regex
import unicodedata
from functools import partial
from typing import List, Union


class SimpleTokenizer:
    ALPHA_NUM = r'[\p{L}\p{N}\p{M}]+'
    NON_WS = r'[^\p{Z}\p{C}]'

    def __init__(self):
        """
        Args:
            annotators: None or empty set (only tokenizes).
        """
        self._regexp = regex.compile(
            '(%s)|(%s)' % (self.ALPHA_NUM, self.NON_WS),
            flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
        )

    def tokenize(self, text, uncased=False):
        matches = [m for m in self._regexp.finditer(text)]
        if uncased:
            tokens = [m.group().lower() for m in matches]
        else:
            tokens = [m.group() for m in matches]
        return tokens


def _normalize(text):
    return unicodedata.normalize('NFD', text)


def has_answer(answers, text, tokenizer) -> bool:
    """Check if a document contains an answer string."""
    text = _normalize(text)
    text = tokenizer.tokenize(text, uncased=True)

    for answer in answers:
        answer = _normalize(answer)
        answer = tokenizer.tokenize(answer, uncased=True)
        for i in range(0, len(text) - len(answer) + 1):
            if answer == text[i: i + len(answer)]:
                return True
    return False


def check_answer(example, tokenizer) -> List[bool]:
    """Search through all the top docs to see if they have any of the answers."""
    answers = example['answers']
    ctxs = example['ctxs']

    hits = []
    for i, text in enumerate(ctxs):
        if text is None:  # cannot find the document for some reason
            hits.append(False)
            continue
        hits.append(has_answer(answers, text, tokenizer))
    return hits


def evaluate_qa_recall(ctxs, answers, k_values: Union[int, List[int]] = 100):
    # compute Recall@k for QA task
    data = []
    assert len(ctxs) == len(answers)
    for i in range(len(ctxs)):
        _ctxs, _answers = ctxs[i], answers[i]
        data.append({
            'answers': _answers,
            'ctxs': _ctxs,
        })
    tokenizer = SimpleTokenizer()
    get_score_partial = partial(check_answer, tokenizer=tokenizer)

    scores = map(get_score_partial, data)

    n_docs = len(data[0]['ctxs'])
    top_k_hits = [0] * n_docs
    for question_hits in scores:
        best_hit = next((i for i, x in enumerate(question_hits) if x), None)
        if best_hit is not None:
            top_k_hits[best_hit:] = [v + 1 for v in top_k_hits[best_hit:]]

    if isinstance(k_values, int):
        k = min(k_values, len(top_k_hits))
        return top_k_hits[k - 1] / len(data)
    else:
        scores = []
        for k in k_values:
            k = min(k, len(top_k_hits))
            scores.append(top_k_hits[k - 1] / len(data))
        return scores
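A hypothetical usage of evaluate_qa_recall (the import path is an assumption based on this commit's file location): for each question, ctxs holds the retrieved passage texts and answers holds the gold answer strings.

```python
from utils.compute_metrics import evaluate_qa_recall  # path assumed from this commit

ctxs = [
    ["Paris is the capital of France.", "Berlin is in Germany."],
    ["The Nile flows through Egypt.", "Mount Everest is in Nepal."],
]
answers = [["Paris"], ["Amazon"]]

# Only the first question has a hit (in its top-1 passage), so both
# Recall@1 and Recall@2 are 1/2.
print(evaluate_qa_recall(ctxs, answers, k_values=[1, 2]))  # [0.5, 0.5]
```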
162 changes: 162 additions & 0 deletions Tutorials/4_Evaluation/utils/normalize_text.py
@@ -0,0 +1,162 @@
"""
adapted from chemdataextractor.text.normalize
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Tools for normalizing text.
https://github.com/mcs07/ChemDataExtractor
:copyright: Copyright 2016 by Matt Swain.
:license: MIT
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
'Software'), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
"""

#: Control characters.
CONTROLS = {
    '\u0001', '\u0002', '\u0003', '\u0004', '\u0005', '\u0006', '\u0007', '\u0008', '\u000e', '\u000f', '\u0011',
    '\u0012', '\u0013', '\u0014', '\u0015', '\u0016', '\u0017', '\u0018', '\u0019', '\u001a', '\u001b',
}
# There are further control characters, but they are instead replaced with a space by unicode normalization
# '\u0009', '\u000a', '\u000b', '\u000c', '\u000d', '\u001c', '\u001d', '\u001e', '\u001f'


#: Hyphen and dash characters.
HYPHENS = {
    '-',  # \u002d Hyphen-minus
    '‐',  # \u2010 Hyphen
    '‑',  # \u2011 Non-breaking hyphen
    '⁃',  # \u2043 Hyphen bullet
    '‒',  # \u2012 figure dash
    '–',  # \u2013 en dash
    '—',  # \u2014 em dash
    '―',  # \u2015 horizontal bar
}

#: Minus characters.
MINUSES = {
    '-',  # \u002d Hyphen-minus
    '−',  # \u2212 Minus
    '－',  # \uff0d Full-width Hyphen-minus
    '⁻',  # \u207b Superscript minus
}

#: Plus characters.
PLUSES = {
    '+',  # \u002b Plus
    '＋',  # \uff0b Full-width Plus
    '⁺',  # \u207a Superscript plus
}

#: Slash characters.
SLASHES = {
    '/',  # \u002f Solidus
    '⁄',  # \u2044 Fraction slash
    '∕',  # \u2215 Division slash
}

#: Tilde characters.
TILDES = {
    '~',  # \u007e Tilde
    '˜',  # \u02dc Small tilde
    '⁓',  # \u2053 Swung dash
    '∼',  # \u223c Tilde operator #in mbert vocab
    '∽',  # \u223d Reversed tilde
    '∿',  # \u223f Sine wave
    '〜',  # \u301c Wave dash #in mbert vocab
    '～',  # \uff5e Full-width tilde #in mbert vocab
}

#: Apostrophe characters.
APOSTROPHES = {
    "'",  # \u0027
    '’',  # \u2019
    '՚',  # \u055a
    'Ꞌ',  # \ua78b
    'ꞌ',  # \ua78c
    '＇',  # \uff07
}

#: Single quote characters.
SINGLE_QUOTES = {
    "'",  # \u0027
    '‘',  # \u2018
    '’',  # \u2019
    '‚',  # \u201a
    '‛',  # \u201b
}

#: Double quote characters.
DOUBLE_QUOTES = {
    '"',  # \u0022
    '“',  # \u201c
    '”',  # \u201d
    '„',  # \u201e
    '‟',  # \u201f
}

#: Accent characters.
ACCENTS = {
    '`',  # \u0060
    '´',  # \u00b4
}

#: Prime characters.
PRIMES = {
    '′',  # \u2032
    '″',  # \u2033
    '‴',  # \u2034
    '‵',  # \u2035
    '‶',  # \u2036
    '‷',  # \u2037
    '⁗',  # \u2057
}

#: Quote characters, including apostrophes, single quotes, double quotes, accents and primes.
QUOTES = APOSTROPHES | SINGLE_QUOTES | DOUBLE_QUOTES | ACCENTS | PRIMES


def normalize_text(text: str):
    for control in CONTROLS:
        text = text.replace(control, '')
    text = text.replace('\u000b', ' ').replace('\u000c', ' ').replace(u'\u0085', ' ')

    for hyphen in HYPHENS | MINUSES:
        text = text.replace(hyphen, '-')
    text = text.replace('\u00ad', '')

    for double_quote in DOUBLE_QUOTES:
        text = text.replace(double_quote, '"')  # \u0022
    for single_quote in (SINGLE_QUOTES | APOSTROPHES | ACCENTS):
        text = text.replace(single_quote, "'")  # \u0027
    text = text.replace('′', "'")     # \u2032 prime
    text = text.replace('‵', "'")     # \u2035 reversed prime
    text = text.replace('″', "''")    # \u2033 double prime
    text = text.replace('‶', "''")    # \u2036 reversed double prime
    text = text.replace('‴', "'''")   # \u2034 triple prime
    text = text.replace('‷', "'''")   # \u2037 reversed triple prime
    text = text.replace('⁗', "''''")  # \u2057 quadruple prime

    text = text.replace('…', '...').replace(' . . . ', ' ... ')  # \u2026

    for slash in SLASHES:
        text = text.replace(slash, '/')

    # for tilde in TILDES:
    #     text = text.replace(tilde, '~')

    return text
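An illustrative call, not part of the file (the import path is an assumption based on this commit's file location): curly quotes, dashes, and ellipses are folded to their ASCII counterparts.

```python
from utils.normalize_text import normalize_text  # path assumed from this commit

print(normalize_text('“BGE–v1.5” isn’t the same as ‘BGE’…'))
# -> "BGE-v1.5" isn't the same as 'BGE'...
```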
