From 9e723134a9c56c3efd33182315debeb050fd4c42 Mon Sep 17 00:00:00 2001
From: hiancdtrsnm
Date: Thu, 11 Mar 2021 22:30:08 -0500
Subject: [PATCH 1/2] Add Experimentator class

---
 autobrat/annotator.py      |  22 +++-
 autobrat/controller.py     |  61 ++++++---
 autobrat/experimentator.py |  97 ++++++++++++++
 autobrat/utils.py          | 245 ++++++++++++++++++++++++++++++++++++-
 4 files changed, 408 insertions(+), 17 deletions(-)
 create mode 100644 autobrat/experimentator.py

diff --git a/autobrat/annotator.py b/autobrat/annotator.py
index 4239604..e61a354 100644
--- a/autobrat/annotator.py
+++ b/autobrat/annotator.py
@@ -46,7 +46,7 @@ def predict(self, texts: t.List[str]) -> List[List[str]]:
 
     def get_classifications(self, text: str):
         parsed_sentence = [w.text for w in self.nlp(text)]
-        print(parsed_sentence)
+        # print(parsed_sentence)
         ans = []
         for classifier in self.models:
             prediction = classifier.predict([parsed_sentence])
@@ -66,6 +66,19 @@ def get_probs(self,
 
         return ans
 
+    def final_prediction(self, texts: List[str]):
+        predictions = self.predict(texts)
+        probs = [self.get_probs(p) for p in predictions]
+
+        ans = []
+        for sentence in probs:
+            ans.append([])
+            for term in sentence:
+                m = max(term.items(), key=lambda x: x[1])
+                ans[-1].append(m[0])
+
+        return ans
+
     def get_entropy(self, probs: t.List[Dict[str, float]]):
         return sum(-1 * sum([word * log2(word) for word in words.values()])
                    for words in probs)
@@ -106,5 +119,10 @@ def fit(self, data: Collection):
         lines, classes = load_training_entities(data)
         lines = [[w.text for w in l] for l in lines]
 
+        return self.fit_classes(lines, classes)
+
+    def fit_classes(self, lines, classes):
         for model in self.models:
-            model.fit(lines, classes)
+            model.best_pipeline_.send('train')
+            model.best_pipeline_.run((lines, classes))
+            model.best_pipeline_.send('eval')
diff --git a/autobrat/controller.py b/autobrat/controller.py
index ba14be0..9fb188a 100644
--- a/autobrat/controller.py
+++ b/autobrat/controller.py
@@ -21,8 +21,8 @@ def generate_random_str(size: int = 10):
 class AnotatorController():
     def __init__(
             self,
-            sentences_files: List[Path],
-            baseline_collection: Path,
+            sentences: List[str],
+            baseline_collection: Collection,
             generated_pack_path: Path = Path('./generated_packs'),
             closed_packs_path: Path = Path('./closed_packs'),
             db_path: Path = Path('./sentencedb.json'),
@@ -39,9 +39,8 @@ def __init__(
 
         self.db = TinyDB(db_path)
         saved_sentences = set(s['text'] for s in self.db.all())
-        self._load_sentences(sentences_files, saved_sentences)
-        collection = Collection()
-        collection.load_dir(baseline_collection)
+        self._load_sentences(sentences, saved_sentences)
+        collection = baseline_collection.clone()
         collection.load_dir(closed_packs_path)
 
         self.annotator = sentence_annotator
@@ -50,15 +49,16 @@ def __init__(
                 collection, self.number_of_models)
 
     def _load_sentences(self,
-                        files: List[Path],
+                        sentences: List[str],
                         ignore_sentences: List[str] = []):
-        for file in files:
-            for line in file.open():
-                if not line or line in ignore_sentences:
-                    continue
+        for line in sentences:
+            if not line or line in ignore_sentences:
+                continue
 
-                self.db.insert({'text': line[:-1], 'in_pack': False})
+            self.db.insert({'text': line, 'in_pack': False})
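+            # 'in_pack' marks whether the sentence was already served in a
+            # generated pack; get_batch only draws from entries still False.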
 
     def update_selected(self, sentences):
         Senteces = Query()
@@ -80,6 +80,13 @@ def generate_pack(self,
         dest_folder = self.generated_pack_path
         pack_name = generate_random_str()
 
+        selected = self.get_batch(pack_size)
+
+        self.build_pack(dest_folder / (pack_name), pack_name, selected)
+
+    def get_batch(self,
+                  batch_size: int,
+                  set_processed: bool = True) -> List[str]:
         Senteces = Query()
         texts = [s['text'] for s in self.db.search(Senteces.in_pack == False)]
 
@@ -90,9 +97,15 @@ def generate_pack(self,
 
         sentences.sort(key=lambda x: x[1], reverse=True)
 
-        selected = [s[0] for s in sentences[:pack_size]]
-        self.update_selected(selected)
-        self.build_pack(dest_folder / (pack_name), pack_name, selected)
+        selected = [s[0] for s in sentences[:batch_size]]
+
+        if not selected:
+            return []
+
+        if set_processed:
+            self.update_selected(selected)
+
+        return [s[0] for s in sentences]
 
     def close_pack(self, path: Path):
         collection = Collection()
@@ -104,3 +117,23 @@ def close_pack(self, path: Path):
         logger.info(
             f'Finish pack moving to closed pack folder ({path}) -> ({self.closed_packs_path})'
         )
+
+    @staticmethod
+    def load_from_files(
+            sentences_files: List[Path],
+            baseline_collection: Path,
+            generated_pack_path: Path = Path('./generated_packs'),
+            closed_packs_path: Path = Path('./closed_packs'),
+            db_path: Path = Path('./sentencedb.json'),
+            sentence_annotator: Optional[SentencesAnnotator] = None
+    ) -> "AnotatorController":
+
+        sentences = []
+
+        for file in sentences_files:
+            sentences.extend([line[:-1] for line in file.open() if line])
+        collection = Collection()
+        collection.load_dir(baseline_collection)
+        return AnotatorController(sentences, collection, generated_pack_path,
+                                  closed_packs_path, db_path,
+                                  sentence_annotator)
diff --git a/autobrat/experimentator.py b/autobrat/experimentator.py
new file mode 100644
index 0000000..dd8d594
--- /dev/null
+++ b/autobrat/experimentator.py
@@ -0,0 +1,97 @@
+import logging
+from pathlib import Path
+from typing import List
+
+import spacy
+from scripts.utils import Collection, Sentence
+from .controller import AnotatorController
+from .utils import load_training_entities
+from random import choices, shuffle
+from functools import reduce
+from dataclasses import dataclass
+from spacy.tokens.doc import Doc
+from .utils import make_sentence
+from scripts.score import subtaskA, compute_metrics
+
+logger = logging.getLogger('experimentator')
+
+nlp = spacy.load('es')
+
+class Experimentator(object):
+    def __init__(self, corpus: Collection) -> None:
+        logger.info(f'Corpus total sentences: {len(corpus.sentences)}')
+        lines, classes = load_training_entities(corpus)
+        self.unique_classes = reduce(lambda x, y: x | y,
+                                     [set(c) for c in classes])
+
+        self.train_data = {
+            sentence.text: ([w.text for w in line], category)
+            for sentence, line, category in zip(corpus.sentences, lines,
+                                                classes)
+        }
+        self.original_corpus = corpus.clone()
+        self.training, self.test, self.sentences = self.select_training_sentences(
+            corpus)
+
+        self.test_spacy_doc = {s.text: nlp(s.text) for s in self.test.sentences}
+
+        self.sentences_to_train: List[str] = [s.text for s in self.training]
+
+        super().__init__()
+
+    def select_training_sentences(self, corpus: Collection):
+        size_training = 300
+        size_test = 100
+        # return Collection([s for s in choices(corpus.sentences, k=size)])
+        sentences = corpus.sentences[:]
+        shuffle(sentences)
+
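+        # Fixed random split: 300 sentences seed the annotator, the next 100
+        # form the held-out test set, and the rest are the unlabelled pool.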
+        return Collection([s for s in sentences[:size_training]]), Collection([
+            s for s in sentences[size_training:size_training + size_test]
+        ]), [s.text for s in sentences[size_training + size_test:]]
+
+    def score(self, submit: Collection):
+        score_data = subtaskA(self.test, submit)
+        metrics = compute_metrics(score_data, skipB=True, skipC=True)
+        logger.info(f'Score: {metrics}')
+        print(metrics)
+        print(len(submit), len(self.test))
+        return metrics['f1']
+
+    def run_experiment(self,
+                       batch_size: int,
+                       db_name: str = 'experiment.json'):
+        controller = AnotatorController(self.sentences,
+                                        self.training,
+                                        db_path=Path(db_name))
+
+        scores = []
+        # while sentences
+
+        sentences = controller.get_batch(batch_size)
+        while sentences:
+            self.sentences_to_train.extend(sentences)
+            lines, classes = [], []
+            for s in sentences:
+                line, cls = self.train_data[s]
+                lines.append(line)
+                classes.append(cls)
+
+            controller.annotator.fit_classes(lines, classes)
+
+
+            sentences = []
+            predictions = controller.annotator.final_prediction([s for s in self.test_spacy_doc])
+            for (s, spacy_doc), prediction in zip(self.test_spacy_doc.items(), predictions):
+                sentence = make_sentence(spacy_doc, prediction, self.unique_classes)
+                sentence.fix_ids()
+                sentences.append(sentence)
+
+            predicted_collection = Collection(sentences)
+
+            scores.append(self.score(predicted_collection))
+            sentences = controller.get_batch(batch_size)
+
+        return scores
\ No newline at end of file
diff --git a/autobrat/utils.py b/autobrat/utils.py
index b5de7f4..31ea556 100644
--- a/autobrat/utils.py
+++ b/autobrat/utils.py
@@ -1,5 +1,9 @@
 import spacy
-from scripts.utils import Collection
+from scripts.utils import Collection, Sentence, Keyphrase
+import itertools as itt
+import logging
+
+logger = logging.getLogger('autobrat.utils')
 
 
 def load_training_entities(collection: Collection):
@@ -88,3 +92,242 @@ def select_tag(matches):
     tags = [tag for _, tag in matches]
     return "U" if ("U" in tags and not "B" in tags
                    and not "L" in tags) else "V"
+
+
+def make_sentence(doc, bilouv, labels) -> Sentence:
+    sentence = Sentence(doc.text)
+
+    logger.debug(f"[make_sentence]: doc.text={doc.text}")
+    logger.debug(f"[make_sentence]: bilouv={bilouv}")
+    logger.debug(f"[make_sentence]: labels={labels}")
+
+    labels = set(l[2:] for l in labels if l != 'O')
+
+    for label in labels:
+        specific_bilouv = []
+
+        for tag in bilouv:
+            if tag.endswith(label):
+                tag = tag[0]
+                specific_bilouv.append(tag)
+            else:
+                specific_bilouv.append('O')
+
+        logger.debug(
+            f"[make_sentence]: label={label} specific_bilouv={specific_bilouv}"
+        )
+
+        spans = from_biluov(specific_bilouv, doc, spans=True)
+        sentence.keyphrases.extend(
+            Keyphrase(sentence, label, i, sp) for i, sp in enumerate(spans))
+
+    return sentence
+
+
+def from_biluov(biluov, sentence, *, spans=False, drop_remaining=[]):
+    """
+    >>> from_biluov(list('BBULL'), 'A B C D E'.split())
+    [['C'], ['B', 'D'], ['A', 'E']]
+    """
+
+    entities = [x for x in discontinuous_match(biluov, sentence)]
+
+    for i, (tag, word) in enumerate(zip(biluov, sentence)):
+        if tag == "U":
+            entities.append([word])
+            biluov[i] = "O"
+        elif tag == "V":
+            biluov[i] = "I"
+
+    # only BILO is left!!!
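+    # Greedy pass over the remaining BILO tags: find a "B", collect following
+    # "I" tokens (gaps tagged "O" are allowed), and close the entity at the
+    # first "L". Matched tags are reset to "O", and the scan repeats until a
+    # full pass produces no change.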
+ changed = True + while changed: + changed = False + one_shot = enumerate(zip(biluov, sentence)) + try: + i, (tag, word) = next(one_shot) + while True: + if tag != "B": + i, (tag, word) = next(one_shot) + continue + + on_build = [(word, i)] + + i, (tag, word) = next(one_shot) + while tag in ("O", "I"): + if tag == "I": + on_build.append((word, i)) + i, (tag, word) = next(one_shot) + + if tag == "L": + entities.append([x for x, _ in on_build] + [word]) + for _, j in on_build: + biluov[j] = "O" + biluov[i] = "O" + on_build.clear() + changed = True + except StopIteration: + pass + + for i, (tag, word) in enumerate(zip(biluov, sentence)): + if tag != "O" and tag not in drop_remaining: + entities.append([word]) + + return (entities if not spans else [[(t.idx, t.idx + len(t)) + for t in tokens] + for tokens in entities]) + + +def discontinuous_match(biluov, sentence): + """ + >>> discontinuous_match(['B','V','L'],['la', 'enfermedad', 'renal']) + [['la', 'enfermedad', 'renal'], ['enfermedad']] + >>> discontinuous_match(['O','V','I','L','O','I','L'],['el','cancer','de','pulmon','y','de','mama']) + [['cancer', 'de', 'pulmon'], ['cancer', 'de', 'mama']] + >>> discontinuous_match(['B','O','B','V'],['tejidos','y','organos','humanos']) + [['organos', 'humanos'], ['tejidos', 'humanos']] + >>> discontinuous_match(['O','V','I','L','O','I','L','O','B','O','B','V'], ['el','cancer','de','pulmon','y','de','mama','y','tejidos','y','organos','humanos']) + [['cancer', 'de', 'pulmon'], ['cancer', 'de', 'mama'], ['organos', 'humanos'], ['tejidos', 'humanos']] + >>> discontinuous_match(list('BBULL'), 'A B C D E'.split()) + [] + """ + entities = [] + for i, tag in enumerate(biluov): + if tag != "V": + continue + for entity_ids in _full_overlap(biluov, list(range(len(sentence))), i): + entity = [] + for idx in entity_ids: + entity.append(sentence[idx]) + biluov[idx] = "O" + entities.append(entity) + return entities + + +def _full_overlap(biluov, sentence, index, product=False): + """ + INDEX TAG MUST BE 'V' + + >>> _full_overlap(['B','V','L'], list(range(3)), 1) + [[0, 1, 2], [1]] + >>> _full_overlap(['B','V','V','L'], list(range(4)), 1) + [[0, 1, 2, 3], [1, 2]] + >>> _full_overlap(['B','V','V','L'], list(range(4)), 2) + [[0, 1, 2, 3], [1, 2]] + >>> _full_overlap(['B','V','V','V','L'], list(range(5)), 1) + [[0, 1, 2, 3, 4], [1, 2, 3]] + >>> _full_overlap(['B','V','V','V','L'], list(range(5)), 2) + [[0, 1, 2, 3, 4], [1, 2, 3]] + >>> _full_overlap(['B','V','V','V','L'], list(range(5)), 3) + [[0, 1, 2, 3, 4], [1, 2, 3]] + >>> _full_overlap(['B','B','V','L','L'], list(range(5)), 2) + [[1, 2, 3], [0, 2, 4]] + >>> _full_overlap(['B','I','B','O','V','I','L','O','L'], list(range(9)), 4) + [[2, 4, 5, 6], [0, 1, 4, 8]] + >>> _full_overlap(['B','I','B','O','V','I','L','O','L'], list(range(9)), 4, True) + [[2, 4, 5, 6], [2, 4, 8], [0, 1, 4, 5, 6], [0, 1, 4, 8]] + >>> _full_overlap(['0','0','V','L'], list(range(4)), 2) + [[2, 3], [2]] + >>> _full_overlap(['V','L'], list(range(2)), 0) + [[0, 1], [0]] + >>> _full_overlap(['B','V','O','O'], list(range(4)), 1) + [[0, 1], [1]] + >>> _full_overlap(['B','V'], list(range(2)), 1) + [[0, 1], [1]] + >>> _full_overlap(['0','0','V','O','O'], list(range(5)), 2) + [] + """ + + left = _right_to_left_overlap(biluov[:index + 1], sentence[:index + 1]) + right = _left_to_right_overlap(biluov[index:], sentence[index:]) + + full = [] + if product: + for l in left: + for r in right: + new = l + r[1:] if len(l) > len(r) else l[:-1] + r + full.append(new) + else: + for l, r in itt.zip_longest(left, 
right, fillvalue=[]):
+            new = l + r[1:] if len(l) > len(r) else l[:-1] + r
+            full.append(new)
+    return full
+
+
+def _left_to_right_overlap(biluov, sentence):
+    """
+    LEFTMOST TAG MUST BE 'V'
+
+    >>> _left_to_right_overlap(['V', 'V', 'O', 'V', 'I', 'L', 'O', 'I', 'L'], range(9))
+    [[0, 1, 3, 4, 5], [0, 1, 3, 7, 8]]
+    >>> _left_to_right_overlap(['V', 'O', 'V', 'O'], range(4))
+    []
+    >>> _left_to_right_overlap(['V', 'O', 'V', 'O', 'L'], range(5))
+    [[0, 2, 4], [0, 2]]
+    >>> _left_to_right_overlap(['V', 'O', 'V', 'O', 'L', 'O', 'L'], range(8))
+    [[0, 2, 4], [0, 2, 6]]
+    >>> _left_to_right_overlap(['V', 'O', 'V', 'O', 'L', 'I', 'L', 'V', 'L'], range(9))
+    [[0, 2, 4], [0, 2, 5, 6]]
+    """
+    return _build_overlap(biluov, sentence, "L")
+
+
+def _right_to_left_overlap(biluov, sentence):
+    """
+    RIGHTMOST TAG MUST BE 'V'
+
+    >>> _right_to_left_overlap(['B', 'I', 'O', 'B', 'I', 'V', 'O', 'V', 'V'], range(9))
+    [[3, 4, 5, 7, 8], [0, 1, 5, 7, 8]]
+    >>> _right_to_left_overlap(['O', 'V', 'O', 'V'], range(4))
+    []
+    >>> _right_to_left_overlap(['B', 'O', 'V', 'O', 'V'], range(5))
+    [[0, 2, 4], [2, 4]]
+    >>> _right_to_left_overlap(['B', 'O', 'B', 'O', 'V', 'O', 'V'], range(7))
+    [[2, 4, 6], [0, 4, 6]]
+    >>> _right_to_left_overlap(['B', 'V', 'B', 'I', 'B', 'O', 'V', 'O', 'V'], range(9))
+    [[4, 6, 8], [2, 3, 6, 8]]
+    """
+    inverse = _build_overlap(reversed(biluov), reversed(sentence), "B")
+    for x in inverse:
+        x.reverse()
+    return inverse
+
+
+def _build_overlap(biluov, sentence, finisher):
+    """
+    LEFTMOST TAG MUST BE 'V'
+    """
+
+    one_shot = zip(biluov, sentence)
+    tag, word = next(one_shot)
+
+    prefix = []
+    complete = []
+
+    try:
+        while tag in ("V", "O"):
+            if tag == "V":
+                prefix.append(word)
+            tag, word = next(one_shot)
+
+        on_build = []
+        while tag in ("O", "I", "U", finisher):
+            if tag == "I":
+                on_build.append(word)
+            elif tag == finisher:
+                complete.append(prefix + on_build + [word])
+                on_build.clear()
+            elif tag == "U":
+                complete.append([word])
+            tag, word = next(one_shot)
+    except StopIteration:
+        pass
+
+    if len(complete) == 1:
+        complete.append(prefix)
+
+    return complete

From 6c047de54c493527cb89ff9bca87eed6bec85c7c Mon Sep 17 00:00:00 2001
From: hiancdtrsnm
Date: Fri, 12 Mar 2021 15:15:15 -0500
Subject: [PATCH 2/2] Fix bug in sentence selection

---
 autobrat/controller.py     |  2 +-
 autobrat/experimentator.py | 24 ++++++++++++++++++------
 2 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/autobrat/controller.py b/autobrat/controller.py
index 9fb188a..84de06d 100644
--- a/autobrat/controller.py
+++ b/autobrat/controller.py
@@ -105,7 +105,7 @@ def get_batch(self,
         if set_processed:
             self.update_selected(selected)
 
-        return [s[0] for s in sentences]
+        return selected
 
     def close_pack(self, path: Path):
         collection = Collection()
diff --git a/autobrat/experimentator.py b/autobrat/experimentator.py
index dd8d594..504415d 100644
--- a/autobrat/experimentator.py
+++ b/autobrat/experimentator.py
@@ -23,6 +23,7 @@ def __init__(self, corpus: Collection) -> None:
         logger.info(f'Corpus total sentences: {len(corpus.sentences)}')
         lines, classes = load_training_entities(corpus)
         self.unique_classes = reduce(lambda x, y: x | y,
                                      [set(c) for c in classes])
+        logger.debug(f'Unique classes: {self.unique_classes}')
 
         self.train_data = {
@@ -56,8 +57,6 @@ def score(self, submit: Collection):
         score_data = subtaskA(self.test, submit)
         metrics = compute_metrics(score_data, skipB=True, skipC=True)
         logger.info(f'Score: {metrics}')
-        print(metrics)
-        print(len(submit), len(self.test))
         return metrics['f1']
 
     def run_experiment(self,
@@ -69,12 +68,13 @@ def run_experiment(self,
 
         scores = []
         # while sentences
-
         sentences = controller.get_batch(batch_size)
         while sentences:
             self.sentences_to_train.extend(sentences)
             lines, classes = [], []
-            for s in sentences:
+            for s in self.sentences_to_train:
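+                # Accumulate every sentence gathered so far, not just the
+                # newest batch, so the refit below sees the full labelled pool.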
                 line, cls = self.train_data[s]
                 lines.append(line)
                 classes.append(cls)
@@ -94,4 +94,18 @@ def run_experiment(self,
             scores.append(self.score(predicted_collection))
             sentences = controller.get_batch(batch_size)
 
-        return scores
\ No newline at end of file
+        return scores
+
+    def train_with_all(self):
+        controller = AnotatorController(self.sentences,
+                                        self.original_corpus,
+                                        db_path=Path('fullcorpus.json'))
+        sentences = []
+        predictions = controller.annotator.final_prediction([s for s in self.test_spacy_doc])
+        for (s, spacy_doc), prediction in zip(self.test_spacy_doc.items(), predictions):
+            sentence = make_sentence(spacy_doc, prediction, self.unique_classes)
+            sentence.fix_ids()
+            sentences.append(sentence)
+
+        predicted_collection = Collection(sentences)
+        return self.score(predicted_collection)
\ No newline at end of file