From 9e723134a9c56c3efd33182315debeb050fd4c42 Mon Sep 17 00:00:00 2001
From: hiancdtrsnm
Date: Thu, 11 Mar 2021 22:30:08 -0500
Subject: [PATCH 1/2] Add Experimentator class

---
 autobrat/annotator.py      |  22 +++-
 autobrat/controller.py     |  61 ++++++---
 autobrat/experimentator.py |  97 ++++++++++++++
 autobrat/utils.py          | 245 ++++++++++++++++++++++++++++++++++++-
 4 files changed, 408 insertions(+), 17 deletions(-)
 create mode 100644 autobrat/experimentator.py

diff --git a/autobrat/annotator.py b/autobrat/annotator.py
index 4239604..e61a354 100644
--- a/autobrat/annotator.py
+++ b/autobrat/annotator.py
@@ -46,7 +46,7 @@ def predict(self, texts: t.List[str]) -> List[List[str]]:
 
     def get_classifications(self, text: str):
         parsed_sentence = [w.text for w in self.nlp(text)]
-        print(parsed_sentence)
+        # print(parsed_sentence)
         ans = []
         for classifier in self.models:
             prediction = classifier.predict([parsed_sentence])
@@ -66,6 +66,19 @@ def get_probs(self,
 
         return ans
 
+    def final_prediction(self, texts: List[str]):
+        predictions = self.predict(texts)
+        probs = [self.get_probs(p) for p in predictions]
+
+        ans = []
+        for sentence in probs:
+            ans.append([])
+            for term in sentence:
+                m = max(term.items(), key=lambda x: x[1])
+                ans[-1].append(m[0])
+
+        return ans
+
     def get_entropy(self, probs: t.List[Dict[str, float]]):
         return sum(-1 * sum([word * log2(word) for word in words.values()])
                    for words in probs)
@@ -106,5 +119,10 @@ def fit(self, data: Collection):
         lines, classes = load_training_entities(data)
         lines = [[w.text for w in l] for l in lines]
 
+        return self.fit_classes(lines, classes)
+
+    def fit_classes(self, lines, classes):
         for model in self.models:
-            model.fit(lines, classes)
+            model.best_pipeline_.send('train')
+            model.best_pipeline_.run((lines, classes))
+            model.best_pipeline_.send('eval')
diff --git a/autobrat/controller.py b/autobrat/controller.py
index ba14be0..9fb188a 100644
--- a/autobrat/controller.py
+++ b/autobrat/controller.py
@@ -21,8 +21,8 @@ def generate_random_str(size: int = 10):
 class AnotatorController():
     def __init__(
             self,
-            sentences_files: List[Path],
-            baseline_collection: Path,
+            sentences: List[str],
+            baseline_collection: Collection,
             generated_pack_path: Path = Path('./generated_packs'),
             closed_packs_path: Path = Path('./closed_packs'),
             db_path: Path = Path('./sentencedb.json'),
@@ -39,9 +39,8 @@ def __init__(
 
         self.db = TinyDB(db_path)
         saved_sentences = set(s['text'] for s in self.db.all())
-        self._load_sentences(sentences_files, saved_sentences)
-        collection = Collection()
-        collection.load_dir(baseline_collection)
+        self._load_sentences(sentences, saved_sentences)
+        collection = baseline_collection.clone()
         collection.load_dir(closed_packs_path)
 
         self.annotator = sentence_annotator
@@ -50,15 +49,16 @@ def __init__(
                 collection, self.number_of_models)
 
     def _load_sentences(self,
-                        files: List[Path],
+                        sentences: List[str],
                         ignore_sentences: List[str] = []):
-        for file in files:
-            for line in file.open():
-                if not line or line in ignore_sentences:
-                    continue
+        for line in sentences:
+            if not line or line in ignore_sentences:
+                continue
 
-                self.db.insert({'text': line[:-1], 'in_pack': False})
+            self.db.insert({'text': line, 'in_pack': False})
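+            # 'in_pack' marks whether the sentence was already served in a
+            # generated pack; get_batch only draws from entries still False.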
 
     def update_selected(self, sentences):
         Senteces = Query()
@@ -80,6 +80,13 @@ def generate_pack(self,
         dest_folder = self.generated_pack_path
         pack_name = generate_random_str()
 
+        selected = self.get_batch(pack_size)
+
+        self.build_pack(dest_folder / (pack_name), pack_name, selected)
+
+    def get_batch(self,
+                  batch_size: int,
+                  set_processed: bool = True) -> List[str]:
         Senteces = Query()
         texts = [s['text'] for s in self.db.search(Senteces.in_pack == False)]
 
@@ -90,9 +97,15 @@ def generate_pack(self,
 
         sentences.sort(key=lambda x: x[1], reverse=True)
 
-        selected = [s[0] for s in sentences[:pack_size]]
-        self.update_selected(selected)
-        self.build_pack(dest_folder / (pack_name), pack_name, selected)
+        selected = [s[0] for s in sentences[:batch_size]]
+
+        if not selected:
+            return []
+
+        if set_processed:
+            self.update_selected(selected)
+
+        return [s[0] for s in sentences]
 
     def close_pack(self, path: Path):
         collection = Collection()
@@ -104,3 +117,23 @@ def close_pack(self, path: Path):
         logger.info(
             f'Finish pack moving to closed pack folder ({path}) -> ({self.closed_packs_path})'
         )
+
+    @staticmethod
+    def load_from_files(
+            sentences_files: List[Path],
+            baseline_collection: Path,
+            generated_pack_path: Path = Path('./generated_packs'),
+            closed_packs_path: Path = Path('./closed_packs'),
+            db_path: Path = Path('./sentencedb.json'),
+            sentence_annotator: Optional[SentencesAnnotator] = None
+    ) -> "AnotatorController":
+
+        sentences = []
+
+        for file in sentences_files:
+            sentences.extend([line[:-1] for line in file.open() if line])
+        collection = Collection()
+        collection.load_dir(baseline_collection)
+        return AnotatorController(sentences, collection, generated_pack_path,
+                                  closed_packs_path, db_path,
+                                  sentence_annotator)
diff --git a/autobrat/experimentator.py b/autobrat/experimentator.py
new file mode 100644
index 0000000..dd8d594
--- /dev/null
+++ b/autobrat/experimentator.py
@@ -0,0 +1,97 @@
+import logging
+from pathlib import Path
+from typing import List
+
+import spacy
+from scripts.utils import Collection, Sentence
+from .controller import AnotatorController
+from .utils import load_training_entities
+from random import choices, shuffle
+from functools import reduce
+from dataclasses import dataclass
+from spacy.tokens.doc import Doc
+from .utils import make_sentence
+from scripts.score import subtaskA, compute_metrics
+
+logger = logging.getLogger('experimentator')
+
+nlp = spacy.load('es')
+
+class Experimentator(object):
+    def __init__(self, corpus: Collection) -> None:
+        logger.info(f'Corpus total sentences: {len(corpus.sentences)}')
+        lines, classes = load_training_entities(corpus)
+        self.unique_classes = reduce(lambda x, y: x | y,
+                                     [set(c) for c in classes])
+
+        self.train_data = {
+            sentence.text: ([w.text for w in line], category)
+            for sentence, line, category in zip(corpus.sentences, lines,
+                                                classes)
+        }
+        self.original_corpus = corpus.clone()
+        self.training, self.test, self.sentences = self.select_training_sentences(
+            corpus)
+
+        self.test_spacy_doc = {s.text: nlp(s.text) for s in self.test.sentences}
+
+        self.sentences_to_train: List[str] = [s.text for s in self.training]
+
+        super().__init__()
+
+    def select_training_sentences(self, corpus: Collection):
+        size_training = 300
+        size_test = 100
+        # return Collection([s for s in choices(corpus.sentences, k=size)])
+        sentences = corpus.sentences[:]
+        shuffle(sentences)
+
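+        # Fixed random split: 300 sentences seed the annotator, the next 100
+        # form the held-out test set, and the rest are the unlabelled pool.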
+        return Collection([s for s in sentences[:size_training]]), Collection([
+            s for s in sentences[size_training:size_training + size_test]
+        ]), [s.text for s in sentences[size_training + size_test:]]
+
+    def score(self, submit: Collection):
+        score_data = subtaskA(self.test, submit)
+        metrics = compute_metrics(score_data, skipB=True, skipC=True)
+        logger.info(f'Score: {metrics}')
+        print(metrics)
+        print(len(submit), len(self.test))
+        return metrics['f1']
+
+    def run_experiment(self,
+                       batch_size: int,
+                       db_name: str = 'experiment.json'):
+        controller = AnotatorController(self.sentences,
+                                        self.training,
+                                        db_path=Path(db_name))
+
+        scores = []
+        # while sentences
+
+        sentences = controller.get_batch(batch_size)
+        while sentences:
+            self.sentences_to_train.extend(sentences)
+            lines, classes = [], []
+            for s in sentences:
+                line, cls = self.train_data[s]
+                lines.append(line)
+                classes.append(cls)
+
+            controller.annotator.fit_classes(lines, classes)
+
+
+            sentences = []
+            predictions = controller.annotator.final_prediction([s for s in self.test_spacy_doc])
+            for (s, spacy_doc), prediction in zip(self.test_spacy_doc.items(), predictions):
+                sentence = make_sentence(spacy_doc, prediction, self.unique_classes)
+                sentence.fix_ids()
+                sentences.append(sentence)
+
+            predicted_collection = Collection(sentences)
+
+            scores.append(self.score(predicted_collection))
+            sentences = controller.get_batch(batch_size)
+
+        return scores
\ No newline at end of file
diff --git a/autobrat/utils.py b/autobrat/utils.py
index b5de7f4..31ea556 100644
--- a/autobrat/utils.py
+++ b/autobrat/utils.py
@@ -1,5 +1,9 @@
 import spacy
-from scripts.utils import Collection
+from scripts.utils import Collection, Sentence, Keyphrase
+import itertools as itt
+import logging
+
+logger = logging.getLogger('autobrat.utils')
 
 
 def load_training_entities(collection: Collection):
@@ -88,3 +92,242 @@ def select_tag(matches):
     tags = [tag for _, tag in matches]
     return "U" if ("U" in tags and not "B" in tags
                    and not "L" in tags) else "V"
+
+
+def make_sentence(doc, bilouv, labels) -> Sentence:
+    sentence = Sentence(doc.text)
+
+    logger.debug(f"[make_sentence]: doc.text={doc.text}")
+    logger.debug(f"[make_sentence]: bilouv={bilouv}")
+    logger.debug(f"[make_sentence]: labels={labels}")
+
+    labels = set(l[2:] for l in labels if l != 'O')
+
+    for label in labels:
+        specific_bilouv = []
+
+        for tag in bilouv:
+            if tag.endswith(label):
+                tag = tag[0]
+                specific_bilouv.append(tag)
+            else:
+                specific_bilouv.append('O')
+
+        logger.debug(
+            f"[make_sentence]: label={label} specific_bilouv={specific_bilouv}"
+        )
+
+        spans = from_biluov(specific_bilouv, doc, spans=True)
+        sentence.keyphrases.extend(
+            Keyphrase(sentence, label, i, sp) for i, sp in enumerate(spans))
+
+    return sentence
+
+
+def from_biluov(biluov, sentence, *, spans=False, drop_remaining=[]):
+    """
+    >>> from_biluov(list('BBULL'), 'A B C D E'.split())
+    [['C'], ['B', 'D'], ['A', 'E']]
+    """
+
+    entities = [x for x in discontinuous_match(biluov, sentence)]
+
+    for i, (tag, word) in enumerate(zip(biluov, sentence)):
+        if tag == "U":
+            entities.append([word])
+            biluov[i] = "O"
+        elif tag == "V":
+            biluov[i] = "I"
+
+    # only BILO is left!!!
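+    # Greedy pass over the remaining BILO tags: find a "B", collect following
+    # "I" tokens (gaps tagged "O" are allowed), and close the entity at the
+    # first "L". Matched tags are reset to "O", and the scan repeats until a
+    # full pass produces no change.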
+ changed = True + while changed: + changed = False + one_shot = enumerate(zip(biluov, sentence)) + try: + i, (tag, word) = next(one_shot) + while True: + if tag != "B": + i, (tag, word) = next(one_shot) + continue + + on_build = [(word, i)] + + i, (tag, word) = next(one_shot) + while tag in ("O", "I"): + if tag == "I": + on_build.append((word, i)) + i, (tag, word) = next(one_shot) + + if tag == "L": + entities.append([x for x, _ in on_build] + [word]) + for _, j in on_build: + biluov[j] = "O" + biluov[i] = "O" + on_build.clear() + changed = True + except StopIteration: + pass + + for i, (tag, word) in enumerate(zip(biluov, sentence)): + if tag != "O" and tag not in drop_remaining: + entities.append([word]) + + return (entities if not spans else [[(t.idx, t.idx + len(t)) + for t in tokens] + for tokens in entities]) + + +def discontinuous_match(biluov, sentence): + """ + >>> discontinuous_match(['B','V','L'],['la', 'enfermedad', 'renal']) + [['la', 'enfermedad', 'renal'], ['enfermedad']] + >>> discontinuous_match(['O','V','I','L','O','I','L'],['el','cancer','de','pulmon','y','de','mama']) + [['cancer', 'de', 'pulmon'], ['cancer', 'de', 'mama']] + >>> discontinuous_match(['B','O','B','V'],['tejidos','y','organos','humanos']) + [['organos', 'humanos'], ['tejidos', 'humanos']] + >>> discontinuous_match(['O','V','I','L','O','I','L','O','B','O','B','V'], ['el','cancer','de','pulmon','y','de','mama','y','tejidos','y','organos','humanos']) + [['cancer', 'de', 'pulmon'], ['cancer', 'de', 'mama'], ['organos', 'humanos'], ['tejidos', 'humanos']] + >>> discontinuous_match(list('BBULL'), 'A B C D E'.split()) + [] + """ + entities = [] + for i, tag in enumerate(biluov): + if tag != "V": + continue + for entity_ids in _full_overlap(biluov, list(range(len(sentence))), i): + entity = [] + for idx in entity_ids: + entity.append(sentence[idx]) + biluov[idx] = "O" + entities.append(entity) + return entities + + +def _full_overlap(biluov, sentence, index, product=False): + """ + INDEX TAG MUST BE 'V' + + >>> _full_overlap(['B','V','L'], list(range(3)), 1) + [[0, 1, 2], [1]] + >>> _full_overlap(['B','V','V','L'], list(range(4)), 1) + [[0, 1, 2, 3], [1, 2]] + >>> _full_overlap(['B','V','V','L'], list(range(4)), 2) + [[0, 1, 2, 3], [1, 2]] + >>> _full_overlap(['B','V','V','V','L'], list(range(5)), 1) + [[0, 1, 2, 3, 4], [1, 2, 3]] + >>> _full_overlap(['B','V','V','V','L'], list(range(5)), 2) + [[0, 1, 2, 3, 4], [1, 2, 3]] + >>> _full_overlap(['B','V','V','V','L'], list(range(5)), 3) + [[0, 1, 2, 3, 4], [1, 2, 3]] + >>> _full_overlap(['B','B','V','L','L'], list(range(5)), 2) + [[1, 2, 3], [0, 2, 4]] + >>> _full_overlap(['B','I','B','O','V','I','L','O','L'], list(range(9)), 4) + [[2, 4, 5, 6], [0, 1, 4, 8]] + >>> _full_overlap(['B','I','B','O','V','I','L','O','L'], list(range(9)), 4, True) + [[2, 4, 5, 6], [2, 4, 8], [0, 1, 4, 5, 6], [0, 1, 4, 8]] + >>> _full_overlap(['0','0','V','L'], list(range(4)), 2) + [[2, 3], [2]] + >>> _full_overlap(['V','L'], list(range(2)), 0) + [[0, 1], [0]] + >>> _full_overlap(['B','V','O','O'], list(range(4)), 1) + [[0, 1], [1]] + >>> _full_overlap(['B','V'], list(range(2)), 1) + [[0, 1], [1]] + >>> _full_overlap(['0','0','V','O','O'], list(range(5)), 2) + [] + """ + + left = _right_to_left_overlap(biluov[:index + 1], sentence[:index + 1]) + right = _left_to_right_overlap(biluov[index:], sentence[index:]) + + full = [] + if product: + for l in left: + for r in right: + new = l + r[1:] if len(l) > len(r) else l[:-1] + r + full.append(new) + else: + for l, r in itt.zip_longest(left, 
right, fillvalue=[]):
+            new = l + r[1:] if len(l) > len(r) else l[:-1] + r
+            full.append(new)
+    return full
+
+
+def _left_to_right_overlap(biluov, sentence):
+    """
+    LEFTMOST TAG MUST BE 'V'
+
+    >>> _left_to_right_overlap(['V', 'V', 'O', 'V', 'I', 'L', 'O', 'I', 'L'], range(9))
+    [[0, 1, 3, 4, 5], [0, 1, 3, 7, 8]]
+    >>> _left_to_right_overlap(['V', 'O', 'V', 'O'], range(4))
+    []
+    >>> _left_to_right_overlap(['V', 'O', 'V', 'O', 'L'], range(5))
+    [[0, 2, 4], [0, 2]]
+    >>> _left_to_right_overlap(['V', 'O', 'V', 'O', 'L', 'O', 'L'], range(8))
+    [[0, 2, 4], [0, 2, 6]]
+    >>> _left_to_right_overlap(['V', 'O', 'V', 'O', 'L', 'I', 'L', 'V', 'L'], range(9))
+    [[0, 2, 4], [0, 2, 5, 6]]
+    """
+    return _build_overlap(biluov, sentence, "L")
+
+
+def _right_to_left_overlap(biluov, sentence):
+    """
+    RIGHTMOST TAG MUST BE 'V'
+
+    >>> _right_to_left_overlap(['B', 'I', 'O', 'B', 'I', 'V', 'O', 'V', 'V'], range(9))
+    [[3, 4, 5, 7, 8], [0, 1, 5, 7, 8]]
+    >>> _right_to_left_overlap(['O', 'V', 'O', 'V'], range(4))
+    []
+    >>> _right_to_left_overlap(['B', 'O', 'V', 'O', 'V'], range(5))
+    [[0, 2, 4], [2, 4]]
+    >>> _right_to_left_overlap(['B', 'O', 'B', 'O', 'V', 'O', 'V'], range(7))
+    [[2, 4, 6], [0, 4, 6]]
+    >>> _right_to_left_overlap(['B', 'V', 'B', 'I', 'B', 'O', 'V', 'O', 'V'], range(9))
+    [[4, 6, 8], [2, 3, 6, 8]]
+    """
+    inverse = _build_overlap(reversed(biluov), reversed(sentence), "B")
+    for x in inverse:
+        x.reverse()
+    return inverse
+
+
+def _build_overlap(biluov, sentence, finisher):
+    """
+    LEFTMOST TAG MUST BE 'V'
+    """
+
+    one_shot = zip(biluov, sentence)
+    tag, word = next(one_shot)
+
+    prefix = []
+    complete = []
+
+    try:
+        while tag in ("V", "O"):
+            if tag == "V":
+                prefix.append(word)
+            tag, word = next(one_shot)
+
+        on_build = []
+        while tag in ("O", "I", "U", finisher):
+            if tag == "I":
+                on_build.append(word)
+            elif tag == finisher:
+                complete.append(prefix + on_build + [word])
+                on_build.clear()
+            elif tag == "U":
+                complete.append([word])
+            tag, word = next(one_shot)
+    except StopIteration:
+        pass
+
+    if len(complete) == 1:
+        complete.append(prefix)
+
+    return complete

From 6c047de54c493527cb89ff9bca87eed6bec85c7c Mon Sep 17 00:00:00 2001
From: hiancdtrsnm
Date: Fri, 12 Mar 2021 15:15:15 -0500
Subject: [PATCH 2/2] Fix bug in sentence selection

---
 autobrat/controller.py     |  2 +-
 autobrat/experimentator.py | 24 ++++++++++++++++++------
 2 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/autobrat/controller.py b/autobrat/controller.py
index 9fb188a..84de06d 100644
--- a/autobrat/controller.py
+++ b/autobrat/controller.py
@@ -105,7 +105,7 @@ def get_batch(self,
         if set_processed:
             self.update_selected(selected)
 
-        return [s[0] for s in sentences]
+        return selected
 
     def close_pack(self, path: Path):
         collection = Collection()
diff --git a/autobrat/experimentator.py b/autobrat/experimentator.py
index dd8d594..504415d 100644
--- a/autobrat/experimentator.py
+++ b/autobrat/experimentator.py
@@ -23,6 +23,7 @@ def __init__(self, corpus: Collection) -> None:
         logger.info(f'Corpus total sentences: {len(corpus.sentences)}')
         lines, classes = load_training_entities(corpus)
         self.unique_classes = reduce(lambda x, y: x | y,
                                      [set(c) for c in classes])
+        logger.debug(f'Unique classes: {self.unique_classes}')
 
         self.train_data = {
@@ -56,8 +57,6 @@ def score(self, submit: Collection):
         score_data = subtaskA(self.test, submit)
         metrics = compute_metrics(score_data, skipB=True, skipC=True)
         logger.info(f'Score: {metrics}')
-        print(metrics)
-        print(len(submit), len(self.test))
         return metrics['f1']
 
     def run_experiment(self,
@@ -69,12 +68,13 @@ def run_experiment(self,
 
         scores = []
         # while sentences
-
         sentences = controller.get_batch(batch_size)
         while sentences:
             self.sentences_to_train.extend(sentences)
             lines, classes = [], []
-            for s in sentences:
+            for s in self.sentences_to_train:
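+                # Accumulate every sentence gathered so far, not just the
+                # newest batch, so the refit below sees the full labelled pool.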
                 line, cls = self.train_data[s]
                 lines.append(line)
                 classes.append(cls)
@@ -94,4 +94,18 @@ def run_experiment(self,
             scores.append(self.score(predicted_collection))
             sentences = controller.get_batch(batch_size)
 
-        return scores
\ No newline at end of file
+        return scores
+
+    def train_with_all(self):
+        controller = AnotatorController(self.sentences,
+                                        self.original_corpus,
+                                        db_path=Path('fullcorpus.json'))
+        sentences = []
+        predictions = controller.annotator.final_prediction([s for s in self.test_spacy_doc])
+        for (s, spacy_doc), prediction in zip(self.test_spacy_doc.items(), predictions):
+            sentence = make_sentence(spacy_doc, prediction, self.unique_classes)
+            sentence.fix_ids()
+            sentences.append(sentence)
+
+        predicted_collection = Collection(sentences)
+        return self.score(predicted_collection)
\ No newline at end of file