From 733d8062611802dba0d17db72a61e77abd38e897 Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Wed, 25 Oct 2023 12:23:55 -0400 Subject: [PATCH 1/5] Move assignment of model out of post init --- src/ontogpt/cli.py | 2 ++ src/ontogpt/evaluation/ctd/eval_ctd.py | 4 ++-- src/ontogpt/evaluation/evaluation_engine.py | 3 +-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/ontogpt/cli.py b/src/ontogpt/cli.py index ac0493b25..6b5d08fd4 100644 --- a/src/ontogpt/cli.py +++ b/src/ontogpt/cli.py @@ -1474,6 +1474,8 @@ def eval(evaluator, num_tests, output, chunking, model, **kwargs): if model: selectmodel = get_model_by_name(model) modelname = selectmodel["alternative_names"][0] + else: + modelname = DEFAULT_MODEL evaluator = create_evaluator(evaluator) evaluator.num_tests = num_tests diff --git a/src/ontogpt/evaluation/ctd/eval_ctd.py b/src/ontogpt/evaluation/ctd/eval_ctd.py index 03b80c00e..d55006229 100644 --- a/src/ontogpt/evaluation/ctd/eval_ctd.py +++ b/src/ontogpt/evaluation/ctd/eval_ctd.py @@ -136,8 +136,7 @@ class EvalCTD(SPIRESEvaluationEngine): object_prefix = "MESH" def __post_init__(self): - self.extractor = SPIRESEngine(template="ctd.ChemicalToDiseaseDocument", - model=self.model) + self.extractor = SPIRESEngine(template="ctd.ChemicalToDiseaseDocument") # synonyms are derived entirely from training set self.extractor.load_dictionary(DATABASE_DIR / "synonyms.yaml") @@ -197,6 +196,7 @@ def eval(self) -> EvaluationObjectSetRE: labeler = get_adapter("sqlite:obo:mesh") num_test = self.num_tests ke = self.extractor + self.extractor.model = self.model docs = list(self.load_test_cases()) shuffle(docs) eos = EvaluationObjectSetRE( diff --git a/src/ontogpt/evaluation/evaluation_engine.py b/src/ontogpt/evaluation/evaluation_engine.py index 534f7b8bf..dbf673f5e 100644 --- a/src/ontogpt/evaluation/evaluation_engine.py +++ b/src/ontogpt/evaluation/evaluation_engine.py @@ -16,7 +16,6 @@ from oaklib import BasicOntologyInterface from pydantic import BaseModel -from ontogpt import DEFAULT_MODEL from ontogpt.engines.spires_engine import SPIRESEngine @@ -101,6 +100,6 @@ class SPIRESEvaluationEngine(EvaluationEngine): """Whether to pre-process input texts by chunking. If True, each chunk gets its own prompt. Otherwise, pass the full text with each prompt.""" - model: str = DEFAULT_MODEL + model: str = None """Name of the model to use in evaluation. Defaults to the default model defined in models.yaml, generally gpt-3.5-turbo.""" From 2e3bbc126192c2215ccf5cfedf0c3e0e46239d44 Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Wed, 25 Oct 2023 14:40:58 -0400 Subject: [PATCH 2/5] Refactoring to pass parameters at init of eval engine --- src/ontogpt/cli.py | 8 ++++---- src/ontogpt/engines/knowledge_engine.py | 4 ++-- src/ontogpt/engines/spires_engine.py | 4 +++- src/ontogpt/evaluation/ctd/eval_ctd.py | 3 +-- src/ontogpt/evaluation/evaluation_engine.py | 3 +-- src/ontogpt/evaluation/resolver.py | 12 +++++++++--- 6 files changed, 20 insertions(+), 14 deletions(-) diff --git a/src/ontogpt/cli.py b/src/ontogpt/cli.py index 6b5d08fd4..b9e8cfe91 100644 --- a/src/ontogpt/cli.py +++ b/src/ontogpt/cli.py @@ -1477,10 +1477,10 @@ def eval(evaluator, num_tests, output, chunking, model, **kwargs): else: modelname = DEFAULT_MODEL - evaluator = create_evaluator(evaluator) - evaluator.num_tests = num_tests - evaluator.chunking = chunking - evaluator.model = modelname + evaluator = create_evaluator(name=evaluator, + num_tests=num_tests, + chunking=chunking, + model=modelname) eos = evaluator.eval() output.write(dump_minimal_yaml(eos, minimize=False)) diff --git a/src/ontogpt/engines/knowledge_engine.py b/src/ontogpt/engines/knowledge_engine.py index 92cec8c7a..cb6da8e74 100644 --- a/src/ontogpt/engines/knowledge_engine.py +++ b/src/ontogpt/engines/knowledge_engine.py @@ -104,7 +104,7 @@ class KnowledgeEngine(ABC): api_key: str = "" """OpenAI API key.""" - model: MODEL_NAME = "" + model: str = None """Language Model. This may be overridden in subclasses.""" # annotator: TextAnnotatorInterface = None @@ -166,7 +166,7 @@ def __post_init__(self): try: self.encoding = tiktoken.encoding_for_model(self.client.model) except KeyError: - self.encoding = tiktoken.encoding_for_model("gpt-3.5-turbo") + self.encoding = tiktoken.encoding_for_model(DEFAULT_MODEL) logger.error(f"Could not find encoding for model {self.client.model}") def set_api_key(self, key: str): diff --git a/src/ontogpt/engines/spires_engine.py b/src/ontogpt/engines/spires_engine.py index 9096a26a0..22d563e6f 100644 --- a/src/ontogpt/engines/spires_engine.py +++ b/src/ontogpt/engines/spires_engine.py @@ -43,7 +43,9 @@ class SPIRESEngine(KnowledgeEngine): """Knowledge extractor.""" - engine: str = "gpt-3.5-turbo-instruct" + engine: str = None + """The engine name.""" + recurse: bool = True """If true, then complex non-named entity objects are always recursively parsed. If this is false AND the complex object is a pair, then token-based splitting is diff --git a/src/ontogpt/evaluation/ctd/eval_ctd.py b/src/ontogpt/evaluation/ctd/eval_ctd.py index d55006229..ce006c687 100644 --- a/src/ontogpt/evaluation/ctd/eval_ctd.py +++ b/src/ontogpt/evaluation/ctd/eval_ctd.py @@ -136,7 +136,7 @@ class EvalCTD(SPIRESEvaluationEngine): object_prefix = "MESH" def __post_init__(self): - self.extractor = SPIRESEngine(template="ctd.ChemicalToDiseaseDocument") + self.extractor = SPIRESEngine(template="ctd.ChemicalToDiseaseDocument", model=self.model) # synonyms are derived entirely from training set self.extractor.load_dictionary(DATABASE_DIR / "synonyms.yaml") @@ -196,7 +196,6 @@ def eval(self) -> EvaluationObjectSetRE: labeler = get_adapter("sqlite:obo:mesh") num_test = self.num_tests ke = self.extractor - self.extractor.model = self.model docs = list(self.load_test_cases()) shuffle(docs) eos = EvaluationObjectSetRE( diff --git a/src/ontogpt/evaluation/evaluation_engine.py b/src/ontogpt/evaluation/evaluation_engine.py index dbf673f5e..1970cf0dc 100644 --- a/src/ontogpt/evaluation/evaluation_engine.py +++ b/src/ontogpt/evaluation/evaluation_engine.py @@ -101,5 +101,4 @@ class SPIRESEvaluationEngine(EvaluationEngine): prompt. Otherwise, pass the full text with each prompt.""" model: str = None - """Name of the model to use in evaluation. Defaults to the default model defined - in models.yaml, generally gpt-3.5-turbo.""" + """Name of the model to use in evaluation.""" diff --git a/src/ontogpt/evaluation/resolver.py b/src/ontogpt/evaluation/resolver.py index 8e1d78a97..12661d3c9 100644 --- a/src/ontogpt/evaluation/resolver.py +++ b/src/ontogpt/evaluation/resolver.py @@ -9,10 +9,16 @@ resolver = ClassResolver([EvalCTD], base=SPIRESEvaluationEngine) -def create_evaluator(name: Optional[Union[str, Type]] = None, **kwargs) -> SPIRESEvaluationEngine: +def create_evaluator( + name: Optional[Union[str, Type]] = None, + num_tests: Optional[Union[int, Type]] = None, + chunking: Optional[Union[bool, Type]] = None, + model: Optional[Union[str, Type]] = None, + **kwargs, +) -> SPIRESEvaluationEngine: """Create a knowledge engine.""" if name is None: name = EvalCTD if isinstance(name, str): - return resolver.lookup(name)(**kwargs) - return name(**kwargs) + return resolver.lookup(name)(num_tests=num_tests, chunking=chunking, model=model, **kwargs) + return name(num_tests=num_tests, chunking=chunking, model=model, **kwargs) From 3f1dee8221bb2f1a1fd34df758caeec5aa01f123 Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Wed, 25 Oct 2023 15:25:26 -0400 Subject: [PATCH 3/5] Typing fixes --- src/ontogpt/evaluation/evaluation_engine.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/ontogpt/evaluation/evaluation_engine.py b/src/ontogpt/evaluation/evaluation_engine.py index 1970cf0dc..9170af88d 100644 --- a/src/ontogpt/evaluation/evaluation_engine.py +++ b/src/ontogpt/evaluation/evaluation_engine.py @@ -11,7 +11,7 @@ """ from dataclasses import dataclass -from typing import List, Optional, Set +from typing import List, Optional, Set, Type, Union from oaklib import BasicOntologyInterface from pydantic import BaseModel @@ -92,13 +92,13 @@ class SPIRESEvaluationEngine(EvaluationEngine): num_tests: int = 10 """Number of test cases to use for evaluation""" - num_training: int = 5 + num_training: Optional[Union[int, Type]] = 5 """Number of training/exemplar cases to use for evaluation in generalization task. Note this number will be low as we use few-shot learning.""" - chunking: bool = False + chunking: Optional[Union[bool, Type]] = False """Whether to pre-process input texts by chunking. If True, each chunk gets its own prompt. Otherwise, pass the full text with each prompt.""" - model: str = None + model: Optional[Union[str, Type]] = None """Name of the model to use in evaluation.""" From 639ea8e566aacd71a2ff98f51a203cc463f5a1d6 Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Wed, 25 Oct 2023 15:31:39 -0400 Subject: [PATCH 4/5] Another type fix --- src/ontogpt/evaluation/evaluation_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ontogpt/evaluation/evaluation_engine.py b/src/ontogpt/evaluation/evaluation_engine.py index 9170af88d..993af4f92 100644 --- a/src/ontogpt/evaluation/evaluation_engine.py +++ b/src/ontogpt/evaluation/evaluation_engine.py @@ -89,7 +89,7 @@ class SPIRESEvaluationEngine(EvaluationEngine): extractor: SPIRESEngine = None """Knowledge extractor to use""" - num_tests: int = 10 + num_tests: Optional[Union[int, Type]] = 10 """Number of test cases to use for evaluation""" num_training: Optional[Union[int, Type]] = 5 From cc6e9bd222c6d6a33011c3a5c2da3c88c097269b Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Wed, 25 Oct 2023 15:40:36 -0400 Subject: [PATCH 5/5] Another new type fix --- src/ontogpt/evaluation/ctd/eval_ctd.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/ontogpt/evaluation/ctd/eval_ctd.py b/src/ontogpt/evaluation/ctd/eval_ctd.py index ce006c687..e6a003cd6 100644 --- a/src/ontogpt/evaluation/ctd/eval_ctd.py +++ b/src/ontogpt/evaluation/ctd/eval_ctd.py @@ -194,7 +194,10 @@ def create_training_set(self, num=100): def eval(self) -> EvaluationObjectSetRE: """Evaluate the ability to extract relations.""" labeler = get_adapter("sqlite:obo:mesh") - num_test = self.num_tests + if self.num_tests and isinstance(self.num_tests, int): + num_test = self.num_tests + else: + num_test = 1 ke = self.extractor docs = list(self.load_test_cases()) shuffle(docs)