From ca2b4edc38d3eeda9c5e1156875b2d4938496595 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Mon, 16 Oct 2023 11:30:55 -0400 Subject: [PATCH 1/7] Add github action to codespell main on push and PRs --- .github/workflows/codespell.yml | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 .github/workflows/codespell.yml diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml new file mode 100644 index 000000000..3ebbf5504 --- /dev/null +++ b/.github/workflows/codespell.yml @@ -0,0 +1,22 @@ +--- +name: Codespell + +on: + push: + branches: [main] + pull_request: + branches: [main] + +permissions: + contents: read + +jobs: + codespell: + name: Check for spelling errors + runs-on: ubuntu-latest + + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Codespell + uses: codespell-project/actions-codespell@v2 From 4886fda27508f097278adecea945a3462f801e75 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Mon, 16 Oct 2023 11:30:55 -0400 Subject: [PATCH 2/7] Add rudimentary codespell config --- pyproject.toml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 7f6aebc0c..028f97461 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -104,3 +104,9 @@ reverse_relative = true [build-system] requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning"] build-backend = "poetry_dynamic_versioning.backend" + +[tool.codespell] +skip = '.git,*.pdf,*.svg,poetry.lock,output,*.tsv' +# some specific phrases, variables and mixed case (CamelCase etc) +ignore-regex = '\b(Torsades de pointes|[A-Z][a-zA-Z]*|[a-z]+[A-Z][a-zA-Z]*)\b' +ignore-words-list = 'langual,sting,infarction,holliday,cyclin,convertor,ser,collapsin,infarctions' From b3ae901a64f6b817d4704c43f02e751d79081e1d Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Fri, 6 Dec 2024 19:36:30 -0500 Subject: [PATCH 3/7] Do not ignore folder with already committed to git tests/input/training This complicates commit of 
modifications --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 0ed907210..98a45a25d 100644 --- a/.gitignore +++ b/.gitignore @@ -141,6 +141,7 @@ tests/output/eval-* tasks/ training/ +!tests/input/training/ preserved/ random LOG From 6a03cc749421d516ae6e7a6cb1b3ad99bebc3a3a Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Fri, 6 Dec 2024 19:39:37 -0500 Subject: [PATCH 4/7] More of fixups --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 028f97461..8ba6afb2f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -108,5 +108,5 @@ build-backend = "poetry_dynamic_versioning.backend" [tool.codespell] skip = '.git,*.pdf,*.svg,poetry.lock,output,*.tsv' # some specific phrases, variables and mixed case (CamelCase etc) -ignore-regex = '\b(Torsades de pointes|[A-Z][a-zA-Z]*|[a-z]+[A-Z][a-zA-Z]*)\b' -ignore-words-list = 'langual,sting,infarction,holliday,cyclin,convertor,ser,collapsin,infarctions' +ignore-regex = '\b(Torsades de pointes|[A-Z][a-zA-Z]*|[a-z]+[A-Z][a-zA-Z]*|de pointes)\b|\bcommments:' +ignore-words-list = 'langual,sting,infarction,holliday,cyclin,convertor,ser,collapsin,infarctions,euclidian,dependant' From 2588571e3905c490a1acd8c455e18dd1d018f07c Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Wed, 18 Dec 2024 08:53:25 -0500 Subject: [PATCH 5/7] Ignore old and go-nucleus.json --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 8ba6afb2f..83d2a4303 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -106,7 +106,7 @@ requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning"] build-backend = "poetry_dynamic_versioning.backend" [tool.codespell] -skip = '.git,*.pdf,*.svg,poetry.lock,output,*.tsv' +skip = '.git,*.pdf,*.svg,poetry.lock,output,*.tsv,./tests/input,old' # some specific phrases, variables and mixed case (CamelCase etc) ignore-regex = '\b(Torsades de 
pointes|[A-Z][a-zA-Z]*|[a-z]+[A-Z][a-zA-Z]*|de pointes)\b|\bcommments:' ignore-words-list = 'langual,sting,infarction,holliday,cyclin,convertor,ser,collapsin,infarctions,euclidian,dependant' From 77bca8e6efad059dd654979e152eea07caf5e592 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Wed, 18 Dec 2024 08:54:11 -0500 Subject: [PATCH 6/7] [DATALAD RUNCMD] run codespell throughout fixing few left typos automagically === Do not change lines below === { "chain": [], "cmd": "codespell -w", "exit": 0, "extra_inputs": [], "inputs": [], "outputs": [], "pwd": "." } ^^^ Do not change lines above ^^^ --- Makefile | 2 +- docs/custom.md | 2 +- docs/functions.md | 2 +- docs/troubleshooting.md | 2 +- notebooks/BioEPIC_demo.ipynb | 4 ++-- src/ontogpt/cli.py | 4 ++-- src/ontogpt/clients/pubmed_client.py | 2 +- src/ontogpt/evaluation/drugmechdb/eval_drugmechdb.py | 2 +- src/ontogpt/templates/dietitian_notes.yaml | 2 +- src/ontogpt/templates/ecosim_methods.py | 2 +- src/ontogpt/templates/ecosim_methods.yaml | 2 +- src/ontogpt/templates/pathology.py | 2 +- src/ontogpt/templates/pathology.yaml | 2 +- src/ontogpt/templates/recipe.yaml | 2 +- src/ontogpt/utils/pymupdf_helpers.py | 2 +- 15 files changed, 17 insertions(+), 17 deletions(-) diff --git a/Makefile b/Makefile index 8a1a1afcb..347a6cccb 100644 --- a/Makefile +++ b/Makefile @@ -57,7 +57,7 @@ gh-deploy: all_recipes: tests/output/owl/merged/recipe-all-merged.owl # prefix with 'web' for a URL in recipe-urls.csv -# prefix wiyth 'case' for a previously downloaded recipe in cases/ directory +# prefix with 'case' for a previously downloaded recipe in cases/ directory RECIPES = case-spaghetti case-egg-noodles case-tortilla-soup \ web-spinach-and-feta-turkey-burgers \ web-shrimp-and-cheesy-grits-with-bacon \ diff --git a/docs/custom.md b/docs/custom.md index 42b2f6790..bc2ef3340 100644 --- a/docs/custom.md +++ b/docs/custom.md @@ -469,4 +469,4 @@ For example, if your schema is named `albatross.yaml`, then an extract command i 
ontogpt extract -t albatross.yaml -i input.txt ``` -Running this (or any other command including your custom schema) will install it for future use with OntoGPT, so in subsquent commands it can be referred to by its name (e.g., `albatross`, without the file extension or a full filepath). +Running this (or any other command including your custom schema) will install it for future use with OntoGPT, so in subsequent commands it can be referred to by its name (e.g., `albatross`, without the file extension or a full filepath). diff --git a/docs/functions.md b/docs/functions.md index cadec94e6..536d5c9a6 100644 --- a/docs/functions.md +++ b/docs/functions.md @@ -194,7 +194,7 @@ Including an instruction like the following anecdotally helps to avoid parsing f ### selectcols -Use the option `selectcols` to specify exact colums to use when parsing tabular files as input. +Use the option `selectcols` to specify exact columns to use when parsing tabular files as input. Example: diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 473a335e5..f5eb4094b 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -20,7 +20,7 @@ OntoGPT uses `oaklib` to handle the ontologies it uses as annotators, and `oakli To change the download location, set the `PYSTOW_HOME` variable in your environment to your preferred path. 
-For example, to save downloads to `/tmp/oaklib`, set the varible like this: +For example, to save downloads to `/tmp/oaklib`, set the variable like this: ```bash export PYSTOW_HOME='/tmp/' diff --git a/notebooks/BioEPIC_demo.ipynb b/notebooks/BioEPIC_demo.ipynb index 7780d5358..d6288f4a6 100644 --- a/notebooks/BioEPIC_demo.ipynb +++ b/notebooks/BioEPIC_demo.ipynb @@ -18,7 +18,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The following examples demonstrate basic functionality of OntoGPT and the SPIRES method for extracting and integrating data (i.e., concepts and relationships) from texts in the envrionmental and earth science domains.\n", + "The following examples demonstrate basic functionality of OntoGPT and the SPIRES method for extracting and integrating data (i.e., concepts and relationships) from texts in the environmental and earth science domains.\n", "These examples assume use of the LBNL CBORG computing resource." ] }, @@ -225,7 +225,7 @@ " A semicolon-separated list of variables measured in\n", " environmental and earth science research. Examples\n", " include: root shape, biomass, water turbidity\n", - " equipments:\n", + " equipment:\n", " range: Equipment\n", " description: >-\n", " A semicolon-separated list of equipment used in\n", diff --git a/src/ontogpt/cli.py b/src/ontogpt/cli.py index 13dc83a52..c6705656e 100644 --- a/src/ontogpt/cli.py +++ b/src/ontogpt/cli.py @@ -1208,7 +1208,7 @@ def synonyms( ontogpt synonyms -m ollama/llama3 --context "political" "abdicate" - ontogpt synonyms -m ollama/llama3 --context "biological" "dessicate" + ontogpt synonyms -m ollama/llama3 --context "biological" "desiccate" """ logging.info(f"Creating for {term}") @@ -2207,7 +2207,7 @@ def list_models(): Max Tokens: Token limit for the model. 
Note that models may tokenize text differently and calculate input and/or output tokens - in particular ways, so consult a model's original documentaion for + in particular ways, so consult a model's original documentation for further details. """ models = get_model_cost_map("") diff --git a/src/ontogpt/clients/pubmed_client.py b/src/ontogpt/clients/pubmed_client.py index 21e48e89f..251700726 100644 --- a/src/ontogpt/clients/pubmed_client.py +++ b/src/ontogpt/clients/pubmed_client.py @@ -183,7 +183,7 @@ def text( :param ids: List of PubMed IDs, or string with single PMID :param raw: if True, do not parse the xml, just return the raw output with tags :param autoformat: if True include title and abstract concatenated - :param pubmedcentral: if True, retreive text from PubMed Central where possible + :param pubmedcentral: if True, retrieve text from PubMed Central where possible :return: the text of a single entry, or a list of strings for text of multiple entries """ batch_size = 200 diff --git a/src/ontogpt/evaluation/drugmechdb/eval_drugmechdb.py b/src/ontogpt/evaluation/drugmechdb/eval_drugmechdb.py index 65cc23753..b5fd3a6ab 100644 --- a/src/ontogpt/evaluation/drugmechdb/eval_drugmechdb.py +++ b/src/ontogpt/evaluation/drugmechdb/eval_drugmechdb.py @@ -41,7 +41,7 @@ def _fix_source_mechanism(mechanism_dict: dict) -> dict: g["id"] = g["_id"] del g["_id"] # normalize alt_ids - bad_fields = ["all_id", "alt_name", "alt-name", "comemt", "comemnt"] + bad_fields = ["all_id", "alt_name", "alt-name", "comemt", "comemnt"]  # codespell:ignore comemnt for n in mechanism_dict["nodes"]: if "alt_ids" in n and isinstance(n["alt_ids"], str): n["alt_ids"] = [n["alt_ids"]] diff --git a/src/ontogpt/templates/dietitian_notes.yaml b/src/ontogpt/templates/dietitian_notes.yaml index a233ffbac..22db3944d 100644 --- a/src/ontogpt/templates/dietitian_notes.yaml +++ b/src/ontogpt/templates/dietitian_notes.yaml @@ -294,7 +294,7 @@ classes: range: string # TODO: distinguish whether this is currently active therapy - # or 
a reccomendation for future therapy (but not yet started) + # or a recommendation for future therapy (but not yet started) TherapeuticMaterial: description: >- A specific material added to a patient's diet or diff --git a/src/ontogpt/templates/ecosim_methods.py b/src/ontogpt/templates/ecosim_methods.py index b3280f677..3ee82faab 100644 --- a/src/ontogpt/templates/ecosim_methods.py +++ b/src/ontogpt/templates/ecosim_methods.py @@ -198,7 +198,7 @@ class TermSet(NamedEntity): locations: Optional[List[str]] = Field(None, description="""A semicolon-separated list of research locations. Examples include: Vermont, New York City, Ethiopia""", json_schema_extra = { "linkml_meta": {'alias': 'locations', 'domain_of': ['TermSet']} }) methods: Optional[List[str]] = Field(None, description="""A semicolon-separated list of methods used in environmental and earth science research. Examples include: sampling, spectroscopy""", json_schema_extra = { "linkml_meta": {'alias': 'methods', 'domain_of': ['TermSet']} }) variables: Optional[str] = Field(None, description="""A semicolon-separated list of variables measured in environmental and earth science research. 
Examples include: root shape, biomass, water turbidity""", json_schema_extra = { "linkml_meta": {'alias': 'variables', 'domain_of': ['TermSet']} }) - equipments: Optional[str] = Field(None, description="""A semicolon-separated list of equipment used in environmental and earth science research.""", json_schema_extra = { "linkml_meta": {'alias': 'equipments', 'domain_of': ['TermSet']} }) + equipment: Optional[str] = Field(None, description="""A semicolon-separated list of equipment used in environmental and earth science research.""", json_schema_extra = { "linkml_meta": {'alias': 'equipment', 'domain_of': ['TermSet']} }) equipment_to_variable_relationships: Optional[List[EquipmentMeasuresVariable]] = Field(None, description="""A semicolon separated list of relationships between specific equipment and variables they are used to measure as described in the input. Example: NMR spectrometer was used to measure chemical content""", json_schema_extra = { "linkml_meta": {'alias': 'equipment_to_variable_relationships', 'domain_of': ['TermSet']} }) id: str = Field(..., description="""A unique identifier for the named entity""", json_schema_extra = { "linkml_meta": {'alias': 'id', 'annotations': {'prompt.skip': {'tag': 'prompt.skip', 'value': 'true'}}, diff --git a/src/ontogpt/templates/ecosim_methods.yaml b/src/ontogpt/templates/ecosim_methods.yaml index 5900af1e7..031f201ab 100644 --- a/src/ontogpt/templates/ecosim_methods.yaml +++ b/src/ontogpt/templates/ecosim_methods.yaml @@ -42,7 +42,7 @@ classes: A semicolon-separated list of variables measured in environmental and earth science research. 
Examples include: root shape, biomass, water turbidity - equipments: + equipment: range: Equipment description: >- A semicolon-separated list of equipment used in diff --git a/src/ontogpt/templates/pathology.py b/src/ontogpt/templates/pathology.py index 71b8a645e..cdb85036d 100644 --- a/src/ontogpt/templates/pathology.py +++ b/src/ontogpt/templates/pathology.py @@ -318,7 +318,7 @@ class PathologyReport(ConfiguredBaseModel): """ linkml_meta: ClassVar[LinkMLMeta] = LinkMLMeta({'from_schema': 'http://w3id.org/ontogpt/pathology', 'tree_root': True}) - pathology_statements: Optional[List[PathologyStatement]] = Field(None, description="""A semicolon-delimited list of pathology statements, each describing a pathology, including any diagnoses, one or more specific qualities being measured and the anatomical location or tissue the pathology is measured in. If any of the pathology statements are negative, the negation should be included in each statment, e.g., \"no granulomas or viropathic changes\" should become \"no granulomas\" and \"no viropathic changes\".""", json_schema_extra = { "linkml_meta": {'alias': 'pathology_statements', 'domain_of': ['PathologyReport']} }) + pathology_statements: Optional[List[PathologyStatement]] = Field(None, description="""A semicolon-delimited list of pathology statements, each describing a pathology, including any diagnoses, one or more specific qualities being measured and the anatomical location or tissue the pathology is measured in. If any of the pathology statements are negative, the negation should be included in each statement, e.g., \"no granulomas or viropathic changes\" should become \"no granulomas\" and \"no viropathic changes\".""", json_schema_extra = { "linkml_meta": {'alias': 'pathology_statements', 'domain_of': ['PathologyReport']} }) is_benign: Optional[str] = Field(None, description="""Whether the overall pathology appears to be benign and not malignant. 
Other pathologies may be present, but if tissue is described as benign and/or if a carcinoma is explicitly excluded, this value should be true. A statement of \"no significant pathologic abnormality\" or the short form \"nspa\" would also have a value of true. It it otherwise 'unclear'.""", json_schema_extra = { "linkml_meta": {'alias': 'is_benign', 'annotations': {'prompt.example': {'tag': 'prompt.example', 'value': 'true, false, unclear'}}, diff --git a/src/ontogpt/templates/pathology.yaml b/src/ontogpt/templates/pathology.yaml index 150a3cbe6..b1a343f8a 100644 --- a/src/ontogpt/templates/pathology.yaml +++ b/src/ontogpt/templates/pathology.yaml @@ -43,7 +43,7 @@ classes: pathology, including any diagnoses, one or more specific qualities being measured and the anatomical location or tissue the pathology is measured in. If any of the pathology statements are negative, the - negation should be included in each statment, e.g., "no granulomas or + negation should be included in each statement, e.g., "no granulomas or viropathic changes" should become "no granulomas" and "no viropathic changes". range: PathologyStatement diff --git a/src/ontogpt/templates/recipe.yaml b/src/ontogpt/templates/recipe.yaml index 04f2646c3..49b97c376 100644 --- a/src/ontogpt/templates/recipe.yaml +++ b/src/ontogpt/templates/recipe.yaml @@ -18,7 +18,7 @@ prefixes: qudt: http://qudt.org/schema/qudt/ dbpediaont: http://dbpedia.org/ontology/ -# This template incorportates syntax from +# This template incorporates syntax from # linkml-owl to define OWL interpretations # and enable advanced functionality. 
# https://linkml.io/linkml-owl/templates/ diff --git a/src/ontogpt/utils/pymupdf_helpers.py b/src/ontogpt/utils/pymupdf_helpers.py index 9b909c2c7..bec15c643 100644 --- a/src/ontogpt/utils/pymupdf_helpers.py +++ b/src/ontogpt/utils/pymupdf_helpers.py @@ -106,7 +106,7 @@ def fonts(doc, granularity=False): def font_tags(font_counts, styles): """Return dictionary with font sizes as keys and tags as value. - :param font_counts: (font_size, count) for all fonts occuring in document + :param font_counts: (font_size, count) for all fonts occurring in document :type font_counts: list :param styles: all styles found in the document :type styles: dict From 349afc03912a3d672b4dddcb2784c0c243dbaa0a Mon Sep 17 00:00:00 2001 From: Harry Caufield Date: Wed, 18 Dec 2024 13:48:35 -0500 Subject: [PATCH 7/7] Add 'vrsatile' to ignored words --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 83d2a4303..5733d35c8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -109,4 +109,4 @@ build-backend = "poetry_dynamic_versioning.backend" skip = '.git,*.pdf,*.svg,poetry.lock,output,*.tsv,./tests/input,old' # some specific phrases, variables and mixed case (CamelCase etc) ignore-regex = '\b(Torsades de pointes|[A-Z][a-zA-Z]*|[a-z]+[A-Z][a-zA-Z]*|de pointes)\b|\bcommments:' -ignore-words-list = 'langual,sting,infarction,holliday,cyclin,convertor,ser,collapsin,infarctions,euclidian,dependant' +ignore-words-list = 'langual,sting,infarction,holliday,cyclin,convertor,ser,collapsin,infarctions,euclidian,dependant,vrsatile'