Skip to content

Commit

Permalink
geospatial and image (#5)
Browse files Browse the repository at this point in the history
* include resource

* make geospatial work

* test excel with non null values, fix lists and lists of lists handling, improve dictionary handling, make all embedding vectors lists not dicts

* fix image schema json; resolve issue of optional default values

* include excel files for geospatial and image

* remove LDA and Embeddings from schemas
  • Loading branch information
gblackadder authored Oct 25, 2024
1 parent c80dfe0 commit 12cdb85
Show file tree
Hide file tree
Showing 34 changed files with 1,057 additions and 1,421 deletions.
9 changes: 4 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,11 @@ There are metadata objects for each of the following metadata types:
|------------------|-------------------------------------------------|
| document | `document_schema.ScriptSchemaDraft` |
| geospatial | `geospatial_schema.GeospatialSchema` |
| image | `image_schema.ImageDataTypeSchema` |
| indicator | `indicator_schema.TimeseriesSchema` |
| indicators_db | `indicators_db_schema.TimeseriesDatabaseSchema` |
| microdata | `microdata_schema.MicrodataSchema` |
| microdata | `microdata_schema.MicrodataSchema` |
| resource | `resource_schema.Model` |
| script | `script_schema.ResearchProjectSchemaDraft` |
| table | `table_schema.Model` |
| video | `video_schema.Model` |
Expand Down Expand Up @@ -65,9 +67,6 @@ filename = mm.save_metadata_to_excel('indicator', object=indicator_metadata)

updated_indicator_metadata = mm.read_metadata_from_excel(filename)
```

Note that the Excel write and save functions do not currently support Geospatial metadata.

The manager also offers a convenient way to get started creating metadata in pydantic by creating an empty pydantic object for a given metadata type which can then be updated as needed.

```python
Expand Down Expand Up @@ -98,7 +97,7 @@ Next update the pydantic schemas so that they match the latest json schemas by r

Finally update the Excel sheets by running

`python pydantic_schemas/generators/generate_excel_files.py`
`python -m pydantic_schemas.generators.generate_excel_files`

## Versioning conventions for schemas

Expand Down
Binary file modified excel_sheets/Document_metadata.xlsx
Binary file not shown.
Binary file added excel_sheets/Geospatial_metadata.xlsx
Binary file not shown.
Binary file added excel_sheets/Image_metadata.xlsx
Binary file not shown.
Binary file modified excel_sheets/Indicators_db_metadata.xlsx
Binary file not shown.
Binary file modified excel_sheets/Microdata_metadata.xlsx
Binary file not shown.
Binary file modified excel_sheets/Script_metadata.xlsx
Binary file not shown.
Binary file modified excel_sheets/Table_metadata.xlsx
Binary file not shown.
Binary file modified excel_sheets/Video_metadata.xlsx
Binary file not shown.
42 changes: 1 addition & 41 deletions pydantic_schemas/document_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from __future__ import annotations

from enum import Enum
from typing import Any, Dict, List, Optional, Union
from typing import Any, Dict, List, Optional

from pydantic import Extra, Field

Expand Down Expand Up @@ -261,44 +261,6 @@ class Tag(SchemaBaseModel):
tag_group: Optional[str] = Field(None, title="Tag group")


class ModelInfoItem(SchemaBaseModel):
source: Optional[str] = Field(None, title="Source")
author: Optional[str] = Field(None, title="Author")
version: Optional[str] = Field(None, title="Version")
model_id: Optional[str] = Field(None, title="Model Identifier")
nb_topics: Optional[float] = Field(None, title="Number of topics")
description: Optional[str] = Field(None, title="Description")
corpus: Optional[str] = Field(None, title="Corpus name")
uri: Optional[str] = Field(None, title="URI")


class TopicWord(SchemaBaseModel):
word: Optional[str] = Field(None, title="Word")
word_weight: Optional[float] = Field(None, title="Word weight")


class TopicDescriptionItem(SchemaBaseModel):
topic_id: Optional[Union[int, str]] = Field(None, title="Topic identifier")
topic_score: Optional[Union[float, str]] = Field(None, title="Topic score")
topic_label: Optional[str] = Field(None, title="Topic label")
topic_words: Optional[List[TopicWord]] = Field(None, description="Words", title="Topic words")


class LdaTopic(SchemaBaseModel):
class Config:
extra = Extra.forbid

model_info: Optional[List[ModelInfoItem]] = Field(None, title="Model information")
topic_description: Optional[List[TopicDescriptionItem]] = Field(None, title="Topic information")


class Embedding(SchemaBaseModel):
id: str = Field(..., title="Vector Model ID")
description: Optional[str] = Field(None, title="Vector Model Description")
date: Optional[str] = Field(None, title="Date (YYYY-MM-DD)")
vector: Union[Dict[str, Any], List[Any]] = Field(..., title="Vector")


class OriginDescription(SchemaBaseModel):
harvest_date: Optional[str] = Field(None, description="Harvest date using UTC date format")
altered: Optional[bool] = Field(
Expand Down Expand Up @@ -587,6 +549,4 @@ class ScriptSchemaDraft(SchemaBaseModel):
)
provenance: Optional[List[ProvenanceSchema]] = Field(None, description="Provenance")
tags: Optional[List[Tag]] = Field(None, description="Tags", title="Tags")
lda_topics: Optional[List[LdaTopic]] = Field(None, description="LDA topics", title="LDA topics")
embeddings: Optional[List[Embedding]] = Field(None, description="Word embeddings", title="Word embeddings")
additional: Optional[Dict[str, Any]] = Field(None, description="Additional metadata")
14 changes: 6 additions & 8 deletions pydantic_schemas/generators/generate_excel_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@ def compare_excel_files(file1, file2):

# Check if both workbooks have the same sheets
if sheets1 != sheets2:
print("Sheet names do not match")
print(f"File1 sheets: {sheets1}")
print(f"File2 sheets: {sheets2}")
# print("Sheet names do not match")
# print(f"File1 sheets: {sheets1}")
# print(f"File2 sheets: {sheets2}")
return False

# Iterate through each sheet
Expand Down Expand Up @@ -62,9 +62,9 @@ def compare_excel_files(file1, file2):
differences.append(f"Alignment: {ws1[cell_address].alignment} != {ws2[cell_address].alignment}")

if differences:
print(f"Differences found at {sheet_name} {cell_address}:")
for difference in differences:
print(f" - {difference}")
# print(f"Differences found at {sheet_name} {cell_address}:")
# for difference in differences:
# print(f" - {difference}")
return False

return True
Expand All @@ -73,8 +73,6 @@ def compare_excel_files(file1, file2):
metadata_manager = MetadataManager()

for metadata_name in metadata_manager.metadata_type_names:
if metadata_name in ["image", "geospatial"]:
continue
filename = f"excel_sheets/{metadata_name.capitalize()}_metadata.xlsx"
print(f"Writing {metadata_name} outline to {filename}")
if os.path.exists(filename):
Expand Down
42 changes: 1 addition & 41 deletions pydantic_schemas/geospatial_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from __future__ import annotations

from enum import Enum
from typing import Any, Dict, List, Optional, Union
from typing import Any, Dict, List, Optional

from pydantic import Extra, Field, confloat

Expand Down Expand Up @@ -493,44 +493,6 @@ class Tag(SchemaBaseModel):
tag_group: Optional[str] = Field(None, title="Tag group")


class ModelInfoItem(SchemaBaseModel):
source: Optional[str] = Field(None, title="Source")
author: Optional[str] = Field(None, title="Author")
version: Optional[str] = Field(None, title="Version")
model_id: Optional[str] = Field(None, title="Model Identifier")
nb_topics: Optional[float] = Field(None, title="Number of topics")
description: Optional[str] = Field(None, title="Description")
corpus: Optional[str] = Field(None, title="Corpus name")
uri: Optional[str] = Field(None, title="URI")


class TopicWord(SchemaBaseModel):
word: Optional[str] = Field(None, title="Word")
word_weight: Optional[float] = Field(None, title="Word weight")


class TopicDescriptionItem(SchemaBaseModel):
topic_id: Optional[Union[int, str]] = Field(None, title="Topic identifier")
topic_score: Optional[Union[float, str]] = Field(None, title="Topic score")
topic_label: Optional[str] = Field(None, title="Topic label")
topic_words: Optional[List[TopicWord]] = Field(None, description="Words", title="Topic words")


class LdaTopic(SchemaBaseModel):
class Config:
extra = Extra.forbid

model_info: Optional[List[ModelInfoItem]] = Field(None, title="Model information")
topic_description: Optional[List[TopicDescriptionItem]] = Field(None, title="Topic information")


class Embedding(SchemaBaseModel):
id: str = Field(..., title="Vector Model ID")
description: Optional[str] = Field(None, title="Vector Model Description")
date: Optional[str] = Field(None, title="Date (YYYY-MM-DD)")
vector: Dict[str, Any] = Field(..., title="Vector")


class ResourceSchema(SchemaBaseModel):
"""
External resource schema
Expand Down Expand Up @@ -1523,8 +1485,6 @@ class GeospatialSchema(SchemaBaseModel):
description: Description = Field(..., title="Geospatial schema")
provenance: Optional[List[ProvenanceSchema]] = Field(None, description="Provenance")
tags: Optional[List[Tag]] = Field(None, description="Tags", title="Tags")
lda_topics: Optional[List[LdaTopic]] = Field(None, description="LDA topics", title="LDA topics")
embeddings: Optional[List[Embedding]] = Field(None, description="Word embeddings", title="Word embeddings")
additional: Optional[Dict[str, Any]] = Field(
None, description="Any additional metadata", title="Additional metadata"
)
Expand Down
56 changes: 2 additions & 54 deletions pydantic_schemas/image_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@

from datetime import datetime
from enum import Enum
from typing import Any, Dict, List, Optional, Union
from typing import Any, Dict, List, Optional

from pydantic import AnyUrl, Extra, Field, confloat, constr
from pydantic import AnyUrl, Extra, Field, confloat

from .utils.schema_base_model import SchemaBaseModel

Expand Down Expand Up @@ -71,44 +71,6 @@ class Tag(SchemaBaseModel):
tag_group: Optional[str] = Field(None, title="Tag group")


class ModelInfoItem(SchemaBaseModel):
source: Optional[str] = Field(None, title="Source")
author: Optional[str] = Field(None, title="Author")
version: Optional[str] = Field(None, title="Version")
model_id: Optional[str] = Field(None, title="Model Identifier")
nb_topics: Optional[float] = Field(None, title="Number of topics")
description: Optional[str] = Field(None, title="Description")
corpus: Optional[str] = Field(None, title="Corpus name")
uri: Optional[str] = Field(None, title="URI")


class TopicWord(SchemaBaseModel):
word: Optional[str] = Field(None, title="Word")
word_weight: Optional[float] = Field(None, title="Word weight")


class TopicDescriptionItem(SchemaBaseModel):
topic_id: Optional[Union[int, str]] = Field(None, title="Topic identifier")
topic_score: Optional[Union[float, str]] = Field(None, title="Topic score")
topic_label: Optional[str] = Field(None, title="Topic label")
topic_words: Optional[List[TopicWord]] = Field(None, description="Words", title="Topic words")


class LdaTopic(SchemaBaseModel):
class Config:
extra = Extra.forbid

model_info: Optional[List[ModelInfoItem]] = Field(None, title="Model information")
topic_description: Optional[List[TopicDescriptionItem]] = Field(None, title="Topic information")


class Embedding(SchemaBaseModel):
id: str = Field(..., title="Vector Model ID")
description: Optional[str] = Field(None, title="Vector Model Description")
date: Optional[str] = Field(None, title="Date (YYYY-MM-DD)")
vector: Dict[str, Any] = Field(..., title="Vector")


class SceneCodesLabelledItem(SchemaBaseModel):
code: Optional[str] = Field(None, description="Scene code as a string of 6 digits", title="Scene Code")
label: Optional[str] = Field(None, description="Label", title="Scene Label")
Expand Down Expand Up @@ -139,18 +101,6 @@ class Config:
description: Optional[str] = None


class AltLangObject(SchemaBaseModel):
class Config:
extra = Extra.forbid

__root__: Dict[
constr(
regex=r"^(((([A-Za-z]{2,3}(-([A-Za-z]{3}(-[A-Za-z]{3}){0,2}))?)|[A-Za-z]{4}|[A-Za-z]{5,8})(-([A-Za-z]{4}))?(-([A-Za-z]{2}|[0-9]{3}))?(-([A-Za-z0-9]{5,8}|[0-9][A-Za-z0-9]{3}))*(-([0-9A-WY-Za-wy-z](-[A-Za-z0-9]{2,8})+))*(-(x(-[A-Za-z0-9]{1,8})+))?)|(x(-[A-Za-z0-9]{1,8})+))$"
),
str,
] = Field(..., description="Text in alternative languages")


class ArtworkOrObject(SchemaBaseModel):
class Config:
extra = Extra.forbid
Expand Down Expand Up @@ -1176,6 +1126,4 @@ class ImageDataTypeSchema(SchemaBaseModel):
image_description: Optional[ImageDescription] = None
provenance: Optional[List[ProvenanceSchema]] = Field(None, description="Provenance")
tags: Optional[List[Tag]] = Field(None, description="Tags", title="Tags")
lda_topics: Optional[List[LdaTopic]] = Field(None, description="LDA topics", title="LDA topics")
embeddings: Optional[List[Embedding]] = Field(None, description="Word embeddings", title="Word embeddings")
additional: Optional[Dict[str, Any]] = Field(None, description="Additional metadata")
Loading

0 comments on commit 12cdb85

Please sign in to comment.