Extract datamodel to json and excel #18

Merged: 20 commits, Jan 10, 2025
5 changes: 4 additions & 1 deletion .gitignore
@@ -171,4 +171,7 @@ cython_debug/

# pytest coverage
pytest.xml
pytest-coverage.txt
pytest-coverage.txt

# artifacts
artifacts
5 changes: 4 additions & 1 deletion .vscode/settings.json
@@ -57,7 +57,10 @@
"cwd": "${workspaceFolder}",
"program": "${workspaceFolder}/.venv/bin/bam_masterdata",
"justMyCode": false,
"args": ["fill_masterdata"]
"args": [
"fill_masterdata",
// "--url=https://devel.datastore.bam.de/"
]
},
{
"name": "BM export-to-json",
109 changes: 99 additions & 10 deletions bam_masterdata/cli/cli.py
@@ -1,10 +1,23 @@
import os
import subprocess
import time
from pathlib import Path

import click
from decouple import config as environ
from openpyxl import Workbook

from bam_masterdata.cli.entities_to_excel import entities_to_excel
from bam_masterdata.cli.entities_to_json import entities_to_json
from bam_masterdata.cli.fill_masterdata import MasterdataCodeGenerator
from bam_masterdata.logger import logger
from bam_masterdata.utils import (
delete_and_create_dir,
import_module,
listdir_py_modules,
)

DATAMODEL_DIR = os.path.join(".", "bam_masterdata", "datamodel")


@click.group(help="Entry point to run `bam_masterdata` CLI commands.")
@@ -14,19 +27,31 @@ def cli():

@cli.command(
name="fill_masterdata",
help="Fill the masterdata from the openBIS instance specified in the `.env` in the bam_masterdata/datamodel/ subfolder.",
help="Fill the masterdata from the openBIS instance and stores it in the bam_masterdata/datamodel/ modules.",
)
@click.option(
"--url",
type=str,
required=False,
help="""
(Optional) The URL of the openBIS instance from which to extract the data model. If not defined,
the value of the `OPENBIS_URL` environment variable is used.
""",
)
def fill_masterdata():
def fill_masterdata(url):
start_time = time.time()

# ! this takes a lot of time loading all the entities in Openbis
generator = MasterdataCodeGenerator()
# Use the URL if provided, otherwise fall back to defaults
if not url:
url = environ("OPENBIS_URL")
click.echo(f"Using the openBIS instance: {url}\n")
generator = MasterdataCodeGenerator(url=url)

# Add each module to the `bam_masterdata/datamodel` directory
output_dir = os.path.join(".", "bam_masterdata", "datamodel")
for module_name in ["property", "collection", "dataset", "object", "vocabulary"]:
module_start_time = time.perf_counter() # more precise time measurement
output_file = Path(os.path.join(output_dir, f"{module_name}_types.py"))
output_file = Path(os.path.join(DATAMODEL_DIR, f"{module_name}_types.py"))

# Get the method from `MasterdataCodeGenerator`
code = getattr(generator, f"generate_{module_name}_types")()
@@ -40,12 +65,76 @@ def fill_masterdata():
elapsed_time = time.time() - start_time
click.echo(f"Generated all types in {elapsed_time:.2f} seconds\n\n")

# ! this could be automated in the CLI
click.echo(
"Don't forget to apply ruff at the end after generating the files by doing:\n"
)
click.echo(" ruff check .\n")
click.echo(" ruff format .\n")
try:
# Run ruff check
click.echo("Running `ruff check .`...")
subprocess.run(["ruff", "check", "."], check=True)

# Run ruff format
click.echo("Running `ruff format .`...")
subprocess.run(["ruff", "format", "."], check=True)
except subprocess.CalledProcessError as e:
click.echo(f"Error during ruff execution: {e}", err=True)
else:
click.echo("Ruff checks and formatting completed successfully!")


@cli.command(
name="export_to_json",
help="Export entities to JSON files to the `./artifacts/` folder.",
)
def export_to_json():
# Get the directories from the Python modules and the export directory for the static artifacts
export_dir = os.path.join(".", "artifacts")

# Delete and create the export directory
delete_and_create_dir(directory_path=export_dir, logger=logger)

# Get the Python modules to process the datamodel
py_modules = listdir_py_modules(directory_path=DATAMODEL_DIR, logger=logger)

# Process each module using the `to_json` method of each entity
for module_path in py_modules:
entities_to_json(module_path=module_path, export_dir=export_dir, logger=logger)

click.echo(f"All entity artifacts have been generated and saved to {export_dir}")


@cli.command(
name="export_to_excel",
help="Export entities to an Excel file in the path `./artifacts/masterdata.xlsx`.",
)
def export_to_excel():
# Get the Python modules to process the datamodel
py_modules = listdir_py_modules(directory_path=DATAMODEL_DIR, logger=logger)

# Load the definitions module classes
definitions_module = import_module(
module_path="./bam_masterdata/metadata/definitions.py"
)
click.echo(" ruff check .\n")
click.echo(" ruff format .\n")

# Process the modules and save the entities to the openBIS masterdata Excel file
masterdata_file = os.path.join(".", "artifacts", "masterdata.xlsx")
wb = Workbook()
for i, module_path in enumerate(py_modules):
if i == 0:
ws = wb.active
else:
ws = wb.create_sheet()
ws.title = (
os.path.basename(module_path)
.capitalize()
.replace(".py", "")
.replace("_", " ")
)
entities_to_excel(
worksheet=ws,
module_path=module_path,
definitions_module=definitions_module,
)
wb.save(masterdata_file)

click.echo(f"All masterdata have been generated and saved to {masterdata_file}")


if __name__ == "__main__":
99 changes: 99 additions & 0 deletions bam_masterdata/cli/entities_to_excel.py
@@ -0,0 +1,99 @@
import inspect
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
from openpyxl.worksheet.worksheet import Worksheet

from bam_masterdata.utils import import_module


def entities_to_excel(
worksheet: "Worksheet",
module_path: str,
definitions_module: Any,
) -> None:
"""
Export entities to the Excel file. The Python modules are imported using the function `import_module`,
and their contents are inspected (using `inspect`) to find the classes in the datamodel containing
`defs` and with a `to_json` method defined. Each row is then appended to the `worksheet`.

Args:
worksheet (Worksheet): The worksheet to append the entities.
module_path (str): Path to the Python module file.
definitions_module (Any): The module containing the definitions of the entities. This is used
to match the header definitions of the entities.
"""
def_members = inspect.getmembers(definitions_module, inspect.isclass)
module = import_module(module_path=module_path)

# Special case of `PropertyTypeDef` in `property_types.py`
if "property_types.py" in module_path:
for name, obj in inspect.getmembers(module):
if name.startswith("_") or name == "PropertyTypeDef":
continue

# Entity title
worksheet.append([obj.excel_name])

# Entity header definitions and values
worksheet.append(obj.excel_headers)
row = []
for f_set in obj.model_fields.keys():
if f_set == "data_type":
val = obj.data_type.value
else:
val = getattr(obj, f_set)
row.append(val)
worksheet.append(row)
worksheet.append([""]) # empty row after entity definitions
return None

# All other datamodel modules
for _, obj in inspect.getmembers(module, inspect.isclass):
# Ensure the class has the `to_json` method
if not hasattr(obj, "defs") or not callable(getattr(obj, "to_json")):
continue

obj_instance = obj()

# Entity title
obj_definitions = obj_instance.defs
worksheet.append([obj_definitions.excel_name])

# Entity header definitions and values
for def_name, def_cls in def_members:
if def_name == obj_definitions.name:
break
worksheet.append(obj_definitions.excel_headers)
header_values = [
getattr(obj_definitions, f_set) for f_set in def_cls.model_fields.keys()
]
worksheet.append(header_values)

# Properties assignment for ObjectType, DatasetType, and CollectionType
if obj_instance.cls_name in ["ObjectType", "DatasetType", "CollectionType"]:
if not obj_instance.properties:
continue
worksheet.append(obj_instance.properties[0].excel_headers)
for prop in obj_instance.properties:
row = []
for f_set in prop.model_fields.keys():
if f_set == "data_type":
val = prop.data_type.value
else:
val = getattr(prop, f_set)
row.append(val)
worksheet.append(row)
# Terms assignment for VocabularyType
elif obj_instance.cls_name == "VocabularyType":
if not obj_instance.terms:
continue
worksheet.append(obj_instance.terms[0].excel_headers)
for term in obj_instance.terms:
worksheet.append(
getattr(term, f_set) for f_set in term.model_fields.keys()
)

# ? does PropertyTypeDef need to be exported to Excel?

worksheet.append([""]) # empty row after entity definitions
67 changes: 67 additions & 0 deletions bam_masterdata/cli/entities_to_json.py
@@ -0,0 +1,67 @@
import inspect
import json
import os
from typing import TYPE_CHECKING

if TYPE_CHECKING:
from structlog._config import BoundLoggerLazyProxy

import click

from bam_masterdata.utils import delete_and_create_dir, import_module


def entities_to_json(
module_path: str, export_dir: str, logger: "BoundLoggerLazyProxy"
) -> None:
"""
Export entities to JSON files. The Python modules are imported using the function `import_module`,
and their contents are inspected (using `inspect`) to find the classes in the datamodel containing
`defs` and with a `to_json` method defined.

Args:
module_path (str): Path to the Python module file.
export_dir (str): Path to the directory where the JSON files will be saved.
logger (BoundLoggerLazyProxy): The logger to log messages.
"""
module = import_module(module_path=module_path)
# export to specific subfolders for each type of entity (each module)
module_export_dir = os.path.join(
export_dir, os.path.basename(module_path).replace(".py", "")
)
delete_and_create_dir(directory_path=module_export_dir, logger=logger)

# Special case of `PropertyTypeDef` in `property_types.py`
if "property_types.py" in module_path:
for name, obj in inspect.getmembers(module):
if name.startswith("_") or name == "PropertyTypeDef":
continue
try:
json_data = json.dumps(obj.model_dump(), indent=2)
output_file = os.path.join(module_export_dir, f"{obj.code}.json")
with open(output_file, "w", encoding="utf-8") as f:
f.write(json_data)

click.echo(f"Saved JSON for class {name} to {output_file}")
except Exception as err:
click.echo(f"Failed to process class {name} in {module_path}: {err}")
return None

# All other datamodel modules
for name, obj in inspect.getmembers(module, inspect.isclass):
# Ensure the class has the `to_json` method
if not hasattr(obj, "defs") or not callable(getattr(obj, "to_json")):
continue

try:
# Instantiate the class and call the method
json_data = obj().to_json(indent=2)

# Write JSON data to file
output_file = os.path.join(module_export_dir, f"{obj.defs.code}.json")
with open(output_file, "w", encoding="utf-8") as f:
f.write(json_data)

click.echo(f"Saved JSON for class {name} to {output_file}")
except Exception as err:
click.echo(f"Failed to process class {name} in {module_path}: {err}")
21 changes: 11 additions & 10 deletions bam_masterdata/cli/fill_masterdata.py
@@ -3,6 +3,7 @@
import click

from bam_masterdata.openbis import OpenbisEntities
from bam_masterdata.openbis.login import environ


class MasterdataCodeGenerator:
@@ -11,14 +12,14 @@ class MasterdataCodeGenerator:
openBIS instance.
"""

def __init__(self):
def __init__(self, url: str = ""):
start_time = time.time()
# * This part takes some time due to the loading of all entities from Openbis
self.properties = OpenbisEntities().get_property_dict()
self.collections = OpenbisEntities().get_collection_dict()
self.datasets = OpenbisEntities().get_dataset_dict()
self.objects = OpenbisEntities().get_object_dict()
self.vocabularies = OpenbisEntities().get_vocabulary_dict()
self.properties = OpenbisEntities(url=url).get_property_dict()
self.collections = OpenbisEntities(url=url).get_collection_dict()
self.datasets = OpenbisEntities(url=url).get_dataset_dict()
self.objects = OpenbisEntities(url=url).get_object_dict()
self.vocabularies = OpenbisEntities(url=url).get_vocabulary_dict()
elapsed_time = time.time() - start_time
click.echo(
f"Loaded OpenBIS entities in `MasterdataCodeGenerator` initialization {elapsed_time:.2f} seconds\n"
@@ -103,7 +104,7 @@ def add_properties(
# ! patching dataType=SAMPLE instead of OBJECT
if prop_data.get("dataType", "") == "SAMPLE":
prop_data["dataType"] = "OBJECT"
lines.append(f" data_type=\"{prop_data.get('dataType', '')}\",")
lines.append(f' data_type="{prop_data.get("dataType", "")}",')
property_label = (prop_data.get("label") or "").replace("\n", "\\n")
lines.append(f' property_label="{property_label}",')
description = (
@@ -163,7 +164,7 @@ def generate_property_types(self) -> str:
# ! patching dataType=SAMPLE instead of OBJECT
if data.get("dataType", "") == "SAMPLE":
data["dataType"] = "OBJECT"
lines.append(f" data_type=\"{data.get('dataType', '')}\",")
lines.append(f' data_type="{data.get("dataType", "")}",')
property_label = (
(data.get("label") or "").replace('"', '\\"').replace("\n", "\\n")
)
@@ -222,7 +223,7 @@ def generate_collection_types(self) -> str:
lines.append(f' description="""{description}""",')
if data.get("validationPlugin") != "":
lines.append(
f" validation_script=\"{data.get('validationPlugin')}\","
f' validation_script="{data.get("validationPlugin")}",'
)
lines.append(" )")
lines.append("")
@@ -327,7 +328,7 @@ def generate_object_types(self) -> str:
)
lines.append(f' description="""{description}""",')
lines.append(
f" generated_code_prefix=\"{data.get('generatedCodePrefix', '')}\","
f' generated_code_prefix="{data.get("generatedCodePrefix", "")}",'
)
lines.append(" )")
lines.append("")