Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Data Submission Reports Feature #28

Merged
merged 11 commits into from
Feb 9, 2024
2 changes: 1 addition & 1 deletion .flake8
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
[flake8]
max-line-length = 88
exclude = venv,.git,__pycache__,docs/source/conf.py,old,build,dist,node_modules
ignore = F403, F401
ignore = F403, F401, W503
7 changes: 5 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -171,5 +171,8 @@ control/
# Zipped artifacts
*.zip

# frontend dependencies
node_modules/
# Frontend dependencies
node_modules/

# Local notes
.notes
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,6 @@ ENV PATH="${PATH}:/opt/poetry/bin"
# Install dependencies and start app
WORKDIR /app
COPY pyproject.toml poetry.lock ./
RUN poetry install --only main
RUN poetry install --without dev
COPY . .
CMD ["/bin/sh", "start_local.sh"]
30 changes: 30 additions & 0 deletions alembic/versions/851709d3a162_add_report_column.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
"""Add report column

Revision ID: 851709d3a162
Revises: c5596492c87b
Create Date: 2024-02-06 13:22:25.266733

"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa
from sqlalchemy.types import JSON


# revision identifiers, used by Alembic.
revision: str = "851709d3a162"
down_revision: Union[str, None] = "c5596492c87b"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade():
op.add_column(
"data_submissions",
sa.Column("report", JSON, nullable=True),
)


def add_column():
op.drop_column("data_submissions", "report")
68 changes: 67 additions & 1 deletion nad_ch/application/dtos.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,73 @@
from dataclasses import dataclass
from dataclasses import dataclass, asdict, field, is_dataclass
from typing import List
import numpy as np


@dataclass
class DownloadResult:
temp_dir: str
extracted_dir: str


@dataclass
class DataSubmissionReportOverview:
feature_count: int = 0
features_flagged: int = 0
etl_update_required: bool = False
data_update_required: bool = False


@dataclass
class DataSubmissionReportFeature:
provided_feature_name: str
nad_feature_name: str
populated_count: int
null_count: int


@dataclass
class DataSubmissionReport:
overview: DataSubmissionReportOverview
features: List[DataSubmissionReportFeature] = field(default_factory=list)


def report_to_dict(data_submission_report: DataSubmissionReport) -> dict:
"""
Converts a DataSubmissionReport instance into a dictionary because all data types
within the dictionary must be JSON-serializable.
"""
return convert(asdict(data_submission_report))


def report_from_dict(data: dict) -> DataSubmissionReport:
"""
Creates a DataSubmissionReport instance from a dictionary, reconstructing the
overview and features properties from their respective dictionary representations.
"""

overview_data = data.get("overview", {})
features_data = data.get("features", [])

overview = DataSubmissionReportOverview(**overview_data)
features = [
DataSubmissionReportFeature(**feature_data) for feature_data in features_data
]

return DataSubmissionReport(overview=overview, features=features)


def convert(item):
"""
Recursively converts items within a data structure (including dictionaries, lists,
and dataclass instances) such that all numeric types are JSON-serializable.
"""
if isinstance(item, dict):
return {k: convert(v) for k, v in item.items()}
elif isinstance(item, list):
return [convert(i) for i in item]
elif isinstance(item, (np.int64, np.int32, np.float64, np.float32)):
return int(item) if isinstance(item, (np.int64, np.int32)) else float(item)
elif is_dataclass(item):
return convert(asdict(item))
else:
return item
4 changes: 3 additions & 1 deletion nad_ch/application/interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,9 @@ def cleanup_temp_dir(self, temp_dir: str) -> bool:


class TaskQueue(Protocol):
def run_load_and_validate(self, path: str):
def run_load_and_validate(
self, submissions: DataSubmissionRepository, submission_id: int, path: str
):
...


Expand Down
6 changes: 4 additions & 2 deletions nad_ch/application/use_cases.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,10 @@ def validate_data_submission(ctx: ApplicationContext, filename: str):
ctx.logger.error("Data extration error")
return

result = ctx.task_queue.run_load_and_validate(download_result.extracted_dir)
report = ctx.task_queue.run_load_and_validate(
ctx.submissions, submission.id, download_result.extracted_dir
)

ctx.logger.info(f"Total number of features: {result.get()}")
ctx.logger.info(f"Total number of features: {report.overview.feature_count}")

ctx.storage.cleanup_temp_dir(download_result.temp_dir)
21 changes: 21 additions & 0 deletions nad_ch/application/validation.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,26 @@
from typing import List
from geopandas import GeoDataFrame
from nad_ch.application.dtos import DataSubmissionReportFeature


def get_feature_count(gdf: GeoDataFrame) -> int:
return len(gdf)


def get_feature_details(gdf: GeoDataFrame) -> List[DataSubmissionReportFeature]:
report_features = []

for column in gdf.columns:
populated_count = gdf[column].notna().sum()
null_count = gdf[column].isna().sum()

report_feature = DataSubmissionReportFeature(
provided_feature_name=column,
nad_feature_name=column,
populated_count=populated_count,
null_count=null_count,
)

report_features.append(report_feature)

return report_features
6 changes: 3 additions & 3 deletions nad_ch/config/development_local.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,18 +32,18 @@

class DevLocalApplicationContext(ApplicationContext):
def __init__(self):
self._session = create_session_factory(DATABASE_URL)
self._session_factory = create_session_factory(DATABASE_URL)
self._providers = self.create_provider_repository()
self._submissions = self.create_submission_repository()
self._logger = self.create_logger()
self._storage = self.create_storage()
self._task_queue = self.create_task_queue()

def create_provider_repository(self):
return SqlAlchemyDataProviderRepository(self._session)
return SqlAlchemyDataProviderRepository(self._session_factory)

def create_submission_repository(self):
return SqlAlchemyDataSubmissionRepository(self._session)
return SqlAlchemyDataSubmissionRepository(self._session_factory)

def create_logger(self):
return BasicLogger(__name__, logging.DEBUG)
Expand Down
6 changes: 3 additions & 3 deletions nad_ch/config/development_remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,18 +39,18 @@ def get_credentials(service_name, default={}):

class DevRemoteApplicationContext(ApplicationContext):
def __init__(self):
self._session = create_session_factory(DATABASE_URL)
self._session_factory = create_session_factory(DATABASE_URL)
self._providers = self.create_provider_repository()
self._submissions = self.create_submission_repository()
self._logger = self.create_logger()
self._storage = self.create_storage()
self._task_queue = self.create_task_queue()

def create_provider_repository(self):
return SqlAlchemyDataProviderRepository(self._session)
return SqlAlchemyDataProviderRepository(self._session_factory)

def create_submission_repository(self):
return SqlAlchemyDataSubmissionRepository(self._session)
return SqlAlchemyDataSubmissionRepository(self._session_factory)

def create_logger(self):
return BasicLogger(__name__)
Expand Down
5 changes: 5 additions & 0 deletions nad_ch/domain/entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,13 @@ def __init__(
self,
filename: str,
provider: DataProvider,
report=None,
id: int = None,
):
super().__init__(id)
self.filename = filename
self.provider = provider
self.report = report

def __repr__(self):
return f"DataSubmission \
Expand All @@ -55,3 +57,6 @@ def generate_filename(file_path: str, provider: DataProvider) -> str:
_, file_extension = os.path.splitext(file_path)
filename = f"{formatted_provider_name}_{datetime_str}{file_extension}"
return filename

def has_report(self) -> bool:
return self.report is not None
3 changes: 3 additions & 0 deletions nad_ch/domain/repositories.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,6 @@ def get_by_provider(self, provider: DataProvider) -> Iterable[DataSubmission]:

def get_by_filename() -> Optional[DataSubmission]:
...

def update_report(self, submission_id: int, report) -> None:
...
42 changes: 29 additions & 13 deletions nad_ch/infrastructure/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from sqlalchemy import Column, Integer, String, create_engine, ForeignKey, DateTime
from sqlalchemy.orm import sessionmaker, declarative_base, relationship, Session
from sqlalchemy.sql import func
from sqlalchemy.types import JSON
import contextlib
from nad_ch.domain.entities import DataProvider, DataSubmission
from nad_ch.domain.repositories import DataProviderRepository, DataSubmissionRepository
Expand All @@ -16,7 +17,7 @@ def create_session_factory(connection_string: str):

@contextlib.contextmanager
def session_scope(session_factory):
session = session_factory
session = session_factory()
try:
yield session
session.commit()
Expand Down Expand Up @@ -76,6 +77,7 @@ class DataSubmissionModel(CommonBase):

filename = Column(String)
data_provider_id = Column(Integer, ForeignKey("data_providers.id"))
report = Column(JSON)

data_provider = relationship("DataProviderModel", back_populates="data_submissions")

Expand All @@ -84,12 +86,15 @@ def from_entity(submission):
model = DataSubmissionModel(
id=submission.id,
filename=submission.filename,
report=submission.report,
data_provider_id=submission.provider.id,
)
return model

def to_entity(self, provider: DataProvider):
entity = DataSubmission(id=self.id, filename=self.filename, provider=provider)
entity = DataSubmission(
id=self.id, filename=self.filename, report=self.report, provider=provider
)

if self.created_at is not None:
entity.set_created_at(self.created_at)
Expand All @@ -101,19 +106,19 @@ def to_entity(self, provider: DataProvider):


class SqlAlchemyDataProviderRepository(DataProviderRepository):
def __init__(self, session: Session):
self.session_factory = session
def __init__(self, session_factory):
self.session_factory = session_factory

def add(self, provider: DataProvider) -> DataProvider:
with self.session_factory() as session:
with session_scope(self.session_factory) as session:
provider_model = DataProviderModel.from_entity(provider)
session.add(provider_model)
session.commit()
session.refresh(provider_model)
return provider_model.to_entity()

def get_by_name(self, name: str) -> Optional[DataProvider]:
with self.session_factory() as session:
with session_scope(self.session_factory) as session:
provider_model = (
session.query(DataProviderModel)
.filter(DataProviderModel.name == name)
Expand All @@ -122,18 +127,18 @@ def get_by_name(self, name: str) -> Optional[DataProvider]:
return provider_model.to_entity() if provider_model else None

def get_all(self) -> List[DataProvider]:
with self.session_factory() as session:
with session_scope(self.session_factory) as session:
provider_models = session.query(DataProviderModel).all()
providers_entities = [provider.to_entity() for provider in provider_models]
return providers_entities


class SqlAlchemyDataSubmissionRepository(DataSubmissionRepository):
def __init__(self, session: Session):
self.session_factory = session
def __init__(self, session_factory):
self.session_factory = session_factory

def add(self, submission: DataSubmission) -> DataSubmission:
with self.session_factory() as session:
with session_scope(self.session_factory) as session:
submission_model = DataSubmissionModel.from_entity(submission)
session.add(submission_model)
session.commit()
Expand All @@ -146,7 +151,7 @@ def add(self, submission: DataSubmission) -> DataSubmission:
return submission_model.to_entity(provider_model.to_entity())

def get_by_id(self, id: int) -> Optional[DataSubmission]:
with self.session_factory() as session:
with session_scope(self.session_factory) as session:
result = (
session.query(DataSubmissionModel, DataProviderModel)
.join(
Expand All @@ -164,7 +169,7 @@ def get_by_id(self, id: int) -> Optional[DataSubmission]:
return None

def get_by_provider(self, provider: DataProvider) -> List[DataSubmission]:
with self.session_factory() as session:
with session_scope(self.session_factory) as session:
submission_models = (
session.query(DataSubmissionModel)
.filter(DataSubmissionModel.data_provider_id == provider.id)
Expand All @@ -176,7 +181,7 @@ def get_by_provider(self, provider: DataProvider) -> List[DataSubmission]:
return submission_entities

def get_by_filename(self, filename: str) -> Optional[DataSubmission]:
with self.session_factory() as session:
with session_scope(self.session_factory) as session:
result = (
session.query(DataSubmissionModel, DataProviderModel)
.join(
Expand All @@ -192,3 +197,14 @@ def get_by_filename(self, filename: str) -> Optional[DataSubmission]:
return submission_model.to_entity(provider_model.to_entity())
else:
return None

def update_report(self, id: int, report) -> None:
with session_scope(self.session_factory) as session:
model_instance = (
session.query(DataSubmissionModel)
.filter(DataSubmissionModel.id == id)
.first()
)

if model_instance:
model_instance.report = report
Loading
Loading