Skip to content

Commit

Permalink
Merge pull request #8 from krflorian/rules-url
Browse files Browse the repository at this point in the history
- updated data ingestion, transformation & loading.
- added rule-links to https://yawgatog.com
  • Loading branch information
Maximilian-Pichler authored Mar 3, 2024
2 parents dc281ad + 66ef114 commit 460e8a6
Show file tree
Hide file tree
Showing 10 changed files with 329 additions and 1,661 deletions.
25 changes: 14 additions & 11 deletions get_data.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,19 @@
#%%
from src.etl import RulesGuru
from src.objects import Document
# %%
from src.etl import RulesGuru, Rules, RulesDB
from pathlib import Path

#%%
# Extract RulesGuru Data
rg = RulesGuru()
# %%
rg.from_file()
# %%
rg.data
rg.get_data_raw()
#rg.data_raw = rg._from_file(Path("../data/etl/raw/documents/rulesguru.json.bak"))
rg.get_data_processed()

#%%
rg.process_data()
# Extract Rules Data
r = Rules()
r.get_data_raw()
r.get_data_processed()

#%%
from
# Load RulesDB
rdb = RulesDB()
rdb.load_data()
4 changes: 2 additions & 2 deletions src/etl/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from .extract_rulesguru import RulesGuru


from .extract_rules import Rules
from .load import RulesDB
39 changes: 0 additions & 39 deletions src/etl/create_rules_db.py

This file was deleted.

Empty file removed src/etl/create_rulesguru_db.py
Empty file.
102 changes: 102 additions & 0 deletions src/etl/extract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
from pydantic import BaseModel, Field
from typing import Tuple, Any
from pathlib import Path
import json


class DataExtractor(BaseModel):
    """
    Base model for extractors that keep raw and processed data in files.

    The base implementations of ``extract_data`` and ``transform_data`` are
    no-ops; concrete extractors presumably override them (verify against the
    subclasses). The ``get_*`` methods load data from ``path_data_raw`` /
    ``path_data_processed`` and trigger extraction / transformation when the
    corresponding file is missing.
    """

    api_url: str
    path_data_raw: Path
    path_data_processed: Path
    # Falsy defaults so that get_data() can tell "not loaded yet" apart.
    data_raw: Any = ""
    data_processed: list = Field(default_factory=list)
    data_processed_json: list = Field(default_factory=list)

    #def model_post_init(self) -> None:
    #    self.get_data()

    def extract_data(self) -> None:
        """
        Extract data from the data source, and save it as .txt or .json to
        the directory specified in self.path_data_raw.

        No-op in this base class.
        """
        pass

    def transform_data(self) -> None:
        """
        Load data from the directory specified in self.path_data_raw,
        transform the data so that it is a list of json objects, and save
        them to the directory specified in self.path_data_processed.

        No-op in this base class.
        """
        pass

    def get_data(self) -> Tuple[str | list, list]:
        """
        Return ``(data_raw, data_processed)``, loading whichever of the two
        is still empty via get_data_raw / get_data_processed.
        """
        if not self.data_raw:
            self.get_data_raw()

        if not self.data_processed:
            self.get_data_processed()

        return self.data_raw, self.data_processed

    def get_data_raw(self) -> str | list:
        """
        Load the raw data into the attribute "data_raw" and return it.

        If the raw file does not exist yet, extract_data() is called first
        and the file is read afterwards.
        """
        if not self.path_data_raw.is_file():
            self.extract_data()
        self.data_raw = self._from_file(self.path_data_raw)
        return self.data_raw

    def get_data_processed(self) -> list:
        """
        Load the processed data into the attribute "data_processed" and
        return it.

        If the processed file is missing but the raw file exists,
        transform_data() is called first. If neither file exists, nothing is
        loaded and the current value of "data_processed" is returned.
        """
        if self.path_data_processed.is_file():
            self.data_processed = self._from_file(self.path_data_processed)
        elif self.path_data_raw.is_file():
            self.transform_data()
            self.data_processed = self._from_file(self.path_data_processed)
        return self.data_processed

    def _from_file(self, path: Path) -> str | list | None:
        """
        Load data from a file with the given path. Supports .txt and .json
        file types.

        Returns the file content as a string for .txt, the decoded JSON
        value for .json, and None (after printing a notice) for any other
        suffix.
        """
        if path.suffix == ".txt":
            with open(path, "r", encoding="utf-8") as file:
                return file.read()
        if path.suffix == ".json":
            with open(path, "r", encoding="utf-8") as file:
                return json.load(file)

        # Unsupported suffixes are reported, not raised (best-effort).
        print(f"opening a file with filetype {path.suffix} is not supported")
        return None

    def _to_file(self, path: Path, data: str | list) -> None:
        """
        Save data to a file with the given path. Supports .txt and .json
        file types.

        For .txt, ``data`` is written as text. For .json, ``data`` is
        iterated; items exposing ``model_dump()`` are dumped through it,
        any other item is written as-is. Other suffixes only print a notice.
        """
        if path.suffix == ".txt":
            # Write UTF-8 explicitly: _from_file reads .txt files as UTF-8,
            # so relying on the platform default could corrupt a round trip.
            with open(path, "w", encoding="utf-8") as file:
                file.write(data)
        elif path.suffix == ".json":
            # Plain items (e.g. dicts previously loaded via _from_file) have
            # no model_dump(); pass them through unchanged.
            records = [
                doc.model_dump() if hasattr(doc, "model_dump") else doc
                for doc in data
            ]
            with open(path, "w", encoding="utf-8") as file:
                json.dump(records, file)
        else:
            print(f"saving a file with filetype {path.suffix} is not supported")


Loading

0 comments on commit 460e8a6

Please sign in to comment.