Commit
Merge pull request #8 from krflorian/rules-url
- Updated data ingestion, transformation, and loading.
- Added rule links to https://yawgatog.com.
Showing 10 changed files with 329 additions and 1,661 deletions.
@@ -1,16 +1,19 @@
#%%
from src.etl import RulesGuru
from src.objects import Document
# %%
from src.etl import RulesGuru, Rules, RulesDB
from pathlib import Path

#%%
# Extract RulesGuru Data
rg = RulesGuru()
# %%
rg.from_file()
# %%
rg.data
rg.get_data_raw()
#rg.data_raw = rg._from_file(Path("../data/etl/raw/documents/rulesguru.json.bak"))
rg.get_data_processed()

#%%
rg.process_data()
# Extract Rules Data
r = Rules()
r.get_data_raw()
r.get_data_processed()

#%%
from
# Load RulesDB
rdb = RulesDB()
rdb.load_data()
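For orientation, the added lines above amount to the following notebook-style cells. This is a sketch assembled from the new side of the interleaved diff (cell order and comments are taken from the added lines; nothing beyond them is assumed):

# %%
from src.etl import RulesGuru, Rules, RulesDB

# %%
# Extract RulesGuru Data
rg = RulesGuru()
rg.get_data_raw()
rg.get_data_processed()

# %%
# Extract Rules Data
r = Rules()
r.get_data_raw()
r.get_data_processed()

# %%
# Load RulesDB
rdb = RulesDB()
rdb.load_data()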
@@ -1,3 +1,3 @@
from .extract_rulesguru import RulesGuru

from .extract_rules import Rules
from .load import RulesDB
This file was deleted.
Empty file.
@@ -0,0 +1,102 @@
from pydantic import BaseModel, Field
from typing import Tuple, Any
from pathlib import Path
import json


class DataExtractor(BaseModel):
    api_url: str
    path_data_raw: Path
    path_data_processed: Path
    data_raw: Any = ""
    data_processed: list = Field(default_factory=list)
    data_processed_json: list = Field(default_factory=list)

    # def model_post_init(self) -> None:
    #     self.get_data()

    def extract_data(self) -> None:
        """
        Extract data from the data source and save it as .txt or .json to the path specified in self.path_data_raw.
        """
        pass

    def transform_data(self) -> None:
        """
        Load data from the path specified in self.path_data_raw, transform it into a list of JSON objects, and save them to the path specified in self.path_data_processed.
        """
        pass

    def get_data(self) -> Tuple[str | list, list]:
        """
        Get the raw and processed data by calling the get_data_raw and get_data_processed methods.
        """
        if not self.data_raw:
            self.get_data_raw()

        if not self.data_processed:
            self.get_data_processed()

        return self.data_raw, self.data_processed

    def get_data_raw(self) -> str | list:
        """
        Load the raw data into the class variable "data_raw".
        """
        if self.path_data_raw.is_file():
            self.data_raw = self._from_file(self.path_data_raw)
        else:
            # no cached raw file yet: extract it first, then load it
            self.extract_data()
            self.data_raw = self._from_file(self.path_data_raw)
        return self.data_raw

    def get_data_processed(self) -> list:
        """
        Load the processed data into the class variable "data_processed".
        """
        if self.path_data_processed.is_file():
            self.data_processed = self._from_file(self.path_data_processed)
        elif self.path_data_raw.is_file():
            # processed file missing but raw data exists: transform it first, then load it
            self.transform_data()
            self.data_processed = self._from_file(self.path_data_processed)
        return self.data_processed

    def _from_file(self, path: Path) -> str | list:
        """
        Load data from a file at the given path. Supports .txt and .json file types.
        """
        if path.suffix == ".txt":
            with open(path, "r", encoding="utf-8") as file:
                return file.read()
        elif path.suffix == ".json":
            with open(path, "r", encoding="utf-8") as file:
                return json.load(file)
        else:
            print(f"opening a file with filetype {path.suffix} is not supported")

    def _to_file(self, path: Path, data: str | list) -> None:
        """
        Save data to a file at the given path. Supports .txt and .json file types.
        """
        if path.suffix == ".txt":
            with open(path, "w", encoding="utf-8") as file:
                file.write(data)
        elif path.suffix == ".json":
            # serialize a list of pydantic models to plain dicts before dumping
            records = [doc.model_dump() for doc in data]
            with open(path, "w", encoding="utf-8") as file:
                json.dump(records, file)
        else:
            print(f"saving a file with filetype {path.suffix} is not supported")
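As a usage illustration, a concrete extractor only has to implement extract_data and transform_data against the contract documented above; get_data() then handles caching and loading. The subclass below is a hypothetical sketch, not code from this pull request: the class name ExampleRulings, the endpoint URL, the field selection, and the module path of DataExtractor are all assumptions, and the download uses the standard library's urllib so the example stays self-contained.

import json
import urllib.request
from pathlib import Path

from src.etl.extract import DataExtractor  # assumed module path for the base class shown above


class ExampleRulings(DataExtractor):
    """Hypothetical extractor: downloads a JSON payload and keeps a few fields per record."""

    def extract_data(self) -> None:
        # Fetch the raw payload from api_url and persist it unchanged to path_data_raw.
        with urllib.request.urlopen(self.api_url) as response:
            payload = response.read().decode("utf-8")
        self.path_data_raw.parent.mkdir(parents=True, exist_ok=True)
        self.path_data_raw.write_text(payload, encoding="utf-8")

    def transform_data(self) -> None:
        # Read the raw JSON, reduce each entry to the fields of interest,
        # and save the processed list to path_data_processed.
        raw = json.loads(self.path_data_raw.read_text(encoding="utf-8"))
        processed = [{"id": item.get("id"), "text": item.get("text")} for item in raw]
        self.path_data_processed.parent.mkdir(parents=True, exist_ok=True)
        with open(self.path_data_processed, "w", encoding="utf-8") as file:
            json.dump(processed, file)


# get_data() only triggers extract_data/transform_data when the cached files are missing.
extractor = ExampleRulings(
    api_url="https://example.com/rulings.json",               # hypothetical endpoint
    path_data_raw=Path("data/etl/raw/example.json"),           # assumed directory layout
    path_data_processed=Path("data/etl/processed/example.json"),
)
data_raw, data_processed = extractor.get_data()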