Skip to content

Commit

Permalink
Merge pull request #6 from Maximilian-Pichler/rulesguru
Browse files Browse the repository at this point in the history
added rulesguru OOP
  • Loading branch information
Maximilian-Pichler authored Feb 11, 2024
2 parents debe0b0 + 80f8836 commit d830040
Show file tree
Hide file tree
Showing 9 changed files with 1,543 additions and 809 deletions.
16 changes: 16 additions & 0 deletions get_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#%%
# Notebook-style script (#%% cell markers): load cached RulesGuru question
# data and process it into Document-shaped records.
from src.etl import RulesGuru
from src.objects import Document

#%%
rg = RulesGuru()
# %%
# Read previously cached raw questions from disk into rg.data.
rg.from_file()
# %%
# Inspect the raw entries (cell output in an interactive session).
rg.data

#%%
# Flatten raw entries into documents and persist them.
rg.process_data()
# NOTE(review): the original file ended with a dangling `from` statement,
# which is a syntax error; it has been removed.
821 changes: 16 additions & 805 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ sentence-transformers = "^2.2.2"
fastapi = "^0.105.0"
matplotlib = "^3.8.2"
wikipedia = "^1.4.0"
ipykernel = "^6.29.2"


[tool.poetry.group.dev.dependencies]
Expand Down
3 changes: 3 additions & 0 deletions src/etl/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .extract_rulesguru import RulesGuru


4 changes: 2 additions & 2 deletions src/etl/create_rules_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
if __name__ == "__main__":
# create variables
db_name = "rules_db_gte"
DATA_PATH = Path("../data")
ETL_PATH = DATA_PATH / "etl"
DATA_PATH = Path("./data")
ETL_PATH = DATA_PATH / "processed/documents"
ARTIFACT_PATH = DATA_PATH / "artifacts"

# load documents
Expand Down
Empty file added src/etl/create_rulesguru_db.py
Empty file.
123 changes: 123 additions & 0 deletions src/etl/extract_rulesguru.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
# Import requests library
import requests
import urllib
import json
import time
from tqdm import tqdm
from pydantic import BaseModel, Field
from pathlib import Path
from src.objects import Document

class RulesGuru(BaseModel):
    """ETL helper for RulesGuru.net questions.

    Fetches question data from the RulesGuru API (one HTTP GET per id),
    caches the raw responses as JSON on disk, and flattens them into
    Document-shaped dicts for downstream use.
    """

    # Disk cache of raw API responses.
    path_raw_data: Path = Path('./data/raw/documents/rulesguru.json')
    # Output location for processed Document-shaped dicts.
    path_processed_data: Path = Path('./data/processed/documents/rulesguru.json')
    # Raw entries, each shaped {"question": {...}} (see full_load).
    data: list[dict] = Field(default_factory=list)
    documents: list[Document] = Field(default_factory=list)
    # API endpoint; query parameters are passed as URL-encoded JSON.
    api_url: str = "https://rulesguru.net/api/questions/"
    # Question ids to fetch; get_ids() parses them as ints (annotation fixed
    # from list[str] to match).
    ids: list[int] = Field(default_factory=list)
    path_ids: Path = Path("ids.txt")

    def post_model_load(self):
        # NOTE(review): this is not a pydantic v2 lifecycle hook — pydantic
        # calls `model_post_init`, so nothing invokes this automatically.
        # Confirm whether it should be renamed or is called manually.
        self.get_ids()

    def load(self):
        """Populate self.data: from the disk cache if present, else via the API."""
        if Path(self.path_raw_data).is_file():
            self.from_file()
            # TODO delta load
            # self.data.append(self.delta_load())
        else:
            self.full_load()
            self.to_file()

    def get_max_id_raw(self):
        """Return the highest question id in the cached raw data, or None if empty.

        The original returned the whole max entry (a dict), which made the
        `<` comparison in delta_load a TypeError; return the id itself.
        """
        if not self.data:
            return None
        # NOTE(review): entries built by full_load nest the payload under
        # "question"; a top-level 'id' key matches the original code but may
        # need to be entry['question']['id'] — confirm against the cache file.
        return max(entry['id'] for entry in self.data)

    def get_max_id_source(self):
        # TODO: query the API for the highest question id currently available.
        pass

    def delta_load(self):
        """Fetch only questions newer than the cache. (Stub — see TODOs.)"""
        max_source = self.get_max_id_source()
        if max_source is None:
            # Source maximum unknown; nothing we can compare against yet.
            return
        if self.get_max_id_raw() < max_source:
            # TODO: fetch only the missing id range. The original recursed
            # into delta_load() here, which would never terminate.
            self.full_load()

    def get_ids(self):
        """Read question ids (one per line) from path_ids into self.ids as ints."""
        with open(self.path_ids) as f:
            self.ids = [int(line) for line in f.read().splitlines()]

    def _get_query_params(self, id):
        """Return the URL-encoded JSON query payload for a single question id."""
        query_params = json.dumps({"id": id})
        return urllib.parse.quote(query_params, safe="")

    def full_load(self):
        """Download every question in self.ids and append it to self.data.

        Failed ids are skipped (best-effort) but logged instead of silently
        swallowed. Note: the original class defined full_load twice; the dead
        first stub has been removed, and the survivor's references to the
        undefined names `api_url` and `self.questions` are fixed to
        `self.api_url` and `self.data`.
        """
        for question_id in tqdm(self.ids):
            try:
                time.sleep(2)  # throttle: be polite to the API
                response = requests.get(
                    self.api_url + "?json=" + self._get_query_params(question_id),
                    timeout=3600,
                )
                if response.status_code == 200:
                    payload = response.json()
                    self.data.append({"question": payload["questions"][0]})
                else:
                    # Print the status code and the reason
                    print(response.status_code, response.reason)
            except Exception as exc:
                # Best-effort: keep going on per-id failures, but surface them.
                print(f"failed to fetch id {question_id}: {exc}")

    def process_data(self):
        """Flatten raw question entries into Document-shaped dicts and persist them."""
        for entry in self.data:
            question = entry["question"]
            document = {  # renamed from `dict`, which shadowed the builtin
                "name": f"RulesGuru.net Question-ID: {question['id']}",
                "text": question['questionSimple'],
                "url": question['url'],
                "metadata": {
                    "level": question['level'],
                    "complexity": question['complexity'],
                    "includedCards": [card['name'] for card in question['includedCards']],
                },
                "keywords": question['tags'],
            }
            self.documents.append(document)
        # Write once after the loop rather than rewriting the file per entry.
        self.to_file(self.documents, self.path_processed_data)

    def from_file(self):
        """Load cached raw data from path_raw_data into self.data."""
        with open(self.path_raw_data, "r", encoding="utf-8") as file:
            self.data = json.load(file)

    def to_file(self, data=None, path=None):
        """Write `data` (default: self.data) as JSON to `path` (default: path_raw_data).

        Defaults added: the original signature required both arguments, yet
        load() calls to_file() with none, which raised a TypeError.
        """
        if not data:
            data = self.data
        if path is None:
            path = self.path_raw_data
        # Open the file in write mode and dump the data as JSON.
        with open(path, "w", encoding="utf-8") as file:
            json.dump(data, file)
Loading

0 comments on commit d830040

Please sign in to comment.