Merge pull request #9 from krflorian/feature/stackoverflow
Feature/stackoverflow
krflorian authored Mar 25, 2024
2 parents 460e8a6 + 2a873f2 commit 75fbcd4
Showing 22 changed files with 228,906 additions and 206 deletions.
19 changes: 17 additions & 2 deletions README.md
@@ -21,12 +21,27 @@ At the moment there are two Vector Databases that have to be filled before the d
- includes data relevant for understanding the game:
- [Comprehensive Rulebook](https://magic.wizards.com/en/rules)
- [Data about Keywords](https://en.wikipedia.org/wiki/List_of_Magic:_The_Gathering_keywords)
- Question-answer pairs from [RulesGuru](https://rulesguru.net/)
- Question-answer pairs from [Stack Exchange](https://boardgames.stackexchange.com/questions/tagged/magic-the-gathering) (see the `stackapi` sketch below the list)

2. Cards DB
- includes all mtg card data from [scryfall](https://scryfall.com/docs/api/bulk-data)
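
As a reference for the new Stack Exchange source, the question-answer pairs can be fetched with the `stackapi` package this PR adds to `pyproject.toml`. A minimal sketch (site and tag taken from the link above; the real logic lives in `StackExchangeExtractor`):

```python
from stackapi import StackAPI

# boardgames.stackexchange.com, questions tagged "magic-the-gathering"
site = StackAPI("boardgames")
response = site.fetch(
    "questions",
    tagged="magic-the-gathering",
    filter="withbody",  # ask the API to include question bodies
)
for question in response["items"]:
    print(question["title"])
```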

-To fill the database there are scripts in the folder `src/etl` every script beginning with `extract_` will create data as json files that can later be vectorized and inserted in the corresponding database. The scripts beginning with `create_` will do exactly that. For vectorizing the data at the moment we are using the opensouce model [gte-large](https://huggingface.co/thenlper/gte-large) from huggingface.
+To fill the databases there are scripts in the folder `src/etl`. Every script beginning with `create_` will create data as JSON files that can then be vectorized and inserted into the corresponding database. For vectorizing the data we currently use the open-source model [gte-large](https://huggingface.co/thenlper/gte-large) from Hugging Face.
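
For illustration, vectorizing a few documents with gte-large looks roughly like this (a minimal sketch; the actual scripts load the model from a local copy under `data/models/` and insert the vectors into the `VectorDB`):

```python
from sentence_transformers import SentenceTransformer

# gte-large produces 1024-dimensional sentence embeddings
model = SentenceTransformer("thenlper/gte-large")

texts = [
    "702.9. Flying",
    "Can a creature with flying be blocked by a creature with reach?",
]
embeddings = model.encode(texts, show_progress_bar=True)
print(embeddings.shape)  # (2, 1024)
```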

To speed up vectorization, the model can be placed on a GPU. For a CUDA-enabled conda environment, see the RAPIDS install guide at https://docs.rapids.ai/install:
```shell
conda create --solver=libmamba -n rapids-24.04 -c rapidsai-nightly -c conda-forge -c nvidia \
python=3.11 cuda-version=12.0 \
pytorch
conda init
conda activate rapids-24.04

poetry shell
python src/etl/create_card_db.py
```
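
Once the environment is active, the model can be loaded directly onto the GPU. A minimal sketch (it assumes a CUDA-capable card is visible to PyTorch):

```python
import torch
from sentence_transformers import SentenceTransformer

# use the GPU when one is available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer("../data/models/gte-large", device=device)
print(model.device)  # e.g. cuda:0
```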

## Development

19 changes: 0 additions & 19 deletions get_data.py

This file was deleted.

28 changes: 16 additions & 12 deletions poetry.lock

Some generated files are not rendered by default.

3 changes: 2 additions & 1 deletion pyproject.toml
@@ -5,7 +5,7 @@ description = "rest service for mtg data"
authors = ["dataflo <[email protected]>"]
license = "mit"
readme = "README.md"
-packages = [{ include = "src" }]
+packages = [{ from = "src", include = "*" }]

[tool.poetry.dependencies]
python = "^3.11"
@@ -16,6 +16,7 @@ fastapi = "^0.105.0"
matplotlib = "^3.8.2"
wikipedia = "^1.4.0"
ipykernel = "^6.29.2"
stackapi = "^0.3.0"


[tool.poetry.group.dev.dependencies]
3 changes: 0 additions & 3 deletions src/etl/__init__.py
@@ -1,3 +0,0 @@
from .extract_rulesguru import RulesGuru
from .extract_rules import Rules
from .load import RulesDB
146 changes: 124 additions & 22 deletions src/etl/create_card_db.py
@@ -1,10 +1,78 @@
# %%
import requests
import json
from tqdm import tqdm
from pathlib import Path
from sentence_transformers import SentenceTransformer

-from src.vector_db import VectorDB
-from src.objects import Card, Document
+from logging_utils import get_logger
+from vector_db import VectorDB
+from objects import Card, Document

BLOCKED_CARD_TYPES = ["Card", "Stickers", "Hero"]
NORMAL_CARD_TYPES = [
"saga",
"case",
"adventure",
"prototype",
"augment",
"mutate",
"leveler",
"class",
"host",
"normal",
]
DOUBLE_FACED = ["transform", "card_faces", "flip", "split"]

logger = get_logger()


def download_card_data(
    lookup_url: str = "https://api.scryfall.com/bulk-data",
) -> list[dict]:
    # download bulk data info
    bulk_requests_info = requests.get(lookup_url)
    bulk_requests_info = bulk_requests_info.json()

    # download cards data
    oracle_card_info = [
        info for info in bulk_requests_info["data"] if info["type"] == "oracle_cards"
    ][0]
    oracle_cards_url = oracle_card_info["download_uri"]
    oracle_card_data = requests.get(oracle_cards_url)
    oracle_card_data = oracle_card_data.json()

    # download rulings
    rulings_info = [
        info for info in bulk_requests_info["data"] if info["type"] == "rulings"
    ][0]
    rulings_info_url = rulings_info["download_uri"]
    rulings_data = requests.get(rulings_info_url)
    rulings_data = rulings_data.json()

    # combine cards with their rulings, keyed by oracle_id
    idx_2_card_data = {
        card_data["oracle_id"]: card_data for card_data in oracle_card_data
    }

    for ruling in tqdm(rulings_data):
        oracle_id = ruling["oracle_id"]
        if oracle_id not in idx_2_card_data:
            continue  # ruling refers to a card not in the oracle bulk data
        if "rulings" not in idx_2_card_data[oracle_id]:
            idx_2_card_data[oracle_id]["rulings"] = []
        idx_2_card_data[oracle_id]["rulings"].append(ruling["comment"])

    # drop blocked card types and unsupported layouts
    data = [
        card
        for card in idx_2_card_data.values()
        if (card["type_line"] not in BLOCKED_CARD_TYPES)
        and ((card["layout"] in NORMAL_CARD_TYPES) or (card["layout"] in DOUBLE_FACED))
    ]

    logger.info(f"saving {len(data)} raw card entries")
    return data


def parse_card_data(data: list[dict], keywords: list[str]) -> list[Card]:
@@ -25,27 +93,52 @@ def parse_card_data(data: list[dict], keywords: list[str]) -> list[Card]:
            )
            for idx, text in enumerate(rules)
        ]
-        cards.append(
-            Card(
-                _id=card_data.get("id"),
-                name=card_data.get("name"),
-                mana_cost=card_data.get("mana_cost"),
-                type=card_data.get("type_line"),
-                power=card_data.get("power", "0"),
-                toughness=card_data.get("toughness", "0"),
-                oracle=card_data.get("oracle_text", ""),
-                price=card_data.get("prices", {}).get("eur", 0.0) or 0.0,
-                color_identity=card_data.get("color_identity", []),
-                keywords=card_data.get("keywords", []),
-                legalities=card_data.get("legalities", {}),
-                url=card_data.get("related_uris", {}).get(
-                    "gatherer", card_data.get("image_uris", {}).get("large")
-                ),
-                rulings=rules,
-            )
-        )
+        url = card_data.get("related_uris", {}).get(
+            "gatherer", card_data.get("image_uris", {}).get("large")
+        )
+        if url is None:
+            url = card_data.get("related_uris", {}).get("edhrec")
+
+        if card_data.get("layout") in NORMAL_CARD_TYPES:
+            cards.append(
+                Card(
+                    _id=card_data.get("id"),
+                    name=card_data.get("name"),
+                    mana_cost=card_data.get("mana_cost"),
+                    type=card_data.get("type_line"),
+                    power=card_data.get("power", "0"),
+                    toughness=card_data.get("toughness", "0"),
+                    oracle=card_data.get("oracle_text", ""),
+                    price=card_data.get("prices", {}).get("eur", 0.0) or 0.0,
+                    color_identity=card_data.get("color_identity", []),
+                    keywords=card_data.get("keywords", []),
+                    legalities=card_data.get("legalities", {}),
+                    url=url,
+                    rulings=rules,
+                )
+            )
+        elif card_data.get("layout") in DOUBLE_FACED:
+            # one Card per face; price, power/toughness and legalities come from the parent card
+            for card_face in card_data.get("card_faces", []):
+                cards.append(
+                    Card(
+                        _id=card_data.get("id"),
+                        name=card_face.get("name"),
+                        mana_cost=card_face.get("mana_cost"),
+                        type=card_face.get("type_line"),
+                        power=card_data.get("power", "0"),
+                        toughness=card_data.get("toughness", "0"),
+                        oracle=card_face.get("oracle_text", ""),
+                        price=card_data.get("prices", {}).get("eur", 0.0) or 0.0,
+                        color_identity=card_data.get("color_identity", []),
+                        keywords=card_data.get("keywords", []),
+                        legalities=card_data.get("legalities", {}),
+                        url=url,
+                        rulings=rules,
+                    )
+                )

-    print(f"parsed {len(cards)} cards")
+    logger.info(f"parsed {len(cards)} cards")
    return cards


@@ -58,6 +151,7 @@ def create_card_db(cards: list[Card], model: SentenceTransformer) -> VectorDB:
        texts.append(card.name)
        cards_in_db.append(card)

    logger.info(f"creating vector db with {len(cards)} cards")
    card_db = VectorDB(
        texts=texts,
        data=cards_in_db,
@@ -73,10 +167,17 @@ def create_card_db(cards: list[Card], model: SentenceTransformer) -> VectorDB:
DATA_PATH = Path("../data")
ARTIFACT_PATH = DATA_PATH / "artifacts"
ALL_CARDS_FILE = DATA_PATH / "etl/raw/cards/scryfall_all_cards_with_rulings.json"
-KEYWORD_FILE = DATA_PATH / "etl/raw/keyword_list.json"
+KEYWORD_FILE = DATA_PATH / "etl/raw/documents/keyword_list.json"

# load card data
data = download_card_data()
# save data
with ALL_CARDS_FILE.open("w", encoding="utf-8") as outfile:
    json.dump(data, outfile, ensure_ascii=False)

# load model
model = SentenceTransformer("../data/models/gte-large")
logger.info(f"loaded sentence transformer on device: {model.device}")

# load data
with ALL_CARDS_FILE.open("r", encoding="utf-8") as infile:
@@ -100,3 +201,4 @@ def create_card_db(cards: list[Card], model: SentenceTransformer) -> VectorDB:

# save
card_db.dump(ARTIFACT_PATH / f"{db_name}.p")
logger.info(f"created card db with {len(cards)} cards")
64 changes: 64 additions & 0 deletions src/etl/create_rules_db.py
@@ -0,0 +1,64 @@
# %%
from src.etl.extractors import (
    RulesGuruExtractor,
    ComprehensiveRulesExtractor,
    StackExchangeExtractor,
    WikipediaExtractor,
)
from src.etl.loaders import DocumentLoader
from pathlib import Path
import logging


logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)

# %%
# setup extractors
DATA_RAW = Path("../data/etl/raw/documents")
DATA_PROCESSED = Path("../data/etl/processed/documents")

rules_guru = RulesGuruExtractor(
    path_data_raw=DATA_RAW / "rulesguru.json",
    path_data_processed=DATA_PROCESSED / "rulesguru.json",
)
comprehensive_rules = ComprehensiveRulesExtractor(
    path_data_raw=DATA_RAW / "comprehensive_rules.txt",
    path_data_processed=DATA_PROCESSED / "comprehensive_rules.json",
)

stack_exchange = StackExchangeExtractor(
    path_data_raw=DATA_RAW / "stackexchange.json",
    path_data_processed=DATA_PROCESSED / "stackexchange.json",
)

wikipedia = WikipediaExtractor(
    path_data_raw=DATA_RAW / "wikipedia.txt",
    path_data_processed=DATA_PROCESSED / "wikipedia.json",
)


extractors = [rules_guru, comprehensive_rules, stack_exchange, wikipedia]

# extractors = [rules_guru]

# %%
# fire extractors

for extractor in extractors:
    extractor.get_data_raw()
    extractor.get_data_processed()


# %%
# setup rules db

rules_db = DocumentLoader(
    path_data_processed=DATA_PROCESSED,
    path_database=Path("../data/artifacts/rules_db_gte.p"),
)
rules_db.load_data()

# %%
