Skip to content

Commit

Permalink
Gidim/ load dataset from file (#289)
Browse files Browse the repository at this point in the history
* updated logo to opik

* added method to load dataset from JSONL

* Revert "updated logo to opik"

This reverts commit 6b5212c.

* fixed path

* fixed json reading

* added to docs

* fixed formatting

* docs fix

* moved some logic to converters and switched to unit test

* fixed docs

* linting

* Remove mutable default values from converters.from_jsonl_file function

* Fix lint errors

---------

Co-authored-by: Gideon Mendels <[email protected]>
Co-authored-by: Aliaksandr Kuzmik <[email protected]>
  • Loading branch information
3 people authored Sep 21, 2024
1 parent 539b11d commit 9cffebb
Show file tree
Hide file tree
Showing 4 changed files with 129 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,27 @@ dataset.insert([
```
:::


You can also insert items from a JSONL file:

```python
dataset.read_jsonl_from_file("path/to/file.jsonl")
```
The format of the JSONL file should be a JSON object per line. For example:

```
{"input": {"user_question": "Hello, world!"}}
{"input": {"user_question": "What is the capital of France?"}, "expected_output": {"assistant_answer": "Paris"}}
```


Once the items have been inserted, you can view them in the Opik UI:

![Opik Dataset](/img/evaluation/dataset_items_page.png)




### Deleting items

You can delete items in a dataset by using the `delete` method:
Expand Down
14 changes: 14 additions & 0 deletions sdks/python/src/opik/api_objects/dataset/converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,20 @@ def to_pandas(
return pd.DataFrame(new_item_dicts)


def from_jsonl_file(
    file_path: str, keys_mapping: Dict[str, str], ignore_keys: List[str]
) -> List[dataset_item.DatasetItem]:
    """
    Read a JSONL file (one JSON value per line) and convert it via ``from_json``.

    Lines that are empty or contain only whitespace are skipped. The remaining
    lines are parsed individually, collected into a list, serialized as a single
    JSON array and handed to ``from_json`` together with ``keys_mapping`` and
    ``ignore_keys``.

    Args:
        file_path: Path to the JSONL file. It is decoded as UTF-8; a leading
            byte order mark, if present, is ignored.
        keys_mapping: Passed through unchanged to ``from_json``.
        ignore_keys: Passed through unchanged to ``from_json``.

    Returns:
        Whatever ``from_json`` returns for the assembled JSON array.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON. The message
            names the file and the 1-based line number of the offending line.
    """
    items = []
    # "utf-8-sig" reads plain UTF-8 exactly like "utf-8" but also drops a
    # leading BOM, which json.loads would otherwise reject on the first line.
    with open(file_path, "r", encoding="utf-8-sig") as file:
        for line_number, line in enumerate(file, start=1):
            json_object = line.strip()
            if not json_object:  # Skip empty lines
                continue
            try:
                items.append(json.loads(json_object))
            except json.JSONDecodeError as error:
                # Same exception type as before, but with file/line context so
                # the bad record can be located in a large file.
                raise json.JSONDecodeError(
                    f"Invalid JSON in {file_path!r} at line {line_number}: "
                    f"{error.msg}",
                    error.doc,
                    error.pos,
                ) from error

    # from_json expects a JSON string, so the parsed lines are re-serialized
    # as one array.
    json_str = json.dumps(items)
    return from_json(json_str, keys_mapping, ignore_keys)


def from_pandas(
dataframe: pd.DataFrame,
keys_mapping: Dict[str, str],
Expand Down
21 changes: 21 additions & 0 deletions sdks/python/src/opik/api_objects/dataset/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,27 @@ def insert_from_json(

self.insert(new_items)

def read_jsonl_from_file(
    self,
    file_path: str,
    keys_mapping: Optional[Dict[str, str]] = None,
    ignore_keys: Optional[List[str]] = None,
) -> None:
    """
    Load items from a JSONL file and insert them into the dataset.

    Args:
        file_path: Path to the JSONL file
        keys_mapping: optional dictionary mapping json keys to item field names,
            e.g. {'Expected output': 'expected_output'}. An empty mapping is
            used when omitted.
        ignore_keys: optional list of json keys that are not needed for
            DatasetItem construction. An empty list is used when omitted.
    """
    # Substitute fresh empty containers for omitted arguments before
    # delegating to the converter.
    if keys_mapping is None:
        keys_mapping = {}
    if ignore_keys is None:
        ignore_keys = []

    self.insert(converters.from_jsonl_file(file_path, keys_mapping, ignore_keys))

def insert_from_pandas(
self,
dataframe: pandas.DataFrame,
Expand Down
78 changes: 78 additions & 0 deletions sdks/python/tests/unit/api_objects/dataset/test_converters.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import pandas as pd
import pandas.testing
import json
import tempfile
import os

from opik.api_objects.dataset import converters
from opik import DatasetItem
Expand Down Expand Up @@ -360,3 +362,79 @@ def test_to_json__with_keys_mapping__happyflow():
)

assert json.loads(actual_json) == json.loads(EXPECTED_JSON)


def test_from_jsonl_file__happyflow():
    # Two JSONL records preceded by a blank line, spelled out line by line.
    jsonl_content = (
        "\n"
        '{"input": {"user_question": "What is the capital of France?"}, "expected_output": {"assistant_answer": "The capital of France is Paris."}}\n'
        '{"input": {"user_question": "How many planets are in our solar system?"}, "expected_output": {"assistant_answer": "There are 8 planets in our solar system: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, and Neptune."}}\n'
    )

    # delete=False keeps the file on disk after the handle is closed so the
    # converter can reopen it by path; it is removed in the finally clause.
    with tempfile.NamedTemporaryFile(mode="w", delete=False) as handle:
        handle.write(jsonl_content)
        temp_file_path = handle.name

    try:
        items = converters.from_jsonl_file(
            temp_file_path, keys_mapping={}, ignore_keys=[]
        )

        first = items[0]
        assert first.input == {"user_question": "What is the capital of France?"}
        assert first.expected_output == {
            "assistant_answer": "The capital of France is Paris."
        }

        second = items[1]
        assert second.input == {
            "user_question": "How many planets are in our solar system?"
        }
        assert second.expected_output == {
            "assistant_answer": "There are 8 planets in our solar system: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, and Neptune."
        }
    finally:
        os.unlink(temp_file_path)


def test_from_jsonl_file__empty_file():
    # Create a zero-byte file; delete=False leaves it on disk once the handle
    # closes so it can be reopened by path and removed explicitly afterwards.
    with tempfile.NamedTemporaryFile(mode="w", delete=False) as empty_file:
        empty_path = empty_file.name

    try:
        items = converters.from_jsonl_file(
            empty_path, keys_mapping={}, ignore_keys=[]
        )
        # An empty file must still produce a list, just with nothing in it.
        assert isinstance(items, list)
        assert len(items) == 0
    finally:
        os.unlink(empty_path)


def test_from_jsonl_file__file_with_empty_lines():
    # Blank and whitespace-only lines are placed before, between and after the
    # records so that the skipping of empty lines is actually exercised.
    jsonl_content = (
        "\n"
        '{"input": {"user_question": "What is the capital of France?"}, "expected_output": {"assistant_answer": "The capital of France is Paris."}}\n'
        "\n"
        "   \n"
        '{"input": {"user_question": "How many planets are in our solar system?"}, "expected_output": {"assistant_answer": "There are 8 planets in our solar system: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, and Neptune."}}\n'
        "\n"
    )

    # The converter reads the file as UTF-8, so write it as UTF-8 too instead
    # of relying on the platform default. delete=False keeps the file on disk
    # after the handle closes; it is removed in the finally clause.
    with tempfile.NamedTemporaryFile(
        mode="w", delete=False, encoding="utf-8"
    ) as temp_file:
        temp_file.write(jsonl_content)
        temp_file_path = temp_file.name

    try:
        result = converters.from_jsonl_file(
            temp_file_path, keys_mapping={}, ignore_keys=[]
        )

        # Only the two real records may survive; blank lines yield no items.
        assert len(result) == 2

        assert result[0].input == {"user_question": "What is the capital of France?"}
        assert result[0].expected_output == {
            "assistant_answer": "The capital of France is Paris."
        }

        assert result[1].input == {
            "user_question": "How many planets are in our solar system?"
        }
        assert result[1].expected_output == {
            "assistant_answer": "There are 8 planets in our solar system: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, and Neptune."
        }
    finally:
        os.unlink(temp_file_path)

0 comments on commit 9cffebb

Please sign in to comment.