diff --git a/.github/workflows/documentation_codeblock_tests.yml b/.github/workflows/documentation_codeblock_tests.yml index 679f240291..acb4fa23ea 100644 --- a/.github/workflows/documentation_codeblock_tests.yml +++ b/.github/workflows/documentation_codeblock_tests.yml @@ -24,20 +24,20 @@ jobs: # Get list of changed files in docs directory if [[ "${{ github.event_name }}" == "pull_request" ]]; then # For pull requests, compare with base branch - echo "paths=$( - git diff --name-only origin/${{ github.base_ref }} | - grep -E '^apps/opik-documentation/documentation/docs/.*\.(md|mdx)$' | - sed 's|apps/opik-documentation/documentation/||' | - jq -R -s -c 'split("\n")[:-1]' - )" >> $GITHUB_OUTPUT + CHANGED_FILES=$(git diff --name-only origin/${{ github.base_ref }} | grep -E '^apps/opik-documentation/documentation/fern/docs/.*\.(md|mdx)$' || true) + if [ -n "$CHANGED_FILES" ]; then + echo "paths=$(echo "$CHANGED_FILES" | sed 's|apps/opik-documentation/documentation/||' | jq -R -s 'split("\n")[:-1]' -c)" >> $GITHUB_OUTPUT + else + echo "paths=[]" >> $GITHUB_OUTPUT + fi else - # For manual runs and scheduled runs, check all files - echo "paths=$( - ( - ls -d docs/*/ 2>/dev/null; - find docs -maxdepth 1 -type f -name "*.md" -o -name "*.mdx" - ) | jq -R -s -c 'split("\n")[:-1]' - )" >> $GITHUB_OUTPUT + # For manual runs, get all md/mdx files + FILES=$(find fern/docs -type f \( -name "*.md" -o -name "*.mdx" \)) + if [ -n "$FILES" ]; then + echo "paths=$(echo "$FILES" | jq -R -s 'split("\n")[:-1]' -c)" >> $GITHUB_OUTPUT + else + echo "paths=[]" >> $GITHUB_OUTPUT + fi fi test: diff --git a/apps/opik-documentation/documentation/docs/cookbook/instructor.ipynb b/apps/opik-documentation/documentation/docs/cookbook/instructor.ipynb index 7512964544..fc50a66c27 100644 --- a/apps/opik-documentation/documentation/docs/cookbook/instructor.ipynb +++ b/apps/opik-documentation/documentation/docs/cookbook/instructor.ipynb @@ -26,7 +26,7 @@ "metadata": {}, "outputs": [], "source": [ - "%pip install --upgrade --quiet opik instructor" + "%pip install --upgrade --quiet opik instructor anthropic google-generativeai google-genai" ] }, { diff --git a/apps/opik-documentation/documentation/fern/docs.yml b/apps/opik-documentation/documentation/fern/docs.yml index 12dc4694c9..aad4200362 100644 --- a/apps/opik-documentation/documentation/fern/docs.yml +++ b/apps/opik-documentation/documentation/fern/docs.yml @@ -78,6 +78,9 @@ navigation: - page: Log traces path: docs/tracing/log_traces.mdx slug: log_traces + - page: Log conversations + path: docs/tracing/log_chat_conversations.mdx + slug: log_chat_conversations - page: Log agents path: docs/tracing/log_agents.mdx slug: log_agents diff --git a/apps/opik-documentation/documentation/fern/docs/changelog/2025-03-03.mdx b/apps/opik-documentation/documentation/fern/docs/changelog/2025-03-03.mdx new file mode 100644 index 0000000000..6a988bc99e --- /dev/null +++ b/apps/opik-documentation/documentation/fern/docs/changelog/2025-03-03.mdx @@ -0,0 +1,18 @@ +**Opik Dashboard**: + +- Chat conversations can now be reviewed in the platform + + + + + +- Added the ability to leave comments on experiments +- You can now leave reasons on feedback scores, see [Annotating Traces](/tracing/annotate_traces) +- Added support for Gemini in the playground +- A thumbs up / down feedback score definition is now added to all projects by default to make it easier + to annotate traces. 
+

**JS / TS SDK**:

- The AnswerRelevanceMetric can now be run without providing a context field
- Made some updates to how metrics are uploaded to optimize data ingestion
diff --git a/apps/opik-documentation/documentation/fern/docs/cookbook/anthropic.mdx b/apps/opik-documentation/documentation/fern/docs/cookbook/anthropic.mdx
index d9bdfd2446..a58446090a 100644
--- a/apps/opik-documentation/documentation/fern/docs/cookbook/anthropic.mdx
+++ b/apps/opik-documentation/documentation/fern/docs/cookbook/anthropic.mdx
@@ -46,7 +46,9 @@ import os
 from opik.integrations.anthropic import track_anthropic
 
 anthropic_client = anthropic.Anthropic()
-anthropic_client = track_anthropic(anthropic_client, project_name="anthropic-integration-demo")
+anthropic_client = track_anthropic(
+    anthropic_client, project_name="anthropic-integration-demo"
+)
 ```
 
 
diff --git a/apps/opik-documentation/documentation/fern/docs/cookbook/gemini.mdx b/apps/opik-documentation/documentation/fern/docs/cookbook/gemini.mdx
index 6761faf80e..9391522dcd 100644
--- a/apps/opik-documentation/documentation/fern/docs/cookbook/gemini.mdx
+++ b/apps/opik-documentation/documentation/fern/docs/cookbook/gemini.mdx
@@ -10,7 +10,7 @@ Opik integrates with Gemini to provide a simple way to log traces for all Gemini
 
 
 ```python
-%pip install --upgrade opik google-genai litellm
+%pip install --upgrade opik google-genai litellm
 ```
 
 
@@ -28,45 +28,34 @@ First, we will set up our Gemini API key.
 
 
 ```python
 import os
 import getpass
-import google.generativeai as genai
 
-if "GEMINI_API_KEY" not in os.environ:
-    genai.configure(api_key=getpass.getpass("Enter your Gemini API key: "))
+if "GOOGLE_API_KEY" not in os.environ:
+    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Gemini API key: ")
 ```
 
-## Configure LiteLLM
+## Logging traces
 
-Add the LiteLLM OpikTracker to log traces and steps to Opik:
+Now each completion will log a separate trace to Opik:
 
 
 ```python
-import litellm
-import os
-from litellm.integrations.opik.opik import OpikLogger
+from google import genai
 from opik import track
-from opik.opik_context import get_current_span_data
+from opik.integrations.genai import track_genai
 
 os.environ["OPIK_PROJECT_NAME"] = "gemini-integration-demo"
 
-opik_logger = OpikLogger()
-litellm.callbacks = [opik_logger]
-```
-
-## Logging traces
-
-Now each completion will logs a separate trace to LiteLLM:
+client = genai.Client()
+gemini_client = track_genai(client)
 
-
-```python
 prompt = """
 Write a short two sentence story about Opik.
 """
 
-response = litellm.completion(
-    model="gemini/gemini-pro",
-    messages=[{"role": "user", "content": prompt}],
+response = gemini_client.models.generate_content(
+    model="gemini-2.0-flash-001", contents=prompt
 )
-
-print(response.choices[0].message.content)
+print(response.text)
 ```
 
 The prompt and response messages are automatically logged to Opik and can be viewed in the UI.
 
@@ -81,31 +70,19 @@ If you have multiple steps in your LLM pipeline, you can use the `track` decorat
 
 
 ```python
 @track
 def generate_story(prompt):
-    response = litellm.completion(
-        model="gemini/gemini-pro",
-        messages=[{"role": "user", "content": prompt}],
-        metadata={
-            "opik": {
-                "current_span_data": get_current_span_data(),
-            },
-        },
+    response = gemini_client.models.generate_content(
+        model="gemini-2.0-flash-001", contents=prompt
     )
-    return response.choices[0].message.content
+    return response.text
 
 
 @track
 def generate_topic():
     prompt = "Generate a topic for a story about Opik."
-    response = litellm.completion(
-        model="gemini/gemini-pro",
-        messages=[{"role": "user", "content": prompt}],
-        metadata={
-            "opik": {
-                "current_span_data": get_current_span_data(),
-            },
-        },
+    response = gemini_client.models.generate_content(
+        model="gemini-2.0-flash-001", contents=prompt
     )
-    return response.choices[0].message.content
+    return response.text
 
 
 @track
@@ -121,5 +98,3 @@ generate_opik_story()
 The trace can now be viewed in the UI:
 
 ![Gemini Cookbook](https://raw.githubusercontent.com/comet-ml/opik/main/apps/opik-documentation/documentation/fern/img/cookbook/gemini_trace_decorator_cookbook.png)
-
-
diff --git a/apps/opik-documentation/documentation/fern/docs/cookbook/instructor.mdx b/apps/opik-documentation/documentation/fern/docs/cookbook/instructor.mdx
index 372468ac99..75f3d3fb87 100644
--- a/apps/opik-documentation/documentation/fern/docs/cookbook/instructor.mdx
+++ b/apps/opik-documentation/documentation/fern/docs/cookbook/instructor.mdx
@@ -10,7 +10,7 @@
 
 
 ```python
-%pip install --upgrade --quiet opik instructor
+%pip install --upgrade --quiet opik instructor anthropic google-generativeai google-genai
 ```
 
 
diff --git a/apps/opik-documentation/documentation/fern/docs/cookbook/quickstart_notebook.mdx b/apps/opik-documentation/documentation/fern/docs/cookbook/quickstart_notebook.mdx
index 3725159027..b164510cc8 100644
--- a/apps/opik-documentation/documentation/fern/docs/cookbook/quickstart_notebook.mdx
+++ b/apps/opik-documentation/documentation/fern/docs/cookbook/quickstart_notebook.mdx
@@ -433,8 +433,6 @@ We can now use the `evaluate` method to evaluate the summaries in our dataset:
 ```python
 from opik.evaluation import evaluate
 
-os.environ["OPIK_PROJECT_NAME"] = "summary-evaluation-prompts"
-
 MODEL = "gpt-4o-mini"
 
 DENSITY_ITERATIONS = 2
 
@@ -490,8 +488,6 @@ Guidelines:
 ```python
 from opik.evaluation import evaluate
 
-os.environ["OPIK_PROJECT_NAME"] = "summary-evaluation-prompts"
-
 MODEL = "gpt-4o-mini"
 
 DENSITY_ITERATIONS = 2
 
diff --git a/apps/opik-documentation/documentation/fern/docs/tracing/annotate_traces.mdx b/apps/opik-documentation/documentation/fern/docs/tracing/annotate_traces.mdx
index 6c966b191b..085d02fcb0 100644
--- a/apps/opik-documentation/documentation/fern/docs/tracing/annotate_traces.mdx
+++ b/apps/opik-documentation/documentation/fern/docs/tracing/annotate_traces.mdx
@@ -1,4 +1,5 @@
-Annotating traces is a crucial aspect of evaluating and improving your LLM-based applications. By systematically recording qualitative or quantitative feedback on specific interactions or entire conversation flows, you can:
+Annotating traces is a crucial aspect of evaluating and improving your LLM-based applications. By systematically recording qualitative or quantitative
+feedback on specific interactions or entire conversation flows, you can:
 
 1. Track performance over time
 2. Identify areas for improvement
@@ -10,7 +11,8 @@ Opik allows you to annotate traces through the SDK or the UI.
 
 ## Annotating Traces through the UI
 
-To annotate traces through the UI, you can navigate to the trace you want to annotate in the traces page and click on the `Annotate` button. This will open a sidebar where you can add annotations to the trace.
+To annotate traces through the UI, you can navigate to the trace you want to annotate on the traces page and click on the `Annotate` button.
+This will open a sidebar where you can add annotations to the trace.
 
 You can annotate both traces and spans through the UI; make sure you have selected the correct span in the sidebar.
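
The hunks that follow reference `client.log_spans_feedback_scores` and the optional `reason` field on `FeedbackScoreDict`. As a quick illustration (not part of the diff): a minimal sketch of that call, assuming the dict-based score shape used in the docs; the span ID and score name below are hypothetical placeholders.

```python
import opik

client = opik.Opik()

# Log a score against an existing span; `reason` is the optional
# human-readable explanation described in the updated docs below.
client.log_spans_feedback_scores(
    scores=[
        {
            "id": "0194fdc2-fa2f-4cc0-81d3-ff12045b73c8",  # hypothetical span ID
            "name": "overall_quality",  # hypothetical score name
            "value": 0.85,
            "reason": "Accurate answer, but more verbose than needed.",
        }
    ],
)
```

In the UI, the `reason` is shown alongside the score, which is what the changelog entry about leaving reasons on feedback scores refers to.
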
@@ -19,10 +21,12 @@ You can annotate both traces and spans through the UI, make sure you have select
 
 
 
-  In order to ensure a consistent set of feedback, you will need to define feedback definitions in the `Feedback
-  Definitions` page which supports both numerical and categorical annotations.
+  Once a feedback score has been provided, you can also add a reason to explain why that particular score was given.
+  This is useful for adding additional context to the score.
 
 
+You can also add comments to traces and experiments to share insights with other team members.
+
 ## Online evaluation
 
 You don't need to manually annotate each trace to measure the performance of your LLM applications! By using Opik's [online evaluation feature](/production/rules), you can define LLM as a Judge metrics that will automatically score all, or a subset, of your production traces.
@@ -77,9 +81,10 @@ client.log_spans_feedback_scores(
 )
 ```
 
-:::note
-The `FeedbackScoreDict` class supports an optional `reason` field that can be used to provide a human-readable explanation for the feedback score.
-:::
+
+  The `FeedbackScoreDict` class supports an optional `reason` field that can be used to provide a human-readable
+  explanation for the feedback score.
+
 
 ### Using Opik's built-in evaluation metrics
 
@@ -90,7 +95,7 @@ Opik's built-in evaluation metrics are broken down into two main categories:
 
 1. Heuristic metrics
 2. LLM as a judge metrics
 
-### Heuristic Metrics
+#### Heuristic Metrics
 
 Heuristic metrics use rule-based or statistical methods to evaluate the output of LLM models.
 
@@ -118,7 +123,7 @@ score = metric.score(
 )
 ```
 
-### LLM as a Judge Metrics
+#### LLM as a Judge Metrics
 
 For LLM outputs that cannot be evaluated using heuristic metrics, you can use LLM as a judge metrics. These metrics are based on the idea of using an LLM to evaluate the output of another LLM.
diff --git a/apps/opik-documentation/documentation/fern/docs/tracing/log_chat_conversations.mdx b/apps/opik-documentation/documentation/fern/docs/tracing/log_chat_conversations.mdx
new file mode 100644
index 0000000000..e683fc5417
--- /dev/null
+++ b/apps/opik-documentation/documentation/fern/docs/tracing/log_chat_conversations.mdx
@@ -0,0 +1,78 @@
+You can log chat conversations to the Opik platform and track the full conversations
+your users are having with your chatbot.
+
+
+
+
+## Logging conversations
+
+You can log chat conversations by specifying the `thread_id` parameter when using either the low-level SDK or
+Python decorators:
+
+
+  ```python
+  import opik
+  from opik import opik_context
+
+  @opik.track
+  def chat_message(input, thread_id):
+      opik_context.update_current_trace(
+          thread_id=thread_id
+      )
+      return "Opik is an Open Source GenAI platform"
+
+  thread_id = "f174a"
+  chat_message("What is Opik?", thread_id)
+  chat_message("Repeat the previous message", thread_id)
+  ```
+
+
+  ```python
+  import opik
+
+  opik_client = opik.Opik()
+
+  thread_id = "55d84"
+
+  # Log a first message
+  trace = opik_client.trace(
+      name="chat_conversation",
+      input="What is Opik?",
+      output="Opik is an Open Source GenAI platform",
+      thread_id=thread_id
+  )
+
+  # Log a second message
+  trace = opik_client.trace(
+      name="chat_conversation",
+      input="Can you track chat conversations in Opik?",
+      output="Yes, of course!",
+      thread_id=thread_id
+  )
+  ```
+
+
+
+
+  The input to each trace will be displayed as the user message, while the output will be displayed as the AI assistant
+  response.
+
+
+## Reviewing conversations
+
+Conversations can be viewed at the project level in the `threads` tab. All conversations are tracked, and by clicking on a thread ID you can
+view the full conversation.
+
+The thread view supports markdown, making it easier for you to review the content that was returned to the user. If you would like to dig deeper,
+you can click on the `View trace` button to deep dive into how the AI assistant response was generated.
+
+By clicking on the thumbs up or thumbs down icons, you can quickly rate the AI assistant response. This feedback score will be logged and associated with
+the relevant trace. By switching to the trace view, you can review the full trace as well as add additional feedback scores through the annotation
+functionality.
+
+
+
+
diff --git a/apps/opik-documentation/documentation/fern/img/changelog/2025-03-03/chat_conversations.png b/apps/opik-documentation/documentation/fern/img/changelog/2025-03-03/chat_conversations.png
new file mode 100644
index 0000000000..cfa2057123
Binary files /dev/null and b/apps/opik-documentation/documentation/fern/img/changelog/2025-03-03/chat_conversations.png differ
diff --git a/apps/opik-documentation/documentation/fern/img/tracing/chat_conversations.png b/apps/opik-documentation/documentation/fern/img/tracing/chat_conversations.png
new file mode 100644
index 0000000000..cfa2057123
Binary files /dev/null and b/apps/opik-documentation/documentation/fern/img/tracing/chat_conversations.png differ
diff --git a/apps/opik-documentation/documentation/fern/img/tracing/chat_conversations_actions.png b/apps/opik-documentation/documentation/fern/img/tracing/chat_conversations_actions.png
new file mode 100644
index 0000000000..3f5c053d91
Binary files /dev/null and b/apps/opik-documentation/documentation/fern/img/tracing/chat_conversations_actions.png differ
diff --git a/apps/opik-documentation/documentation/pytest_codeblocks/evaluators/python_evaluator.py b/apps/opik-documentation/documentation/pytest_codeblocks/evaluators/python_evaluator.py
index fcdab829e4..0088b02636 100644
--- a/apps/opik-documentation/documentation/pytest_codeblocks/evaluators/python_evaluator.py
+++ b/apps/opik-documentation/documentation/pytest_codeblocks/evaluators/python_evaluator.py
@@ -1,6 +1,9 @@
 import os
 import subprocess
 import tempfile
+import logging
+
+LOGGER = logging.getLogger(__name__)
 
 
 class PythonEvaluator:
@@ -19,8 +22,16 @@ def evaluate(self):
         # Run the code in a subprocess
         with tempfile.TemporaryDirectory() as temp_dir:
             script_path = os.path.join(temp_dir, "script.py")
+
+            python_history = [
+                x["content"] for x in self.history if x["language"] == "python"
+            ]
+            bash_history = [
+                x["content"] for x in self.history if x["language"] == "bash"
+            ]
+
             with open(script_path, "w") as f:
-                f.write("\n".join([*self.history, self.code]))
+                f.write("\n".join([*python_history, self.code]))
 
             env = os.environ.copy()
             env.update(
@@ -32,6 +43,9 @@ def evaluate(self):
             )
 
             try:
+                for bash_command in bash_history:
+                    subprocess.run(bash_command, shell=True, env=env)
+
                 subprocess.run(
                     [self.python_path, script_path],
                     capture_output=True,
diff --git a/apps/opik-documentation/documentation/pytest_codeblocks/parsing_utils.py b/apps/opik-documentation/documentation/pytest_codeblocks/parsing_utils.py
index 68c278b0d3..89245a14c7 100644
--- a/apps/opik-documentation/documentation/pytest_codeblocks/parsing_utils.py
+++ b/apps/opik-documentation/documentation/pytest_codeblocks/parsing_utils.py
@@ -66,6 +66,14 @@ def check_skip_frontmatter(path):
return frontmatter.get("pytest_codeblocks_skip", False) +def convert_jupyter_pip_install_to_bash(code_block): + if "%pip install" in code_block["content"]: + code_block["language"] = "bash" + code_block["content"] = code_block["content"].replace("%pip", "pip") + + return code_block + + def get_code_blocs( path: str, ) -> List[Union[evaluators.PythonEvaluator, evaluators.BashEvaluator]]: @@ -76,9 +84,16 @@ def get_code_blocs( return [] page_frontmatter = get_page_frontmatter(path) + is_cookbook = "/cookbook/" in str(path) + code_blocks = [] markdown = MarkdownAnalyzer(path) mrkdwn_analysis_code_blocks = markdown.identify_code_blocks().get("Code block", []) + mrkdwn_analysis_code_blocks = [ + convert_jupyter_pip_install_to_bash(code_block) + for code_block in mrkdwn_analysis_code_blocks + ] + for i, mk_code_block in enumerate(mrkdwn_analysis_code_blocks): language = _get_code_block_language(mk_code_block["language"]) start_line = mk_code_block["start_line"] @@ -97,8 +112,11 @@ def get_code_blocs( code_str = _reindent_code_block(mk_code_block["content"]) if language == "python": - if page_frontmatter.get("pytest_codeblocks_execute_previous", False): - history = [x["content"] for x in mrkdwn_analysis_code_blocks[:i]] + if ( + page_frontmatter.get("pytest_codeblocks_execute_previous", False) + or is_cookbook + ): + history = [x for x in mrkdwn_analysis_code_blocks[:i]] else: history = []
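
As a quick illustration of the parsing change above (not part of the diff): a minimal, self-contained sketch of what `convert_jupyter_pip_install_to_bash` does to a code-block dict. The `language`/`content` shape mirrors what `MarkdownAnalyzer` produces in this file; the sample block is hypothetical.

```python
def convert_jupyter_pip_install_to_bash(code_block):
    # Copied from the hunk above: rewrite a Jupyter pip magic as a shell command.
    if "%pip install" in code_block["content"]:
        code_block["language"] = "bash"
        code_block["content"] = code_block["content"].replace("%pip", "pip")

    return code_block


block = {"language": "python", "content": "%pip install --upgrade --quiet opik"}
converted = convert_jupyter_pip_install_to_bash(block)

# The magic is rewritten to a plain shell command and the block is relabeled.
assert converted == {
    "language": "bash",
    "content": "pip install --upgrade --quiet opik",
}
```

Relabeling the block as `bash` is what routes it into `bash_history`, so `PythonEvaluator` runs the install in a shell before executing the combined Python script rather than pasting a `%pip` magic into a plain `.py` file.
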