
Use Yival to run Experiment #25

Merged
43 commits, merged on Jan 25, 2024

Commits
e61f841  add yival to dependency (semio, Oct 11, 2023)
2307241  migrate to pydantic v2 (semio, Oct 11, 2023)
4f8b1cc  langchain still uses pydantic v1 (semio, Oct 11, 2023)
69c20d1  update pandera version and make the code work (semio, Oct 12, 2023)
3f650a2  experiment files so far (semio, Oct 14, 2023)
3339a12  add readme (semio, Oct 14, 2023)
0dac683  some more files (semio, Oct 14, 2023)
d8bb1c0  update readme (semio, Oct 16, 2023)
b51d444  update readme (semio, Oct 16, 2023)
4a59740  add metrics data sheet (semio, Oct 30, 2023)
9c109df  update experiment structure (semio, Oct 30, 2023)
0276aba  minor changes (semio, Oct 30, 2023)
eb503d9  have to set a callback to make Palm work (semio, Oct 30, 2023)
a757aaf  add replicate key (semio, Nov 1, 2023)
0f1cff8  remove the hack (semio, Nov 2, 2023)
06216ff  add rounds in model variation (semio, Nov 2, 2023)
5353d1c  update dependencies (semio, Nov 2, 2023)
8757ee1  Don't evaluate formulas (semio, Nov 11, 2023)
e2acc4c  Update github workflow (semio, Nov 11, 2023)
5fc96c4  update model compare function (semio, Nov 11, 2023)
c999fb5  strip question text (semio, Nov 11, 2023)
8c98ad5  update generate result script (semio, Nov 11, 2023)
9b7b8e9  Latest results (semio, Nov 11, 2023)
12d935f  update example (semio, Nov 11, 2023)
95e6447  update dependencies (semio, Nov 11, 2023)
d9c18a5  add todo (semio, Nov 11, 2023)
365fee6  set model name for evaluator (semio, Nov 11, 2023)
9d2aaf6  latest experiment yaml (semio, Nov 11, 2023)
99fda7f  questions (semio, Nov 11, 2023)
4c77f24  update README (semio, Nov 11, 2023)
ff65d7e  Add readme about Redis cache (semio, Nov 11, 2023)
bfe0d89  latest experiment results (semio, Nov 24, 2023)
b40b4fc  scripts and notebooks (semio, Nov 24, 2023)
8c80bed  add options (semio, Nov 24, 2023)
7066c00  add langdetect as dependency (semio, Nov 24, 2023)
1838dfe  add gitignore (semio, Nov 24, 2023)
42f3dbb  add experiment archive (semio, Nov 25, 2023)
9a2789c  questions (semio, Nov 25, 2023)
fbc7f18  create a custom evaluator for gpt4-based evaluation (semio, Nov 25, 2023)
3607026  take care of some possible errors when loading data (semio, Nov 25, 2023)
f50b5c1  update Readme (semio, Nov 25, 2023)
9992caf  remove some unneeded files (semio, Nov 25, 2023)
1d1c3ae  rename script (semio, Nov 27, 2023)
6 changes: 3 additions & 3 deletions .github/workflows/automation-api.yaml
@@ -12,7 +12,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python: [ 3.9 ]
+        python: [ 3.11 ]

     steps:
       - name: Checkout code
@@ -40,7 +40,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python: [ 3.9 ]
+        python: [ 3.11 ]

     steps:
       - name: Checkout code
@@ -60,7 +60,7 @@ jobs:
       - name: Install Poetry
         uses: snok/install-poetry@v1
         with:
-          version: 1.4.2
+          version: 1.6.1
           virtualenvs-create: true
           virtualenvs-in-project: true
4 changes: 3 additions & 1 deletion automation-api/.env.example
@@ -8,13 +8,15 @@ OPENAI_ORG_ID=""
 ## for Huggingface Hub
 HUGGINGFACEHUB_API_TOKEN=""
 ## for PALM
-GOOGLE_API_KEY=""
+PALM_API_KEY=""
 ## for iFlytek
 IFLYTEK_API_KEY=""
 IFLYTEK_API_SECRET=""
 IFLYTEK_APPID=""
 ## for Alibaba
 DASHSCOPE_API_KEY=""
+# for Replicate
+REPLICATE_API_KEY=""

 # For local development
 SERVICE_ACCOUNT_CREDENTIALS=""
38 changes: 33 additions & 5 deletions automation-api/lib/ai_eval_spreadsheet/schemas.py
@@ -5,26 +5,28 @@
 # for more info
 # Note that most types are str since spreadsheet columns can be formulas

-from datetime import datetime
 from typing import Optional

 import pandas as pd
 import pandera as pa
 from pandera.engines.pandas_engine import PydanticModel
-from pydantic import BaseModel, Field, validator
+from pydantic import BaseModel, ConfigDict, Field, field_validator


 class Question(BaseModel):
+    model_config = ConfigDict(coerce_numbers_to_str=True)
+
     include_in_next_evaluation: Optional[bool] = Field(
-        None, title="Include in next evaluation"
+        None, title="Include in next evaluation", validate_default=True
     )
     question_id: Optional[str] = Field(None, title="Question ID")
     language: Optional[str] = Field(None, title="Language")
     published_version_of_question: Optional[str] = Field(
         None, title="Published version of question"
     )

-    @validator("include_in_next_evaluation", pre=True, always=True)
+    @field_validator("include_in_next_evaluation", mode="before")
+    @classmethod
     def default_if_nan(cls, v):  # noqa: N805
         return False if pd.isna(v) else v

@@ -36,6 +38,8 @@ class Config:


 class QuestionOption(BaseModel):
+    model_config = ConfigDict(coerce_numbers_to_str=True)
+
     question_option_id: Optional[str] = Field(None, title="Question Option ID")
     question_id: Optional[str] = Field(None, title="Question ID")
     language: Optional[str] = Field(None, title="Language")
@@ -53,6 +57,8 @@ class Config:


 class PromptVariation(BaseModel):
+    model_config = ConfigDict(coerce_numbers_to_str=True)
+
     include_in_next_evaluation: Optional[bool] = Field(
         None, title="Include in next evaluation"
     )
@@ -78,6 +84,8 @@ class Config:


 class GenAiModel(BaseModel):
+    model_config = ConfigDict(coerce_numbers_to_str=True, protected_namespaces=())
+
     model_id: Optional[str] = Field(None, title="Model ID")
     vendor: Optional[str] = Field(None, title="Vendor")
     model_name: Optional[str] = Field(None, title="Model name")
@@ -90,6 +98,8 @@ class Config:


 class GenAiModelConfig(BaseModel):
+    model_config = ConfigDict(coerce_numbers_to_str=True, protected_namespaces=())
+
     include_in_next_evaluation: Optional[bool] = Field(
         None, title="Include in next evaluation"
     )
@@ -107,12 +117,28 @@ class Config:
         coerce = True


+class Metric(BaseModel):
+    name: Optional[str] = Field(None, title="Name")
+    description: Optional[str] = Field(None, title="Description")
+    prompt: Optional[str] = Field(None, title="Prompt")
+    choices: Optional[str] = Field(None, title="Choices")
+    choice_scores: Optional[str] = Field(None, title="Choice Scores")
+
+
+class MetricsDf(pa.DataFrameModel):
+    class Config:
+        dtype = PydanticModel(Metric)
+        coerce = True
+
+
 class EvalResult(BaseModel):
+    model_config = ConfigDict(coerce_numbers_to_str=True, protected_namespaces=())
+
     question_id: Optional[str] = Field(None, title="Question ID")
     language: Optional[str] = Field(None, title="Language")
     prompt_variation_id: Optional[str] = Field(None, title="Prompt variation ID")
     model_configuration_id: Optional[str] = Field(None, title="Model Configuration ID")
-    last_evaluation_datetime: Optional[datetime] = Field(None, title="Last Evaluation")
+    last_evaluation_datetime: Optional[str] = Field(None, title="Last Evaluation")
     percent_correct: Optional[float] = Field(None, title="Percent Correct")
     percent_wrong: Optional[float] = Field(None, title="Percent Wrong")
     percent_very_wrong: Optional[float] = Field(None, title="Percent Very Wrong")
@@ -128,6 +154,8 @@ class Config:


 class SessionResult(BaseModel):
+    model_config = ConfigDict(coerce_numbers_to_str=True, protected_namespaces=())
+
     session_id: Optional[str] = Field(None, title="Session ID")
     session_time: Optional[str] = Field(None, title="Session Time")
     prompt_variation_id: Optional[str] = Field(None, title="Prompt Variation ID")
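
A quick illustration of the migrated validator semantics above (a sketch, not part of this PR; assumes pydantic >= 2.1 for coerce_numbers_to_str, plus pandas installed):

# Illustrative sketch only. Shows how the migrated Question model
# coerces spreadsheet values: NaN from an empty cell becomes False via
# the mode="before" validator, and a numeric ID is coerced to str.
import math

from lib.ai_eval_spreadsheet.schemas import Question

q = Question(include_in_next_evaluation=math.nan, question_id=1)
assert q.include_in_next_evaluation is False
assert q.question_id == "1"

# validate_default=True runs the validator on the default (None) too,
# so an omitted flag also normalizes to False rather than None.
assert Question().include_in_next_evaluation is False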
24 changes: 19 additions & 5 deletions automation-api/lib/ai_eval_spreadsheet/wrapper.py
@@ -10,6 +10,8 @@
     GenAiModelConfig,
     GenAiModelConfigsDf,
     GenAiModelsDf,
+    Metric,
+    MetricsDf,
     PromptVariation,
     PromptVariationsDf,
     Question,
@@ -36,6 +38,7 @@ class AiEvalData:
     gen_ai_model_configs: Optional[
         GsheetsWorksheetEditor[GenAiModelConfigsDf, GenAiModelConfig]
     ] = None
+    metrics: Optional[GsheetsWorksheetEditor[MetricsDf, Metric]] = None
     evaluation_results: Optional[
         GsheetsWorksheetEditor[EvalResult, EvalResultsDf]
     ] = None
@@ -50,6 +53,7 @@ class AiEvalData:
     "prompt_variations": "Prompt variations",
     "gen_ai_models": "Models",
     "gen_ai_model_configs": "Model configurations",
+    "metrics": "Metrics",
     "evaluation_results": "Latest Results",
     "session_results": "Sessions",
 }
@@ -71,7 +75,7 @@ def read_ai_eval_data(
         row_schema=Question,
         worksheet_name=sheet_names["questions"],
         header_row_number=0,
-        evaluate_formulas=True,
+        evaluate_formulas=False,
     )

     question_options = GsheetsWorksheetEditor(
@@ -80,7 +84,7 @@ def read_ai_eval_data(
         row_schema=QuestionOption,
         worksheet_name=sheet_names["question_options"],
         header_row_number=0,
-        evaluate_formulas=True,
+        evaluate_formulas=False,
     )

     prompt_variations = GsheetsWorksheetEditor(
@@ -89,7 +93,7 @@ def read_ai_eval_data(
         row_schema=PromptVariation,
         worksheet_name=sheet_names["prompt_variations"],
         header_row_number=0,
-        evaluate_formulas=True,
+        evaluate_formulas=False,
     )

     gen_ai_models = GsheetsWorksheetEditor(
@@ -98,7 +102,7 @@ def read_ai_eval_data(
         row_schema=GenAiModel,
         worksheet_name=sheet_names["gen_ai_models"],
         header_row_number=0,
-        evaluate_formulas=True,
+        evaluate_formulas=False,
     )

     gen_ai_model_configs = GsheetsWorksheetEditor(
@@ -107,7 +111,16 @@ def read_ai_eval_data(
         row_schema=GenAiModelConfig,
         worksheet_name=sheet_names["gen_ai_model_configs"],
         header_row_number=0,
-        evaluate_formulas=True,
+        evaluate_formulas=False,
     )

+    metrics = GsheetsWorksheetEditor(
+        sh=ai_eval_spreadsheet,
+        df_schema=MetricsDf,
+        row_schema=Metric,
+        worksheet_name=sheet_names["metrics"],
+        header_row_number=0,
+        evaluate_formulas=False,
+    )
+
     evaluation_results = GsheetsWorksheetEditor(
@@ -134,6 +147,7 @@ def read_ai_eval_data(
         prompt_variations=prompt_variations,
         gen_ai_models=gen_ai_models,
         gen_ai_model_configs=gen_ai_model_configs,
+        metrics=metrics,
         evaluation_results=evaluation_results,
         session_results=session_results,
     )
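
Sketch of how the new Metrics sheet is consumed (illustrative, not code from this PR; assumes an authorized gspread Spreadsheet handle and the worksheet names listed above):

# Illustrative sketch. After read_ai_eval_data, the "Metrics"
# worksheet is exposed as a dataframe validated against MetricsDf.
from lib.ai_eval_spreadsheet.wrapper import read_ai_eval_data

ai_eval_data = read_ai_eval_data(ai_eval_spreadsheet)  # call simplified here
metrics_df = ai_eval_data.metrics.data.df
print(metrics_df[["name", "prompt"]].head())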
3 changes: 2 additions & 1 deletion automation-api/lib/config.py
@@ -24,11 +24,12 @@ def read_config() -> dict[str, str]:
         "AI_EVAL_SPREADSHEET_ID",
         "AI_EVAL_DEV_SPREADSHEET_ID",
         "HUGGINGFACEHUB_API_TOKEN",
-        "GOOGLE_API_KEY",
+        "PALM_API_KEY",
         "IFLYTEK_APPID",
         "IFLYTEK_API_KEY",
         "IFLYTEK_API_SECRET",
         "DASHSCOPE_API_KEY",
+        "REPLICATE_API_KEY",
     ]:
         config[key] = os.getenv(key=key, default="")
     return config
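
A small sanity check for the renamed and added keys (illustrative, not part of this PR; assumes a populated .env under automation-api/):

# Illustrative: confirm the new env keys flow through read_config,
# which defaults any missing key to the empty string.
from lib.config import read_config

config = read_config()
for key in ("PALM_API_KEY", "REPLICATE_API_KEY"):
    print(key, "is set" if config[key] else "is missing")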
2 changes: 1 addition & 1 deletion automation-api/lib/gsheets/gsheets_utils.py
@@ -3,7 +3,7 @@
 import pandas as pd
 from gspread import Spreadsheet, Worksheet, WorksheetNotFound
 from gspread_dataframe import set_with_dataframe
-from pydantic.main import BaseModel
+from pydantic import BaseModel

 from lib.app_singleton import app_logger
4 changes: 3 additions & 1 deletion automation-api/lib/gsheets/gsheets_worksheet_data.py
@@ -4,7 +4,7 @@
 import numpy as np
 import pandera as pa
 from pandera import DataFrameModel
-from pydantic.main import BaseModel
+from pydantic import BaseModel

 from lib.gsheets.gsheets_utils import get_pydantic_model_field_titles

@@ -36,11 +36,13 @@ def __init__(
         self.df_schema = df_schema
         self.row_schema = row_schema
         self.header_row_number = header_row_number
+
         self.attributes_to_columns_map = get_pydantic_model_field_titles(
             self.row_schema
         )
         df = df.rename(columns=inv_dict(self.attributes_to_columns_map))
         df = self.replace_current_row_numbers_in_formulas(df)
+        # import ipdb; ipdb.set_trace()
         self.df = df_schema(df)

     def replace_current_row_numbers_in_formulas(
2 changes: 1 addition & 1 deletion automation-api/lib/gsheets/gsheets_worksheet_editor.py
@@ -5,7 +5,7 @@
 from gspread import Spreadsheet, Worksheet
 from gspread.utils import rowcol_to_a1
 from pandera import DataFrameModel
-from pydantic.main import BaseModel
+from pydantic import BaseModel

 from lib.gsheets.gsheets_utils import get_worksheet
 from lib.gsheets.gsheets_worksheet_data import GsheetsWorksheetData
4 changes: 2 additions & 2 deletions automation-api/lib/llms/alibaba.py
@@ -6,7 +6,7 @@
 from dashscope import Generation
 from langchain.callbacks.manager import CallbackManagerForLLMRun
 from langchain.llms.base import LLM
-from pydantic import root_validator
+from langchain.pydantic_v1 import root_validator
 from tenacity import (
     retry,
     retry_if_exception_type,
@@ -71,7 +71,7 @@ def validate_environment(cls, values: Dict) -> Dict:  # noqa: N805
         dashscope.api_key = dashscope_api_key

         if values["top_p"] is not None and not 0.0 <= values["top_p"] <= 1.0:
-            raise ValueError("max_output_tokens must be between 0 and 1")
+            raise ValueError("top_p must be between 0 and 1")

         if values["top_k"] is not None and not 1 <= values["top_k"] <= 100:
             raise ValueError("top_k must be between 1 and 100")
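
Both custom LLM wrappers now import root_validator from LangChain's compatibility shim, so they keep the pydantic v1 API that LangChain's base classes expect even though the rest of the repo has migrated to pydantic v2. A minimal sketch of the pattern (hypothetical class, not this PR's code; assumes a LangChain version that ships langchain.pydantic_v1):

# Hypothetical example of the shim pattern used in alibaba.py/spark.py.
from typing import Dict

from langchain.pydantic_v1 import BaseModel, root_validator


class SamplingConfig(BaseModel):
    top_p: float = 0.8
    top_k: int = 50

    @root_validator()
    def check_ranges(cls, values: Dict) -> Dict:  # noqa: N805
        # Same range-validation style as validate_environment above.
        if values["top_p"] is not None and not 0.0 <= values["top_p"] <= 1.0:
            raise ValueError("top_p must be between 0 and 1")
        if values["top_k"] is not None and not 1 <= values["top_k"] <= 100:
            raise ValueError("top_k must be between 1 and 100")
        return values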
2 changes: 1 addition & 1 deletion automation-api/lib/llms/spark.py
@@ -5,7 +5,7 @@

 from langchain.callbacks.manager import CallbackManagerForLLMRun
 from langchain.llms.base import LLM
-from pydantic import root_validator
+from langchain.pydantic_v1 import root_validator
 from tenacity import (
     retry,
     retry_if_exception_type,
15 changes: 13 additions & 2 deletions automation-api/lib/pilot/helpers.py
@@ -13,6 +13,7 @@
 from lib.ai_eval_spreadsheet.schemas import (
     GenAiModel,
     GenAiModelConfig,
+    Metric,
     PromptVariation,
     Question,
     QuestionOption,
@@ -219,9 +220,14 @@ def get_prompt_variants(
     return res


-def get_model_configs(sheet: AiEvalData) -> List[ModelAndConfig]:
+def get_model_configs(
+    sheet: AiEvalData, include_all: bool = False
+) -> List[ModelAndConfig]:
     models_df = sheet.gen_ai_models.data.df
-    model_configs_df = filter_included_rows(sheet.gen_ai_model_configs.data.df)
+    if include_all:
+        model_configs_df = sheet.gen_ai_model_configs.data.df
+    else:
+        model_configs_df = filter_included_rows(sheet.gen_ai_model_configs.data.df)

     model_configs = class_objects_from_df(model_configs_df, GenAiModelConfig)
     result = []
@@ -232,6 +238,11 @@ def get_model_configs(
     return result


+def get_metrics(sheet: AiEvalData) -> List[Metric]:
+    res = class_objects_from_df(sheet.metrics.data.df, Metric)
+    return res
+
+
 def get_survey_hash(questions: List[QuestionAndOptions]) -> str:
     joined = ",".join([q[0].question_id for q in questions])
     return hash_dn(joined, "")
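
Illustrative usage of the two new helpers (a sketch, not code from this PR; assumes ai_eval_data was loaded via read_ai_eval_data):

# Illustrative sketch.
from lib.pilot.helpers import get_metrics, get_model_configs

# include_all=True bypasses the "include in next evaluation" filter.
all_configs = get_model_configs(ai_eval_data, include_all=True)
metrics = get_metrics(ai_eval_data)
print(f"{len(all_configs)} model configs, {len(metrics)} metrics")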