From 9d2aaf6308f45d7fc7fceaf8bf766d9772a6936d Mon Sep 17 00:00:00 2001
From: semio <semio@posteo.net>
Date: Sat, 11 Nov 2023 22:55:29 +0800
Subject: [PATCH] Update latest experiment YAML: reword evaluator prompt, add model variations

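---
Reviewer note: the final hunk adds the `gpt-3.5-turbo-0613` variations with
`temperature: 0.01` for rounds 1-5 twice (once after the llama-2-70b-chat
entries and again as the last 25 added lines). Please confirm whether the
second set was meant to use a different model or parameters, or should be
dropped.
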
 yival_experiments/experiment_latest.yaml | 185 ++++++++++++++++++++++-
 1 file changed, 180 insertions(+), 5 deletions(-)

diff --git a/yival_experiments/experiment_latest.yaml b/yival_experiments/experiment_latest.yaml
index eba9cbc..8746d6e 100644
--- a/yival_experiments/experiment_latest.yaml
+++ b/yival_experiments/experiment_latest.yaml
@@ -23,7 +23,7 @@ evaluators:
   prompt: |-
     You are assessing a submitted answer on a given task based on a criterion. Here is the data:
     - Task: Answer a question.
-    - Does the answer correct?
+    - Is the answer correct?
     [Input]: {question_text}
     Possible Options and Correctness of Options:
     A. {option_a} ({option_a_correctness})
@@ -31,10 +31,10 @@ evaluators:
     C. {option_c} ({option_c_correctness})
     [Result]: {raw_output}
     Answer the question by selecting one of the following options:
-    A The answer is not relevant to the question at all, or contains conflicting information.
-    B The answer is far away from the correct answer. (Very Wrong)
-    C The answer is not correct, though close to the correct one. (Wrong)
-    D The answer is correct. (Correct)
+    A The answer is not relevant to the question at all, or is indecisive about what the correct answer is.
+    B The answer is not correct, and matches/is close to the Very Wrong answer.
+    C The answer is not correct, and matches/is close to the Wrong answer.
+    D The answer is correct, i.e. matches the Correct answer.
   choices:
   - A
   - B
@@ -58,6 +58,181 @@ variations:
       params:
         temperature: 0.01
       round: 1
+    - vendor: Google
+      model_id: palm/text-bison
+      params:
+        temperature: 0.01
+      round: 2
+    - vendor: Google
+      model_id: palm/text-bison
+      params:
+        temperature: 0.01
+      round: 3
+    - vendor: Google
+      model_id: palm/text-bison
+      params:
+        temperature: 0.01
+      round: 4
+    - vendor: Google
+      model_id: palm/text-bison
+      params:
+        temperature: 0.01
+      round: 5
+    - vendor: Alibaba
+      model_id: qwen-plus
+      params:
+        top_p: 0.1
+        top_k: 100
+      round: 1
+    - vendor: Alibaba
+      model_id: qwen-plus
+      params:
+        top_p: 0.1
+        top_k: 100
+      round: 2
+    - vendor: Alibaba
+      model_id: qwen-plus
+      params:
+        top_p: 0.1
+        top_k: 100
+      round: 3
+    - vendor: Alibaba
+      model_id: qwen-plus
+      params:
+        top_p: 0.1
+        top_k: 100
+      round: 4
+    - vendor: Alibaba
+      model_id: qwen-plus
+      params:
+        top_p: 0.1
+        top_k: 100
+      round: 5
+    - vendor: Meta
+      model_id: replicate/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3
+      params:
+        temperature: 0.01
+      round: 1
+    - vendor: Meta
+      model_id: replicate/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3
+      params:
+        temperature: 0.01
+      round: 2
+    - vendor: Meta
+      model_id: replicate/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3
+      params:
+        temperature: 0.01
+      round: 3
+    - vendor: Meta
+      model_id: replicate/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3
+      params:
+        temperature: 0.01
+      round: 4
+    - vendor: Meta
+      model_id: replicate/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3
+      params:
+        temperature: 0.01
+      round: 5
+    - vendor: OpenAI
+      model_id: gpt-3.5-turbo-0613
+      params:
+        temperature: 0.01
+      round: 1
+    - vendor: OpenAI
+      model_id: gpt-3.5-turbo-0613
+      params:
+        temperature: 0.01
+      round: 2
+    - vendor: OpenAI
+      model_id: gpt-3.5-turbo-0613
+      params:
+        temperature: 0.01
+      round: 3
+    - vendor: OpenAI
+      model_id: gpt-3.5-turbo-0613
+      params:
+        temperature: 0.01
+      round: 4
+    - vendor: OpenAI
+      model_id: gpt-3.5-turbo-0613
+      params:
+        temperature: 0.01
+      round: 5
+    - vendor: OpenAI
+      model_id: gpt-4
+      params:
+        temperature: 0.01
+      round: 1
+    - vendor: OpenAI
+      model_id: gpt-4
+      params:
+        temperature: 0.01
+      round: 2
+    - vendor: OpenAI
+      model_id: gpt-4
+      params:
+        temperature: 0.01
+      round: 3
+    - vendor: OpenAI
+      model_id: gpt-4
+      params:
+        temperature: 0.01
+      round: 4
+    - vendor: OpenAI
+      model_id: gpt-4
+      params:
+        temperature: 0.01
+      round: 5
+    - vendor: OpenAI
+      model_id: gpt-3.5-turbo-0613
+      params:
+        temperature: 1
+      round: 1
+    - vendor: OpenAI
+      model_id: gpt-3.5-turbo-0613
+      params:
+        temperature: 1
+      round: 2
+    - vendor: OpenAI
+      model_id: gpt-3.5-turbo-0613
+      params:
+        temperature: 1
+      round: 3
+    - vendor: OpenAI
+      model_id: gpt-3.5-turbo-0613
+      params:
+        temperature: 1
+      round: 4
+    - vendor: OpenAI
+      model_id: gpt-3.5-turbo-0613
+      params:
+        temperature: 1
+      round: 5
+    - vendor: OpenAI
+      model_id: gpt-3.5-turbo-0613
+      params:
+        temperature: 0.01
+      round: 1
+    - vendor: OpenAI
+      model_id: gpt-3.5-turbo-0613
+      params:
+        temperature: 0.01
+      round: 2
+    - vendor: OpenAI
+      model_id: gpt-3.5-turbo-0613
+      params:
+        temperature: 0.01
+      round: 3
+    - vendor: OpenAI
+      model_id: gpt-3.5-turbo-0613
+      params:
+        temperature: 0.01
+      round: 4
+    - vendor: OpenAI
+      model_id: gpt-3.5-turbo-0613
+      params:
+        temperature: 0.01
+      round: 5
 - name: prompt_template
   variations:
   - variation_id: instruct_question_options_1