diff --git a/README.md b/README.md
index 0e36575..d1d736b 100644
--- a/README.md
+++ b/README.md
@@ -26,7 +26,7 @@
 source venv/bin/activate
 pip install -r requirements.txt
 pip install -r requirements-dev.txt
 pip install -e .
-pip install vllm==0.3.3
+pip install vllm
 python -m vllm.entrypoints.openai.api_server --model instructlab/granite-7b-lab --tensor-parallel-size 1
 ```
@@ -65,7 +65,7 @@
 eval_output/
 ```
 ```shell
-export INSTRUCT_LAB_EVAL_FIRST_N_QUESTIONS=10 # Optional if you want to shorten run times
+export INSTRUCT_LAB_EVAL_FIRST_N_QUESTIONS=40 # Optional if you want to shorten run times
 python3 tests/test_judge_answers.py
 python3 tests/test_branch_judge_answers.py
 ```
diff --git a/requirements.txt b/requirements.txt
index bc02276..836409f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,7 +2,7 @@
 FastChat
 GitPython>=3.1.42,<4.0.0
 shortuuid
-openai<1.0.0
+openai>=1.13.3,<2.0.0
 psutil
 torch
 transformers
diff --git a/src/instructlab/eval/mt_bench_answers.py b/src/instructlab/eval/mt_bench_answers.py
index 3c223a3..0b2bb4b 100644
--- a/src/instructlab/eval/mt_bench_answers.py
+++ b/src/instructlab/eval/mt_bench_answers.py
@@ -41,6 +41,7 @@ def get_answer(
     max_tokens: int,
     answer_file: str,
     force_temperature: float,
+    openai_client,
 ):
     """Answer a question with the model"""
     assert force_temperature is None or question.get("required_temperature") is None
@@ -62,7 +63,9 @@ def get_answer(
         conv.append_message(conv.roles[0], question["turns"][j])
         conv.append_message(conv.roles[1], None)
 
-        output = chat_completion_openai(model, conv, temperature, max_tokens)
+        output = chat_completion_openai(
+            openai_client, model, conv, temperature, max_tokens
+        )
         conv.update_last_message(output)
         turns.append(output)
 
@@ -99,7 +102,9 @@ def generate_answers(
 ):
     """Generate model answers to be judged"""
     if model_api_base is not None:
-        openai.api_base = model_api_base
+        openai_client = openai.OpenAI(base_url=model_api_base)
+    else:
+        openai_client = openai.OpenAI()
 
     if data_dir is None:
         data_dir = os.path.join(os.path.dirname(__file__), "data")
@@ -125,6 +130,7 @@ def generate_answers(
                 max_tokens,
                 answer_file,
                 force_temperature,
+                openai_client,
             )
             futures.append(future)
 
diff --git a/src/instructlab/eval/mt_bench_common.py b/src/instructlab/eval/mt_bench_common.py
index 1cf680f..d8fa602 100644
--- a/src/instructlab/eval/mt_bench_common.py
+++ b/src/instructlab/eval/mt_bench_common.py
@@ -117,7 +117,7 @@ def load_judge_prompts(prompt_file: str) -> dict:
 
 
 def run_judge_single(
-    question, answer, judge, ref_answer, multi_turn=False, judgment=None
+    question, answer, judge, ref_answer, openai_client, multi_turn=False, judgment=None
 ):
     kwargs = {}
     model = judge.model_name
@@ -150,7 +150,9 @@ def run_judge_single(
     conv.append_message(conv.roles[1], None)
 
     if judgment is None:
-        judgment = chat_completion_openai(model, conv, temperature=0, max_tokens=2048)
+        judgment = chat_completion_openai(
+            openai_client, model, conv, temperature=0, max_tokens=2048
+        )
 
     if judge.prompt_template["output_format"] == "[[rating]]":
         match = re.search(one_score_pattern, judgment)
@@ -169,7 +171,7 @@ def run_judge_single(
     return rating, user_prompt, judgment
 
 
-def play_a_match_single(match: MatchSingle, output_file: str) -> dict:
+def play_a_match_single(openai_client, match: MatchSingle, output_file: str) -> dict:
     question, model, answer, judge, ref_answer, multi_turn = (
         match.question,
         match.model,
@@ -186,6 +188,7 @@ def play_a_match_single(match: MatchSingle, output_file: str) -> dict:
             answer,
             judge,
             ref_answer,
+            openai_client,
             multi_turn=multi_turn,
             judgment=judgment,
         )
@@ -215,10 +218,7 @@ def play_a_match_single(match: MatchSingle, output_file: str) -> dict:
     return result
 
 
-def chat_completion_openai(model, conv, temperature, max_tokens, api_dict=None) -> str:
-    if api_dict is not None:
-        openai.api_base = api_dict["api_base"]
-        openai.api_key = api_dict["api_key"]
+def chat_completion_openai(openai_client, model, conv, temperature, max_tokens) -> str:
     output = API_ERROR_OUTPUT
     for _ in range(API_MAX_RETRY):
         try:
@@ -232,16 +232,16 @@ def chat_completion_openai(model, conv, temperature, max_tokens, api_dict=None)
                     messages[0]["content"] + "\n" + messages[1]["content"]
                 )
                 messages = messages[1:]
-            response = openai.ChatCompletion.create(
+            response = openai_client.chat.completions.create(
                 model=model,
                 messages=messages,
                 n=1,
                 temperature=temperature,
                 max_tokens=max_tokens,
             )
-            output = response["choices"][0]["message"]["content"]
+            output = response.choices[0].message.content
             break
-        except openai.error.OpenAIError as e:
+        except openai.OpenAIError as e:
             print(type(e), e)
             time.sleep(API_RETRY_SLEEP)
 
diff --git a/src/instructlab/eval/mt_bench_judgment.py b/src/instructlab/eval/mt_bench_judgment.py
index 576d17f..24a6a5b 100644
--- a/src/instructlab/eval/mt_bench_judgment.py
+++ b/src/instructlab/eval/mt_bench_judgment.py
@@ -128,6 +128,7 @@ def make_judgment(
 def judge_model(
     model_name,
     judge_model_name,
+    openai_client,
     branch=None,
     bench_name="mt_bench",
     output_dir="eval_output",
@@ -218,11 +219,11 @@ def judge_model(
     # Play matches
     if max_workers == 1:
         for match in tqdm(matches):
-            play_a_match_single(match, output_file=output_file)
+            play_a_match_single(openai_client, match, output_file=output_file)
     else:
 
         def play_a_match_wrapper(match):
-            play_a_match_single(match, output_file=output_file)
+            play_a_match_single(openai_client, match, output_file=output_file)
 
         np.random.seed(0)
         np.random.shuffle(matches)
@@ -250,7 +251,9 @@ def generate_judgment(
 ):
     """Generate judgment with scores and qa_pairs for a model"""
     if model_api_base is not None:
-        openai.api_base = model_api_base
+        openai_client = openai.OpenAI(base_url=model_api_base)
+    else:
+        openai_client = openai.OpenAI()
 
     first_n_env = os.environ.get("INSTRUCT_LAB_EVAL_FIRST_N_QUESTIONS")
     if first_n_env is not None and first_n is None:
@@ -259,6 +262,7 @@
     question_file, judgment_file, answer_file = judge_model(
         model_name,
         judge_model_name,
+        openai_client,
         bench_name=bench_name,
         output_dir=output_dir,
         data_dir=data_dir,
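
The core change above is the migration from the module-level `openai` 0.x API (mutating the global `openai.api_base`) to the client-object API introduced in `openai` 1.x, with the constructed client threaded through `generate_answers`/`generate_judgment` down into `chat_completion_openai`. For reference, a minimal standalone sketch of the 1.x pattern this patch adopts; the base URL is an illustrative placeholder, not a value taken from the patch:

```python
# Sketch of the openai>=1.x client pattern adopted by this patch.
# Assumes OPENAI_API_KEY is set in the environment (or pass api_key=...).
import openai

# Point the client at an OpenAI-compatible server, e.g. a local vLLM
# instance (placeholder URL); omit base_url for the default OpenAI endpoint.
client = openai.OpenAI(base_url="http://localhost:8000/v1")

try:
    response = client.chat.completions.create(
        model="instructlab/granite-7b-lab",
        messages=[{"role": "user", "content": "Hello"}],
        n=1,
        temperature=0,
        max_tokens=256,
    )
    # 1.x returns typed objects, so the old dict access
    # response["choices"][0]["message"]["content"] becomes attribute access:
    output = response.choices[0].message.content
except openai.OpenAIError as e:  # errors moved from openai.error to the top level
    print(type(e), e)
```

Passing a client instance down the call chain, rather than configuring a global, keeps each run's endpoint configuration isolated, which is why `get_answer`, `run_judge_single`, `play_a_match_single`, and `judge_model` all grow an `openai_client` parameter.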