diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index f807a8cc8..57c51c9df 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -9,7 +9,7 @@ on:
 concurrency:
   group: ${{ github.workflow }}-${{ github.event_name == 'pull_request' && github.event.pull_request.number || github.ref_name }}
   cancel-in-progress: true
-
+
 jobs:
   docs:
@@ -23,10 +23,10 @@ jobs:
       - uses: actions/setup-python@v5
         with:
-          python-version: '3.9'
+          python-version: '3.8'
       - run: curl -LsSf https://astral.sh/uv/install.sh | sh
-      - run: uv pip install --system ".[tests,docs]"
+      - run: uv pip install --system ".[docs]"
       - name: Compile Docs
         run: make docs
diff --git a/pyproject.toml b/pyproject.toml
index 5de1b82ca..ac123fce7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -165,6 +165,9 @@ line-length = 88
 indent-width = 4
 target-version = "py38"
 
+[tool.ruff.lint.pyupgrade]
+keep-runtime-typing = true
+
 [tool.ruff.lint.per-file-ignores]
 "src/*" = ["TID252"]
 ".github/*" = ["TID251"]
diff --git a/src/unitxt/api.py b/src/unitxt/api.py
index 313622986..c899019c1 100644
--- a/src/unitxt/api.py
+++ b/src/unitxt/api.py
@@ -230,7 +230,7 @@ def infer(
     return_data: bool = False,
     return_log_probs: bool = False,
     return_meta_data: bool = False,
-    previous_messages: Optional[list[dict[str, str]]] = None,
+    previous_messages: Optional[List[Dict[str, str]]] = None,
     **kwargs,
 ):
     dataset = produce(instance_or_instances, dataset_query, **kwargs)
@@ -283,7 +283,7 @@ def select(
     engine: OptionSelectingByLogProbsInferenceEngine,
     dataset_query: Optional[str] = None,
     return_data: bool = False,
-    previous_messages: Optional[list[dict[str, str]]] = None,
+    previous_messages: Optional[List[Dict[str, str]]] = None,
     **kwargs,
 ):
     dataset = produce(instance_or_instances, dataset_query, **kwargs)
diff --git a/src/unitxt/inference.py b/src/unitxt/inference.py
index 77ded03c1..a123e502e 100644
--- a/src/unitxt/inference.py
+++ b/src/unitxt/inference.py
@@ -63,8 +63,8 @@ class StandardAPIParamsMixin(Artifact):
     n: Optional[int] = None
     parallel_tool_calls: Optional[bool] = None
     service_tier: Optional[Literal["auto", "default"]] = None
-    credentials: Optional[dict[str, str]] = {}
-    extra_headers: Optional[dict[str, str]] = None
+    credentials: Optional[Dict[str, str]] = {}
+    extra_headers: Optional[Dict[str, str]] = None
 
 
 def get_model_and_label_id(model_name, label):
@@ -1171,8 +1171,8 @@ def select(self, dataset: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
             for option in instance["task_data"]["options"]
         ]
 
-        dataset_with_options_logprobs: list[
-            list[dict[str, float | str]]
+        dataset_with_options_logprobs: List[
+            List[Dict[str, Union[float, str]]]
         ] = self.get_options_log_probs(dataset_with_options)
 
         dataset_iterator = iter(dataset_with_options_logprobs)
diff --git a/src/unitxt/llm_as_judge.py b/src/unitxt/llm_as_judge.py
index 9e7bf2509..fc8fff6d1 100644
--- a/src/unitxt/llm_as_judge.py
+++ b/src/unitxt/llm_as_judge.py
@@ -1,6 +1,6 @@
 import itertools
 from difflib import get_close_matches
-from typing import List, Optional, Union
+from typing import Any, Dict, List, Optional, Union
 
 from .api import infer
 from .artifact import fetch_artifact
@@ -145,7 +145,7 @@ def before_process_multi_stream(self):
             )
             return
 
-    def get_contexts(self, task_data: list[dict[str, any]]) -> list[dict[str, str]]:
+    def get_contexts(self, task_data: List[Dict[str, Any]]) -> List[Dict[str, str]]:
         return [
             get_parsed_context(
                 {
@@ -161,7 +161,7 @@ def perform_evaluation_step(
         instances: list,
         task: Task,
         template: Template,
-        previous_messages: Optional[list[dict[str, str]]] = None,
+        previous_messages: Optional[List[Dict[str, str]]] = None,
     ):
         outputs_dataset = infer(
             instances,
@@ -172,11 +172,11 @@ def perform_evaluation_step(
             return_data=True,
             previous_messages=previous_messages,
         )
-        prompts: list[str] = [instance["source"] for instance in outputs_dataset]
-        raw_predictions: list[str] = [
+        prompts: List[str] = [instance["source"] for instance in outputs_dataset]
+        raw_predictions: List[str] = [
             instance["raw_prediction"] for instance in outputs_dataset
         ]
-        predictions: list[str] = [
+        predictions: List[str] = [
             instance["prediction"] for instance in outputs_dataset
         ]
         return (prompts, raw_predictions, predictions)
@@ -274,7 +274,7 @@ def get_criterias(self, task_data, eval_count):
             raise Exception(
                 f"The type of the criteria must be 'CriteriaWithOptions', instead it is of type '{type(self.criteria)}'"
             )
-        criterias: list[CriteriaWithOptions] = [self.criteria] * eval_count
+        criterias: List[CriteriaWithOptions] = [self.criteria] * eval_count
         unique_criterias = list({criteria.name for criteria in criterias})
         self.logger.info(f"Criteria names are '{', '.join(unique_criterias)}'")
         return criterias
@@ -289,8 +289,8 @@ def get_results(
         option_selection_outputs,
         selections,
         evaluations_count,
-        criterias: list[CriteriaWithOptions],
-    ) -> list[dict[str, any]]:
+        criterias: List[CriteriaWithOptions],
+    ) -> List[Dict[str, Any]]:
         positional_bias = None
         if self.check_positional_bias:
             positional_bias = [
@@ -353,9 +353,9 @@ def get_results(
 
     def compute(
         self,
-        references: list[list[str]],
-        predictions: list[str],
-        task_data: list[dict[str, any]],
+        references: List[List[str]],
+        predictions: List[str],
+        task_data: List[Dict[str, Any]],
     ) -> dict:
         self.logger.info(
             f'Starting evaluation with evaluator "{self.evaluator_name}" and provider "{self.inference_engine.get_pretty_print_name()}'
@@ -545,7 +545,7 @@ def get_criterias(self, task_data, eval_count):
                 f"The type of the criteria must be 'Criteria', instead it is of type '{type(self.criteria)}'"
             )
 
-        criterias: list[Criteria] = [self.criteria] * eval_count
+        criterias: List[Criteria] = [self.criteria] * eval_count
         unique_criterias = list({criteria.name for criteria in criterias})
         self.logger.info(f"Criteria names are '{', '.join(unique_criterias)}'")
@@ -553,7 +553,7 @@ def get_criterias(self, task_data, eval_count):
 
     def get_instance_results(
         self,
-        instance_predictions: dict[str, str],
+        instance_predictions: Dict[str, str],
         assessment_prompts,
         assessment_outputs,
         summarization_prompts,
@@ -728,7 +728,7 @@ def get_instance_results(
         all_results["criteria"] = criteria.to_json()
         return self.clean_results(all_results)
 
-    def parse_prediction_to_dict(self, prediction: Union[dict[str, str], list[str]]):
+    def parse_prediction_to_dict(self, prediction: Union[Dict[str, str], List[str]]):
         if isinstance(prediction, list):
             return {f"{key + 1}": value for key, value in enumerate(prediction)}
 
@@ -740,15 +740,15 @@ def parse_prediction_to_dict(self, prediction: Union[dict[str, str], list[str]])
         )
 
     def convert_predictions_to_dicts(
-        self, predictions: Union[list[dict[str, str], list[str]]]
+        self, predictions: Union[List[Dict[str, str]], List[str]]
     ):
         return [self.parse_prediction_to_dict(prediction) for prediction in predictions]
 
     def compute(
         self,
-        references: list[list[str]],
-        predictions: Union[list[dict[str, str], list[str]]],
-        task_data: list[dict[str, str]],
+        references: List[List[str]],
+        predictions: Union[List[Dict[str, str]], List[str]],
+        task_data: List[Dict[str, str]],
     ) -> dict:
         self.logger.info(
             f'Starting evaluation with evaluator "{self.evaluator_name}" and provider {self.inference_engine.get_pretty_print_name()}'
@@ -775,8 +775,8 @@ def compute(
             f"The evaluation will perform {sum(contests_count_list) * [1,2][self.check_positional_bias]} ({' + '.join([f'{c * [1,2][self.check_positional_bias]}' for c in contests_count_list])}) pairwise comparisons"
         )
 
-        response_pairs_list: list[list[list[str]]] = []
-        option_pairs_list: list[list[list[str]]] = []
+        response_pairs_list: List[List[List[str]]] = []
+        option_pairs_list: List[List[List[str]]] = []
         predictions_names = set(predictions[0].keys())
         for i, combination_indexes in enumerate(combination_indexes_list):
             instance_predictions = predictions[i]
@@ -786,8 +786,8 @@ def compute(
                     f"The set of prediction names is different between instance 0 and instance {i}. In prediction 0, it is {sorted(predictions_names)}. In prediction {i}, it is {sorted(instance_predictions_names)}. Make sure the same number of predictions is passed for all instances."
                 )
 
-            response_pairs: list[list[str]] = []
-            option_pairs: list[list[str]] = []
+            response_pairs: List[List[str]] = []
+            option_pairs: List[List[str]] = []
             for combination in combination_indexes:
                 (idx_1, idx_2) = combination
                 response_name_1 = instance_predictions_names[idx_1]
diff --git a/src/unitxt/llm_as_judge_constants.py b/src/unitxt/llm_as_judge_constants.py
index fbdae94bf..d81abecaa 100644
--- a/src/unitxt/llm_as_judge_constants.py
+++ b/src/unitxt/llm_as_judge_constants.py
@@ -1,6 +1,6 @@
 import json
 from enum import Enum
-from typing import Optional
+from typing import Dict, List, Optional
 
 from .artifact import Artifact
 from .inference import (
@@ -36,15 +36,15 @@ def from_obj(criteria_dict: dict):
 
 
 class CriteriaWithOptions(Criteria):
-    options: list[CriteriaOption]
-    option_map: Optional[dict[str, float]] = None
+    options: List[CriteriaOption]
+    option_map: Optional[Dict[str, float]] = None
 
     @staticmethod
     def from_jsons(s: str):
         return CriteriaWithOptions.from_obj(json.loads(s))
 
     @staticmethod
-    def from_obj(criteria_dict: dict):
+    def from_obj(criteria_dict: Dict):
         return CriteriaWithOptions(
             name=criteria_dict["name"],
             description=criteria_dict["description"],
@@ -132,7 +132,7 @@ class ModelProviderEnum(str, Enum):
 
 class EvaluatorMetadata:
     name: EvaluatorNameEnum
-    providers: list[ModelProviderEnum]
+    providers: List[ModelProviderEnum]
 
     def __init__(self, name, providers):
         self.name = name
diff --git a/src/unitxt/llm_as_judge_utils.py b/src/unitxt/llm_as_judge_utils.py
index f06b5edd9..5b64b7e51 100644
--- a/src/unitxt/llm_as_judge_utils.py
+++ b/src/unitxt/llm_as_judge_utils.py
@@ -1,3 +1,5 @@
+from typing import Dict
+
 from .llm_as_judge_constants import (
     EVALUATORS_METADATA,
     MODEL_RENAMINGS,
@@ -7,7 +9,7 @@
 )
 
 
-def get_parsed_context(context: dict[str, str]):
+def get_parsed_context(context: Dict[str, str]):
     return (
         "\n".join([f"{key}: {value}" for key, value in context.items()])
         if len(context) > 1
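
Note: the diff replaces PEP 585 builtin generics (list[...], dict[...]) with their typing-module counterparts because builtin generics only became subscriptable at runtime in Python 3.9, while the docs workflow now runs on Python 3.8; the added keep-runtime-typing = true setting keeps ruff's pyupgrade rules from rewriting the typing forms back. Below is a minimal standalone sketch of the failure mode and the 3.8-compatible spelling; the function and parameter names are hypothetical examples, not unitxt code.

# Python 3.8-compatible annotations: typing generics work on every supported version.
from typing import Dict, List, Optional


def render_messages(messages: Optional[List[Dict[str, str]]] = None) -> List[str]:
    # Hypothetical helper: format chat-style messages as "role: content" lines.
    return [f"{m['role']}: {m['content']}" for m in (messages or [])]


print(render_messages([{"role": "user", "content": "hi"}]))
# ['user: hi']

# Under Python 3.8 the builtin-generic spelling fails as soon as the annotation
# is evaluated, which happens at definition time unless the module uses
# `from __future__ import annotations`:
#
#   def broken(messages: list[dict[str, str]]) -> None: ...
#   # TypeError: 'type' object is not subscriptable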