diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index f807a8cc8..57c51c9df 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -9,7 +9,7 @@ on:
 concurrency:
   group: ${{ github.workflow }}-${{ github.event_name == 'pull_request' && github.event.pull_request.number || github.ref_name }}
   cancel-in-progress: true
-
+
 jobs:
   docs:
@@ -23,10 +23,10 @@ jobs:
       - uses: actions/setup-python@v5
         with:
-          python-version: '3.9'
+          python-version: '3.8'
       - run: curl -LsSf https://astral.sh/uv/install.sh | sh
-      - run: uv pip install --system ".[tests,docs]"
+      - run: uv pip install --system ".[docs]"
       - name: Compile Docs
         run: make docs
diff --git a/pyproject.toml b/pyproject.toml
index 5de1b82ca..ac123fce7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -165,6 +165,9 @@ line-length = 88
 indent-width = 4
 target-version = "py38"
 
+[tool.ruff.lint.pyupgrade]
+keep-runtime-typing = true
+
 [tool.ruff.lint.per-file-ignores]
 "src/*" = ["TID252"]
 ".github/*" = ["TID251"]
diff --git a/src/unitxt/api.py b/src/unitxt/api.py
index 313622986..c899019c1 100644
--- a/src/unitxt/api.py
+++ b/src/unitxt/api.py
@@ -230,7 +230,7 @@ def infer(
     return_data: bool = False,
     return_log_probs: bool = False,
     return_meta_data: bool = False,
-    previous_messages: Optional[list[dict[str, str]]] = None,
+    previous_messages: Optional[List[Dict[str, str]]] = None,
     **kwargs,
 ):
     dataset = produce(instance_or_instances, dataset_query, **kwargs)
@@ -283,7 +283,7 @@ def select(
     engine: OptionSelectingByLogProbsInferenceEngine,
     dataset_query: Optional[str] = None,
     return_data: bool = False,
-    previous_messages: Optional[list[dict[str, str]]] = None,
+    previous_messages: Optional[List[Dict[str, str]]] = None,
     **kwargs,
 ):
     dataset = produce(instance_or_instances, dataset_query, **kwargs)
diff --git a/src/unitxt/inference.py b/src/unitxt/inference.py
index 77ded03c1..a123e502e 100644
--- a/src/unitxt/inference.py
+++ b/src/unitxt/inference.py
@@ -63,8 +63,8 @@ class StandardAPIParamsMixin(Artifact):
     n: Optional[int] = None
     parallel_tool_calls: Optional[bool] = None
     service_tier: Optional[Literal["auto", "default"]] = None
-    credentials: Optional[dict[str, str]] = {}
-    extra_headers: Optional[dict[str, str]] = None
+    credentials: Optional[Dict[str, str]] = {}
+    extra_headers: Optional[Dict[str, str]] = None
 
 
 def get_model_and_label_id(model_name, label):
@@ -1171,8 +1171,8 @@ def select(self, dataset: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
             for option in instance["task_data"]["options"]
         ]
 
-        dataset_with_options_logprobs: list[
-            list[dict[str, float | str]]
+        dataset_with_options_logprobs: List[
+            List[Dict[str, Union[float, str]]]
         ] = self.get_options_log_probs(dataset_with_options)
 
         dataset_iterator = iter(dataset_with_options_logprobs)
diff --git a/src/unitxt/llm_as_judge.py b/src/unitxt/llm_as_judge.py
index 9e7bf2509..fc8fff6d1 100644
--- a/src/unitxt/llm_as_judge.py
+++ b/src/unitxt/llm_as_judge.py
@@ -1,6 +1,6 @@
 import itertools
 from difflib import get_close_matches
-from typing import List, Optional, Union
+from typing import Any, Dict, List, Optional, Union
 
 from .api import infer
 from .artifact import fetch_artifact
@@ -145,7 +145,7 @@ def before_process_multi_stream(self):
             )
             return
 
-    def get_contexts(self, task_data: list[dict[str, any]]) -> list[dict[str, str]]:
+    def get_contexts(self, task_data: List[Dict[str, Any]]) -> List[Dict[str, str]]:
         return [
             get_parsed_context(
                 {
@@ -161,7 +161,7 @@ def perform_evaluation_step(
         instances: list,
         task: Task,
         template: Template,
-        previous_messages: Optional[list[dict[str, str]]] = None,
+        previous_messages: Optional[List[Dict[str, str]]] = None,
     ):
         outputs_dataset = infer(
             instances,
@@ -172,11 +172,11 @@ def perform_evaluation_step(
             return_data=True,
             previous_messages=previous_messages,
         )
-        prompts: list[str] = [instance["source"] for instance in outputs_dataset]
-        raw_predictions: list[str] = [
+        prompts: List[str] = [instance["source"] for instance in outputs_dataset]
+        raw_predictions: List[str] = [
             instance["raw_prediction"] for instance in outputs_dataset
         ]
-        predictions: list[str] = [
+        predictions: List[str] = [
             instance["prediction"] for instance in outputs_dataset
         ]
         return (prompts, raw_predictions, predictions)
@@ -274,7 +274,7 @@ def get_criterias(self, task_data, eval_count):
             raise Exception(
                 f"The type of the criteria must be 'CriteriaWithOptions', instead it is of type '{type(self.criteria)}'"
             )
-        criterias: list[CriteriaWithOptions] = [self.criteria] * eval_count
+        criterias: List[CriteriaWithOptions] = [self.criteria] * eval_count
         unique_criterias = list({criteria.name for criteria in criterias})
         self.logger.info(f"Criteria names are '{', '.join(unique_criterias)}'")
         return criterias
@@ -289,8 +289,8 @@ def get_results(
         option_selection_outputs,
         selections,
         evaluations_count,
-        criterias: list[CriteriaWithOptions],
-    ) -> list[dict[str, any]]:
+        criterias: List[CriteriaWithOptions],
+    ) -> List[Dict[str, Any]]:
         positional_bias = None
         if self.check_positional_bias:
             positional_bias = [
@@ -353,9 +353,9 @@ def get_results(
 
     def compute(
         self,
-        references: list[list[str]],
-        predictions: list[str],
-        task_data: list[dict[str, any]],
+        references: List[List[str]],
+        predictions: List[str],
+        task_data: List[Dict[str, Any]],
     ) -> dict:
         self.logger.info(
             f'Starting evaluation with evaluator "{self.evaluator_name}" and provider "{self.inference_engine.get_pretty_print_name()}'
@@ -545,7 +545,7 @@ def get_criterias(self, task_data, eval_count):
                 f"The type of the criteria must be 'Criteria', instead it is of type '{type(self.criteria)}'"
             )
 
-        criterias: list[Criteria] = [self.criteria] * eval_count
+        criterias: List[Criteria] = [self.criteria] * eval_count
         unique_criterias = list({criteria.name for criteria in criterias})
         self.logger.info(f"Criteria names are '{', '.join(unique_criterias)}'")
@@ -553,7 +553,7 @@ def get_criterias(self, task_data, eval_count):
 
     def get_instance_results(
         self,
-        instance_predictions: dict[str, str],
+        instance_predictions: Dict[str, str],
         assessment_prompts,
         assessment_outputs,
         summarization_prompts,
@@ -728,7 +728,7 @@ def get_instance_results(
         all_results["criteria"] = criteria.to_json()
         return self.clean_results(all_results)
 
-    def parse_prediction_to_dict(self, prediction: Union[dict[str, str], list[str]]):
+    def parse_prediction_to_dict(self, prediction: Union[Dict[str, str], List[str]]):
         if isinstance(prediction, list):
             return {f"{key + 1}": value for key, value in enumerate(prediction)}
 
@@ -740,15 +740,15 @@ def parse_prediction_to_dict(self, prediction: Union[dict[str, str], list[str]])
         )
 
     def convert_predictions_to_dicts(
-        self, predictions: Union[list[dict[str, str], list[str]]]
+        self, predictions: Union[List[Dict[str, str]], List[str]]
     ):
         return [self.parse_prediction_to_dict(prediction) for prediction in predictions]
 
     def compute(
         self,
-        references: list[list[str]],
-        predictions: Union[list[dict[str, str], list[str]]],
-        task_data: list[dict[str, str]],
+        references: List[List[str]],
+        predictions: Union[List[Dict[str, str]], List[str]],
+        task_data: List[Dict[str, str]],
     ) -> dict:
         self.logger.info(
             f'Starting evaluation with evaluator "{self.evaluator_name}" and provider {self.inference_engine.get_pretty_print_name()}'
@@ -775,8 +775,8 @@ def compute(
             f"The evaluation will perform {sum(contests_count_list) * [1,2][self.check_positional_bias]} ({' + '.join([f'{c * [1,2][self.check_positional_bias]}' for c in contests_count_list])}) pairwise comparisons"
         )
 
-        response_pairs_list: list[list[list[str]]] = []
-        option_pairs_list: list[list[list[str]]] = []
+        response_pairs_list: List[List[List[str]]] = []
+        option_pairs_list: List[List[List[str]]] = []
         predictions_names = set(predictions[0].keys())
         for i, combination_indexes in enumerate(combination_indexes_list):
             instance_predictions = predictions[i]
@@ -786,8 +786,8 @@ def compute(
                     f"The set of prediction names is different between instance 0 and instance {i}. In prediction 0, it is {sorted(predictions_names)}. In prediction {i}, it is {sorted(instance_predictions_names)}. Make sure the same number of predictions is passed for all instances."
                 )
 
-            response_pairs: list[list[str]] = []
-            option_pairs: list[list[str]] = []
+            response_pairs: List[List[str]] = []
+            option_pairs: List[List[str]] = []
             for combination in combination_indexes:
                 (idx_1, idx_2) = combination
                 response_name_1 = instance_predictions_names[idx_1]
diff --git a/src/unitxt/llm_as_judge_constants.py b/src/unitxt/llm_as_judge_constants.py
index fbdae94bf..d81abecaa 100644
--- a/src/unitxt/llm_as_judge_constants.py
+++ b/src/unitxt/llm_as_judge_constants.py
@@ -1,6 +1,6 @@
 import json
 from enum import Enum
-from typing import Optional
+from typing import Dict, List, Optional
 
 from .artifact import Artifact
 from .inference import (
@@ -36,15 +36,15 @@ def from_obj(criteria_dict: dict):
 
 
 class CriteriaWithOptions(Criteria):
-    options: list[CriteriaOption]
-    option_map: Optional[dict[str, float]] = None
+    options: List[CriteriaOption]
+    option_map: Optional[Dict[str, float]] = None
 
     @staticmethod
     def from_jsons(s: str):
         return CriteriaWithOptions.from_obj(json.loads(s))
 
     @staticmethod
-    def from_obj(criteria_dict: dict):
+    def from_obj(criteria_dict: Dict):
         return CriteriaWithOptions(
             name=criteria_dict["name"],
             description=criteria_dict["description"],
@@ -132,7 +132,7 @@ class ModelProviderEnum(str, Enum):
 
 class EvaluatorMetadata:
     name: EvaluatorNameEnum
-    providers: list[ModelProviderEnum]
+    providers: List[ModelProviderEnum]
 
     def __init__(self, name, providers):
         self.name = name
diff --git a/src/unitxt/llm_as_judge_utils.py b/src/unitxt/llm_as_judge_utils.py
index f06b5edd9..5b64b7e51 100644
--- a/src/unitxt/llm_as_judge_utils.py
+++ b/src/unitxt/llm_as_judge_utils.py
@@ -1,3 +1,5 @@
+from typing import Dict
+
 from .llm_as_judge_constants import (
     EVALUATORS_METADATA,
     MODEL_RENAMINGS,
@@ -7,7 +9,7 @@
 )
 
 
-def get_parsed_context(context: dict[str, str]):
+def get_parsed_context(context: Dict[str, str]):
     return (
         "\n".join([f"{key}: {value}" for key, value in context.items()])
         if len(context) > 1
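
Note: the diff replaces PEP 585 builtin generics (list[...], dict[...]) with their typing-module counterparts because builtin generics only became subscriptable at runtime in Python 3.9, while the docs workflow now runs on Python 3.8; the added keep-runtime-typing = true setting keeps ruff's pyupgrade rules from rewriting the typing forms back. Below is a minimal standalone sketch of the failure mode and the 3.8-compatible spelling; the function and parameter names are hypothetical examples, not unitxt code.

# Python 3.8-compatible annotations: typing generics work on every supported version.
from typing import Dict, List, Optional


def render_messages(messages: Optional[List[Dict[str, str]]] = None) -> List[str]:
    # Hypothetical helper: format chat-style messages as "role: content" lines.
    return [f"{m['role']}: {m['content']}" for m in (messages or [])]


print(render_messages([{"role": "user", "content": "hi"}]))
# ['user: hi']

# Under Python 3.8 the builtin-generic spelling fails as soon as the annotation
# is evaluated, which happens at definition time unless the module uses
# `from __future__ import annotations`:
#
#   def broken(messages: list[dict[str, str]]) -> None: ...
#   # TypeError: 'type' object is not subscriptable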