Update ruff config to enable more checks and run auto-fix 2/2 #19

Open · wants to merge 8 commits into main
@@ -14,5 +14,5 @@

 # Refer from https://github.com/intel/intel-extension-for-transformers/tree/main/intel_extension_for_transformers/transformers/llm/evaluation/lm_eval

-from evaluation.accuracy import cli_evaluate as evaluate
-from evaluation.utils import LMEvalParser
+from evaluation.accuracy import cli_evaluate as evaluate  # noqa: F401
+from evaluation.utils import LMEvalParser  # noqa: F401
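The `# noqa: F401` markers tell ruff that these imports are deliberate re-exports rather than unused names. A minimal sketch of the same pattern, using a hypothetical package layout; declaring `__all__` is the other common way to make the re-export explicit:

# mypkg/__init__.py  (hypothetical package re-exporting part of its API)
from mypkg.accuracy import cli_evaluate as evaluate  # noqa: F401  (intentional re-export)
from mypkg.utils import EvalParser  # noqa: F401  (intentional re-export)

# Alternative: list the public names, and linters treat them as intentional exports
__all__ = ["evaluate", "EvalParser"]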
@@ -31,7 +31,7 @@


 def _handle_non_serializable(o):
-    if isinstance(o, np.int64) or isinstance(o, np.int32):
+    if isinstance(o, (np.int32, np.int64)):
         return int(o)
     elif isinstance(o, set):
         return list(o)
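`isinstance` accepts a tuple of types, so the two checks collapse into a single call with identical behavior. Helpers like `_handle_non_serializable` are typically passed as the `default=` hook of `json.dumps`, which is invoked for any object the encoder cannot handle on its own; a self-contained sketch (the helper name is illustrative, only the `json`/`numpy` APIs are real):

import json

import numpy as np


def handle_non_serializable(o):
    if isinstance(o, (np.int32, np.int64)):  # numpy ints are not JSON-native
        return int(o)
    if isinstance(o, set):  # sets become lists
        return list(o)
    return str(o)


print(json.dumps({"n": np.int64(3), "tags": {"a", "b"}}, default=handle_non_serializable))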
@@ -45,7 +45,7 @@ def cli_evaluate(args) -> None:

     eval_logger = lm_eval.utils.eval_logger
     eval_logger.setLevel(getattr(logging, f"{args.verbosity}"))
-    eval_logger.info(f"Verbosity set to {args.verbosity}")
+    eval_logger.info("Verbosity set to %s", args.verbosity)
     os.environ["TOKENIZERS_PARALLELISM"] = "false"

     if args.predict_only:
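The change from an f-string to `"Verbosity set to %s", args.verbosity` follows the flake8-logging-format (`G`) rules this PR enables: arguments passed to the logger are only interpolated when the record is actually emitted. A standalone illustration of the two styles:

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("demo")

verbosity = "INFO"
logger.info("Verbosity set to %s", verbosity)  # lazy: formatted only when the level is enabled
logger.info(f"Verbosity set to {verbosity}")   # eager: the f-string is built even if filtered out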
@@ -54,48 +54,45 @@ def cli_evaluate(args) -> None:
         raise ValueError("Specify --output_path if providing --log_samples or --predict_only")

     if args.include_path is not None:
-        eval_logger.info(f"Including path: {args.include_path}")
+        eval_logger.info(f"Including path: {args.include_path}")  # noqa: G004
         task_manager = lm_eval.tasks.TaskManager(args.verbosity, include_path=args.include_path)

     if args.limit:
-        eval_logger.warning(
-            " --limit SHOULD ONLY BE USED FOR TESTING." "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
-        )
+        eval_logger.warning(" --limit SHOULD ONLY BE USED FOR TESTING.REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")

     if args.tasks is None:
         eval_logger.error("Need to specify task to evaluate.")
         sys.exit()
     elif args.tasks == "list":
-        eval_logger.info("Available Tasks:\n - {}".format("\n - ".join(task_manager.all_tasks)))
+        eval_logger.info("Available Tasks:\n - {}".format("\n - ".join(task_manager.all_tasks)))  # noqa: G001
         sys.exit()
+    elif os.path.isdir(args.tasks):
+        task_names = []
+        yaml_path = os.path.join(args.tasks, "*.yaml")
+        for yaml_file in glob.glob(yaml_path):
+            config = lm_eval.utils.load_yaml_config(yaml_file)
+            task_names.append(config)
     else:
-        if os.path.isdir(args.tasks):
-            task_names = []
-            yaml_path = os.path.join(args.tasks, "*.yaml")
-            for yaml_file in glob.glob(yaml_path):
-                config = lm_eval.utils.load_yaml_config(yaml_file)
-                task_names.append(config)
-        else:
-            task_list = args.tasks.split(",")
-            task_names = task_manager.match_tasks(task_list)
-            for task in [task for task in task_list if task not in task_names]:
-                if os.path.isfile(task):
-                    config = lm_eval.utils.load_yaml_config(task)
-                    task_names.append(config)
-            task_missing = [
-                task for task in task_list if task not in task_names and "*" not in task
-            ]  # we don't want errors if a wildcard ("*") task name was used
-
-            if task_missing:
-                missing = ", ".join(task_missing)
-                eval_logger.error(
-                    f"Tasks were not found: {missing}\n"
-                    f"{lm_eval.utils.SPACING}Try `lm-eval --tasks list` for list of available tasks",
-                )
-                raise ValueError(
-                    f"Tasks not found: {missing}. Try `lm-eval --tasks list` for list of available tasks,"
-                    + " or '--verbosity DEBUG' to troubleshoot task registration issues."
-                )
+        task_list = args.tasks.split(",")
+        task_names = task_manager.match_tasks(task_list)
+        for task in [task for task in task_list if task not in task_names]:
+            if os.path.isfile(task):
+                config = lm_eval.utils.load_yaml_config(task)
+                task_names.append(config)
+        task_missing = [
+            task for task in task_list if task not in task_names and "*" not in task
+        ]  # we don't want errors if a wildcard ("*") task name was used
+
+        if task_missing:
+            missing = ", ".join(task_missing)
+            eval_logger.error(
+                f"Tasks were not found: {missing}\n"  # noqa: G004
+                f"{lm_eval.utils.SPACING}Try `lm-eval --tasks list` for list of available tasks",
+            )
+            raise ValueError(
+                f"Tasks not found: {missing}. Try `lm-eval --tasks list` for list of available tasks,"  # noqa: ISC003
+                + " or '--verbosity DEBUG' to troubleshoot task registration issues."
+            )

     if args.output_path:
         path = Path(args.output_path)
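The string-literal changes in this hunk are worth a second look: adjacent literals concatenate with nothing between them, which is why the merged `--limit` warning now reads "TESTING.REAL METRICS" with no space, and why the explicit `+` concatenation in the `raise ValueError` only gets a `# noqa: ISC003` rather than being auto-joined. A tiny standalone example of the pitfall:

# Implicit concatenation inserts nothing between the literals:
msg = " --limit SHOULD ONLY BE USED FOR TESTING." "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
assert "TESTING.REAL" in msg  # the missing space is preserved by the auto-fix

# Spelling out the separator (or using a single literal) avoids the surprise:
msg = " --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."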
@@ -104,7 +101,7 @@ def cli_evaluate(args) -> None:
             raise FileExistsError(f"File already exists at {path}")
         output_path_file = path.joinpath(DEFAULT_RESULTS_FILE)
         if output_path_file.is_file():
-            eval_logger.warning(f"File {output_path_file} already exists. Results will be overwritten.")
+            eval_logger.warning(f"File {output_path_file} already exists. Results will be overwritten.")  # noqa: G004
         # if path json then get parent dir
         elif path.suffix in (".json", ".jsonl"):
             output_path_file = path
@@ -118,7 +115,7 @@ def cli_evaluate(args) -> None:
         os.environ["HF_DATASETS_TRUST_REMOTE_CODE"] = str(args.trust_remote_code)
         args.model_args = args.model_args + f",trust_remote_code={os.environ['HF_DATASETS_TRUST_REMOTE_CODE']}"

-    eval_logger.info(f"Selected Tasks: {task_names}")
+    eval_logger.info(f"Selected Tasks: {task_names}")  # noqa: G004
     eval_logger.info("Loading selected tasks...")

     request_caching_args = evaluator.request_caching_arg_to_dict(cache_requests=args.cache_requests)
@@ -164,14 +161,14 @@ def cli_evaluate(args) -> None:
             wandb_logger.log_eval_result()
             if args.log_samples:
                 wandb_logger.log_eval_samples(samples)
-        except Exception as e:
-            eval_logger.info(f"Logging to Weights and Biases failed due to {e}")
+        except Exception as e:  # noqa: BLE001
+            eval_logger.info(f"Logging to Weights and Biases failed due to {e}")  # noqa: G004

     if args.output_path:
         output_path_file.open("w", encoding="utf-8").write(dumped)

         if args.log_samples:
-            for task_name, config in results["configs"].items():
+            for task_name, config in results["configs"].items():  # noqa: B007
                 output_name = "{}_{}".format(re.sub("/|=", "__", args.model_args), task_name)
                 filename = path.joinpath(f"{output_name}.jsonl")
                 samples_dumped = json.dumps(
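Two suppressions in this hunk have conventional non-`noqa` fixes: `B007` flags a loop control variable the body never uses (usually addressed by giving it a leading underscore), and `BLE001` flags the blanket `except Exception` (usually addressed by catching the narrowest exception that can actually be handled). A rough sketch with placeholder data:

# B007: underscore-prefix a loop variable you do not use
configs = {"task_a": {}, "task_b": {}}
for task_name, _config in configs.items():
    print(task_name)

# BLE001: catch only what you can handle
try:
    value = int("not a number")
except ValueError as exc:
    print(f"conversion failed: {exc}")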
@@ -11,14 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from __future__ import annotations
+
 import collections
 import itertools
 import logging
 import random
 import time
-from typing import TYPE_CHECKING, List, Optional, Union
+from typing import TYPE_CHECKING

 import lm_eval.api.metrics
 import lm_eval.api.registry
@@ -40,30 +40,30 @@
 @lm_eval.utils.positional_deprecated
 def simple_evaluate(
     model,
-    model_args: Optional[Union[str, dict, object]] = None,
-    tasks: Optional[List[Union[str, dict, object]]] = None,
-    num_fewshot: Optional[int] = None,
-    batch_size: Optional[int] = None,
-    max_batch_size: Optional[int] = None,
-    provider: Optional[str] = None,
-    use_cache: Optional[str] = None,
+    model_args: str | dict | object | None = None,
+    tasks: list[str | dict | object] | None = None,
+    num_fewshot: int | None = None,
+    batch_size: int | None = None,
+    max_batch_size: int | None = None,
+    provider: str | None = None,
+    use_cache: str | None = None,
     cache_requests: bool = False,
     rewrite_requests_cache: bool = False,
     delete_requests_cache: bool = False,
-    limit: Optional[Union[int, float]] = None,
+    limit: int | float | None = None,
     bootstrap_iters: int = 100000,
     check_integrity: bool = False,
     write_out: bool = False,
     log_samples: bool = True,
-    gen_kwargs: Optional[str] = None,
-    task_manager: Optional[lm_eval.tasks.TaskManager] = None,
+    gen_kwargs: str | None = None,
+    task_manager: lm_eval.tasks.TaskManager | None = None,
     verbosity: str = "INFO",
     predict_only: bool = False,
     random_seed: int = 0,
     numpy_random_seed: int = 1234,
     torch_random_seed: int = 1234,
-    user_model: Optional[object] = None,
-    tokenizer: Optional[object] = None,
+    user_model: object | None = None,
+    tokenizer: object | None = None,
 ):
     """Instantiate and evaluate a model on a list of tasks.

@@ -178,7 +178,7 @@ def simple_evaluate(
         elif isinstance(user_model, optimum.onnxruntime.ORTModelForSeq2SeqLM):
             model_id = "optimum/t5-small"
         lm_eval.utils.eval_logger.info(
-            "We use '{}' to build `LM` instance, the actually run model is user_model you passed.".format(model_id)
+            f"We use '{model_id}' to build `LM` instance, the actually run model is user_model you passed."
         )
         lm = lm_eval.api.registry.get_model(model).create_from_arg_string(
             "pretrained=" + model_id,
@@ -193,7 +193,7 @@ def simple_evaluate(
         if tokenizer is not None:
             lm.tokenizer = tokenizer
         else:
-            assert False, "Please provide tokenizer in evaluation function"
+            raise AssertionError("Please provide tokenizer in evaluation function")
     elif isinstance(model_args, dict):
         lm = lm_eval.api.registry.get_model(model).create_from_arg_obj(
             model_args,
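Replacing `assert False, ...` with an explicit raise matches ruff's B011 fix: `assert` statements are removed entirely when Python runs with `-O`, so the old line could silently become a no-op, while `raise AssertionError(...)` always fires. A minimal illustration (hypothetical helper):

def require_tokenizer(tokenizer):
    if tokenizer is None:
        # Unlike "assert tokenizer is not None, ...", this survives python -O
        raise AssertionError("Please provide tokenizer in evaluation function")
    return tokenizer

Arguably a `ValueError` or `TypeError` would describe the problem better, but the auto-fix keeps the original exception type.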
@@ -231,7 +231,7 @@ def simple_evaluate(
         task_manager = lm_eval.tasks.TaskManager(verbosity)

     task_dict = lm_eval.tasks.get_task_dict(tasks, task_manager)
-    for task_name in task_dict.keys():
+    for task_name in task_dict:
         task_obj = task_dict[task_name]
         if isinstance(task_obj, tuple):
             _, task_obj = task_obj
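Iterating a dict yields its keys, so dropping `.keys()` changes nothing; the same holds for the membership tests simplified later in this diff. A quick check:

task_dict = {"arc_easy": object(), "hellaswag": object()}

assert list(task_dict) == list(task_dict.keys())  # iteration is over keys
assert ("arc_easy" in task_dict) == ("arc_easy" in task_dict.keys())  # membership also checks keys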
@@ -255,18 +255,16 @@ def simple_evaluate(
         if num_fewshot is not None:
             if (default_num_fewshot := task_obj.get_config("num_fewshot")) == 0:
                 lm_eval.utils.eval_logger.info(
-                    f"num_fewshot has been set to 0 for {task_name} in its config."
+                    f"num_fewshot has been set to 0 for {task_name} in its config."  # noqa: ISC003
                     + "Manual configuration will be ignored."
                 )
             else:
                 lm_eval.utils.eval_logger.warning(
                     f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}"
                 )
                 task_obj.set_config(key="num_fewshot", value=num_fewshot)
-        else:
-            # if num_fewshot not provided, and the task does not define a default one, default to 0
-            if (default_num_fewshot := task_obj.get_config("num_fewshot")) is None:
-                task_obj.set_config(key="num_fewshot", value=0)
+        elif (default_num_fewshot := task_obj.get_config("num_fewshot")) is None:
+            task_obj.set_config(key="num_fewshot", value=0)

     if check_integrity:
         lm_eval.evaluator_utils.run_task_tests(task_list=tasks)
@@ -307,7 +305,7 @@ def simple_evaluate(
         results["date"] = start_date
         try:
             lm_eval.logging_utils.add_env_info(results)  # additional environment info to results
-        except:
+        except:  # noqa: E722
             lm_eval.utils.eval_logger.info("get env info failed.")
         return results
     else:
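The bare `except:` is only annotated here (`E722`), not rewritten. A bare except also traps `KeyboardInterrupt` and `SystemExit`; the usual narrowing, if the change were made, is `except Exception`, sketched below with a stand-in for the env-info call:

def gather_env_info() -> dict:
    # hypothetical stand-in for lm_eval.logging_utils.add_env_info
    raise RuntimeError("torch not installed")


results = {}
try:
    results["env"] = gather_env_info()
except Exception:  # still broad, but KeyboardInterrupt/SystemExit propagate
    print("get env info failed.")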
@@ -316,12 +314,12 @@

 @lm_eval.utils.positional_deprecated
 def evaluate(
-    lm: "lm_eval.api.model.LM",
+    lm: lm_eval.api.model.LM,
     task_dict,
-    limit: Optional[int] = None,
+    limit: int | None = None,
     cache_requests: bool = False,
     rewrite_requests_cache: bool = False,
-    bootstrap_iters: Optional[int] = 100000,
+    bootstrap_iters: int | None = 100000,
     write_out: bool = False,
     log_samples: bool = True,
     verbosity: str = "INFO",
@@ -362,9 +360,7 @@ def evaluate(
     # get lists of group hierarchy and each type of request
     task_hierarchy, eval_tasks = lm_eval.evaluator_utils.get_task_list(task_dict)
     if not log_samples:
-        if not all(
-            "bypass" not in getattr(task_output.task, "_metric_fn_list", {}).keys() for task_output in eval_tasks
-        ):
+        if not all("bypass" not in getattr(task_output.task, "_metric_fn_list", {}) for task_output in eval_tasks):
             raise ValueError("log_samples must be True for 'bypass' metric-only tasks")
     for task_output in eval_tasks:
         task: lm_eval.tasks.Task = task_output.task
@@ -420,8 +416,8 @@ def evaluate(
     if lm.world_size > 1:
         lm.accelerator.wait_for_everyone()

-    RANK = lm.rank
-    WORLD_SIZE = lm.world_size
+    RANK = lm.rank  # noqa: N806
+    WORLD_SIZE = lm.world_size  # noqa: N806
     ### Postprocess outputs ###
     # TODO: del model here, maybe (idea: allow user to specify device of e.g. reward model separately)
     for task_output in eval_tasks:
@@ -439,7 +435,7 @@ def evaluate(
         for instances in instances_by_doc_id.values():
             instances.sort(key=lambda x: x.idx)
         # iterate over different filters used
-        for filter_key in task.instances[0].filtered_resps.keys():
+        for filter_key in task.instances[0].filtered_resps:
             doc_iterator = task.doc_iterator(rank=RANK, limit=limit, world_size=WORLD_SIZE)
             for doc_id, doc in doc_iterator:
                 requests = instances_by_doc_id[doc_id]
@@ -506,7 +502,7 @@ def evaluate(
                 {
                     key
                     for task in task_list
-                    for key in results[task].keys()
+                    for key in results[task]
                     if "_stderr" not in key and key not in ["alias", "samples"]
                 }
             )
@@ -537,8 +533,8 @@ def evaluate(
         groups_agg = collections.defaultdict(dict)
         all_tasks_list = list(task_hierarchy.keys())
         while True:
-            add_tasks_list = list(k for k in results_agg.keys())
-            left_tasks_list = sorted(list(set(all_tasks_list) - set(add_tasks_list)))
+            add_tasks_list = list(results_agg.keys())
+            left_tasks_list = sorted(set(all_tasks_list) - set(add_tasks_list))
             if len(left_tasks_list) == 0:
                 break
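Both fixes here are behavior-preserving simplifications: `list(k for k in d.keys())` builds the same list as `list(d.keys())`, and `sorted()` already returns a list, so the inner `list()` around the set difference is redundant. A quick check:

results_agg = {"b": 1, "a": 2}
all_tasks_list = ["a", "b", "c"]

assert list(results_agg.keys()) == list(k for k in results_agg.keys())
assert sorted(set(all_tasks_list) - set(results_agg)) == sorted(list(set(all_tasks_list) - set(results_agg)))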

@@ -11,13 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

+__all__ = ["huggingface"]
+
 from evaluation.models import huggingface

-# TODO: implement __all__
-
-
 try:
     # enable hf hub transfer if available
     import hf_transfer  # type: ignore # noqa
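Adding `__all__ = ["huggingface"]` answers the removed "TODO: implement __all__": it documents the module's public surface and controls what a star import exposes. A tiny hypothetical example of the mechanism:

# shapes/__init__.py  (hypothetical package)
from shapes import circle, square  # submodules re-exported as the package API

__all__ = ["circle"]  # "from shapes import *" brings in circle but not square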