# Standard
- from enum import StrEnum
+ from copy import deepcopy
from pathlib import Path
import gc
import json
import os
import typing as t
- from copy import deepcopy

# Third Party
from accelerate import Accelerator
# Local
from .evaluator import Evaluator

+ # StrEnum wasn't part of the standard library until Python 3.11, so fall back to the third-party backport
+ try:
+     # Standard
+     from enum import StrEnum
+ except ImportError as ie:
+     # Third Party
+     from strenum import StrEnum  # type: ignore[no-redef]
+

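As an aside, the try/except import above is a common compatibility shim; a minimal standalone sketch of the same pattern (the enum and its member values below are made up for illustration, and the third-party `strenum` backport is assumed to be installed on Python < 3.11):

```python
try:
    from enum import StrEnum  # available in Python 3.11+
except ImportError:
    from strenum import StrEnum  # third-party backport for older interpreters


class ExampleTask(StrEnum):
    # hypothetical members, not the real task names used in this module
    BBH = "leaderboard_bbh"
    IFEVAL = "leaderboard_ifeval"


# StrEnum members are real str instances, so they can be passed straight
# to APIs that expect plain task-name strings
assert isinstance(ExampleTask.BBH, str)
assert ExampleTask.IFEVAL == "leaderboard_ifeval"
```
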
class ParsedScores(t.TypedDict):
    """
@@ -94,7 +101,7 @@ class TaskGrouping(t.TypedDict):
}

# 1. Add OpenAI configuration defaults
- DEFAULT_OPENAI_CONFIG = {
+ DEFAULT_OPENAI_CONFIG: t.Dict[str, t.Any] = {
    "max_tokens": 768,
    "temperature": 0.0,
    "seed": 1337,
@@ -194,9 +201,6 @@ def worker(rank, world_size, args: LeaderboardArgs, result_queue: mp.Queue):
def evaluate_with_hf(args: LeaderboardArgs) -> t.Dict[str, t.Any]:
    # we need to use torch.multiprocessing to run each task in a separate process,
    # and then combine the results
-     # Third Party
-     import torch.multiprocessing as mp
-
    num_processes = args["num_gpus"]

    # Create the context and queue within the same context
@@ -222,9 +226,9 @@ def evaluate_with_hf(args: LeaderboardArgs) -> t.Dict[str, t.Any]:
        p.join()

    # extract the result which is not None
-     assert len([res for res in results.values() if res is not None]) == 1, (
-         "we expect exactly 1 process to return a results dict properly"
-     )
+     assert (
+         len([res for res in results.values() if res is not None]) == 1
+     ), "we expect exactly 1 process to return a results dict properly"
    results_dict = [res for res in results.values() if res is not None][0]
    return results_dict

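The spawn/queue/join flow used by `evaluate_with_hf` can be exercised in isolation; a rough, simplified sketch (the worker below is a stand-in for the module's real `worker`, and only rank 0 is assumed to report a result):

```python
# Third Party
import torch.multiprocessing as mp


def _toy_worker(rank: int, world_size: int, result_queue) -> None:
    # pretend only rank 0 produces a results dict, mirroring the assert above
    result_queue.put((rank, {"score": 1.0} if rank == 0 else None))


def run_toy_eval(num_processes: int = 2):
    ctx = mp.get_context("spawn")
    result_queue = ctx.Queue()
    processes = [
        ctx.Process(target=_toy_worker, args=(rank, num_processes, result_queue))
        for rank in range(num_processes)
    ]
    for p in processes:
        p.start()
    # drain one result per process before joining
    results = dict(result_queue.get() for _ in processes)
    for p in processes:
        p.join()
    non_null = [res for res in results.values() if res is not None]
    assert len(non_null) == 1, "we expect exactly 1 process to return a results dict"
    return non_null[0]


if __name__ == "__main__":
    print(run_toy_eval())
```
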
@@ -290,9 +294,9 @@ def parse_bbh(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
    parsed_scores = parse_multitask_results(
        result_dict, LeaderboardV2Tasks.BBH.value, "acc_norm"
    )
-     assert len(parsed_scores["subtasks"]) == 24, (
-         "there should be 24 subtasks of bbh run"
-     )
+     assert (
+         len(parsed_scores["subtasks"]) == 24
+     ), "there should be 24 subtasks of bbh run"
    return parsed_scores


@@ -343,9 +347,9 @@ def parse_ifeval(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
            scores.append(value)
            target_metrics.remove(metric)

-     assert len(scores) == 2, (
-         f"there should only be 2 values extracted in ifeval, got: {len(scores)}"
-     )
+     assert (
+         len(scores) == 2
+     ), f"there should only be 2 values extracted in ifeval, got: {len(scores)}"
    return {
        "score": sum(scores) / 2,
    }
@@ -369,9 +373,9 @@ def parse_gpqa(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
    parsed_scores = parse_multitask_results(
        result_dict, LeaderboardV2Tasks.GPQA.value, "acc_norm"
    )
-     assert len(parsed_scores["subtasks"]) == 3, (
-         f"Expected 3 gpqa scores, got {len(parsed_scores['subtasks'])}"
-     )
+     assert (
+         len(parsed_scores["subtasks"]) == 3
+     ), f"Expected 3 gpqa scores, got {len(parsed_scores['subtasks'])}"
    return parsed_scores


@@ -382,9 +386,9 @@ def parse_math_hard(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
    parsed_scores = parse_multitask_results(
        result_dict, LeaderboardV2Tasks.MATH_HARD.value, "exact_match"
    )
-     assert len(parsed_scores["subtasks"]) == 7, (
-         f"leaderboard_math_hard should have 7 subtasks, found: {len(parsed_scores['subtasks'])}"
-     )
+     assert (
+         len(parsed_scores["subtasks"]) == 7
+     ), f"leaderboard_math_hard should have 7 subtasks, found: {len(parsed_scores['subtasks'])}"
    return parsed_scores


@@ -451,9 +455,9 @@ def get_scores_from_result_dicts(
        # this is just a sanity check step
        benchmarks_already_covered = set(parsed_scores.keys())
        overlapping_benchmarks = benchmarks_already_covered & benchmarks_to_parse
-         assert len(benchmarks_already_covered & benchmarks_to_parse) == 0, (
-             f"expected no overlapping benchmarks but found the following to overlap: {list(overlapping_benchmarks)}"
-         )
+         assert (
+             len(benchmarks_already_covered & benchmarks_to_parse) == 0
+         ), f"expected no overlapping benchmarks but found the following to overlap: {list(overlapping_benchmarks)}"

        # now actually add them
        for benchmark in benchmarks_to_parse:
@@ -486,12 +490,15 @@ def validate_output_path(output_file: str) -> None:

        # Test if we can write to the file by opening it in append mode
        # We don't actually write anything
-         output_path.open("a").close()
+         with output_path.open("a", encoding="utf-8") as _:
+             pass

-     except PermissionError:
-         raise ValueError(f"Permission denied: Cannot write to {output_file}")
-     except OSError as e:
-         raise ValueError(f"Invalid output path: {output_file}. Error: {str(e)}")
+     except PermissionError as pe:
+         raise ValueError(f"Permission denied: Cannot write to {output_file}") from pe
+     except OSError as ose:
+         raise ValueError(
+             f"Invalid output path: {output_file}. Error: {str(ose)}"
+         ) from ose


def validate_leaderboard_v2_tasks(tasks: t.List[str]):
@@ -658,7 +665,7 @@ def save_to_file(self, output_file: t.Optional[str] = None) -> None:
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
-         with open(output_file, "w") as f:
+         with open(output_file, "w", encoding="utf-8") as f:
            json.dump(self._results, f, indent=2)

    def run(
@@ -739,15 +746,6 @@ def run(
        # validation logic
        validate_leaderboard_v2_tasks(tasks)

-         # Only validate GPU requirements when not using an API endpoint
-         if not api_endpoint:
-             if not num_gpus:
-                 num_gpus = cuda.device_count()
-             if num_gpus <= 0 or num_gpus > cuda.device_count():
-                 raise ValueError(
-                     f"invalid value for num_gpus, must be between 1 and {cuda.device_count()}; got: {num_gpus}"
-                 )
-
        if output_file:
            validate_output_path(output_file)

@@ -767,6 +765,14 @@ def run(
            openai_results = evaluate_with_openai(args_openai)
            self._lm_eval_results.append(openai_results)
        else:
+             # Only validate GPU requirements when not using an API endpoint
+             if not num_gpus:
+                 num_gpus = cuda.device_count()
+             if num_gpus <= 0 or num_gpus > cuda.device_count():
+                 raise ValueError(
+                     f"invalid value for num_gpus, must be between 1 and {cuda.device_count()}; got: {num_gpus}"
+                 )
+
            # Only run local evaluation if not using OpenAI API
            if vllm_tasks := grouped_tasks["vllm"]:
                args_vllm: LeaderboardArgs = {
@@ -823,11 +829,11 @@ def evaluate_with_openai(args: LeaderboardArgs) -> t.Dict[str, t.Any]:

    # Add base_url if provided
    if base_url:
-         model_args["base_url"] = base_url
+         model_args.update({"base_url": base_url})

    # Add API key if provided
    if api_key:
-         model_args["api_key"] = api_key
+         model_args.update({"api_key": api_key})

    # Add any remaining backend config options
    model_args.update(backend_config)
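
A condensed sketch of how `model_args` is assembled for an OpenAI-compatible endpoint in this last hunk; the helper name and the `"model"` key below are illustrative, not taken from the module:

```python
import typing as t


def build_model_args(
    model: str,
    base_url: t.Optional[str] = None,
    api_key: t.Optional[str] = None,
    backend_config: t.Optional[t.Dict[str, t.Any]] = None,
) -> t.Dict[str, t.Any]:
    # hypothetical helper mirroring the pattern above
    model_args: t.Dict[str, t.Any] = {"model": model}
    if base_url:
        model_args.update({"base_url": base_url})
    if api_key:
        model_args.update({"api_key": api_key})
    # any remaining backend config options are applied last, so they win
    model_args.update(backend_config or {})
    return model_args


# e.g. build_model_args("my-model", base_url="http://localhost:8000/v1")
```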