From 32a59c13dc6e58d40ad4b9770add101bc242ad0c Mon Sep 17 00:00:00 2001 From: Matistjati Date: Sat, 6 Dec 2025 20:03:23 +0100 Subject: [PATCH 1/6] Better grader error display --- problemtools/verifyproblem.py | 77 ++++++++++++++++++++++++++--------- 1 file changed, 57 insertions(+), 20 deletions(-) diff --git a/problemtools/verifyproblem.py b/problemtools/verifyproblem.py index 2538f0c2..9039cfd0 100644 --- a/problemtools/verifyproblem.py +++ b/problemtools/verifyproblem.py @@ -39,7 +39,7 @@ from .version import add_version_arg from abc import ABC -from typing import Any, Callable, ClassVar, Literal, Pattern, Match, ParamSpec, TypeVar +from typing import Any, Callable, ClassVar, Literal, Pattern, Match, ParamSpec, TypeVar, cast from pydantic import ValidationError random.seed(42) @@ -1268,28 +1268,46 @@ def grade( self.debug(f'Grading {len(sub_results)} results:\n{grader_input}') self.debug(f'Grader flags: {grader_flags}') + grader_results = [] + grader_names = [] for grader in graders: - if grader is not None and grader.compile()[0]: - fd, infile = tempfile.mkstemp() - os.close(fd) - fd, outfile = tempfile.mkstemp() - os.close(fd) + if not grader or not grader.compile()[0]: + continue + + infile_path = outfile_path = errfile_path = None + try: + # Create input and output files for grader + # We do it in this awkward way because the files need to be closed before reading/writing + with tempfile.NamedTemporaryFile(mode='w', delete=False) as infile: + infile.write(grader_input) + infile_path = infile.name + + with tempfile.NamedTemporaryFile(delete=False) as outfile: + outfile_path = outfile.name - open(infile, 'w').write(grader_input) + with tempfile.NamedTemporaryFile(delete=False) as errfile: + errfile_path = errfile.name - status, runtime = grader.run(infile, outfile, args=grader_flags) + status, runtime = grader.run(infile_path, outfile_path, errfile=errfile_path, args=grader_flags) - grader_output = open(outfile, 'r').read() - os.remove(infile) - os.remove(outfile) + with open(outfile_path, 'r') as fh: + grader_output = fh.read() + + with open(errfile_path, 'r') as errfile: + stderr_content = errfile.read() + + clean_exit = True + ret = os.WEXITSTATUS(status) if not os.WIFEXITED(status): + clean_exit = False self.error(f'Judge error: {grader} crashed') - self.debug(f'Grader input:\n{grader_input}') - return ('JE', None) - ret = os.WEXITSTATUS(status) - if ret != 0: + elif ret != 0: + clean_exit = False self.error(f'Judge error: exit code {ret} for grader {grader}, expected 0') - self.debug(f'Grader input: {grader_input}\n') + + if not clean_exit: + self.error(f'Grader stderr:\n{stderr_content}\n') + self.debug(f'Grader input:\n{grader_input}') return ('JE', None) if not re.match(grader_output_re, grader_output): @@ -1299,10 +1317,29 @@ def grade( return ('JE', None) verdict_str, score_str = grader_output.split() - verdict = verdict_str # type: ignore - score = float(score_str) - # TODO: check that all graders give same result - + # Make mypy happy with cast + grader_results.append((cast(Verdict, verdict_str), float(score_str))) + grader_names.append(Path(grader.path).name) + finally: + for path in [infile_path, outfile_path, errfile_path]: + if path: + try: + os.remove(path) + except OSError: + pass + + if not grader_results: + self.error('Graders failed') + return ('JE', None) + + if not all(result == grader_results[0] for result in grader_results): + self.error('Different graders gave different results:') + for i, (verdict, score) in enumerate(grader_results): + self.error(f'{grader_names[i]}: {verdict} {score}') + return ('JE', None) + + verdict = grader_results[0][0] + score = grader_results[0][1] if not shadow_result: self.debug(f'Grade on {testcasegroup} is {verdict} ({score})') From 93b18d73daab91e9a4c55a61e49022027ecff93f Mon Sep 17 00:00:00 2001 From: Matistjati Date: Sat, 6 Dec 2025 20:10:49 +0100 Subject: [PATCH 2/6] Minor fixes --- problemtools/verifyproblem.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/problemtools/verifyproblem.py b/problemtools/verifyproblem.py index 9039cfd0..29accab6 100644 --- a/problemtools/verifyproblem.py +++ b/problemtools/verifyproblem.py @@ -1288,7 +1288,7 @@ def grade( with tempfile.NamedTemporaryFile(delete=False) as errfile: errfile_path = errfile.name - status, runtime = grader.run(infile_path, outfile_path, errfile=errfile_path, args=grader_flags) + status, runtime = grader.run(infile_path, outfile_path, errfile_path, args=grader_flags) with open(outfile_path, 'r') as fh: grader_output = fh.read() @@ -1317,7 +1317,7 @@ def grade( return ('JE', None) verdict_str, score_str = grader_output.split() - # Make mypy happy with cast + # Make mypy happy by explicitly using cast grader_results.append((cast(Verdict, verdict_str), float(score_str))) grader_names.append(Path(grader.path).name) finally: @@ -1329,7 +1329,7 @@ def grade( pass if not grader_results: - self.error('Graders failed') + self.error('Grader failed') return ('JE', None) if not all(result == grader_results[0] for result in grader_results): From 1a0bddf3ca404c9375a85aafb4d81d1045dc60cf Mon Sep 17 00:00:00 2001 From: Matistjati Date: Sun, 7 Dec 2025 16:25:13 +0100 Subject: [PATCH 3/6] Remove support for multiple graders --- problemtools/verifyproblem.py | 148 +++++++++++++++++----------------- 1 file changed, 72 insertions(+), 76 deletions(-) diff --git a/problemtools/verifyproblem.py b/problemtools/verifyproblem.py index 29accab6..50ca98c6 100644 --- a/problemtools/verifyproblem.py +++ b/problemtools/verifyproblem.py @@ -590,7 +590,7 @@ def check(self, context: Context) -> bool: if self.config['grading'] not in ['default', 'custom']: self.error('Invalid grading policy in testdata.yaml') - if self.config['grading'] == 'custom' and len(self._problem.graders._graders) == 0: + if self.config['grading'] == 'custom' and self._problem.graders._grader is None: self._problem.graders.fatal(f'{self} has custom grading but no custom graders provided') if self.config['grading'] == 'default' and Graders._default_grader is None: self._problem.graders.fatal(f'{self} has default grading but I could not find default grader') @@ -1223,11 +1223,14 @@ class Graders(ProblemPart): PART_NAME = 'grader' def setup(self): - self._graders: list = run.find_programs( + graders: list = run.find_programs( os.path.join(self.problem.probdir, 'graders'), language_config=self.problem.language_config, work_dir=self.problem.tmpdir, ) + if len(graders) > 1: + self.fatal('There is more than one custom grader') + self._grader = graders[0] if graders else None return {} def __str__(self) -> str: @@ -1238,22 +1241,32 @@ def check(self, context: Context) -> bool: return self._check_res self._check_res = True - if self.problem.is_pass_fail() and len(self._graders) > 0: - self.error('There are grader programs but the problem is pass-fail') + if self._grader: + if self.problem.is_pass_fail() and self._grader: + self.fatal('There is a grader but the problem is pass-fail') - for grader in self._graders: - success, msg = grader.compile() + success, msg = self._grader.compile() if not success: - self.fatal(f'Compile error for {grader}', msg) + self.fatal(f'Compile error for {self._grader}', msg) return self._check_res def grade( self, sub_results: list[SubmissionResult], testcasegroup: TestCaseGroup, shadow_result: bool = False ) -> tuple[Verdict, float | None]: if testcasegroup.config['grading'] == 'default': - graders = [self._default_grader] + if not self._default_grader: + self.fatal('Failed to locate default grader') + return ('JE', None) + grader = self._default_grader else: - graders = self._graders + if not self._grader: + self.fatal('Problem has grading: custom without any custom grader') + return ('JE', None) + grader = self._grader + + if not grader.compile()[0]: + self.fatal(f'Failed to compile grader {grader}') + return ('JE', None) grader_input = ''.join([f'{r.verdict} {0 if r.score is None else r.score}\n' for r in sub_results]) grader_output_re = r'^((AC)|(WA)|(TLE)|(RTE)|(JE))\s+-?[0-9.]+\s*$' @@ -1261,89 +1274,72 @@ def grade( score: float = 0 if not sub_results: - self.info('No results on %s, so no graders ran' % (testcasegroup,)) + self.info('No results on %s, so no grader ran' % (testcasegroup,)) return (verdict, score) grader_flags = testcasegroup.config['grader_flags'].split() self.debug(f'Grading {len(sub_results)} results:\n{grader_input}') self.debug(f'Grader flags: {grader_flags}') - grader_results = [] - grader_names = [] - for grader in graders: - if not grader or not grader.compile()[0]: - continue + infile_path = outfile_path = errfile_path = None + try: + # Create input and output files for grader + # We do it in this awkward way because the files need to be closed before reading/writing + with tempfile.NamedTemporaryFile(mode='w', delete=False) as infile: + infile.write(grader_input) + infile_path = infile.name - infile_path = outfile_path = errfile_path = None - try: - # Create input and output files for grader - # We do it in this awkward way because the files need to be closed before reading/writing - with tempfile.NamedTemporaryFile(mode='w', delete=False) as infile: - infile.write(grader_input) - infile_path = infile.name + with tempfile.NamedTemporaryFile(delete=False) as outfile: + outfile_path = outfile.name - with tempfile.NamedTemporaryFile(delete=False) as outfile: - outfile_path = outfile.name + with tempfile.NamedTemporaryFile(delete=False) as errfile: + errfile_path = errfile.name - with tempfile.NamedTemporaryFile(delete=False) as errfile: - errfile_path = errfile.name + status, runtime = grader.run(infile_path, outfile_path, errfile_path, args=grader_flags) - status, runtime = grader.run(infile_path, outfile_path, errfile_path, args=grader_flags) + with open(outfile_path, 'r') as fh: + grader_output = fh.read() - with open(outfile_path, 'r') as fh: - grader_output = fh.read() + with open(errfile_path, 'r') as errfile: + stderr_content = errfile.read() - with open(errfile_path, 'r') as errfile: - stderr_content = errfile.read() + if not os.WIFEXITED(status): + self.error(f'Judge error: {grader} crashed') + self.error(f'Grader stderr:\n{stderr_content}\n') + self.debug(f'Grader input:\n{grader_input}') + return ('JE', None) - clean_exit = True - ret = os.WEXITSTATUS(status) - if not os.WIFEXITED(status): - clean_exit = False - self.error(f'Judge error: {grader} crashed') - elif ret != 0: - clean_exit = False - self.error(f'Judge error: exit code {ret} for grader {grader}, expected 0') - - if not clean_exit: - self.error(f'Grader stderr:\n{stderr_content}\n') - self.debug(f'Grader input:\n{grader_input}') - return ('JE', None) - - if not re.match(grader_output_re, grader_output): - self.error('Judge error: invalid format of grader output') - self.debug(f'Output must match: "{grader_output_re}"') - self.debug(f'Output was: "{grader_output}"') - return ('JE', None) - - verdict_str, score_str = grader_output.split() - # Make mypy happy by explicitly using cast - grader_results.append((cast(Verdict, verdict_str), float(score_str))) - grader_names.append(Path(grader.path).name) - finally: - for path in [infile_path, outfile_path, errfile_path]: - if path: - try: - os.remove(path) - except OSError: - pass - - if not grader_results: - self.error('Grader failed') - return ('JE', None) + if os.WEXITSTATUS(status) != 0: + self.error(f'Judge error: exit code {os.WEXITSTATUS(status)} for grader {grader}, expected 0') + self.error(f'Grader stderr:\n{stderr_content}\n') + self.debug(f'Grader input:\n{grader_input}') + return ('JE', None) - if not all(result == grader_results[0] for result in grader_results): - self.error('Different graders gave different results:') - for i, (verdict, score) in enumerate(grader_results): - self.error(f'{grader_names[i]}: {verdict} {score}') - return ('JE', None) + if not re.match(grader_output_re, grader_output): + self.error('Judge error: invalid format of grader output') + self.debug(f'Output must match: "{grader_output_re}"') + self.debug(f'Output was: "{grader_output}"') + return ('JE', None) + + verdict_str, score_str = grader_output.split() + # Make mypy happy by explicitly using cast + verdict = cast(Verdict, verdict_str) + score = float(score_str) - verdict = grader_results[0][0] - score = grader_results[0][1] - if not shadow_result: - self.debug(f'Grade on {testcasegroup} is {verdict} ({score})') + if not shadow_result: + self.debug(f'Grade on {testcasegroup} is {verdict} ({score})') - return (verdict, score) + return (verdict, score) + except Exception as e: + self.error(f'Grader failed with exception {e}') + return ('JE', None) + finally: + for path in [infile_path, outfile_path, errfile_path]: + if path: + try: + os.remove(path) + except OSError: + pass class OutputValidators(ProblemPart): From 004416040a3ebc1fe8d84995f57e6ec3b2239a2a Mon Sep 17 00:00:00 2001 From: Matistjati Date: Sun, 7 Dec 2025 16:44:03 +0100 Subject: [PATCH 4/6] Use f-string --- problemtools/verifyproblem.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/problemtools/verifyproblem.py b/problemtools/verifyproblem.py index 50ca98c6..7af9408a 100644 --- a/problemtools/verifyproblem.py +++ b/problemtools/verifyproblem.py @@ -1274,7 +1274,7 @@ def grade( score: float = 0 if not sub_results: - self.info('No results on %s, so no grader ran' % (testcasegroup,)) + self.info(f'No results on {testcasegroup}, so no grader ran') return (verdict, score) grader_flags = testcasegroup.config['grader_flags'].split() From f3dd2426f864f4b2dbf6e7020a7820e59e720d31 Mon Sep 17 00:00:00 2001 From: Matistjati Date: Sun, 7 Dec 2025 19:39:41 +0100 Subject: [PATCH 5/6] Rewrite some if:s to be more DRY --- problemtools/verifyproblem.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/problemtools/verifyproblem.py b/problemtools/verifyproblem.py index 7af9408a..54e5cdd4 100644 --- a/problemtools/verifyproblem.py +++ b/problemtools/verifyproblem.py @@ -1303,14 +1303,11 @@ def grade( with open(errfile_path, 'r') as errfile: stderr_content = errfile.read() - if not os.WIFEXITED(status): - self.error(f'Judge error: {grader} crashed') - self.error(f'Grader stderr:\n{stderr_content}\n') - self.debug(f'Grader input:\n{grader_input}') - return ('JE', None) - - if os.WEXITSTATUS(status) != 0: - self.error(f'Judge error: exit code {os.WEXITSTATUS(status)} for grader {grader}, expected 0') + if not os.WIFEXITED(status) or os.WEXITSTATUS(status) != 0: + if not os.WIFEXITED(status): + self.error(f'Judge error: {grader} crashed') + else: + self.error(f'Judge error: exit code {os.WEXITSTATUS(status)} for grader {grader}, expected 0') self.error(f'Grader stderr:\n{stderr_content}\n') self.debug(f'Grader input:\n{grader_input}') return ('JE', None) From 01a3865ef5347ef77b564b99bc10b8cedba514ab Mon Sep 17 00:00:00 2001 From: Matistjati Date: Mon, 8 Dec 2025 18:37:53 +0100 Subject: [PATCH 6/6] Better error message for compile error --- problemtools/verifyproblem.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/problemtools/verifyproblem.py b/problemtools/verifyproblem.py index 54e5cdd4..ce59954f 100644 --- a/problemtools/verifyproblem.py +++ b/problemtools/verifyproblem.py @@ -1265,7 +1265,7 @@ def grade( grader = self._grader if not grader.compile()[0]: - self.fatal(f'Failed to compile grader {grader}') + self.fatal(f'Failed to compile grader {grader}', grader.compile()[1]) return ('JE', None) grader_input = ''.join([f'{r.verdict} {0 if r.score is None else r.score}\n' for r in sub_results])