Skip to content

Commit d0e97b4

Browse files
committed
towards unified report code
1 parent a37c356 commit d0e97b4

File tree

7 files changed

+234
-83
lines changed

7 files changed

+234
-83
lines changed

src/discord-cluster-manager/cogs/github_cog.py

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,9 @@
1414
from env import GITHUB_REPO, GITHUB_TOKEN
1515
from github import Github
1616
from leaderboard_eval import cu_eval, py_eval
17+
from run_eval import RunResult, CompileResult, FullResult
1718
from utils import get_github_branch_name, send_discord_message, setup_logging
19+
from report import generate_report
1820

1921
logger = setup_logging()
2022

@@ -92,10 +94,18 @@ async def run_github(
9294

9395
await thread.send(f"Training completed with status: {status}")
9496

95-
if len(logs) > 1900:
96-
await self.bot.send_chunked_message(thread, logs, code_block=True)
97+
if expect_result:
98+
# {"success": True, **json.loads(logs)}
99+
if logs['success']:
100+
await generate_report(thread, logs)
101+
else:
102+
await thread.send(logs['error'])
103+
97104
else:
98-
await thread.send(f"```\n!!Logs!!:\n{logs}\n```")
105+
if len(logs) > 1900:
106+
await self.bot.send_chunked_message(thread, logs, code_block=True)
107+
else:
108+
await thread.send(f"```\nLogs:\n{logs}\n```")
99109

100110
if url:
101111
await thread.send(f"View the full run at: <{url}>")
@@ -233,8 +243,7 @@ async def check_workflow_status(self, run_id, thread, expect_result: bool=False)
233243

234244
if run.status == "completed":
235245
if expect_result:
236-
result = await self.download_results(run_id)
237-
logs = self.make_logs(result)
246+
logs = await self.download_results(run_id)
238247
else:
239248
logs = await self.handle_training_log(run_id)
240249
return run.conclusion, logs, run.html_url
@@ -248,16 +257,16 @@ async def check_workflow_status(self, run_id, thread, expect_result: bool=False)
248257
except Exception as e:
249258
return "error", str(e), None
250259

251-
def make_logs(self, result: dict):
252-
return pprint.pformat(result)
253-
254-
async def download_results(self, run_id):
260+
async def download_results(self, run_id) -> FullResult:
255261
try:
256262
data = await self.download_artifact(run_id, name="run-result")
257263
logs = data['result.json'].decode("utf-8")
258-
return {"success": True, **json.loads(logs)}
264+
data = json.loads(logs)
265+
comp = CompileResult(**data['compile'])
266+
run = RunResult(**data['run'])
267+
return FullResult(success=True, error="", compile=comp, run=run)
259268
except Exception as e:
260-
return {"success": False, "error": f"Error downloading artifacts: {str(e)}"}
269+
return FullResult(success=False, error=f"Error downloading artifacts: {str(e)}", compile=None, run=None)
261270

262271
async def handle_training_log(self, run_id):
263272
try:

src/discord-cluster-manager/cogs/modal_cog.py

Lines changed: 20 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from discord.ext import commands
99
from leaderboard_eval import cu_eval, py_eval
1010
from utils import send_discord_message, send_logs, setup_logging
11+
from report import generate_report
1112

1213
logger = setup_logging()
1314

@@ -66,7 +67,7 @@ async def run_modal(
6667
else (await reference_script.read()).decode("utf-8")
6768
)
6869

69-
result, score = await self.handle_modal_execution(
70+
await self.handle_modal_execution(
7071
interaction,
7172
thread,
7273
script_content,
@@ -75,10 +76,6 @@ async def run_modal(
7576
reference_content,
7677
status_msg,
7778
)
78-
79-
if result is not None and score > 0:
80-
await thread.send(f"**score:{score:.9f}**")
81-
8279
return thread
8380

8481
except Exception as e:
@@ -97,21 +94,26 @@ async def handle_modal_execution(
9794
gpu_type: str,
9895
reference_content: Optional[str],
9996
status_msg: discord.Message,
100-
) -> tuple[str, float]:
97+
):
10198
try:
10299
loop = asyncio.get_event_loop()
103100
func_type = "pytorch" if filename.endswith(".py") else "cuda"
104101
func_name = f"run_{func_type}_script_{gpu_type.lower()}"
105102

106103
if reference_content is not None:
107-
result, score = await loop.run_in_executor(
104+
result = await loop.run_in_executor(
108105
None,
109106
lambda: modal.Function.lookup("discord-bot-runner", func_name).remote(
110107
py_eval if filename.endswith(".py") else cu_eval,
111108
reference_content=reference_content,
112109
submission_content=script_content,
113110
),
114111
)
112+
113+
# Send results
114+
await thread.send(f"\n**Script size:** {len(script_content)} bytes")
115+
await generate_report(thread, result)
116+
115117
else:
116118
result, score = await loop.run_in_executor(
117119
None,
@@ -123,21 +125,20 @@ async def handle_modal_execution(
123125
interaction, f"Modal job completed in thread {thread.jump_url}", ephemeral=True
124126
)
125127

126-
# Send results
127-
await thread.send(f"\n**Script size:** {len(script_content)} bytes")
128-
await thread.send(f"**Execution time:** {score:.3f} s\n")
128+
# Send results
129+
await thread.send(f"\n**Script size:** {len(script_content)} bytes")
130+
await thread.send(f"**Execution time:** {score:.3f} s\n")
129131

130-
if "check_implementation failed" in result or "Error" in result:
131-
await thread.send("Modal run failed.\n")
132-
await send_logs(thread, result)
133-
await status_msg.edit(content="**Running on Modal...**\n> ❌ Job failed!")
134-
return result, 0
132+
if "check_implementation failed" in result or "Error" in result:
133+
await thread.send("Modal run failed.\n")
134+
await send_logs(thread, result)
135+
await status_msg.edit(content="**Running on Modal...**\n> ❌ Job failed!")
136+
return result, 0
135137

136-
if result is not None:
137-
await thread.send(f"**score:{score:.9f}**\n```")
138+
if result is not None:
139+
await thread.send(f"**score:{score:.9f}**\n```")
138140

139-
await status_msg.edit(content="**Running on Modal...**\n> ✅ Job completed!")
140-
return result, score
141+
await status_msg.edit(content="**Running on Modal...**\n> ✅ Job completed!")
141142

142143
except Exception as e:
143144
logger.error(f"Error in handle_modal_execution: {str(e)}", exc_info=True)

src/discord-cluster-manager/cogs/verify_run_cog.py

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,43 @@ async def verify_modal_run(self, modal_cog: ModalCog, interaction: discord.Inter
120120
)
121121
return False
122122

123+
async def verify_modal_cuda(self, modal_cog: ModalCog, interaction: discord.Interaction) -> bool:
124+
cuda_file = create_mock_attachment("test.cu", Path("examples/identity_cuda/submission.cuh").read_text())
125+
reference_code = Path("examples/identity_cuda/reference.cuh").read_text()
126+
127+
t4 = app_commands.Choice(name="T4", value="t4")
128+
modal_command = modal_cog.run_modal
129+
130+
modal_thread = await modal_command.callback(modal_cog, interaction, cuda_file, t4, reference_code=reference_code)
131+
132+
message_contents = [msg.content async for msg in modal_thread.history(limit=None)]
133+
134+
required_patterns = ["Running on Modal...", "Job completed!"]
135+
136+
all_patterns_found = all(
137+
any(re.search(pattern, content, re.DOTALL) is not None for content in message_contents)
138+
for pattern in required_patterns
139+
)
140+
141+
if all_patterns_found:
142+
await send_discord_message(
143+
interaction,
144+
"✅ Modal run completed successfully - all expected messages found!",
145+
)
146+
return True
147+
else:
148+
missing_patterns = [
149+
pattern
150+
for pattern in required_patterns
151+
if not any(re.search(pattern, content, re.DOTALL) for content in message_contents)
152+
]
153+
await send_discord_message(
154+
interaction,
155+
"❌ Modal run verification failed. Missing expected messages:\n"
156+
+ "\n".join(f"- {pattern}" for pattern in missing_patterns),
157+
)
158+
return False
159+
123160
@app_commands.command(name="verifyruns")
124161
async def verify_runs(self, interaction: discord.Interaction):
125162
"""Verify runs on Modal, GitHub Nvidia, and GitHub AMD."""
@@ -139,7 +176,8 @@ async def verify_runs(self, interaction: discord.Interaction):
139176
amd = app_commands.Choice(name="AMD", value="amd")
140177

141178
results = await asyncio.gather(
142-
self.verify_github_run(github_cog, nvidia, interaction),
179+
#self.verify_github_run(github_cog, nvidia, interaction),
180+
self.verify_modal_cuda(modal_cog, interaction),
143181
#self.verify_github_run(github_cog, amd, interaction),
144182
#self.verify_modal_run(modal_cog, interaction),
145183
)

src/discord-cluster-manager/modal_runner.py

Lines changed: 7 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
from consts import MODAL_CUDA_INCLUDE_DIRS, MODAL_PATH
66
from modal import App, Image, Mount
7-
from run_eval import run_cuda_script, run_pytorch_script
7+
from run_eval import run_cuda_script, run_pytorch_script, CompileResult, RunResult, FullResult
88

99
# Create a stub for the Modal app
1010
# IMPORTANT: This has to stay in separate file or modal breaks
@@ -96,57 +96,20 @@ def modal_run_cuda_script( # # noqa: C901
9696
submission_content: str = None,
9797
timeout_seconds: int = 600,
9898
arch: int = None,
99-
) -> tuple[str, float]:
99+
) -> FullResult:
100100
"""Modal version of run_cuda_script, handling timeouts"""
101101
try:
102102
with timeout(timeout_seconds):
103-
compile_result, run_result = run_cuda_script(
103+
comp, run = run_cuda_script(
104104
script_content,
105105
reference_content=reference_content,
106106
submission_content=submission_content,
107107
arch=arch,
108108
include_dirs=MODAL_CUDA_INCLUDE_DIRS,
109109
)
110-
111-
if not compile_result.success:
112-
if not compile_result.nvcc_found:
113-
return (
114-
"Error executing script: NVCC not found:\n"
115-
+ f"command `{compile_result.command}` failed with exit code {compile_result.exit_code}:\n"
116-
+ compile_result.stderr,
117-
0.0,
118-
)
119-
return (
120-
"Error executing script: CUDA compilation failed with return code "
121-
+ f"{compile_result.exit_code}:\n{compile_result.stderr}\n"
122-
+ f"compile command: `{compile_result.command}`",
123-
0.0,
124-
)
125-
126-
if not run_result.success:
127-
# exit code 1 encodes failed tests
128-
if run_result.exit_code == 1:
129-
return f"check_implementation failed:\n{run_result.stderr}", 0.0
130-
else:
131-
return (
132-
f"Script failed with exit code ({run_result.exit_code}):\n{run_result.stderr}",
133-
0.0,
134-
)
135-
136-
print("run process stdout:", run_result.stdout)
137-
print("run process stderr:", run_result.stderr)
138-
139-
score = float(run_result.result.get("duration.mean", "0.0")) / 1e9
140-
passed = run_result.result.get("check", "") == "pass"
141-
if not passed:
142-
return "check_implementation failed", 0.0
143-
144-
if score is None:
145-
return run_result.stdout, run_result.duration
146-
147-
return run_result.stdout, score
148-
110+
return FullResult(success=True, error="", compile=comp, run=run)
111+
# TODO fixup error handling!
149112
except TimeoutException as e:
150-
return f"Timeout Error: {str(e)}", 0.0
113+
return FullResult(success=False, error=f"Timeout Error: {str(e)}", compile=None, run=None)
151114
except Exception as e:
152-
return f"Error executing script: {str(e)}", 0.0
115+
return FullResult(success=False, error=f"Error executing script: {str(e)}", compile=None, run=None)

src/discord-cluster-manager/modal_runner_archs.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
from consts import GPU_TO_SM
66
from modal_runner import app, cuda_image, modal_run_cuda_script, modal_run_pytorch_script
7+
from run_eval import CompileResult, RunResult, FullResult
78

89

910
# T4: sm_75 (CUDA compute capability 7.5, Turing architecture)
@@ -16,7 +17,7 @@ def run_cuda_script_t4(
1617
reference_content: str = None,
1718
submission_content: str = None,
1819
timeout_seconds: int = 600,
19-
) -> tuple[str, float]:
20+
) -> FullResult:
2021
return modal_run_cuda_script(
2122
script_content,
2223
reference_content,
@@ -55,7 +56,7 @@ def run_cuda_script_l4(
5556
reference_content: str = None,
5657
submission_content: str = None,
5758
timeout_seconds: int = 600,
58-
) -> tuple[str, float]:
59+
) -> FullResult:
5960
return modal_run_cuda_script(
6061
script_content,
6162
reference_content,
@@ -74,7 +75,7 @@ def run_pytorch_script_l4(
7475
reference_content: str = None,
7576
submission_content: str = None,
7677
timeout_seconds: int = 600,
77-
) -> tuple[str, float]:
78+
) -> FullResult:
7879
return modal_run_cuda_script(
7980
script_content,
8081
reference_content,
@@ -94,7 +95,7 @@ def run_cuda_script_a100(
9495
reference_content: str = None,
9596
submission_content: str = None,
9697
timeout_seconds: int = 600,
97-
) -> tuple[str, float]:
98+
) -> FullResult:
9899
return modal_run_cuda_script(
99100
script_content,
100101
reference_content,
@@ -113,7 +114,7 @@ def run_pytorch_script_a100(
113114
reference_content: str = None,
114115
submission_content: str = None,
115116
timeout_seconds: int = 600,
116-
) -> tuple[str, float]:
117+
) -> FullResult:
117118
return modal_run_cuda_script(
118119
script_content,
119120
reference_content,
@@ -133,7 +134,7 @@ def run_cuda_script_h100(
133134
reference_content: str = None,
134135
submission_content: str = None,
135136
timeout_seconds: int = 600,
136-
) -> tuple[str, float]:
137+
) -> FullResult:
137138
return modal_run_cuda_script(
138139
script_content,
139140
reference_content,
@@ -153,7 +154,7 @@ def run_pytorch_script_h100(
153154
submission_content: str = None,
154155
timeout_seconds: int = 600,
155156
) -> tuple[str, float]:
156-
return modal_run_cuda_script(
157+
return modal_run_pytorch_script(
157158
script_content,
158159
reference_content,
159160
submission_content,

0 commit comments

Comments
 (0)