Skip to content

Commit d0e97b4

Browse files
committed
towards unified report code
1 parent a37c356 commit d0e97b4

File tree

7 files changed

+234
-83
lines changed

7 files changed

+234
-83
lines changed

src/discord-cluster-manager/cogs/github_cog.py

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,9 @@
1414
from env import GITHUB_REPO, GITHUB_TOKEN
1515
from github import Github
1616
from leaderboard_eval import cu_eval, py_eval
17+
from run_eval import RunResult, CompileResult, FullResult
1718
from utils import get_github_branch_name, send_discord_message, setup_logging
19+
from report import generate_report
1820

1921
logger = setup_logging()
2022

@@ -92,10 +94,18 @@ async def run_github(
9294

9395
await thread.send(f"Training completed with status: {status}")
9496

95-
if len(logs) > 1900:
96-
await self.bot.send_chunked_message(thread, logs, code_block=True)
97+
if expect_result:
98+
# {"success": True, **json.loads(logs)}
99+
if logs['success']:
100+
await generate_report(thread, logs)
101+
else:
102+
await thread.send(logs['error'])
103+
97104
else:
98-
await thread.send(f"```\n!!Logs!!:\n{logs}\n```")
105+
if len(logs) > 1900:
106+
await self.bot.send_chunked_message(thread, logs, code_block=True)
107+
else:
108+
await thread.send(f"```\nLogs:\n{logs}\n```")
99109

100110
if url:
101111
await thread.send(f"View the full run at: <{url}>")
@@ -233,8 +243,7 @@ async def check_workflow_status(self, run_id, thread, expect_result: bool=False)
233243

234244
if run.status == "completed":
235245
if expect_result:
236-
result = await self.download_results(run_id)
237-
logs = self.make_logs(result)
246+
logs = await self.download_results(run_id)
238247
else:
239248
logs = await self.handle_training_log(run_id)
240249
return run.conclusion, logs, run.html_url
@@ -248,16 +257,16 @@ async def check_workflow_status(self, run_id, thread, expect_result: bool=False)
248257
except Exception as e:
249258
return "error", str(e), None
250259

251-
def make_logs(self, result: dict):
252-
return pprint.pformat(result)
253-
254-
async def download_results(self, run_id):
260+
async def download_results(self, run_id) -> FullResult:
255261
try:
256262
data = await self.download_artifact(run_id, name="run-result")
257263
logs = data['result.json'].decode("utf-8")
258-
return {"success": True, **json.loads(logs)}
264+
data = json.loads(logs)
265+
comp = CompileResult(**data['compile'])
266+
run = RunResult(**data['run'])
267+
return FullResult(success=True, error="", compile=comp, run=run)
259268
except Exception as e:
260-
return {"success": False, "error": f"Error downloading artifacts: {str(e)}"}
269+
return FullResult(success=False, error=f"Error downloading artifacts: {str(e)}", compile=None, run=None)
261270

262271
async def handle_training_log(self, run_id):
263272
try:

src/discord-cluster-manager/cogs/modal_cog.py

Lines changed: 20 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from discord.ext import commands
99
from leaderboard_eval import cu_eval, py_eval
1010
from utils import send_discord_message, send_logs, setup_logging
11+
from report import generate_report
1112

1213
logger = setup_logging()
1314

@@ -66,7 +67,7 @@ async def run_modal(
6667
else (await reference_script.read()).decode("utf-8")
6768
)
6869

69-
result, score = await self.handle_modal_execution(
70+
await self.handle_modal_execution(
7071
interaction,
7172
thread,
7273
script_content,
@@ -75,10 +76,6 @@ async def run_modal(
7576
reference_content,
7677
status_msg,
7778
)
78-
79-
if result is not None and score > 0:
80-
await thread.send(f"**score:{score:.9f}**")
81-
8279
return thread
8380

8481
except Exception as e:
@@ -97,21 +94,26 @@ async def handle_modal_execution(
9794
gpu_type: str,
9895
reference_content: Optional[str],
9996
status_msg: discord.Message,
100-
) -> tuple[str, float]:
97+
):
10198
try:
10299
loop = asyncio.get_event_loop()
103100
func_type = "pytorch" if filename.endswith(".py") else "cuda"
104101
func_name = f"run_{func_type}_script_{gpu_type.lower()}"
105102

106103
if reference_content is not None:
107-
result, score = await loop.run_in_executor(
104+
result = await loop.run_in_executor(
108105
None,
109106
lambda: modal.Function.lookup("discord-bot-runner", func_name).remote(
110107
py_eval if filename.endswith(".py") else cu_eval,
111108
reference_content=reference_content,
112109
submission_content=script_content,
113110
),
114111
)
112+
113+
# Send results
114+
await thread.send(f"\n**Script size:** {len(script_content)} bytes")
115+
await generate_report(thread, result)
116+
115117
else:
116118
result, score = await loop.run_in_executor(
117119
None,
@@ -123,21 +125,20 @@ async def handle_modal_execution(
123125
interaction, f"Modal job completed in thread {thread.jump_url}", ephemeral=True
124126
)
125127

126-
# Send results
127-
await thread.send(f"\n**Script size:** {len(script_content)} bytes")
128-
await thread.send(f"**Execution time:** {score:.3f} s\n")
128+
# Send results
129+
await thread.send(f"\n**Script size:** {len(script_content)} bytes")
130+
await thread.send(f"**Execution time:** {score:.3f} s\n")
129131

130-
if "check_implementation failed" in result or "Error" in result:
131-
await thread.send("Modal run failed.\n")
132-
await send_logs(thread, result)
133-
await status_msg.edit(content="**Running on Modal...**\n> ❌ Job failed!")
134-
return result, 0
132+
if "check_implementation failed" in result or "Error" in result:
133+
await thread.send("Modal run failed.\n")
134+
await send_logs(thread, result)
135+
await status_msg.edit(content="**Running on Modal...**\n> ❌ Job failed!")
136+
return result, 0
135137

136-
if result is not None:
137-
await thread.send(f"**score:{score:.9f}**\n```")
138+
if result is not None:
139+
await thread.send(f"**score:{score:.9f}**\n```")
138140

139-
await status_msg.edit(content="**Running on Modal...**\n> ✅ Job completed!")
140-
return result, score
141+
await status_msg.edit(content="**Running on Modal...**\n> ✅ Job completed!")
141142

142143
except Exception as e:
143144
logger.error(f"Error in handle_modal_execution: {str(e)}", exc_info=True)

src/discord-cluster-manager/cogs/verify_run_cog.py

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,43 @@ async def verify_modal_run(self, modal_cog: ModalCog, interaction: discord.Inter
120120
)
121121
return False
122122

123+
async def verify_modal_cuda(self, modal_cog: ModalCog, interaction: discord.Interaction) -> bool:
124+
cuda_file = create_mock_attachment("test.cu", Path("examples/identity_cuda/submission.cuh").read_text())
125+
reference_code = Path("examples/identity_cuda/reference.cuh").read_text()
126+
127+
t4 = app_commands.Choice(name="T4", value="t4")
128+
modal_command = modal_cog.run_modal
129+
130+
modal_thread = await modal_command.callback(modal_cog, interaction, cuda_file, t4, reference_code=reference_code)
131+
132+
message_contents = [msg.content async for msg in modal_thread.history(limit=None)]
133+
134+
required_patterns = ["Running on Modal...", "Job completed!"]
135+
136+
all_patterns_found = all(
137+
any(re.search(pattern, content, re.DOTALL) is not None for content in message_contents)
138+
for pattern in required_patterns
139+
)
140+
141+
if all_patterns_found:
142+
await send_discord_message(
143+
interaction,
144+
"✅ Modal run completed successfully - all expected messages found!",
145+
)
146+
return True
147+
else:
148+
missing_patterns = [
149+
pattern
150+
for pattern in required_patterns
151+
if not any(re.search(pattern, content, re.DOTALL) for content in message_contents)
152+
]
153+
await send_discord_message(
154+
interaction,
155+
"❌ Modal run verification failed. Missing expected messages:\n"
156+
+ "\n".join(f"- {pattern}" for pattern in missing_patterns),
157+
)
158+
return False
159+
123160
@app_commands.command(name="verifyruns")
124161
async def verify_runs(self, interaction: discord.Interaction):
125162
"""Verify runs on Modal, GitHub Nvidia, and GitHub AMD."""
@@ -139,7 +176,8 @@ async def verify_runs(self, interaction: discord.Interaction):
139176
amd = app_commands.Choice(name="AMD", value="amd")
140177

141178
results = await asyncio.gather(
142-
self.verify_github_run(github_cog, nvidia, interaction),
179+
#self.verify_github_run(github_cog, nvidia, interaction),
180+
self.verify_modal_cuda(modal_cog, interaction),
143181
#self.verify_github_run(github_cog, amd, interaction),
144182
#self.verify_modal_run(modal_cog, interaction),
145183
)

src/discord-cluster-manager/modal_runner.py

Lines changed: 7 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
from consts import MODAL_CUDA_INCLUDE_DIRS, MODAL_PATH
66
from modal import App, Image, Mount
7-
from run_eval import run_cuda_script, run_pytorch_script
7+
from run_eval import run_cuda_script, run_pytorch_script, CompileResult, RunResult, FullResult
88

99
# Create a stub for the Modal app
1010
# IMPORTANT: This has to stay in separate file or modal breaks
@@ -96,57 +96,20 @@ def modal_run_cuda_script( # # noqa: C901
9696
submission_content: str = None,
9797
timeout_seconds: int = 600,
9898
arch: int = None,
99-
) -> tuple[str, float]:
99+
) -> FullResult:
100100
"""Modal version of run_cuda_script, handling timeouts"""
101101
try:
102102
with timeout(timeout_seconds):
103-
compile_result, run_result = run_cuda_script(
103+
comp, run = run_cuda_script(
104104
script_content,
105105
reference_content=reference_content,
106106
submission_content=submission_content,
107107
arch=arch,
108108
include_dirs=MODAL_CUDA_INCLUDE_DIRS,
109109
)
110-
111-
if not compile_result.success:
112-
if not compile_result.nvcc_found:
113-
return (
114-
"Error executing script: NVCC not found:\n"
115-
+ f"command `{compile_result.command}` failed with exit code {compile_result.exit_code}:\n"
116-
+ compile_result.stderr,
117-
0.0,
118-
)
119-
return (
120-
"Error executing script: CUDA compilation failed with return code "
121-
+ f"{compile_result.exit_code}:\n{compile_result.stderr}\n"
122-
+ f"compile command: `{compile_result.command}`",
123-
0.0,
124-
)
125-
126-
if not run_result.success:
127-
# exit code 1 encodes failed tests
128-
if run_result.exit_code == 1:
129-
return f"check_implementation failed:\n{run_result.stderr}", 0.0
130-
else:
131-
return (
132-
f"Script failed with exit code ({run_result.exit_code}):\n{run_result.stderr}",
133-
0.0,
134-
)
135-
136-
print("run process stdout:", run_result.stdout)
137-
print("run process stderr:", run_result.stderr)
138-
139-
score = float(run_result.result.get("duration.mean", "0.0")) / 1e9
140-
passed = run_result.result.get("check", "") == "pass"
141-
if not passed:
142-
return "check_implementation failed", 0.0
143-
144-
if score is None:
145-
return run_result.stdout, run_result.duration
146-
147-
return run_result.stdout, score
148-
110+
return FullResult(success=True, error="", compile=comp, run=run)
111+
# TODO fixup error handling!
149112
except TimeoutException as e:
150-
return f"Timeout Error: {str(e)}", 0.0
113+
return FullResult(success=False, error=f"Timeout Error: {str(e)}", compile=None, run=None)
151114
except Exception as e:
152-
return f"Error executing script: {str(e)}", 0.0
115+
return FullResult(success=False, error=f"Error executing script: {str(e)}", compile=None, run=None)

src/discord-cluster-manager/modal_runner_archs.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
from consts import GPU_TO_SM
66
from modal_runner import app, cuda_image, modal_run_cuda_script, modal_run_pytorch_script
7+
from run_eval import CompileResult, RunResult, FullResult
78

89

910
# T4: sm_75 (CUDA compute capability 7.5, Turing architecture)
@@ -16,7 +17,7 @@ def run_cuda_script_t4(
1617
reference_content: str = None,
1718
submission_content: str = None,
1819
timeout_seconds: int = 600,
19-
) -> tuple[str, float]:
20+
) -> FullResult:
2021
return modal_run_cuda_script(
2122
script_content,
2223
reference_content,
@@ -55,7 +56,7 @@ def run_cuda_script_l4(
5556
reference_content: str = None,
5657
submission_content: str = None,
5758
timeout_seconds: int = 600,
58-
) -> tuple[str, float]:
59+
) -> FullResult:
5960
return modal_run_cuda_script(
6061
script_content,
6162
reference_content,
@@ -74,7 +75,7 @@ def run_pytorch_script_l4(
7475
reference_content: str = None,
7576
submission_content: str = None,
7677
timeout_seconds: int = 600,
77-
) -> tuple[str, float]:
78+
) -> FullResult:
7879
return modal_run_cuda_script(
7980
script_content,
8081
reference_content,
@@ -94,7 +95,7 @@ def run_cuda_script_a100(
9495
reference_content: str = None,
9596
submission_content: str = None,
9697
timeout_seconds: int = 600,
97-
) -> tuple[str, float]:
98+
) -> FullResult:
9899
return modal_run_cuda_script(
99100
script_content,
100101
reference_content,
@@ -113,7 +114,7 @@ def run_pytorch_script_a100(
113114
reference_content: str = None,
114115
submission_content: str = None,
115116
timeout_seconds: int = 600,
116-
) -> tuple[str, float]:
117+
) -> FullResult:
117118
return modal_run_cuda_script(
118119
script_content,
119120
reference_content,
@@ -133,7 +134,7 @@ def run_cuda_script_h100(
133134
reference_content: str = None,
134135
submission_content: str = None,
135136
timeout_seconds: int = 600,
136-
) -> tuple[str, float]:
137+
) -> FullResult:
137138
return modal_run_cuda_script(
138139
script_content,
139140
reference_content,
@@ -153,7 +154,7 @@ def run_pytorch_script_h100(
153154
submission_content: str = None,
154155
timeout_seconds: int = 600,
155156
) -> tuple[str, float]:
156-
return modal_run_cuda_script(
157+
return modal_run_pytorch_script(
157158
script_content,
158159
reference_content,
159160
submission_content,

0 commit comments

Comments
 (0)