diff --git a/.gitignore b/.gitignore index 556b613..300478e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ submission.* target/ scratch.md +*claude +*.zip diff --git a/Cargo.lock b/Cargo.lock index 29e4754..4dcb102 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -23,6 +23,15 @@ version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + [[package]] name = "anstream" version = "0.6.18" @@ -187,6 +196,19 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" +[[package]] +name = "chrono" +version = "0.4.42" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "145052bdd345b87320e369255277e3fb5152762ad123a901ef5c262dd38fe8d2" +dependencies = [ + "iana-time-zone", + "js-sys", + "num-traits", + "wasm-bindgen", + "windows-link", +] + [[package]] name = "clap" version = "4.5.36" @@ -609,6 +631,30 @@ dependencies = [ "tokio-native-tls", ] +[[package]] +name = "iana-time-zone" +version = "0.1.64" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33e57f83510bb73707521ebaffa789ec8caf86f9657cad665b092b581d40e9fb" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "log", + "wasm-bindgen", + "windows-core", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + [[package]] name = "icu_collections" version = "1.5.0" @@ -977,6 +1023,15 @@ dependencies = [ "libc", ] +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + [[package]] name = "objc" version = "0.2.7" @@ -1109,8 +1164,10 @@ name = "popcorn-cli" version = "0.1.0" dependencies = [ "anyhow", + "base64 0.22.1", "base64-url", "bytes", + "chrono", "clap", "crossterm", "ctrlc", @@ -1912,6 +1969,65 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windows-core" +version = "0.62.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb" +dependencies = [ + "windows-implement", + "windows-interface", + "windows-link", + "windows-result", + "windows-strings", +] + +[[package]] +name = "windows-implement" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-interface" +version = "0.59.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "windows-result" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-strings" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091" +dependencies = [ + "windows-link", +] + [[package]] name = "windows-sys" version = "0.45.0" diff --git a/Cargo.toml b/Cargo.toml index c3303d0..d212bd7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,6 +19,8 @@ dirs = "5.0" serde_yaml = "0.9" webbrowser = "0.8" base64-url = "3.0.0" +base64 = "0.22" +chrono = "0.4" urlencoding = "2.1.3" bytes = "1.10.1" futures-util = "0.3.31" diff --git a/README.md b/README.md index f3ba659..287a8c9 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,10 @@ A command-line interface tool for submitting solutions to the [Popcorn Discord B Tested on linux and mac but should just work on Windows as well. +## New: Nsight Compute Profiling + +Profile your kernels with `--mode profile` and get detailed metrics. Currently only available for the NVFP4 Blackwell competition (Modal, which we use for other competitions, does not support NCU). See [docs/profiling.md](docs/profiling.md) for details. + ## Installation ### Option 1: Using pre-built binaries (Recommended) diff --git a/docs/profiling.md b/docs/profiling.md new file mode 100644 index 0000000..5aba6d6 --- /dev/null +++ b/docs/profiling.md @@ -0,0 +1,65 @@ +# Nsight Compute Profiling + +Profile your kernels directly from the CLI and get detailed Nsight Compute metrics. This is particularly useful for the NVIDIA NVFP4 Blackwell competition where you need to optimize tensor core utilization. + +**Note:** Profiling is currently only available for the NVFP4 Blackwell competition. Modal, which we use for other competitions, does not support NCU. + +## Quick Start + +```bash +popcorn-cli submit submission.py --leaderboard nvfp4_dual_gemm --gpu NVIDIA --mode profile --no-tui +``` + +## Expected Output + +The profiler returns three key metric tables for each benchmark: + +**GPU Throughput** - Overall utilization: +``` +Metric Name Metric Unit Metric Value +---------------- ----------- ------------ +Memory [%] % 32.48 +Compute (SM) [%] % 13.23 +``` + +**Pipe Utilization** - Which pipelines are active: +``` +Metric Name Metric Unit Metric Value +-------------------- ----------- ------------ +TC % 16.67 +TMEM (Tensor Memory) % 15.27 +Tensor (FP) % 12.58 +ALU % 2.38 +TMA % 0.29 +``` + +**Warp State** - Where your warps are stalling: +``` +Metric Name Metric Unit Metric Value +------------------------ ----------- ------------ +Stall Long Scoreboard inst 18.31 +Stall Wait inst 1.88 +Stall Short Scoreboard inst 1.23 +Selected inst 1.00 +Stall Barrier inst 0.75 +``` + +## Trace Files + +After profiling, a zip file is saved to your current directory: +``` +profile_20260113_031052_run0.zip +``` + +This contains a `.ncu-rep` file (the full Nsight Compute report): +``` +$ unzip -l profile_20260113_031052_run0.zip + Length Date Time Name +--------- ---------- ----- ---- + 2178383 01-13-2026 03:10 profile.ncu-rep +``` + +You can open this file in the Nsight Compute GUI for detailed analysis: +```bash +ncu-ui profile.ncu-rep +``` diff --git a/src/cmd/submit.rs b/src/cmd/submit.rs index 87face5..325b5f1 100644 --- a/src/cmd/submit.rs +++ b/src/cmd/submit.rs @@ -67,7 +67,7 @@ impl App { ), SubmissionModeItem::new( "Profile".to_string(), - "Profile is currently supported only via Discord. We'll add this feature to the CLI soon.".to_string(), + "Profile the solution using Nsight Compute (NVIDIA) or rocPROF (AMD). Downloads profiling data to current directory.".to_string(), "profile".to_string(), ), ]; diff --git a/src/service/mod.rs b/src/service/mod.rs index 03cc006..264b428 100644 --- a/src/service/mod.rs +++ b/src/service/mod.rs @@ -1,4 +1,6 @@ use anyhow::{anyhow, Result}; +use base64::Engine; +use chrono::Utc; use reqwest::header::{HeaderMap, HeaderValue}; use reqwest::multipart::{Form, Part}; use reqwest::Client; @@ -189,26 +191,40 @@ pub async fn submit_solution>( } "result" => { let result_val: Value = serde_json::from_str(data)?; - + if let Some(ref cb) = on_log { // Handle "results" array if let Some(results_array) = result_val.get("results").and_then(|v| v.as_array()) { - for (i, result_item) in results_array.iter().enumerate() { - let mode_key = submission_mode.to_lowercase(); - - if let Some(run_obj) = result_item.get("runs") - .and_then(|r| r.get(&mode_key)) - .and_then(|t| t.get("run")) - { - if let Some(stdout) = run_obj.get("stdout").and_then(|s| s.as_str()) { - if !stdout.is_empty() { - cb(format!("STDOUT (Run {}):\n{}", i + 1, stdout)); + let mode_key = submission_mode.to_lowercase(); + + // Special handling for profile mode + if mode_key == "profile" { + for (i, result_item) in results_array.iter().enumerate() { + if let Some(runs) = result_item.get("runs").and_then(|r| r.as_object()) { + for (key, run_data) in runs.iter() { + if key.starts_with("profile") { + handle_profile_result(cb, run_data, i); + } } } - // Also check stderr - if let Some(stderr) = run_obj.get("stderr").and_then(|s| s.as_str()) { - if !stderr.is_empty() { - cb(format!("STDERR (Run {}):\n{}", i + 1, stderr)); + } + } else { + // Existing handling for non-profile modes + for (i, result_item) in results_array.iter().enumerate() { + if let Some(run_obj) = result_item.get("runs") + .and_then(|r| r.get(&mode_key)) + .and_then(|t| t.get("run")) + { + if let Some(stdout) = run_obj.get("stdout").and_then(|s| s.as_str()) { + if !stdout.is_empty() { + cb(format!("STDOUT (Run {}):\n{}", i + 1, stdout)); + } + } + // Also check stderr + if let Some(stderr) = run_obj.get("stderr").and_then(|s| s.as_str()) { + if !stderr.is_empty() { + cb(format!("STDERR (Run {}):\n{}", i + 1, stderr)); + } } } } @@ -273,3 +289,91 @@ pub async fn submit_solution>( Ok(pretty_result) } } + +/// Handle profile mode results by decoding and displaying profile data, +/// and saving trace files to the current directory. +fn handle_profile_result( + cb: &Box, + run_data: &Value, + run_idx: usize, +) { + // 1. Get profiler type and display it + if let Some(profile) = run_data.get("profile") { + let profiler = profile + .get("profiler") + .and_then(|p| p.as_str()) + .unwrap_or("Unknown"); + cb(format!("\n=== Profiler: {} ===", profiler)); + + // 2. Decode and display profile report from run.result + if let Some(run) = run_data.get("run") { + // Display stdout/stderr if present + if let Some(stdout) = run.get("stdout").and_then(|s| s.as_str()) { + if !stdout.is_empty() { + cb(format!("STDOUT:\n{}", stdout)); + } + } + if let Some(stderr) = run.get("stderr").and_then(|s| s.as_str()) { + if !stderr.is_empty() { + cb(format!("STDERR:\n{}", stderr)); + } + } + + // Extract and decode profile report from result + if let Some(result) = run.get("result").and_then(|r| r.as_object()) { + let bench_count = result + .get("benchmark-count") + .and_then(|c| c.as_i64()) + .unwrap_or(0); + + for i in 0..bench_count { + // Get benchmark spec + let spec_key = format!("benchmark.{}.spec", i); + let spec = result + .get(&spec_key) + .and_then(|s| s.as_str()) + .unwrap_or("unknown"); + cb(format!("\nBenchmark: {}", spec)); + + // Decode and display the profile report + let report_key = format!("benchmark.{}.report", i); + if let Some(encoded_report) = result.get(&report_key).and_then(|r| r.as_str()) { + match base64::engine::general_purpose::STANDARD.decode(encoded_report) { + Ok(decoded) => { + if let Ok(report_text) = String::from_utf8(decoded) { + cb(format!("\n{}", report_text)); + } + } + Err(e) => cb(format!("Failed to decode profile report: {}", e)), + } + } + } + } + } + + // 3. Save trace file with unique timestamp + if let Some(trace_b64) = profile.get("trace").and_then(|t| t.as_str()) { + if !trace_b64.is_empty() { + match base64::engine::general_purpose::STANDARD.decode(trace_b64) { + Ok(trace_data) => { + // Generate unique filename with timestamp and run index + let timestamp = Utc::now().format("%Y%m%d_%H%M%S"); + let filename = format!("profile_{}_run{}.zip", timestamp, run_idx); + match std::fs::write(&filename, &trace_data) { + Ok(_) => cb(format!("\nSaved profile trace to: {}", filename)), + Err(e) => cb(format!("Failed to save trace file: {}", e)), + } + } + Err(e) => cb(format!("Failed to decode trace data: {}", e)), + } + } + } + + // 4. Show download URL if available + if let Some(url) = profile.get("download_url").and_then(|u| u.as_str()) { + if !url.is_empty() { + cb(format!("Download full profile: {}", url)); + } + } + } +}