From 45ada2f1353b02006529e4ad8eb36f2596e35476 Mon Sep 17 00:00:00 2001 From: Orion Papadakis Date: Tue, 5 May 2026 13:40:01 +0300 Subject: [PATCH 1/5] Introduce `RunMetrics` singleton to track and report fine-grained performance metrics across inference stages --- llama-tornado | 2 - .../java/org/beehive/gpullama3/LlamaApp.java | 6 +- .../gpullama3/auxiliary/RunMetrics.java | 118 ++++++++++++++++++ .../gpullama3/inference/InferenceEngine.java | 63 +++++----- ...InferenceEngineWithBatchPrefillDecode.java | 14 ++- .../InferenceEngineWithPrefillDecode.java | 14 ++- .../org/beehive/gpullama3/model/Model.java | 4 +- .../TornadoVMMasterPlanStandard.java | 25 ++-- ...adoVMMasterPlanWithBatchPrefillDecode.java | 25 ++-- .../TornadoVMMasterPlanWithPrefillDecode.java | 29 ++--- 10 files changed, 203 insertions(+), 97 deletions(-) create mode 100644 src/main/java/org/beehive/gpullama3/auxiliary/RunMetrics.java diff --git a/llama-tornado b/llama-tornado index 11ada591..1d6c3d23 100755 --- a/llama-tornado +++ b/llama-tornado @@ -96,8 +96,6 @@ class LlamaRunner: if args.cuda_graphs: cmd.append("-Dllama.cudaGraphs=true") - elif args.no_cuda_graphs: - cmd.append("-Dllama.cudaGraphs=false") # Debug options debug_config = [] diff --git a/src/main/java/org/beehive/gpullama3/LlamaApp.java b/src/main/java/org/beehive/gpullama3/LlamaApp.java index 822a082c..463bd141 100644 --- a/src/main/java/org/beehive/gpullama3/LlamaApp.java +++ b/src/main/java/org/beehive/gpullama3/LlamaApp.java @@ -1,6 +1,6 @@ package org.beehive.gpullama3; -import org.beehive.gpullama3.auxiliary.LastRunMetrics; +import org.beehive.gpullama3.auxiliary.RunMetrics; import org.beehive.gpullama3.inference.sampler.Sampler; import org.beehive.gpullama3.model.Model; @@ -18,7 +18,7 @@ private static void runSingleInstruction(Model model, Sampler sampler, Options o String response = model.runInstructOnce(sampler, options); System.out.println(response); if (SHOW_PERF_INTERACTIVE) { - LastRunMetrics.printMetrics(); + 
RunMetrics.printMetrics(); } } @@ -35,7 +35,9 @@ private static void runSingleInstruction(Model model, Sampler sampler, Options o */ static void main(String[] args) throws IOException { Options options = Options.parseOptions(args); + long loadStart = System.nanoTime(); Model model = loadModel(options); + RunMetrics.setLoadDuration(System.nanoTime() - loadStart); Sampler sampler = createSampler(model, options); if (options.interactive()) { diff --git a/src/main/java/org/beehive/gpullama3/auxiliary/RunMetrics.java b/src/main/java/org/beehive/gpullama3/auxiliary/RunMetrics.java new file mode 100644 index 00000000..d807e703 --- /dev/null +++ b/src/main/java/org/beehive/gpullama3/auxiliary/RunMetrics.java @@ -0,0 +1,118 @@ +package org.beehive.gpullama3.auxiliary; + +/** + * Singleton that accumulates fine-grained performance metrics across one inference run. + * + *

+ * <p>Metrics are set incrementally by different layers of the stack:</p>
+ * <ul>
+ *   <li>model loading &mdash; {@link #setLoadDuration(long)}</li>
+ *   <li>TornadoVM initialisation &mdash; {@link #setTornadoMetrics(long, long)}</li>
+ *   <li>inference (prefill/decode) &mdash; {@link #setInferenceMetrics(int, long, int, long, long)}</li>
+ * </ul>
+ *
+ * <p>All durations are stored in nanoseconds. {@link #printMetrics()} prints throughput only:</p>
+ * <ul>
+ *   <li>total, prefill and decode tokens/second when a distinct prefill phase was recorded</li>
+ *   <li>a single overall tokens/second figure otherwise</li>
+ * </ul>
+ * + */ +public final class RunMetrics { + + // ── Core metrics (nanoseconds) ──────────────────────────────────────────── + private long totalDurationNs; + private long loadDurationNs; + private int promptEvalCount; + private long promptEvalDurationNs; + private int evalCount; + private long evalDurationNs; + private boolean hasPrefillPhase; + + // ── TornadoVM-specific metrics (nanoseconds) ────────────────────────────── + private long tornadoCompileDurationNs; + private long tornadoWarmupDurationNs; + + // ── Singleton ───────────────────────────────────────────────────────────── + private static final RunMetrics INSTANCE = new RunMetrics(); + + private RunMetrics() {} + + // ── Setters ─────────────────────────────────────────────────────────────── + + /** Records the time spent loading the model file (not including TornadoVM initialisation). */ + public static void setLoadDuration(long ns) { + INSTANCE.loadDurationNs = ns; + } + + /** + * Records TornadoVM-specific initialisation durations. + * + * @param compileNs plan-graph construction + JIT compilation ({@code withPreCompilation()}) + * @param warmupNs first-execution weight upload ({@code forceCopyInReadOnlyData()}) + */ + public static void setTornadoMetrics(long compileNs, long warmupNs) { + INSTANCE.tornadoCompileDurationNs = compileNs; + INSTANCE.tornadoWarmupDurationNs = warmupNs; + } + + /** + * Records inference-phase durations at the end of a generation run. 
+     *
+     * @param promptCount number of prompt tokens processed (prefill)
+     * @param prefillNs wall-clock time spent in the prefill phase
+     * @param generatedCount number of tokens generated (decode)
+     * @param decodeNs wall-clock time spent in the decode phase
+     * @param totalNs total wall-clock time for the full inference call
+     */
+    public static void setInferenceMetrics(int promptCount, long prefillNs,
+                                           int generatedCount, long decodeNs,
+                                           long totalNs) {
+        INSTANCE.promptEvalCount = promptCount;
+        INSTANCE.promptEvalDurationNs = prefillNs;
+        INSTANCE.evalCount = generatedCount;
+        INSTANCE.evalDurationNs = decodeNs;
+        INSTANCE.totalDurationNs = totalNs;
+    }
+
+    /**
+     * Signals that prefill and decode are distinct timed phases.
+     * Called by {@code InferenceEngineWithPrefillDecode} and
+     * {@code InferenceEngineWithBatchPrefillDecode} before returning.
+     */
+    public static void setHasPrefillPhase(boolean value) {
+        INSTANCE.hasPrefillPhase = value;
+    }
+
+    // ── Output ──────────────────────────────────────────────────────────────
+
+    /** Prints throughput metrics to {@code stderr}. */
+    public static void printMetrics() {
+        RunMetrics m = INSTANCE;
+
+        int totalTokens = m.promptEvalCount + m.evalCount;
+        double totalSecs = m.totalDurationNs / 1e9;
+        double totalRate = totalSecs > 0 ? totalTokens / totalSecs : 0;
+
+        System.err.println();
+        System.err.println("==== Performance Metrics ====");
+        if (m.hasPrefillPhase) {
+            double prefillSecs = m.promptEvalDurationNs / 1e9;
+            double decodeSecs = m.evalDurationNs / 1e9;
+            double prefillRate = (prefillSecs > 0 && m.promptEvalCount > 0)
+                    ? m.promptEvalCount / prefillSecs : 0;
+            double decodeRate = (decodeSecs > 0 && m.evalCount > 0)
+                    ? m.evalCount / decodeSecs : 0;
+            System.err.printf(
+                    "Total achieved tok/s: %.2f. Tokens: %d, seconds: %.2f%n"
+                    + "Prefill achieved tok/s: %.2f. Tokens: %d, seconds: %.2f%n"
+                    + "Decode achieved tok/s: %.2f. 
Tokens: %d, seconds: %.2f%n", + totalRate, totalTokens, totalSecs, + prefillRate, m.promptEvalCount, prefillSecs, + decodeRate, m.evalCount, decodeSecs); + } else { + System.err.printf("achieved tok/s: %.2f. Tokens: %d, seconds: %.2f%n", + totalRate, totalTokens, totalSecs); + } + System.err.println(); + } +} diff --git a/src/main/java/org/beehive/gpullama3/inference/InferenceEngine.java b/src/main/java/org/beehive/gpullama3/inference/InferenceEngine.java index a9c65223..d653ba58 100644 --- a/src/main/java/org/beehive/gpullama3/inference/InferenceEngine.java +++ b/src/main/java/org/beehive/gpullama3/inference/InferenceEngine.java @@ -1,6 +1,6 @@ package org.beehive.gpullama3.inference; -import org.beehive.gpullama3.auxiliary.LastRunMetrics; +import org.beehive.gpullama3.auxiliary.RunMetrics; import org.beehive.gpullama3.inference.sampler.Sampler; import org.beehive.gpullama3.inference.state.State; import org.beehive.gpullama3.model.Configuration; @@ -132,10 +132,9 @@ public static List generateTokensLlama(Model model, State state, int st // Calculate and print performance metrics long endNanos = System.nanoTime(); - double totalTimeSeconds = (endNanos - startNanos) / 1_000_000_000.0; - int totalTokens = promptIndex + generatedTokens.size(); - - LastRunMetrics.setMetrics(totalTokens, totalTimeSeconds); + long decodeStart = inferenceStartNanos > 0 ? inferenceStartNanos : endNanos; + RunMetrics.setInferenceMetrics(promptIndex, decodeStart - startNanos, + generatedTokens.size(), endNanos - decodeStart, endNanos - startNanos); return generatedTokens; } @@ -213,10 +212,9 @@ public static List generateTokensQwen3(Model model, State state, int st // Calculate and print performance metrics long endNanos = System.nanoTime(); - double totalTimeSeconds = (endNanos - startNanos) / 1_000_000_000.0; - int totalTokens = promptIndex + generatedTokens.size(); - - LastRunMetrics.setMetrics(totalTokens, totalTimeSeconds); + long decodeStart = inferenceStartNanos > 0 ? 
inferenceStartNanos : endNanos; + RunMetrics.setInferenceMetrics(promptIndex, decodeStart - startNanos, + generatedTokens.size(), endNanos - decodeStart, endNanos - startNanos); return generatedTokens; } @@ -225,6 +223,7 @@ public static List generateTokensPhi3(Model model, State state, int sta IntConsumer onTokenGenerated) { long startNanos = System.nanoTime(); + long inferenceStartNanos = 0; if (maxTokens < 0 || model.configuration().contextLength() < maxTokens) { maxTokens = model.configuration().contextLength(); } @@ -245,6 +244,9 @@ public static List generateTokensPhi3(Model model, State state, int sta System.err.print(Tokenizer.replaceControlCharacters(model.tokenizer().decode(List.of(nextToken)))); } } else { + if (inferenceStartNanos == 0) { + inferenceStartNanos = System.nanoTime(); + } nextToken = sampler.sampleToken(state.logits); if (echo) { // log inferred token @@ -266,10 +268,9 @@ public static List generateTokensPhi3(Model model, State state, int sta // Calculate and print performance metrics long endNanos = System.nanoTime(); - double totalTimeSeconds = (endNanos - startNanos) / 1_000_000_000.0; - int totalTokens = promptIndex + generatedTokens.size(); - - LastRunMetrics.setMetrics(totalTokens, totalTimeSeconds); + long decodeStart = inferenceStartNanos > 0 ? inferenceStartNanos : endNanos; + RunMetrics.setInferenceMetrics(promptIndex, decodeStart - startNanos, + generatedTokens.size(), endNanos - decodeStart, endNanos - startNanos); return generatedTokens; @@ -356,11 +357,9 @@ public static List generateTokensGPULlama(Model model, State state, int // === Performance Metrics === long endNanos = System.nanoTime(); - double totalSeconds = (endNanos - startNanos) / 1_000_000_000.0; - int totalTokens = promptIndex + generatedTokens.size(); - - // Set metrics for tokens achieved - LastRunMetrics.setMetrics(totalTokens, totalSeconds); + long decodeStart = inferenceStartNanos > 0 ? 
inferenceStartNanos : endNanos; + RunMetrics.setInferenceMetrics(promptIndex, decodeStart - startNanos, + generatedTokens.size(), endNanos - decodeStart, endNanos - startNanos); return generatedTokens; } @@ -449,10 +448,9 @@ public static List generateTokensGPUQwen3(Model model, State state, int // Calculate and print performance metrics long endNanos = System.nanoTime(); - double totalTimeSeconds = (endNanos - startNanos) / 1_000_000_000.0; - int totalTokens = promptIndex + generatedTokens.size(); - - LastRunMetrics.setMetrics(totalTokens, totalTimeSeconds); + long decodeStart = inferenceStartNanos > 0 ? inferenceStartNanos : endNanos; + RunMetrics.setInferenceMetrics(promptIndex, decodeStart - startNanos, + generatedTokens.size(), endNanos - decodeStart, endNanos - startNanos); return generatedTokens; } @@ -524,10 +522,9 @@ public static List generateTokensGPUPhi3(Model model, State state, int // Calculate and print performance metrics long endNanos = System.nanoTime(); - double totalTimeSeconds = (endNanos - startNanos) / 1_000_000_000.0; - int totalTokens = promptIndex + generatedTokens.size(); - - LastRunMetrics.setMetrics(totalTokens, totalTimeSeconds); + long decodeStart = inferenceStartNanos > 0 ? inferenceStartNanos : endNanos; + RunMetrics.setInferenceMetrics(promptIndex, decodeStart - startNanos, + generatedTokens.size(), endNanos - decodeStart, endNanos - startNanos); return generatedTokens; } @@ -591,10 +588,9 @@ public static List generateTokensGranite(Model model, State state, int } long endNanos = System.nanoTime(); - double totalTimeSeconds = (endNanos - startNanos) / 1_000_000_000.0; - int totalTokens = promptIndex + generatedTokens.size(); - - LastRunMetrics.setMetrics(totalTokens, totalTimeSeconds); + long decodeStart = inferenceStartNanos > 0 ? 
inferenceStartNanos : endNanos; + RunMetrics.setInferenceMetrics(promptIndex, decodeStart - startNanos, + generatedTokens.size(), endNanos - decodeStart, endNanos - startNanos); return generatedTokens; } @@ -658,10 +654,9 @@ public static List generateTokensGPUGranite(Model model, State state, i } long endNanos = System.nanoTime(); - double totalTimeSeconds = (endNanos - startNanos) / 1_000_000_000.0; - int totalTokens = promptIndex + generatedTokens.size(); - - LastRunMetrics.setMetrics(totalTokens, totalTimeSeconds); + long decodeStart = inferenceStartNanos > 0 ? inferenceStartNanos : endNanos; + RunMetrics.setInferenceMetrics(promptIndex, decodeStart - startNanos, + generatedTokens.size(), endNanos - decodeStart, endNanos - startNanos); return generatedTokens; } diff --git a/src/main/java/org/beehive/gpullama3/inference/InferenceEngineWithBatchPrefillDecode.java b/src/main/java/org/beehive/gpullama3/inference/InferenceEngineWithBatchPrefillDecode.java index 1440a984..06dbe256 100644 --- a/src/main/java/org/beehive/gpullama3/inference/InferenceEngineWithBatchPrefillDecode.java +++ b/src/main/java/org/beehive/gpullama3/inference/InferenceEngineWithBatchPrefillDecode.java @@ -1,6 +1,6 @@ package org.beehive.gpullama3.inference; -import org.beehive.gpullama3.auxiliary.LastRunMetrics; +import org.beehive.gpullama3.auxiliary.RunMetrics; import org.beehive.gpullama3.inference.sampler.Sampler; import org.beehive.gpullama3.inference.state.State; import org.beehive.gpullama3.model.Configuration; @@ -102,6 +102,7 @@ public static List generateTokensLlama( } state.latestToken = currentToken; + long decodeStartNanos = System.nanoTime(); // ── Decode ──────────────────────────────────────────────────────────── while (pos < actualMaxTokens) { @@ -129,8 +130,9 @@ public static List generateTokensLlama( } long endNanos = System.nanoTime(); - int totalTokens = promptTokens.size() + generatedTokens.size(); - LastRunMetrics.setMetrics(totalTokens, (endNanos - startNanos) / 
1_000_000_000.0); + RunMetrics.setInferenceMetrics(promptTokens.size(), decodeStartNanos - startNanos, + generatedTokens.size(), endNanos - decodeStartNanos, endNanos - startNanos); + RunMetrics.setHasPrefillPhase(true); return generatedTokens; } @@ -197,6 +199,7 @@ public static List generateTokensGPULlama( currentToken = promptTokens.get(N - 1); pos = startPosition + N; state.latestToken = currentToken; + long decodeStartNanos = System.nanoTime(); // ── Decode ──────────────────────────────────────────────────────────── while (pos < actualMaxTokens) { @@ -224,8 +227,9 @@ public static List generateTokensGPULlama( } long endNanos = System.nanoTime(); - int totalTokens = promptTokens.size() + generatedTokens.size(); - LastRunMetrics.setMetrics(totalTokens, (endNanos - startNanos) / 1_000_000_000.0); + RunMetrics.setInferenceMetrics(promptTokens.size(), decodeStartNanos - startNanos, + generatedTokens.size(), endNanos - decodeStartNanos, endNanos - startNanos); + RunMetrics.setHasPrefillPhase(true); return generatedTokens; } diff --git a/src/main/java/org/beehive/gpullama3/inference/InferenceEngineWithPrefillDecode.java b/src/main/java/org/beehive/gpullama3/inference/InferenceEngineWithPrefillDecode.java index 64815d52..ae9f6e9c 100644 --- a/src/main/java/org/beehive/gpullama3/inference/InferenceEngineWithPrefillDecode.java +++ b/src/main/java/org/beehive/gpullama3/inference/InferenceEngineWithPrefillDecode.java @@ -1,6 +1,6 @@ package org.beehive.gpullama3.inference; -import org.beehive.gpullama3.auxiliary.LastRunMetrics; +import org.beehive.gpullama3.auxiliary.RunMetrics; import org.beehive.gpullama3.inference.sampler.Sampler; import org.beehive.gpullama3.inference.state.State; import org.beehive.gpullama3.model.Configuration; @@ -75,6 +75,7 @@ public static List generateTokensLlama( } state.latestToken = currentToken; + long decodeStartNanos = System.nanoTime(); // ── Decode ──────────────────────────────────────────────────────────── while (pos < 
actualMaxTokens) { @@ -102,8 +103,9 @@ public static List generateTokensLlama( } long endNanos = System.nanoTime(); - int totalTokens = promptTokens.size() + generatedTokens.size(); - LastRunMetrics.setMetrics(totalTokens, (endNanos - startNanos) / 1_000_000_000.0); + RunMetrics.setInferenceMetrics(promptTokens.size(), decodeStartNanos - startNanos, + generatedTokens.size(), endNanos - decodeStartNanos, endNanos - startNanos); + RunMetrics.setHasPrefillPhase(true); return generatedTokens; } @@ -153,6 +155,7 @@ public static List generateTokensGPULlama( } state.latestToken = currentToken; + long decodeStartNanos = System.nanoTime(); // ── Decode (GPU, with logits) ───────────────────────────────────────── while (pos < actualMaxTokens) { @@ -180,8 +183,9 @@ public static List generateTokensGPULlama( } long endNanos = System.nanoTime(); - int totalTokens = promptTokens.size() + generatedTokens.size(); - LastRunMetrics.setMetrics(totalTokens, (endNanos - startNanos) / 1_000_000_000.0); + RunMetrics.setInferenceMetrics(promptTokens.size(), decodeStartNanos - startNanos, + generatedTokens.size(), endNanos - decodeStartNanos, endNanos - startNanos); + RunMetrics.setHasPrefillPhase(true); return generatedTokens; } diff --git a/src/main/java/org/beehive/gpullama3/model/Model.java b/src/main/java/org/beehive/gpullama3/model/Model.java index 6defefd0..60e00cf0 100644 --- a/src/main/java/org/beehive/gpullama3/model/Model.java +++ b/src/main/java/org/beehive/gpullama3/model/Model.java @@ -1,7 +1,7 @@ package org.beehive.gpullama3.model; import org.beehive.gpullama3.Options; -import org.beehive.gpullama3.auxiliary.LastRunMetrics; +import org.beehive.gpullama3.auxiliary.RunMetrics; import org.beehive.gpullama3.inference.sampler.Sampler; import org.beehive.gpullama3.inference.state.State; import org.beehive.gpullama3.inference.weights.Weights; @@ -165,7 +165,7 @@ default void runInteractive(Sampler sampler, Options options) { // Optionally print performance metrics after each 
response if (SHOW_PERF_INTERACTIVE) { - LastRunMetrics.printMetrics(); + RunMetrics.printMetrics(); } } } finally { diff --git a/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanStandard.java b/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanStandard.java index da9c34c6..f50b2146 100644 --- a/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanStandard.java +++ b/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanStandard.java @@ -1,5 +1,6 @@ package org.beehive.gpullama3.tornadovm; +import org.beehive.gpullama3.auxiliary.RunMetrics; import org.beehive.gpullama3.inference.state.State; import org.beehive.gpullama3.model.Configuration; import org.beehive.gpullama3.model.Model; @@ -27,10 +28,6 @@ public class TornadoVMMasterPlanStandard implements TornadoVMMasterPlan { public TornadoExecutionPlan executionPlan; public TornadoVMMasterPlanStandard(State state, Model model) { - long startTime = System.nanoTime(); - long planCreationTime = 0; - long warmupTime = 0; - if (ENABLE_TORNADOVM_INIT_TIME) { System.err.println("\nStarting TornadoVM initialization..."); } @@ -38,25 +35,23 @@ public TornadoVMMasterPlanStandard(State state, Model model) { this.state = state; this.model = model; this.config = model.configuration(); - this.executionPlan = createExecutionPlan(); - if (ENABLE_TORNADOVM_INIT_TIME) { - planCreationTime = System.nanoTime(); - System.err.printf("TornadoVM GPU standard execution plan creation: %.2f ms\n", (planCreationTime - startTime) / 1_000_000.0); - } + long startTime = System.nanoTime(); + this.executionPlan = createExecutionPlan(); + long planCreationTime = System.nanoTime(); if (CUDA_GRAPHS) executionPlan.withAllGraphs().withCUDAGraph(); executionPlan.withPreCompilation(); - - if (ENABLE_TORNADOVM_INIT_TIME) { - warmupTime = System.nanoTime(); - System.err.printf("Java to GPU JIT compiler warmup: %.2f ms\n", (warmupTime - planCreationTime) / 1_000_000.0); - } + long warmupTime = 
System.nanoTime(); forceCopyInReadOnlyData(); + long copyTime = System.nanoTime(); + + RunMetrics.setTornadoMetrics(warmupTime - startTime, copyTime - warmupTime); if (ENABLE_TORNADOVM_INIT_TIME) { - long copyTime = System.nanoTime(); + System.err.printf("TornadoVM GPU standard execution plan creation: %.2f ms\n", (planCreationTime - startTime) / 1_000_000.0); + System.err.printf("Java to GPU JIT compiler warmup: %.2f ms\n", (warmupTime - planCreationTime) / 1_000_000.0); System.err.printf("Transfer read-only weights to GPU: %.2f ms\n", (copyTime - warmupTime) / 1_000_000.0); System.err.printf("Finished TornadoVM initialization...\n \n"); } diff --git a/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanWithBatchPrefillDecode.java b/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanWithBatchPrefillDecode.java index aa69afad..46835797 100644 --- a/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanWithBatchPrefillDecode.java +++ b/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanWithBatchPrefillDecode.java @@ -1,5 +1,6 @@ package org.beehive.gpullama3.tornadovm; +import org.beehive.gpullama3.auxiliary.RunMetrics; import org.beehive.gpullama3.inference.state.LlamaState; import org.beehive.gpullama3.inference.state.State; import org.beehive.gpullama3.tensor.GGMLType; @@ -74,10 +75,6 @@ public class TornadoVMMasterPlanWithBatchPrefillDecode implements TornadoVMMaste // ── Construction ───────────────────────────────────────────────────────── TornadoVMMasterPlanWithBatchPrefillDecode(State initialState, Model model) { - long startTime = System.nanoTime(); - long planCreationTime = 0; - long warmupTime = 0; - if (ENABLE_TORNADOVM_INIT_TIME) { System.err.println("\nStarting TornadoVM initialization..."); } @@ -88,25 +85,23 @@ public class TornadoVMMasterPlanWithBatchPrefillDecode implements TornadoVMMaste this.batchSize = PREFILL_BATCH_SIZE; this.N = config.numberOfLayers(); this.gridScheduler = new 
GridScheduler(); - this.executionPlan = createExecutionPlan(); - if (ENABLE_TORNADOVM_INIT_TIME) { - planCreationTime = System.nanoTime(); - System.err.printf("TornadoVM GPU batched prefill/decode execution plan creation: %.2f ms\n", (planCreationTime - startTime) / 1_000_000.0); - } + long startTime = System.nanoTime(); + this.executionPlan = createExecutionPlan(); + long planCreationTime = System.nanoTime(); if (CUDA_GRAPHS) executionPlan.withAllGraphs().withCUDAGraph(); executionPlan.withPreCompilation(); - - if (ENABLE_TORNADOVM_INIT_TIME) { - warmupTime = System.nanoTime(); - System.err.printf("Java to GPU JIT compiler warmup: %.2f ms\n", (warmupTime - planCreationTime) / 1_000_000.0); - } + long warmupTime = System.nanoTime(); forceCopyInReadOnlyData(); + long copyTime = System.nanoTime(); + + RunMetrics.setTornadoMetrics(warmupTime - startTime, copyTime - warmupTime); if (ENABLE_TORNADOVM_INIT_TIME) { - long copyTime = System.nanoTime(); + System.err.printf("TornadoVM GPU batched prefill/decode execution plan creation: %.2f ms\n", (planCreationTime - startTime) / 1_000_000.0); + System.err.printf("Java to GPU JIT compiler warmup: %.2f ms\n", (warmupTime - planCreationTime) / 1_000_000.0); System.err.printf("Transfer read-only weights to GPU: %.2f ms\n", (copyTime - warmupTime) / 1_000_000.0); System.err.printf("Finished TornadoVM initialization...\n \n"); } diff --git a/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanWithPrefillDecode.java b/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanWithPrefillDecode.java index 7a668546..97e51f23 100644 --- a/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanWithPrefillDecode.java +++ b/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanWithPrefillDecode.java @@ -1,5 +1,6 @@ package org.beehive.gpullama3.tornadovm; +import org.beehive.gpullama3.auxiliary.RunMetrics; import org.beehive.gpullama3.inference.state.LlamaState; import 
org.beehive.gpullama3.inference.state.State; import org.beehive.gpullama3.tensor.GGMLType; @@ -63,10 +64,6 @@ public class TornadoVMMasterPlanWithPrefillDecode implements TornadoVMMasterPlan // ── Construction ───────────────────────────────────────────────────────── TornadoVMMasterPlanWithPrefillDecode(State initialState, Model model) { - long startTime = System.nanoTime(); - long planCreationTime = 0; - long warmupTime = 0; - if (ENABLE_TORNADOVM_INIT_TIME) { System.err.println("\nStarting TornadoVM initialization..."); } @@ -76,27 +73,25 @@ public class TornadoVMMasterPlanWithPrefillDecode implements TornadoVMMasterPlan this.config = (LlamaConfiguration) model.configuration(); this.N = config.numberOfLayers(); this.gridScheduler = new GridScheduler(); - this.executionPlan = createExecutionPlan(); - if (ENABLE_TORNADOVM_INIT_TIME) { - planCreationTime = System.nanoTime(); - System.err.printf("TornadoVM GPU single-token prefill/decode execution plan creation: %.2f ms\n", - (planCreationTime - startTime) / 1_000_000.0); - } + long startTime = System.nanoTime(); + this.executionPlan = createExecutionPlan(); + long planCreationTime = System.nanoTime(); if (CUDA_GRAPHS) executionPlan.withAllGraphs().withCUDAGraph(); executionPlan.withPreCompilation(); - - if (ENABLE_TORNADOVM_INIT_TIME) { - warmupTime = System.nanoTime(); - System.err.printf("Java to GPU JIT compiler warmup: %.2f ms\n", - (warmupTime - planCreationTime) / 1_000_000.0); - } + long warmupTime = System.nanoTime(); forceCopyInReadOnlyData(); + long copyTime = System.nanoTime(); + + RunMetrics.setTornadoMetrics(warmupTime - startTime, copyTime - warmupTime); if (ENABLE_TORNADOVM_INIT_TIME) { - long copyTime = System.nanoTime(); + System.err.printf("TornadoVM GPU single-token prefill/decode execution plan creation: %.2f ms\n", + (planCreationTime - startTime) / 1_000_000.0); + System.err.printf("Java to GPU JIT compiler warmup: %.2f ms\n", + (warmupTime - planCreationTime) / 1_000_000.0); 
System.err.printf("Transfer read-only weights to GPU: %.2f ms\n", (copyTime - warmupTime) / 1_000_000.0); System.err.printf("Finished TornadoVM initialization...\n \n"); From 83998cc6d60fa6dec6e79903bb60656feaefcbf7 Mon Sep 17 00:00:00 2001 From: Orion Papadakis Date: Tue, 5 May 2026 15:14:18 +0300 Subject: [PATCH 2/5] Refactor TornadoVM initialization metrics tracking to include finer granularity across plan creation, JIT, and weight transfer stages --- .../gpullama3/auxiliary/RunMetrics.java | 36 +++++++++++++------ .../TornadoVMMasterPlanStandard.java | 9 +---- ...adoVMMasterPlanWithBatchPrefillDecode.java | 9 +---- .../TornadoVMMasterPlanWithPrefillDecode.java | 12 +------ 4 files changed, 28 insertions(+), 38 deletions(-) diff --git a/src/main/java/org/beehive/gpullama3/auxiliary/RunMetrics.java b/src/main/java/org/beehive/gpullama3/auxiliary/RunMetrics.java index d807e703..9211e0f1 100644 --- a/src/main/java/org/beehive/gpullama3/auxiliary/RunMetrics.java +++ b/src/main/java/org/beehive/gpullama3/auxiliary/RunMetrics.java @@ -29,8 +29,9 @@ public final class RunMetrics { private boolean hasPrefillPhase; // ── TornadoVM-specific metrics (nanoseconds) ────────────────────────────── - private long tornadoCompileDurationNs; - private long tornadoWarmupDurationNs; + private long tornadoPlanCreationNs; + private long tornadoJitNs; + private long readOnlyWeightsCopyInNs; // ── Singleton ───────────────────────────────────────────────────────────── private static final RunMetrics INSTANCE = new RunMetrics(); @@ -47,12 +48,14 @@ public static void setLoadDuration(long ns) { /** * Records TornadoVM-specific initialisation durations. 
* - * @param compileNs plan-graph construction + JIT compilation ({@code withPreCompilation()}) - * @param warmupNs first-execution weight upload ({@code forceCopyInReadOnlyData()}) + * @param planCreationNs task-graph construction ({@code createExecutionPlan()}) + * @param jitNs JIT compilation ({@code withPreCompilation()}) + * @param weightCopyNs first-execution weight upload ({@code forceCopyInReadOnlyData()}) */ - public static void setTornadoMetrics(long compileNs, long warmupNs) { - INSTANCE.tornadoCompileDurationNs = compileNs; - INSTANCE.tornadoWarmupDurationNs = warmupNs; + public static void setTornadoMetrics(long planCreationNs, long jitNs, long weightCopyNs) { + INSTANCE.tornadoPlanCreationNs = planCreationNs; + INSTANCE.tornadoJitNs = jitNs; + INSTANCE.readOnlyWeightsCopyInNs = weightCopyNs; } /** @@ -85,13 +88,13 @@ public static void setHasPrefillPhase(boolean value) { // ── Output ──────────────────────────────────────────────────────────────── - /** Prints throughput metrics to {@code stderr}. */ + /** Prints throughput metrics to {@code stderr}, and TornadoVM init metrics when enabled. */ public static void printMetrics() { RunMetrics m = INSTANCE; - int totalTokens = m.promptEvalCount + m.evalCount; - double totalSecs = m.totalDurationNs / 1e9; - double totalRate = totalSecs > 0 ? totalTokens / totalSecs : 0; + int totalTokens = m.promptEvalCount + m.evalCount; + double totalSecs = m.totalDurationNs / 1e9; + double totalRate = totalSecs > 0 ? totalTokens / totalSecs : 0; System.err.println(); System.err.println("==== Performance Metrics ===="); @@ -113,6 +116,17 @@ public static void printMetrics() { System.err.printf("achieved tok/s: %.2f. 
Tokens: %d, seconds: %.2f%n", totalRate, totalTokens, totalSecs); } + + if (Boolean.parseBoolean(System.getProperty("llama.EnableTimingForTornadoVMInit", "false")) + && m.tornadoPlanCreationNs > 0) { + System.err.printf( + "Compilation & CodeGen: %.2f ms%n" + + "Warmup: %.2f ms%n" + + "Read-only weights Copy-in: %.2f ms%n", + m.tornadoPlanCreationNs / 1_000_000.0, + m.tornadoJitNs / 1_000_000.0, + m.readOnlyWeightsCopyInNs / 1_000_000.0); + } System.err.println(); } } diff --git a/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanStandard.java b/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanStandard.java index f50b2146..14aaed10 100644 --- a/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanStandard.java +++ b/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanStandard.java @@ -47,14 +47,7 @@ public TornadoVMMasterPlanStandard(State state, Model model) { forceCopyInReadOnlyData(); long copyTime = System.nanoTime(); - RunMetrics.setTornadoMetrics(warmupTime - startTime, copyTime - warmupTime); - - if (ENABLE_TORNADOVM_INIT_TIME) { - System.err.printf("TornadoVM GPU standard execution plan creation: %.2f ms\n", (planCreationTime - startTime) / 1_000_000.0); - System.err.printf("Java to GPU JIT compiler warmup: %.2f ms\n", (warmupTime - planCreationTime) / 1_000_000.0); - System.err.printf("Transfer read-only weights to GPU: %.2f ms\n", (copyTime - warmupTime) / 1_000_000.0); - System.err.printf("Finished TornadoVM initialization...\n \n"); - } + RunMetrics.setTornadoMetrics(planCreationTime - startTime, warmupTime - planCreationTime, copyTime - warmupTime); } /** diff --git a/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanWithBatchPrefillDecode.java b/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanWithBatchPrefillDecode.java index 46835797..2984508e 100644 --- a/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanWithBatchPrefillDecode.java +++ 
b/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanWithBatchPrefillDecode.java @@ -97,14 +97,7 @@ public class TornadoVMMasterPlanWithBatchPrefillDecode implements TornadoVMMaste forceCopyInReadOnlyData(); long copyTime = System.nanoTime(); - RunMetrics.setTornadoMetrics(warmupTime - startTime, copyTime - warmupTime); - - if (ENABLE_TORNADOVM_INIT_TIME) { - System.err.printf("TornadoVM GPU batched prefill/decode execution plan creation: %.2f ms\n", (planCreationTime - startTime) / 1_000_000.0); - System.err.printf("Java to GPU JIT compiler warmup: %.2f ms\n", (warmupTime - planCreationTime) / 1_000_000.0); - System.err.printf("Transfer read-only weights to GPU: %.2f ms\n", (copyTime - warmupTime) / 1_000_000.0); - System.err.printf("Finished TornadoVM initialization...\n \n"); - } + RunMetrics.setTornadoMetrics(planCreationTime - startTime, warmupTime - planCreationTime, copyTime - warmupTime); } // ── Batch Prefill Activation graphs ───────────────────────────────────────────────────── diff --git a/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanWithPrefillDecode.java b/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanWithPrefillDecode.java index 97e51f23..517024f9 100644 --- a/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanWithPrefillDecode.java +++ b/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanWithPrefillDecode.java @@ -85,17 +85,7 @@ public class TornadoVMMasterPlanWithPrefillDecode implements TornadoVMMasterPlan forceCopyInReadOnlyData(); long copyTime = System.nanoTime(); - RunMetrics.setTornadoMetrics(warmupTime - startTime, copyTime - warmupTime); - - if (ENABLE_TORNADOVM_INIT_TIME) { - System.err.printf("TornadoVM GPU single-token prefill/decode execution plan creation: %.2f ms\n", - (planCreationTime - startTime) / 1_000_000.0); - System.err.printf("Java to GPU JIT compiler warmup: %.2f ms\n", - (warmupTime - planCreationTime) / 1_000_000.0); - 
System.err.printf("Transfer read-only weights to GPU: %.2f ms\n", - (copyTime - warmupTime) / 1_000_000.0); - System.err.printf("Finished TornadoVM initialization...\n \n"); - } + RunMetrics.setTornadoMetrics(planCreationTime - startTime, warmupTime - planCreationTime, copyTime - warmupTime); } // ── Activation graph ───────────────────────────────────────────────────── From 623f613fc0a6b175989e3657f895eefebb01ec1c Mon Sep 17 00:00:00 2001 From: Orion Papadakis Date: Tue, 5 May 2026 15:22:43 +0300 Subject: [PATCH 3/5] Track model loading duration in `RunMetrics` and include it in timing reports --- .../java/org/beehive/gpullama3/LlamaApp.java | 2 -- .../beehive/gpullama3/auxiliary/RunMetrics.java | 2 ++ .../gpullama3/model/loader/ModelLoader.java | 17 +++++++++-------- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/src/main/java/org/beehive/gpullama3/LlamaApp.java b/src/main/java/org/beehive/gpullama3/LlamaApp.java index 463bd141..ef64ca23 100644 --- a/src/main/java/org/beehive/gpullama3/LlamaApp.java +++ b/src/main/java/org/beehive/gpullama3/LlamaApp.java @@ -35,9 +35,7 @@ private static void runSingleInstruction(Model model, Sampler sampler, Options o */ static void main(String[] args) throws IOException { Options options = Options.parseOptions(args); - long loadStart = System.nanoTime(); Model model = loadModel(options); - RunMetrics.setLoadDuration(System.nanoTime() - loadStart); Sampler sampler = createSampler(model, options); if (options.interactive()) { diff --git a/src/main/java/org/beehive/gpullama3/auxiliary/RunMetrics.java b/src/main/java/org/beehive/gpullama3/auxiliary/RunMetrics.java index 9211e0f1..cfd499f4 100644 --- a/src/main/java/org/beehive/gpullama3/auxiliary/RunMetrics.java +++ b/src/main/java/org/beehive/gpullama3/auxiliary/RunMetrics.java @@ -120,9 +120,11 @@ public static void printMetrics() { if (Boolean.parseBoolean(System.getProperty("llama.EnableTimingForTornadoVMInit", "false")) && m.tornadoPlanCreationNs > 0) { 
System.err.printf( + "GGUF Model Load: %.2f ms%n" + "Compilation & CodeGen: %.2f ms%n" + "Warmup: %.2f ms%n" + "Read-only weights Copy-in: %.2f ms%n", + m.loadDurationNs / 1_000_000.0, m.tornadoPlanCreationNs / 1_000_000.0, m.tornadoJitNs / 1_000_000.0, m.readOnlyWeightsCopyInNs / 1_000_000.0); diff --git a/src/main/java/org/beehive/gpullama3/model/loader/ModelLoader.java b/src/main/java/org/beehive/gpullama3/model/loader/ModelLoader.java index 83b25987..bac9aa38 100644 --- a/src/main/java/org/beehive/gpullama3/model/loader/ModelLoader.java +++ b/src/main/java/org/beehive/gpullama3/model/loader/ModelLoader.java @@ -1,6 +1,7 @@ package org.beehive.gpullama3.model.loader; import org.beehive.gpullama3.Options; +import org.beehive.gpullama3.auxiliary.RunMetrics; import org.beehive.gpullama3.tensor.GGMLType; import org.beehive.gpullama3.tensor.GGUF; import org.beehive.gpullama3.tensor.*; @@ -91,24 +92,24 @@ public static Model loadModel(Options options) throws IOException { int contextLength = options.maxTokens(); boolean useTornadovm = options.useTornadovm(); - // initial load of metadata from gguf file + long start = System.nanoTime(); GGUF gguf = GGUF.loadGGUFMetadata(ggufPath); - // detect model type ModelType modelType = detectModelType(gguf.getMetadata()); - // model type-specific load - return modelType.loadModel(gguf.getFileChannel(), gguf, contextLength, useTornadovm); + Model model = modelType.loadModel(gguf.getFileChannel(), gguf, contextLength, useTornadovm); + RunMetrics.setLoadDuration(System.nanoTime() - start); + return model; } /** * For compatibility with langchain4j and quarkus. 
*/ public static Model loadModel(Path ggufPath, int contextLength, boolean loadWeights, boolean useTornadovm) throws IOException { - // initial load of metadata from gguf file + long start = System.nanoTime(); GGUF gguf = GGUF.loadGGUFMetadata(ggufPath); - // detect model type ModelType modelType = detectModelType(gguf.getMetadata()); - // model type-specific load - return modelType.loadModel(gguf.getFileChannel(), gguf, contextLength, useTornadovm); + Model model = modelType.loadModel(gguf.getFileChannel(), gguf, contextLength, useTornadovm); + RunMetrics.setLoadDuration(System.nanoTime() - start); + return model; } /** From 964927b65dcddc8259a01dae39ffb9ac04f13caa Mon Sep 17 00:00:00 2001 From: Orion Papadakis Date: Tue, 5 May 2026 15:37:01 +0300 Subject: [PATCH 4/5] Remove `LastRunMetrics` as it is replaced by `RunMetrics` --- LlamaTornadoCli.java | 1 - .../gpullama3/auxiliary/LastRunMetrics.java | 33 ------------------- 2 files changed, 34 deletions(-) delete mode 100644 src/main/java/org/beehive/gpullama3/auxiliary/LastRunMetrics.java diff --git a/LlamaTornadoCli.java b/LlamaTornadoCli.java index cb0bf01c..e6e1186b 100755 --- a/LlamaTornadoCli.java +++ b/LlamaTornadoCli.java @@ -19,7 +19,6 @@ package org.beehive.gpullama3.cli; import org.beehive.gpullama3.Options; -import org.beehive.gpullama3.auxiliary.LastRunMetrics; import org.beehive.gpullama3.inference.sampler.Sampler; import org.beehive.gpullama3.model.Model; diff --git a/src/main/java/org/beehive/gpullama3/auxiliary/LastRunMetrics.java b/src/main/java/org/beehive/gpullama3/auxiliary/LastRunMetrics.java deleted file mode 100644 index 0d411801..00000000 --- a/src/main/java/org/beehive/gpullama3/auxiliary/LastRunMetrics.java +++ /dev/null @@ -1,33 +0,0 @@ -package org.beehive.gpullama3.auxiliary; - -/** - * Record to store metrics from the last model run. 
- * @param totalTokens The total number of tokens processed - * @param totalSeconds The total time in seconds - */ -public record LastRunMetrics(int totalTokens, double totalSeconds) { - /** - * Singleton instance to store the latest metrics - */ - private static LastRunMetrics latestMetrics; - - /** - * Sets the metrics for the latest run - * - * @param tokens The total number of tokens processed - * @param seconds The total time in seconds - */ - public static void setMetrics(int tokens, double seconds) { - latestMetrics = new LastRunMetrics(tokens, seconds); - } - - /** - * Prints the metrics from the latest run to stderr - */ - public static void printMetrics() { - if (latestMetrics != null) { - double tokensPerSecond = latestMetrics.totalTokens() / latestMetrics.totalSeconds(); - System.err.printf("\n\nachieved tok/s: %.2f. Tokens: %d, seconds: %.2f\n", tokensPerSecond, latestMetrics.totalTokens(), latestMetrics.totalSeconds()); - } - } -} From e966ed1f8da9a4cc8bf77be448588f8435773494 Mon Sep 17 00:00:00 2001 From: Orion Papadakis Date: Tue, 5 May 2026 16:28:45 +0300 Subject: [PATCH 5/5] Add `RunMetricsRenderer` abstraction with support for human-readable, JSON, and GitHub Step Summary formats --- .../gpullama3/auxiliary/RunMetrics.java | 123 +++++++++++------- .../metrics/GitHubMetricsRenderer.java | 44 +++++++ .../metrics/HumanMetricsRenderer.java | 53 ++++++++ .../metrics/JsonMetricsRenderer.java | 67 ++++++++++ .../auxiliary/metrics/MetricsRenderer.java | 7 + .../auxiliary/metrics/RunMetricsSnapshot.java | 53 ++++++++ 6 files changed, 297 insertions(+), 50 deletions(-) create mode 100644 src/main/java/org/beehive/gpullama3/auxiliary/metrics/GitHubMetricsRenderer.java create mode 100644 src/main/java/org/beehive/gpullama3/auxiliary/metrics/HumanMetricsRenderer.java create mode 100644 src/main/java/org/beehive/gpullama3/auxiliary/metrics/JsonMetricsRenderer.java create mode 100644 src/main/java/org/beehive/gpullama3/auxiliary/metrics/MetricsRenderer.java 
create mode 100644 src/main/java/org/beehive/gpullama3/auxiliary/metrics/RunMetricsSnapshot.java diff --git a/src/main/java/org/beehive/gpullama3/auxiliary/RunMetrics.java b/src/main/java/org/beehive/gpullama3/auxiliary/RunMetrics.java index cfd499f4..43a464e2 100644 --- a/src/main/java/org/beehive/gpullama3/auxiliary/RunMetrics.java +++ b/src/main/java/org/beehive/gpullama3/auxiliary/RunMetrics.java @@ -1,31 +1,46 @@ package org.beehive.gpullama3.auxiliary; +import org.beehive.gpullama3.auxiliary.metrics.GitHubMetricsRenderer; +import org.beehive.gpullama3.auxiliary.metrics.HumanMetricsRenderer; +import org.beehive.gpullama3.auxiliary.metrics.JsonMetricsRenderer; +import org.beehive.gpullama3.auxiliary.metrics.MetricsRenderer; +import org.beehive.gpullama3.auxiliary.metrics.RunMetricsSnapshot; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.nio.file.Files; +import java.nio.file.Path; + /** * Singleton that accumulates fine-grained performance metrics across one inference run. * *

Metrics are set incrementally by different layers of the stack:

  * <ul>
- *   <li>{@link #setLoadDuration} — called from {@code LlamaApp} around model file loading</li>
+ *   <li>{@link #setLoadDuration} — called from {@code ModelLoader}</li>
  *   <li>{@link #setTornadoMetrics} — called from TornadoVM plan constructors</li>
  *   <li>{@link #setInferenceMetrics} — called from InferenceEngine variants at end of generation</li>
  *   <li>{@link #setHasPrefillPhase} — called from prefill-decode engine variants</li>
  * </ul>
  *
- * <p>All durations are stored in nanoseconds. {@link #printMetrics()} prints throughput only:</p>
+ * <p>All durations are stored in nanoseconds. {@link #printMetrics()} builds an immutable
+ * {@link RunMetricsSnapshot}, selects a {@link MetricsRenderer}, and writes to the configured sink.</p>
+ *
+ * <p>Configurable via system properties:</p>
  *
  * <ul>
- *   <li>Standard engine: {@code Total: X tok/s}</li>
- *   <li>Prefill-decode engines: {@code Prefill: X tok/s | Decode: Y tok/s | Total: Z tok/s}</li>
+ *   <li>{@code llama.metrics.format} — {@code human} (default) | {@code json} | {@code github}</li>
+ *   <li>{@code llama.metrics.output} — {@code stderr} (default) | {@code stdout} | {@code file}</li>
+ *   <li>{@code llama.metrics.file} — target path when {@code output=file}</li>
  * </ul>
*/ public final class RunMetrics { // ── Core metrics (nanoseconds) ──────────────────────────────────────────── - private long totalDurationNs; - private long loadDurationNs; - private int promptEvalCount; - private long promptEvalDurationNs; - private int evalCount; - private long evalDurationNs; + private long totalDurationNs; + private long loadDurationNs; + private int promptEvalCount; + private long promptEvalDurationNs; + private int evalCount; + private long evalDurationNs; private boolean hasPrefillPhase; // ── TornadoVM-specific metrics (nanoseconds) ────────────────────────────── @@ -53,9 +68,9 @@ public static void setLoadDuration(long ns) { * @param weightCopyNs first-execution weight upload ({@code forceCopyInReadOnlyData()}) */ public static void setTornadoMetrics(long planCreationNs, long jitNs, long weightCopyNs) { - INSTANCE.tornadoPlanCreationNs = planCreationNs; - INSTANCE.tornadoJitNs = jitNs; - INSTANCE.readOnlyWeightsCopyInNs = weightCopyNs; + INSTANCE.tornadoPlanCreationNs = planCreationNs; + INSTANCE.tornadoJitNs = jitNs; + INSTANCE.readOnlyWeightsCopyInNs = weightCopyNs; } /** @@ -86,49 +101,57 @@ public static void setHasPrefillPhase(boolean value) { INSTANCE.hasPrefillPhase = value; } + // ── Snapshot ────────────────────────────────────────────────────────────── + + /** Returns an immutable snapshot of all currently collected metrics. */ + public static RunMetricsSnapshot snapshot() { + RunMetrics m = INSTANCE; + return RunMetricsSnapshot.of( + m.totalDurationNs, m.loadDurationNs, + m.promptEvalCount, m.promptEvalDurationNs, + m.evalCount, m.evalDurationNs, + m.hasPrefillPhase, + m.tornadoPlanCreationNs, m.tornadoJitNs, + m.readOnlyWeightsCopyInNs); + } + // ── Output ──────────────────────────────────────────────────────────────── - /** Prints throughput metrics to {@code stderr}, and TornadoVM init metrics when enabled. 
*/ + /** + * Builds a snapshot, selects a renderer based on {@code llama.metrics.format}, + * and writes the result to the sink configured by {@code llama.metrics.output}. + */ public static void printMetrics() { - RunMetrics m = INSTANCE; + RunMetricsSnapshot snap = snapshot(); + + MetricsRenderer renderer = switch (System.getProperty("llama.metrics.format", "human").toLowerCase()) { + case "json" -> new JsonMetricsRenderer(); + case "github" -> new GitHubMetricsRenderer(); + default -> new HumanMetricsRenderer(); + }; - int totalTokens = m.promptEvalCount + m.evalCount; - double totalSecs = m.totalDurationNs / 1e9; - double totalRate = totalSecs > 0 ? totalTokens / totalSecs : 0; - - System.err.println(); - System.err.println("==== Performance Metrics ===="); - if (m.hasPrefillPhase) { - double prefillSecs = m.promptEvalDurationNs / 1e9; - double decodeSecs = m.evalDurationNs / 1e9; - double prefillRate = (prefillSecs > 0 && m.promptEvalCount > 0) - ? m.promptEvalCount / prefillSecs : 0; - double decodeRate = (decodeSecs > 0 && m.evalCount > 0) - ? m.evalCount / decodeSecs : 0; - System.err.printf( - "Total achieved tok/s: %.2f. Tokens: %d, seconds: %.2f%n" + - "¬Prefill achieved tok/s: %.2f. Tokens: %d, seconds: %.2f%n" + - "¬Decode achieved tok/s: %.2f. Tokens: %d, seconds: %.2f%n", - totalRate, totalTokens, totalSecs, - prefillRate, m.promptEvalCount, prefillSecs, - decodeRate, m.evalCount, decodeSecs); - } else { - System.err.printf("achieved tok/s: %.2f. 
Tokens: %d, seconds: %.2f%n", - totalRate, totalTokens, totalSecs); + String rendered = renderer.render(snap); + + switch (System.getProperty("llama.metrics.output", "stderr").toLowerCase()) { + case "stdout" -> System.out.print(rendered); + case "file" -> writeToFile(rendered); + default -> System.err.print(rendered); } + } - if (Boolean.parseBoolean(System.getProperty("llama.EnableTimingForTornadoVMInit", "false")) - && m.tornadoPlanCreationNs > 0) { - System.err.printf( - "GGUF Model Load: %.2f ms%n" + - "Compilation & CodeGen: %.2f ms%n" + - "Warmup: %.2f ms%n" + - "Read-only weights Copy-in: %.2f ms%n", - m.loadDurationNs / 1_000_000.0, - m.tornadoPlanCreationNs / 1_000_000.0, - m.tornadoJitNs / 1_000_000.0, - m.readOnlyWeightsCopyInNs / 1_000_000.0); + private static void writeToFile(String content) { + String filePath = System.getProperty("llama.metrics.file"); + if (filePath == null || filePath.isBlank()) { + throw new IllegalStateException( + "llama.metrics.output=file requires llama.metrics.file to be set"); + } + Path path = Path.of(filePath); + try { + Path parent = path.getParent(); + if (parent != null) Files.createDirectories(parent); + Files.writeString(path, content); + } catch (IOException e) { + throw new UncheckedIOException("Failed to write metrics to " + filePath, e); } - System.err.println(); } } diff --git a/src/main/java/org/beehive/gpullama3/auxiliary/metrics/GitHubMetricsRenderer.java b/src/main/java/org/beehive/gpullama3/auxiliary/metrics/GitHubMetricsRenderer.java new file mode 100644 index 00000000..380aec89 --- /dev/null +++ b/src/main/java/org/beehive/gpullama3/auxiliary/metrics/GitHubMetricsRenderer.java @@ -0,0 +1,44 @@ +package org.beehive.gpullama3.auxiliary.metrics; + +/** + * Renders metrics as a Markdown table suitable for appending to {@code $GITHUB_STEP_SUMMARY}. + * TornadoVM rows (compile, JIT, weight copy-in) are included only when plan-creation duration + * is non-zero, i.e. on GPU runs. + * + *

Enable via system properties and append the output file to the step summary:

+ *
+ *   -Dllama.metrics.format=github
+ *   -Dllama.metrics.output=file
+ *   -Dllama.metrics.file=/tmp/metrics.md
+ * 
+ * + *

In a GitHub Actions workflow step:

+ *
+ *   cat /tmp/metrics.md >> $GITHUB_STEP_SUMMARY
+ * 
+ */ +public final class GitHubMetricsRenderer implements MetricsRenderer { + + @Override + public String render(RunMetricsSnapshot s) { + StringBuilder sb = new StringBuilder(); + sb.append("| metric | value |\n"); + sb.append("|---|---:|\n"); + sb.append(String.format("| eval tok/s | %.2f |%n", s.evalRate())); + sb.append(String.format("| prompt eval tok/s | %.2f |%n", s.promptEvalRate())); + sb.append(String.format("| total tok/s | %.2f |%n", s.totalRate())); + sb.append(String.format("| load ms | %.2f |%n", s.loadDuration() / 1_000_000.0)); + sb.append(String.format("| eval tokens | %d |%n", s.evalCount())); + sb.append(String.format("| prompt tokens | %d |%n", s.promptEvalCount())); + sb.append(String.format("| total tokens | %d |%n", s.totalCount())); + if (s.tornadoPlanCreationDuration() > 0) { + sb.append(String.format("| compile ms | %.2f |%n", + s.tornadoPlanCreationDuration() / 1_000_000.0)); + sb.append(String.format("| jit ms | %.2f |%n", + s.tornadoJitDuration() / 1_000_000.0)); + sb.append(String.format("| weight copy-in ms | %.2f |%n", + s.tornadoReadOnlyWeightsCopyInDuration() / 1_000_000.0)); + } + return sb.toString(); + } +} diff --git a/src/main/java/org/beehive/gpullama3/auxiliary/metrics/HumanMetricsRenderer.java b/src/main/java/org/beehive/gpullama3/auxiliary/metrics/HumanMetricsRenderer.java new file mode 100644 index 00000000..093018a0 --- /dev/null +++ b/src/main/java/org/beehive/gpullama3/auxiliary/metrics/HumanMetricsRenderer.java @@ -0,0 +1,53 @@ +package org.beehive.gpullama3.auxiliary.metrics; + +/** + * Renders metrics in human-readable format to {@code stderr}. + * + *

This is the default renderer — no configuration needed. To enable explicitly:

+ *
+ *   -Dllama.metrics.format=human   (default, can be omitted)
+ *   -Dllama.metrics.output=stderr  (default, can be omitted)
+ * 
+ * + *

To also print TornadoVM initialisation timings (plan creation, JIT, weight copy-in), + * additionally set:

+ *
+ *   -Dllama.EnableTimingForTornadoVMInit=true
+ * 
+ */ +public final class HumanMetricsRenderer implements MetricsRenderer { + + @Override + public String render(RunMetricsSnapshot s) { + StringBuilder sb = new StringBuilder(); + sb.append("\n==== Performance Metrics ====\n"); + + if (s.hasPrefillPhase()) { + sb.append(String.format( + "Total achieved tok/s: %.2f. Tokens: %d, seconds: %.2f%n" + + "¬Prefill achieved tok/s: %.2f. Tokens: %d, seconds: %.2f%n" + + "¬Decode achieved tok/s: %.2f. Tokens: %d, seconds: %.2f%n", + s.totalRate(), s.totalCount(), s.totalDuration() / 1e9, + s.promptEvalRate(), s.promptEvalCount(), s.promptEvalDuration() / 1e9, + s.evalRate(), s.evalCount(), s.evalDuration() / 1e9)); + } else { + sb.append(String.format("achieved tok/s: %.2f. Tokens: %d, seconds: %.2f%n", + s.totalRate(), s.totalCount(), s.totalDuration() / 1e9)); + } + + if (Boolean.parseBoolean(System.getProperty("llama.EnableTimingForTornadoVMInit", "false")) + && s.tornadoPlanCreationDuration() > 0) { + sb.append(String.format( + "GGUF Model Load: %.2f ms%n" + + "Compilation & CodeGen: %.2f ms%n" + + "Warmup: %.2f ms%n" + + "Read-only weights Copy-in: %.2f ms%n", + s.loadDuration() / 1_000_000.0, + s.tornadoPlanCreationDuration() / 1_000_000.0, + s.tornadoJitDuration() / 1_000_000.0, + s.tornadoReadOnlyWeightsCopyInDuration() / 1_000_000.0)); + } + + return sb.toString(); + } +} diff --git a/src/main/java/org/beehive/gpullama3/auxiliary/metrics/JsonMetricsRenderer.java b/src/main/java/org/beehive/gpullama3/auxiliary/metrics/JsonMetricsRenderer.java new file mode 100644 index 00000000..9d5db0a1 --- /dev/null +++ b/src/main/java/org/beehive/gpullama3/auxiliary/metrics/JsonMetricsRenderer.java @@ -0,0 +1,67 @@ +package org.beehive.gpullama3.auxiliary.metrics; + +import java.util.ArrayList; +import java.util.List; + +/** + * Renders metrics as an Ollama-compatible JSON object. + * All duration fields are in nanoseconds; rate fields are in tokens per second. 
+ * The {@code tornadovm} nested object is always included (fields are zero on CPU runs). + * + *

Enable via system properties:

+ *
+ *   -Dllama.metrics.format=json
+ *   -Dllama.metrics.output=stdout   # pipe to jq or another tool
+ * 
+ * + *

Or write to a file:

+ *
+ *   -Dllama.metrics.format=json
+ *   -Dllama.metrics.output=file
+ *   -Dllama.metrics.file=/path/to/metrics.json
+ * 
+ * </pre>
+ */
+public final class JsonMetricsRenderer implements MetricsRenderer {
+
+    @Override
+    public String render(RunMetricsSnapshot s) {
+        List<String> fields = new ArrayList<>();
+        fields.add(field(" ", "total_duration", s.totalDuration()));
+        fields.add(field(" ", "load_duration", s.loadDuration()));
+        fields.add(field(" ", "prompt_eval_count", s.promptEvalCount()));
+        fields.add(field(" ", "prompt_eval_duration", s.promptEvalDuration()));
+        fields.add(field(" ", "eval_count", s.evalCount()));
+        fields.add(field(" ", "eval_duration", s.evalDuration()));
+        fields.add(field(" ", "total_count", s.totalCount()));
+        fields.add(field(" ", "prompt_eval_rate", s.promptEvalRate()));
+        fields.add(field(" ", "eval_rate", s.evalRate()));
+        fields.add(field(" ", "total_rate", s.totalRate()));
+        fields.add(field(" ", "has_prefill_phase", s.hasPrefillPhase()));
+        fields.add(tornadoObject(s));
+        return "{\n" + String.join(",\n", fields) + "\n}";
+    }
+
+    private static String tornadoObject(RunMetricsSnapshot s) {
+        List<String> inner = new ArrayList<>();
+        inner.add(field(" ", "plan_creation_duration", s.tornadoPlanCreationDuration()));
+        inner.add(field(" ", "jit_duration", s.tornadoJitDuration()));
+        inner.add(field(" ", "read_only_weights_copy_in_duration", s.tornadoReadOnlyWeightsCopyInDuration()));
+        return " \"tornadovm\": {\n" + String.join(",\n", inner) + "\n }";
+    }
+
+    private static String field(String indent, String key, long value) {
+        return indent + "\"" + key + "\": " + value;
+    }
+
+    private static String field(String indent, String key, int value) {
+        return indent + "\"" + key + "\": " + value;
+    }
+
+    private static String field(String indent, String key, double value) {
+        return indent + "\"" + key + "\": " + String.format("%.4f", value);
+    }
+
+    private static String field(String indent, String key, boolean value) {
+        return indent + "\"" + key + "\": " + value;
+    }
+}
diff --git a/src/main/java/org/beehive/gpullama3/auxiliary/metrics/MetricsRenderer.java
b/src/main/java/org/beehive/gpullama3/auxiliary/metrics/MetricsRenderer.java new file mode 100644 index 00000000..0afcd3a1 --- /dev/null +++ b/src/main/java/org/beehive/gpullama3/auxiliary/metrics/MetricsRenderer.java @@ -0,0 +1,7 @@ +package org.beehive.gpullama3.auxiliary.metrics; + +/** Renders a {@link RunMetricsSnapshot} to a string for a specific output format. */ +@FunctionalInterface +public interface MetricsRenderer { + String render(RunMetricsSnapshot snapshot); +} diff --git a/src/main/java/org/beehive/gpullama3/auxiliary/metrics/RunMetricsSnapshot.java b/src/main/java/org/beehive/gpullama3/auxiliary/metrics/RunMetricsSnapshot.java new file mode 100644 index 00000000..2051b88c --- /dev/null +++ b/src/main/java/org/beehive/gpullama3/auxiliary/metrics/RunMetricsSnapshot.java @@ -0,0 +1,53 @@ +package org.beehive.gpullama3.auxiliary.metrics; + +/** + * Immutable snapshot of all performance metrics for a single inference run. + * + *

Raw duration fields are in nanoseconds. Derived rate fields are in tokens per second. + * Construct via {@link #of} — it computes the derived values from the raw inputs.

+ */ +public record RunMetricsSnapshot( + long totalDuration, + long loadDuration, + int promptEvalCount, + long promptEvalDuration, + int evalCount, + long evalDuration, + boolean hasPrefillPhase, + long tornadoPlanCreationDuration, + long tornadoJitDuration, + long tornadoReadOnlyWeightsCopyInDuration, + // derived + int totalCount, + double promptEvalRate, + double evalRate, + double totalRate +) { + public static RunMetricsSnapshot of( + long totalDuration, long loadDuration, + int promptEvalCount, long promptEvalDuration, + int evalCount, long evalDuration, + boolean hasPrefillPhase, + long tornadoPlanCreationDuration, + long tornadoJitDuration, + long tornadoReadOnlyWeightsCopyInDuration) { + + int totalCount = promptEvalCount + evalCount; + double promptEvalRate = tokensPerSecond(promptEvalCount, promptEvalDuration); + double evalRate = tokensPerSecond(evalCount, evalDuration); + double totalRate = tokensPerSecond(totalCount, totalDuration); + + return new RunMetricsSnapshot( + totalDuration, loadDuration, + promptEvalCount, promptEvalDuration, + evalCount, evalDuration, + hasPrefillPhase, + tornadoPlanCreationDuration, tornadoJitDuration, + tornadoReadOnlyWeightsCopyInDuration, + totalCount, promptEvalRate, evalRate, totalRate); + } + + private static double tokensPerSecond(int tokens, long durationNs) { + return (durationNs > 0 && tokens > 0) ? tokens / (durationNs / 1e9) : 0.0; + } +}