diff --git a/LlamaTornadoCli.java b/LlamaTornadoCli.java
index cb0bf01c..e6e1186b 100755
--- a/LlamaTornadoCli.java
+++ b/LlamaTornadoCli.java
@@ -19,7 +19,6 @@
package org.beehive.gpullama3.cli;
import org.beehive.gpullama3.Options;
-import org.beehive.gpullama3.auxiliary.LastRunMetrics;
import org.beehive.gpullama3.inference.sampler.Sampler;
import org.beehive.gpullama3.model.Model;
diff --git a/llama-tornado b/llama-tornado
index 11ada591..1d6c3d23 100755
--- a/llama-tornado
+++ b/llama-tornado
@@ -96,8 +96,6 @@ class LlamaRunner:
if args.cuda_graphs:
cmd.append("-Dllama.cudaGraphs=true")
- elif args.no_cuda_graphs:
- cmd.append("-Dllama.cudaGraphs=false")
# Debug options
debug_config = []
diff --git a/src/main/java/org/beehive/gpullama3/LlamaApp.java b/src/main/java/org/beehive/gpullama3/LlamaApp.java
index 822a082c..ef64ca23 100644
--- a/src/main/java/org/beehive/gpullama3/LlamaApp.java
+++ b/src/main/java/org/beehive/gpullama3/LlamaApp.java
@@ -1,6 +1,6 @@
package org.beehive.gpullama3;
-import org.beehive.gpullama3.auxiliary.LastRunMetrics;
+import org.beehive.gpullama3.auxiliary.RunMetrics;
import org.beehive.gpullama3.inference.sampler.Sampler;
import org.beehive.gpullama3.model.Model;
@@ -18,7 +18,7 @@ private static void runSingleInstruction(Model model, Sampler sampler, Options o
String response = model.runInstructOnce(sampler, options);
System.out.println(response);
if (SHOW_PERF_INTERACTIVE) {
- LastRunMetrics.printMetrics();
+ RunMetrics.printMetrics();
}
}
diff --git a/src/main/java/org/beehive/gpullama3/auxiliary/LastRunMetrics.java b/src/main/java/org/beehive/gpullama3/auxiliary/LastRunMetrics.java
deleted file mode 100644
index 0d411801..00000000
--- a/src/main/java/org/beehive/gpullama3/auxiliary/LastRunMetrics.java
+++ /dev/null
@@ -1,33 +0,0 @@
-package org.beehive.gpullama3.auxiliary;
-
-/**
- * Record to store metrics from the last model run.
- * @param totalTokens The total number of tokens processed
- * @param totalSeconds The total time in seconds
- */
-public record LastRunMetrics(int totalTokens, double totalSeconds) {
- /**
- * Singleton instance to store the latest metrics
- */
- private static LastRunMetrics latestMetrics;
-
- /**
- * Sets the metrics for the latest run
- *
- * @param tokens The total number of tokens processed
- * @param seconds The total time in seconds
- */
- public static void setMetrics(int tokens, double seconds) {
- latestMetrics = new LastRunMetrics(tokens, seconds);
- }
-
- /**
- * Prints the metrics from the latest run to stderr
- */
- public static void printMetrics() {
- if (latestMetrics != null) {
- double tokensPerSecond = latestMetrics.totalTokens() / latestMetrics.totalSeconds();
- System.err.printf("\n\nachieved tok/s: %.2f. Tokens: %d, seconds: %.2f\n", tokensPerSecond, latestMetrics.totalTokens(), latestMetrics.totalSeconds());
- }
- }
-}
diff --git a/src/main/java/org/beehive/gpullama3/auxiliary/RunMetrics.java b/src/main/java/org/beehive/gpullama3/auxiliary/RunMetrics.java
new file mode 100644
index 00000000..43a464e2
--- /dev/null
+++ b/src/main/java/org/beehive/gpullama3/auxiliary/RunMetrics.java
@@ -0,0 +1,157 @@
+package org.beehive.gpullama3.auxiliary;
+
+import org.beehive.gpullama3.auxiliary.metrics.GitHubMetricsRenderer;
+import org.beehive.gpullama3.auxiliary.metrics.HumanMetricsRenderer;
+import org.beehive.gpullama3.auxiliary.metrics.JsonMetricsRenderer;
+import org.beehive.gpullama3.auxiliary.metrics.MetricsRenderer;
+import org.beehive.gpullama3.auxiliary.metrics.RunMetricsSnapshot;
+
+import java.io.IOException;
+import java.io.UncheckedIOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+/**
+ * Singleton that accumulates fine-grained performance metrics across one inference run.
+ *
+ *
+ * Metrics are set incrementally by different layers of the stack:
+ *
+ * - {@link #setLoadDuration} — called from {@code ModelLoader}
+ * - {@link #setTornadoMetrics} — called from TornadoVM plan constructors
+ * - {@link #setInferenceMetrics} — called from InferenceEngine variants at end of generation
+ * - {@link #setHasPrefillPhase} — called from prefill-decode engine variants
+ *
+ *
+ * All durations are stored in nanoseconds. {@link #printMetrics()} builds an immutable
+ * {@link RunMetricsSnapshot}, selects a {@link MetricsRenderer}, and writes to the configured sink.
+ *
+ * Configurable via system properties:
+ *
+ * - {@code llama.metrics.format} — {@code human} (default) | {@code json} | {@code github}
+ * - {@code llama.metrics.output} — {@code stderr} (default) | {@code stdout} | {@code file}
+ * - {@code llama.metrics.file} — target path when {@code output=file}
+ *
+ */
+public final class RunMetrics {
+
+ // ── Core metrics (nanoseconds) ────────────────────────────────────────────
+ private long totalDurationNs;
+ private long loadDurationNs;
+ private int promptEvalCount;
+ private long promptEvalDurationNs;
+ private int evalCount;
+ private long evalDurationNs;
+ private boolean hasPrefillPhase;
+
+ // ── TornadoVM-specific metrics (nanoseconds) ──────────────────────────────
+ private long tornadoPlanCreationNs;
+ private long tornadoJitNs;
+ private long readOnlyWeightsCopyInNs;
+
+ // ── Singleton ─────────────────────────────────────────────────────────────
+ private static final RunMetrics INSTANCE = new RunMetrics();
+
+ private RunMetrics() {}
+
+ // ── Setters ───────────────────────────────────────────────────────────────
+
+ /** Records the time spent loading the model file (not including TornadoVM initialisation). */
+ public static void setLoadDuration(long ns) {
+ INSTANCE.loadDurationNs = ns;
+ }
+
+ /**
+ * Records TornadoVM-specific initialisation durations.
+ *
+ * @param planCreationNs task-graph construction ({@code createExecutionPlan()})
+ * @param jitNs JIT compilation ({@code withPreCompilation()})
+ * @param weightCopyNs first-execution weight upload ({@code forceCopyInReadOnlyData()})
+ */
+ public static void setTornadoMetrics(long planCreationNs, long jitNs, long weightCopyNs) {
+ INSTANCE.tornadoPlanCreationNs = planCreationNs;
+ INSTANCE.tornadoJitNs = jitNs;
+ INSTANCE.readOnlyWeightsCopyInNs = weightCopyNs;
+ }
+
+ /**
+ * Records inference-phase durations at the end of a generation run.
+ *
+ * @param promptCount number of prompt tokens processed (prefill)
+ * @param prefillNs wall-clock time spent in the prefill phase
+ * @param generatedCount number of tokens generated (decode)
+ * @param decodeNs wall-clock time spent in the decode phase
+ * @param totalNs total wall-clock time for the full inference call
+ */
+ public static void setInferenceMetrics(int promptCount, long prefillNs,
+ int generatedCount, long decodeNs,
+ long totalNs) {
+ INSTANCE.promptEvalCount = promptCount;
+ INSTANCE.promptEvalDurationNs = prefillNs;
+ INSTANCE.evalCount = generatedCount;
+ INSTANCE.evalDurationNs = decodeNs;
+ INSTANCE.totalDurationNs = totalNs;
+ }
+
+ /**
+ * Signals that prefill and decode are distinct timed phases.
+ * Called by {@code InferenceEngineWithPrefillDecode} and
+ * {@code InferenceEngineWithBatchPrefillDecode} before returning.
+ */
+ public static void setHasPrefillPhase(boolean value) {
+ INSTANCE.hasPrefillPhase = value;
+ }
+
+ // ── Snapshot ──────────────────────────────────────────────────────────────
+
+ /** Returns an immutable snapshot of all currently collected metrics. */
+ public static RunMetricsSnapshot snapshot() {
+ RunMetrics m = INSTANCE;
+ return RunMetricsSnapshot.of(
+ m.totalDurationNs, m.loadDurationNs,
+ m.promptEvalCount, m.promptEvalDurationNs,
+ m.evalCount, m.evalDurationNs,
+ m.hasPrefillPhase,
+ m.tornadoPlanCreationNs, m.tornadoJitNs,
+ m.readOnlyWeightsCopyInNs);
+ }
+
+ // ── Output ────────────────────────────────────────────────────────────────
+
+ /**
+ * Builds a snapshot, selects a renderer based on {@code llama.metrics.format},
+ * and writes the result to the sink configured by {@code llama.metrics.output}.
+ */
+ public static void printMetrics() {
+ RunMetricsSnapshot snap = snapshot();
+
+        MetricsRenderer renderer = switch (System.getProperty("llama.metrics.format", "human").toLowerCase(java.util.Locale.ROOT)) {
+ case "json" -> new JsonMetricsRenderer();
+ case "github" -> new GitHubMetricsRenderer();
+ default -> new HumanMetricsRenderer();
+ };
+
+ String rendered = renderer.render(snap);
+
+        switch (System.getProperty("llama.metrics.output", "stderr").toLowerCase(java.util.Locale.ROOT)) {
+ case "stdout" -> System.out.print(rendered);
+ case "file" -> writeToFile(rendered);
+ default -> System.err.print(rendered);
+ }
+ }
+
+ private static void writeToFile(String content) {
+ String filePath = System.getProperty("llama.metrics.file");
+ if (filePath == null || filePath.isBlank()) {
+ throw new IllegalStateException(
+ "llama.metrics.output=file requires llama.metrics.file to be set");
+ }
+ Path path = Path.of(filePath);
+ try {
+ Path parent = path.getParent();
+ if (parent != null) Files.createDirectories(parent);
+ Files.writeString(path, content);
+ } catch (IOException e) {
+ throw new UncheckedIOException("Failed to write metrics to " + filePath, e);
+ }
+ }
+}
diff --git a/src/main/java/org/beehive/gpullama3/auxiliary/metrics/GitHubMetricsRenderer.java b/src/main/java/org/beehive/gpullama3/auxiliary/metrics/GitHubMetricsRenderer.java
new file mode 100644
index 00000000..380aec89
--- /dev/null
+++ b/src/main/java/org/beehive/gpullama3/auxiliary/metrics/GitHubMetricsRenderer.java
@@ -0,0 +1,44 @@
+package org.beehive.gpullama3.auxiliary.metrics;
+
+/**
+ * Renders metrics as a Markdown table suitable for appending to {@code $GITHUB_STEP_SUMMARY}.
+ * TornadoVM rows (compile, JIT, weight copy-in) are included only when plan-creation duration
+ * is non-zero, i.e. on GPU runs.
+ *
+ * Enable via system properties and append the output file to the step summary:
+ *
+ * -Dllama.metrics.format=github
+ * -Dllama.metrics.output=file
+ * -Dllama.metrics.file=/tmp/metrics.md
+ *
+ *
+ * In a GitHub Actions workflow step:
+ *
+ * cat /tmp/metrics.md >> $GITHUB_STEP_SUMMARY
+ *
+ */
+public final class GitHubMetricsRenderer implements MetricsRenderer {
+
+ @Override
+ public String render(RunMetricsSnapshot s) {
+ StringBuilder sb = new StringBuilder();
+ sb.append("| metric | value |\n");
+ sb.append("|---|---:|\n");
+        sb.append(String.format(java.util.Locale.ROOT, "| eval tok/s | %.2f |\n", s.evalRate()));
+        sb.append(String.format(java.util.Locale.ROOT, "| prompt eval tok/s | %.2f |\n", s.promptEvalRate()));
+        sb.append(String.format(java.util.Locale.ROOT, "| total tok/s | %.2f |\n", s.totalRate()));
+        sb.append(String.format(java.util.Locale.ROOT, "| load ms | %.2f |\n", s.loadDuration() / 1_000_000.0));
+        sb.append(String.format("| eval tokens | %d |\n", s.evalCount()));
+        sb.append(String.format("| prompt tokens | %d |\n", s.promptEvalCount()));
+        sb.append(String.format("| total tokens | %d |\n", s.totalCount()));
+        if (s.tornadoPlanCreationDuration() > 0) {
+            sb.append(String.format(java.util.Locale.ROOT, "| compile ms | %.2f |\n",
+                    s.tornadoPlanCreationDuration() / 1_000_000.0));
+            sb.append(String.format(java.util.Locale.ROOT, "| jit ms | %.2f |\n",
+                    s.tornadoJitDuration() / 1_000_000.0));
+            sb.append(String.format(java.util.Locale.ROOT, "| weight copy-in ms | %.2f |\n",
+                    s.tornadoReadOnlyWeightsCopyInDuration() / 1_000_000.0));
+ }
+ return sb.toString();
+ }
+}
diff --git a/src/main/java/org/beehive/gpullama3/auxiliary/metrics/HumanMetricsRenderer.java b/src/main/java/org/beehive/gpullama3/auxiliary/metrics/HumanMetricsRenderer.java
new file mode 100644
index 00000000..093018a0
--- /dev/null
+++ b/src/main/java/org/beehive/gpullama3/auxiliary/metrics/HumanMetricsRenderer.java
@@ -0,0 +1,53 @@
+package org.beehive.gpullama3.auxiliary.metrics;
+
+/**
+ * Renders metrics in human-readable format to {@code stderr}.
+ *
+ * This is the default renderer — no configuration needed. To enable explicitly:
+ *
+ * -Dllama.metrics.format=human (default, can be omitted)
+ * -Dllama.metrics.output=stderr (default, can be omitted)
+ *
+ *
+ * To also print TornadoVM initialisation timings (plan creation, JIT, weight copy-in),
+ * additionally set:
+ *
+ * -Dllama.EnableTimingForTornadoVMInit=true
+ *
+ */
+public final class HumanMetricsRenderer implements MetricsRenderer {
+
+ @Override
+ public String render(RunMetricsSnapshot s) {
+ StringBuilder sb = new StringBuilder();
+ sb.append("\n==== Performance Metrics ====\n");
+
+ if (s.hasPrefillPhase()) {
+ sb.append(String.format(
+ "Total achieved tok/s: %.2f. Tokens: %d, seconds: %.2f%n" +
+                "Prefill achieved tok/s: %.2f. Tokens: %d, seconds: %.2f%n" +
+                "Decode achieved tok/s: %.2f. Tokens: %d, seconds: %.2f%n",
+ s.totalRate(), s.totalCount(), s.totalDuration() / 1e9,
+ s.promptEvalRate(), s.promptEvalCount(), s.promptEvalDuration() / 1e9,
+ s.evalRate(), s.evalCount(), s.evalDuration() / 1e9));
+ } else {
+ sb.append(String.format("achieved tok/s: %.2f. Tokens: %d, seconds: %.2f%n",
+ s.totalRate(), s.totalCount(), s.totalDuration() / 1e9));
+ }
+
+ if (Boolean.parseBoolean(System.getProperty("llama.EnableTimingForTornadoVMInit", "false"))
+ && s.tornadoPlanCreationDuration() > 0) {
+ sb.append(String.format(
+ "GGUF Model Load: %.2f ms%n" +
+ "Compilation & CodeGen: %.2f ms%n" +
+ "Warmup: %.2f ms%n" +
+ "Read-only weights Copy-in: %.2f ms%n",
+ s.loadDuration() / 1_000_000.0,
+ s.tornadoPlanCreationDuration() / 1_000_000.0,
+ s.tornadoJitDuration() / 1_000_000.0,
+ s.tornadoReadOnlyWeightsCopyInDuration() / 1_000_000.0));
+ }
+
+ return sb.toString();
+ }
+}
diff --git a/src/main/java/org/beehive/gpullama3/auxiliary/metrics/JsonMetricsRenderer.java b/src/main/java/org/beehive/gpullama3/auxiliary/metrics/JsonMetricsRenderer.java
new file mode 100644
index 00000000..9d5db0a1
--- /dev/null
+++ b/src/main/java/org/beehive/gpullama3/auxiliary/metrics/JsonMetricsRenderer.java
@@ -0,0 +1,67 @@
+package org.beehive.gpullama3.auxiliary.metrics;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Renders metrics as an Ollama-compatible JSON object.
+ * All duration fields are in nanoseconds; rate fields are in tokens per second.
+ * The {@code tornadovm} nested object is always included (fields are zero on CPU runs).
+ *
+ * Enable via system properties:
+ *
+ * -Dllama.metrics.format=json
+ * -Dllama.metrics.output=stdout # pipe to jq or another tool
+ *
+ *
+ * Or write to a file:
+ *
+ * -Dllama.metrics.format=json
+ * -Dllama.metrics.output=file
+ * -Dllama.metrics.file=/path/to/metrics.json
+ *
+ */
+public final class JsonMetricsRenderer implements MetricsRenderer {
+
+ @Override
+ public String render(RunMetricsSnapshot s) {
+        List<String> fields = new ArrayList<>();
+ fields.add(field(" ", "total_duration", s.totalDuration()));
+ fields.add(field(" ", "load_duration", s.loadDuration()));
+ fields.add(field(" ", "prompt_eval_count", s.promptEvalCount()));
+ fields.add(field(" ", "prompt_eval_duration", s.promptEvalDuration()));
+ fields.add(field(" ", "eval_count", s.evalCount()));
+ fields.add(field(" ", "eval_duration", s.evalDuration()));
+ fields.add(field(" ", "total_count", s.totalCount()));
+ fields.add(field(" ", "prompt_eval_rate", s.promptEvalRate()));
+ fields.add(field(" ", "eval_rate", s.evalRate()));
+ fields.add(field(" ", "total_rate", s.totalRate()));
+ fields.add(field(" ", "has_prefill_phase", s.hasPrefillPhase()));
+ fields.add(tornadoObject(s));
+ return "{\n" + String.join(",\n", fields) + "\n}";
+ }
+
+ private static String tornadoObject(RunMetricsSnapshot s) {
+        List<String> inner = new ArrayList<>();
+ inner.add(field(" ", "plan_creation_duration", s.tornadoPlanCreationDuration()));
+ inner.add(field(" ", "jit_duration", s.tornadoJitDuration()));
+ inner.add(field(" ", "read_only_weights_copy_in_duration", s.tornadoReadOnlyWeightsCopyInDuration()));
+ return " \"tornadovm\": {\n" + String.join(",\n", inner) + "\n }";
+ }
+
+ private static String field(String indent, String key, long value) {
+ return indent + "\"" + key + "\": " + value;
+ }
+
+ private static String field(String indent, String key, int value) {
+ return indent + "\"" + key + "\": " + value;
+ }
+
+    private static String field(String indent, String key, double value) {
+        return indent + "\"" + key + "\": " + String.format(java.util.Locale.ROOT, "%.4f", value);
+    }
+
+ private static String field(String indent, String key, boolean value) {
+ return indent + "\"" + key + "\": " + value;
+ }
+}
diff --git a/src/main/java/org/beehive/gpullama3/auxiliary/metrics/MetricsRenderer.java b/src/main/java/org/beehive/gpullama3/auxiliary/metrics/MetricsRenderer.java
new file mode 100644
index 00000000..0afcd3a1
--- /dev/null
+++ b/src/main/java/org/beehive/gpullama3/auxiliary/metrics/MetricsRenderer.java
@@ -0,0 +1,7 @@
+package org.beehive.gpullama3.auxiliary.metrics;
+
+/** Renders a {@link RunMetricsSnapshot} to a string for a specific output format. */
+@FunctionalInterface
+public interface MetricsRenderer {
+ String render(RunMetricsSnapshot snapshot);
+}
diff --git a/src/main/java/org/beehive/gpullama3/auxiliary/metrics/RunMetricsSnapshot.java b/src/main/java/org/beehive/gpullama3/auxiliary/metrics/RunMetricsSnapshot.java
new file mode 100644
index 00000000..2051b88c
--- /dev/null
+++ b/src/main/java/org/beehive/gpullama3/auxiliary/metrics/RunMetricsSnapshot.java
@@ -0,0 +1,53 @@
+package org.beehive.gpullama3.auxiliary.metrics;
+
+/**
+ * Immutable snapshot of all performance metrics for a single inference run.
+ *
+ * Raw duration fields are in nanoseconds. Derived rate fields are in tokens per second.
+ * Construct via {@link #of} — it computes the derived values from the raw inputs.
+ */
+public record RunMetricsSnapshot(
+ long totalDuration,
+ long loadDuration,
+ int promptEvalCount,
+ long promptEvalDuration,
+ int evalCount,
+ long evalDuration,
+ boolean hasPrefillPhase,
+ long tornadoPlanCreationDuration,
+ long tornadoJitDuration,
+ long tornadoReadOnlyWeightsCopyInDuration,
+ // derived
+ int totalCount,
+ double promptEvalRate,
+ double evalRate,
+ double totalRate
+) {
+ public static RunMetricsSnapshot of(
+ long totalDuration, long loadDuration,
+ int promptEvalCount, long promptEvalDuration,
+ int evalCount, long evalDuration,
+ boolean hasPrefillPhase,
+ long tornadoPlanCreationDuration,
+ long tornadoJitDuration,
+ long tornadoReadOnlyWeightsCopyInDuration) {
+
+ int totalCount = promptEvalCount + evalCount;
+ double promptEvalRate = tokensPerSecond(promptEvalCount, promptEvalDuration);
+ double evalRate = tokensPerSecond(evalCount, evalDuration);
+ double totalRate = tokensPerSecond(totalCount, totalDuration);
+
+ return new RunMetricsSnapshot(
+ totalDuration, loadDuration,
+ promptEvalCount, promptEvalDuration,
+ evalCount, evalDuration,
+ hasPrefillPhase,
+ tornadoPlanCreationDuration, tornadoJitDuration,
+ tornadoReadOnlyWeightsCopyInDuration,
+ totalCount, promptEvalRate, evalRate, totalRate);
+ }
+
+ private static double tokensPerSecond(int tokens, long durationNs) {
+ return (durationNs > 0 && tokens > 0) ? tokens / (durationNs / 1e9) : 0.0;
+ }
+}
diff --git a/src/main/java/org/beehive/gpullama3/inference/InferenceEngine.java b/src/main/java/org/beehive/gpullama3/inference/InferenceEngine.java
index a9c65223..d653ba58 100644
--- a/src/main/java/org/beehive/gpullama3/inference/InferenceEngine.java
+++ b/src/main/java/org/beehive/gpullama3/inference/InferenceEngine.java
@@ -1,6 +1,6 @@
package org.beehive.gpullama3.inference;
-import org.beehive.gpullama3.auxiliary.LastRunMetrics;
+import org.beehive.gpullama3.auxiliary.RunMetrics;
import org.beehive.gpullama3.inference.sampler.Sampler;
import org.beehive.gpullama3.inference.state.State;
import org.beehive.gpullama3.model.Configuration;
@@ -132,10 +132,9 @@ public static List generateTokensLlama(Model model, State state, int st
// Calculate and print performance metrics
long endNanos = System.nanoTime();
- double totalTimeSeconds = (endNanos - startNanos) / 1_000_000_000.0;
- int totalTokens = promptIndex + generatedTokens.size();
-
- LastRunMetrics.setMetrics(totalTokens, totalTimeSeconds);
+ long decodeStart = inferenceStartNanos > 0 ? inferenceStartNanos : endNanos;
+ RunMetrics.setInferenceMetrics(promptIndex, decodeStart - startNanos,
+ generatedTokens.size(), endNanos - decodeStart, endNanos - startNanos);
return generatedTokens;
}
@@ -213,10 +212,9 @@ public static List generateTokensQwen3(Model model, State state, int st
// Calculate and print performance metrics
long endNanos = System.nanoTime();
- double totalTimeSeconds = (endNanos - startNanos) / 1_000_000_000.0;
- int totalTokens = promptIndex + generatedTokens.size();
-
- LastRunMetrics.setMetrics(totalTokens, totalTimeSeconds);
+ long decodeStart = inferenceStartNanos > 0 ? inferenceStartNanos : endNanos;
+ RunMetrics.setInferenceMetrics(promptIndex, decodeStart - startNanos,
+ generatedTokens.size(), endNanos - decodeStart, endNanos - startNanos);
return generatedTokens;
}
@@ -225,6 +223,7 @@ public static List generateTokensPhi3(Model model, State state, int sta
IntConsumer onTokenGenerated) {
long startNanos = System.nanoTime();
+ long inferenceStartNanos = 0;
if (maxTokens < 0 || model.configuration().contextLength() < maxTokens) {
maxTokens = model.configuration().contextLength();
}
@@ -245,6 +244,9 @@ public static List generateTokensPhi3(Model model, State state, int sta
System.err.print(Tokenizer.replaceControlCharacters(model.tokenizer().decode(List.of(nextToken))));
}
} else {
+ if (inferenceStartNanos == 0) {
+ inferenceStartNanos = System.nanoTime();
+ }
nextToken = sampler.sampleToken(state.logits);
if (echo) {
// log inferred token
@@ -266,10 +268,9 @@ public static List generateTokensPhi3(Model model, State state, int sta
// Calculate and print performance metrics
long endNanos = System.nanoTime();
- double totalTimeSeconds = (endNanos - startNanos) / 1_000_000_000.0;
- int totalTokens = promptIndex + generatedTokens.size();
-
- LastRunMetrics.setMetrics(totalTokens, totalTimeSeconds);
+ long decodeStart = inferenceStartNanos > 0 ? inferenceStartNanos : endNanos;
+ RunMetrics.setInferenceMetrics(promptIndex, decodeStart - startNanos,
+ generatedTokens.size(), endNanos - decodeStart, endNanos - startNanos);
return generatedTokens;
@@ -356,11 +357,9 @@ public static List generateTokensGPULlama(Model model, State state, int
// === Performance Metrics ===
long endNanos = System.nanoTime();
- double totalSeconds = (endNanos - startNanos) / 1_000_000_000.0;
- int totalTokens = promptIndex + generatedTokens.size();
-
- // Set metrics for tokens achieved
- LastRunMetrics.setMetrics(totalTokens, totalSeconds);
+ long decodeStart = inferenceStartNanos > 0 ? inferenceStartNanos : endNanos;
+ RunMetrics.setInferenceMetrics(promptIndex, decodeStart - startNanos,
+ generatedTokens.size(), endNanos - decodeStart, endNanos - startNanos);
return generatedTokens;
}
@@ -449,10 +448,9 @@ public static List generateTokensGPUQwen3(Model model, State state, int
// Calculate and print performance metrics
long endNanos = System.nanoTime();
- double totalTimeSeconds = (endNanos - startNanos) / 1_000_000_000.0;
- int totalTokens = promptIndex + generatedTokens.size();
-
- LastRunMetrics.setMetrics(totalTokens, totalTimeSeconds);
+ long decodeStart = inferenceStartNanos > 0 ? inferenceStartNanos : endNanos;
+ RunMetrics.setInferenceMetrics(promptIndex, decodeStart - startNanos,
+ generatedTokens.size(), endNanos - decodeStart, endNanos - startNanos);
return generatedTokens;
}
@@ -524,10 +522,9 @@ public static List generateTokensGPUPhi3(Model model, State state, int
// Calculate and print performance metrics
long endNanos = System.nanoTime();
- double totalTimeSeconds = (endNanos - startNanos) / 1_000_000_000.0;
- int totalTokens = promptIndex + generatedTokens.size();
-
- LastRunMetrics.setMetrics(totalTokens, totalTimeSeconds);
+ long decodeStart = inferenceStartNanos > 0 ? inferenceStartNanos : endNanos;
+ RunMetrics.setInferenceMetrics(promptIndex, decodeStart - startNanos,
+ generatedTokens.size(), endNanos - decodeStart, endNanos - startNanos);
return generatedTokens;
}
@@ -591,10 +588,9 @@ public static List generateTokensGranite(Model model, State state, int
}
long endNanos = System.nanoTime();
- double totalTimeSeconds = (endNanos - startNanos) / 1_000_000_000.0;
- int totalTokens = promptIndex + generatedTokens.size();
-
- LastRunMetrics.setMetrics(totalTokens, totalTimeSeconds);
+ long decodeStart = inferenceStartNanos > 0 ? inferenceStartNanos : endNanos;
+ RunMetrics.setInferenceMetrics(promptIndex, decodeStart - startNanos,
+ generatedTokens.size(), endNanos - decodeStart, endNanos - startNanos);
return generatedTokens;
}
@@ -658,10 +654,9 @@ public static List generateTokensGPUGranite(Model model, State state, i
}
long endNanos = System.nanoTime();
- double totalTimeSeconds = (endNanos - startNanos) / 1_000_000_000.0;
- int totalTokens = promptIndex + generatedTokens.size();
-
- LastRunMetrics.setMetrics(totalTokens, totalTimeSeconds);
+ long decodeStart = inferenceStartNanos > 0 ? inferenceStartNanos : endNanos;
+ RunMetrics.setInferenceMetrics(promptIndex, decodeStart - startNanos,
+ generatedTokens.size(), endNanos - decodeStart, endNanos - startNanos);
return generatedTokens;
}
diff --git a/src/main/java/org/beehive/gpullama3/inference/InferenceEngineWithBatchPrefillDecode.java b/src/main/java/org/beehive/gpullama3/inference/InferenceEngineWithBatchPrefillDecode.java
index 1440a984..06dbe256 100644
--- a/src/main/java/org/beehive/gpullama3/inference/InferenceEngineWithBatchPrefillDecode.java
+++ b/src/main/java/org/beehive/gpullama3/inference/InferenceEngineWithBatchPrefillDecode.java
@@ -1,6 +1,6 @@
package org.beehive.gpullama3.inference;
-import org.beehive.gpullama3.auxiliary.LastRunMetrics;
+import org.beehive.gpullama3.auxiliary.RunMetrics;
import org.beehive.gpullama3.inference.sampler.Sampler;
import org.beehive.gpullama3.inference.state.State;
import org.beehive.gpullama3.model.Configuration;
@@ -102,6 +102,7 @@ public static List generateTokensLlama(
}
state.latestToken = currentToken;
+ long decodeStartNanos = System.nanoTime();
// ── Decode ────────────────────────────────────────────────────────────
while (pos < actualMaxTokens) {
@@ -129,8 +130,9 @@ public static List generateTokensLlama(
}
long endNanos = System.nanoTime();
- int totalTokens = promptTokens.size() + generatedTokens.size();
- LastRunMetrics.setMetrics(totalTokens, (endNanos - startNanos) / 1_000_000_000.0);
+ RunMetrics.setInferenceMetrics(promptTokens.size(), decodeStartNanos - startNanos,
+ generatedTokens.size(), endNanos - decodeStartNanos, endNanos - startNanos);
+ RunMetrics.setHasPrefillPhase(true);
return generatedTokens;
}
@@ -197,6 +199,7 @@ public static List generateTokensGPULlama(
currentToken = promptTokens.get(N - 1);
pos = startPosition + N;
state.latestToken = currentToken;
+ long decodeStartNanos = System.nanoTime();
// ── Decode ────────────────────────────────────────────────────────────
while (pos < actualMaxTokens) {
@@ -224,8 +227,9 @@ public static List generateTokensGPULlama(
}
long endNanos = System.nanoTime();
- int totalTokens = promptTokens.size() + generatedTokens.size();
- LastRunMetrics.setMetrics(totalTokens, (endNanos - startNanos) / 1_000_000_000.0);
+ RunMetrics.setInferenceMetrics(promptTokens.size(), decodeStartNanos - startNanos,
+ generatedTokens.size(), endNanos - decodeStartNanos, endNanos - startNanos);
+ RunMetrics.setHasPrefillPhase(true);
return generatedTokens;
}
diff --git a/src/main/java/org/beehive/gpullama3/inference/InferenceEngineWithPrefillDecode.java b/src/main/java/org/beehive/gpullama3/inference/InferenceEngineWithPrefillDecode.java
index 64815d52..ae9f6e9c 100644
--- a/src/main/java/org/beehive/gpullama3/inference/InferenceEngineWithPrefillDecode.java
+++ b/src/main/java/org/beehive/gpullama3/inference/InferenceEngineWithPrefillDecode.java
@@ -1,6 +1,6 @@
package org.beehive.gpullama3.inference;
-import org.beehive.gpullama3.auxiliary.LastRunMetrics;
+import org.beehive.gpullama3.auxiliary.RunMetrics;
import org.beehive.gpullama3.inference.sampler.Sampler;
import org.beehive.gpullama3.inference.state.State;
import org.beehive.gpullama3.model.Configuration;
@@ -75,6 +75,7 @@ public static List generateTokensLlama(
}
state.latestToken = currentToken;
+ long decodeStartNanos = System.nanoTime();
// ── Decode ────────────────────────────────────────────────────────────
while (pos < actualMaxTokens) {
@@ -102,8 +103,9 @@ public static List generateTokensLlama(
}
long endNanos = System.nanoTime();
- int totalTokens = promptTokens.size() + generatedTokens.size();
- LastRunMetrics.setMetrics(totalTokens, (endNanos - startNanos) / 1_000_000_000.0);
+ RunMetrics.setInferenceMetrics(promptTokens.size(), decodeStartNanos - startNanos,
+ generatedTokens.size(), endNanos - decodeStartNanos, endNanos - startNanos);
+ RunMetrics.setHasPrefillPhase(true);
return generatedTokens;
}
@@ -153,6 +155,7 @@ public static List generateTokensGPULlama(
}
state.latestToken = currentToken;
+ long decodeStartNanos = System.nanoTime();
// ── Decode (GPU, with logits) ─────────────────────────────────────────
while (pos < actualMaxTokens) {
@@ -180,8 +183,9 @@ public static List generateTokensGPULlama(
}
long endNanos = System.nanoTime();
- int totalTokens = promptTokens.size() + generatedTokens.size();
- LastRunMetrics.setMetrics(totalTokens, (endNanos - startNanos) / 1_000_000_000.0);
+ RunMetrics.setInferenceMetrics(promptTokens.size(), decodeStartNanos - startNanos,
+ generatedTokens.size(), endNanos - decodeStartNanos, endNanos - startNanos);
+ RunMetrics.setHasPrefillPhase(true);
return generatedTokens;
}
diff --git a/src/main/java/org/beehive/gpullama3/model/Model.java b/src/main/java/org/beehive/gpullama3/model/Model.java
index 6defefd0..60e00cf0 100644
--- a/src/main/java/org/beehive/gpullama3/model/Model.java
+++ b/src/main/java/org/beehive/gpullama3/model/Model.java
@@ -1,7 +1,7 @@
package org.beehive.gpullama3.model;
import org.beehive.gpullama3.Options;
-import org.beehive.gpullama3.auxiliary.LastRunMetrics;
+import org.beehive.gpullama3.auxiliary.RunMetrics;
import org.beehive.gpullama3.inference.sampler.Sampler;
import org.beehive.gpullama3.inference.state.State;
import org.beehive.gpullama3.inference.weights.Weights;
@@ -165,7 +165,7 @@ default void runInteractive(Sampler sampler, Options options) {
// Optionally print performance metrics after each response
if (SHOW_PERF_INTERACTIVE) {
- LastRunMetrics.printMetrics();
+ RunMetrics.printMetrics();
}
}
} finally {
diff --git a/src/main/java/org/beehive/gpullama3/model/loader/ModelLoader.java b/src/main/java/org/beehive/gpullama3/model/loader/ModelLoader.java
index 83b25987..bac9aa38 100644
--- a/src/main/java/org/beehive/gpullama3/model/loader/ModelLoader.java
+++ b/src/main/java/org/beehive/gpullama3/model/loader/ModelLoader.java
@@ -1,6 +1,7 @@
package org.beehive.gpullama3.model.loader;
import org.beehive.gpullama3.Options;
+import org.beehive.gpullama3.auxiliary.RunMetrics;
import org.beehive.gpullama3.tensor.GGMLType;
import org.beehive.gpullama3.tensor.GGUF;
import org.beehive.gpullama3.tensor.*;
@@ -91,24 +92,24 @@ public static Model loadModel(Options options) throws IOException {
int contextLength = options.maxTokens();
boolean useTornadovm = options.useTornadovm();
- // initial load of metadata from gguf file
+ long start = System.nanoTime();
GGUF gguf = GGUF.loadGGUFMetadata(ggufPath);
- // detect model type
ModelType modelType = detectModelType(gguf.getMetadata());
- // model type-specific load
- return modelType.loadModel(gguf.getFileChannel(), gguf, contextLength, useTornadovm);
+ Model model = modelType.loadModel(gguf.getFileChannel(), gguf, contextLength, useTornadovm);
+ RunMetrics.setLoadDuration(System.nanoTime() - start);
+ return model;
}
/**
* For compatibility with langchain4j and quarkus.
*/
public static Model loadModel(Path ggufPath, int contextLength, boolean loadWeights, boolean useTornadovm) throws IOException {
- // initial load of metadata from gguf file
+ long start = System.nanoTime();
GGUF gguf = GGUF.loadGGUFMetadata(ggufPath);
- // detect model type
ModelType modelType = detectModelType(gguf.getMetadata());
- // model type-specific load
- return modelType.loadModel(gguf.getFileChannel(), gguf, contextLength, useTornadovm);
+ Model model = modelType.loadModel(gguf.getFileChannel(), gguf, contextLength, useTornadovm);
+ RunMetrics.setLoadDuration(System.nanoTime() - start);
+ return model;
}
/**
diff --git a/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanStandard.java b/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanStandard.java
index da9c34c6..14aaed10 100644
--- a/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanStandard.java
+++ b/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanStandard.java
@@ -1,5 +1,6 @@
package org.beehive.gpullama3.tornadovm;
+import org.beehive.gpullama3.auxiliary.RunMetrics;
import org.beehive.gpullama3.inference.state.State;
import org.beehive.gpullama3.model.Configuration;
import org.beehive.gpullama3.model.Model;
@@ -27,10 +28,6 @@ public class TornadoVMMasterPlanStandard implements TornadoVMMasterPlan {
public TornadoExecutionPlan executionPlan;
public TornadoVMMasterPlanStandard(State state, Model model) {
- long startTime = System.nanoTime();
- long planCreationTime = 0;
- long warmupTime = 0;
-
if (ENABLE_TORNADOVM_INIT_TIME) {
System.err.println("\nStarting TornadoVM initialization...");
}
@@ -38,28 +35,19 @@ public TornadoVMMasterPlanStandard(State state, Model model) {
this.state = state;
this.model = model;
this.config = model.configuration();
- this.executionPlan = createExecutionPlan();
- if (ENABLE_TORNADOVM_INIT_TIME) {
- planCreationTime = System.nanoTime();
- System.err.printf("TornadoVM GPU standard execution plan creation: %.2f ms\n", (planCreationTime - startTime) / 1_000_000.0);
- }
+ long startTime = System.nanoTime();
+ this.executionPlan = createExecutionPlan();
+ long planCreationTime = System.nanoTime();
if (CUDA_GRAPHS) executionPlan.withAllGraphs().withCUDAGraph();
executionPlan.withPreCompilation();
-
- if (ENABLE_TORNADOVM_INIT_TIME) {
- warmupTime = System.nanoTime();
- System.err.printf("Java to GPU JIT compiler warmup: %.2f ms\n", (warmupTime - planCreationTime) / 1_000_000.0);
- }
+ long warmupTime = System.nanoTime();
forceCopyInReadOnlyData();
+ long copyTime = System.nanoTime();
- if (ENABLE_TORNADOVM_INIT_TIME) {
- long copyTime = System.nanoTime();
- System.err.printf("Transfer read-only weights to GPU: %.2f ms\n", (copyTime - warmupTime) / 1_000_000.0);
- System.err.printf("Finished TornadoVM initialization...\n \n");
- }
+ RunMetrics.setTornadoMetrics(planCreationTime - startTime, warmupTime - planCreationTime, copyTime - warmupTime);
}
/**
diff --git a/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanWithBatchPrefillDecode.java b/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanWithBatchPrefillDecode.java
index aa69afad..2984508e 100644
--- a/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanWithBatchPrefillDecode.java
+++ b/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanWithBatchPrefillDecode.java
@@ -1,5 +1,6 @@
package org.beehive.gpullama3.tornadovm;
+import org.beehive.gpullama3.auxiliary.RunMetrics;
import org.beehive.gpullama3.inference.state.LlamaState;
import org.beehive.gpullama3.inference.state.State;
import org.beehive.gpullama3.tensor.GGMLType;
@@ -74,10 +75,6 @@ public class TornadoVMMasterPlanWithBatchPrefillDecode implements TornadoVMMaste
// ── Construction ─────────────────────────────────────────────────────────
TornadoVMMasterPlanWithBatchPrefillDecode(State initialState, Model model) {
- long startTime = System.nanoTime();
- long planCreationTime = 0;
- long warmupTime = 0;
-
if (ENABLE_TORNADOVM_INIT_TIME) {
System.err.println("\nStarting TornadoVM initialization...");
}
@@ -88,28 +85,19 @@ public class TornadoVMMasterPlanWithBatchPrefillDecode implements TornadoVMMaste
this.batchSize = PREFILL_BATCH_SIZE;
this.N = config.numberOfLayers();
this.gridScheduler = new GridScheduler();
- this.executionPlan = createExecutionPlan();
- if (ENABLE_TORNADOVM_INIT_TIME) {
- planCreationTime = System.nanoTime();
- System.err.printf("TornadoVM GPU batched prefill/decode execution plan creation: %.2f ms\n", (planCreationTime - startTime) / 1_000_000.0);
- }
+ long startTime = System.nanoTime();
+ this.executionPlan = createExecutionPlan();
+ long planCreationTime = System.nanoTime();
if (CUDA_GRAPHS) executionPlan.withAllGraphs().withCUDAGraph();
executionPlan.withPreCompilation();
-
- if (ENABLE_TORNADOVM_INIT_TIME) {
- warmupTime = System.nanoTime();
- System.err.printf("Java to GPU JIT compiler warmup: %.2f ms\n", (warmupTime - planCreationTime) / 1_000_000.0);
- }
+ long warmupTime = System.nanoTime();
forceCopyInReadOnlyData();
+ long copyTime = System.nanoTime();
- if (ENABLE_TORNADOVM_INIT_TIME) {
- long copyTime = System.nanoTime();
- System.err.printf("Transfer read-only weights to GPU: %.2f ms\n", (copyTime - warmupTime) / 1_000_000.0);
- System.err.printf("Finished TornadoVM initialization...\n \n");
- }
+ RunMetrics.setTornadoMetrics(planCreationTime - startTime, warmupTime - planCreationTime, copyTime - warmupTime);
}
// ── Batch Prefill Activation graphs ─────────────────────────────────────────────────────
diff --git a/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanWithPrefillDecode.java b/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanWithPrefillDecode.java
index 7a668546..517024f9 100644
--- a/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanWithPrefillDecode.java
+++ b/src/main/java/org/beehive/gpullama3/tornadovm/TornadoVMMasterPlanWithPrefillDecode.java
@@ -1,5 +1,6 @@
package org.beehive.gpullama3.tornadovm;
+import org.beehive.gpullama3.auxiliary.RunMetrics;
import org.beehive.gpullama3.inference.state.LlamaState;
import org.beehive.gpullama3.inference.state.State;
import org.beehive.gpullama3.tensor.GGMLType;
@@ -63,10 +64,6 @@ public class TornadoVMMasterPlanWithPrefillDecode implements TornadoVMMasterPlan
// ── Construction ─────────────────────────────────────────────────────────
TornadoVMMasterPlanWithPrefillDecode(State initialState, Model model) {
- long startTime = System.nanoTime();
- long planCreationTime = 0;
- long warmupTime = 0;
-
if (ENABLE_TORNADOVM_INIT_TIME) {
System.err.println("\nStarting TornadoVM initialization...");
}
@@ -76,31 +73,19 @@ public class TornadoVMMasterPlanWithPrefillDecode implements TornadoVMMasterPlan
this.config = (LlamaConfiguration) model.configuration();
this.N = config.numberOfLayers();
this.gridScheduler = new GridScheduler();
- this.executionPlan = createExecutionPlan();
- if (ENABLE_TORNADOVM_INIT_TIME) {
- planCreationTime = System.nanoTime();
- System.err.printf("TornadoVM GPU single-token prefill/decode execution plan creation: %.2f ms\n",
- (planCreationTime - startTime) / 1_000_000.0);
- }
+ long startTime = System.nanoTime();
+ this.executionPlan = createExecutionPlan();
+ long planCreationTime = System.nanoTime();
if (CUDA_GRAPHS) executionPlan.withAllGraphs().withCUDAGraph();
executionPlan.withPreCompilation();
-
- if (ENABLE_TORNADOVM_INIT_TIME) {
- warmupTime = System.nanoTime();
- System.err.printf("Java to GPU JIT compiler warmup: %.2f ms\n",
- (warmupTime - planCreationTime) / 1_000_000.0);
- }
+ long warmupTime = System.nanoTime();
forceCopyInReadOnlyData();
+ long copyTime = System.nanoTime();
- if (ENABLE_TORNADOVM_INIT_TIME) {
- long copyTime = System.nanoTime();
- System.err.printf("Transfer read-only weights to GPU: %.2f ms\n",
- (copyTime - warmupTime) / 1_000_000.0);
- System.err.printf("Finished TornadoVM initialization...\n \n");
- }
+ RunMetrics.setTornadoMetrics(planCreationTime - startTime, warmupTime - planCreationTime, copyTime - warmupTime);
}
// ── Activation graph ─────────────────────────────────────────────────────