beehive-lab
diff --git a/.github/workflows/build-and-run.yml
Lines changed: 101 additions & 0 deletions b/.github/workflows/build-and-run.yml
Lines changed: 101 additions & 0 deletions
diff --git a/src/main/java/org/beehive/gpullama3/inference/InferenceCore.java
Lines changed: 12 additions & 21 deletions b/src/main/java/org/beehive/gpullama3/inference/InferenceCore.java
Lines changed: 12 additions & 21 deletions
@@ -247,6 +247,107 @@ jobs:
           configuration: standard
           metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-q8-standard.json
 
+      - name: Q8 - Run Llama-3.2-1B-Instruct-Q8_0.gguf - Prefill-Decode
+        env:
+          JAVA_TOOL_OPTIONS: >-
+            -Dllama.metrics.format=json
+            -Dllama.metrics.output=file
+            -Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-q8-prefill-decode.json
+        run: |
+          cd ${{ github.workspace }}
+          export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
+          ./llama-tornado --gpu --${{ matrix.backend.name }} \
+            --model $MODELS_DIR/Llama-3.2-1B-Instruct-Q8_0.gguf \
+            --prompt "Say hello" \
+            --with-prefill-decode
+          python3 scripts/write_metrics_sidecar.py \
+            --out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-q8-prefill-decode.meta.json" \
+            backend="${{ matrix.backend.name }}" \
+            task=llama-inference \
+            model_file=Llama-3.2-1B-Instruct-Q8_0.gguf \
+            model=Llama-3.2-1B-Instruct \
+            quantization=Q8_0 \
+            configuration=prefill-decode \
+            "flags=--with-prefill-decode" \
+            prompt="Say hello"
+
+      - name: Q8 - Run Llama-3.2-1B-Instruct-Q8_0.gguf - Batch-Prefill-Decode
+        env:
+          JAVA_TOOL_OPTIONS: >-
+            -Dllama.metrics.format=json
+            -Dllama.metrics.output=file
+            -Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-q8-batch-prefill-decode.json
+        run: |
+          cd ${{ github.workspace }}
+          export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
+          ./llama-tornado --gpu --${{ matrix.backend.name }} \
+            --model $MODELS_DIR/Llama-3.2-1B-Instruct-Q8_0.gguf \
+            --prompt "Say hello" \
+            --with-prefill-decode --batch-prefill-size 32
+          python3 scripts/write_metrics_sidecar.py \
+            --out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-q8-batch-prefill-decode.meta.json" \
+            backend="${{ matrix.backend.name }}" \
+            task=llama-inference \
+            model_file=Llama-3.2-1B-Instruct-Q8_0.gguf \
+            model=Llama-3.2-1B-Instruct \
+            quantization=Q8_0 \
+            configuration=batch-prefill-decode \
+            "flags=--with-prefill-decode --batch-prefill-size 32" \
+            prompt="Say hello"
+
+      # ── PTX-only: CUDA-graph variants ────────────────────────────────────────
+      - name: PTX - Q8 - Run Llama-3.2-1B-Instruct-Q8_0.gguf - Prefill-Decode-CUDA-Graphs
+        if: matrix.backend.name == 'ptx'
+        env:
+          JAVA_TOOL_OPTIONS: >-
+            -Dllama.metrics.format=json
+            -Dllama.metrics.output=file
+            -Dllama.metrics.file=${{ runner.temp }}/metrics-ptx-llama-1b-q8-prefill-decode-cuda-graphs.json
+        run: |
+          cd ${{ github.workspace }}
+          export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
+          ./llama-tornado --gpu --ptx \
+            --model $MODELS_DIR/Llama-3.2-1B-Instruct-Q8_0.gguf \
+            --prompt "Say hello" \
+            --with-prefill-decode \
+            --cuda-graphs
+          python3 scripts/write_metrics_sidecar.py \
+            --out "${{ runner.temp }}/metrics-ptx-llama-1b-q8-prefill-decode-cuda-graphs.meta.json" \
+            backend=ptx \
+            task=llama-inference \
+            model_file=Llama-3.2-1B-Instruct-Q8_0.gguf \
+            model=Llama-3.2-1B-Instruct \
+            quantization=Q8_0 \
+            configuration=prefill-decode-cuda-graphs \
+            "flags=--with-prefill-decode --cuda-graphs" \
+            prompt="Say hello"
+
+      - name: PTX - Q8 - Run Llama-3.2-1B-Instruct-Q8_0.gguf - Batch-Prefill-Decode-CUDA-Graphs
+        if: matrix.backend.name == 'ptx'
+        env:
+          JAVA_TOOL_OPTIONS: >-
+            -Dllama.metrics.format=json
+            -Dllama.metrics.output=file
+            -Dllama.metrics.file=${{ runner.temp }}/metrics-ptx-llama-1b-q8-batch-prefill-decode-cuda-graphs.json
+        run: |
+          cd ${{ github.workspace }}
+          export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
+          ./llama-tornado --gpu --ptx \
+            --model $MODELS_DIR/Llama-3.2-1B-Instruct-Q8_0.gguf \
+            --prompt "Say hello" \
+            --with-prefill-decode --batch-prefill-size 32 \
+            --cuda-graphs
+          python3 scripts/write_metrics_sidecar.py \
+            --out "${{ runner.temp }}/metrics-ptx-llama-1b-q8-batch-prefill-decode-cuda-graphs.meta.json" \
+            backend=ptx \
+            task=llama-inference \
+            model_file=Llama-3.2-1B-Instruct-Q8_0.gguf \
+            model=Llama-3.2-1B-Instruct \
+            quantization=Q8_0 \
+            configuration=batch-prefill-decode-cuda-graphs \
+            "flags=--with-prefill-decode --batch-prefill-size 32 --cuda-graphs" \
+            prompt="Say hello"
+
       - name: Q8 - Run Qwen3-0.6B-Q8_0.gguf
         uses: ./.github/actions/run-inference
         with:
 
@@ -28,15 +28,6 @@
  * This class provides core computational operations such as RMS normalization and forward passes through model layers. It supports both CPU and GPU implementations.
  * </p>
  *
- * <p>
- * Specifically, it implements:
- * <ul>
- *   <li>{@code rmsnorm} – applies Root Mean Square Layer Normalization to input vectors</li>
- *   <li>{@code forwardJava} – executes a Forward pass for LLaMA and Mistral models on CPU</li>
- *   <li>{@code forwardJavaQwen3} – executes a Forward pass for Qwen3 models on CPU</li>
- *   <li>{@code forwardTornadoVM} – executes a Forward pass using TornadoVM for GPU acceleration</li>
- * </ul>
- * </p>
  */
 
 public final class InferenceCore {
@@ -643,10 +634,10 @@ public static FloatTensor forwardJavaPhi3(Model model, Phi3State state, int toke
      * Granite uses the same transformer architecture as Llama but with maximal update parameterization (µP)
      * scaling factors applied at specific points:
      * <ul>
-     *   <li>Embedding scaling: multiply embeddings after lookup</li>
-     *   <li>Attention scaling: use custom multiplier instead of 1/sqrt(headDim)</li>
-     *   <li>Residual scaling: multiply residual connections</li>
-     *   <li>Logit scaling: divide logits by the scaling factor</li>
+     * <li>Embedding scaling: multiply embeddings after lookup</li>
+     * <li>Attention scaling: use custom multiplier instead of 1/sqrt(headDim)</li>
+     * <li>Residual scaling: multiply residual connections</li>
+     * <li>Logit scaling: divide logits by the scaling factor</li>
      * </ul>
      */
     public static FloatTensor forwardGranite(Model model, State state, int token, int position) {
@@ -771,24 +762,24 @@ static void copyChunk(FloatTensor in, FloatTensor out, int dim1In, int dim1Out,
      *
      * <p>This method handles the first phase of processing a token through the transformer model:
      * <ol>
-     *   <li>Copies the token embedding from the model's embedding table to the state's buffer</li>
-     *   <li>Delegates the transformer layer processing to TornadoVM through the master plan</li>
+     * <li>Copies the token embedding from the model's embedding table to the state's buffer</li>
+     * <li>Delegates the transformer layer processing to TornadoVM through the master plan</li>
      * </ol>
      *
      * <p>The token embedding lookup happens on the CPU using {@link MemorySegment} operations,
      * while the subsequent transformer layers processing is offloaded to the accelerator through
      * TornadoVM for improved performance.
      *
      * @param model
-     *         The Llama model containing weights and configuration parameters
+     *     The Llama model containing weights and configuration parameters
      * @param state
-     *         The current execution state holding input/output tensors and temporary buffers
+     *     The current execution state holding input/output tensors and temporary buffers
      * @param token
-     *         The input token ID to process
+     *     The input token ID to process
      * @param position
-     *         The position of this token in the sequence context window
+     *     The position of this token in the sequence context window
      * @param tornadoVMMasterPlan
-     *         The execution plan for TornadoVM acceleration
+     *     The execution plan for TornadoVM acceleration
      * @return FloatArray containing the output logits for token prediction
      */
     public static FloatArray forwardTornadoVM(Model model, State state, int token, int position, TornadoVMMasterPlan tornadoVMMasterPlan) {
@@ -814,7 +805,7 @@ public static FloatArray forwardTornadoVM(Model model, State state, int token, i
             default -> throw new IllegalArgumentException("Unsupported weight type: " + weights.getWeightType());
         }
 
-        return tornadoVMMasterPlan.tornadoVMForwardExecuteLayered(position);
+        return tornadoVMMasterPlan.tornadoVMForwardDecode(position);
     }
 
 }