PaddlePaddle · yuanlehome · Mar 23, 2026 · Mar 23, 2026 · Copilot · Mar 23, 2026
diff --git a/fastdeploy/model_executor/layers/sample/sampler.py b/fastdeploy/model_executor/layers/sample/sampler.py
@@ -99,7 +99,7 @@ def _compute_sampling_mask(
     top_p: paddle.Tensor,
     top_k: Optional[paddle.Tensor] = None,
     top_k_list: Optional[list] = None,
-) -> List[np.ndarray]:
+) -> tuple[List[np.ndarray], np.ndarray]:
     """
     Compute a combined top-k + top-p (nucleus) sampling mask as sparse
     retained-token indices.
@@ -124,8 +124,11 @@ def _compute_sampling_mask(
                     top-k filtering is needed at all.
 
     Returns:
-        List of length num_reqs; element i is a 1-D int64 numpy array of the
-        retained vocab indices for request i.
+        Tuple of (sparse_indices, logz_per_batch):
+        - sparse_indices: List of length num_reqs; element i is a 1-D int64
+          numpy array of the retained vocab indices for request i.
+        - logz_per_batch: 1-D numpy array of shape [num_reqs] containing
+          log(Z_K) where Z_K is the sum of probabilities in the candidate set.
     """
     real_bsz = probs.shape[0]
     vocab_size = probs.shape[1]
@@ -179,11 +182,21 @@ def _compute_sampling_mask(
     k_per_row = final_mask.astype("int32").sum(axis=-1)  # [B]
     max_k = int(k_per_row.max().item())
 
+    # ------------------------------------------------------------------
+    # Stage 5: compute logZ_K for renormalization
+    # Z_K = sum of sorted_probs[i] over the positions kept by final_mask[i]
+    # logZ_K = log(Z_K + 1e-10); the small constant avoids log(0)
+    # ------------------------------------------------------------------
+    candidate_probs = paddle.where(final_mask, sorted_probs, paddle.zeros_like(sorted_probs))
+    z_k = candidate_probs.sum(axis=-1)  # [B]
+    logz_per_batch = paddle.log(z_k + 1e-10).cpu().numpy()  # [B]
+
     # Transfer only the leading max_k columns — typically max_k << vocab_size.
     indices_window_cpu = sorted_indices[:, :max_k].cpu().numpy()  # [B, max_k]
     mask_window_cpu = final_mask[:, :max_k].cpu().numpy()  # [B, max_k]
 
-    return [indices_window_cpu[i, mask_window_cpu[i]] for i in range(real_bsz)]
+    sparse_indices = [indices_window_cpu[i, mask_window_cpu[i]] for i in range(real_bsz)]
+    return sparse_indices, logz_per_batch
 
 
 class GuidedDecoding:
@@ -622,8 +635,9 @@ def forward_cuda(
         # Compute sampling mask BEFORE top_k_top_p_sampling modifies probs.
         # Binary mask [num_reqs, vocab_size]: 1 = retained by top_k/top_p, 0 = truncated.
         sampling_mask = None
+        logz_per_batch = None
         if sampling_metadata.keep_sampling_mask:
-            sampling_mask = _compute_sampling_mask(
+            sampling_mask, logz_per_batch = _compute_sampling_mask(
                 probs,
                 sampling_metadata.top_p,
                 top_k=sampling_metadata.top_k,
@@ -654,6 +668,7 @@ def forward_cuda(
             logprobs_tensors=logprobs_tensors,
             logits=logits,
             sampling_mask=sampling_mask,
+            logz_per_batch=logz_per_batch,
         )
 
         return sampler_output
@@ -970,6 +985,7 @@ def forward_cuda(
         # Compute sampling mask at accepted token positions.
         # Shape: [total_accepted_tokens, vocab_size], bool (CPU).
         sampling_mask = None
+        logz_per_batch = None
         if keep_sampling_mask:
             # Expand top_p from [batch, 1] to [total_accepted, 1].
             accept_top_p = sampling_metadata.top_p[:real_bsz].squeeze(1).repeat_interleave(accept_nums).unsqueeze(1)
@@ -982,7 +998,7 @@ def forward_cuda(
                 accept_top_k = (
                     sampling_metadata.top_k[:real_bsz].squeeze(1).repeat_interleave(accept_nums).unsqueeze(1)
                 )
-            sampling_mask = _compute_sampling_mask(
+            sampling_mask, logz_per_batch = _compute_sampling_mask(
                 target_probs,
                 accept_top_p,
                 top_k=accept_top_k,
@@ -996,6 +1012,7 @@ def forward_cuda(
             cu_batch_token_offset=share_inputs["cu_batch_token_offset"],
             logits=logits,
             sampling_mask=sampling_mask,
+            logz_per_batch=logz_per_batch,
         )
 
         return sampler_output

diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py
@@ -370,6 +370,27 @@ def post_process_normal(
             )
     # 3. Transmit the model's output and stop generation signal via message queue.
     #    In the future, we will abandon this approach.
+    # Renormalize logprobs to match truncated sampling distribution (when enabled).
+    if sampler_output.logprobs_tensors is not None and sampler_output.logz_per_batch is not None:
+        # logprobs_tensors.logprobs: [B, max_num_logprobs + 1]
+        logprobs = sampler_output.logprobs_tensors.logprobs
+        # logz_per_batch: [B], log(sum(probs in candidate set K)) for each request
+        logz = paddle.to_tensor(sampler_output.logz_per_batch, dtype=logprobs.dtype)
+        # Renormalize: log π_masked = log π_full - log Z_K
+        # Only finite entries are shifted; non-finite entries (e.g. -inf padding) are set to -inf
+        valid_mask = paddle.isfinite(logprobs)
+        normalized_logprobs = paddle.where(
+            valid_mask,
+            logprobs - logz.unsqueeze(1),  # broadcast subtraction
+            paddle.full_like(logprobs, float("-inf")),
+        )
+        # Update logprobs_tensors with normalized values
+        sampler_output.logprobs_tensors = LogprobsTensors(
+            logprob_token_ids=sampler_output.logprobs_tensors.logprob_token_ids,
+            logprobs=normalized_logprobs,
+            selected_token_ranks=sampler_output.logprobs_tensors.selected_token_ranks,
+        )
-        # Update logprobs_tensors with normalized values
-        sampler_output.logprobs_tensors = LogprobsTensors(
-            logprob_token_ids=sampler_output.logprobs_tensors.logprob_token_ids,
-            logprobs=normalized_logprobs,
-            selected_token_ranks=sampler_output.logprobs_tensors.selected_token_ranks,
-        )
+        # Update existing logprobs_tensors with normalized values in place
+        sampler_output.logprobs_tensors.logprobs = normalized_logprobs
-        # Update logprobs_tensors with normalized values
-        sampler_output.logprobs_tensors = LogprobsTensors(
-            logprob_token_ids=sampler_output.logprobs_tensors.logprob_token_ids,
-            logprobs=normalized_logprobs,
-            selected_token_ranks=sampler_output.logprobs_tensors.selected_token_ranks,
-        )
+        # Update existing logprobs_tensors with normalized values in place
+        sampler_output.logprobs_tensors.logprobs = normalized_logprobs
+
     if not skip_save_output:
         if envs.FD_USE_GET_SAVE_OUTPUT_V1:
             if save_each_rank or model_output.mp_rank == 0:
@@ -483,6 +504,19 @@ def post_process_specualate(
         model_output.stop_nums,
         model_output.mask_rollback,
     )
+    # Renormalize logprobs to match truncated sampling distribution (when enabled).
+    if sampler_output.logprobs_tensors is not None and sampler_output.logz_per_batch is not None:
+        logprobs = sampler_output.logprobs_tensors.logprobs
+        logz = paddle.to_tensor(sampler_output.logz_per_batch, dtype=logprobs.dtype)
+        valid_mask = paddle.isfinite(logprobs)
+        normalized_logprobs = paddle.where(
+            valid_mask, logprobs - logz.unsqueeze(1), paddle.full_like(logprobs, float("-inf"))
+        )
+        sampler_output.logprobs_tensors = LogprobsTensors(
+            logprob_token_ids=sampler_output.logprobs_tensors.logprob_token_ids,
+            logprobs=normalized_logprobs,
+            selected_token_ranks=sampler_output.logprobs_tensors.selected_token_ranks,
+        )
 
     if not skip_save_output:
         if sampler_output.logprobs_tensors is None:

diff --git a/fastdeploy/worker/output.py b/fastdeploy/worker/output.py
@@ -193,6 +193,10 @@ class SamplerOutput:
     # check whether the current path is speculative or non-speculative when
     # interpreting the dimension.
     sampling_mask: Optional[List[np.ndarray]] = None
+    # logZ_K for each request: log(sum(probs in candidate set K))
+    # Used for renormalizing logprobs to match the truncated sampling distribution.
+    # Shape: [num_reqs] (non-speculative) or [total_accepted_tokens] (speculative)
-    # Shape: [num_reqs]
+    # Shape:
+    #   - Non-speculative decoding: [num_reqs]
+    #   - Speculative decoding: [total_accepted_tokens], aligned with the first
+    #     dimension of logprobs / sampling_mask and regrouped by request in
+    #     post-processing.
-    # Shape: [num_reqs]
+    # Shape:
+    #   - Non-speculative decoding: [num_reqs]
+    #   - Speculative decoding: [total_accepted_tokens], aligned with the first
+    #     dimension of logprobs / sampling_mask and regrouped by request in
+    #     post-processing.
+    logz_per_batch: Optional[np.ndarray] = None
 
 
 @dataclass