diff --git a/custom_ops/gpu_ops/append_attention.cu b/custom_ops/gpu_ops/append_attention.cu
index c1586945cc5..7603c255021 100644
--- a/custom_ops/gpu_ops/append_attention.cu
+++ b/custom_ops/gpu_ops/append_attention.cu
@@ -72,6 +72,7 @@ void AppendAttentionKernel(
     const paddle::optional<paddle::Tensor>& out_linear_shifts,
     const paddle::optional<paddle::Tensor>& out_linear_smooths,
     const paddle::optional<paddle::Tensor>& kv_signal_data,
+    const paddle::optional<paddle::Tensor>& rope_3d_delta,
     const paddle::optional<paddle::Tensor>& q_norm_weight,
     const paddle::optional<paddle::Tensor>& k_norm_weight,
     const paddle::optional<paddle::Tensor>& sinks,
@@ -213,6 +214,7 @@ void AppendAttentionKernel(
             max_input_length,
             use_neox_rotary_style,
             rope_3d,
+            rope_3d_delta,
             main_stream,
             &qkv_out,
             const_cast<paddle::Tensor*>(&key_cache),
@@ -310,6 +312,7 @@ void AppendAttentionKernel(
               cache_quant_type_str,
               use_neox_rotary_style,
               rope_3d,
+              rope_3d_delta,
               max_input_length,
               exec_stream,
               &qkv_out,
@@ -337,6 +340,7 @@ void AppendAttentionKernel(
               cache_quant_type_str,
               use_neox_rotary_style,
               rope_3d,
+              rope_3d_delta,
               max_input_length,
               exec_stream,
               &qkv_out,
@@ -365,6 +369,7 @@ void AppendAttentionKernel(
               cache_quant_type_str,
               use_neox_rotary_style,
               rope_3d,
+              rope_3d_delta,
               max_input_length,
               exec_stream,
               &qkv_out,
@@ -391,6 +396,7 @@ void AppendAttentionKernel(
               cache_quant_type_str,
               use_neox_rotary_style,
               rope_3d,
+              rope_3d_delta,
               max_input_length,
               exec_stream,
               &qkv_out,
@@ -485,6 +491,7 @@ std::vector<paddle::Tensor> AppendAttention(
     const paddle::optional<paddle::Tensor>& out_linear_smooths,
     const paddle::optional<paddle::Tensor>& mask_offset,
     const paddle::optional<paddle::Tensor>& kv_signal_data,
+    const paddle::optional<paddle::Tensor>& rope_3d_delta,
     const paddle::optional<paddle::Tensor>& q_norm_weight,
     const paddle::optional<paddle::Tensor>& k_norm_weight,
     const paddle::optional<paddle::Tensor>& sinks,
@@ -619,6 +626,7 @@ std::vector<paddle::Tensor> AppendAttention(
         out_linear_shifts,
         out_linear_smooths,
         kv_signal_data,
+        rope_3d_delta,
         q_norm_weight,
         k_norm_weight,
         sinks,
@@ -697,6 +705,7 @@ std::vector<paddle::Tensor> AppendAttentionWithOutput(
     const paddle::optional<paddle::Tensor>& out_linear_smooths,
     const paddle::optional<paddle::Tensor>& mask_offset,
     const paddle::optional<paddle::Tensor>& kv_signal_data,
+    const paddle::optional<paddle::Tensor>& rope_3d_delta,
     const paddle::optional<paddle::Tensor>& q_norm_weight,
     const paddle::optional<paddle::Tensor>& k_norm_weight,
     const paddle::optional<paddle::Tensor>& sinks,
@@ -777,6 +786,7 @@ std::vector<paddle::Tensor> AppendAttentionWithOutput(
         out_linear_shifts,
         out_linear_smooths,
         kv_signal_data,
+        rope_3d_delta,
         q_norm_weight,
         k_norm_weight,
         sinks,
@@ -868,6 +878,7 @@ std::vector<std::vector<int64_t>> AppendAttentionInferShape(
     const paddle::optional<std::vector<int64_t>>& out_linear_smooths_shape,
     const paddle::optional<std::vector<int64_t>>& mask_offset_shape,
     const paddle::optional<std::vector<int64_t>>& kv_signal_data_shape,
+    const paddle::optional<std::vector<int64_t>>& rope_3d_delta_shape,
     const paddle::optional<std::vector<int64_t>>& q_norm_weight_shape,
     const paddle::optional<std::vector<int64_t>>& k_norm_weight_shape,
     const paddle::optional<std::vector<int64_t>>& sinks_shape,
@@ -934,6 +945,7 @@ std::vector<paddle::DataType> AppendAttentionInferDtype(
     const paddle::optional<paddle::DataType>& out_linear_smooths_dtype,
     const paddle::optional<paddle::DataType>& mask_offset_dtype,
     const paddle::optional<paddle::DataType>& kv_signal_data_dtype,
+    const paddle::optional<paddle::DataType>& rope_3d_delta_dtype,
     const paddle::optional<paddle::DataType>& q_norm_weight_dtype,
     const paddle::optional<paddle::DataType>& k_norm_weight_dtype,
     const paddle::optional<paddle::DataType>& sinks_dtype,
@@ -1021,6 +1033,7 @@ std::vector<std::vector<int64_t>> AppendAttentionWithOutputInferShape(
     const paddle::optional<std::vector<int64_t>>& out_linear_smooths_shape,
     const paddle::optional<std::vector<int64_t>>& mask_offset_shape,
     const paddle::optional<std::vector<int64_t>>& kv_signal_data_shape,
+    const paddle::optional<std::vector<int64_t>>& rope_3d_delta_shape,
     const paddle::optional<std::vector<int64_t>>& q_norm_weight_shape,
     const paddle::optional<std::vector<int64_t>>& k_norm_weight_shape,
     const paddle::optional<std::vector<int64_t>>& sinks_shape,
@@ -1080,6 +1093,7 @@ std::vector<paddle::DataType> AppendAttentionWithOutputInferDtype(
     const paddle::optional<paddle::DataType>& out_linear_smooths_dtype,
     const paddle::optional<paddle::DataType>& mask_offset_dtype,
     const paddle::optional<paddle::DataType>& kv_signal_data_dtype,
+    const paddle::optional<paddle::DataType>& rope_3d_delta_dtype,
     const paddle::optional<paddle::DataType>& q_norm_weight_dtype,
     const paddle::optional<paddle::DataType>& k_norm_weight_dtype,
     const paddle::optional<paddle::DataType>& sinks_dtype,
@@ -1138,6 +1152,7 @@ PD_BUILD_STATIC_OP(append_attention)
              paddle::Optional("out_linear_smooths"),
              paddle::Optional("mask_offset"),
              paddle::Optional("kv_signal_data"),
+             paddle::Optional("rope_3d_delta"),
              paddle::Optional("q_norm_weight"),
              paddle::Optional("k_norm_weight"),
              paddle::Optional("sinks")})
@@ -1201,6 +1216,7 @@ PD_BUILD_STATIC_OP(append_attention_with_output)
              paddle::Optional("out_linear_smooths"),
              paddle::Optional("mask_offset"),
              paddle::Optional("kv_signal_data"),
+             paddle::Optional("rope_3d_delta"),
              paddle::Optional("q_norm_weight"),
              paddle::Optional("k_norm_weight"),
              paddle::Optional("sinks")})
diff --git a/custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_impl.cuh b/custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_impl.cuh
index 7dd4612c529..85fe0b7be4b 100644
--- a/custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_impl.cuh
+++ b/custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_impl.cuh
@@ -92,6 +92,7 @@ __global__ void append_decode_cache_T_rope_qk_norm_kernel(
     const uint32_t elem_cnt,
     const int kv_num_heads,
     const bool rope_3d,
+    const int* rope_3d_delta,
     const float* q_norm_weight,
     const float* k_norm_weight,
     const float rms_norm_eps) {
@@ -143,8 +144,15 @@ __global__ void append_decode_cache_T_rope_qk_norm_kernel(
     if (hi < num_heads + kv_num_heads) {
       // q k rope
       const uint32_t emb_idx = write_seq_id * half_head_size + h_bias / 2;
-      uint32_t new_emb_idx =
-          rope_3d ? emb_idx + ori_bi * max_seq_len * head_size : emb_idx;
+      uint32_t new_emb_idx = emb_idx;
+      if (rope_3d) {
+        if (rope_3d_delta) {
+          const int rope_pos = write_seq_id + rope_3d_delta[ori_bi];
+          new_emb_idx = rope_pos * half_head_size + h_bias / 2;
+        } else {
+          new_emb_idx = emb_idx + ori_bi * max_seq_len * head_size;
+        }
+      }
       Load<float, HalfVecSize>(&cos_emb[new_emb_idx], &cos_emb_vec);
       Load<float, HalfVecSize>(&sin_emb[new_emb_idx], &sin_emb_vec);
     }
@@ -237,7 +245,8 @@ __global__ void append_decode_cache_T_rope_kernel(
     const int block_size,
     const uint32_t elem_cnt,
     const int kv_num_heads,
-    const bool rope_3d) {
+    const bool rope_3d,
+    const int* rope_3d_delta) {
   using LoadT = AlignedVector<T, VecSize>;
   using LoadBiasT = AlignedVector<T, VecSize>;
   using LoadKVT = AlignedVector<T, VecSize>;
@@ -282,8 +291,15 @@ __global__ void append_decode_cache_T_rope_kernel(
     if (hi < num_heads + kv_num_heads) {
       // q k rope
       const uint32_t emb_idx = write_seq_id * half_head_size + h_bias / 2;
-      uint32_t new_emb_idx =
-          rope_3d ? emb_idx + ori_bi * max_seq_len * head_size : emb_idx;
+      uint32_t new_emb_idx = emb_idx;
+      if (rope_3d) {
+        if (rope_3d_delta) {
+          const int rope_pos = write_seq_id + rope_3d_delta[ori_bi];
+          new_emb_idx = rope_pos * half_head_size + h_bias / 2;
+        } else {
+          new_emb_idx = emb_idx + ori_bi * max_seq_len * head_size;
+        }
+      }
       Load<float, HalfVecSize>(&cos_emb[new_emb_idx], &cos_emb_vec);
       Load<float, HalfVecSize>(&sin_emb[new_emb_idx], &sin_emb_vec);
     }
@@ -1221,6 +1237,7 @@ __global__ void append_decode_cache_int8_rope_qk_norm_kernel(
     const float min_bound,
     const int kv_num_heads,
     const bool rope_3d,
+    const int* rope_3d_delta,
     const float rms_norm_eps) {
   static_assert(HeadDim == 128, "just support HeadDim be 128 now!");
   static_assert(VecSize == 4, "just support VecSize be 4 now, 32 * 4!");
@@ -1268,8 +1285,15 @@ __global__ void append_decode_cache_int8_rope_qk_norm_kernel(
       Load<T, VecSize>(&qkv_now[bias_idx], &src_vec);
       // q rope
       const uint32_t emb_idx = write_seq_id * half_head_size + head_bias / 2;
-      const uint32_t new_emb_idx =
-          rope_3d ? emb_idx + bid * max_seq_len * HeadDim : emb_idx;
+      uint32_t new_emb_idx = emb_idx;
+      if (rope_3d) {
+        if (rope_3d_delta) {
+          const int rope_pos = write_seq_id + rope_3d_delta[bid];
+          new_emb_idx = rope_pos * half_head_size + head_bias / 2;
+        } else {
+          new_emb_idx = emb_idx + bid * max_seq_len * HeadDim;
+        }
+      }
       Load<float, HalfVecSize>(&cos_emb[new_emb_idx], &cos_emb_vec);
       Load<float, HalfVecSize>(&sin_emb[new_emb_idx], &sin_emb_vec);
 #pragma unroll
@@ -1363,8 +1387,15 @@ __global__ void append_decode_cache_int8_rope_qk_norm_kernel(
     const int v_head_idx = head_idx - num_heads - kv_num_heads;
     if (head_idx < num_heads + kv_num_heads) {
       const uint32_t emb_idx = write_seq_id * half_head_size + head_bias / 2;
-      const uint32_t new_emb_idx =
-          rope_3d ? emb_idx + bid * max_seq_len * HeadDim : emb_idx;
+      uint32_t new_emb_idx = emb_idx;
+      if (rope_3d) {
+        if (rope_3d_delta) {
+          const int rope_pos = write_seq_id + rope_3d_delta[bid];
+          new_emb_idx = rope_pos * half_head_size + head_bias / 2;
+        } else {
+          new_emb_idx = emb_idx + bid * max_seq_len * HeadDim;
+        }
+      }
       Load<float, 1>(&cos_emb[new_emb_idx], &cos_emb_vec1);
       Load<float, 1>(&cos_emb[new_emb_idx + 4], &cos_emb_vec2);
       Load<float, 1>(&sin_emb[new_emb_idx], &sin_emb_vec1);
@@ -1533,7 +1564,8 @@ __global__ void append_decode_cache_int8_rope_kernel(
     const float max_bound,
     const float min_bound,
     const int kv_num_heads,
-    const bool rope_3d) {
+    const bool rope_3d,
+    const int* rope_3d_delta) {
   static_assert(HeadDim == 128, "just support HeadDim be 128 now!");
   static_assert(VecSize == 4, "just support VecSize be 4 now, 32 * 4!");
   constexpr int NUM_WARPS = 4;
@@ -1564,7 +1596,13 @@ __global__ void append_decode_cache_int8_rope_kernel(
         qkv_out + start_token_idx * hidden_size + head_idx * HeadDim;
 
     uint32_t emb_offset = write_seq_id * half_head_size;
-    emb_offset += rope_3d ? bid * max_seq_len * HeadDim : 0;
+    if (rope_3d) {
+      if (rope_3d_delta) {
+        emb_offset = (write_seq_id + rope_3d_delta[bid]) * half_head_size;
+      } else {
+        emb_offset += bid * max_seq_len * HeadDim;
+      }
+    }
     apply_rope<T, VecSize, HeadDim, 32, EnforceFmulRN>(qkv_now,
                                                        cos_emb + emb_offset,
                                                        sin_emb + emb_offset,
@@ -1634,8 +1672,15 @@ __global__ void append_decode_cache_int8_rope_kernel(
         cache_v_scale + v_head_idx * HeadDim + head_bias;
     if (head_idx < num_heads + kv_num_heads) {
       const uint32_t emb_idx = write_seq_id * half_head_size + head_bias / 2;
-      uint32_t new_emb_idx =
-          rope_3d ? emb_idx + bid * max_seq_len * HeadDim : emb_idx;
+      uint32_t new_emb_idx = emb_idx;
+      if (rope_3d) {
+        if (rope_3d_delta) {
+          const int rope_pos = write_seq_id + rope_3d_delta[bid];
+          new_emb_idx = rope_pos * half_head_size + head_bias / 2;
+        } else {
+          new_emb_idx = emb_idx + bid * max_seq_len * HeadDim;
+        }
+      }
       Load<float, 1>(&cos_emb[new_emb_idx], &cos_emb_vec1);
       Load<float, 1>(&cos_emb[new_emb_idx + 4], &cos_emb_vec2);
       Load<float, 1>(&sin_emb[new_emb_idx], &sin_emb_vec1);
diff --git a/custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_kernel.cu b/custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_kernel.cu
index e25816fcbb3..062e911e886 100644
--- a/custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_kernel.cu
+++ b/custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_kernel.cu
@@ -38,6 +38,7 @@ void append_decode_cache_rope_qk_norm(const QKV_TYPE* qkv,
                                       const cudaStream_t& stream,
                                       const bool use_neox_style,
                                       const bool rope_3d,
+                                      const int* rope_3d_delta,
                                       const float* q_norm_weight,
                                       const float* k_norm_weight,
                                       const float rms_norm_eps) {
@@ -76,6 +77,7 @@ void append_decode_cache_rope_qk_norm(const QKV_TYPE* qkv,
       elem_nums,
       kv_num_heads,
       rope_3d,
+      rope_3d_delta,
       q_norm_weight,
       k_norm_weight,
       rms_norm_eps);
@@ -104,7 +106,8 @@ void append_decode_cache_rope(const QKV_TYPE* qkv,
                               const int bsz,
                               const cudaStream_t& stream,
                               const bool use_neox_style,
-                              const bool rope_3d) {
+                              const bool rope_3d,
+                              const int* rope_3d_delta) {
   const uint32_t elem_nums =
       use_neox_style ? bsz * (num_heads + 2 * kv_num_heads) * dim_head / 2
                      : bsz * (num_heads + 2 * kv_num_heads) * dim_head;
@@ -255,7 +258,8 @@ void append_decode_cache_rope(const QKV_TYPE* qkv,
                                block_size,
                                elem_nums,
                                kv_num_heads,
-                               rope_3d);
+                               rope_3d,
+                               rope_3d_delta);
     }
   }
 }
@@ -288,7 +292,8 @@ void append_decode_cache_int8_rope(const QKV_TYPE* qkv,
                                    const int bsz,
                                    const cudaStream_t& stream,
                                    const bool use_neox_style,
-                                   const bool rope_3d) {
+                                   const bool rope_3d,
+                                   const int* rope_3d_delta) {
   constexpr int num_warps = 4;
   const int all_warps =
       ((num_heads + 2 * kv_num_heads) + num_warps - 1) / num_warps * num_warps;
@@ -427,7 +432,8 @@ void append_decode_cache_int8_rope(const QKV_TYPE* qkv,
           127.0f,
           -127.0f,
           kv_num_heads,
-          rope_3d);
+          rope_3d,
+          rope_3d_delta);
     }
   }
 }
@@ -615,6 +621,7 @@ void DecoderWriteCacheWithRoPEKernel(
     const std::string& cache_quant_type_str,
     const bool use_neox_rotary_style,
     const bool rope_3d,
+    const paddle::optional<paddle::Tensor>& rope_3d_delta,
     const int max_seq_len,
     cudaStream_t& stream,
     paddle::Tensor* qkv_out,
@@ -659,6 +666,26 @@ void DecoderWriteCacheWithRoPEKernel(
     }
   }
 
+  if (rope_3d_delta && use_neox_rotary_style) {
+    PD_THROW("rope_3d_delta is only supported for non-NeoX Ernie decode RoPE.");
+  }
+  if (rope_3d_delta && cache_quant_type_str != "none" &&
+      cache_quant_type_str != "cache_fp8" &&
+      cache_quant_type_str != "block_wise_fp8") {
+    PD_THROW(
+        "rope_3d_delta is only supported when cache_quant_type is none, "
+        "cache_fp8 or block_wise_fp8.");
+  }
+  if (rope_3d_delta && qkv_out_scales) {
+    PD_THROW("rope_3d_delta is not supported for quantized qkv decode path.");
+  }
+  if (rope_3d_delta && cache_quant_type_str == "block_wise_fp8" &&
+      !q_norm_weight && !k_norm_weight && use_neox_rotary_style) {
+    PD_THROW(
+        "rope_3d_delta is not supported for NeoX block_wise_fp8 decode "
+        "RoPE.");
+  }
+
   if (q_norm_weight && k_norm_weight) {
     if (cache_quant_type_str == "none") {
       append_decode_cache_rope_qk_norm<DataType_, QKV_TYPE, EnforceFmulRN>(
@@ -686,6 +713,7 @@ void DecoderWriteCacheWithRoPEKernel(
           stream,
           use_neox_rotary_style,
           rope_3d,
+          rope_3d_delta ? rope_3d_delta.get().data<int>() : nullptr,
           q_norm_weight ? q_norm_weight.get().data<float>() : nullptr,
           k_norm_weight ? k_norm_weight.get().data<float>() : nullptr,
           rms_norm_eps);
@@ -731,6 +759,7 @@ void DecoderWriteCacheWithRoPEKernel(
           -127.0f,
           kv_num_heads,
           rope_3d,
+          rope_3d_delta ? rope_3d_delta.get().data<int>() : nullptr,
           rms_norm_eps);
     } else if ((cache_quant_type_str == "cache_fp8")) {
       constexpr int num_warps = 4;
@@ -774,6 +803,7 @@ void DecoderWriteCacheWithRoPEKernel(
           -127.0f,
           kv_num_heads,
           rope_3d,
+          rope_3d_delta ? rope_3d_delta.get().data<int>() : nullptr,
           rms_norm_eps);
     } else {
       PD_THROW(
@@ -807,7 +837,8 @@ void DecoderWriteCacheWithRoPEKernel(
           bsz,
           stream,
           use_neox_rotary_style,
-          rope_3d);
+          rope_3d,
+          rope_3d_delta ? rope_3d_delta.get().data<int>() : nullptr);
     } else if (cache_quant_type_str == "cache_int8") {
       bool is_scale_channel_wise = false;
       if (cache_k_scale &&
@@ -849,7 +880,8 @@ void DecoderWriteCacheWithRoPEKernel(
             bsz,
             stream,
             use_neox_rotary_style,
-            rope_3d);
+            rope_3d,
+            nullptr);
       } else {
         append_decode_cache_int8_rope<DataType_,
                                       QKV_TYPE,
@@ -885,7 +917,8 @@ void DecoderWriteCacheWithRoPEKernel(
             bsz,
             stream,
             use_neox_rotary_style,
-            rope_3d);
+            rope_3d,
+            nullptr);
       }
     } else if (cache_quant_type_str == "cache_fp8") {
       append_decode_cache_int8_rope<DataType_,
@@ -922,7 +955,8 @@ void DecoderWriteCacheWithRoPEKernel(
           bsz,
           stream,
           use_neox_rotary_style,
-          rope_3d);
+          rope_3d,
+          rope_3d_delta ? rope_3d_delta.get().data<int>() : nullptr);
     } else if (cache_quant_type_str == "block_wise_fp8") {
       constexpr int num_warps = 4;
       const int all_warps = ((num_heads + 2 * kv_num_heads) + num_warps - 1) /
@@ -1003,6 +1037,7 @@ void DecoderWriteCacheWithRoPEKernel(
             -127.0f,
             kv_num_heads,
             rope_3d,
+            rope_3d_delta ? rope_3d_delta.get().data<int>() : nullptr,
             rms_norm_eps);
       }
     } else if (cache_quant_type_str == "cache_int4_zp") {
@@ -1045,8 +1080,8 @@ void DecoderWriteCacheWithRoPEKernel(
           rope_3d);
     } else {
       PD_THROW(
-          "cache_quant_type_str should be one of [none, cache_int8, cache_fp8 "
-          "cache_int4_zp]");
+          "cache_quant_type_str should be one of [none, cache_int8, cache_fp8, "
+          "block_wise_fp8, cache_int4_zp]");
     }
   }
 }
@@ -1070,6 +1105,7 @@ template void DecoderWriteCacheWithRoPEKernel<paddle::bfloat16, int>(
     const std::string& cache_quant_type_str,
     const bool use_neox_rotary_style,
     const bool rope_3d,
+    const paddle::optional<paddle::Tensor>& rope_3d_delta,
     const int max_seq_len,
     cudaStream_t& stream,
     paddle::Tensor* qkv_out,
@@ -1099,6 +1135,7 @@ DecoderWriteCacheWithRoPEKernel<paddle::bfloat16, paddle::bfloat16>(
     const std::string& cache_quant_type_str,
     const bool use_neox_rotary_style,
     const bool rope_3d,
+    const paddle::optional<paddle::Tensor>& rope_3d_delta,
     const int max_seq_len,
     cudaStream_t& stream,
     paddle::Tensor* qkv_out,
@@ -1127,6 +1164,7 @@ template void DecoderWriteCacheWithRoPEKernel<paddle::float16, int>(
     const std::string& cache_quant_type_str,
     const bool use_neox_rotary_style,
     const bool rope_3d,
+    const paddle::optional<paddle::Tensor>& rope_3d_delta,
     const int max_seq_len,
     cudaStream_t& stream,
     paddle::Tensor* qkv_out,
@@ -1155,6 +1193,7 @@ template void DecoderWriteCacheWithRoPEKernel<paddle::float16, paddle::float16>(
     const std::string& cache_quant_type_str,
     const bool use_neox_rotary_style,
     const bool rope_3d,
+    const paddle::optional<paddle::Tensor>& rope_3d_delta,
     const int max_seq_len,
     cudaStream_t& stream,
     paddle::Tensor* qkv_out,
@@ -1183,6 +1222,7 @@ template void DecoderWriteCacheWithRoPEKernel<paddle::bfloat16, int, true>(
     const std::string& cache_quant_type_str,
     const bool use_neox_rotary_style,
     const bool rope_3d,
+    const paddle::optional<paddle::Tensor>& rope_3d_delta,
     const int max_seq_len,
     cudaStream_t& stream,
     paddle::Tensor* qkv_out,
@@ -1212,6 +1252,7 @@ DecoderWriteCacheWithRoPEKernel<paddle::bfloat16, paddle::bfloat16, true>(
     const std::string& cache_quant_type_str,
     const bool use_neox_rotary_style,
     const bool rope_3d,
+    const paddle::optional<paddle::Tensor>& rope_3d_delta,
     const int max_seq_len,
     cudaStream_t& stream,
     paddle::Tensor* qkv_out,
@@ -1240,6 +1281,7 @@ template void DecoderWriteCacheWithRoPEKernel<paddle::float16, int, true>(
     const std::string& cache_quant_type_str,
     const bool use_neox_rotary_style,
     const bool rope_3d,
+    const paddle::optional<paddle::Tensor>& rope_3d_delta,
     const int max_seq_len,
     cudaStream_t& stream,
     paddle::Tensor* qkv_out,
@@ -1269,6 +1311,7 @@ DecoderWriteCacheWithRoPEKernel<paddle::float16, paddle::float16, true>(
     const std::string& cache_quant_type_str,
     const bool use_neox_rotary_style,
     const bool rope_3d,
+    const paddle::optional<paddle::Tensor>& rope_3d_delta,
     const int max_seq_len,
     cudaStream_t& stream,
     paddle::Tensor* qkv_out,
diff --git a/custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_kernel.h b/custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_kernel.h
index 2acb4f8293b..f20d3e0dd74 100644
--- a/custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_kernel.h
+++ b/custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_kernel.h
@@ -35,6 +35,7 @@ void DecoderWriteCacheWithRoPEKernel(
     const std::string& cache_quant_type_str,
     const bool use_neox_rotary_style,
     const bool rope_3d,
+    const paddle::optional<paddle::Tensor>& rope_3d_delta,
     const int max_seq_len,
     cudaStream_t& stream,
     paddle::Tensor* qkv_out,
diff --git a/custom_ops/gpu_ops/append_attn/encoder_write_cache_with_rope_impl.cuh b/custom_ops/gpu_ops/append_attn/encoder_write_cache_with_rope_impl.cuh
index 60d5d34bf48..faaeb1491c6 100644
--- a/custom_ops/gpu_ops/append_attn/encoder_write_cache_with_rope_impl.cuh
+++ b/custom_ops/gpu_ops/append_attn/encoder_write_cache_with_rope_impl.cuh
@@ -34,7 +34,8 @@ __global__ void IntVariableLengthRotaryKernel(
     const int num_head,
     const int seq_len,
     const int last_dim,
-    const bool rope_3d) {
+    const bool rope_3d,
+    const int *rope_3d_delta) {
   using LoadT = AlignedVector<int, VecSize>;
   using LoadBiasT = AlignedVector<T, VecSize>;
   using LoadScaleT = AlignedVector<float, VecSize>;
@@ -70,7 +71,13 @@ __global__ void IntVariableLengthRotaryKernel(
         (token_idx - cu_seqlens_q[ori_bi]) + seq_lens_decoder[ori_bi];
 
     const int emb_idx = ori_seq_id * half_lastdim + h_bias / 2;
-    int new_emb_idx = rope_3d ? emb_idx + ori_bi * last_dim * seq_len : emb_idx;
+    int new_emb_idx = emb_idx;
+    if (rope_3d_delta) {
+      new_emb_idx =
+          (ori_seq_id + rope_3d_delta[ori_bi]) * half_lastdim + h_bias / 2;
+    } else if (rope_3d) {
+      new_emb_idx = emb_idx + ori_bi * last_dim * seq_len;
+    }
     const int bias_idx = qkv_id * hidden_size + hi * last_dim + h_bias;
     const int64_t base_idx = token_idx * 3 * hidden_size + bias_idx;
     Load<int, VecSize>(&qkv[base_idx], &src_vec);
@@ -128,7 +135,8 @@ __global__ void VariableLengthRotaryKernel(
     const int num_head,
     const int seq_len,
     const int last_dim,
-    const bool rope_3d) {
+    const bool rope_3d,
+    const int *rope_3d_delta) {
   using LoadT = AlignedVector<T, VecSize>;
   constexpr int HalfVecSize = VecSize / 2;
   using LoadEmbT = AlignedVector<float, HalfVecSize>;
@@ -160,7 +168,13 @@ __global__ void VariableLengthRotaryKernel(
         (token_idx - cu_seqlens_q[ori_bi]) + seq_lens_decoder[ori_bi];
 
     const int emb_idx = ori_seq_id * half_lastdim + h_bias / 2;
-    int new_emb_idx = rope_3d ? emb_idx + ori_bi * last_dim * seq_len : emb_idx;
+    int new_emb_idx = emb_idx;
+    if (rope_3d_delta) {
+      new_emb_idx =
+          (ori_seq_id + rope_3d_delta[ori_bi]) * half_lastdim + h_bias / 2;
+    } else if (rope_3d) {
+      new_emb_idx = emb_idx + ori_bi * last_dim * seq_len;
+    }
     const int64_t base_idx = token_idx * 3 * hidden_size +
                              qkv_id * hidden_size + hi * last_dim + h_bias;
     Load<T, VecSize>(&qkv[base_idx], &src_vec);
@@ -202,7 +216,8 @@ __global__ void IntNeoxVariableLengthRotaryKernel(
     const int num_head,
     const int seq_len,
     const int last_dim,
-    const bool rope_3d) {
+    const bool rope_3d,
+    const int *rope_3d_delta) {
   using LoadT = AlignedVector<int, VecSize>;
   using LoadBiasT = AlignedVector<T, VecSize>;
   using LoadScaleT = AlignedVector<float, VecSize>;
@@ -307,7 +322,8 @@ __global__ void NeoxVariableLengthRotaryKernel(
     const int num_head,
     const int seq_len,
     const int last_dim,
-    const bool rope_3d) {
+    const bool rope_3d,
+    const int *rope_3d_delta) {
   using LoadT = AlignedVector<T, VecSize>;
   using LoadEmbT = AlignedVector<float, VecSize>;
   LoadT left_vec;
@@ -389,7 +405,8 @@ __global__ void IntGQAVariableLengthRotaryKernel(
     const int kv_num_head,
     const int seq_len,
     const int last_dim,
-    const bool rope_3d) {
+    const bool rope_3d,
+    const int *rope_3d_delta) {
   using LoadT = AlignedVector<int, VecSize>;
   using LoadBiasT = AlignedVector<T, VecSize>;
   using LoadScaleT = AlignedVector<float, VecSize>;
@@ -422,8 +439,14 @@ __global__ void IntGQAVariableLengthRotaryKernel(
         (token_idx - cu_seqlens_q[ori_bi]) + seq_lens_decoder[ori_bi];
 
     const int64_t emb_idx = ori_seq_id * half_lastdim + h_bias / 2;
-    int64_t new_emb_idx =
-        rope_3d ? emb_idx + ori_bi * last_dim * seq_len : emb_idx;
+    int64_t new_emb_idx = emb_idx;
+    if (rope_3d_delta) {
+      new_emb_idx = static_cast<int64_t>(ori_seq_id + rope_3d_delta[ori_bi]) *
+                        half_lastdim +
+                    h_bias / 2;
+    } else if (rope_3d) {
+      new_emb_idx = emb_idx + ori_bi * last_dim * seq_len;
+    }
     const int64_t bias_idx = hi * last_dim + h_bias;
     const int64_t base_idx = token_idx * offset + bias_idx;
     Load<int, VecSize>(&qkv[base_idx], &src_vec);
@@ -483,6 +506,7 @@ __global__ void GQAVariableLengthRotaryQKNormKernel(
     const int seq_len,
     const int last_dim,
     const bool rope_3d,
+    const int *rope_3d_delta,
     const float *q_norm_weight,
     const float *k_norm_weight,
     const float rms_norm_eps) {
@@ -521,8 +545,14 @@ __global__ void GQAVariableLengthRotaryQKNormKernel(
         h_bias;
     Load<T, VecSize>(&qkv[base_idx], &src_vec);
 
-    int64_t new_emb_idx =
-        rope_3d ? emb_idx + ori_bi * last_dim * seq_len : emb_idx;
+    int64_t new_emb_idx = emb_idx;
+    if (rope_3d_delta) {
+      new_emb_idx = static_cast<int64_t>(ori_seq_id + rope_3d_delta[ori_bi]) *
+                        half_lastdim +
+                    h_bias / 2;
+    } else if (rope_3d) {
+      new_emb_idx = emb_idx + ori_bi * last_dim * seq_len;
+    }
     Load<float, HalfVecSize>(&cos_emb[new_emb_idx], &cos_emb_vec);
     Load<float, HalfVecSize>(&sin_emb[new_emb_idx], &sin_emb_vec);
 
@@ -580,7 +610,8 @@ __global__ void GQAVariableLengthRotaryKernel(const T *qkv,
                                               const int kv_num_head,
                                               const int seq_len,
                                               const int last_dim,
-                                              const bool rope_3d) {
+                                              const bool rope_3d,
+                                              const int *rope_3d_delta) {
   using LoadT = AlignedVector<T, VecSize>;
   constexpr int HalfVecSize = VecSize / 2;
   using LoadEmbT = AlignedVector<float, HalfVecSize>;
@@ -615,8 +646,14 @@ __global__ void GQAVariableLengthRotaryKernel(const T *qkv,
         h_bias;
     Load<T, VecSize>(&qkv[base_idx], &src_vec);
 
-    int64_t new_emb_idx =
-        rope_3d ? emb_idx + ori_bi * last_dim * seq_len : emb_idx;
+    int64_t new_emb_idx = emb_idx;
+    if (rope_3d_delta) {
+      new_emb_idx = static_cast<int64_t>(ori_seq_id + rope_3d_delta[ori_bi]) *
+                        half_lastdim +
+                    h_bias / 2;
+    } else if (rope_3d) {
+      new_emb_idx = emb_idx + ori_bi * last_dim * seq_len;
+    }
     Load<float, HalfVecSize>(&cos_emb[new_emb_idx], &cos_emb_vec);
     Load<float, HalfVecSize>(&sin_emb[new_emb_idx], &sin_emb_vec);
 #pragma unroll
@@ -658,7 +695,8 @@ __global__ void IntGQAVariableLengthRotaryQuantKVKernel(
     const int kv_num_head,
     const int seq_len,
     const int last_dim,
-    const bool rope_3d) {
+    const bool rope_3d,
+    const int *rope_3d_delta) {
   using LoadIn = AlignedVector<int, VecSize>;
   using LoadBiasT = AlignedVector<T, VecSize>;
   constexpr int HalfVecSize = VecSize / 2;
@@ -692,8 +730,14 @@ __global__ void IntGQAVariableLengthRotaryQuantKVKernel(
         (token_idx - cu_seqlens_q[ori_bi]) + seq_lens_decoder[ori_bi];
 
     const int64_t emb_idx = ori_seq_id * half_lastdim + h_bias / 2;
-    int64_t new_emb_idx =
-        rope_3d ? emb_idx + ori_bi * last_dim * seq_len : emb_idx;
+    int64_t new_emb_idx = emb_idx;
+    if (rope_3d_delta) {
+      new_emb_idx = static_cast<int64_t>(ori_seq_id + rope_3d_delta[ori_bi]) *
+                        half_lastdim +
+                    h_bias / 2;
+    } else if (rope_3d) {
+      new_emb_idx = emb_idx + ori_bi * last_dim * seq_len;
+    }
     const int64_t bias_idx = hi * last_dim + h_bias;
     const int64_t base_idx = token_idx * offset + bias_idx;
     Load<int, VecSize>(&qkv[base_idx], &src_vec);
@@ -769,7 +813,8 @@ __global__ void GQAVariableLengthRotaryQuantKVKernel(
     const int kv_num_head,
     const int seq_len,
     const int last_dim,
-    const bool rope_3d) {
+    const bool rope_3d,
+    const int *rope_3d_delta) {
   using LoadT = AlignedVector<T, VecSize>;
   constexpr int HalfVecSize = VecSize / 2;
   using LoadEmbT = AlignedVector<float, HalfVecSize>;
@@ -800,8 +845,14 @@ __global__ void GQAVariableLengthRotaryQuantKVKernel(
         (token_idx - cu_seqlens_q[ori_bi]) + seq_lens_decoder[ori_bi];
 
     const int64_t emb_idx = ori_seq_id * half_lastdim + h_bias / 2;
-    int64_t new_emb_idx =
-        rope_3d ? emb_idx + ori_bi * last_dim * seq_len : emb_idx;
+    int64_t new_emb_idx = emb_idx;
+    if (rope_3d_delta) {
+      new_emb_idx = static_cast<int64_t>(ori_seq_id + rope_3d_delta[ori_bi]) *
+                        half_lastdim +
+                    h_bias / 2;
+    } else if (rope_3d) {
+      new_emb_idx = emb_idx + ori_bi * last_dim * seq_len;
+    }
     const int64_t bias_idx = hi * last_dim + h_bias;
     const int64_t base_idx = token_idx * offset + bias_idx;
     Load<T, VecSize>(&qkv[base_idx], &src_vec);
@@ -880,7 +931,8 @@ __global__ void IntGQANeoxVariableLengthRotaryKernel(
     const int kv_num_head,
     const int seq_len,
     const int last_dim,
-    const bool rope_3d) {
+    const bool rope_3d,
+    const int *rope_3d_delta) {
   using LoadT = AlignedVector<int, VecSize>;
   using LoadBiasT = AlignedVector<T, VecSize>;
   using LoadScaleT = AlignedVector<float, VecSize>;
@@ -983,7 +1035,8 @@ __global__ void GQANeoxVariableLengthRotaryKernel(const T *qkv,
                                                   const int kv_num_head,
                                                   const int seq_len,
                                                   const int last_dim,
-                                                  const bool rope_3d) {
+                                                  const bool rope_3d,
+                                                  const int *rope_3d_delta) {
   using LoadT = AlignedVector<T, VecSize>;
   using LoadEmbT = AlignedVector<float, VecSize>;
   LoadT left_vec;
@@ -1062,7 +1115,8 @@ __global__ void GQANeoxVariableLengthPartialRotaryKernel(
     const int seq_len,
     const int head_dim,
     const int rotary_dim,
-    const bool rope_3d) {
+    const bool rope_3d,
+    const int *rope_3d_delta) {
   using LoadT = AlignedVector<T, VecSize>;
   using LoadEmbT = AlignedVector<float, VecSize>;
   LoadT left_vec;
@@ -2246,7 +2300,8 @@ void rotary_qk_variable(
     const int dim_head,
     const cudaStream_t &stream,
     bool use_neox_style = false,
-    bool rope_3d = false) {
+    bool rope_3d = false,
+    const int *rope_3d_delta = nullptr) {
   int64_t elem_nums = qkv_out_scales ? token_num * 3 * head_num * dim_head
                                      : token_num * 2 * head_num * dim_head;
   if (use_neox_style) {
@@ -2282,7 +2337,8 @@ void rotary_qk_variable(
           head_num,
           seq_len,
           dim_head,
-          rope_3d);
+          rope_3d,
+          rope_3d_delta);
     } else {
       launchWithPdlWhenEnabled(
           VariableLengthRotaryKernel<T, PackSize, EnforceFmulRN>,
@@ -2302,7 +2358,8 @@ void rotary_qk_variable(
           head_num,
           seq_len,
           dim_head,
-          rope_3d);
+          rope_3d,
+          rope_3d_delta);
     }
   } else {
     const float *cos_emb = rotary_emb;
@@ -2328,7 +2385,8 @@ void rotary_qk_variable(
           head_num,
           seq_len,
           dim_head,
-          rope_3d);
+          rope_3d,
+          rope_3d_delta);
     } else {
       launchWithPdlWhenEnabled(
           NeoxVariableLengthRotaryKernel<T, PackSize, EnforceFmulRN>,
@@ -2348,7 +2406,8 @@ void rotary_qk_variable(
           head_num,
           seq_len,
           dim_head,
-          rope_3d);
+          rope_3d,
+          rope_3d_delta);
     }
   }
 }
@@ -2373,6 +2432,7 @@ void gqa_rotary_qk_norm_variable(
     const cudaStream_t &stream,
     bool use_neox_style = false,
     bool rope_3d = false,
+    const int *rope_3d_delta = nullptr,
     const float *q_norm_weight = nullptr,
     const float *k_norm_weight = nullptr,
     const float rms_norm_eps = 1e-6) {
@@ -2415,6 +2475,7 @@ void gqa_rotary_qk_norm_variable(
       seq_len,
       dim_head,
       rope_3d,
+      rope_3d_delta,
       q_norm_weight,
       k_norm_weight,
       rms_norm_eps);
@@ -2440,7 +2501,8 @@ void gqa_rotary_qk_variable(
     const int rotary_dim,
     const cudaStream_t &stream,
     bool use_neox_style = false,
-    bool rope_3d = false) {
+    bool rope_3d = false,
+    const int *rope_3d_delta = nullptr) {
   int64_t elem_nums =
       qkv_out_scales
           ? token_num * (num_heads + 2 * kv_num_heads) * dim_head
@@ -2480,7 +2542,8 @@ void gqa_rotary_qk_variable(
           kv_num_heads,
           seq_len,
           dim_head,
-          rope_3d);
+          rope_3d,
+          rope_3d_delta);
     } else {
       auto *kernelFn =
           GQAVariableLengthRotaryKernel<T, PackSize, EnforceFmulRN>;
@@ -2502,7 +2565,8 @@ void gqa_rotary_qk_variable(
                                kv_num_heads,
                                seq_len,
                                dim_head,
-                               rope_3d);
+                               rope_3d,
+                               rope_3d_delta);
     }
   } else {
     const float *cos_emb = rotary_emb;
@@ -2529,7 +2593,8 @@ void gqa_rotary_qk_variable(
           kv_num_heads,
           seq_len,
           dim_head,
-          rope_3d);
+          rope_3d,
+          rope_3d_delta);
     } else {
       if (rotary_dim < dim_head) {
         PD_CHECK((rotary_dim / 2) % PackSize == 0);
@@ -2568,7 +2633,8 @@ void gqa_rotary_qk_variable(
                                  seq_len,
                                  dim_head,
                                  rotary_dim,
-                                 rope_3d);
+                                 rope_3d,
+                                 rope_3d_delta);
       } else {
         auto *kernelFn =
             GQANeoxVariableLengthRotaryKernel<T, PackSize, EnforceFmulRN>;
@@ -2592,7 +2658,8 @@ void gqa_rotary_qk_variable(
                                  kv_num_heads,
                                  seq_len,
                                  dim_head,
-                                 rope_3d);
+                                 rope_3d,
+                                 rope_3d_delta);
       }
     }
   }
@@ -2619,7 +2686,8 @@ void gqa_rotary_qk_quant_variable(
     const int dim_head,
     const cudaStream_t &stream,
     bool use_neox_style = false,
-    bool rope_3d = false) {
+    bool rope_3d = false,
+    const int *rope_3d_delta = nullptr) {
   int64_t elem_nums = token_num * (num_heads + 2 * kv_num_heads) * dim_head;
   if (use_neox_style) {
     elem_nums /= 2;
@@ -2657,7 +2725,8 @@ void gqa_rotary_qk_quant_variable(
           kv_num_heads,
           seq_len,
           dim_head,
-          rope_3d);
+          rope_3d,
+          rope_3d_delta);
     } else {
       launchWithPdlWhenEnabled(
           GQAVariableLengthRotaryQuantKVKernel<T, PackSize, EnforceFmulRN>,
@@ -2681,7 +2750,8 @@ void gqa_rotary_qk_quant_variable(
           kv_num_heads,
           seq_len,
           dim_head,
-          rope_3d);
+          rope_3d,
+          rope_3d_delta);
     }
   } else {
     PADDLE_THROW("Use_neox_style mode isn't implemented yet");
diff --git a/custom_ops/gpu_ops/append_attn/encoder_write_cache_with_rope_kernel.h b/custom_ops/gpu_ops/append_attn/encoder_write_cache_with_rope_kernel.h
index 23969aa429f..d72d15bebc6 100644
--- a/custom_ops/gpu_ops/append_attn/encoder_write_cache_with_rope_kernel.h
+++ b/custom_ops/gpu_ops/append_attn/encoder_write_cache_with_rope_kernel.h
@@ -43,6 +43,7 @@ void EncoderWriteCacheWithRopeKernel(
     const int max_seq_len,
     const bool use_neox_style,
     const bool rope_3d,
+    const paddle::optional<paddle::Tensor>& rope_3d_delta,
     cudaStream_t& stream,
     paddle::Tensor* qkv_out,
     paddle::Tensor* key_cache_out,
@@ -96,6 +97,7 @@ void EncoderWriteCacheWithRopeKernel(
           stream,
           use_neox_style,
           rope_3d,
+          rope_3d_delta ? rope_3d_delta.get().data<int>() : nullptr,
           q_norm_weight ? q_norm_weight.get().data<float>() : nullptr,
           k_norm_weight ? k_norm_weight.get().data<float>() : nullptr,
           rms_norm_eps);
@@ -123,7 +125,8 @@ void EncoderWriteCacheWithRopeKernel(
           head_dim,
           stream,
           use_neox_style,
-          rope_3d);
+          rope_3d,
+          rope_3d_delta ? rope_3d_delta.get().data<int>() : nullptr);
     } else {
       if (!is_scale_channel_wise) {
         gqa_rotary_qk_variable<T, QKV_TYPE, EnforceFmulRN>(
@@ -145,7 +148,8 @@ void EncoderWriteCacheWithRopeKernel(
             rotary_dim,
             stream,
             use_neox_style,
-            rope_3d);
+            rope_3d,
+            rope_3d_delta ? rope_3d_delta.get().data<int>() : nullptr);
       } else {
         gqa_rotary_qk_quant_variable<T, QKV_TYPE, EnforceFmulRN>(
             qkv_out->data<T>(),
@@ -167,7 +171,8 @@ void EncoderWriteCacheWithRopeKernel(
             head_dim,
             stream,
             use_neox_style,
-            rope_3d);
+            rope_3d,
+            rope_3d_delta ? rope_3d_delta.get().data<int>() : nullptr);
       }
     }
   }
diff --git a/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_impl.cuh b/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_impl.cuh
index 9e63cf4e351..7f6da089822 100644
--- a/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_impl.cuh
+++ b/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_impl.cuh
@@ -51,7 +51,8 @@ __global__ void append_speculate_cache_T_rope_qk_norm_kernel(
     const float* q_norm_weight,
     const float* k_norm_weight,
     const float rms_norm_eps,
-    const bool rope_3d) {
+    const bool rope_3d,
+    const int* rope_3d_delta) {
   using LoadT = AlignedVector<T, VecSize>;
   using LoadFloat = AlignedVector<float, VecSize>;
   using LoadInT = AlignedVector<InT, VecSize>;
@@ -109,8 +110,15 @@ __global__ void append_speculate_cache_T_rope_qk_norm_kernel(
     if (hi < num_heads + gqa_group_size) {
       // q k rope
       const int64_t emb_idx = write_seq_id * half_head_size + h_bias / 2;
-      uint32_t new_emb_idx =
-          rope_3d ? emb_idx + ori_bi * max_seq_len * head_size : emb_idx;
+      uint32_t new_emb_idx;
+      if (rope_3d_delta) {
+        const int rope_pos = write_seq_id + rope_3d_delta[ori_bi];
+        new_emb_idx = rope_pos * half_head_size + h_bias / 2;
+      } else if (rope_3d) {
+        new_emb_idx = emb_idx + ori_bi * max_seq_len * head_size;
+      } else {
+        new_emb_idx = emb_idx;
+      }
       Load<float, HalfVecSize>(&cos_emb[new_emb_idx], &cos_emb_vec);
       Load<float, HalfVecSize>(&sin_emb[new_emb_idx], &sin_emb_vec);
     }
@@ -366,7 +374,8 @@ __global__ void append_speculate_cache_rope_kernel(
     const int block_size,
     const int elem_cnt,
     const int gqa_group_size,
-    const bool rope_3d) {
+    const bool rope_3d,
+    const int* rope_3d_delta) {
   using LoadT = AlignedVector<T, VecSize>;
   using LoadFloat = AlignedVector<float, VecSize>;
   using LoadInT = AlignedVector<InT, VecSize>;
@@ -420,8 +429,15 @@ __global__ void append_speculate_cache_rope_kernel(
     if (hi < num_heads + gqa_group_size) {
       // q k rope
       const int64_t emb_idx = write_seq_id * half_head_size + h_bias / 2;
-      int64_t new_emb_idx =
-          rope_3d ? emb_idx + ori_bi * max_seq_len * head_size : emb_idx;
+      int64_t new_emb_idx;
+      if (rope_3d_delta) {
+        const int rope_pos = write_seq_id + rope_3d_delta[ori_bi];
+        new_emb_idx = rope_pos * half_head_size + h_bias / 2;
+      } else if (rope_3d) {
+        new_emb_idx = emb_idx + ori_bi * max_seq_len * head_size;
+      } else {
+        new_emb_idx = emb_idx;
+      }
       Load<float, HalfVecSize>(&cos_emb[new_emb_idx], &cos_emb_vec);
       Load<float, HalfVecSize>(&sin_emb[new_emb_idx], &sin_emb_vec);
     }
@@ -815,6 +831,7 @@ __global__ void append_speculate_cache_fp8_rope_qk_norm_dynamic_kernel(
     const float min_bound,
     const int gqa_group_size,
     const bool rope_3d,
+    const int* __restrict__ rope_3d_delta,
     const float rms_norm_eps) {
   static_assert(HeadDim == 128, "just support HeadDim be 128 now!");
   static_assert(VecSize == 4, "just support VecSize be 4 now, 32 * 4!");
@@ -874,8 +891,15 @@ __global__ void append_speculate_cache_fp8_rope_qk_norm_dynamic_kernel(
 
       // q rope
       const uint32_t emb_idx = write_seq_id * half_head_size + head_bias / 2;
-      uint32_t new_emb_idx =
-          rope_3d ? emb_idx + bid * max_seq_len * HeadDim : emb_idx;
+      uint32_t new_emb_idx;
+      if (rope_3d_delta) {
+        const int rope_pos = write_seq_id + rope_3d_delta[bid];
+        new_emb_idx = rope_pos * half_head_size + head_bias / 2;
+      } else if (rope_3d) {
+        new_emb_idx = emb_idx + bid * max_seq_len * HeadDim;
+      } else {
+        new_emb_idx = emb_idx;
+      }
       Load<float, HalfVecSize>(&cos_emb[new_emb_idx], &cos_emb_vec);
       Load<float, HalfVecSize>(&sin_emb[new_emb_idx], &sin_emb_vec);
 #pragma unroll
@@ -939,8 +963,15 @@ __global__ void append_speculate_cache_fp8_rope_qk_norm_dynamic_kernel(
     const int v_head_idx = head_idx - num_heads - gqa_group_size;
     if (head_idx < num_heads + gqa_group_size) {
       const uint32_t emb_idx = write_seq_id * half_head_size + head_bias / 2;
-      uint32_t new_emb_idx =
-          rope_3d ? emb_idx + bid * max_seq_len * HeadDim : emb_idx;
+      uint32_t new_emb_idx;
+      if (rope_3d_delta) {
+        const int rope_pos = write_seq_id + rope_3d_delta[bid];
+        new_emb_idx = rope_pos * half_head_size + head_bias / 2;
+      } else if (rope_3d) {
+        new_emb_idx = emb_idx + bid * max_seq_len * HeadDim;
+      } else {
+        new_emb_idx = emb_idx;
+      }
       Load<float, 1>(&cos_emb[new_emb_idx], &cos_emb_vec1);
       Load<float, 1>(&cos_emb[new_emb_idx + 4], &cos_emb_vec2);
       Load<float, 1>(&sin_emb[new_emb_idx], &sin_emb_vec1);
diff --git a/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_kernel.cu b/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_kernel.cu
index 4ee00f12e07..021380365df 100644
--- a/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_kernel.cu
+++ b/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_kernel.cu
@@ -42,7 +42,8 @@ void append_speculate_cache_rope_qk_norm(const QKV_TYPE* qkv,
                                          const float* q_norm_weight,
                                          const float* k_norm_weight,
                                          const float rms_norm_eps,
-                                         const bool rope_3d) {
+                                         const bool rope_3d,
+                                         const int* rope_3d_delta) {
   int output_inner_dim = num_heads + 2 * kv_num_heads;
   const uint32_t elem_nums =
       use_neox_style ? token_num * (num_heads + 2 * kv_num_heads) * dim_head / 2
@@ -86,7 +87,8 @@ void append_speculate_cache_rope_qk_norm(const QKV_TYPE* qkv,
                                               q_norm_weight,
                                               k_norm_weight,
                                               rms_norm_eps,
-                                              rope_3d);
+                                              rope_3d,
+                                              rope_3d_delta);
   }
 }
 
@@ -116,7 +118,8 @@ void append_speculate_cache_rope(const QKV_TYPE* qkv,
                                  const int token_num,
                                  const cudaStream_t& stream,
                                  const bool use_neox_style,
-                                 const bool rope_3d) {
+                                 const bool rope_3d,
+                                 const int* rope_3d_delta) {
   int output_inner_dim = num_heads + 2 * kv_num_heads;
 
   const uint32_t elem_nums =
@@ -212,7 +215,8 @@ void append_speculate_cache_rope(const QKV_TYPE* qkv,
             block_size,
             elem_nums,
             kv_num_heads,
-            rope_3d);
+            rope_3d,
+            rope_3d_delta);
   }
 }
 
@@ -242,6 +246,7 @@ void append_speculate_cache_fp8_rope(const T* qkv,
                                      const int token_num,
                                      const cudaStream_t& stream,
                                      const bool rope_3d,
+                                     const int* rope_3d_delta,
                                      const float rms_norm_eps) {
   constexpr int num_warps = 4;
   const int all_warps =
@@ -291,6 +296,7 @@ void append_speculate_cache_fp8_rope(const T* qkv,
                                              -127.0f,
                                              kv_num_heads,
                                              rope_3d,
+                                             rope_3d_delta,
                                              rms_norm_eps);
 }
 
@@ -538,6 +544,7 @@ void SpeculateWriteCacheWithRoPEKernel(
     const std::string& cache_quant_type_str,
     const bool use_neox_rotary_style,
     const bool rope_3d,
+    const paddle::optional<paddle::Tensor>& rope_3d_delta,
     const int max_seq_len,
     cudaStream_t& stream,
     paddle::Tensor* qkv_out,
@@ -562,6 +569,21 @@ void SpeculateWriteCacheWithRoPEKernel(
 
   const float* cos_emb =
       rotary_embs ? rotary_embs.get().data<float>() : nullptr;
+  const int* rope_3d_delta_ptr =
+      rope_3d_delta ? rope_3d_delta.get().data<int>() : nullptr;
+  if (rope_3d_delta_ptr) {
+    if (use_neox_rotary_style) {
+      PD_THROW(
+          "rope_3d_delta is not supported with use_neox_rotary_style for "
+          "speculate decoder.");
+    }
+    if (cache_quant_type_str != "none" &&
+        cache_quant_type_str != "block_wise_fp8") {
+      PD_THROW(
+          "rope_3d_delta only supports cache_quant_type none/block_wise_fp8 "
+          "for speculate decoder.");
+    }
+  }
   const float* sin_emb;
   int rotary_dim = dim_head;
   if (rotary_embs) {
@@ -613,7 +635,8 @@ void SpeculateWriteCacheWithRoPEKernel(
           reinterpret_cast<const float*>(q_norm_weight.get().data<float>()),
           reinterpret_cast<const float*>(k_norm_weight.get().data<float>()),
           rms_norm_eps,
-          rope_3d);
+          rope_3d,
+          rope_3d_delta_ptr);
     } else if (cache_quant_type_str == "block_wise_fp8") {
       append_speculate_cache_fp8_rope<DataType_, true, EnforceFmulRN>(
           reinterpret_cast<const DataType_*>(qkv_ptr),
@@ -643,6 +666,7 @@ void SpeculateWriteCacheWithRoPEKernel(
           token_nums,
           stream,
           rope_3d,
+          rope_3d_delta_ptr,
           rms_norm_eps);
     } else if (cache_quant_type_str == "cache_fp8") {
       append_speculate_cache_fp8_rope<DataType_, false, EnforceFmulRN>(
@@ -673,6 +697,7 @@ void SpeculateWriteCacheWithRoPEKernel(
           token_nums,
           stream,
           rope_3d,
+          rope_3d_delta_ptr,
           rms_norm_eps);
     } else {
       PD_THROW(
@@ -710,7 +735,8 @@ void SpeculateWriteCacheWithRoPEKernel(
           token_nums,
           stream,
           use_neox_rotary_style,
-          rope_3d);
+          rope_3d,
+          rope_3d_delta_ptr);
     } else if (cache_quant_type_str == "cache_int8") {
       append_speculate_cache_int8_rope<DataType_,
                                        QKV_TYPE,
@@ -814,6 +840,7 @@ void SpeculateWriteCacheWithRoPEKernel(
           token_nums,
           stream,
           rope_3d,
+          rope_3d_delta_ptr,
           rms_norm_eps);
     } else if (cache_quant_type_str == "cache_int4_zp") {
       append_speculate_cache_int4_rope<DataType_, QKV_TYPE, EnforceFmulRN>(
@@ -883,6 +910,7 @@ template void SpeculateWriteCacheWithRoPEKernel<paddle::bfloat16, int>(
     const std::string& cache_quant_type_str,
     const bool use_neox_rotary_style,
     const bool rope_3d,
+    const paddle::optional<paddle::Tensor>& rope_3d_delta,
     const int max_seq_len,
     cudaStream_t& stream,
     paddle::Tensor* qkv_out,
@@ -913,6 +941,7 @@ SpeculateWriteCacheWithRoPEKernel<paddle::bfloat16, paddle::bfloat16>(
     const std::string& cache_quant_type_str,
     const bool use_neox_rotary_style,
     const bool rope_3d,
+    const paddle::optional<paddle::Tensor>& rope_3d_delta,
     const int max_seq_len,
     cudaStream_t& stream,
     paddle::Tensor* qkv_out,
@@ -942,6 +971,7 @@ template void SpeculateWriteCacheWithRoPEKernel<paddle::float16, int>(
     const std::string& cache_quant_type_str,
     const bool use_neox_rotary_style,
     const bool rope_3d,
+    const paddle::optional<paddle::Tensor>& rope_3d_delta,
     const int max_seq_len,
     cudaStream_t& stream,
     paddle::Tensor* qkv_out,
@@ -972,6 +1002,7 @@ SpeculateWriteCacheWithRoPEKernel<paddle::float16, paddle::float16>(
     const std::string& cache_quant_type_str,
     const bool use_neox_rotary_style,
     const bool rope_3d,
+    const paddle::optional<paddle::Tensor>& rope_3d_delta,
     const int max_seq_len,
     cudaStream_t& stream,
     paddle::Tensor* qkv_out,
@@ -1001,6 +1032,7 @@ template void SpeculateWriteCacheWithRoPEKernel<paddle::bfloat16, int, true>(
     const std::string& cache_quant_type_str,
     const bool use_neox_rotary_style,
     const bool rope_3d,
+    const paddle::optional<paddle::Tensor>& rope_3d_delta,
     const int max_seq_len,
     cudaStream_t& stream,
     paddle::Tensor* qkv_out,
@@ -1031,6 +1063,7 @@ SpeculateWriteCacheWithRoPEKernel<paddle::bfloat16, paddle::bfloat16, true>(
     const std::string& cache_quant_type_str,
     const bool use_neox_rotary_style,
     const bool rope_3d,
+    const paddle::optional<paddle::Tensor>& rope_3d_delta,
     const int max_seq_len,
     cudaStream_t& stream,
     paddle::Tensor* qkv_out,
@@ -1060,6 +1093,7 @@ template void SpeculateWriteCacheWithRoPEKernel<paddle::float16, int, true>(
     const std::string& cache_quant_type_str,
     const bool use_neox_rotary_style,
     const bool rope_3d,
+    const paddle::optional<paddle::Tensor>& rope_3d_delta,
     const int max_seq_len,
     cudaStream_t& stream,
     paddle::Tensor* qkv_out,
@@ -1090,6 +1124,7 @@ SpeculateWriteCacheWithRoPEKernel<paddle::float16, paddle::float16, true>(
     const std::string& cache_quant_type_str,
     const bool use_neox_rotary_style,
     const bool rope_3d,
+    const paddle::optional<paddle::Tensor>& rope_3d_delta,
     const int max_seq_len,
     cudaStream_t& stream,
     paddle::Tensor* qkv_out,
diff --git a/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_kernel.h b/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_kernel.h
index c9c3ff9e0b9..43a2d9543a0 100644
--- a/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_kernel.h
+++ b/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_kernel.h
@@ -36,6 +36,7 @@ void SpeculateWriteCacheWithRoPEKernel(
     const std::string& cache_quant_type_str,
     const bool use_neox_rotary_style,
     const bool rope_3d,
+    const paddle::optional<paddle::Tensor>& rope_3d_delta,
     const int max_seq_len,
     cudaStream_t& stream,
     paddle::Tensor* qkv_out,
diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_bfloat16_bfloat16_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_bfloat16_bfloat16_kernel.cu
index 915039908dc..78e415993aa 100644
--- a/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_bfloat16_bfloat16_kernel.cu
+++ b/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_bfloat16_bfloat16_kernel.cu
@@ -40,6 +40,7 @@ EncoderWriteCacheWithRopeKernel<paddle::bfloat16, paddle::bfloat16>(
     const int max_seq_len,
     const bool use_neox_style,
     const bool rope_3d,
+    const paddle::optional<paddle::Tensor>& rope_3d_delta,
     cudaStream_t& stream,
     paddle::Tensor* qkv_out,
     paddle::Tensor* key_cache_out,
diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_bfloat16_int_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_bfloat16_int_kernel.cu
index 3f3539b8a6e..c51d15e36c4 100644
--- a/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_bfloat16_int_kernel.cu
+++ b/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_bfloat16_int_kernel.cu
@@ -39,6 +39,7 @@ template void EncoderWriteCacheWithRopeKernel<paddle::bfloat16, int>(
     const int max_seq_len,
     const bool use_neox_style,
     const bool rope_3d,
+    const paddle::optional<paddle::Tensor>& rope_3d_delta,
     cudaStream_t& stream,
     paddle::Tensor* qkv_out,
     paddle::Tensor* key_cache_out,
diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_float16_float16_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_float16_float16_kernel.cu
index a559ec77f37..44650da68c7 100644
--- a/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_float16_float16_kernel.cu
+++ b/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_float16_float16_kernel.cu
@@ -39,6 +39,7 @@ template void EncoderWriteCacheWithRopeKernel<paddle::float16, paddle::float16>(
     const int max_seq_len,
     const bool use_neox_style,
     const bool rope_3d,
+    const paddle::optional<paddle::Tensor>& rope_3d_delta,
     cudaStream_t& stream,
     paddle::Tensor* qkv_out,
     paddle::Tensor* key_cache_out,
diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_float16_int_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_float16_int_kernel.cu
index 3318a36472b..12125da218e 100644
--- a/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_float16_int_kernel.cu
+++ b/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_float16_int_kernel.cu
@@ -39,6 +39,7 @@ template void EncoderWriteCacheWithRopeKernel<paddle::float16, int>(
     const int max_seq_len,
     const bool use_neox_style,
     const bool rope_3d,
+    const paddle::optional<paddle::Tensor>& rope_3d_delta,
     cudaStream_t& stream,
     paddle::Tensor* qkv_out,
     paddle::Tensor* key_cache_out,
diff --git a/custom_ops/gpu_ops/cpp_extensions.cc b/custom_ops/gpu_ops/cpp_extensions.cc
index 683ffe2db82..1eb02d6c642 100644
--- a/custom_ops/gpu_ops/cpp_extensions.cc
+++ b/custom_ops/gpu_ops/cpp_extensions.cc
@@ -110,6 +110,7 @@ std::vector<paddle::Tensor> AppendAttention(
     const paddle::optional<paddle::Tensor>& out_linear_smooths,
     const paddle::optional<paddle::Tensor>& mask_offset,
     const paddle::optional<paddle::Tensor>& kv_signal_data,
+    const paddle::optional<paddle::Tensor>& rope_3d_delta,
     const paddle::optional<paddle::Tensor>& q_norm_weight,
     const paddle::optional<paddle::Tensor>& k_norm_weight,
     const paddle::optional<paddle::Tensor>& sinks,
@@ -167,6 +168,7 @@ std::vector<paddle::Tensor> AppendAttentionWithOutput(
     const paddle::optional<paddle::Tensor>& out_linear_smooths,
     const paddle::optional<paddle::Tensor>& mask_offset,
     const paddle::optional<paddle::Tensor>& kv_signal_data,
+    const paddle::optional<paddle::Tensor>& rope_3d_delta,
     const paddle::optional<paddle::Tensor>& q_norm_weight,
     const paddle::optional<paddle::Tensor>& k_norm_weight,
     const paddle::optional<paddle::Tensor>& sinks,
diff --git a/custom_ops/gpu_ops/decoder_write_cache_with_rope.cu b/custom_ops/gpu_ops/decoder_write_cache_with_rope.cu
index 7878e9926c5..d2c91408dd8 100644
--- a/custom_ops/gpu_ops/decoder_write_cache_with_rope.cu
+++ b/custom_ops/gpu_ops/decoder_write_cache_with_rope.cu
@@ -109,6 +109,7 @@ std::vector<paddle::Tensor> DecoderWriteCacheWithRoPE(
               cache_quant_type_str,
               use_neox_rotary_style,
               rope_3d,
+              paddle::optional<paddle::Tensor>(),
               max_input_length,
               stream,
               const_cast<paddle::Tensor*>(&qkv),
@@ -138,6 +139,7 @@ std::vector<paddle::Tensor> DecoderWriteCacheWithRoPE(
               cache_quant_type_str,
               use_neox_rotary_style,
               rope_3d,
+              paddle::optional<paddle::Tensor>(),
               max_input_length,
               stream,
               const_cast<paddle::Tensor*>(&qkv),
@@ -173,6 +175,7 @@ std::vector<paddle::Tensor> DecoderWriteCacheWithRoPE(
               cache_quant_type_str,
               use_neox_rotary_style,
               rope_3d,
+              paddle::optional<paddle::Tensor>(),
               max_input_length,
               stream,
               const_cast<paddle::Tensor*>(&qkv),
@@ -201,6 +204,7 @@ std::vector<paddle::Tensor> DecoderWriteCacheWithRoPE(
               cache_quant_type_str,
               use_neox_rotary_style,
               rope_3d,
+              paddle::optional<paddle::Tensor>(),
               max_input_length,
               stream,
               const_cast<paddle::Tensor*>(&qkv),
diff --git a/fastdeploy/engine/common_engine.py b/fastdeploy/engine/common_engine.py
index 645f6524734..5c38c089077 100644
--- a/fastdeploy/engine/common_engine.py
+++ b/fastdeploy/engine/common_engine.py
@@ -978,6 +978,9 @@ def _fetch_request():
                             )
                             if finished_ids:
                                 for task in tasks:
+                                    # Fix
+                                    if task.request_id not in need_check_req_ids:
+                                        continue
                                     result = self.resource_manager.waiting_async_process(task)
                                     if result is None:
                                         self.scheduler.put_results(
diff --git a/fastdeploy/model_executor/forward_meta.py b/fastdeploy/model_executor/forward_meta.py
index 44cf528bed3..0e2b3e7576e 100644
--- a/fastdeploy/model_executor/forward_meta.py
+++ b/fastdeploy/model_executor/forward_meta.py
@@ -69,6 +69,8 @@ class ForwardMeta:
     ids_remove_padding: paddle.Tensor
     # Rotation position embedding
     rotary_embs: Optional[paddle.Tensor] = None
+    # Per-request offset for shared Ernie VL decode 3D RoPE.
+    rope_3d_delta: Optional[paddle.Tensor] = None
 
     # Use cuda graph in this step or not. Used to avoid run cuda graph when in dummy run or prefill stage.
     step_use_cudagraph: bool = False
diff --git a/fastdeploy/model_executor/layers/attention/append_attn_backend.py b/fastdeploy/model_executor/layers/attention/append_attn_backend.py
index 76de638bce6..61d1639fadc 100644
--- a/fastdeploy/model_executor/layers/attention/append_attn_backend.py
+++ b/fastdeploy/model_executor/layers/attention/append_attn_backend.py
@@ -442,6 +442,7 @@ def forward_mixed(
                 layer.linear_smooth,
                 forward_meta.attn_mask_offsets,
                 metadata.kv_signal_data_list[layer.layer_id],
+                forward_meta.rope_3d_delta,
                 q_norm_weight,
                 k_norm_weight,
                 getattr(layer, "sinks", None),
@@ -498,6 +499,7 @@ def forward_mixed(
                 layer.linear_smooth,
                 forward_meta.attn_mask_offsets,
                 metadata.kv_signal_data_list[layer.layer_id],
+                forward_meta.rope_3d_delta,
                 q_norm_weight,
                 k_norm_weight,
                 getattr(layer, "sinks", None),
diff --git a/fastdeploy/model_executor/layers/attention/flash_attn_backend.py b/fastdeploy/model_executor/layers/attention/flash_attn_backend.py
index f095980de9e..d4ab8f5f59b 100644
--- a/fastdeploy/model_executor/layers/attention/flash_attn_backend.py
+++ b/fastdeploy/model_executor/layers/attention/flash_attn_backend.py
@@ -598,6 +598,7 @@ def forward_mixed(
                 layer.linear_smooth,
                 forward_meta.attn_mask_offsets,
                 metadata.kv_signal_data_list[layer.layer_id],
+                forward_meta.rope_3d_delta,
                 q_norm_weight,
                 k_norm_weight,
                 getattr(layer, "sinks", None),
diff --git a/fastdeploy/model_executor/layers/attention/flash_mask_attn_backend.py b/fastdeploy/model_executor/layers/attention/flash_mask_attn_backend.py
index bdb018ec269..ee064e73e99 100644
--- a/fastdeploy/model_executor/layers/attention/flash_mask_attn_backend.py
+++ b/fastdeploy/model_executor/layers/attention/flash_mask_attn_backend.py
@@ -353,6 +353,7 @@ def forward_mixed(
             layer.linear_smooth,
             forward_meta.attn_mask_offsets,
             metadata.kv_signal_data_list[layer.layer_id],
+            forward_meta.rope_3d_delta,
             q_norm_weight,
             k_norm_weight,
             getattr(layer, "sinks", None),
diff --git a/fastdeploy/model_executor/layers/attention/ops/append_attention.py b/fastdeploy/model_executor/layers/attention/ops/append_attention.py
index 8b36ffa85b0..e59a6b6bd1e 100644
--- a/fastdeploy/model_executor/layers/attention/ops/append_attention.py
+++ b/fastdeploy/model_executor/layers/attention/ops/append_attention.py
@@ -63,6 +63,7 @@ def append_attention(
     linear_smooth: Optional[paddle.Tensor] = None,
     mask_offset: Optional[paddle.Tensor] = None,
     kv_signal_data: Optional[paddle.Tensor] = None,
+    rope_3d_delta: Optional[paddle.Tensor] = None,
     q_norm_weight: Optional[paddle.Tensor] = None,
     k_norm_weight: Optional[paddle.Tensor] = None,
     sinks: Optional[paddle.Tensor] = None,
@@ -126,6 +127,7 @@ def append_attention(
                 linear_smooth,
                 mask_offset,
                 kv_signal_data,
+                rope_3d_delta,
                 q_norm_weight,
                 k_norm_weight,
                 sinks,
@@ -185,6 +187,7 @@ def append_attention(
             linear_smooth,
             mask_offset,
             kv_signal_data,
+            rope_3d_delta,
             q_norm_weight,
             k_norm_weight,
             sinks,
@@ -254,6 +257,7 @@ def append_attention_with_output(
     linear_smooth: Optional[paddle.Tensor] = None,
     mask_offset: Optional[paddle.Tensor] = None,
     kv_signal_data: Optional[paddle.Tensor] = None,
+    rope_3d_delta: Optional[paddle.Tensor] = None,
     q_norm_weight: Optional[paddle.Tensor] = None,
     k_norm_weight: Optional[paddle.Tensor] = None,
     sinks: Optional[paddle.Tensor] = None,
@@ -314,6 +318,7 @@ def append_attention_with_output(
             linear_smooth,
             mask_offset,
             kv_signal_data,
+            rope_3d_delta,
             q_norm_weight,
             k_norm_weight,
             sinks,
diff --git a/fastdeploy/model_executor/layers/rotary_embedding.py b/fastdeploy/model_executor/layers/rotary_embedding.py
index 22942f10f2c..a204e4d2603 100644
--- a/fastdeploy/model_executor/layers/rotary_embedding.py
+++ b/fastdeploy/model_executor/layers/rotary_embedding.py
@@ -641,6 +641,37 @@ def __call__(self, position_ids, max_len_lst, cumsum_seqlens):
         return rot_emb_list
 
 
+def get_ernie_rope_3d_decode(
+    rotary_dim: int,
+    base: float,
+    partial_rotary_factor: float,
+    max_position: int,
+    freq_allocation: int,
+    rope_scaling: dict,
+) -> paddle.Tensor:
+    """Build shared Ernie VL decode RoPE for text positions."""
+    rotary_emb3d_layer = ErnieVlRotaryEmbedding3D(
+        rotary_dim, base, partial_rotary_factor, max_position, freq_allocation, rope_scaling
+    )
+    position_ids_3d = paddle.tile(
+        paddle.arange(max_position, dtype="float32").unsqueeze(-1),
+        [1, 3],
+    )
+    expand_pos = paddle.index_select(
+        position_ids_3d / partial_rotary_factor,
+        rotary_emb3d_layer.using_position_axis,
+        axis=-1,
+    )
+    inv_freq = 1.0 / (base ** (paddle.arange(0, rotary_dim, 2, dtype="float32") / rotary_dim))
+    freqs = expand_pos * inv_freq
+    cos_emb = paddle.cos(freqs).unsqueeze(0).unsqueeze(0).unsqueeze(3)
+    sin_emb = paddle.sin(freqs).unsqueeze(0).unsqueeze(0).unsqueeze(3)
+    rot_emb = paddle.concat([cos_emb, sin_emb], axis=0)
+    if current_platform.is_iluvatar():
+        rot_emb = paddle.stack([rot_emb, rot_emb], axis=-1).reshape([2, 1, max_position, 1, rotary_dim])
+    return rot_emb
+
+
 def get_rope_3d(
     rotary_dim: int,
     base: float,
diff --git a/fastdeploy/spec_decode/mtp.py b/fastdeploy/spec_decode/mtp.py
index 6c5149e253b..442f859625b 100644
--- a/fastdeploy/spec_decode/mtp.py
+++ b/fastdeploy/spec_decode/mtp.py
@@ -541,10 +541,16 @@ def insert_tasks_v1(
                 )
                 if self.use_attn_mask_offset:
                     inputs = request.multimodal_inputs
-                    self.model_inputs["attn_mask_offsets_full"][idx][0 : prefill_end_index - prefill_start_index] = (
-                        paddle.to_tensor(
-                            inputs["attention_mask_offset"][prefill_start_index:prefill_end_index], dtype="int32"
+                    attn_offset_len = prefill_end_index - prefill_start_index
+                    if inputs.get("attention_mask_offset", None) is None:
+                        attention_mask_offset_slice = np.arange(prefill_start_index, prefill_end_index, dtype=np.int32)
+                    else:
+                        attention_mask_offset_slice = np.asarray(
+                            inputs["attention_mask_offset"][prefill_start_index:prefill_end_index],
+                            dtype=np.int32,
                         )
+                    self.model_inputs["attn_mask_offsets_full"][idx][0:attn_offset_len] = paddle.to_tensor(
+                        attention_mask_offset_slice, dtype="int32"
                     )
                     # GPU don't need it anymore
                     # NOTE: XPU backend needs decoder attention mask offset; GPU backend does not use it
diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
index 9cbb72636ef..f1fd8dc9889 100644
--- a/fastdeploy/worker/gpu_model_runner.py
+++ b/fastdeploy/worker/gpu_model_runner.py
@@ -51,7 +51,10 @@
 from fastdeploy.model_executor.layers.moe.routing_indices_cache import (
     RoutingReplayManager,
 )
-from fastdeploy.model_executor.layers.rotary_embedding import get_rope_3d
+from fastdeploy.model_executor.layers.rotary_embedding import (
+    get_ernie_rope_3d_decode,
+    get_rope_3d,
+)
 from fastdeploy.model_executor.layers.sample.meta_data import SamplingMetadata
 from fastdeploy.model_executor.layers.sample.sampler import Sampler, SpeculativeSampler
 from fastdeploy.model_executor.model_loader import get_model_loader
@@ -211,6 +214,8 @@ def __init__(
         # Initialize input batch
         self.share_inputs = InputBatch(self.fd_config)
         self.share_inputs.init_share_inputs()
+        if getattr(self.share_inputs, "use_shared_ernie_decode_rope_3d", False):
+            self.share_inputs["rope_emb"][0] = self.prepare_shared_ernie_decode_rope3d()
         self.increment_value = (
             4 if not self.speculative_decoding else (self.speculative_config.num_speculative_tokens + 1) * 4
         )
@@ -799,16 +804,23 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int =
                 # rope 3d
                 if self.enable_mm:
                     position_ids = request.multimodal_inputs["position_ids"]
-                    rope_3d_position_ids["position_ids_idx"].append(idx)
-                    rope_3d_position_ids["position_ids_lst"].append(position_ids)
-                    rope_3d_position_ids["position_ids_offset"].append(
-                        len(position_ids) + rope_3d_position_ids["position_ids_offset"][-1]
-                    )
-
-                    if self.is_pooling_model:
-                        rope_3d_position_ids["max_tokens_lst"].append(0)
+                    if getattr(self.share_inputs, "use_shared_ernie_decode_rope_3d", False):
+                        if isinstance(position_ids, paddle.Tensor):
+                            rope_3d_delta = int(paddle.max(position_ids).item()) + 1 - len(position_ids)
+                        else:
+                            rope_3d_delta = int(np.max(position_ids)) + 1 - len(position_ids)
+                        async_set_value(self.share_inputs["rope_3d_delta"][idx : idx + 1], rope_3d_delta)
                     else:
-                        rope_3d_position_ids["max_tokens_lst"].append(request.get("max_tokens", 2048))
+                        rope_3d_position_ids["position_ids_idx"].append(idx)
+                        rope_3d_position_ids["position_ids_lst"].append(position_ids)
+                        rope_3d_position_ids["position_ids_offset"].append(
+                            len(position_ids) + rope_3d_position_ids["position_ids_offset"][-1]
+                        )
+
+                        if self.is_pooling_model:
+                            rope_3d_position_ids["max_tokens_lst"].append(0)
+                        else:
+                            rope_3d_position_ids["max_tokens_lst"].append(request.get("max_tokens", 2048))
 
                 # guided decoding
                 if (
@@ -1395,6 +1407,7 @@ def initialize_forward_meta(self, is_dummy_or_profile_run=False):
         self.forward_meta = ForwardMeta(
             ids_remove_padding=self.share_inputs["ids_remove_padding"],
             rotary_embs=self.share_inputs["rope_emb"],
+            rope_3d_delta=self.share_inputs.get("rope_3d_delta", None),
             attn_backend=self.attn_backends[0],
             decoder_batch_ids=self.share_inputs["decoder_batch_ids"],
             decoder_tile_ids_per_batch=self.share_inputs["decoder_tile_ids_per_batch"],
@@ -3172,6 +3185,19 @@ def _dummy_run_extract_vision_features(self):
                     window_size=-1,
                 )
 
+    @paddle.no_grad()
+    def prepare_shared_ernie_decode_rope3d(self) -> paddle.Tensor:
+        """prepare shared Ernie VL decode rope3d"""
+
+        return get_ernie_rope_3d_decode(
+            rotary_dim=self.model_config.head_dim,
+            partial_rotary_factor=1.0,
+            base=self.model_config.rope_theta,
+            max_position=self.model_config.max_model_len,
+            freq_allocation=getattr(self.model_config, "freq_allocation", 20),
+            rope_scaling=getattr(self.model_config, "rope_scaling", {}),
+        )
+
     @paddle.no_grad()
     def prepare_rope3d(
         self, position_ids: paddle.Tensor, max_len_lst: list[int], cumsum_seqlens: list[int]
diff --git a/fastdeploy/worker/input_batch.py b/fastdeploy/worker/input_batch.py
index 96b1694c895..6efd6801528 100644
--- a/fastdeploy/worker/input_batch.py
+++ b/fastdeploy/worker/input_batch.py
@@ -97,6 +97,18 @@ def __init__(self, fd_config: FDConfig) -> None:
         self.speculative_decoding = self.speculative_config.method is not None
         self.is_mm_model = self.model_config.enable_mm
         self.enable_mm = fd_config.enable_mm_runtime
+        model_type = getattr(self.model_config, "model_type", "")
+        if isinstance(model_type, list):
+            model_type = str(model_type[0]) if model_type else ""
+        model_type = model_type.lower()
+        self.use_shared_ernie_decode_rope_3d = (
+            self.enable_mm
+            and fd_config.enable_rope_3d_runtime
+            and "ernie" in model_type
+            and "qwen" not in model_type
+            and "paddleocr" not in model_type
+            and self.scheduler_config.splitwise_role == "decode"
+        )
         self.enable_expert_parallel = fd_config.parallel_config.enable_expert_parallel
         self.index_to_batch_id = {}
         self.enable_pd_reorder = False
@@ -343,9 +355,10 @@ def init_share_inputs(self):
             else:  # neox style = False
                 rope_head_dim = head_dim // 2
 
+            rope_batch_size = 1 if self.use_shared_ernie_decode_rope_3d else max_num_seqs
             self.rope_emb = paddle.full(
                 shape=[
-                    max_num_seqs,
+                    rope_batch_size,
                     2,
                     1,
                     self.model_config.max_model_len,
@@ -355,6 +368,8 @@ def init_share_inputs(self):
                 fill_value=0,
                 dtype="float32",
             )
+            if self.use_shared_ernie_decode_rope_3d:
+                self.rope_3d_delta = paddle.full([max_num_seqs], 0, dtype="int32")
             self.image_features = None  # Built before the forward
             self.image_grid_thws = None
             self.image_features_list = None
@@ -484,7 +499,10 @@ def swap_data(tensor, idx1, idx2):
                     self.image_features_list[i2],
                     self.image_features_list[i1],
                 )
-            swap_data(self.share_inputs["rope_emb"], i1, i2)
+            if not self.use_shared_ernie_decode_rope_3d:
+                swap_data(self.rope_emb, i1, i2)
+            else:
+                swap_data(self.rope_3d_delta, i1, i2)
             swap_data(self.decode_states, i1, i2)
             swap_data(self.attn_mask_offsets_full, i1, i2)
         # Swap mask rollback
@@ -686,18 +704,21 @@ def reset_share_inputs(self):
                 else:
                     rope_head_dim = head_dim // 2
 
-                self.rope_emb = paddle.full(
-                    shape=[
-                        max_num_seqs,
-                        2,
-                        1,
-                        self.model_config.max_model_len,
-                        1,
-                        rope_head_dim,
-                    ],
-                    fill_value=0,
-                    dtype="float32",
-                )
+                if self.use_shared_ernie_decode_rope_3d:
+                    fill_paddle_tensor(self, "rope_3d_delta", 0)
+                else:
+                    self.rope_emb = paddle.full(
+                        shape=[
+                            max_num_seqs,
+                            2,
+                            1,
+                            self.model_config.max_model_len,
+                            1,
+                            rope_head_dim,
+                        ],
+                        fill_value=0,
+                        dtype="float32",
+                    )
                 self.image_features = None
                 self.image_grid_thws = None
                 self.image_features_list = None
diff --git a/tests/deterministic/test_c16_warp1_4_determinism.py b/tests/deterministic/test_c16_warp1_4_determinism.py
index 1cd260daacc..6d7f69f9c58 100644
--- a/tests/deterministic/test_c16_warp1_4_determinism.py
+++ b/tests/deterministic/test_c16_warp1_4_determinism.py
@@ -350,6 +350,7 @@ def run_c16_warp14_decoder_test(
         None,
         None,
         None,
+        None,
         1e-6,
         compute_type,
         "none",
@@ -460,6 +461,7 @@ def run_c16_warp14_decoder_test(
         None,
         None,
         None,
+        None,
         1e-6,
         compute_type,
         "none",
@@ -545,6 +547,7 @@ def run_c16_warp14_decoder_test(
             None,
             None,
             None,
+            None,
             1e-6,
             compute_type,
             "none",
diff --git a/tests/layers/test_append_attention.py b/tests/layers/test_append_attention.py
index bc9e1bd31d8..f4813a1ce5b 100644
--- a/tests/layers/test_append_attention.py
+++ b/tests/layers/test_append_attention.py
@@ -703,6 +703,7 @@ def cmp_append_attention(self, naive_cache_k=None, naive_cache_v=None, attn_mask
                 None,  # linear_smooth
                 self.mask_offset,  # mask_offset
                 None,  # kv_signal_data
+                None,  # rope_3d_delta
                 q_norm_weight,  # q_norm_weight
                 k_norm_weight,  # k_norm_weight
                 sinks,  # sinks
@@ -768,6 +769,7 @@ def cmp_append_attention(self, naive_cache_k=None, naive_cache_v=None, attn_mask
                 None,  # linear_smooth
                 self.mask_offset,  # mask_offset
                 None,  # kv_signal_data
+                None,  # rope_3d_delta
                 q_norm_weight,  # q_norm_weight
                 k_norm_weight,  # k_norm_weight
                 sinks,  # sinks
diff --git a/tests/layers/test_append_attention_with_output.py b/tests/layers/test_append_attention_with_output.py
index 6c15de17ccc..d5ced4c62b9 100644
--- a/tests/layers/test_append_attention_with_output.py
+++ b/tests/layers/test_append_attention_with_output.py
@@ -524,6 +524,7 @@ def cmp_append_attention(self, naive_cache_k=None, naive_cache_v=None, attn_mask
                 None,  # linear_smooth
                 self.mask_offset,  # mask_offset
                 None,  # kv_signal_data
+                None,  # rope_3d_delta
                 q_norm_weight,  # q_norm_weight
                 k_norm_weight,  # k_norm_weight
                 None,  # sinks
diff --git a/tests/operators/attention/test_decode_unified_attention_c16.py b/tests/operators/attention/test_decode_unified_attention_c16.py
index 0d17d17ccd6..49cbe770ba2 100644
--- a/tests/operators/attention/test_decode_unified_attention_c16.py
+++ b/tests/operators/attention/test_decode_unified_attention_c16.py
@@ -355,6 +355,7 @@ def run_append_attention(
             None,  # linear_smooth
             None,  # mask_offset
             None,  # kv_signal_data
+            None,  # rope_3d_delta
             None,  # q_norm_weight
             None,  # k_norm_weight
             None,  # sinks
diff --git a/tests/operators/attention/test_decode_unified_attention_c8.py b/tests/operators/attention/test_decode_unified_attention_c8.py
index d5ec0e5354c..3eed3145f5f 100644
--- a/tests/operators/attention/test_decode_unified_attention_c8.py
+++ b/tests/operators/attention/test_decode_unified_attention_c8.py
@@ -356,6 +356,7 @@ def append_attention_with_args(
             None,  # linear_smooth
             None,  # mask_offset
             None,  # kv_signal_data
+            None,  # rope_3d_delta
             self.q_norm_weight,
             self.k_norm_weight,
             None,  # sinks
diff --git a/tests/operators/test_tree_mask.py b/tests/operators/test_tree_mask.py
index b05f173fce2..a75b58fb77e 100644
--- a/tests/operators/test_tree_mask.py
+++ b/tests/operators/test_tree_mask.py
@@ -309,6 +309,7 @@ def run_append_c16_attention(
                 None,  # linear_smooth
                 mask_offset,  # mask_offset
                 None,  # kv_signal_data
+                None,  # rope_3d_delta
                 self.q_norm_weight_tensor if use_qknorm else None,  # q_norm_weight
                 self.k_norm_weight_tensor if use_qknorm else None,  # k_norm_weight
                 None,  # sinks