diff --git a/custom_ops/gpu_ops/append_attention.cu b/custom_ops/gpu_ops/append_attention.cu index c1586945cc5..7603c255021 100644 --- a/custom_ops/gpu_ops/append_attention.cu +++ b/custom_ops/gpu_ops/append_attention.cu @@ -72,6 +72,7 @@ void AppendAttentionKernel( const paddle::optional& out_linear_shifts, const paddle::optional& out_linear_smooths, const paddle::optional& kv_signal_data, + const paddle::optional& rope_3d_delta, const paddle::optional& q_norm_weight, const paddle::optional& k_norm_weight, const paddle::optional& sinks, @@ -213,6 +214,7 @@ void AppendAttentionKernel( max_input_length, use_neox_rotary_style, rope_3d, + rope_3d_delta, main_stream, &qkv_out, const_cast(&key_cache), @@ -310,6 +312,7 @@ void AppendAttentionKernel( cache_quant_type_str, use_neox_rotary_style, rope_3d, + rope_3d_delta, max_input_length, exec_stream, &qkv_out, @@ -337,6 +340,7 @@ void AppendAttentionKernel( cache_quant_type_str, use_neox_rotary_style, rope_3d, + rope_3d_delta, max_input_length, exec_stream, &qkv_out, @@ -365,6 +369,7 @@ void AppendAttentionKernel( cache_quant_type_str, use_neox_rotary_style, rope_3d, + rope_3d_delta, max_input_length, exec_stream, &qkv_out, @@ -391,6 +396,7 @@ void AppendAttentionKernel( cache_quant_type_str, use_neox_rotary_style, rope_3d, + rope_3d_delta, max_input_length, exec_stream, &qkv_out, @@ -485,6 +491,7 @@ std::vector AppendAttention( const paddle::optional& out_linear_smooths, const paddle::optional& mask_offset, const paddle::optional& kv_signal_data, + const paddle::optional& rope_3d_delta, const paddle::optional& q_norm_weight, const paddle::optional& k_norm_weight, const paddle::optional& sinks, @@ -619,6 +626,7 @@ std::vector AppendAttention( out_linear_shifts, out_linear_smooths, kv_signal_data, + rope_3d_delta, q_norm_weight, k_norm_weight, sinks, @@ -697,6 +705,7 @@ std::vector AppendAttentionWithOutput( const paddle::optional& out_linear_smooths, const paddle::optional& mask_offset, const paddle::optional& kv_signal_data, + const paddle::optional& rope_3d_delta, const paddle::optional& q_norm_weight, const paddle::optional& k_norm_weight, const paddle::optional& sinks, @@ -777,6 +786,7 @@ std::vector AppendAttentionWithOutput( out_linear_shifts, out_linear_smooths, kv_signal_data, + rope_3d_delta, q_norm_weight, k_norm_weight, sinks, @@ -868,6 +878,7 @@ std::vector> AppendAttentionInferShape( const paddle::optional>& out_linear_smooths_shape, const paddle::optional>& mask_offset_shape, const paddle::optional>& kv_signal_data_shape, + const paddle::optional>& rope_3d_delta_shape, const paddle::optional>& q_norm_weight_shape, const paddle::optional>& k_norm_weight_shape, const paddle::optional>& sinks_shape, @@ -934,6 +945,7 @@ std::vector AppendAttentionInferDtype( const paddle::optional& out_linear_smooths_dtype, const paddle::optional& mask_offset_dtype, const paddle::optional& kv_signal_data_dtype, + const paddle::optional& rope_3d_delta_dtype, const paddle::optional& q_norm_weight_dtype, const paddle::optional& k_norm_weight_dtype, const paddle::optional& sinks_dtype, @@ -1021,6 +1033,7 @@ std::vector> AppendAttentionWithOutputInferShape( const paddle::optional>& out_linear_smooths_shape, const paddle::optional>& mask_offset_shape, const paddle::optional>& kv_signal_data_shape, + const paddle::optional>& rope_3d_delta_shape, const paddle::optional>& q_norm_weight_shape, const paddle::optional>& k_norm_weight_shape, const paddle::optional>& sinks_shape, @@ -1080,6 +1093,7 @@ std::vector AppendAttentionWithOutputInferDtype( const paddle::optional& out_linear_smooths_dtype, const paddle::optional& mask_offset_dtype, const paddle::optional& kv_signal_data_dtype, + const paddle::optional& rope_3d_delta_dtype, const paddle::optional& q_norm_weight_dtype, const paddle::optional& k_norm_weight_dtype, const paddle::optional& sinks_dtype, @@ -1138,6 +1152,7 @@ PD_BUILD_STATIC_OP(append_attention) paddle::Optional("out_linear_smooths"), paddle::Optional("mask_offset"), paddle::Optional("kv_signal_data"), + paddle::Optional("rope_3d_delta"), paddle::Optional("q_norm_weight"), paddle::Optional("k_norm_weight"), paddle::Optional("sinks")}) @@ -1201,6 +1216,7 @@ PD_BUILD_STATIC_OP(append_attention_with_output) paddle::Optional("out_linear_smooths"), paddle::Optional("mask_offset"), paddle::Optional("kv_signal_data"), + paddle::Optional("rope_3d_delta"), paddle::Optional("q_norm_weight"), paddle::Optional("k_norm_weight"), paddle::Optional("sinks")}) diff --git a/custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_impl.cuh b/custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_impl.cuh index 7dd4612c529..85fe0b7be4b 100644 --- a/custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_impl.cuh +++ b/custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_impl.cuh @@ -92,6 +92,7 @@ __global__ void append_decode_cache_T_rope_qk_norm_kernel( const uint32_t elem_cnt, const int kv_num_heads, const bool rope_3d, + const int* rope_3d_delta, const float* q_norm_weight, const float* k_norm_weight, const float rms_norm_eps) { @@ -143,8 +144,15 @@ __global__ void append_decode_cache_T_rope_qk_norm_kernel( if (hi < num_heads + kv_num_heads) { // q k rope const uint32_t emb_idx = write_seq_id * half_head_size + h_bias / 2; - uint32_t new_emb_idx = - rope_3d ? emb_idx + ori_bi * max_seq_len * head_size : emb_idx; + uint32_t new_emb_idx = emb_idx; + if (rope_3d) { + if (rope_3d_delta) { + const int rope_pos = write_seq_id + rope_3d_delta[ori_bi]; + new_emb_idx = rope_pos * half_head_size + h_bias / 2; + } else { + new_emb_idx = emb_idx + ori_bi * max_seq_len * head_size; + } + } Load(&cos_emb[new_emb_idx], &cos_emb_vec); Load(&sin_emb[new_emb_idx], &sin_emb_vec); } @@ -237,7 +245,8 @@ __global__ void append_decode_cache_T_rope_kernel( const int block_size, const uint32_t elem_cnt, const int kv_num_heads, - const bool rope_3d) { + const bool rope_3d, + const int* rope_3d_delta) { using LoadT = AlignedVector; using LoadBiasT = AlignedVector; using LoadKVT = AlignedVector; @@ -282,8 +291,15 @@ __global__ void append_decode_cache_T_rope_kernel( if (hi < num_heads + kv_num_heads) { // q k rope const uint32_t emb_idx = write_seq_id * half_head_size + h_bias / 2; - uint32_t new_emb_idx = - rope_3d ? emb_idx + ori_bi * max_seq_len * head_size : emb_idx; + uint32_t new_emb_idx = emb_idx; + if (rope_3d) { + if (rope_3d_delta) { + const int rope_pos = write_seq_id + rope_3d_delta[ori_bi]; + new_emb_idx = rope_pos * half_head_size + h_bias / 2; + } else { + new_emb_idx = emb_idx + ori_bi * max_seq_len * head_size; + } + } Load(&cos_emb[new_emb_idx], &cos_emb_vec); Load(&sin_emb[new_emb_idx], &sin_emb_vec); } @@ -1221,6 +1237,7 @@ __global__ void append_decode_cache_int8_rope_qk_norm_kernel( const float min_bound, const int kv_num_heads, const bool rope_3d, + const int* rope_3d_delta, const float rms_norm_eps) { static_assert(HeadDim == 128, "just support HeadDim be 128 now!"); static_assert(VecSize == 4, "just support VecSize be 4 now, 32 * 4!"); @@ -1268,8 +1285,15 @@ __global__ void append_decode_cache_int8_rope_qk_norm_kernel( Load(&qkv_now[bias_idx], &src_vec); // q rope const uint32_t emb_idx = write_seq_id * half_head_size + head_bias / 2; - const uint32_t new_emb_idx = - rope_3d ? emb_idx + bid * max_seq_len * HeadDim : emb_idx; + uint32_t new_emb_idx = emb_idx; + if (rope_3d) { + if (rope_3d_delta) { + const int rope_pos = write_seq_id + rope_3d_delta[bid]; + new_emb_idx = rope_pos * half_head_size + head_bias / 2; + } else { + new_emb_idx = emb_idx + bid * max_seq_len * HeadDim; + } + } Load(&cos_emb[new_emb_idx], &cos_emb_vec); Load(&sin_emb[new_emb_idx], &sin_emb_vec); #pragma unroll @@ -1363,8 +1387,15 @@ __global__ void append_decode_cache_int8_rope_qk_norm_kernel( const int v_head_idx = head_idx - num_heads - kv_num_heads; if (head_idx < num_heads + kv_num_heads) { const uint32_t emb_idx = write_seq_id * half_head_size + head_bias / 2; - const uint32_t new_emb_idx = - rope_3d ? emb_idx + bid * max_seq_len * HeadDim : emb_idx; + uint32_t new_emb_idx = emb_idx; + if (rope_3d) { + if (rope_3d_delta) { + const int rope_pos = write_seq_id + rope_3d_delta[bid]; + new_emb_idx = rope_pos * half_head_size + head_bias / 2; + } else { + new_emb_idx = emb_idx + bid * max_seq_len * HeadDim; + } + } Load(&cos_emb[new_emb_idx], &cos_emb_vec1); Load(&cos_emb[new_emb_idx + 4], &cos_emb_vec2); Load(&sin_emb[new_emb_idx], &sin_emb_vec1); @@ -1533,7 +1564,8 @@ __global__ void append_decode_cache_int8_rope_kernel( const float max_bound, const float min_bound, const int kv_num_heads, - const bool rope_3d) { + const bool rope_3d, + const int* rope_3d_delta) { static_assert(HeadDim == 128, "just support HeadDim be 128 now!"); static_assert(VecSize == 4, "just support VecSize be 4 now, 32 * 4!"); constexpr int NUM_WARPS = 4; @@ -1564,7 +1596,13 @@ __global__ void append_decode_cache_int8_rope_kernel( qkv_out + start_token_idx * hidden_size + head_idx * HeadDim; uint32_t emb_offset = write_seq_id * half_head_size; - emb_offset += rope_3d ? bid * max_seq_len * HeadDim : 0; + if (rope_3d) { + if (rope_3d_delta) { + emb_offset = (write_seq_id + rope_3d_delta[bid]) * half_head_size; + } else { + emb_offset += bid * max_seq_len * HeadDim; + } + } apply_rope(qkv_now, cos_emb + emb_offset, sin_emb + emb_offset, @@ -1634,8 +1672,15 @@ __global__ void append_decode_cache_int8_rope_kernel( cache_v_scale + v_head_idx * HeadDim + head_bias; if (head_idx < num_heads + kv_num_heads) { const uint32_t emb_idx = write_seq_id * half_head_size + head_bias / 2; - uint32_t new_emb_idx = - rope_3d ? emb_idx + bid * max_seq_len * HeadDim : emb_idx; + uint32_t new_emb_idx = emb_idx; + if (rope_3d) { + if (rope_3d_delta) { + const int rope_pos = write_seq_id + rope_3d_delta[bid]; + new_emb_idx = rope_pos * half_head_size + head_bias / 2; + } else { + new_emb_idx = emb_idx + bid * max_seq_len * HeadDim; + } + } Load(&cos_emb[new_emb_idx], &cos_emb_vec1); Load(&cos_emb[new_emb_idx + 4], &cos_emb_vec2); Load(&sin_emb[new_emb_idx], &sin_emb_vec1); diff --git a/custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_kernel.cu b/custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_kernel.cu index e25816fcbb3..062e911e886 100644 --- a/custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_kernel.cu +++ b/custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_kernel.cu @@ -38,6 +38,7 @@ void append_decode_cache_rope_qk_norm(const QKV_TYPE* qkv, const cudaStream_t& stream, const bool use_neox_style, const bool rope_3d, + const int* rope_3d_delta, const float* q_norm_weight, const float* k_norm_weight, const float rms_norm_eps) { @@ -76,6 +77,7 @@ void append_decode_cache_rope_qk_norm(const QKV_TYPE* qkv, elem_nums, kv_num_heads, rope_3d, + rope_3d_delta, q_norm_weight, k_norm_weight, rms_norm_eps); @@ -104,7 +106,8 @@ void append_decode_cache_rope(const QKV_TYPE* qkv, const int bsz, const cudaStream_t& stream, const bool use_neox_style, - const bool rope_3d) { + const bool rope_3d, + const int* rope_3d_delta) { const uint32_t elem_nums = use_neox_style ? bsz * (num_heads + 2 * kv_num_heads) * dim_head / 2 : bsz * (num_heads + 2 * kv_num_heads) * dim_head; @@ -255,7 +258,8 @@ void append_decode_cache_rope(const QKV_TYPE* qkv, block_size, elem_nums, kv_num_heads, - rope_3d); + rope_3d, + rope_3d_delta); } } } @@ -288,7 +292,8 @@ void append_decode_cache_int8_rope(const QKV_TYPE* qkv, const int bsz, const cudaStream_t& stream, const bool use_neox_style, - const bool rope_3d) { + const bool rope_3d, + const int* rope_3d_delta) { constexpr int num_warps = 4; const int all_warps = ((num_heads + 2 * kv_num_heads) + num_warps - 1) / num_warps * num_warps; @@ -427,7 +432,8 @@ void append_decode_cache_int8_rope(const QKV_TYPE* qkv, 127.0f, -127.0f, kv_num_heads, - rope_3d); + rope_3d, + rope_3d_delta); } } } @@ -615,6 +621,7 @@ void DecoderWriteCacheWithRoPEKernel( const std::string& cache_quant_type_str, const bool use_neox_rotary_style, const bool rope_3d, + const paddle::optional& rope_3d_delta, const int max_seq_len, cudaStream_t& stream, paddle::Tensor* qkv_out, @@ -659,6 +666,26 @@ void DecoderWriteCacheWithRoPEKernel( } } + if (rope_3d_delta && use_neox_rotary_style) { + PD_THROW("rope_3d_delta is only supported for non-NeoX Ernie decode RoPE."); + } + if (rope_3d_delta && cache_quant_type_str != "none" && + cache_quant_type_str != "cache_fp8" && + cache_quant_type_str != "block_wise_fp8") { + PD_THROW( + "rope_3d_delta is only supported when cache_quant_type is none, " + "cache_fp8 or block_wise_fp8."); + } + if (rope_3d_delta && qkv_out_scales) { + PD_THROW("rope_3d_delta is not supported for quantized qkv decode path."); + } + if (rope_3d_delta && cache_quant_type_str == "block_wise_fp8" && + !q_norm_weight && !k_norm_weight && use_neox_rotary_style) { + PD_THROW( + "rope_3d_delta is not supported for NeoX block_wise_fp8 decode " + "RoPE."); + } + if (q_norm_weight && k_norm_weight) { if (cache_quant_type_str == "none") { append_decode_cache_rope_qk_norm( @@ -686,6 +713,7 @@ void DecoderWriteCacheWithRoPEKernel( stream, use_neox_rotary_style, rope_3d, + rope_3d_delta ? rope_3d_delta.get().data() : nullptr, q_norm_weight ? q_norm_weight.get().data() : nullptr, k_norm_weight ? k_norm_weight.get().data() : nullptr, rms_norm_eps); @@ -731,6 +759,7 @@ void DecoderWriteCacheWithRoPEKernel( -127.0f, kv_num_heads, rope_3d, + rope_3d_delta ? rope_3d_delta.get().data() : nullptr, rms_norm_eps); } else if ((cache_quant_type_str == "cache_fp8")) { constexpr int num_warps = 4; @@ -774,6 +803,7 @@ void DecoderWriteCacheWithRoPEKernel( -127.0f, kv_num_heads, rope_3d, + rope_3d_delta ? rope_3d_delta.get().data() : nullptr, rms_norm_eps); } else { PD_THROW( @@ -807,7 +837,8 @@ void DecoderWriteCacheWithRoPEKernel( bsz, stream, use_neox_rotary_style, - rope_3d); + rope_3d, + rope_3d_delta ? rope_3d_delta.get().data() : nullptr); } else if (cache_quant_type_str == "cache_int8") { bool is_scale_channel_wise = false; if (cache_k_scale && @@ -849,7 +880,8 @@ void DecoderWriteCacheWithRoPEKernel( bsz, stream, use_neox_rotary_style, - rope_3d); + rope_3d, + nullptr); } else { append_decode_cache_int8_rope() : nullptr); } else if (cache_quant_type_str == "block_wise_fp8") { constexpr int num_warps = 4; const int all_warps = ((num_heads + 2 * kv_num_heads) + num_warps - 1) / @@ -1003,6 +1037,7 @@ void DecoderWriteCacheWithRoPEKernel( -127.0f, kv_num_heads, rope_3d, + rope_3d_delta ? rope_3d_delta.get().data() : nullptr, rms_norm_eps); } } else if (cache_quant_type_str == "cache_int4_zp") { @@ -1045,8 +1080,8 @@ void DecoderWriteCacheWithRoPEKernel( rope_3d); } else { PD_THROW( - "cache_quant_type_str should be one of [none, cache_int8, cache_fp8 " - "cache_int4_zp]"); + "cache_quant_type_str should be one of [none, cache_int8, cache_fp8, " + "block_wise_fp8, cache_int4_zp]"); } } } @@ -1070,6 +1105,7 @@ template void DecoderWriteCacheWithRoPEKernel( const std::string& cache_quant_type_str, const bool use_neox_rotary_style, const bool rope_3d, + const paddle::optional& rope_3d_delta, const int max_seq_len, cudaStream_t& stream, paddle::Tensor* qkv_out, @@ -1099,6 +1135,7 @@ DecoderWriteCacheWithRoPEKernel( const std::string& cache_quant_type_str, const bool use_neox_rotary_style, const bool rope_3d, + const paddle::optional& rope_3d_delta, const int max_seq_len, cudaStream_t& stream, paddle::Tensor* qkv_out, @@ -1127,6 +1164,7 @@ template void DecoderWriteCacheWithRoPEKernel( const std::string& cache_quant_type_str, const bool use_neox_rotary_style, const bool rope_3d, + const paddle::optional& rope_3d_delta, const int max_seq_len, cudaStream_t& stream, paddle::Tensor* qkv_out, @@ -1155,6 +1193,7 @@ template void DecoderWriteCacheWithRoPEKernel( const std::string& cache_quant_type_str, const bool use_neox_rotary_style, const bool rope_3d, + const paddle::optional& rope_3d_delta, const int max_seq_len, cudaStream_t& stream, paddle::Tensor* qkv_out, @@ -1183,6 +1222,7 @@ template void DecoderWriteCacheWithRoPEKernel( const std::string& cache_quant_type_str, const bool use_neox_rotary_style, const bool rope_3d, + const paddle::optional& rope_3d_delta, const int max_seq_len, cudaStream_t& stream, paddle::Tensor* qkv_out, @@ -1212,6 +1252,7 @@ DecoderWriteCacheWithRoPEKernel( const std::string& cache_quant_type_str, const bool use_neox_rotary_style, const bool rope_3d, + const paddle::optional& rope_3d_delta, const int max_seq_len, cudaStream_t& stream, paddle::Tensor* qkv_out, @@ -1240,6 +1281,7 @@ template void DecoderWriteCacheWithRoPEKernel( const std::string& cache_quant_type_str, const bool use_neox_rotary_style, const bool rope_3d, + const paddle::optional& rope_3d_delta, const int max_seq_len, cudaStream_t& stream, paddle::Tensor* qkv_out, @@ -1269,6 +1311,7 @@ DecoderWriteCacheWithRoPEKernel( const std::string& cache_quant_type_str, const bool use_neox_rotary_style, const bool rope_3d, + const paddle::optional& rope_3d_delta, const int max_seq_len, cudaStream_t& stream, paddle::Tensor* qkv_out, diff --git a/custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_kernel.h b/custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_kernel.h index 2acb4f8293b..f20d3e0dd74 100644 --- a/custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_kernel.h +++ b/custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_kernel.h @@ -35,6 +35,7 @@ void DecoderWriteCacheWithRoPEKernel( const std::string& cache_quant_type_str, const bool use_neox_rotary_style, const bool rope_3d, + const paddle::optional& rope_3d_delta, const int max_seq_len, cudaStream_t& stream, paddle::Tensor* qkv_out, diff --git a/custom_ops/gpu_ops/append_attn/encoder_write_cache_with_rope_impl.cuh b/custom_ops/gpu_ops/append_attn/encoder_write_cache_with_rope_impl.cuh index 60d5d34bf48..faaeb1491c6 100644 --- a/custom_ops/gpu_ops/append_attn/encoder_write_cache_with_rope_impl.cuh +++ b/custom_ops/gpu_ops/append_attn/encoder_write_cache_with_rope_impl.cuh @@ -34,7 +34,8 @@ __global__ void IntVariableLengthRotaryKernel( const int num_head, const int seq_len, const int last_dim, - const bool rope_3d) { + const bool rope_3d, + const int *rope_3d_delta) { using LoadT = AlignedVector; using LoadBiasT = AlignedVector; using LoadScaleT = AlignedVector; @@ -70,7 +71,13 @@ __global__ void IntVariableLengthRotaryKernel( (token_idx - cu_seqlens_q[ori_bi]) + seq_lens_decoder[ori_bi]; const int emb_idx = ori_seq_id * half_lastdim + h_bias / 2; - int new_emb_idx = rope_3d ? emb_idx + ori_bi * last_dim * seq_len : emb_idx; + int new_emb_idx = emb_idx; + if (rope_3d_delta) { + new_emb_idx = + (ori_seq_id + rope_3d_delta[ori_bi]) * half_lastdim + h_bias / 2; + } else if (rope_3d) { + new_emb_idx = emb_idx + ori_bi * last_dim * seq_len; + } const int bias_idx = qkv_id * hidden_size + hi * last_dim + h_bias; const int64_t base_idx = token_idx * 3 * hidden_size + bias_idx; Load(&qkv[base_idx], &src_vec); @@ -128,7 +135,8 @@ __global__ void VariableLengthRotaryKernel( const int num_head, const int seq_len, const int last_dim, - const bool rope_3d) { + const bool rope_3d, + const int *rope_3d_delta) { using LoadT = AlignedVector; constexpr int HalfVecSize = VecSize / 2; using LoadEmbT = AlignedVector; @@ -160,7 +168,13 @@ __global__ void VariableLengthRotaryKernel( (token_idx - cu_seqlens_q[ori_bi]) + seq_lens_decoder[ori_bi]; const int emb_idx = ori_seq_id * half_lastdim + h_bias / 2; - int new_emb_idx = rope_3d ? emb_idx + ori_bi * last_dim * seq_len : emb_idx; + int new_emb_idx = emb_idx; + if (rope_3d_delta) { + new_emb_idx = + (ori_seq_id + rope_3d_delta[ori_bi]) * half_lastdim + h_bias / 2; + } else if (rope_3d) { + new_emb_idx = emb_idx + ori_bi * last_dim * seq_len; + } const int64_t base_idx = token_idx * 3 * hidden_size + qkv_id * hidden_size + hi * last_dim + h_bias; Load(&qkv[base_idx], &src_vec); @@ -202,7 +216,8 @@ __global__ void IntNeoxVariableLengthRotaryKernel( const int num_head, const int seq_len, const int last_dim, - const bool rope_3d) { + const bool rope_3d, + const int *rope_3d_delta) { using LoadT = AlignedVector; using LoadBiasT = AlignedVector; using LoadScaleT = AlignedVector; @@ -307,7 +322,8 @@ __global__ void NeoxVariableLengthRotaryKernel( const int num_head, const int seq_len, const int last_dim, - const bool rope_3d) { + const bool rope_3d, + const int *rope_3d_delta) { using LoadT = AlignedVector; using LoadEmbT = AlignedVector; LoadT left_vec; @@ -389,7 +405,8 @@ __global__ void IntGQAVariableLengthRotaryKernel( const int kv_num_head, const int seq_len, const int last_dim, - const bool rope_3d) { + const bool rope_3d, + const int *rope_3d_delta) { using LoadT = AlignedVector; using LoadBiasT = AlignedVector; using LoadScaleT = AlignedVector; @@ -422,8 +439,14 @@ __global__ void IntGQAVariableLengthRotaryKernel( (token_idx - cu_seqlens_q[ori_bi]) + seq_lens_decoder[ori_bi]; const int64_t emb_idx = ori_seq_id * half_lastdim + h_bias / 2; - int64_t new_emb_idx = - rope_3d ? emb_idx + ori_bi * last_dim * seq_len : emb_idx; + int64_t new_emb_idx = emb_idx; + if (rope_3d_delta) { + new_emb_idx = static_cast(ori_seq_id + rope_3d_delta[ori_bi]) * + half_lastdim + + h_bias / 2; + } else if (rope_3d) { + new_emb_idx = emb_idx + ori_bi * last_dim * seq_len; + } const int64_t bias_idx = hi * last_dim + h_bias; const int64_t base_idx = token_idx * offset + bias_idx; Load(&qkv[base_idx], &src_vec); @@ -483,6 +506,7 @@ __global__ void GQAVariableLengthRotaryQKNormKernel( const int seq_len, const int last_dim, const bool rope_3d, + const int *rope_3d_delta, const float *q_norm_weight, const float *k_norm_weight, const float rms_norm_eps) { @@ -521,8 +545,14 @@ __global__ void GQAVariableLengthRotaryQKNormKernel( h_bias; Load(&qkv[base_idx], &src_vec); - int64_t new_emb_idx = - rope_3d ? emb_idx + ori_bi * last_dim * seq_len : emb_idx; + int64_t new_emb_idx = emb_idx; + if (rope_3d_delta) { + new_emb_idx = static_cast(ori_seq_id + rope_3d_delta[ori_bi]) * + half_lastdim + + h_bias / 2; + } else if (rope_3d) { + new_emb_idx = emb_idx + ori_bi * last_dim * seq_len; + } Load(&cos_emb[new_emb_idx], &cos_emb_vec); Load(&sin_emb[new_emb_idx], &sin_emb_vec); @@ -580,7 +610,8 @@ __global__ void GQAVariableLengthRotaryKernel(const T *qkv, const int kv_num_head, const int seq_len, const int last_dim, - const bool rope_3d) { + const bool rope_3d, + const int *rope_3d_delta) { using LoadT = AlignedVector; constexpr int HalfVecSize = VecSize / 2; using LoadEmbT = AlignedVector; @@ -615,8 +646,14 @@ __global__ void GQAVariableLengthRotaryKernel(const T *qkv, h_bias; Load(&qkv[base_idx], &src_vec); - int64_t new_emb_idx = - rope_3d ? emb_idx + ori_bi * last_dim * seq_len : emb_idx; + int64_t new_emb_idx = emb_idx; + if (rope_3d_delta) { + new_emb_idx = static_cast(ori_seq_id + rope_3d_delta[ori_bi]) * + half_lastdim + + h_bias / 2; + } else if (rope_3d) { + new_emb_idx = emb_idx + ori_bi * last_dim * seq_len; + } Load(&cos_emb[new_emb_idx], &cos_emb_vec); Load(&sin_emb[new_emb_idx], &sin_emb_vec); #pragma unroll @@ -658,7 +695,8 @@ __global__ void IntGQAVariableLengthRotaryQuantKVKernel( const int kv_num_head, const int seq_len, const int last_dim, - const bool rope_3d) { + const bool rope_3d, + const int *rope_3d_delta) { using LoadIn = AlignedVector; using LoadBiasT = AlignedVector; constexpr int HalfVecSize = VecSize / 2; @@ -692,8 +730,14 @@ __global__ void IntGQAVariableLengthRotaryQuantKVKernel( (token_idx - cu_seqlens_q[ori_bi]) + seq_lens_decoder[ori_bi]; const int64_t emb_idx = ori_seq_id * half_lastdim + h_bias / 2; - int64_t new_emb_idx = - rope_3d ? emb_idx + ori_bi * last_dim * seq_len : emb_idx; + int64_t new_emb_idx = emb_idx; + if (rope_3d_delta) { + new_emb_idx = static_cast(ori_seq_id + rope_3d_delta[ori_bi]) * + half_lastdim + + h_bias / 2; + } else if (rope_3d) { + new_emb_idx = emb_idx + ori_bi * last_dim * seq_len; + } const int64_t bias_idx = hi * last_dim + h_bias; const int64_t base_idx = token_idx * offset + bias_idx; Load(&qkv[base_idx], &src_vec); @@ -769,7 +813,8 @@ __global__ void GQAVariableLengthRotaryQuantKVKernel( const int kv_num_head, const int seq_len, const int last_dim, - const bool rope_3d) { + const bool rope_3d, + const int *rope_3d_delta) { using LoadT = AlignedVector; constexpr int HalfVecSize = VecSize / 2; using LoadEmbT = AlignedVector; @@ -800,8 +845,14 @@ __global__ void GQAVariableLengthRotaryQuantKVKernel( (token_idx - cu_seqlens_q[ori_bi]) + seq_lens_decoder[ori_bi]; const int64_t emb_idx = ori_seq_id * half_lastdim + h_bias / 2; - int64_t new_emb_idx = - rope_3d ? emb_idx + ori_bi * last_dim * seq_len : emb_idx; + int64_t new_emb_idx = emb_idx; + if (rope_3d_delta) { + new_emb_idx = static_cast(ori_seq_id + rope_3d_delta[ori_bi]) * + half_lastdim + + h_bias / 2; + } else if (rope_3d) { + new_emb_idx = emb_idx + ori_bi * last_dim * seq_len; + } const int64_t bias_idx = hi * last_dim + h_bias; const int64_t base_idx = token_idx * offset + bias_idx; Load(&qkv[base_idx], &src_vec); @@ -880,7 +931,8 @@ __global__ void IntGQANeoxVariableLengthRotaryKernel( const int kv_num_head, const int seq_len, const int last_dim, - const bool rope_3d) { + const bool rope_3d, + const int *rope_3d_delta) { using LoadT = AlignedVector; using LoadBiasT = AlignedVector; using LoadScaleT = AlignedVector; @@ -983,7 +1035,8 @@ __global__ void GQANeoxVariableLengthRotaryKernel(const T *qkv, const int kv_num_head, const int seq_len, const int last_dim, - const bool rope_3d) { + const bool rope_3d, + const int *rope_3d_delta) { using LoadT = AlignedVector; using LoadEmbT = AlignedVector; LoadT left_vec; @@ -1062,7 +1115,8 @@ __global__ void GQANeoxVariableLengthPartialRotaryKernel( const int seq_len, const int head_dim, const int rotary_dim, - const bool rope_3d) { + const bool rope_3d, + const int *rope_3d_delta) { using LoadT = AlignedVector; using LoadEmbT = AlignedVector; LoadT left_vec; @@ -2246,7 +2300,8 @@ void rotary_qk_variable( const int dim_head, const cudaStream_t &stream, bool use_neox_style = false, - bool rope_3d = false) { + bool rope_3d = false, + const int *rope_3d_delta = nullptr) { int64_t elem_nums = qkv_out_scales ? token_num * 3 * head_num * dim_head : token_num * 2 * head_num * dim_head; if (use_neox_style) { @@ -2282,7 +2337,8 @@ void rotary_qk_variable( head_num, seq_len, dim_head, - rope_3d); + rope_3d, + rope_3d_delta); } else { launchWithPdlWhenEnabled( VariableLengthRotaryKernel, @@ -2302,7 +2358,8 @@ void rotary_qk_variable( head_num, seq_len, dim_head, - rope_3d); + rope_3d, + rope_3d_delta); } } else { const float *cos_emb = rotary_emb; @@ -2328,7 +2385,8 @@ void rotary_qk_variable( head_num, seq_len, dim_head, - rope_3d); + rope_3d, + rope_3d_delta); } else { launchWithPdlWhenEnabled( NeoxVariableLengthRotaryKernel, @@ -2348,7 +2406,8 @@ void rotary_qk_variable( head_num, seq_len, dim_head, - rope_3d); + rope_3d, + rope_3d_delta); } } } @@ -2373,6 +2432,7 @@ void gqa_rotary_qk_norm_variable( const cudaStream_t &stream, bool use_neox_style = false, bool rope_3d = false, + const int *rope_3d_delta = nullptr, const float *q_norm_weight = nullptr, const float *k_norm_weight = nullptr, const float rms_norm_eps = 1e-6) { @@ -2415,6 +2475,7 @@ void gqa_rotary_qk_norm_variable( seq_len, dim_head, rope_3d, + rope_3d_delta, q_norm_weight, k_norm_weight, rms_norm_eps); @@ -2440,7 +2501,8 @@ void gqa_rotary_qk_variable( const int rotary_dim, const cudaStream_t &stream, bool use_neox_style = false, - bool rope_3d = false) { + bool rope_3d = false, + const int *rope_3d_delta = nullptr) { int64_t elem_nums = qkv_out_scales ? token_num * (num_heads + 2 * kv_num_heads) * dim_head @@ -2480,7 +2542,8 @@ void gqa_rotary_qk_variable( kv_num_heads, seq_len, dim_head, - rope_3d); + rope_3d, + rope_3d_delta); } else { auto *kernelFn = GQAVariableLengthRotaryKernel; @@ -2502,7 +2565,8 @@ void gqa_rotary_qk_variable( kv_num_heads, seq_len, dim_head, - rope_3d); + rope_3d, + rope_3d_delta); } } else { const float *cos_emb = rotary_emb; @@ -2529,7 +2593,8 @@ void gqa_rotary_qk_variable( kv_num_heads, seq_len, dim_head, - rope_3d); + rope_3d, + rope_3d_delta); } else { if (rotary_dim < dim_head) { PD_CHECK((rotary_dim / 2) % PackSize == 0); @@ -2568,7 +2633,8 @@ void gqa_rotary_qk_variable( seq_len, dim_head, rotary_dim, - rope_3d); + rope_3d, + rope_3d_delta); } else { auto *kernelFn = GQANeoxVariableLengthRotaryKernel; @@ -2592,7 +2658,8 @@ void gqa_rotary_qk_variable( kv_num_heads, seq_len, dim_head, - rope_3d); + rope_3d, + rope_3d_delta); } } } @@ -2619,7 +2686,8 @@ void gqa_rotary_qk_quant_variable( const int dim_head, const cudaStream_t &stream, bool use_neox_style = false, - bool rope_3d = false) { + bool rope_3d = false, + const int *rope_3d_delta = nullptr) { int64_t elem_nums = token_num * (num_heads + 2 * kv_num_heads) * dim_head; if (use_neox_style) { elem_nums /= 2; @@ -2657,7 +2725,8 @@ void gqa_rotary_qk_quant_variable( kv_num_heads, seq_len, dim_head, - rope_3d); + rope_3d, + rope_3d_delta); } else { launchWithPdlWhenEnabled( GQAVariableLengthRotaryQuantKVKernel, @@ -2681,7 +2750,8 @@ void gqa_rotary_qk_quant_variable( kv_num_heads, seq_len, dim_head, - rope_3d); + rope_3d, + rope_3d_delta); } } else { PADDLE_THROW("Use_neox_style mode isn't implemented yet"); diff --git a/custom_ops/gpu_ops/append_attn/encoder_write_cache_with_rope_kernel.h b/custom_ops/gpu_ops/append_attn/encoder_write_cache_with_rope_kernel.h index 23969aa429f..d72d15bebc6 100644 --- a/custom_ops/gpu_ops/append_attn/encoder_write_cache_with_rope_kernel.h +++ b/custom_ops/gpu_ops/append_attn/encoder_write_cache_with_rope_kernel.h @@ -43,6 +43,7 @@ void EncoderWriteCacheWithRopeKernel( const int max_seq_len, const bool use_neox_style, const bool rope_3d, + const paddle::optional& rope_3d_delta, cudaStream_t& stream, paddle::Tensor* qkv_out, paddle::Tensor* key_cache_out, @@ -96,6 +97,7 @@ void EncoderWriteCacheWithRopeKernel( stream, use_neox_style, rope_3d, + rope_3d_delta ? rope_3d_delta.get().data() : nullptr, q_norm_weight ? q_norm_weight.get().data() : nullptr, k_norm_weight ? k_norm_weight.get().data() : nullptr, rms_norm_eps); @@ -123,7 +125,8 @@ void EncoderWriteCacheWithRopeKernel( head_dim, stream, use_neox_style, - rope_3d); + rope_3d, + rope_3d_delta ? rope_3d_delta.get().data() : nullptr); } else { if (!is_scale_channel_wise) { gqa_rotary_qk_variable( @@ -145,7 +148,8 @@ void EncoderWriteCacheWithRopeKernel( rotary_dim, stream, use_neox_style, - rope_3d); + rope_3d, + rope_3d_delta ? rope_3d_delta.get().data() : nullptr); } else { gqa_rotary_qk_quant_variable( qkv_out->data(), @@ -167,7 +171,8 @@ void EncoderWriteCacheWithRopeKernel( head_dim, stream, use_neox_style, - rope_3d); + rope_3d, + rope_3d_delta ? rope_3d_delta.get().data() : nullptr); } } } diff --git a/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_impl.cuh b/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_impl.cuh index 9e63cf4e351..7f6da089822 100644 --- a/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_impl.cuh +++ b/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_impl.cuh @@ -51,7 +51,8 @@ __global__ void append_speculate_cache_T_rope_qk_norm_kernel( const float* q_norm_weight, const float* k_norm_weight, const float rms_norm_eps, - const bool rope_3d) { + const bool rope_3d, + const int* rope_3d_delta) { using LoadT = AlignedVector; using LoadFloat = AlignedVector; using LoadInT = AlignedVector; @@ -109,8 +110,15 @@ __global__ void append_speculate_cache_T_rope_qk_norm_kernel( if (hi < num_heads + gqa_group_size) { // q k rope const int64_t emb_idx = write_seq_id * half_head_size + h_bias / 2; - uint32_t new_emb_idx = - rope_3d ? emb_idx + ori_bi * max_seq_len * head_size : emb_idx; + uint32_t new_emb_idx; + if (rope_3d_delta) { + const int rope_pos = write_seq_id + rope_3d_delta[ori_bi]; + new_emb_idx = rope_pos * half_head_size + h_bias / 2; + } else if (rope_3d) { + new_emb_idx = emb_idx + ori_bi * max_seq_len * head_size; + } else { + new_emb_idx = emb_idx; + } Load(&cos_emb[new_emb_idx], &cos_emb_vec); Load(&sin_emb[new_emb_idx], &sin_emb_vec); } @@ -366,7 +374,8 @@ __global__ void append_speculate_cache_rope_kernel( const int block_size, const int elem_cnt, const int gqa_group_size, - const bool rope_3d) { + const bool rope_3d, + const int* rope_3d_delta) { using LoadT = AlignedVector; using LoadFloat = AlignedVector; using LoadInT = AlignedVector; @@ -420,8 +429,15 @@ __global__ void append_speculate_cache_rope_kernel( if (hi < num_heads + gqa_group_size) { // q k rope const int64_t emb_idx = write_seq_id * half_head_size + h_bias / 2; - int64_t new_emb_idx = - rope_3d ? emb_idx + ori_bi * max_seq_len * head_size : emb_idx; + int64_t new_emb_idx; + if (rope_3d_delta) { + const int rope_pos = write_seq_id + rope_3d_delta[ori_bi]; + new_emb_idx = rope_pos * half_head_size + h_bias / 2; + } else if (rope_3d) { + new_emb_idx = emb_idx + ori_bi * max_seq_len * head_size; + } else { + new_emb_idx = emb_idx; + } Load(&cos_emb[new_emb_idx], &cos_emb_vec); Load(&sin_emb[new_emb_idx], &sin_emb_vec); } @@ -815,6 +831,7 @@ __global__ void append_speculate_cache_fp8_rope_qk_norm_dynamic_kernel( const float min_bound, const int gqa_group_size, const bool rope_3d, + const int* __restrict__ rope_3d_delta, const float rms_norm_eps) { static_assert(HeadDim == 128, "just support HeadDim be 128 now!"); static_assert(VecSize == 4, "just support VecSize be 4 now, 32 * 4!"); @@ -874,8 +891,15 @@ __global__ void append_speculate_cache_fp8_rope_qk_norm_dynamic_kernel( // q rope const uint32_t emb_idx = write_seq_id * half_head_size + head_bias / 2; - uint32_t new_emb_idx = - rope_3d ? emb_idx + bid * max_seq_len * HeadDim : emb_idx; + uint32_t new_emb_idx; + if (rope_3d_delta) { + const int rope_pos = write_seq_id + rope_3d_delta[bid]; + new_emb_idx = rope_pos * half_head_size + head_bias / 2; + } else if (rope_3d) { + new_emb_idx = emb_idx + bid * max_seq_len * HeadDim; + } else { + new_emb_idx = emb_idx; + } Load(&cos_emb[new_emb_idx], &cos_emb_vec); Load(&sin_emb[new_emb_idx], &sin_emb_vec); #pragma unroll @@ -939,8 +963,15 @@ __global__ void append_speculate_cache_fp8_rope_qk_norm_dynamic_kernel( const int v_head_idx = head_idx - num_heads - gqa_group_size; if (head_idx < num_heads + gqa_group_size) { const uint32_t emb_idx = write_seq_id * half_head_size + head_bias / 2; - uint32_t new_emb_idx = - rope_3d ? emb_idx + bid * max_seq_len * HeadDim : emb_idx; + uint32_t new_emb_idx; + if (rope_3d_delta) { + const int rope_pos = write_seq_id + rope_3d_delta[bid]; + new_emb_idx = rope_pos * half_head_size + head_bias / 2; + } else if (rope_3d) { + new_emb_idx = emb_idx + bid * max_seq_len * HeadDim; + } else { + new_emb_idx = emb_idx; + } Load(&cos_emb[new_emb_idx], &cos_emb_vec1); Load(&cos_emb[new_emb_idx + 4], &cos_emb_vec2); Load(&sin_emb[new_emb_idx], &sin_emb_vec1); diff --git a/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_kernel.cu b/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_kernel.cu index 4ee00f12e07..021380365df 100644 --- a/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_kernel.cu +++ b/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_kernel.cu @@ -42,7 +42,8 @@ void append_speculate_cache_rope_qk_norm(const QKV_TYPE* qkv, const float* q_norm_weight, const float* k_norm_weight, const float rms_norm_eps, - const bool rope_3d) { + const bool rope_3d, + const int* rope_3d_delta) { int output_inner_dim = num_heads + 2 * kv_num_heads; const uint32_t elem_nums = use_neox_style ? token_num * (num_heads + 2 * kv_num_heads) * dim_head / 2 @@ -86,7 +87,8 @@ void append_speculate_cache_rope_qk_norm(const QKV_TYPE* qkv, q_norm_weight, k_norm_weight, rms_norm_eps, - rope_3d); + rope_3d, + rope_3d_delta); } } @@ -116,7 +118,8 @@ void append_speculate_cache_rope(const QKV_TYPE* qkv, const int token_num, const cudaStream_t& stream, const bool use_neox_style, - const bool rope_3d) { + const bool rope_3d, + const int* rope_3d_delta) { int output_inner_dim = num_heads + 2 * kv_num_heads; const uint32_t elem_nums = @@ -212,7 +215,8 @@ void append_speculate_cache_rope(const QKV_TYPE* qkv, block_size, elem_nums, kv_num_heads, - rope_3d); + rope_3d, + rope_3d_delta); } } @@ -242,6 +246,7 @@ void append_speculate_cache_fp8_rope(const T* qkv, const int token_num, const cudaStream_t& stream, const bool rope_3d, + const int* rope_3d_delta, const float rms_norm_eps) { constexpr int num_warps = 4; const int all_warps = @@ -291,6 +296,7 @@ void append_speculate_cache_fp8_rope(const T* qkv, -127.0f, kv_num_heads, rope_3d, + rope_3d_delta, rms_norm_eps); } @@ -538,6 +544,7 @@ void SpeculateWriteCacheWithRoPEKernel( const std::string& cache_quant_type_str, const bool use_neox_rotary_style, const bool rope_3d, + const paddle::optional& rope_3d_delta, const int max_seq_len, cudaStream_t& stream, paddle::Tensor* qkv_out, @@ -562,6 +569,21 @@ void SpeculateWriteCacheWithRoPEKernel( const float* cos_emb = rotary_embs ? rotary_embs.get().data() : nullptr; + const int* rope_3d_delta_ptr = + rope_3d_delta ? rope_3d_delta.get().data() : nullptr; + if (rope_3d_delta_ptr) { + if (use_neox_rotary_style) { + PD_THROW( + "rope_3d_delta is not supported with use_neox_rotary_style for " + "speculate decoder."); + } + if (cache_quant_type_str != "none" && + cache_quant_type_str != "block_wise_fp8") { + PD_THROW( + "rope_3d_delta only supports cache_quant_type none/block_wise_fp8 " + "for speculate decoder."); + } + } const float* sin_emb; int rotary_dim = dim_head; if (rotary_embs) { @@ -613,7 +635,8 @@ void SpeculateWriteCacheWithRoPEKernel( reinterpret_cast(q_norm_weight.get().data()), reinterpret_cast(k_norm_weight.get().data()), rms_norm_eps, - rope_3d); + rope_3d, + rope_3d_delta_ptr); } else if (cache_quant_type_str == "block_wise_fp8") { append_speculate_cache_fp8_rope( reinterpret_cast(qkv_ptr), @@ -643,6 +666,7 @@ void SpeculateWriteCacheWithRoPEKernel( token_nums, stream, rope_3d, + rope_3d_delta_ptr, rms_norm_eps); } else if (cache_quant_type_str == "cache_fp8") { append_speculate_cache_fp8_rope( @@ -673,6 +697,7 @@ void SpeculateWriteCacheWithRoPEKernel( token_nums, stream, rope_3d, + rope_3d_delta_ptr, rms_norm_eps); } else { PD_THROW( @@ -710,7 +735,8 @@ void SpeculateWriteCacheWithRoPEKernel( token_nums, stream, use_neox_rotary_style, - rope_3d); + rope_3d, + rope_3d_delta_ptr); } else if (cache_quant_type_str == "cache_int8") { append_speculate_cache_int8_rope( @@ -883,6 +910,7 @@ template void SpeculateWriteCacheWithRoPEKernel( const std::string& cache_quant_type_str, const bool use_neox_rotary_style, const bool rope_3d, + const paddle::optional& rope_3d_delta, const int max_seq_len, cudaStream_t& stream, paddle::Tensor* qkv_out, @@ -913,6 +941,7 @@ SpeculateWriteCacheWithRoPEKernel( const std::string& cache_quant_type_str, const bool use_neox_rotary_style, const bool rope_3d, + const paddle::optional& rope_3d_delta, const int max_seq_len, cudaStream_t& stream, paddle::Tensor* qkv_out, @@ -942,6 +971,7 @@ template void SpeculateWriteCacheWithRoPEKernel( const std::string& cache_quant_type_str, const bool use_neox_rotary_style, const bool rope_3d, + const paddle::optional& rope_3d_delta, const int max_seq_len, cudaStream_t& stream, paddle::Tensor* qkv_out, @@ -972,6 +1002,7 @@ SpeculateWriteCacheWithRoPEKernel( const std::string& cache_quant_type_str, const bool use_neox_rotary_style, const bool rope_3d, + const paddle::optional& rope_3d_delta, const int max_seq_len, cudaStream_t& stream, paddle::Tensor* qkv_out, @@ -1001,6 +1032,7 @@ template void SpeculateWriteCacheWithRoPEKernel( const std::string& cache_quant_type_str, const bool use_neox_rotary_style, const bool rope_3d, + const paddle::optional& rope_3d_delta, const int max_seq_len, cudaStream_t& stream, paddle::Tensor* qkv_out, @@ -1031,6 +1063,7 @@ SpeculateWriteCacheWithRoPEKernel( const std::string& cache_quant_type_str, const bool use_neox_rotary_style, const bool rope_3d, + const paddle::optional& rope_3d_delta, const int max_seq_len, cudaStream_t& stream, paddle::Tensor* qkv_out, @@ -1060,6 +1093,7 @@ template void SpeculateWriteCacheWithRoPEKernel( const std::string& cache_quant_type_str, const bool use_neox_rotary_style, const bool rope_3d, + const paddle::optional& rope_3d_delta, const int max_seq_len, cudaStream_t& stream, paddle::Tensor* qkv_out, @@ -1090,6 +1124,7 @@ SpeculateWriteCacheWithRoPEKernel( const std::string& cache_quant_type_str, const bool use_neox_rotary_style, const bool rope_3d, + const paddle::optional& rope_3d_delta, const int max_seq_len, cudaStream_t& stream, paddle::Tensor* qkv_out, diff --git a/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_kernel.h b/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_kernel.h index c9c3ff9e0b9..43a2d9543a0 100644 --- a/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_kernel.h +++ b/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_kernel.h @@ -36,6 +36,7 @@ void SpeculateWriteCacheWithRoPEKernel( const std::string& cache_quant_type_str, const bool use_neox_rotary_style, const bool rope_3d, + const paddle::optional& rope_3d_delta, const int max_seq_len, cudaStream_t& stream, paddle::Tensor* qkv_out, diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_bfloat16_bfloat16_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_bfloat16_bfloat16_kernel.cu index 915039908dc..78e415993aa 100644 --- a/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_bfloat16_bfloat16_kernel.cu +++ b/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_bfloat16_bfloat16_kernel.cu @@ -40,6 +40,7 @@ EncoderWriteCacheWithRopeKernel( const int max_seq_len, const bool use_neox_style, const bool rope_3d, + const paddle::optional& rope_3d_delta, cudaStream_t& stream, paddle::Tensor* qkv_out, paddle::Tensor* key_cache_out, diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_bfloat16_int_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_bfloat16_int_kernel.cu index 3f3539b8a6e..c51d15e36c4 100644 --- a/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_bfloat16_int_kernel.cu +++ b/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_bfloat16_int_kernel.cu @@ -39,6 +39,7 @@ template void EncoderWriteCacheWithRopeKernel( const int max_seq_len, const bool use_neox_style, const bool rope_3d, + const paddle::optional& rope_3d_delta, cudaStream_t& stream, paddle::Tensor* qkv_out, paddle::Tensor* key_cache_out, diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_float16_float16_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_float16_float16_kernel.cu index a559ec77f37..44650da68c7 100644 --- a/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_float16_float16_kernel.cu +++ b/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_float16_float16_kernel.cu @@ -39,6 +39,7 @@ template void EncoderWriteCacheWithRopeKernel( const int max_seq_len, const bool use_neox_style, const bool rope_3d, + const paddle::optional& rope_3d_delta, cudaStream_t& stream, paddle::Tensor* qkv_out, paddle::Tensor* key_cache_out, diff --git a/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_float16_int_kernel.cu b/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_float16_int_kernel.cu index 3318a36472b..12125da218e 100644 --- a/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_float16_int_kernel.cu +++ b/custom_ops/gpu_ops/append_attn/template_instantiation/encoder_write_cache_with_rope_float16_int_kernel.cu @@ -39,6 +39,7 @@ template void EncoderWriteCacheWithRopeKernel( const int max_seq_len, const bool use_neox_style, const bool rope_3d, + const paddle::optional& rope_3d_delta, cudaStream_t& stream, paddle::Tensor* qkv_out, paddle::Tensor* key_cache_out, diff --git a/custom_ops/gpu_ops/cpp_extensions.cc b/custom_ops/gpu_ops/cpp_extensions.cc index 683ffe2db82..1eb02d6c642 100644 --- a/custom_ops/gpu_ops/cpp_extensions.cc +++ b/custom_ops/gpu_ops/cpp_extensions.cc @@ -110,6 +110,7 @@ std::vector AppendAttention( const paddle::optional& out_linear_smooths, const paddle::optional& mask_offset, const paddle::optional& kv_signal_data, + const paddle::optional& rope_3d_delta, const paddle::optional& q_norm_weight, const paddle::optional& k_norm_weight, const paddle::optional& sinks, @@ -167,6 +168,7 @@ std::vector AppendAttentionWithOutput( const paddle::optional& out_linear_smooths, const paddle::optional& mask_offset, const paddle::optional& kv_signal_data, + const paddle::optional& rope_3d_delta, const paddle::optional& q_norm_weight, const paddle::optional& k_norm_weight, const paddle::optional& sinks, diff --git a/custom_ops/gpu_ops/decoder_write_cache_with_rope.cu b/custom_ops/gpu_ops/decoder_write_cache_with_rope.cu index 7878e9926c5..d2c91408dd8 100644 --- a/custom_ops/gpu_ops/decoder_write_cache_with_rope.cu +++ b/custom_ops/gpu_ops/decoder_write_cache_with_rope.cu @@ -109,6 +109,7 @@ std::vector DecoderWriteCacheWithRoPE( cache_quant_type_str, use_neox_rotary_style, rope_3d, + paddle::optional(), max_input_length, stream, const_cast(&qkv), @@ -138,6 +139,7 @@ std::vector DecoderWriteCacheWithRoPE( cache_quant_type_str, use_neox_rotary_style, rope_3d, + paddle::optional(), max_input_length, stream, const_cast(&qkv), @@ -173,6 +175,7 @@ std::vector DecoderWriteCacheWithRoPE( cache_quant_type_str, use_neox_rotary_style, rope_3d, + paddle::optional(), max_input_length, stream, const_cast(&qkv), @@ -201,6 +204,7 @@ std::vector DecoderWriteCacheWithRoPE( cache_quant_type_str, use_neox_rotary_style, rope_3d, + paddle::optional(), max_input_length, stream, const_cast(&qkv), diff --git a/fastdeploy/engine/common_engine.py b/fastdeploy/engine/common_engine.py index 645f6524734..5c38c089077 100644 --- a/fastdeploy/engine/common_engine.py +++ b/fastdeploy/engine/common_engine.py @@ -978,6 +978,9 @@ def _fetch_request(): ) if finished_ids: for task in tasks: + # Fix + if task.request_id not in need_check_req_ids: + continue result = self.resource_manager.waiting_async_process(task) if result is None: self.scheduler.put_results( diff --git a/fastdeploy/model_executor/forward_meta.py b/fastdeploy/model_executor/forward_meta.py index 44cf528bed3..0e2b3e7576e 100644 --- a/fastdeploy/model_executor/forward_meta.py +++ b/fastdeploy/model_executor/forward_meta.py @@ -69,6 +69,8 @@ class ForwardMeta: ids_remove_padding: paddle.Tensor # Rotation position embedding rotary_embs: Optional[paddle.Tensor] = None + # Per-request offset for shared Ernie VL decode 3D RoPE. + rope_3d_delta: Optional[paddle.Tensor] = None # Use cuda graph in this step or not. Used to avoid run cuda graph when in dummy run or prefill stage. step_use_cudagraph: bool = False diff --git a/fastdeploy/model_executor/layers/attention/append_attn_backend.py b/fastdeploy/model_executor/layers/attention/append_attn_backend.py index 76de638bce6..61d1639fadc 100644 --- a/fastdeploy/model_executor/layers/attention/append_attn_backend.py +++ b/fastdeploy/model_executor/layers/attention/append_attn_backend.py @@ -442,6 +442,7 @@ def forward_mixed( layer.linear_smooth, forward_meta.attn_mask_offsets, metadata.kv_signal_data_list[layer.layer_id], + forward_meta.rope_3d_delta, q_norm_weight, k_norm_weight, getattr(layer, "sinks", None), @@ -498,6 +499,7 @@ def forward_mixed( layer.linear_smooth, forward_meta.attn_mask_offsets, metadata.kv_signal_data_list[layer.layer_id], + forward_meta.rope_3d_delta, q_norm_weight, k_norm_weight, getattr(layer, "sinks", None), diff --git a/fastdeploy/model_executor/layers/attention/flash_attn_backend.py b/fastdeploy/model_executor/layers/attention/flash_attn_backend.py index f095980de9e..d4ab8f5f59b 100644 --- a/fastdeploy/model_executor/layers/attention/flash_attn_backend.py +++ b/fastdeploy/model_executor/layers/attention/flash_attn_backend.py @@ -598,6 +598,7 @@ def forward_mixed( layer.linear_smooth, forward_meta.attn_mask_offsets, metadata.kv_signal_data_list[layer.layer_id], + forward_meta.rope_3d_delta, q_norm_weight, k_norm_weight, getattr(layer, "sinks", None), diff --git a/fastdeploy/model_executor/layers/attention/flash_mask_attn_backend.py b/fastdeploy/model_executor/layers/attention/flash_mask_attn_backend.py index bdb018ec269..ee064e73e99 100644 --- a/fastdeploy/model_executor/layers/attention/flash_mask_attn_backend.py +++ b/fastdeploy/model_executor/layers/attention/flash_mask_attn_backend.py @@ -353,6 +353,7 @@ def forward_mixed( layer.linear_smooth, forward_meta.attn_mask_offsets, metadata.kv_signal_data_list[layer.layer_id], + forward_meta.rope_3d_delta, q_norm_weight, k_norm_weight, getattr(layer, "sinks", None), diff --git a/fastdeploy/model_executor/layers/attention/ops/append_attention.py b/fastdeploy/model_executor/layers/attention/ops/append_attention.py index 8b36ffa85b0..e59a6b6bd1e 100644 --- a/fastdeploy/model_executor/layers/attention/ops/append_attention.py +++ b/fastdeploy/model_executor/layers/attention/ops/append_attention.py @@ -63,6 +63,7 @@ def append_attention( linear_smooth: Optional[paddle.Tensor] = None, mask_offset: Optional[paddle.Tensor] = None, kv_signal_data: Optional[paddle.Tensor] = None, + rope_3d_delta: Optional[paddle.Tensor] = None, q_norm_weight: Optional[paddle.Tensor] = None, k_norm_weight: Optional[paddle.Tensor] = None, sinks: Optional[paddle.Tensor] = None, @@ -126,6 +127,7 @@ def append_attention( linear_smooth, mask_offset, kv_signal_data, + rope_3d_delta, q_norm_weight, k_norm_weight, sinks, @@ -185,6 +187,7 @@ def append_attention( linear_smooth, mask_offset, kv_signal_data, + rope_3d_delta, q_norm_weight, k_norm_weight, sinks, @@ -254,6 +257,7 @@ def append_attention_with_output( linear_smooth: Optional[paddle.Tensor] = None, mask_offset: Optional[paddle.Tensor] = None, kv_signal_data: Optional[paddle.Tensor] = None, + rope_3d_delta: Optional[paddle.Tensor] = None, q_norm_weight: Optional[paddle.Tensor] = None, k_norm_weight: Optional[paddle.Tensor] = None, sinks: Optional[paddle.Tensor] = None, @@ -314,6 +318,7 @@ def append_attention_with_output( linear_smooth, mask_offset, kv_signal_data, + rope_3d_delta, q_norm_weight, k_norm_weight, sinks, diff --git a/fastdeploy/model_executor/layers/rotary_embedding.py b/fastdeploy/model_executor/layers/rotary_embedding.py index 22942f10f2c..a204e4d2603 100644 --- a/fastdeploy/model_executor/layers/rotary_embedding.py +++ b/fastdeploy/model_executor/layers/rotary_embedding.py @@ -641,6 +641,37 @@ def __call__(self, position_ids, max_len_lst, cumsum_seqlens): return rot_emb_list +def get_ernie_rope_3d_decode( + rotary_dim: int, + base: float, + partial_rotary_factor: float, + max_position: int, + freq_allocation: int, + rope_scaling: dict, +) -> paddle.Tensor: + """Build shared Ernie VL decode RoPE for text positions.""" + rotary_emb3d_layer = ErnieVlRotaryEmbedding3D( + rotary_dim, base, partial_rotary_factor, max_position, freq_allocation, rope_scaling + ) + position_ids_3d = paddle.tile( + paddle.arange(max_position, dtype="float32").unsqueeze(-1), + [1, 3], + ) + expand_pos = paddle.index_select( + position_ids_3d / partial_rotary_factor, + rotary_emb3d_layer.using_position_axis, + axis=-1, + ) + inv_freq = 1.0 / (base ** (paddle.arange(0, rotary_dim, 2, dtype="float32") / rotary_dim)) + freqs = expand_pos * inv_freq + cos_emb = paddle.cos(freqs).unsqueeze(0).unsqueeze(0).unsqueeze(3) + sin_emb = paddle.sin(freqs).unsqueeze(0).unsqueeze(0).unsqueeze(3) + rot_emb = paddle.concat([cos_emb, sin_emb], axis=0) + if current_platform.is_iluvatar(): + rot_emb = paddle.stack([rot_emb, rot_emb], axis=-1).reshape([2, 1, max_position, 1, rotary_dim]) + return rot_emb + + def get_rope_3d( rotary_dim: int, base: float, diff --git a/fastdeploy/spec_decode/mtp.py b/fastdeploy/spec_decode/mtp.py index 6c5149e253b..442f859625b 100644 --- a/fastdeploy/spec_decode/mtp.py +++ b/fastdeploy/spec_decode/mtp.py @@ -541,10 +541,16 @@ def insert_tasks_v1( ) if self.use_attn_mask_offset: inputs = request.multimodal_inputs - self.model_inputs["attn_mask_offsets_full"][idx][0 : prefill_end_index - prefill_start_index] = ( - paddle.to_tensor( - inputs["attention_mask_offset"][prefill_start_index:prefill_end_index], dtype="int32" + attn_offset_len = prefill_end_index - prefill_start_index + if inputs.get("attention_mask_offset", None) is None: + attention_mask_offset_slice = np.arange(prefill_start_index, prefill_end_index, dtype=np.int32) + else: + attention_mask_offset_slice = np.asarray( + inputs["attention_mask_offset"][prefill_start_index:prefill_end_index], + dtype=np.int32, ) + self.model_inputs["attn_mask_offsets_full"][idx][0:attn_offset_len] = paddle.to_tensor( + attention_mask_offset_slice, dtype="int32" ) # GPU don't need it anymore # NOTE: XPU backend needs decoder attention mask offset; GPU backend does not use it diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 9cbb72636ef..f1fd8dc9889 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -51,7 +51,10 @@ from fastdeploy.model_executor.layers.moe.routing_indices_cache import ( RoutingReplayManager, ) -from fastdeploy.model_executor.layers.rotary_embedding import get_rope_3d +from fastdeploy.model_executor.layers.rotary_embedding import ( + get_ernie_rope_3d_decode, + get_rope_3d, +) from fastdeploy.model_executor.layers.sample.meta_data import SamplingMetadata from fastdeploy.model_executor.layers.sample.sampler import Sampler, SpeculativeSampler from fastdeploy.model_executor.model_loader import get_model_loader @@ -211,6 +214,8 @@ def __init__( # Initialize input batch self.share_inputs = InputBatch(self.fd_config) self.share_inputs.init_share_inputs() + if getattr(self.share_inputs, "use_shared_ernie_decode_rope_3d", False): + self.share_inputs["rope_emb"][0] = self.prepare_shared_ernie_decode_rope3d() self.increment_value = ( 4 if not self.speculative_decoding else (self.speculative_config.num_speculative_tokens + 1) * 4 ) @@ -799,16 +804,23 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int = # rope 3d if self.enable_mm: position_ids = request.multimodal_inputs["position_ids"] - rope_3d_position_ids["position_ids_idx"].append(idx) - rope_3d_position_ids["position_ids_lst"].append(position_ids) - rope_3d_position_ids["position_ids_offset"].append( - len(position_ids) + rope_3d_position_ids["position_ids_offset"][-1] - ) - - if self.is_pooling_model: - rope_3d_position_ids["max_tokens_lst"].append(0) + if getattr(self.share_inputs, "use_shared_ernie_decode_rope_3d", False): + if isinstance(position_ids, paddle.Tensor): + rope_3d_delta = int(paddle.max(position_ids).item()) + 1 - len(position_ids) + else: + rope_3d_delta = int(np.max(position_ids)) + 1 - len(position_ids) + async_set_value(self.share_inputs["rope_3d_delta"][idx : idx + 1], rope_3d_delta) else: - rope_3d_position_ids["max_tokens_lst"].append(request.get("max_tokens", 2048)) + rope_3d_position_ids["position_ids_idx"].append(idx) + rope_3d_position_ids["position_ids_lst"].append(position_ids) + rope_3d_position_ids["position_ids_offset"].append( + len(position_ids) + rope_3d_position_ids["position_ids_offset"][-1] + ) + + if self.is_pooling_model: + rope_3d_position_ids["max_tokens_lst"].append(0) + else: + rope_3d_position_ids["max_tokens_lst"].append(request.get("max_tokens", 2048)) # guided decoding if ( @@ -1395,6 +1407,7 @@ def initialize_forward_meta(self, is_dummy_or_profile_run=False): self.forward_meta = ForwardMeta( ids_remove_padding=self.share_inputs["ids_remove_padding"], rotary_embs=self.share_inputs["rope_emb"], + rope_3d_delta=self.share_inputs.get("rope_3d_delta", None), attn_backend=self.attn_backends[0], decoder_batch_ids=self.share_inputs["decoder_batch_ids"], decoder_tile_ids_per_batch=self.share_inputs["decoder_tile_ids_per_batch"], @@ -3172,6 +3185,19 @@ def _dummy_run_extract_vision_features(self): window_size=-1, ) + @paddle.no_grad() + def prepare_shared_ernie_decode_rope3d(self) -> paddle.Tensor: + """prepare shared Ernie VL decode rope3d""" + + return get_ernie_rope_3d_decode( + rotary_dim=self.model_config.head_dim, + partial_rotary_factor=1.0, + base=self.model_config.rope_theta, + max_position=self.model_config.max_model_len, + freq_allocation=getattr(self.model_config, "freq_allocation", 20), + rope_scaling=getattr(self.model_config, "rope_scaling", {}), + ) + @paddle.no_grad() def prepare_rope3d( self, position_ids: paddle.Tensor, max_len_lst: list[int], cumsum_seqlens: list[int] diff --git a/fastdeploy/worker/input_batch.py b/fastdeploy/worker/input_batch.py index 96b1694c895..6efd6801528 100644 --- a/fastdeploy/worker/input_batch.py +++ b/fastdeploy/worker/input_batch.py @@ -97,6 +97,18 @@ def __init__(self, fd_config: FDConfig) -> None: self.speculative_decoding = self.speculative_config.method is not None self.is_mm_model = self.model_config.enable_mm self.enable_mm = fd_config.enable_mm_runtime + model_type = getattr(self.model_config, "model_type", "") + if isinstance(model_type, list): + model_type = str(model_type[0]) if model_type else "" + model_type = model_type.lower() + self.use_shared_ernie_decode_rope_3d = ( + self.enable_mm + and fd_config.enable_rope_3d_runtime + and "ernie" in model_type + and "qwen" not in model_type + and "paddleocr" not in model_type + and self.scheduler_config.splitwise_role == "decode" + ) self.enable_expert_parallel = fd_config.parallel_config.enable_expert_parallel self.index_to_batch_id = {} self.enable_pd_reorder = False @@ -343,9 +355,10 @@ def init_share_inputs(self): else: # neox style = False rope_head_dim = head_dim // 2 + rope_batch_size = 1 if self.use_shared_ernie_decode_rope_3d else max_num_seqs self.rope_emb = paddle.full( shape=[ - max_num_seqs, + rope_batch_size, 2, 1, self.model_config.max_model_len, @@ -355,6 +368,8 @@ def init_share_inputs(self): fill_value=0, dtype="float32", ) + if self.use_shared_ernie_decode_rope_3d: + self.rope_3d_delta = paddle.full([max_num_seqs], 0, dtype="int32") self.image_features = None # Built before the forward self.image_grid_thws = None self.image_features_list = None @@ -484,7 +499,10 @@ def swap_data(tensor, idx1, idx2): self.image_features_list[i2], self.image_features_list[i1], ) - swap_data(self.share_inputs["rope_emb"], i1, i2) + if not self.use_shared_ernie_decode_rope_3d: + swap_data(self.rope_emb, i1, i2) + else: + swap_data(self.rope_3d_delta, i1, i2) swap_data(self.decode_states, i1, i2) swap_data(self.attn_mask_offsets_full, i1, i2) # Swap mask rollback @@ -686,18 +704,21 @@ def reset_share_inputs(self): else: rope_head_dim = head_dim // 2 - self.rope_emb = paddle.full( - shape=[ - max_num_seqs, - 2, - 1, - self.model_config.max_model_len, - 1, - rope_head_dim, - ], - fill_value=0, - dtype="float32", - ) + if self.use_shared_ernie_decode_rope_3d: + fill_paddle_tensor(self, "rope_3d_delta", 0) + else: + self.rope_emb = paddle.full( + shape=[ + max_num_seqs, + 2, + 1, + self.model_config.max_model_len, + 1, + rope_head_dim, + ], + fill_value=0, + dtype="float32", + ) self.image_features = None self.image_grid_thws = None self.image_features_list = None diff --git a/tests/deterministic/test_c16_warp1_4_determinism.py b/tests/deterministic/test_c16_warp1_4_determinism.py index 1cd260daacc..6d7f69f9c58 100644 --- a/tests/deterministic/test_c16_warp1_4_determinism.py +++ b/tests/deterministic/test_c16_warp1_4_determinism.py @@ -350,6 +350,7 @@ def run_c16_warp14_decoder_test( None, None, None, + None, 1e-6, compute_type, "none", @@ -460,6 +461,7 @@ def run_c16_warp14_decoder_test( None, None, None, + None, 1e-6, compute_type, "none", @@ -545,6 +547,7 @@ def run_c16_warp14_decoder_test( None, None, None, + None, 1e-6, compute_type, "none", diff --git a/tests/layers/test_append_attention.py b/tests/layers/test_append_attention.py index bc9e1bd31d8..f4813a1ce5b 100644 --- a/tests/layers/test_append_attention.py +++ b/tests/layers/test_append_attention.py @@ -703,6 +703,7 @@ def cmp_append_attention(self, naive_cache_k=None, naive_cache_v=None, attn_mask None, # linear_smooth self.mask_offset, # mask_offset None, # kv_signal_data + None, # rope_3d_delta q_norm_weight, # q_norm_weight k_norm_weight, # k_norm_weight sinks, # sinks @@ -768,6 +769,7 @@ def cmp_append_attention(self, naive_cache_k=None, naive_cache_v=None, attn_mask None, # linear_smooth self.mask_offset, # mask_offset None, # kv_signal_data + None, # rope_3d_delta q_norm_weight, # q_norm_weight k_norm_weight, # k_norm_weight sinks, # sinks diff --git a/tests/layers/test_append_attention_with_output.py b/tests/layers/test_append_attention_with_output.py index 6c15de17ccc..d5ced4c62b9 100644 --- a/tests/layers/test_append_attention_with_output.py +++ b/tests/layers/test_append_attention_with_output.py @@ -524,6 +524,7 @@ def cmp_append_attention(self, naive_cache_k=None, naive_cache_v=None, attn_mask None, # linear_smooth self.mask_offset, # mask_offset None, # kv_signal_data + None, # rope_3d_delta q_norm_weight, # q_norm_weight k_norm_weight, # k_norm_weight None, # sinks diff --git a/tests/operators/attention/test_decode_unified_attention_c16.py b/tests/operators/attention/test_decode_unified_attention_c16.py index 0d17d17ccd6..49cbe770ba2 100644 --- a/tests/operators/attention/test_decode_unified_attention_c16.py +++ b/tests/operators/attention/test_decode_unified_attention_c16.py @@ -355,6 +355,7 @@ def run_append_attention( None, # linear_smooth None, # mask_offset None, # kv_signal_data + None, # rope_3d_delta None, # q_norm_weight None, # k_norm_weight None, # sinks diff --git a/tests/operators/attention/test_decode_unified_attention_c8.py b/tests/operators/attention/test_decode_unified_attention_c8.py index d5ec0e5354c..3eed3145f5f 100644 --- a/tests/operators/attention/test_decode_unified_attention_c8.py +++ b/tests/operators/attention/test_decode_unified_attention_c8.py @@ -356,6 +356,7 @@ def append_attention_with_args( None, # linear_smooth None, # mask_offset None, # kv_signal_data + None, # rope_3d_delta self.q_norm_weight, self.k_norm_weight, None, # sinks diff --git a/tests/operators/test_tree_mask.py b/tests/operators/test_tree_mask.py index b05f173fce2..a75b58fb77e 100644 --- a/tests/operators/test_tree_mask.py +++ b/tests/operators/test_tree_mask.py @@ -309,6 +309,7 @@ def run_append_c16_attention( None, # linear_smooth mask_offset, # mask_offset None, # kv_signal_data + None, # rope_3d_delta self.q_norm_weight_tensor if use_qknorm else None, # q_norm_weight self.k_norm_weight_tensor if use_qknorm else None, # k_norm_weight None, # sinks