Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
54 commits
Select commit Hold shift + click to select a range
3a77667
feat: add V100 (SM70) GPU support
mattheliu Feb 2, 2026
1e219c3
fix format
mattheliu Feb 2, 2026
635bf61
feat: add SM70 (V100) GPU architecture compatibility
mattheliu Feb 3, 2026
bedb279
fix: remove non-existent per_token_quant_fp8.cu from build
mattheliu Feb 3, 2026
88ecd56
fix: remove non-existent MaskedPerTokenQuant and restore FusedMaskSwi…
mattheliu Feb 3, 2026
d071ddb
fix: add fused_mask_swiglu_fp8_quant_kernel.cu back to build sources
mattheliu Feb 4, 2026
4ca1ecb
fix: add set_stop.cu to MetaX build sources
mattheliu Feb 4, 2026
76f8759
fix: add gelu_tanh.cu to MetaX build sources
mattheliu Feb 4, 2026
9e1f341
[Fix] Add SM70/SM75 compatibility for BF16 operations and sampling
mattheliu Feb 6, 2026
4a67a4e
[Fix] Use standard <limits> header for Iluvatar/MetaX compatibility
mattheliu Feb 6, 2026
5d79e8c
fix: move tritonmoe_preprocess_func out of ENABLE_BF16 conditional block
mattheliu Feb 6, 2026
0f45818
fix: restore MetaX compatibility in gelu_tanh.cu while keeping SM70 s…
mattheliu Feb 6, 2026
99ff61e
chore: remove unrelated files from PR
mattheliu Feb 9, 2026
872675f
fix: remove SpeculateGetOutputPaddingOffset deleted by PR #6358
mattheliu Feb 9, 2026
53683ed
fix: sync RebuildPaddingFunc signature with upstream PR #6358
mattheliu Feb 9, 2026
4d4f2bc
fix: add missing get_attn_mask_q declaration and build config
mattheliu Feb 9, 2026
4163bd8
[Feature] Add V100 (SM70) support for custom ops
mattheliu Feb 9, 2026
1f269eb
[Fix] V100 (SM70) compilation fixes
mattheliu Feb 11, 2026
b6b1a42
[Fix] V100 (SM70) attention backend compatibility improvements
mattheliu Feb 11, 2026
25776cc
[Feature] Add V100FlashAttentionBackend for SM70 GPU support (Phase 2)
mattheliu Feb 11, 2026
ed92124
[Fix] Add V100 compatibility to PaddleNativeAttnBackend
mattheliu Feb 11, 2026
52ed5c1
[Fix] Add forward_mixed method to PaddleNativeAttnBackend for V100
mattheliu Feb 11, 2026
0bf29e7
[Fix] V100 attention backend runtime fixes for correct inference
mattheliu Feb 12, 2026
1fad040
[Feature] Rewrite V100 attention backend with Triton kernels for SM70
mattheliu Feb 24, 2026
cabeaa1
[Fix] Replace break with conditional guard in Triton kernels
mattheliu Feb 25, 2026
6cf3946
[Fix] Fix unfair fused_rope benchmark: add warmup and clone for Pytho…
mattheliu Feb 25, 2026
99ce6b0
[Fix] Normalize partial output in decode attention stage1
mattheliu Feb 25, 2026
0cd17a7
[Perf] Replace Triton fused_rope with Paddle native ops in V100 backend
mattheliu Feb 25, 2026
8c78dfd
[Fix] Fix NaN corruption in V100 decode attention from uninitialized …
mattheliu Feb 26, 2026
7574537
[Fix] Add Python fallback env var and fix Qwen3 OOM in dummy run on V100
mattheliu Feb 26, 2026
8cd4702
[Debug] Add CUDA sync between KV cache write and attention read
mattheliu Feb 26, 2026
9d92bf8
[Fix] Fix V100 Triton extend_attention hang and validate correctness …
mattheliu Feb 26, 2026
cc877d9
[Perf] Hybrid V100 attention: cuBLAS SDPA for decode+prefill, avoid T…
mattheliu Feb 27, 2026
b7697f0
[Perf][OP] Add CUDA C++ decode attention kernel for V100, replacing T…
mattheliu Feb 28, 2026
d7b8689
[Cleanup] Remove ~1400 lines of dead Triton code from V100 attention …
mattheliu Feb 28, 2026
3bf4a24
[BugFix] Fix triton_ops __init__.py importing 5 deleted V100 functions
mattheliu Feb 28, 2026
8646b7b
[Cleanup] Remove ~145 lines of dead code from V100 attention backend
mattheliu Feb 28, 2026
0446156
[Fix] Add head_dim assert, unify FP8 fallback, cleanup test utils
mattheliu Feb 28, 2026
dd73b53
[Fix] Revert check_fp8_support to inline per-file (tests/ has no __in…
mattheliu Feb 28, 2026
23c00a5
[Fix] Protect V100 backend import with try-except in attention __init__
mattheliu Mar 2, 2026
dfb674e
[Fix] Exclude BF16 Marlin kernels for SM70/SM75 builds
mattheliu Mar 2, 2026
b48535d
[Fix] Disable BF16 Marlin MoE dispatch for SM70/SM75 builds
mattheliu Mar 2, 2026
82fe229
Revert "[Fix] Disable BF16 Marlin MoE dispatch for SM70/SM75 builds"
mattheliu Mar 3, 2026
f2adbb7
Revert "[Fix] Exclude BF16 Marlin kernels for SM70/SM75 builds"
mattheliu Mar 3, 2026
39b5f8b
[BugFix] Fix copy-paste error in moe_reduce.cu: FLOAT16 case incorrec…
mattheliu Mar 3, 2026
b36c981
[BugFix] Fix 3 P0 crash issues for V100 (SM70) deployment
mattheliu Mar 6, 2026
f4fa104
[OP][V100] Add fused RoPE + KV cache write CUDA kernel for SM70
mattheliu Mar 10, 2026
cb53fcf
[BugFix][V100] Fix v100_rope_write_cache inplace interface and attent…
mattheliu Mar 10, 2026
23bf485
[BugFix][V100] Fix greedy sampling (temperature=0) producing random o…
mattheliu Mar 30, 2026
da1c6e5
[BugFix][V100] Fix max_tokens=1 returning EOS instead of sampled token
mattheliu Mar 31, 2026
a3657e7
[V100] Runtime bugfixes and CUDA kernel improvements for SM70 deployment
mattheliu Apr 1, 2026
f93adbe
[V100] Apply upstream-compatible conflict resolutions for SM70
mattheliu Apr 1, 2026
d843923
[V100] Fix pre-commit code style issues
mattheliu Apr 1, 2026
03b4fb9
[V100] Restore try-except ImportError guard in append_attention.py
mattheliu Apr 1, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
202 changes: 46 additions & 156 deletions custom_ops/gpu_ops/cpp_extensions.cc
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ void FlashAttentionMask(const paddle::Tensor& q_input,
const int kv_head_num,
const int head_dim);

#ifdef ENABLE_APPEND_ATTENTION
std::vector<paddle::Tensor> AppendAttention(
const paddle::Tensor& qkv,
const paddle::Tensor& key_cache,
Expand Down Expand Up @@ -229,6 +230,7 @@ std::vector<paddle::Tensor> PreCacheLenConcat(
const paddle::Tensor& seq_lens_this_time,
const int max_dec_len,
const int block_size);
#endif // ENABLE_APPEND_ATTENTION

paddle::Tensor FusedExpertMoeFunc(
const paddle::Tensor& input,
Expand Down Expand Up @@ -312,13 +314,6 @@ std::vector<paddle::Tensor> EPMoeExpertDispatchFP8(
const bool use_in_ep,
const int token_nums_this_rank_padded);

std::vector<paddle::Tensor> PerTokenQuant(paddle::Tensor& input,
const int block_size,
const bool use_ue8m0);
std::vector<paddle::Tensor> PerTokenQuantPadding(paddle::Tensor& input,
const int block_size,
const bool use_ue8m0);

std::vector<paddle::Tensor> FusedMaskSwigluFP8Quant(
paddle::Tensor& input,
paddle::Tensor& token_nums_per_expert,
Expand Down Expand Up @@ -401,6 +396,7 @@ paddle::Tensor OpenShmAndGetMetaSignalFunc(const int rank,
paddle::Tensor InitSignalLayerwiseFunc(const paddle::Tensor& kv_signal_metadata,
const int layer_id);

#ifdef ENABLE_APPEND_ATTENTION
void GetBlockShapeAndSplitKVBlock(
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& seq_lens_decoder,
Expand All @@ -421,6 +417,7 @@ void GetBlockShapeAndSplitKVBlock(
const int decoder_block_shape_q,
const int group_size,
const int block_size);
#endif // ENABLE_APPEND_ATTENTION

std::vector<paddle::Tensor> GetPaddingOffset(
const paddle::Tensor& input_ids,
Expand Down Expand Up @@ -764,40 +761,21 @@ std::vector<paddle::Tensor> SpeculateGetSeqLensOutput(
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& seq_lens_decoder);

std::vector<paddle::Tensor> SpeculatePreProcess(
const int64_t cpu_token_num,
const paddle::Tensor& input_ids,
const paddle::Tensor& seq_len,
const paddle::Tensor& draft_tokens,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& seq_lens_decoder);

std::vector<paddle::Tensor> BuildSamplingParams(
const paddle::Tensor& top_p,
const paddle::Tensor& top_k,
paddle::Tensor& infer_seed,
const paddle::Tensor& seq_lens_this_time,
const paddle::Tensor& cu_seqlens_q_output,
const int64_t token_num_output_cpu,
const int64_t increment_value);

void SpecTokenPenaltyMultiScores(
const paddle::Tensor& token_ids_all,
const paddle::Tensor& prompt_lens,
const paddle::Tensor& logits,
const paddle::Tensor& penalty_scores,
const paddle::Tensor& frequency_scores,
const paddle::Tensor& presence_scores,
const paddle::Tensor& temperatures,
const paddle::Tensor& bad_tokens,
const paddle::Tensor& bad_tokens_len,
const paddle::Tensor& cur_len,
const paddle::Tensor& min_len,
const paddle::Tensor& eos_token_id,
const paddle::Tensor& seq_lens_this_time,
const paddle::Tensor& batch_id_per_token_output,
const paddle::Tensor& cu_seqlens_q_output,
const int max_seq_len);
void SpecTokenPenaltyMultiScores(const paddle::Tensor& pre_ids,
const paddle::Tensor& logits,
const paddle::Tensor& penalty_scores,
const paddle::Tensor& frequency_scores,
const paddle::Tensor& presence_scores,
const paddle::Tensor& temperatures,
const paddle::Tensor& bad_tokens,
const paddle::Tensor& bad_tokens_len,
const paddle::Tensor& cur_len,
const paddle::Tensor& min_len,
const paddle::Tensor& eos_token_id,
const paddle::Tensor& seq_lens_this_time,
const paddle::Tensor& output_padding_offset,
const paddle::Tensor& output_cum_offsets,
const int max_seq_len);

void SpecGetStopFlagsMultiSeqs(const paddle::Tensor& accept_tokens,
const paddle::Tensor& accept_num,
Expand Down Expand Up @@ -848,7 +826,7 @@ void SpeculateVerify(const paddle::Tensor& sampled_token_ids,
const paddle::Tensor& max_dec_len,
const paddle::Tensor& end_tokens,
const paddle::Tensor& is_block_step,
const paddle::Tensor& cu_seqlens_q_output,
const paddle::Tensor& output_cum_offsets,
const paddle::Tensor& actual_candidate_len,
const paddle::Tensor& actual_draft_token_nums,
const paddle::Tensor& topp,
Expand Down Expand Up @@ -992,7 +970,7 @@ void DraftModelUpdate(const paddle::Tensor& inter_next_tokens,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& seq_lens_decoder,
const paddle::Tensor& step_idx,
const paddle::Tensor& cu_seqlens_q_output,
const paddle::Tensor& output_cum_offsets,
const paddle::Tensor& stop_flags,
const paddle::Tensor& not_need_stop,
const paddle::Tensor& max_dec_len,
Expand Down Expand Up @@ -1167,85 +1145,26 @@ std::vector<paddle::Tensor> FusedNeoxRopeEmbedding(

std::vector<paddle::Tensor> GeluTanh(paddle::Tensor& input);

void ReasoningPhaseTokenConstraint(
const paddle::Tensor& logits,
const paddle::Tensor& token_ids_all,
const paddle::Tensor& prompt_lens,
const paddle::Tensor& stop_flags,
const paddle::Tensor& seq_lens_this_time,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& step_idx,
const paddle::Tensor& allowed_tokens,
const paddle::Tensor& reasoning_status,
const paddle::Tensor& batch_id_per_token_output,
const paddle::Tensor& cu_seqlens_q_output,
const paddle::Tensor& enable_thinking,
int64_t think_end_id,
int64_t line_break_id);
void ReasoningPhaseTokenConstraint(const paddle::Tensor& logits,
const paddle::Tensor& pre_ids,
const paddle::Tensor& stop_flags,
const paddle::Tensor& seq_lens_this_time,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& step_idx,
const paddle::Tensor& allowed_tokens,
const paddle::Tensor& reasoning_status,
const paddle::Tensor& output_padding_offset,
const paddle::Tensor& output_cum_offsets,
const paddle::Tensor& enable_thinking,
int64_t think_end_id,
int64_t line_break_id);

std::vector<paddle::Tensor> get_attn_mask_q(
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& cu_seqlens_k,
const paddle::optional<paddle::Tensor>& attn_mask_kv,
const int kv_token_num);

std::vector<paddle::Tensor> PrefillPermuteToMaskedGemm(
const paddle::Tensor& x,
const paddle::Tensor& scale,
const paddle::Tensor& topk_ids,
const int num_local_experts,
const int max_token_num);

std::vector<paddle::Tensor> DepermutePrefillCombine(
const paddle::Tensor& x,
const paddle::Tensor& indice_map,
const paddle::Tensor& topk_weights,
const int num_worst_tokens);

void RadixTopkRaggedTransform(
paddle::Tensor& input,
paddle::Tensor& output_indices,
const paddle::Tensor& offsets,
paddle::Tensor& lengths,
paddle::optional<paddle::Tensor>& seq_len_decoder,
paddle::optional<paddle::Tensor>& batch_id_per_token,
paddle::optional<paddle::Tensor>& block_tables,
paddle::optional<paddle::Tensor>& maybe_row_states_buffer,
int max_block_num,
int top_k,
int q_num_heads = 0);

std::vector<paddle::Tensor> DSMLAWriteCacheKernel(
const paddle::Tensor& kv_nope,
const paddle::Tensor& kv_pe,
const paddle::Tensor& kv_cache,
const paddle::Tensor& slot_mapping,
const paddle::optional<paddle::Tensor>& scale,
const std::string& cache_quant_type_str);

std::vector<paddle::Tensor> IndexerKQuantAndCacheKernel(
const paddle::Tensor& k,
const paddle::Tensor& kv_cache,
const paddle::Tensor& slot_mapping,
const int64_t quant_block_size,
const std::string& scale_fmt);

std::vector<paddle::Tensor> CpGatherIndexerKQuantCacheKernel(
const paddle::Tensor& kv_cache,
paddle::Tensor& dst_k,
paddle::Tensor& dst_scale,
const paddle::Tensor& block_table,
const paddle::Tensor& cu_seq_lens);

void PerTokenGroupQuantFp8(const paddle::Tensor& input,
paddle::Tensor& output_q,
paddle::Tensor& output_s,
int64_t group_size,
double eps,
double fp8_min,
double fp8_max,
bool scale_ue8m0);

PYBIND11_MODULE(fastdeploy_ops, m) {
#ifdef ENABLE_SM80_EXT_OPS
m.def("get_expert_token_num",
Expand Down Expand Up @@ -1296,7 +1215,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
py::arg("wait_flag"),
"get_output_kv_signal function");

#ifdef ENABLE_SM75_EXT_OPS
#ifdef ENABLE_BF16
m.def("moe_deepgemm_permute", &MoEDeepGEMMPermute, "MoEDeepGEMMPermute");
m.def(
"moe_deepgemm_depermute", &MoEDeepGEMMDePermute, "MoEDeepGEMMDePermute");
Expand All @@ -1314,7 +1233,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
m.def(
"cuda_host_free", &cuda_host_free, "Free pinned memory", py::arg("ptr"));
py::register_exception<CudaError>(m, "CudaError");
#ifdef ENABLE_SM80_EXT_OPS
#ifdef ENABLE_APPEND_ATTENTION
/**
* append_attention.cu
* append_attention
Expand Down Expand Up @@ -1344,7 +1263,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
m.def("pre_cache_len_concat",
&PreCacheLenConcat,
"pre_cache len concat function");

#endif // ENABLE_APPEND_ATTENTION
/**
* moe/fused_moe/fused_moe.cu
* fused_moe
Expand Down Expand Up @@ -1374,7 +1293,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
"moe export dispatch function");

/**
* moe/fused_moe/ep_moe_prefill_func.cu
* moe/ep_moe_expert_dispatch.cu
* ep_moe_dispatch
*/
m.def("ep_moe_expert_dispatch",
Expand Down Expand Up @@ -1402,20 +1321,6 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
"ep moe export combine function");
#endif

m.def("per_token_quant",
&PerTokenQuant,
py::arg("input"),
py::arg("block_size"),
py::arg("use_ue8m0"),
"per token per block quant");

m.def("per_token_quant_padding",
&PerTokenQuantPadding,
py::arg("input"),
py::arg("block_size"),
py::arg("use_ue8m0"),
"per token per block quant and padding transpose scale");

m.def("fused_mask_swiglu_fp8_quant",
&FusedMaskSwigluFP8Quant,
py::arg("input"),
Expand Down Expand Up @@ -1523,15 +1428,15 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
&OpenShmAndGetMetaSignalFunc,
"open_shm_and_get_meta_signal function");

#ifdef ENABLE_SM80_EXT_OPS
#ifdef ENABLE_APPEND_ATTENTION
/**
* append_attn/get_block_shape_and_split_kv_block.cu
* get_block_shape_and_split_kv_block
*/
m.def("get_block_shape_and_split_kv_block",
&GetBlockShapeAndSplitKVBlock,
"get_block_shape_and_split_kv_block function");
#endif
#endif // ENABLE_APPEND_ATTENTION

/**
* get_padding_offset.cu
Expand Down Expand Up @@ -1597,11 +1502,13 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
&TextImageGatherScatter,
"text_image_gather_scatter function");

#ifdef ENABLE_SM80_EXT_OPS
// tritonmoe_preprocess_func does not depend on BF16, keep it unconditionally
// available
m.def("count_tokens_per_expert_func", &count_tokens_per_expert_func);

m.def("tritonmoe_preprocess_func", &tritonmoe_preprocess_kernel);

#ifdef ENABLE_BF16
m.def("MoeWna16MarlinGemmApi",
&MoeWna16MarlinGemmApi,
py::arg("a"),
Expand Down Expand Up @@ -1697,6 +1604,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
m.def("noaux_tc_redundant",
&NoauxTcRedundant,
"noaux_tc_redundant for MoE compute");
#endif

#ifdef ENABLE_FP8
m.def("cutlass_fp8_fp8_half_gemm_fused",
Expand All @@ -1710,6 +1618,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
py::arg("output_dtype"),
py::arg("activation_type"),
"cutlass_fp8_fp8_half_gemm_fused function");

m.def("moe_fused_hadamard_quant_fp8",
&MoeFusedHadamardQuantFp8Func,
py::arg("input"),
Expand Down Expand Up @@ -1756,19 +1665,10 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
&get_graph_buffer_ipc_meta,
"get_graph_buffer_ipc_meta");

#ifdef ENABLE_SM80_EXT_OPS
m.def("speculate_get_seq_lens_output",
&SpeculateGetSeqLensOutput,
"speculate_get_seq_lens_output function");

m.def("speculate_pre_process",
&SpeculatePreProcess,
"speculate_pre_process function");

m.def("build_sampling_params",
&BuildSamplingParams,
"build_sampling_params function");

m.def("speculate_get_token_penalty_multi_scores",
&SpecTokenPenaltyMultiScores,
"speculate_get_token_penalty_multi_scores function");
Expand Down Expand Up @@ -1893,17 +1793,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {

m.def("get_attn_mask_q", &get_attn_mask_q, "get_attn_mask_q function");

m.def("custom_numpy_to_tensor",
&CustomNumpyToTensor,
"custom_numpy_to_tensor function");
m.def("prefill_permute_to_masked_gemm",
&PrefillPermuteToMaskedGemm,
py::arg("x"),
py::arg("scale"),
py::arg("topk_ids"),
py::arg("num_local_experts"),
py::arg("max_token_num"),
"Prefill permute to masked GEMM for MoE");
m.def("get_stop", &GetStop, "get_stop function");

m.def("depermute_prefill_combine",
&DepermutePrefillCombine,
Expand Down
9 changes: 7 additions & 2 deletions custom_ops/gpu_ops/gelu_tanh.cu
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,20 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include <math.h>
#include "helper.h"
#include "paddle/extension.h"

#ifndef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU
__forceinline__ __device__ float tanh_ptx(float x) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 750
// Use hardware tanh instruction for sm_75 and above
float y;
asm volatile("tanh.approx.f32 %0, %1;" : "=f"(y) : "f"(x));
return y;
#else
// Fallback implementation for sm_70 and below
return tanhf(x);
#endif
}
#endif

Expand Down Expand Up @@ -89,7 +94,7 @@ std::vector<paddle::Tensor> GeluTanh(paddle::Tensor& input) {
DISPATCH_FLOAT_FP6_DTYPE(input.dtype(), scalar_t, {
uint32_t vec_size = 16 / sizeof(scalar_t);
dim3 grid(num_tokens);
dim3 block(std::max(d / vec_size, 1024U));
dim3 block(std::min(d / vec_size, 1024U));

#ifdef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU
gelu_tanh_kernel<scalar_t><<<grid, block, 0, stream>>>(
Expand Down
Loading
Loading