// Patch summary: adds support for the AArch64 NEON pairwise add-long intrinsics
// (`llvm.aarch64.neon.saddlp.*` and `llvm.aarch64.neon.uaddlp.*`).
//
// * example/neon.rs: twelve new `test_vpaddl*` functions, each comparing the result of a
//   `vpaddl_*` / `vpaddlq_*` call against a hard-coded expected vector, plus the matching
//   calls from `main`.
// * src/intrinsics/llvm_aarch64.rs: two new match arms inside
//   `codegen_aarch64_llvm_intrinsic_call`. For every result lane `i` they load input lanes
//   `2 * i` and `2 * i + 1`, widen both to the result lane type (`sextend` for `saddlp`,
//   `uextend` for `uaddlp`), add them with `iadd`, and write the sum to result lane `i`.
//
// These leading lines sit before the first `diff --git` header and belong to no hunk.
diff --git a/example/neon.rs b/example/neon.rs
index af10c57c19..1aec5badcb 100644
--- a/example/neon.rs
+++ b/example/neon.rs
@@ -415,6 +415,116 @@ unsafe fn test_vqdmulhq_s32() {
     assert_eq!(r, e);
 }
 
+#[cfg(target_arch = "aarch64")]
+unsafe fn test_vpaddl_s8() {
+    // AArch64 llvm intrinsic: llvm.aarch64.neon.saddlp.v4i16.v8i8
+    let a = i8x8::from([1, 2, 3, 4, -5, -6, -7, -8]);
+    let e = i16x4::from([3, 7, -11, -15]);
+    let r: i16x4 = unsafe { transmute(vpaddl_s8(transmute(a))) };
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+unsafe fn test_vpaddl_s16() {
+    // AArch64 llvm intrinsic: llvm.aarch64.neon.saddlp.v2i32.v4i16
+    let a = i16x4::from([1, 2, -3, -4]);
+    let e = i32x2::from([3, -7]);
+    let r: i32x2 = unsafe { transmute(vpaddl_s16(transmute(a))) };
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+unsafe fn test_vpaddl_s32() {
+    // AArch64 llvm intrinsic: llvm.aarch64.neon.saddlp.v1i64.v2i32
+    let a = i32x2::from([1, -2]);
+    let e = i64x1::from([-1]);
+    let r: i64x1 = unsafe { transmute(vpaddl_s32(transmute(a))) };
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+unsafe fn test_vpaddlq_s8() {
+    // AArch64 llvm intrinsic: llvm.aarch64.neon.saddlp.v8i16.v16i8
+    let a = i8x16::from([1, 2, 3, 4, 5, 6, 7, 8, -9, -10, -11, -12, -13, -14, -15, -16]);
+    let e = i16x8::from([3, 7, 11, 15, -19, -23, -27, -31]);
+    let r: i16x8 = unsafe { transmute(vpaddlq_s8(transmute(a))) };
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+unsafe fn test_vpaddlq_s16() {
+    // AArch64 llvm intrinsic: llvm.aarch64.neon.saddlp.v4i32.v8i16
+    let a = i16x8::from([1, 2, 3, 4, -5, -6, -7, -8]);
+    let e = i32x4::from([3, 7, -11, -15]);
+    let r: i32x4 = unsafe { transmute(vpaddlq_s16(transmute(a))) };
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+unsafe fn test_vpaddlq_s32() {
+    // AArch64 llvm intrinsic: llvm.aarch64.neon.saddlp.v2i64.v4i32
+    let a = i32x4::from([1, 2, -3, -4]);
+    let e = i64x2::from([3, -7]);
+    let r: i64x2 = unsafe { transmute(vpaddlq_s32(transmute(a))) };
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+unsafe fn test_vpaddl_u8() {
+    // AArch64 llvm intrinsic: llvm.aarch64.neon.uaddlp.v4i16.v8i8
+    let a = u8x8::from([255, 254, 253, 252, 251, 250, 249, 248]);
+    let e = u16x4::from([509, 505, 501, 497]);
+    let r: u16x4 = unsafe { transmute(vpaddl_u8(transmute(a))) };
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+unsafe fn test_vpaddl_u16() {
+    // AArch64 llvm intrinsic: llvm.aarch64.neon.uaddlp.v2i32.v4i16
+    let a = u16x4::from([65535, 65534, 65533, 65532]);
+    let e = u32x2::from([131069, 131065]);
+    let r: u32x2 = unsafe { transmute(vpaddl_u16(transmute(a))) };
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+unsafe fn test_vpaddl_u32() {
+    // AArch64 llvm intrinsic: llvm.aarch64.neon.uaddlp.v1i64.v2i32
+    let a = u32x2::from([4294967295, 4294967294]);
+    let e = u64x1::from([8589934589]);
+    let r: u64x1 = unsafe { transmute(vpaddl_u32(transmute(a))) };
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+unsafe fn test_vpaddlq_u8() {
+    // AArch64 llvm intrinsic: llvm.aarch64.neon.uaddlp.v8i16.v16i8
+    let a = u8x16::from([
+        255, 254, 253, 252, 251, 250, 249, 248, 247, 246, 245, 244, 243, 242, 241, 240,
+    ]);
+    let e = u16x8::from([509, 505, 501, 497, 493, 489, 485, 481]);
+    let r: u16x8 = unsafe { transmute(vpaddlq_u8(transmute(a))) };
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+unsafe fn test_vpaddlq_u16() {
+    // AArch64 llvm intrinsic: llvm.aarch64.neon.uaddlp.v4i32.v8i16
+    let a = u16x8::from([65535, 65534, 65533, 65532, 65531, 65530, 65529, 65528]);
+    let e = u32x4::from([131069, 131065, 131061, 131057]);
+    let r: u32x4 = unsafe { transmute(vpaddlq_u16(transmute(a))) };
+    assert_eq!(r, e);
+}
+
+#[cfg(target_arch = "aarch64")]
+unsafe fn test_vpaddlq_u32() {
+    // AArch64 llvm intrinsic: llvm.aarch64.neon.uaddlp.v2i64.v4i32
+    let a = u32x4::from([4294967295, 4294967294, 4294967293, 4294967292]);
+    let e = u64x2::from([8589934589, 8589934585]);
+    let r: u64x2 = unsafe { transmute(vpaddlq_u32(transmute(a))) };
+    assert_eq!(r, e);
+}
+
 #[cfg(target_arch = "aarch64")]
 fn main() {
     unsafe {
@@ -467,6 +577,20 @@ fn main() {
         test_vqdmulh_s32();
         test_vqdmulhq_s16();
         test_vqdmulhq_s32();
+
+        test_vpaddl_s8();
+        test_vpaddl_s16();
+        test_vpaddl_s32();
+        test_vpaddlq_s8();
+        test_vpaddlq_s16();
+        test_vpaddlq_s32();
+
+        test_vpaddl_u8();
+        test_vpaddl_u16();
+        test_vpaddl_u32();
+        test_vpaddlq_u8();
+        test_vpaddlq_u16();
+        test_vpaddlq_u32();
     }
 }
 
diff --git a/src/intrinsics/llvm_aarch64.rs b/src/intrinsics/llvm_aarch64.rs
index d2403e079a..9da87a5774 100644
--- a/src/intrinsics/llvm_aarch64.rs
+++ b/src/intrinsics/llvm_aarch64.rs
@@ -823,6 +823,56 @@ pub(super) fn codegen_aarch64_llvm_intrinsic_call<'tcx>(
             );
         }
 
+        "llvm.aarch64.neon.saddlp.v1i64.v2i32"
+        | "llvm.aarch64.neon.saddlp.v2i32.v4i16"
+        | "llvm.aarch64.neon.saddlp.v2i64.v4i32"
+        | "llvm.aarch64.neon.saddlp.v4i16.v8i8"
+        | "llvm.aarch64.neon.saddlp.v4i32.v8i16"
+        | "llvm.aarch64.neon.saddlp.v8i16.v16i8" => {
+            // https://developer.arm.com/documentation/ddi0602/2026-03/SIMD-FP-Instructions/SADDLP--Signed-add-long-pairwise-
+            intrinsic_args!(fx, args => (a); intrinsic);
+
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            let ret_lane_layout = fx.layout_of(ret_lane_ty);
+            let wide_ty = fx.clif_type(ret_lane_ty).unwrap();
+
+            for lane_idx in 0..ret_lane_count {
+                let base = lane_idx * 2;
+                let a_lane0 = a.value_lane(fx, base).load_scalar(fx);
+                let a_lane1 = a.value_lane(fx, base + 1).load_scalar(fx);
+                let a_lane0 = fx.bcx.ins().sextend(wide_ty, a_lane0);
+                let a_lane1 = fx.bcx.ins().sextend(wide_ty, a_lane1);
+                let sum = fx.bcx.ins().iadd(a_lane0, a_lane1);
+                let res_lane = CValue::by_val(sum, ret_lane_layout);
+                ret.place_lane(fx, lane_idx).write_cvalue(fx, res_lane);
+            }
+        }
+
+        "llvm.aarch64.neon.uaddlp.v1i64.v2i32"
+        | "llvm.aarch64.neon.uaddlp.v2i32.v4i16"
+        | "llvm.aarch64.neon.uaddlp.v2i64.v4i32"
+        | "llvm.aarch64.neon.uaddlp.v4i16.v8i8"
+        | "llvm.aarch64.neon.uaddlp.v4i32.v8i16"
+        | "llvm.aarch64.neon.uaddlp.v8i16.v16i8" => {
+            // https://developer.arm.com/documentation/ddi0602/2026-03/SIMD-FP-Instructions/UADDLP--Unsigned-add-long-pairwise-
+            intrinsic_args!(fx, args => (a); intrinsic);
+
+            let (ret_lane_count, ret_lane_ty) = ret.layout().ty.simd_size_and_type(fx.tcx);
+            let ret_lane_layout = fx.layout_of(ret_lane_ty);
+            let wide_ty = fx.clif_type(ret_lane_ty).unwrap();
+
+            for lane_idx in 0..ret_lane_count {
+                let base = lane_idx * 2;
+                let a_lane0 = a.value_lane(fx, base).load_scalar(fx);
+                let a_lane1 = a.value_lane(fx, base + 1).load_scalar(fx);
+                let a_lane0 = fx.bcx.ins().uextend(wide_ty, a_lane0);
+                let a_lane1 = fx.bcx.ins().uextend(wide_ty, a_lane1);
+                let sum = fx.bcx.ins().iadd(a_lane0, a_lane1);
+                let res_lane = CValue::by_val(sum, ret_lane_layout);
+                ret.place_lane(fx, lane_idx).write_cvalue(fx, res_lane);
+            }
+        }
+
         _ => {
             fx.tcx.dcx().warn(format!(
                 "unsupported AArch64 llvm intrinsic {}; replacing with trap",