diff --git a/fearless_simd/src/generated.rs b/fearless_simd/src/generated.rs index 4695d9f7..41d7eeec 100644 --- a/fearless_simd/src/generated.rs +++ b/fearless_simd/src/generated.rs @@ -56,6 +56,7 @@ mod fallback; #[cfg(target_arch = "aarch64")] mod neon; mod ops; +mod scalar; mod simd_trait; mod simd_types; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] @@ -68,6 +69,7 @@ pub use avx2::*; pub use fallback::*; #[cfg(target_arch = "aarch64")] pub use neon::*; +pub use scalar::*; pub use simd_trait::*; pub use simd_types::*; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] diff --git a/fearless_simd/src/generated/scalar.rs b/fearless_simd/src/generated/scalar.rs new file mode 100644 index 00000000..9d980421 --- /dev/null +++ b/fearless_simd/src/generated/scalar.rs @@ -0,0 +1,6503 @@ +// Copyright 2025 the Fearless_SIMD Authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +// This file is autogenerated by fearless_simd_gen + +use crate::{Level, Simd, SimdInto, seal::Seal}; +use crate::{ + f32x4, f32x8, f32x16, f64x2, f64x4, f64x8, i8x16, i8x32, i8x64, i16x8, i16x16, i16x32, i32x4, + i32x8, i32x16, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16, mask16x32, mask32x4, + mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64, u16x8, u16x16, u16x32, + u32x4, u32x8, u32x16, +}; +use core::ops::*; +#[derive(Debug, Copy, Clone)] +pub struct Scalar; +impl Scalar { + #[inline] + pub const fn new() -> Self { + Scalar + } +} +impl Simd for Scalar { + type f32s = f32; + type u8s = u8; + type i8s = i8; + type u16s = u16; + type i16s = i16; + type u32s = u32; + type i32s = i32; + type mask8s = i8; + type mask16s = i16; + type mask32s = i32; + #[inline(always)] + fn level(self) -> Level { + Level::Scalar(self) + } + #[inline] + fn vectorize<F: FnOnce() -> R, R>(self, f: F) -> R { + f() + } + #[inline(always)] + fn splat_f32x4(self, val: f32) -> f32x4 { + [val; 4usize].simd_into(self) + } + #[inline(always)] + fn abs_f32x4(self, a: f32x4) -> f32x4 { + [ + f32::abs(a[0usize]), + f32::abs(a[1usize]), + f32::abs(a[2usize]), + f32::abs(a[3usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn neg_f32x4(self, a: f32x4) -> f32x4 { + [ + f32::neg(a[0usize]), + f32::neg(a[1usize]), + f32::neg(a[2usize]), + f32::neg(a[3usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn sqrt_f32x4(self, a: f32x4) -> f32x4 { + [ + f32::sqrt(a[0usize]), + f32::sqrt(a[1usize]), + f32::sqrt(a[2usize]), + f32::sqrt(a[3usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn add_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { + [ + f32::add(a[0usize], &b[0usize]), + f32::add(a[1usize], &b[1usize]), + f32::add(a[2usize], &b[2usize]), + f32::add(a[3usize], &b[3usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn sub_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { + [ + f32::sub(a[0usize], &b[0usize]), + f32::sub(a[1usize], &b[1usize]), + f32::sub(a[2usize], &b[2usize]), + f32::sub(a[3usize], &b[3usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn mul_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { + [ + f32::mul(a[0usize], &b[0usize]), + f32::mul(a[1usize], &b[1usize]), + f32::mul(a[2usize], &b[2usize]), + f32::mul(a[3usize], &b[3usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn div_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { + [ + f32::div(a[0usize], &b[0usize]), + f32::div(a[1usize], &b[1usize]), + f32::div(a[2usize], &b[2usize]), + f32::div(a[3usize], &b[3usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn copysign_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { + [ + 
f32::copysign(a[0usize], b[0usize]), + f32::copysign(a[1usize], b[1usize]), + f32::copysign(a[2usize], b[2usize]), + f32::copysign(a[3usize], b[3usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_eq_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { + [ + -(f32::eq(&a[0usize], &b[0usize]) as i32), + -(f32::eq(&a[1usize], &b[1usize]) as i32), + -(f32::eq(&a[2usize], &b[2usize]) as i32), + -(f32::eq(&a[3usize], &b[3usize]) as i32), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_lt_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { + [ + -(f32::lt(&a[0usize], &b[0usize]) as i32), + -(f32::lt(&a[1usize], &b[1usize]) as i32), + -(f32::lt(&a[2usize], &b[2usize]) as i32), + -(f32::lt(&a[3usize], &b[3usize]) as i32), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_le_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { + [ + -(f32::le(&a[0usize], &b[0usize]) as i32), + -(f32::le(&a[1usize], &b[1usize]) as i32), + -(f32::le(&a[2usize], &b[2usize]) as i32), + -(f32::le(&a[3usize], &b[3usize]) as i32), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_ge_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { + [ + -(f32::ge(&a[0usize], &b[0usize]) as i32), + -(f32::ge(&a[1usize], &b[1usize]) as i32), + -(f32::ge(&a[2usize], &b[2usize]) as i32), + -(f32::ge(&a[3usize], &b[3usize]) as i32), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_gt_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { + [ + -(f32::gt(&a[0usize], &b[0usize]) as i32), + -(f32::gt(&a[1usize], &b[1usize]) as i32), + -(f32::gt(&a[2usize], &b[2usize]) as i32), + -(f32::gt(&a[3usize], &b[3usize]) as i32), + ] + .simd_into(self) + } + #[inline(always)] + fn zip_low_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { + [a[0usize], b[0usize], a[1usize], b[1usize]].simd_into(self) + } + #[inline(always)] + fn zip_high_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { + [a[2usize], b[2usize], a[3usize], b[3usize]].simd_into(self) + } + #[inline(always)] + fn unzip_low_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { + [a[0usize], a[2usize], b[0usize], b[2usize]].simd_into(self) + } + #[inline(always)] + fn unzip_high_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { + [a[1usize], a[3usize], b[1usize], b[3usize]].simd_into(self) + } + #[inline(always)] + fn max_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { + [ + f32::max(a[0usize], b[0usize]), + f32::max(a[1usize], b[1usize]), + f32::max(a[2usize], b[2usize]), + f32::max(a[3usize], b[3usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn max_precise_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { + [ + f32::max(a[0usize], b[0usize]), + f32::max(a[1usize], b[1usize]), + f32::max(a[2usize], b[2usize]), + f32::max(a[3usize], b[3usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn min_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { + [ + f32::min(a[0usize], b[0usize]), + f32::min(a[1usize], b[1usize]), + f32::min(a[2usize], b[2usize]), + f32::min(a[3usize], b[3usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn min_precise_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { + [ + f32::min(a[0usize], b[0usize]), + f32::min(a[1usize], b[1usize]), + f32::min(a[2usize], b[2usize]), + f32::min(a[3usize], b[3usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn madd_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 { + a.mul(b).add(c) + } + #[inline(always)] + fn msub_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 { + a.mul(b).sub(c) + } + #[inline(always)] + fn floor_f32x4(self, a: f32x4) -> f32x4 { + [ + f32::floor(a[0usize]), + f32::floor(a[1usize]), + f32::floor(a[2usize]), + 
f32::floor(a[3usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn fract_f32x4(self, a: f32x4) -> f32x4 { + [ + f32::fract(a[0usize]), + f32::fract(a[1usize]), + f32::fract(a[2usize]), + f32::fract(a[3usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn trunc_f32x4(self, a: f32x4) -> f32x4 { + [ + f32::trunc(a[0usize]), + f32::trunc(a[1usize]), + f32::trunc(a[2usize]), + f32::trunc(a[3usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn select_f32x4(self, a: mask32x4, b: f32x4, c: f32x4) -> f32x4 { + [ + if a[0usize] != 0 { b[0usize] } else { c[0usize] }, + if a[1usize] != 0 { b[1usize] } else { c[1usize] }, + if a[2usize] != 0 { b[2usize] } else { c[2usize] }, + if a[3usize] != 0 { b[3usize] } else { c[3usize] }, + ] + .simd_into(self) + } + #[inline(always)] + fn combine_f32x4(self, a: f32x4, b: f32x4) -> f32x8 { + let mut result = [0.0; 8usize]; + result[0..4usize].copy_from_slice(&a.val); + result[4usize..8usize].copy_from_slice(&b.val); + result.simd_into(self) + } + #[inline(always)] + fn reinterpret_f64_f32x4(self, a: f32x4) -> f64x2 { + f64x2 { + val: bytemuck::cast(a.val), + simd: a.simd, + } + } + #[inline(always)] + fn reinterpret_i32_f32x4(self, a: f32x4) -> i32x4 { + i32x4 { + val: bytemuck::cast(a.val), + simd: a.simd, + } + } + #[inline(always)] + fn reinterpret_u8_f32x4(self, a: f32x4) -> u8x16 { + u8x16 { + val: bytemuck::cast(a.val), + simd: a.simd, + } + } + #[inline(always)] + fn reinterpret_u32_f32x4(self, a: f32x4) -> u32x4 { + u32x4 { + val: bytemuck::cast(a.val), + simd: a.simd, + } + } + #[inline(always)] + fn cvt_u32_f32x4(self, a: f32x4) -> u32x4 { + [ + a[0usize] as u32, + a[1usize] as u32, + a[2usize] as u32, + a[3usize] as u32, + ] + .simd_into(self) + } + #[inline(always)] + fn cvt_i32_f32x4(self, a: f32x4) -> i32x4 { + [ + a[0usize] as i32, + a[1usize] as i32, + a[2usize] as i32, + a[3usize] as i32, + ] + .simd_into(self) + } + #[inline(always)] + fn splat_i8x16(self, val: i8) -> i8x16 { + [val; 16usize].simd_into(self) + } + #[inline(always)] + fn not_i8x16(self, a: i8x16) -> i8x16 { + [ + i8::not(a[0usize]), + i8::not(a[1usize]), + i8::not(a[2usize]), + i8::not(a[3usize]), + i8::not(a[4usize]), + i8::not(a[5usize]), + i8::not(a[6usize]), + i8::not(a[7usize]), + i8::not(a[8usize]), + i8::not(a[9usize]), + i8::not(a[10usize]), + i8::not(a[11usize]), + i8::not(a[12usize]), + i8::not(a[13usize]), + i8::not(a[14usize]), + i8::not(a[15usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn add_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + [ + i8::wrapping_add(a[0usize], b[0usize]), + i8::wrapping_add(a[1usize], b[1usize]), + i8::wrapping_add(a[2usize], b[2usize]), + i8::wrapping_add(a[3usize], b[3usize]), + i8::wrapping_add(a[4usize], b[4usize]), + i8::wrapping_add(a[5usize], b[5usize]), + i8::wrapping_add(a[6usize], b[6usize]), + i8::wrapping_add(a[7usize], b[7usize]), + i8::wrapping_add(a[8usize], b[8usize]), + i8::wrapping_add(a[9usize], b[9usize]), + i8::wrapping_add(a[10usize], b[10usize]), + i8::wrapping_add(a[11usize], b[11usize]), + i8::wrapping_add(a[12usize], b[12usize]), + i8::wrapping_add(a[13usize], b[13usize]), + i8::wrapping_add(a[14usize], b[14usize]), + i8::wrapping_add(a[15usize], b[15usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn sub_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + [ + i8::wrapping_sub(a[0usize], b[0usize]), + i8::wrapping_sub(a[1usize], b[1usize]), + i8::wrapping_sub(a[2usize], b[2usize]), + i8::wrapping_sub(a[3usize], b[3usize]), + i8::wrapping_sub(a[4usize], b[4usize]), + 
i8::wrapping_sub(a[5usize], b[5usize]), + i8::wrapping_sub(a[6usize], b[6usize]), + i8::wrapping_sub(a[7usize], b[7usize]), + i8::wrapping_sub(a[8usize], b[8usize]), + i8::wrapping_sub(a[9usize], b[9usize]), + i8::wrapping_sub(a[10usize], b[10usize]), + i8::wrapping_sub(a[11usize], b[11usize]), + i8::wrapping_sub(a[12usize], b[12usize]), + i8::wrapping_sub(a[13usize], b[13usize]), + i8::wrapping_sub(a[14usize], b[14usize]), + i8::wrapping_sub(a[15usize], b[15usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn mul_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + [ + i8::wrapping_mul(a[0usize], b[0usize]), + i8::wrapping_mul(a[1usize], b[1usize]), + i8::wrapping_mul(a[2usize], b[2usize]), + i8::wrapping_mul(a[3usize], b[3usize]), + i8::wrapping_mul(a[4usize], b[4usize]), + i8::wrapping_mul(a[5usize], b[5usize]), + i8::wrapping_mul(a[6usize], b[6usize]), + i8::wrapping_mul(a[7usize], b[7usize]), + i8::wrapping_mul(a[8usize], b[8usize]), + i8::wrapping_mul(a[9usize], b[9usize]), + i8::wrapping_mul(a[10usize], b[10usize]), + i8::wrapping_mul(a[11usize], b[11usize]), + i8::wrapping_mul(a[12usize], b[12usize]), + i8::wrapping_mul(a[13usize], b[13usize]), + i8::wrapping_mul(a[14usize], b[14usize]), + i8::wrapping_mul(a[15usize], b[15usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn and_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + [ + i8::bitand(a[0usize], &b[0usize]), + i8::bitand(a[1usize], &b[1usize]), + i8::bitand(a[2usize], &b[2usize]), + i8::bitand(a[3usize], &b[3usize]), + i8::bitand(a[4usize], &b[4usize]), + i8::bitand(a[5usize], &b[5usize]), + i8::bitand(a[6usize], &b[6usize]), + i8::bitand(a[7usize], &b[7usize]), + i8::bitand(a[8usize], &b[8usize]), + i8::bitand(a[9usize], &b[9usize]), + i8::bitand(a[10usize], &b[10usize]), + i8::bitand(a[11usize], &b[11usize]), + i8::bitand(a[12usize], &b[12usize]), + i8::bitand(a[13usize], &b[13usize]), + i8::bitand(a[14usize], &b[14usize]), + i8::bitand(a[15usize], &b[15usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn or_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + [ + i8::bitor(a[0usize], &b[0usize]), + i8::bitor(a[1usize], &b[1usize]), + i8::bitor(a[2usize], &b[2usize]), + i8::bitor(a[3usize], &b[3usize]), + i8::bitor(a[4usize], &b[4usize]), + i8::bitor(a[5usize], &b[5usize]), + i8::bitor(a[6usize], &b[6usize]), + i8::bitor(a[7usize], &b[7usize]), + i8::bitor(a[8usize], &b[8usize]), + i8::bitor(a[9usize], &b[9usize]), + i8::bitor(a[10usize], &b[10usize]), + i8::bitor(a[11usize], &b[11usize]), + i8::bitor(a[12usize], &b[12usize]), + i8::bitor(a[13usize], &b[13usize]), + i8::bitor(a[14usize], &b[14usize]), + i8::bitor(a[15usize], &b[15usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn xor_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + [ + i8::bitxor(a[0usize], &b[0usize]), + i8::bitxor(a[1usize], &b[1usize]), + i8::bitxor(a[2usize], &b[2usize]), + i8::bitxor(a[3usize], &b[3usize]), + i8::bitxor(a[4usize], &b[4usize]), + i8::bitxor(a[5usize], &b[5usize]), + i8::bitxor(a[6usize], &b[6usize]), + i8::bitxor(a[7usize], &b[7usize]), + i8::bitxor(a[8usize], &b[8usize]), + i8::bitxor(a[9usize], &b[9usize]), + i8::bitxor(a[10usize], &b[10usize]), + i8::bitxor(a[11usize], &b[11usize]), + i8::bitxor(a[12usize], &b[12usize]), + i8::bitxor(a[13usize], &b[13usize]), + i8::bitxor(a[14usize], &b[14usize]), + i8::bitxor(a[15usize], &b[15usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn shr_i8x16(self, a: i8x16, shift: u32) -> i8x16 { + [ + i8::shr(a[0usize], shift as i8), + i8::shr(a[1usize], shift as i8), + 
i8::shr(a[2usize], shift as i8), + i8::shr(a[3usize], shift as i8), + i8::shr(a[4usize], shift as i8), + i8::shr(a[5usize], shift as i8), + i8::shr(a[6usize], shift as i8), + i8::shr(a[7usize], shift as i8), + i8::shr(a[8usize], shift as i8), + i8::shr(a[9usize], shift as i8), + i8::shr(a[10usize], shift as i8), + i8::shr(a[11usize], shift as i8), + i8::shr(a[12usize], shift as i8), + i8::shr(a[13usize], shift as i8), + i8::shr(a[14usize], shift as i8), + i8::shr(a[15usize], shift as i8), + ] + .simd_into(self) + } + #[inline(always)] + fn shrv_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + [ + i8::shr(a[0usize], &b[0usize]), + i8::shr(a[1usize], &b[1usize]), + i8::shr(a[2usize], &b[2usize]), + i8::shr(a[3usize], &b[3usize]), + i8::shr(a[4usize], &b[4usize]), + i8::shr(a[5usize], &b[5usize]), + i8::shr(a[6usize], &b[6usize]), + i8::shr(a[7usize], &b[7usize]), + i8::shr(a[8usize], &b[8usize]), + i8::shr(a[9usize], &b[9usize]), + i8::shr(a[10usize], &b[10usize]), + i8::shr(a[11usize], &b[11usize]), + i8::shr(a[12usize], &b[12usize]), + i8::shr(a[13usize], &b[13usize]), + i8::shr(a[14usize], &b[14usize]), + i8::shr(a[15usize], &b[15usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn shl_i8x16(self, a: i8x16, shift: u32) -> i8x16 { + [ + i8::shl(a[0usize], shift as i8), + i8::shl(a[1usize], shift as i8), + i8::shl(a[2usize], shift as i8), + i8::shl(a[3usize], shift as i8), + i8::shl(a[4usize], shift as i8), + i8::shl(a[5usize], shift as i8), + i8::shl(a[6usize], shift as i8), + i8::shl(a[7usize], shift as i8), + i8::shl(a[8usize], shift as i8), + i8::shl(a[9usize], shift as i8), + i8::shl(a[10usize], shift as i8), + i8::shl(a[11usize], shift as i8), + i8::shl(a[12usize], shift as i8), + i8::shl(a[13usize], shift as i8), + i8::shl(a[14usize], shift as i8), + i8::shl(a[15usize], shift as i8), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_eq_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { + [ + -(i8::eq(&a[0usize], &b[0usize]) as i8), + -(i8::eq(&a[1usize], &b[1usize]) as i8), + -(i8::eq(&a[2usize], &b[2usize]) as i8), + -(i8::eq(&a[3usize], &b[3usize]) as i8), + -(i8::eq(&a[4usize], &b[4usize]) as i8), + -(i8::eq(&a[5usize], &b[5usize]) as i8), + -(i8::eq(&a[6usize], &b[6usize]) as i8), + -(i8::eq(&a[7usize], &b[7usize]) as i8), + -(i8::eq(&a[8usize], &b[8usize]) as i8), + -(i8::eq(&a[9usize], &b[9usize]) as i8), + -(i8::eq(&a[10usize], &b[10usize]) as i8), + -(i8::eq(&a[11usize], &b[11usize]) as i8), + -(i8::eq(&a[12usize], &b[12usize]) as i8), + -(i8::eq(&a[13usize], &b[13usize]) as i8), + -(i8::eq(&a[14usize], &b[14usize]) as i8), + -(i8::eq(&a[15usize], &b[15usize]) as i8), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_lt_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { + [ + -(i8::lt(&a[0usize], &b[0usize]) as i8), + -(i8::lt(&a[1usize], &b[1usize]) as i8), + -(i8::lt(&a[2usize], &b[2usize]) as i8), + -(i8::lt(&a[3usize], &b[3usize]) as i8), + -(i8::lt(&a[4usize], &b[4usize]) as i8), + -(i8::lt(&a[5usize], &b[5usize]) as i8), + -(i8::lt(&a[6usize], &b[6usize]) as i8), + -(i8::lt(&a[7usize], &b[7usize]) as i8), + -(i8::lt(&a[8usize], &b[8usize]) as i8), + -(i8::lt(&a[9usize], &b[9usize]) as i8), + -(i8::lt(&a[10usize], &b[10usize]) as i8), + -(i8::lt(&a[11usize], &b[11usize]) as i8), + -(i8::lt(&a[12usize], &b[12usize]) as i8), + -(i8::lt(&a[13usize], &b[13usize]) as i8), + -(i8::lt(&a[14usize], &b[14usize]) as i8), + -(i8::lt(&a[15usize], &b[15usize]) as i8), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_le_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { + [ 
+ -(i8::le(&a[0usize], &b[0usize]) as i8), + -(i8::le(&a[1usize], &b[1usize]) as i8), + -(i8::le(&a[2usize], &b[2usize]) as i8), + -(i8::le(&a[3usize], &b[3usize]) as i8), + -(i8::le(&a[4usize], &b[4usize]) as i8), + -(i8::le(&a[5usize], &b[5usize]) as i8), + -(i8::le(&a[6usize], &b[6usize]) as i8), + -(i8::le(&a[7usize], &b[7usize]) as i8), + -(i8::le(&a[8usize], &b[8usize]) as i8), + -(i8::le(&a[9usize], &b[9usize]) as i8), + -(i8::le(&a[10usize], &b[10usize]) as i8), + -(i8::le(&a[11usize], &b[11usize]) as i8), + -(i8::le(&a[12usize], &b[12usize]) as i8), + -(i8::le(&a[13usize], &b[13usize]) as i8), + -(i8::le(&a[14usize], &b[14usize]) as i8), + -(i8::le(&a[15usize], &b[15usize]) as i8), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_ge_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { + [ + -(i8::ge(&a[0usize], &b[0usize]) as i8), + -(i8::ge(&a[1usize], &b[1usize]) as i8), + -(i8::ge(&a[2usize], &b[2usize]) as i8), + -(i8::ge(&a[3usize], &b[3usize]) as i8), + -(i8::ge(&a[4usize], &b[4usize]) as i8), + -(i8::ge(&a[5usize], &b[5usize]) as i8), + -(i8::ge(&a[6usize], &b[6usize]) as i8), + -(i8::ge(&a[7usize], &b[7usize]) as i8), + -(i8::ge(&a[8usize], &b[8usize]) as i8), + -(i8::ge(&a[9usize], &b[9usize]) as i8), + -(i8::ge(&a[10usize], &b[10usize]) as i8), + -(i8::ge(&a[11usize], &b[11usize]) as i8), + -(i8::ge(&a[12usize], &b[12usize]) as i8), + -(i8::ge(&a[13usize], &b[13usize]) as i8), + -(i8::ge(&a[14usize], &b[14usize]) as i8), + -(i8::ge(&a[15usize], &b[15usize]) as i8), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_gt_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { + [ + -(i8::gt(&a[0usize], &b[0usize]) as i8), + -(i8::gt(&a[1usize], &b[1usize]) as i8), + -(i8::gt(&a[2usize], &b[2usize]) as i8), + -(i8::gt(&a[3usize], &b[3usize]) as i8), + -(i8::gt(&a[4usize], &b[4usize]) as i8), + -(i8::gt(&a[5usize], &b[5usize]) as i8), + -(i8::gt(&a[6usize], &b[6usize]) as i8), + -(i8::gt(&a[7usize], &b[7usize]) as i8), + -(i8::gt(&a[8usize], &b[8usize]) as i8), + -(i8::gt(&a[9usize], &b[9usize]) as i8), + -(i8::gt(&a[10usize], &b[10usize]) as i8), + -(i8::gt(&a[11usize], &b[11usize]) as i8), + -(i8::gt(&a[12usize], &b[12usize]) as i8), + -(i8::gt(&a[13usize], &b[13usize]) as i8), + -(i8::gt(&a[14usize], &b[14usize]) as i8), + -(i8::gt(&a[15usize], &b[15usize]) as i8), + ] + .simd_into(self) + } + #[inline(always)] + fn zip_low_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + [ + a[0usize], b[0usize], a[1usize], b[1usize], a[2usize], b[2usize], a[3usize], b[3usize], + a[4usize], b[4usize], a[5usize], b[5usize], a[6usize], b[6usize], a[7usize], b[7usize], + ] + .simd_into(self) + } + #[inline(always)] + fn zip_high_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + [ + a[8usize], b[8usize], a[9usize], b[9usize], a[10usize], b[10usize], a[11usize], + b[11usize], a[12usize], b[12usize], a[13usize], b[13usize], a[14usize], b[14usize], + a[15usize], b[15usize], + ] + .simd_into(self) + } + #[inline(always)] + fn unzip_low_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + [ + a[0usize], a[2usize], a[4usize], a[6usize], a[8usize], a[10usize], a[12usize], + a[14usize], b[0usize], b[2usize], b[4usize], b[6usize], b[8usize], b[10usize], + b[12usize], b[14usize], + ] + .simd_into(self) + } + #[inline(always)] + fn unzip_high_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + [ + a[1usize], a[3usize], a[5usize], a[7usize], a[9usize], a[11usize], a[13usize], + a[15usize], b[1usize], b[3usize], b[5usize], b[7usize], b[9usize], b[11usize], + b[13usize], b[15usize], + ] + .simd_into(self) + } + #[inline(always)] + 
fn select_i8x16(self, a: mask8x16, b: i8x16, c: i8x16) -> i8x16 { + [ + if a[0usize] != 0 { b[0usize] } else { c[0usize] }, + if a[1usize] != 0 { b[1usize] } else { c[1usize] }, + if a[2usize] != 0 { b[2usize] } else { c[2usize] }, + if a[3usize] != 0 { b[3usize] } else { c[3usize] }, + if a[4usize] != 0 { b[4usize] } else { c[4usize] }, + if a[5usize] != 0 { b[5usize] } else { c[5usize] }, + if a[6usize] != 0 { b[6usize] } else { c[6usize] }, + if a[7usize] != 0 { b[7usize] } else { c[7usize] }, + if a[8usize] != 0 { b[8usize] } else { c[8usize] }, + if a[9usize] != 0 { b[9usize] } else { c[9usize] }, + if a[10usize] != 0 { + b[10usize] + } else { + c[10usize] + }, + if a[11usize] != 0 { + b[11usize] + } else { + c[11usize] + }, + if a[12usize] != 0 { + b[12usize] + } else { + c[12usize] + }, + if a[13usize] != 0 { + b[13usize] + } else { + c[13usize] + }, + if a[14usize] != 0 { + b[14usize] + } else { + c[14usize] + }, + if a[15usize] != 0 { + b[15usize] + } else { + c[15usize] + }, + ] + .simd_into(self) + } + #[inline(always)] + fn min_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + [ + i8::min(a[0usize], b[0usize]), + i8::min(a[1usize], b[1usize]), + i8::min(a[2usize], b[2usize]), + i8::min(a[3usize], b[3usize]), + i8::min(a[4usize], b[4usize]), + i8::min(a[5usize], b[5usize]), + i8::min(a[6usize], b[6usize]), + i8::min(a[7usize], b[7usize]), + i8::min(a[8usize], b[8usize]), + i8::min(a[9usize], b[9usize]), + i8::min(a[10usize], b[10usize]), + i8::min(a[11usize], b[11usize]), + i8::min(a[12usize], b[12usize]), + i8::min(a[13usize], b[13usize]), + i8::min(a[14usize], b[14usize]), + i8::min(a[15usize], b[15usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn max_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + [ + i8::max(a[0usize], b[0usize]), + i8::max(a[1usize], b[1usize]), + i8::max(a[2usize], b[2usize]), + i8::max(a[3usize], b[3usize]), + i8::max(a[4usize], b[4usize]), + i8::max(a[5usize], b[5usize]), + i8::max(a[6usize], b[6usize]), + i8::max(a[7usize], b[7usize]), + i8::max(a[8usize], b[8usize]), + i8::max(a[9usize], b[9usize]), + i8::max(a[10usize], b[10usize]), + i8::max(a[11usize], b[11usize]), + i8::max(a[12usize], b[12usize]), + i8::max(a[13usize], b[13usize]), + i8::max(a[14usize], b[14usize]), + i8::max(a[15usize], b[15usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn combine_i8x16(self, a: i8x16, b: i8x16) -> i8x32 { + let mut result = [0; 32usize]; + result[0..16usize].copy_from_slice(&a.val); + result[16usize..32usize].copy_from_slice(&b.val); + result.simd_into(self) + } + #[inline(always)] + fn neg_i8x16(self, a: i8x16) -> i8x16 { + [ + i8::neg(a[0usize]), + i8::neg(a[1usize]), + i8::neg(a[2usize]), + i8::neg(a[3usize]), + i8::neg(a[4usize]), + i8::neg(a[5usize]), + i8::neg(a[6usize]), + i8::neg(a[7usize]), + i8::neg(a[8usize]), + i8::neg(a[9usize]), + i8::neg(a[10usize]), + i8::neg(a[11usize]), + i8::neg(a[12usize]), + i8::neg(a[13usize]), + i8::neg(a[14usize]), + i8::neg(a[15usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn reinterpret_u8_i8x16(self, a: i8x16) -> u8x16 { + u8x16 { + val: bytemuck::cast(a.val), + simd: a.simd, + } + } + #[inline(always)] + fn reinterpret_u32_i8x16(self, a: i8x16) -> u32x4 { + u32x4 { + val: bytemuck::cast(a.val), + simd: a.simd, + } + } + #[inline(always)] + fn splat_u8x16(self, val: u8) -> u8x16 { + [val; 16usize].simd_into(self) + } + #[inline(always)] + fn not_u8x16(self, a: u8x16) -> u8x16 { + [ + u8::not(a[0usize]), + u8::not(a[1usize]), + u8::not(a[2usize]), + u8::not(a[3usize]), + u8::not(a[4usize]), + 
u8::not(a[5usize]), + u8::not(a[6usize]), + u8::not(a[7usize]), + u8::not(a[8usize]), + u8::not(a[9usize]), + u8::not(a[10usize]), + u8::not(a[11usize]), + u8::not(a[12usize]), + u8::not(a[13usize]), + u8::not(a[14usize]), + u8::not(a[15usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn add_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + [ + u8::wrapping_add(a[0usize], b[0usize]), + u8::wrapping_add(a[1usize], b[1usize]), + u8::wrapping_add(a[2usize], b[2usize]), + u8::wrapping_add(a[3usize], b[3usize]), + u8::wrapping_add(a[4usize], b[4usize]), + u8::wrapping_add(a[5usize], b[5usize]), + u8::wrapping_add(a[6usize], b[6usize]), + u8::wrapping_add(a[7usize], b[7usize]), + u8::wrapping_add(a[8usize], b[8usize]), + u8::wrapping_add(a[9usize], b[9usize]), + u8::wrapping_add(a[10usize], b[10usize]), + u8::wrapping_add(a[11usize], b[11usize]), + u8::wrapping_add(a[12usize], b[12usize]), + u8::wrapping_add(a[13usize], b[13usize]), + u8::wrapping_add(a[14usize], b[14usize]), + u8::wrapping_add(a[15usize], b[15usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn sub_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + [ + u8::wrapping_sub(a[0usize], b[0usize]), + u8::wrapping_sub(a[1usize], b[1usize]), + u8::wrapping_sub(a[2usize], b[2usize]), + u8::wrapping_sub(a[3usize], b[3usize]), + u8::wrapping_sub(a[4usize], b[4usize]), + u8::wrapping_sub(a[5usize], b[5usize]), + u8::wrapping_sub(a[6usize], b[6usize]), + u8::wrapping_sub(a[7usize], b[7usize]), + u8::wrapping_sub(a[8usize], b[8usize]), + u8::wrapping_sub(a[9usize], b[9usize]), + u8::wrapping_sub(a[10usize], b[10usize]), + u8::wrapping_sub(a[11usize], b[11usize]), + u8::wrapping_sub(a[12usize], b[12usize]), + u8::wrapping_sub(a[13usize], b[13usize]), + u8::wrapping_sub(a[14usize], b[14usize]), + u8::wrapping_sub(a[15usize], b[15usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn mul_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + [ + u8::wrapping_mul(a[0usize], b[0usize]), + u8::wrapping_mul(a[1usize], b[1usize]), + u8::wrapping_mul(a[2usize], b[2usize]), + u8::wrapping_mul(a[3usize], b[3usize]), + u8::wrapping_mul(a[4usize], b[4usize]), + u8::wrapping_mul(a[5usize], b[5usize]), + u8::wrapping_mul(a[6usize], b[6usize]), + u8::wrapping_mul(a[7usize], b[7usize]), + u8::wrapping_mul(a[8usize], b[8usize]), + u8::wrapping_mul(a[9usize], b[9usize]), + u8::wrapping_mul(a[10usize], b[10usize]), + u8::wrapping_mul(a[11usize], b[11usize]), + u8::wrapping_mul(a[12usize], b[12usize]), + u8::wrapping_mul(a[13usize], b[13usize]), + u8::wrapping_mul(a[14usize], b[14usize]), + u8::wrapping_mul(a[15usize], b[15usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn and_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + [ + u8::bitand(a[0usize], &b[0usize]), + u8::bitand(a[1usize], &b[1usize]), + u8::bitand(a[2usize], &b[2usize]), + u8::bitand(a[3usize], &b[3usize]), + u8::bitand(a[4usize], &b[4usize]), + u8::bitand(a[5usize], &b[5usize]), + u8::bitand(a[6usize], &b[6usize]), + u8::bitand(a[7usize], &b[7usize]), + u8::bitand(a[8usize], &b[8usize]), + u8::bitand(a[9usize], &b[9usize]), + u8::bitand(a[10usize], &b[10usize]), + u8::bitand(a[11usize], &b[11usize]), + u8::bitand(a[12usize], &b[12usize]), + u8::bitand(a[13usize], &b[13usize]), + u8::bitand(a[14usize], &b[14usize]), + u8::bitand(a[15usize], &b[15usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn or_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + [ + u8::bitor(a[0usize], &b[0usize]), + u8::bitor(a[1usize], &b[1usize]), + u8::bitor(a[2usize], &b[2usize]), + u8::bitor(a[3usize], 
&b[3usize]), + u8::bitor(a[4usize], &b[4usize]), + u8::bitor(a[5usize], &b[5usize]), + u8::bitor(a[6usize], &b[6usize]), + u8::bitor(a[7usize], &b[7usize]), + u8::bitor(a[8usize], &b[8usize]), + u8::bitor(a[9usize], &b[9usize]), + u8::bitor(a[10usize], &b[10usize]), + u8::bitor(a[11usize], &b[11usize]), + u8::bitor(a[12usize], &b[12usize]), + u8::bitor(a[13usize], &b[13usize]), + u8::bitor(a[14usize], &b[14usize]), + u8::bitor(a[15usize], &b[15usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn xor_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + [ + u8::bitxor(a[0usize], &b[0usize]), + u8::bitxor(a[1usize], &b[1usize]), + u8::bitxor(a[2usize], &b[2usize]), + u8::bitxor(a[3usize], &b[3usize]), + u8::bitxor(a[4usize], &b[4usize]), + u8::bitxor(a[5usize], &b[5usize]), + u8::bitxor(a[6usize], &b[6usize]), + u8::bitxor(a[7usize], &b[7usize]), + u8::bitxor(a[8usize], &b[8usize]), + u8::bitxor(a[9usize], &b[9usize]), + u8::bitxor(a[10usize], &b[10usize]), + u8::bitxor(a[11usize], &b[11usize]), + u8::bitxor(a[12usize], &b[12usize]), + u8::bitxor(a[13usize], &b[13usize]), + u8::bitxor(a[14usize], &b[14usize]), + u8::bitxor(a[15usize], &b[15usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn shr_u8x16(self, a: u8x16, shift: u32) -> u8x16 { + [ + u8::shr(a[0usize], shift as u8), + u8::shr(a[1usize], shift as u8), + u8::shr(a[2usize], shift as u8), + u8::shr(a[3usize], shift as u8), + u8::shr(a[4usize], shift as u8), + u8::shr(a[5usize], shift as u8), + u8::shr(a[6usize], shift as u8), + u8::shr(a[7usize], shift as u8), + u8::shr(a[8usize], shift as u8), + u8::shr(a[9usize], shift as u8), + u8::shr(a[10usize], shift as u8), + u8::shr(a[11usize], shift as u8), + u8::shr(a[12usize], shift as u8), + u8::shr(a[13usize], shift as u8), + u8::shr(a[14usize], shift as u8), + u8::shr(a[15usize], shift as u8), + ] + .simd_into(self) + } + #[inline(always)] + fn shrv_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + [ + u8::shr(a[0usize], &b[0usize]), + u8::shr(a[1usize], &b[1usize]), + u8::shr(a[2usize], &b[2usize]), + u8::shr(a[3usize], &b[3usize]), + u8::shr(a[4usize], &b[4usize]), + u8::shr(a[5usize], &b[5usize]), + u8::shr(a[6usize], &b[6usize]), + u8::shr(a[7usize], &b[7usize]), + u8::shr(a[8usize], &b[8usize]), + u8::shr(a[9usize], &b[9usize]), + u8::shr(a[10usize], &b[10usize]), + u8::shr(a[11usize], &b[11usize]), + u8::shr(a[12usize], &b[12usize]), + u8::shr(a[13usize], &b[13usize]), + u8::shr(a[14usize], &b[14usize]), + u8::shr(a[15usize], &b[15usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn shl_u8x16(self, a: u8x16, shift: u32) -> u8x16 { + [ + u8::shl(a[0usize], shift as u8), + u8::shl(a[1usize], shift as u8), + u8::shl(a[2usize], shift as u8), + u8::shl(a[3usize], shift as u8), + u8::shl(a[4usize], shift as u8), + u8::shl(a[5usize], shift as u8), + u8::shl(a[6usize], shift as u8), + u8::shl(a[7usize], shift as u8), + u8::shl(a[8usize], shift as u8), + u8::shl(a[9usize], shift as u8), + u8::shl(a[10usize], shift as u8), + u8::shl(a[11usize], shift as u8), + u8::shl(a[12usize], shift as u8), + u8::shl(a[13usize], shift as u8), + u8::shl(a[14usize], shift as u8), + u8::shl(a[15usize], shift as u8), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_eq_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { + [ + -(u8::eq(&a[0usize], &b[0usize]) as i8), + -(u8::eq(&a[1usize], &b[1usize]) as i8), + -(u8::eq(&a[2usize], &b[2usize]) as i8), + -(u8::eq(&a[3usize], &b[3usize]) as i8), + -(u8::eq(&a[4usize], &b[4usize]) as i8), + -(u8::eq(&a[5usize], &b[5usize]) as i8), + 
-(u8::eq(&a[6usize], &b[6usize]) as i8), + -(u8::eq(&a[7usize], &b[7usize]) as i8), + -(u8::eq(&a[8usize], &b[8usize]) as i8), + -(u8::eq(&a[9usize], &b[9usize]) as i8), + -(u8::eq(&a[10usize], &b[10usize]) as i8), + -(u8::eq(&a[11usize], &b[11usize]) as i8), + -(u8::eq(&a[12usize], &b[12usize]) as i8), + -(u8::eq(&a[13usize], &b[13usize]) as i8), + -(u8::eq(&a[14usize], &b[14usize]) as i8), + -(u8::eq(&a[15usize], &b[15usize]) as i8), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_lt_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { + [ + -(u8::lt(&a[0usize], &b[0usize]) as i8), + -(u8::lt(&a[1usize], &b[1usize]) as i8), + -(u8::lt(&a[2usize], &b[2usize]) as i8), + -(u8::lt(&a[3usize], &b[3usize]) as i8), + -(u8::lt(&a[4usize], &b[4usize]) as i8), + -(u8::lt(&a[5usize], &b[5usize]) as i8), + -(u8::lt(&a[6usize], &b[6usize]) as i8), + -(u8::lt(&a[7usize], &b[7usize]) as i8), + -(u8::lt(&a[8usize], &b[8usize]) as i8), + -(u8::lt(&a[9usize], &b[9usize]) as i8), + -(u8::lt(&a[10usize], &b[10usize]) as i8), + -(u8::lt(&a[11usize], &b[11usize]) as i8), + -(u8::lt(&a[12usize], &b[12usize]) as i8), + -(u8::lt(&a[13usize], &b[13usize]) as i8), + -(u8::lt(&a[14usize], &b[14usize]) as i8), + -(u8::lt(&a[15usize], &b[15usize]) as i8), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_le_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { + [ + -(u8::le(&a[0usize], &b[0usize]) as i8), + -(u8::le(&a[1usize], &b[1usize]) as i8), + -(u8::le(&a[2usize], &b[2usize]) as i8), + -(u8::le(&a[3usize], &b[3usize]) as i8), + -(u8::le(&a[4usize], &b[4usize]) as i8), + -(u8::le(&a[5usize], &b[5usize]) as i8), + -(u8::le(&a[6usize], &b[6usize]) as i8), + -(u8::le(&a[7usize], &b[7usize]) as i8), + -(u8::le(&a[8usize], &b[8usize]) as i8), + -(u8::le(&a[9usize], &b[9usize]) as i8), + -(u8::le(&a[10usize], &b[10usize]) as i8), + -(u8::le(&a[11usize], &b[11usize]) as i8), + -(u8::le(&a[12usize], &b[12usize]) as i8), + -(u8::le(&a[13usize], &b[13usize]) as i8), + -(u8::le(&a[14usize], &b[14usize]) as i8), + -(u8::le(&a[15usize], &b[15usize]) as i8), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_ge_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { + [ + -(u8::ge(&a[0usize], &b[0usize]) as i8), + -(u8::ge(&a[1usize], &b[1usize]) as i8), + -(u8::ge(&a[2usize], &b[2usize]) as i8), + -(u8::ge(&a[3usize], &b[3usize]) as i8), + -(u8::ge(&a[4usize], &b[4usize]) as i8), + -(u8::ge(&a[5usize], &b[5usize]) as i8), + -(u8::ge(&a[6usize], &b[6usize]) as i8), + -(u8::ge(&a[7usize], &b[7usize]) as i8), + -(u8::ge(&a[8usize], &b[8usize]) as i8), + -(u8::ge(&a[9usize], &b[9usize]) as i8), + -(u8::ge(&a[10usize], &b[10usize]) as i8), + -(u8::ge(&a[11usize], &b[11usize]) as i8), + -(u8::ge(&a[12usize], &b[12usize]) as i8), + -(u8::ge(&a[13usize], &b[13usize]) as i8), + -(u8::ge(&a[14usize], &b[14usize]) as i8), + -(u8::ge(&a[15usize], &b[15usize]) as i8), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_gt_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { + [ + -(u8::gt(&a[0usize], &b[0usize]) as i8), + -(u8::gt(&a[1usize], &b[1usize]) as i8), + -(u8::gt(&a[2usize], &b[2usize]) as i8), + -(u8::gt(&a[3usize], &b[3usize]) as i8), + -(u8::gt(&a[4usize], &b[4usize]) as i8), + -(u8::gt(&a[5usize], &b[5usize]) as i8), + -(u8::gt(&a[6usize], &b[6usize]) as i8), + -(u8::gt(&a[7usize], &b[7usize]) as i8), + -(u8::gt(&a[8usize], &b[8usize]) as i8), + -(u8::gt(&a[9usize], &b[9usize]) as i8), + -(u8::gt(&a[10usize], &b[10usize]) as i8), + -(u8::gt(&a[11usize], &b[11usize]) as i8), + -(u8::gt(&a[12usize], &b[12usize]) as i8), + 
-(u8::gt(&a[13usize], &b[13usize]) as i8), + -(u8::gt(&a[14usize], &b[14usize]) as i8), + -(u8::gt(&a[15usize], &b[15usize]) as i8), + ] + .simd_into(self) + } + #[inline(always)] + fn zip_low_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + [ + a[0usize], b[0usize], a[1usize], b[1usize], a[2usize], b[2usize], a[3usize], b[3usize], + a[4usize], b[4usize], a[5usize], b[5usize], a[6usize], b[6usize], a[7usize], b[7usize], + ] + .simd_into(self) + } + #[inline(always)] + fn zip_high_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + [ + a[8usize], b[8usize], a[9usize], b[9usize], a[10usize], b[10usize], a[11usize], + b[11usize], a[12usize], b[12usize], a[13usize], b[13usize], a[14usize], b[14usize], + a[15usize], b[15usize], + ] + .simd_into(self) + } + #[inline(always)] + fn unzip_low_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + [ + a[0usize], a[2usize], a[4usize], a[6usize], a[8usize], a[10usize], a[12usize], + a[14usize], b[0usize], b[2usize], b[4usize], b[6usize], b[8usize], b[10usize], + b[12usize], b[14usize], + ] + .simd_into(self) + } + #[inline(always)] + fn unzip_high_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + [ + a[1usize], a[3usize], a[5usize], a[7usize], a[9usize], a[11usize], a[13usize], + a[15usize], b[1usize], b[3usize], b[5usize], b[7usize], b[9usize], b[11usize], + b[13usize], b[15usize], + ] + .simd_into(self) + } + #[inline(always)] + fn select_u8x16(self, a: mask8x16, b: u8x16, c: u8x16) -> u8x16 { + [ + if a[0usize] != 0 { b[0usize] } else { c[0usize] }, + if a[1usize] != 0 { b[1usize] } else { c[1usize] }, + if a[2usize] != 0 { b[2usize] } else { c[2usize] }, + if a[3usize] != 0 { b[3usize] } else { c[3usize] }, + if a[4usize] != 0 { b[4usize] } else { c[4usize] }, + if a[5usize] != 0 { b[5usize] } else { c[5usize] }, + if a[6usize] != 0 { b[6usize] } else { c[6usize] }, + if a[7usize] != 0 { b[7usize] } else { c[7usize] }, + if a[8usize] != 0 { b[8usize] } else { c[8usize] }, + if a[9usize] != 0 { b[9usize] } else { c[9usize] }, + if a[10usize] != 0 { + b[10usize] + } else { + c[10usize] + }, + if a[11usize] != 0 { + b[11usize] + } else { + c[11usize] + }, + if a[12usize] != 0 { + b[12usize] + } else { + c[12usize] + }, + if a[13usize] != 0 { + b[13usize] + } else { + c[13usize] + }, + if a[14usize] != 0 { + b[14usize] + } else { + c[14usize] + }, + if a[15usize] != 0 { + b[15usize] + } else { + c[15usize] + }, + ] + .simd_into(self) + } + #[inline(always)] + fn min_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + [ + u8::min(a[0usize], b[0usize]), + u8::min(a[1usize], b[1usize]), + u8::min(a[2usize], b[2usize]), + u8::min(a[3usize], b[3usize]), + u8::min(a[4usize], b[4usize]), + u8::min(a[5usize], b[5usize]), + u8::min(a[6usize], b[6usize]), + u8::min(a[7usize], b[7usize]), + u8::min(a[8usize], b[8usize]), + u8::min(a[9usize], b[9usize]), + u8::min(a[10usize], b[10usize]), + u8::min(a[11usize], b[11usize]), + u8::min(a[12usize], b[12usize]), + u8::min(a[13usize], b[13usize]), + u8::min(a[14usize], b[14usize]), + u8::min(a[15usize], b[15usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn max_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + [ + u8::max(a[0usize], b[0usize]), + u8::max(a[1usize], b[1usize]), + u8::max(a[2usize], b[2usize]), + u8::max(a[3usize], b[3usize]), + u8::max(a[4usize], b[4usize]), + u8::max(a[5usize], b[5usize]), + u8::max(a[6usize], b[6usize]), + u8::max(a[7usize], b[7usize]), + u8::max(a[8usize], b[8usize]), + u8::max(a[9usize], b[9usize]), + u8::max(a[10usize], b[10usize]), + u8::max(a[11usize], b[11usize]), + u8::max(a[12usize], b[12usize]), + 
u8::max(a[13usize], b[13usize]), + u8::max(a[14usize], b[14usize]), + u8::max(a[15usize], b[15usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn combine_u8x16(self, a: u8x16, b: u8x16) -> u8x32 { + let mut result = [0; 32usize]; + result[0..16usize].copy_from_slice(&a.val); + result[16usize..32usize].copy_from_slice(&b.val); + result.simd_into(self) + } + #[inline(always)] + fn widen_u8x16(self, a: u8x16) -> u16x16 { + [ + a[0usize] as u16, + a[1usize] as u16, + a[2usize] as u16, + a[3usize] as u16, + a[4usize] as u16, + a[5usize] as u16, + a[6usize] as u16, + a[7usize] as u16, + a[8usize] as u16, + a[9usize] as u16, + a[10usize] as u16, + a[11usize] as u16, + a[12usize] as u16, + a[13usize] as u16, + a[14usize] as u16, + a[15usize] as u16, + ] + .simd_into(self) + } + #[inline(always)] + fn reinterpret_u32_u8x16(self, a: u8x16) -> u32x4 { + u32x4 { + val: bytemuck::cast(a.val), + simd: a.simd, + } + } + #[inline(always)] + fn splat_mask8x16(self, val: i8) -> mask8x16 { + [val; 16usize].simd_into(self) + } + #[inline(always)] + fn not_mask8x16(self, a: mask8x16) -> mask8x16 { + [ + i8::not(a[0usize]), + i8::not(a[1usize]), + i8::not(a[2usize]), + i8::not(a[3usize]), + i8::not(a[4usize]), + i8::not(a[5usize]), + i8::not(a[6usize]), + i8::not(a[7usize]), + i8::not(a[8usize]), + i8::not(a[9usize]), + i8::not(a[10usize]), + i8::not(a[11usize]), + i8::not(a[12usize]), + i8::not(a[13usize]), + i8::not(a[14usize]), + i8::not(a[15usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn and_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { + [ + i8::bitand(a[0usize], &b[0usize]), + i8::bitand(a[1usize], &b[1usize]), + i8::bitand(a[2usize], &b[2usize]), + i8::bitand(a[3usize], &b[3usize]), + i8::bitand(a[4usize], &b[4usize]), + i8::bitand(a[5usize], &b[5usize]), + i8::bitand(a[6usize], &b[6usize]), + i8::bitand(a[7usize], &b[7usize]), + i8::bitand(a[8usize], &b[8usize]), + i8::bitand(a[9usize], &b[9usize]), + i8::bitand(a[10usize], &b[10usize]), + i8::bitand(a[11usize], &b[11usize]), + i8::bitand(a[12usize], &b[12usize]), + i8::bitand(a[13usize], &b[13usize]), + i8::bitand(a[14usize], &b[14usize]), + i8::bitand(a[15usize], &b[15usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn or_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { + [ + i8::bitor(a[0usize], &b[0usize]), + i8::bitor(a[1usize], &b[1usize]), + i8::bitor(a[2usize], &b[2usize]), + i8::bitor(a[3usize], &b[3usize]), + i8::bitor(a[4usize], &b[4usize]), + i8::bitor(a[5usize], &b[5usize]), + i8::bitor(a[6usize], &b[6usize]), + i8::bitor(a[7usize], &b[7usize]), + i8::bitor(a[8usize], &b[8usize]), + i8::bitor(a[9usize], &b[9usize]), + i8::bitor(a[10usize], &b[10usize]), + i8::bitor(a[11usize], &b[11usize]), + i8::bitor(a[12usize], &b[12usize]), + i8::bitor(a[13usize], &b[13usize]), + i8::bitor(a[14usize], &b[14usize]), + i8::bitor(a[15usize], &b[15usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn xor_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { + [ + i8::bitxor(a[0usize], &b[0usize]), + i8::bitxor(a[1usize], &b[1usize]), + i8::bitxor(a[2usize], &b[2usize]), + i8::bitxor(a[3usize], &b[3usize]), + i8::bitxor(a[4usize], &b[4usize]), + i8::bitxor(a[5usize], &b[5usize]), + i8::bitxor(a[6usize], &b[6usize]), + i8::bitxor(a[7usize], &b[7usize]), + i8::bitxor(a[8usize], &b[8usize]), + i8::bitxor(a[9usize], &b[9usize]), + i8::bitxor(a[10usize], &b[10usize]), + i8::bitxor(a[11usize], &b[11usize]), + i8::bitxor(a[12usize], &b[12usize]), + i8::bitxor(a[13usize], &b[13usize]), + i8::bitxor(a[14usize], 
&b[14usize]), + i8::bitxor(a[15usize], &b[15usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn select_mask8x16( + self, + a: mask8x16, + b: mask8x16, + c: mask8x16, + ) -> mask8x16 { + [ + if a[0usize] != 0 { b[0usize] } else { c[0usize] }, + if a[1usize] != 0 { b[1usize] } else { c[1usize] }, + if a[2usize] != 0 { b[2usize] } else { c[2usize] }, + if a[3usize] != 0 { b[3usize] } else { c[3usize] }, + if a[4usize] != 0 { b[4usize] } else { c[4usize] }, + if a[5usize] != 0 { b[5usize] } else { c[5usize] }, + if a[6usize] != 0 { b[6usize] } else { c[6usize] }, + if a[7usize] != 0 { b[7usize] } else { c[7usize] }, + if a[8usize] != 0 { b[8usize] } else { c[8usize] }, + if a[9usize] != 0 { b[9usize] } else { c[9usize] }, + if a[10usize] != 0 { + b[10usize] + } else { + c[10usize] + }, + if a[11usize] != 0 { + b[11usize] + } else { + c[11usize] + }, + if a[12usize] != 0 { + b[12usize] + } else { + c[12usize] + }, + if a[13usize] != 0 { + b[13usize] + } else { + c[13usize] + }, + if a[14usize] != 0 { + b[14usize] + } else { + c[14usize] + }, + if a[15usize] != 0 { + b[15usize] + } else { + c[15usize] + }, + ] + .simd_into(self) + } + #[inline(always)] + fn simd_eq_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { + [ + -(i8::eq(&a[0usize], &b[0usize]) as i8), + -(i8::eq(&a[1usize], &b[1usize]) as i8), + -(i8::eq(&a[2usize], &b[2usize]) as i8), + -(i8::eq(&a[3usize], &b[3usize]) as i8), + -(i8::eq(&a[4usize], &b[4usize]) as i8), + -(i8::eq(&a[5usize], &b[5usize]) as i8), + -(i8::eq(&a[6usize], &b[6usize]) as i8), + -(i8::eq(&a[7usize], &b[7usize]) as i8), + -(i8::eq(&a[8usize], &b[8usize]) as i8), + -(i8::eq(&a[9usize], &b[9usize]) as i8), + -(i8::eq(&a[10usize], &b[10usize]) as i8), + -(i8::eq(&a[11usize], &b[11usize]) as i8), + -(i8::eq(&a[12usize], &b[12usize]) as i8), + -(i8::eq(&a[13usize], &b[13usize]) as i8), + -(i8::eq(&a[14usize], &b[14usize]) as i8), + -(i8::eq(&a[15usize], &b[15usize]) as i8), + ] + .simd_into(self) + } + #[inline(always)] + fn combine_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x32 { + let mut result = [0; 32usize]; + result[0..16usize].copy_from_slice(&a.val); + result[16usize..32usize].copy_from_slice(&b.val); + result.simd_into(self) + } + #[inline(always)] + fn splat_i16x8(self, val: i16) -> i16x8 { + [val; 8usize].simd_into(self) + } + #[inline(always)] + fn not_i16x8(self, a: i16x8) -> i16x8 { + [ + i16::not(a[0usize]), + i16::not(a[1usize]), + i16::not(a[2usize]), + i16::not(a[3usize]), + i16::not(a[4usize]), + i16::not(a[5usize]), + i16::not(a[6usize]), + i16::not(a[7usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn add_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + [ + i16::wrapping_add(a[0usize], b[0usize]), + i16::wrapping_add(a[1usize], b[1usize]), + i16::wrapping_add(a[2usize], b[2usize]), + i16::wrapping_add(a[3usize], b[3usize]), + i16::wrapping_add(a[4usize], b[4usize]), + i16::wrapping_add(a[5usize], b[5usize]), + i16::wrapping_add(a[6usize], b[6usize]), + i16::wrapping_add(a[7usize], b[7usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn sub_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + [ + i16::wrapping_sub(a[0usize], b[0usize]), + i16::wrapping_sub(a[1usize], b[1usize]), + i16::wrapping_sub(a[2usize], b[2usize]), + i16::wrapping_sub(a[3usize], b[3usize]), + i16::wrapping_sub(a[4usize], b[4usize]), + i16::wrapping_sub(a[5usize], b[5usize]), + i16::wrapping_sub(a[6usize], b[6usize]), + i16::wrapping_sub(a[7usize], b[7usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn mul_i16x8(self, a: i16x8, b: i16x8) 
-> i16x8 { + [ + i16::wrapping_mul(a[0usize], b[0usize]), + i16::wrapping_mul(a[1usize], b[1usize]), + i16::wrapping_mul(a[2usize], b[2usize]), + i16::wrapping_mul(a[3usize], b[3usize]), + i16::wrapping_mul(a[4usize], b[4usize]), + i16::wrapping_mul(a[5usize], b[5usize]), + i16::wrapping_mul(a[6usize], b[6usize]), + i16::wrapping_mul(a[7usize], b[7usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn and_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + [ + i16::bitand(a[0usize], &b[0usize]), + i16::bitand(a[1usize], &b[1usize]), + i16::bitand(a[2usize], &b[2usize]), + i16::bitand(a[3usize], &b[3usize]), + i16::bitand(a[4usize], &b[4usize]), + i16::bitand(a[5usize], &b[5usize]), + i16::bitand(a[6usize], &b[6usize]), + i16::bitand(a[7usize], &b[7usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn or_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + [ + i16::bitor(a[0usize], &b[0usize]), + i16::bitor(a[1usize], &b[1usize]), + i16::bitor(a[2usize], &b[2usize]), + i16::bitor(a[3usize], &b[3usize]), + i16::bitor(a[4usize], &b[4usize]), + i16::bitor(a[5usize], &b[5usize]), + i16::bitor(a[6usize], &b[6usize]), + i16::bitor(a[7usize], &b[7usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn xor_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + [ + i16::bitxor(a[0usize], &b[0usize]), + i16::bitxor(a[1usize], &b[1usize]), + i16::bitxor(a[2usize], &b[2usize]), + i16::bitxor(a[3usize], &b[3usize]), + i16::bitxor(a[4usize], &b[4usize]), + i16::bitxor(a[5usize], &b[5usize]), + i16::bitxor(a[6usize], &b[6usize]), + i16::bitxor(a[7usize], &b[7usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn shr_i16x8(self, a: i16x8, shift: u32) -> i16x8 { + [ + i16::shr(a[0usize], shift as i16), + i16::shr(a[1usize], shift as i16), + i16::shr(a[2usize], shift as i16), + i16::shr(a[3usize], shift as i16), + i16::shr(a[4usize], shift as i16), + i16::shr(a[5usize], shift as i16), + i16::shr(a[6usize], shift as i16), + i16::shr(a[7usize], shift as i16), + ] + .simd_into(self) + } + #[inline(always)] + fn shrv_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + [ + i16::shr(a[0usize], &b[0usize]), + i16::shr(a[1usize], &b[1usize]), + i16::shr(a[2usize], &b[2usize]), + i16::shr(a[3usize], &b[3usize]), + i16::shr(a[4usize], &b[4usize]), + i16::shr(a[5usize], &b[5usize]), + i16::shr(a[6usize], &b[6usize]), + i16::shr(a[7usize], &b[7usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn shl_i16x8(self, a: i16x8, shift: u32) -> i16x8 { + [ + i16::shl(a[0usize], shift as i16), + i16::shl(a[1usize], shift as i16), + i16::shl(a[2usize], shift as i16), + i16::shl(a[3usize], shift as i16), + i16::shl(a[4usize], shift as i16), + i16::shl(a[5usize], shift as i16), + i16::shl(a[6usize], shift as i16), + i16::shl(a[7usize], shift as i16), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_eq_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { + [ + -(i16::eq(&a[0usize], &b[0usize]) as i16), + -(i16::eq(&a[1usize], &b[1usize]) as i16), + -(i16::eq(&a[2usize], &b[2usize]) as i16), + -(i16::eq(&a[3usize], &b[3usize]) as i16), + -(i16::eq(&a[4usize], &b[4usize]) as i16), + -(i16::eq(&a[5usize], &b[5usize]) as i16), + -(i16::eq(&a[6usize], &b[6usize]) as i16), + -(i16::eq(&a[7usize], &b[7usize]) as i16), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_lt_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { + [ + -(i16::lt(&a[0usize], &b[0usize]) as i16), + -(i16::lt(&a[1usize], &b[1usize]) as i16), + -(i16::lt(&a[2usize], &b[2usize]) as i16), + -(i16::lt(&a[3usize], &b[3usize]) as i16), + -(i16::lt(&a[4usize], &b[4usize]) 
as i16), + -(i16::lt(&a[5usize], &b[5usize]) as i16), + -(i16::lt(&a[6usize], &b[6usize]) as i16), + -(i16::lt(&a[7usize], &b[7usize]) as i16), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_le_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { + [ + -(i16::le(&a[0usize], &b[0usize]) as i16), + -(i16::le(&a[1usize], &b[1usize]) as i16), + -(i16::le(&a[2usize], &b[2usize]) as i16), + -(i16::le(&a[3usize], &b[3usize]) as i16), + -(i16::le(&a[4usize], &b[4usize]) as i16), + -(i16::le(&a[5usize], &b[5usize]) as i16), + -(i16::le(&a[6usize], &b[6usize]) as i16), + -(i16::le(&a[7usize], &b[7usize]) as i16), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_ge_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { + [ + -(i16::ge(&a[0usize], &b[0usize]) as i16), + -(i16::ge(&a[1usize], &b[1usize]) as i16), + -(i16::ge(&a[2usize], &b[2usize]) as i16), + -(i16::ge(&a[3usize], &b[3usize]) as i16), + -(i16::ge(&a[4usize], &b[4usize]) as i16), + -(i16::ge(&a[5usize], &b[5usize]) as i16), + -(i16::ge(&a[6usize], &b[6usize]) as i16), + -(i16::ge(&a[7usize], &b[7usize]) as i16), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_gt_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { + [ + -(i16::gt(&a[0usize], &b[0usize]) as i16), + -(i16::gt(&a[1usize], &b[1usize]) as i16), + -(i16::gt(&a[2usize], &b[2usize]) as i16), + -(i16::gt(&a[3usize], &b[3usize]) as i16), + -(i16::gt(&a[4usize], &b[4usize]) as i16), + -(i16::gt(&a[5usize], &b[5usize]) as i16), + -(i16::gt(&a[6usize], &b[6usize]) as i16), + -(i16::gt(&a[7usize], &b[7usize]) as i16), + ] + .simd_into(self) + } + #[inline(always)] + fn zip_low_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + [ + a[0usize], b[0usize], a[1usize], b[1usize], a[2usize], b[2usize], a[3usize], b[3usize], + ] + .simd_into(self) + } + #[inline(always)] + fn zip_high_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + [ + a[4usize], b[4usize], a[5usize], b[5usize], a[6usize], b[6usize], a[7usize], b[7usize], + ] + .simd_into(self) + } + #[inline(always)] + fn unzip_low_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + [ + a[0usize], a[2usize], a[4usize], a[6usize], b[0usize], b[2usize], b[4usize], b[6usize], + ] + .simd_into(self) + } + #[inline(always)] + fn unzip_high_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + [ + a[1usize], a[3usize], a[5usize], a[7usize], b[1usize], b[3usize], b[5usize], b[7usize], + ] + .simd_into(self) + } + #[inline(always)] + fn select_i16x8(self, a: mask16x8, b: i16x8, c: i16x8) -> i16x8 { + [ + if a[0usize] != 0 { b[0usize] } else { c[0usize] }, + if a[1usize] != 0 { b[1usize] } else { c[1usize] }, + if a[2usize] != 0 { b[2usize] } else { c[2usize] }, + if a[3usize] != 0 { b[3usize] } else { c[3usize] }, + if a[4usize] != 0 { b[4usize] } else { c[4usize] }, + if a[5usize] != 0 { b[5usize] } else { c[5usize] }, + if a[6usize] != 0 { b[6usize] } else { c[6usize] }, + if a[7usize] != 0 { b[7usize] } else { c[7usize] }, + ] + .simd_into(self) + } + #[inline(always)] + fn min_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + [ + i16::min(a[0usize], b[0usize]), + i16::min(a[1usize], b[1usize]), + i16::min(a[2usize], b[2usize]), + i16::min(a[3usize], b[3usize]), + i16::min(a[4usize], b[4usize]), + i16::min(a[5usize], b[5usize]), + i16::min(a[6usize], b[6usize]), + i16::min(a[7usize], b[7usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn max_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + [ + i16::max(a[0usize], b[0usize]), + i16::max(a[1usize], b[1usize]), + i16::max(a[2usize], b[2usize]), + i16::max(a[3usize], b[3usize]), + i16::max(a[4usize], b[4usize]), + 
i16::max(a[5usize], b[5usize]), + i16::max(a[6usize], b[6usize]), + i16::max(a[7usize], b[7usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn combine_i16x8(self, a: i16x8, b: i16x8) -> i16x16 { + let mut result = [0; 16usize]; + result[0..8usize].copy_from_slice(&a.val); + result[8usize..16usize].copy_from_slice(&b.val); + result.simd_into(self) + } + #[inline(always)] + fn neg_i16x8(self, a: i16x8) -> i16x8 { + [ + i16::neg(a[0usize]), + i16::neg(a[1usize]), + i16::neg(a[2usize]), + i16::neg(a[3usize]), + i16::neg(a[4usize]), + i16::neg(a[5usize]), + i16::neg(a[6usize]), + i16::neg(a[7usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn reinterpret_u8_i16x8(self, a: i16x8) -> u8x16 { + u8x16 { + val: bytemuck::cast(a.val), + simd: a.simd, + } + } + #[inline(always)] + fn reinterpret_u32_i16x8(self, a: i16x8) -> u32x4 { + u32x4 { + val: bytemuck::cast(a.val), + simd: a.simd, + } + } + #[inline(always)] + fn splat_u16x8(self, val: u16) -> u16x8 { + [val; 8usize].simd_into(self) + } + #[inline(always)] + fn not_u16x8(self, a: u16x8) -> u16x8 { + [ + u16::not(a[0usize]), + u16::not(a[1usize]), + u16::not(a[2usize]), + u16::not(a[3usize]), + u16::not(a[4usize]), + u16::not(a[5usize]), + u16::not(a[6usize]), + u16::not(a[7usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn add_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + [ + u16::wrapping_add(a[0usize], b[0usize]), + u16::wrapping_add(a[1usize], b[1usize]), + u16::wrapping_add(a[2usize], b[2usize]), + u16::wrapping_add(a[3usize], b[3usize]), + u16::wrapping_add(a[4usize], b[4usize]), + u16::wrapping_add(a[5usize], b[5usize]), + u16::wrapping_add(a[6usize], b[6usize]), + u16::wrapping_add(a[7usize], b[7usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn sub_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + [ + u16::wrapping_sub(a[0usize], b[0usize]), + u16::wrapping_sub(a[1usize], b[1usize]), + u16::wrapping_sub(a[2usize], b[2usize]), + u16::wrapping_sub(a[3usize], b[3usize]), + u16::wrapping_sub(a[4usize], b[4usize]), + u16::wrapping_sub(a[5usize], b[5usize]), + u16::wrapping_sub(a[6usize], b[6usize]), + u16::wrapping_sub(a[7usize], b[7usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn mul_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + [ + u16::wrapping_mul(a[0usize], b[0usize]), + u16::wrapping_mul(a[1usize], b[1usize]), + u16::wrapping_mul(a[2usize], b[2usize]), + u16::wrapping_mul(a[3usize], b[3usize]), + u16::wrapping_mul(a[4usize], b[4usize]), + u16::wrapping_mul(a[5usize], b[5usize]), + u16::wrapping_mul(a[6usize], b[6usize]), + u16::wrapping_mul(a[7usize], b[7usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn and_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + [ + u16::bitand(a[0usize], &b[0usize]), + u16::bitand(a[1usize], &b[1usize]), + u16::bitand(a[2usize], &b[2usize]), + u16::bitand(a[3usize], &b[3usize]), + u16::bitand(a[4usize], &b[4usize]), + u16::bitand(a[5usize], &b[5usize]), + u16::bitand(a[6usize], &b[6usize]), + u16::bitand(a[7usize], &b[7usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn or_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + [ + u16::bitor(a[0usize], &b[0usize]), + u16::bitor(a[1usize], &b[1usize]), + u16::bitor(a[2usize], &b[2usize]), + u16::bitor(a[3usize], &b[3usize]), + u16::bitor(a[4usize], &b[4usize]), + u16::bitor(a[5usize], &b[5usize]), + u16::bitor(a[6usize], &b[6usize]), + u16::bitor(a[7usize], &b[7usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn xor_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + [ + u16::bitxor(a[0usize], &b[0usize]), + 
u16::bitxor(a[1usize], &b[1usize]), + u16::bitxor(a[2usize], &b[2usize]), + u16::bitxor(a[3usize], &b[3usize]), + u16::bitxor(a[4usize], &b[4usize]), + u16::bitxor(a[5usize], &b[5usize]), + u16::bitxor(a[6usize], &b[6usize]), + u16::bitxor(a[7usize], &b[7usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn shr_u16x8(self, a: u16x8, shift: u32) -> u16x8 { + [ + u16::shr(a[0usize], shift as u16), + u16::shr(a[1usize], shift as u16), + u16::shr(a[2usize], shift as u16), + u16::shr(a[3usize], shift as u16), + u16::shr(a[4usize], shift as u16), + u16::shr(a[5usize], shift as u16), + u16::shr(a[6usize], shift as u16), + u16::shr(a[7usize], shift as u16), + ] + .simd_into(self) + } + #[inline(always)] + fn shrv_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + [ + u16::shr(a[0usize], &b[0usize]), + u16::shr(a[1usize], &b[1usize]), + u16::shr(a[2usize], &b[2usize]), + u16::shr(a[3usize], &b[3usize]), + u16::shr(a[4usize], &b[4usize]), + u16::shr(a[5usize], &b[5usize]), + u16::shr(a[6usize], &b[6usize]), + u16::shr(a[7usize], &b[7usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn shl_u16x8(self, a: u16x8, shift: u32) -> u16x8 { + [ + u16::shl(a[0usize], shift as u16), + u16::shl(a[1usize], shift as u16), + u16::shl(a[2usize], shift as u16), + u16::shl(a[3usize], shift as u16), + u16::shl(a[4usize], shift as u16), + u16::shl(a[5usize], shift as u16), + u16::shl(a[6usize], shift as u16), + u16::shl(a[7usize], shift as u16), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_eq_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { + [ + -(u16::eq(&a[0usize], &b[0usize]) as i16), + -(u16::eq(&a[1usize], &b[1usize]) as i16), + -(u16::eq(&a[2usize], &b[2usize]) as i16), + -(u16::eq(&a[3usize], &b[3usize]) as i16), + -(u16::eq(&a[4usize], &b[4usize]) as i16), + -(u16::eq(&a[5usize], &b[5usize]) as i16), + -(u16::eq(&a[6usize], &b[6usize]) as i16), + -(u16::eq(&a[7usize], &b[7usize]) as i16), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_lt_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { + [ + -(u16::lt(&a[0usize], &b[0usize]) as i16), + -(u16::lt(&a[1usize], &b[1usize]) as i16), + -(u16::lt(&a[2usize], &b[2usize]) as i16), + -(u16::lt(&a[3usize], &b[3usize]) as i16), + -(u16::lt(&a[4usize], &b[4usize]) as i16), + -(u16::lt(&a[5usize], &b[5usize]) as i16), + -(u16::lt(&a[6usize], &b[6usize]) as i16), + -(u16::lt(&a[7usize], &b[7usize]) as i16), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_le_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { + [ + -(u16::le(&a[0usize], &b[0usize]) as i16), + -(u16::le(&a[1usize], &b[1usize]) as i16), + -(u16::le(&a[2usize], &b[2usize]) as i16), + -(u16::le(&a[3usize], &b[3usize]) as i16), + -(u16::le(&a[4usize], &b[4usize]) as i16), + -(u16::le(&a[5usize], &b[5usize]) as i16), + -(u16::le(&a[6usize], &b[6usize]) as i16), + -(u16::le(&a[7usize], &b[7usize]) as i16), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_ge_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { + [ + -(u16::ge(&a[0usize], &b[0usize]) as i16), + -(u16::ge(&a[1usize], &b[1usize]) as i16), + -(u16::ge(&a[2usize], &b[2usize]) as i16), + -(u16::ge(&a[3usize], &b[3usize]) as i16), + -(u16::ge(&a[4usize], &b[4usize]) as i16), + -(u16::ge(&a[5usize], &b[5usize]) as i16), + -(u16::ge(&a[6usize], &b[6usize]) as i16), + -(u16::ge(&a[7usize], &b[7usize]) as i16), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_gt_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { + [ + -(u16::gt(&a[0usize], &b[0usize]) as i16), + -(u16::gt(&a[1usize], &b[1usize]) as i16), + -(u16::gt(&a[2usize], 
&b[2usize]) as i16), + -(u16::gt(&a[3usize], &b[3usize]) as i16), + -(u16::gt(&a[4usize], &b[4usize]) as i16), + -(u16::gt(&a[5usize], &b[5usize]) as i16), + -(u16::gt(&a[6usize], &b[6usize]) as i16), + -(u16::gt(&a[7usize], &b[7usize]) as i16), + ] + .simd_into(self) + } + #[inline(always)] + fn zip_low_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + [ + a[0usize], b[0usize], a[1usize], b[1usize], a[2usize], b[2usize], a[3usize], b[3usize], + ] + .simd_into(self) + } + #[inline(always)] + fn zip_high_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + [ + a[4usize], b[4usize], a[5usize], b[5usize], a[6usize], b[6usize], a[7usize], b[7usize], + ] + .simd_into(self) + } + #[inline(always)] + fn unzip_low_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + [ + a[0usize], a[2usize], a[4usize], a[6usize], b[0usize], b[2usize], b[4usize], b[6usize], + ] + .simd_into(self) + } + #[inline(always)] + fn unzip_high_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + [ + a[1usize], a[3usize], a[5usize], a[7usize], b[1usize], b[3usize], b[5usize], b[7usize], + ] + .simd_into(self) + } + #[inline(always)] + fn select_u16x8(self, a: mask16x8, b: u16x8, c: u16x8) -> u16x8 { + [ + if a[0usize] != 0 { b[0usize] } else { c[0usize] }, + if a[1usize] != 0 { b[1usize] } else { c[1usize] }, + if a[2usize] != 0 { b[2usize] } else { c[2usize] }, + if a[3usize] != 0 { b[3usize] } else { c[3usize] }, + if a[4usize] != 0 { b[4usize] } else { c[4usize] }, + if a[5usize] != 0 { b[5usize] } else { c[5usize] }, + if a[6usize] != 0 { b[6usize] } else { c[6usize] }, + if a[7usize] != 0 { b[7usize] } else { c[7usize] }, + ] + .simd_into(self) + } + #[inline(always)] + fn min_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + [ + u16::min(a[0usize], b[0usize]), + u16::min(a[1usize], b[1usize]), + u16::min(a[2usize], b[2usize]), + u16::min(a[3usize], b[3usize]), + u16::min(a[4usize], b[4usize]), + u16::min(a[5usize], b[5usize]), + u16::min(a[6usize], b[6usize]), + u16::min(a[7usize], b[7usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn max_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + [ + u16::max(a[0usize], b[0usize]), + u16::max(a[1usize], b[1usize]), + u16::max(a[2usize], b[2usize]), + u16::max(a[3usize], b[3usize]), + u16::max(a[4usize], b[4usize]), + u16::max(a[5usize], b[5usize]), + u16::max(a[6usize], b[6usize]), + u16::max(a[7usize], b[7usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn combine_u16x8(self, a: u16x8, b: u16x8) -> u16x16 { + let mut result = [0; 16usize]; + result[0..8usize].copy_from_slice(&a.val); + result[8usize..16usize].copy_from_slice(&b.val); + result.simd_into(self) + } + #[inline(always)] + fn reinterpret_u8_u16x8(self, a: u16x8) -> u8x16 { + u8x16 { + val: bytemuck::cast(a.val), + simd: a.simd, + } + } + #[inline(always)] + fn reinterpret_u32_u16x8(self, a: u16x8) -> u32x4 { + u32x4 { + val: bytemuck::cast(a.val), + simd: a.simd, + } + } + #[inline(always)] + fn splat_mask16x8(self, val: i16) -> mask16x8 { + [val; 8usize].simd_into(self) + } + #[inline(always)] + fn not_mask16x8(self, a: mask16x8) -> mask16x8 { + [ + i16::not(a[0usize]), + i16::not(a[1usize]), + i16::not(a[2usize]), + i16::not(a[3usize]), + i16::not(a[4usize]), + i16::not(a[5usize]), + i16::not(a[6usize]), + i16::not(a[7usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn and_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { + [ + i16::bitand(a[0usize], &b[0usize]), + i16::bitand(a[1usize], &b[1usize]), + i16::bitand(a[2usize], &b[2usize]), + i16::bitand(a[3usize], &b[3usize]), + i16::bitand(a[4usize], &b[4usize]), + 
i16::bitand(a[5usize], &b[5usize]), + i16::bitand(a[6usize], &b[6usize]), + i16::bitand(a[7usize], &b[7usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn or_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { + [ + i16::bitor(a[0usize], &b[0usize]), + i16::bitor(a[1usize], &b[1usize]), + i16::bitor(a[2usize], &b[2usize]), + i16::bitor(a[3usize], &b[3usize]), + i16::bitor(a[4usize], &b[4usize]), + i16::bitor(a[5usize], &b[5usize]), + i16::bitor(a[6usize], &b[6usize]), + i16::bitor(a[7usize], &b[7usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn xor_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { + [ + i16::bitxor(a[0usize], &b[0usize]), + i16::bitxor(a[1usize], &b[1usize]), + i16::bitxor(a[2usize], &b[2usize]), + i16::bitxor(a[3usize], &b[3usize]), + i16::bitxor(a[4usize], &b[4usize]), + i16::bitxor(a[5usize], &b[5usize]), + i16::bitxor(a[6usize], &b[6usize]), + i16::bitxor(a[7usize], &b[7usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn select_mask16x8( + self, + a: mask16x8, + b: mask16x8, + c: mask16x8, + ) -> mask16x8 { + [ + if a[0usize] != 0 { b[0usize] } else { c[0usize] }, + if a[1usize] != 0 { b[1usize] } else { c[1usize] }, + if a[2usize] != 0 { b[2usize] } else { c[2usize] }, + if a[3usize] != 0 { b[3usize] } else { c[3usize] }, + if a[4usize] != 0 { b[4usize] } else { c[4usize] }, + if a[5usize] != 0 { b[5usize] } else { c[5usize] }, + if a[6usize] != 0 { b[6usize] } else { c[6usize] }, + if a[7usize] != 0 { b[7usize] } else { c[7usize] }, + ] + .simd_into(self) + } + #[inline(always)] + fn simd_eq_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { + [ + -(i16::eq(&a[0usize], &b[0usize]) as i16), + -(i16::eq(&a[1usize], &b[1usize]) as i16), + -(i16::eq(&a[2usize], &b[2usize]) as i16), + -(i16::eq(&a[3usize], &b[3usize]) as i16), + -(i16::eq(&a[4usize], &b[4usize]) as i16), + -(i16::eq(&a[5usize], &b[5usize]) as i16), + -(i16::eq(&a[6usize], &b[6usize]) as i16), + -(i16::eq(&a[7usize], &b[7usize]) as i16), + ] + .simd_into(self) + } + #[inline(always)] + fn combine_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x16 { + let mut result = [0; 16usize]; + result[0..8usize].copy_from_slice(&a.val); + result[8usize..16usize].copy_from_slice(&b.val); + result.simd_into(self) + } + #[inline(always)] + fn splat_i32x4(self, val: i32) -> i32x4 { + [val; 4usize].simd_into(self) + } + #[inline(always)] + fn not_i32x4(self, a: i32x4) -> i32x4 { + [ + i32::not(a[0usize]), + i32::not(a[1usize]), + i32::not(a[2usize]), + i32::not(a[3usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn add_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + [ + i32::wrapping_add(a[0usize], b[0usize]), + i32::wrapping_add(a[1usize], b[1usize]), + i32::wrapping_add(a[2usize], b[2usize]), + i32::wrapping_add(a[3usize], b[3usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn sub_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + [ + i32::wrapping_sub(a[0usize], b[0usize]), + i32::wrapping_sub(a[1usize], b[1usize]), + i32::wrapping_sub(a[2usize], b[2usize]), + i32::wrapping_sub(a[3usize], b[3usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn mul_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + [ + i32::wrapping_mul(a[0usize], b[0usize]), + i32::wrapping_mul(a[1usize], b[1usize]), + i32::wrapping_mul(a[2usize], b[2usize]), + i32::wrapping_mul(a[3usize], b[3usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn and_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + [ + i32::bitand(a[0usize], &b[0usize]), + i32::bitand(a[1usize], &b[1usize]), + 
i32::bitand(a[2usize], &b[2usize]), + i32::bitand(a[3usize], &b[3usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn or_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + [ + i32::bitor(a[0usize], &b[0usize]), + i32::bitor(a[1usize], &b[1usize]), + i32::bitor(a[2usize], &b[2usize]), + i32::bitor(a[3usize], &b[3usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn xor_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + [ + i32::bitxor(a[0usize], &b[0usize]), + i32::bitxor(a[1usize], &b[1usize]), + i32::bitxor(a[2usize], &b[2usize]), + i32::bitxor(a[3usize], &b[3usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn shr_i32x4(self, a: i32x4, shift: u32) -> i32x4 { + [ + i32::shr(a[0usize], shift as i32), + i32::shr(a[1usize], shift as i32), + i32::shr(a[2usize], shift as i32), + i32::shr(a[3usize], shift as i32), + ] + .simd_into(self) + } + #[inline(always)] + fn shrv_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + [ + i32::shr(a[0usize], &b[0usize]), + i32::shr(a[1usize], &b[1usize]), + i32::shr(a[2usize], &b[2usize]), + i32::shr(a[3usize], &b[3usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn shl_i32x4(self, a: i32x4, shift: u32) -> i32x4 { + [ + i32::shl(a[0usize], shift as i32), + i32::shl(a[1usize], shift as i32), + i32::shl(a[2usize], shift as i32), + i32::shl(a[3usize], shift as i32), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_eq_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { + [ + -(i32::eq(&a[0usize], &b[0usize]) as i32), + -(i32::eq(&a[1usize], &b[1usize]) as i32), + -(i32::eq(&a[2usize], &b[2usize]) as i32), + -(i32::eq(&a[3usize], &b[3usize]) as i32), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_lt_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { + [ + -(i32::lt(&a[0usize], &b[0usize]) as i32), + -(i32::lt(&a[1usize], &b[1usize]) as i32), + -(i32::lt(&a[2usize], &b[2usize]) as i32), + -(i32::lt(&a[3usize], &b[3usize]) as i32), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_le_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { + [ + -(i32::le(&a[0usize], &b[0usize]) as i32), + -(i32::le(&a[1usize], &b[1usize]) as i32), + -(i32::le(&a[2usize], &b[2usize]) as i32), + -(i32::le(&a[3usize], &b[3usize]) as i32), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_ge_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { + [ + -(i32::ge(&a[0usize], &b[0usize]) as i32), + -(i32::ge(&a[1usize], &b[1usize]) as i32), + -(i32::ge(&a[2usize], &b[2usize]) as i32), + -(i32::ge(&a[3usize], &b[3usize]) as i32), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_gt_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { + [ + -(i32::gt(&a[0usize], &b[0usize]) as i32), + -(i32::gt(&a[1usize], &b[1usize]) as i32), + -(i32::gt(&a[2usize], &b[2usize]) as i32), + -(i32::gt(&a[3usize], &b[3usize]) as i32), + ] + .simd_into(self) + } + #[inline(always)] + fn zip_low_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + [a[0usize], b[0usize], a[1usize], b[1usize]].simd_into(self) + } + #[inline(always)] + fn zip_high_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + [a[2usize], b[2usize], a[3usize], b[3usize]].simd_into(self) + } + #[inline(always)] + fn unzip_low_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + [a[0usize], a[2usize], b[0usize], b[2usize]].simd_into(self) + } + #[inline(always)] + fn unzip_high_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + [a[1usize], a[3usize], b[1usize], b[3usize]].simd_into(self) + } + #[inline(always)] + fn select_i32x4(self, a: mask32x4, b: i32x4, c: i32x4) -> i32x4 { + [ + if a[0usize] != 0 { b[0usize] } else { c[0usize] }, + if a[1usize] != 0 { 
b[1usize] } else { c[1usize] }, + if a[2usize] != 0 { b[2usize] } else { c[2usize] }, + if a[3usize] != 0 { b[3usize] } else { c[3usize] }, + ] + .simd_into(self) + } + #[inline(always)] + fn min_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + [ + i32::min(a[0usize], b[0usize]), + i32::min(a[1usize], b[1usize]), + i32::min(a[2usize], b[2usize]), + i32::min(a[3usize], b[3usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn max_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + [ + i32::max(a[0usize], b[0usize]), + i32::max(a[1usize], b[1usize]), + i32::max(a[2usize], b[2usize]), + i32::max(a[3usize], b[3usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn combine_i32x4(self, a: i32x4, b: i32x4) -> i32x8 { + let mut result = [0; 8usize]; + result[0..4usize].copy_from_slice(&a.val); + result[4usize..8usize].copy_from_slice(&b.val); + result.simd_into(self) + } + #[inline(always)] + fn neg_i32x4(self, a: i32x4) -> i32x4 { + [ + i32::neg(a[0usize]), + i32::neg(a[1usize]), + i32::neg(a[2usize]), + i32::neg(a[3usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn reinterpret_u8_i32x4(self, a: i32x4) -> u8x16 { + u8x16 { + val: bytemuck::cast(a.val), + simd: a.simd, + } + } + #[inline(always)] + fn reinterpret_u32_i32x4(self, a: i32x4) -> u32x4 { + u32x4 { + val: bytemuck::cast(a.val), + simd: a.simd, + } + } + #[inline(always)] + fn cvt_f32_i32x4(self, a: i32x4) -> f32x4 { + [ + a[0usize] as f32, + a[1usize] as f32, + a[2usize] as f32, + a[3usize] as f32, + ] + .simd_into(self) + } + #[inline(always)] + fn splat_u32x4(self, val: u32) -> u32x4 { + [val; 4usize].simd_into(self) + } + #[inline(always)] + fn not_u32x4(self, a: u32x4) -> u32x4 { + [ + u32::not(a[0usize]), + u32::not(a[1usize]), + u32::not(a[2usize]), + u32::not(a[3usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn add_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + [ + u32::wrapping_add(a[0usize], b[0usize]), + u32::wrapping_add(a[1usize], b[1usize]), + u32::wrapping_add(a[2usize], b[2usize]), + u32::wrapping_add(a[3usize], b[3usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn sub_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + [ + u32::wrapping_sub(a[0usize], b[0usize]), + u32::wrapping_sub(a[1usize], b[1usize]), + u32::wrapping_sub(a[2usize], b[2usize]), + u32::wrapping_sub(a[3usize], b[3usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn mul_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + [ + u32::wrapping_mul(a[0usize], b[0usize]), + u32::wrapping_mul(a[1usize], b[1usize]), + u32::wrapping_mul(a[2usize], b[2usize]), + u32::wrapping_mul(a[3usize], b[3usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn and_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + [ + u32::bitand(a[0usize], &b[0usize]), + u32::bitand(a[1usize], &b[1usize]), + u32::bitand(a[2usize], &b[2usize]), + u32::bitand(a[3usize], &b[3usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn or_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + [ + u32::bitor(a[0usize], &b[0usize]), + u32::bitor(a[1usize], &b[1usize]), + u32::bitor(a[2usize], &b[2usize]), + u32::bitor(a[3usize], &b[3usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn xor_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + [ + u32::bitxor(a[0usize], &b[0usize]), + u32::bitxor(a[1usize], &b[1usize]), + u32::bitxor(a[2usize], &b[2usize]), + u32::bitxor(a[3usize], &b[3usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn shr_u32x4(self, a: u32x4, shift: u32) -> u32x4 { + [ + u32::shr(a[0usize], shift as u32), + u32::shr(a[1usize], shift as u32), + 
u32::shr(a[2usize], shift as u32), + u32::shr(a[3usize], shift as u32), + ] + .simd_into(self) + } + #[inline(always)] + fn shrv_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + [ + u32::shr(a[0usize], &b[0usize]), + u32::shr(a[1usize], &b[1usize]), + u32::shr(a[2usize], &b[2usize]), + u32::shr(a[3usize], &b[3usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn shl_u32x4(self, a: u32x4, shift: u32) -> u32x4 { + [ + u32::shl(a[0usize], shift as u32), + u32::shl(a[1usize], shift as u32), + u32::shl(a[2usize], shift as u32), + u32::shl(a[3usize], shift as u32), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_eq_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { + [ + -(u32::eq(&a[0usize], &b[0usize]) as i32), + -(u32::eq(&a[1usize], &b[1usize]) as i32), + -(u32::eq(&a[2usize], &b[2usize]) as i32), + -(u32::eq(&a[3usize], &b[3usize]) as i32), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_lt_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { + [ + -(u32::lt(&a[0usize], &b[0usize]) as i32), + -(u32::lt(&a[1usize], &b[1usize]) as i32), + -(u32::lt(&a[2usize], &b[2usize]) as i32), + -(u32::lt(&a[3usize], &b[3usize]) as i32), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_le_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { + [ + -(u32::le(&a[0usize], &b[0usize]) as i32), + -(u32::le(&a[1usize], &b[1usize]) as i32), + -(u32::le(&a[2usize], &b[2usize]) as i32), + -(u32::le(&a[3usize], &b[3usize]) as i32), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_ge_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { + [ + -(u32::ge(&a[0usize], &b[0usize]) as i32), + -(u32::ge(&a[1usize], &b[1usize]) as i32), + -(u32::ge(&a[2usize], &b[2usize]) as i32), + -(u32::ge(&a[3usize], &b[3usize]) as i32), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_gt_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { + [ + -(u32::gt(&a[0usize], &b[0usize]) as i32), + -(u32::gt(&a[1usize], &b[1usize]) as i32), + -(u32::gt(&a[2usize], &b[2usize]) as i32), + -(u32::gt(&a[3usize], &b[3usize]) as i32), + ] + .simd_into(self) + } + #[inline(always)] + fn zip_low_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + [a[0usize], b[0usize], a[1usize], b[1usize]].simd_into(self) + } + #[inline(always)] + fn zip_high_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + [a[2usize], b[2usize], a[3usize], b[3usize]].simd_into(self) + } + #[inline(always)] + fn unzip_low_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + [a[0usize], a[2usize], b[0usize], b[2usize]].simd_into(self) + } + #[inline(always)] + fn unzip_high_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + [a[1usize], a[3usize], b[1usize], b[3usize]].simd_into(self) + } + #[inline(always)] + fn select_u32x4(self, a: mask32x4, b: u32x4, c: u32x4) -> u32x4 { + [ + if a[0usize] != 0 { b[0usize] } else { c[0usize] }, + if a[1usize] != 0 { b[1usize] } else { c[1usize] }, + if a[2usize] != 0 { b[2usize] } else { c[2usize] }, + if a[3usize] != 0 { b[3usize] } else { c[3usize] }, + ] + .simd_into(self) + } + #[inline(always)] + fn min_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + [ + u32::min(a[0usize], b[0usize]), + u32::min(a[1usize], b[1usize]), + u32::min(a[2usize], b[2usize]), + u32::min(a[3usize], b[3usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn max_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + [ + u32::max(a[0usize], b[0usize]), + u32::max(a[1usize], b[1usize]), + u32::max(a[2usize], b[2usize]), + u32::max(a[3usize], b[3usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn combine_u32x4(self, a: u32x4, b: u32x4) -> u32x8 { + let mut result = [0; 8usize]; + 
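// copy a into the low four lanes and b into the high four lanes of the result +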
result[0..4usize].copy_from_slice(&a.val); + result[4usize..8usize].copy_from_slice(&b.val); + result.simd_into(self) + } + #[inline(always)] + fn reinterpret_u8_u32x4(self, a: u32x4) -> u8x16 { + u8x16 { + val: bytemuck::cast(a.val), + simd: a.simd, + } + } + #[inline(always)] + fn cvt_f32_u32x4(self, a: u32x4) -> f32x4 { + [ + a[0usize] as f32, + a[1usize] as f32, + a[2usize] as f32, + a[3usize] as f32, + ] + .simd_into(self) + } + #[inline(always)] + fn splat_mask32x4(self, val: i32) -> mask32x4 { + [val; 4usize].simd_into(self) + } + #[inline(always)] + fn not_mask32x4(self, a: mask32x4) -> mask32x4 { + [ + i32::not(a[0usize]), + i32::not(a[1usize]), + i32::not(a[2usize]), + i32::not(a[3usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn and_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { + [ + i32::bitand(a[0usize], &b[0usize]), + i32::bitand(a[1usize], &b[1usize]), + i32::bitand(a[2usize], &b[2usize]), + i32::bitand(a[3usize], &b[3usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn or_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { + [ + i32::bitor(a[0usize], &b[0usize]), + i32::bitor(a[1usize], &b[1usize]), + i32::bitor(a[2usize], &b[2usize]), + i32::bitor(a[3usize], &b[3usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn xor_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { + [ + i32::bitxor(a[0usize], &b[0usize]), + i32::bitxor(a[1usize], &b[1usize]), + i32::bitxor(a[2usize], &b[2usize]), + i32::bitxor(a[3usize], &b[3usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn select_mask32x4( + self, + a: mask32x4, + b: mask32x4, + c: mask32x4, + ) -> mask32x4 { + [ + if a[0usize] != 0 { b[0usize] } else { c[0usize] }, + if a[1usize] != 0 { b[1usize] } else { c[1usize] }, + if a[2usize] != 0 { b[2usize] } else { c[2usize] }, + if a[3usize] != 0 { b[3usize] } else { c[3usize] }, + ] + .simd_into(self) + } + #[inline(always)] + fn simd_eq_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { + [ + -(i32::eq(&a[0usize], &b[0usize]) as i32), + -(i32::eq(&a[1usize], &b[1usize]) as i32), + -(i32::eq(&a[2usize], &b[2usize]) as i32), + -(i32::eq(&a[3usize], &b[3usize]) as i32), + ] + .simd_into(self) + } + #[inline(always)] + fn combine_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x8 { + let mut result = [0; 8usize]; + result[0..4usize].copy_from_slice(&a.val); + result[4usize..8usize].copy_from_slice(&b.val); + result.simd_into(self) + } + #[inline(always)] + fn splat_f64x2(self, val: f64) -> f64x2 { + [val; 2usize].simd_into(self) + } + #[inline(always)] + fn abs_f64x2(self, a: f64x2) -> f64x2 { + [f64::abs(a[0usize]), f64::abs(a[1usize])].simd_into(self) + } + #[inline(always)] + fn neg_f64x2(self, a: f64x2) -> f64x2 { + [f64::neg(a[0usize]), f64::neg(a[1usize])].simd_into(self) + } + #[inline(always)] + fn sqrt_f64x2(self, a: f64x2) -> f64x2 { + [f64::sqrt(a[0usize]), f64::sqrt(a[1usize])].simd_into(self) + } + #[inline(always)] + fn add_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + [ + f64::add(a[0usize], &b[0usize]), + f64::add(a[1usize], &b[1usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn sub_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + [ + f64::sub(a[0usize], &b[0usize]), + f64::sub(a[1usize], &b[1usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn mul_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + [ + f64::mul(a[0usize], &b[0usize]), + f64::mul(a[1usize], &b[1usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn div_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + [ + f64::div(a[0usize], &b[0usize]), + 
f64::div(a[1usize], &b[1usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn copysign_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + [ + f64::copysign(a[0usize], b[0usize]), + f64::copysign(a[1usize], b[1usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_eq_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { + [ + -(f64::eq(&a[0usize], &b[0usize]) as i64), + -(f64::eq(&a[1usize], &b[1usize]) as i64), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_lt_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { + [ + -(f64::lt(&a[0usize], &b[0usize]) as i64), + -(f64::lt(&a[1usize], &b[1usize]) as i64), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_le_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { + [ + -(f64::le(&a[0usize], &b[0usize]) as i64), + -(f64::le(&a[1usize], &b[1usize]) as i64), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_ge_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { + [ + -(f64::ge(&a[0usize], &b[0usize]) as i64), + -(f64::ge(&a[1usize], &b[1usize]) as i64), + ] + .simd_into(self) + } + #[inline(always)] + fn simd_gt_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { + [ + -(f64::gt(&a[0usize], &b[0usize]) as i64), + -(f64::gt(&a[1usize], &b[1usize]) as i64), + ] + .simd_into(self) + } + #[inline(always)] + fn zip_low_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + [a[0usize], b[0usize]].simd_into(self) + } + #[inline(always)] + fn zip_high_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + [a[1usize], b[1usize]].simd_into(self) + } + #[inline(always)] + fn unzip_low_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + [a[0usize], b[0usize]].simd_into(self) + } + #[inline(always)] + fn unzip_high_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + [a[1usize], b[1usize]].simd_into(self) + } + #[inline(always)] + fn max_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + [ + f64::max(a[0usize], b[0usize]), + f64::max(a[1usize], b[1usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn max_precise_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + [ + f64::max(a[0usize], b[0usize]), + f64::max(a[1usize], b[1usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn min_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + [ + f64::min(a[0usize], b[0usize]), + f64::min(a[1usize], b[1usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn min_precise_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + [ + f64::min(a[0usize], b[0usize]), + f64::min(a[1usize], b[1usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn madd_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 { + a.mul(b).add(c) + } + #[inline(always)] + fn msub_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 { + a.mul(b).sub(c) + } + #[inline(always)] + fn floor_f64x2(self, a: f64x2) -> f64x2 { + [f64::floor(a[0usize]), f64::floor(a[1usize])].simd_into(self) + } + #[inline(always)] + fn fract_f64x2(self, a: f64x2) -> f64x2 { + [f64::fract(a[0usize]), f64::fract(a[1usize])].simd_into(self) + } + #[inline(always)] + fn trunc_f64x2(self, a: f64x2) -> f64x2 { + [f64::trunc(a[0usize]), f64::trunc(a[1usize])].simd_into(self) + } + #[inline(always)] + fn select_f64x2(self, a: mask64x2, b: f64x2, c: f64x2) -> f64x2 { + [ + if a[0usize] != 0 { b[0usize] } else { c[0usize] }, + if a[1usize] != 0 { b[1usize] } else { c[1usize] }, + ] + .simd_into(self) + } + #[inline(always)] + fn combine_f64x2(self, a: f64x2, b: f64x2) -> f64x4 { + let mut result = [0.0; 4usize]; + result[0..2usize].copy_from_slice(&a.val); + result[2usize..4usize].copy_from_slice(&b.val); + result.simd_into(self) + } + #[inline(always)] + fn 
reinterpret_f32_f64x2(self, a: f64x2) -> f32x4 { + f32x4 { + val: bytemuck::cast(a.val), + simd: a.simd, + } + } + #[inline(always)] + fn splat_mask64x2(self, val: i64) -> mask64x2 { + [val; 2usize].simd_into(self) + } + #[inline(always)] + fn not_mask64x2(self, a: mask64x2) -> mask64x2 { + [i64::not(a[0usize]), i64::not(a[1usize])].simd_into(self) + } + #[inline(always)] + fn and_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { + [ + i64::bitand(a[0usize], &b[0usize]), + i64::bitand(a[1usize], &b[1usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn or_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { + [ + i64::bitor(a[0usize], &b[0usize]), + i64::bitor(a[1usize], &b[1usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn xor_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { + [ + i64::bitxor(a[0usize], &b[0usize]), + i64::bitxor(a[1usize], &b[1usize]), + ] + .simd_into(self) + } + #[inline(always)] + fn select_mask64x2( + self, + a: mask64x2, + b: mask64x2, + c: mask64x2, + ) -> mask64x2 { + [ + if a[0usize] != 0 { b[0usize] } else { c[0usize] }, + if a[1usize] != 0 { b[1usize] } else { c[1usize] }, + ] + .simd_into(self) + } + #[inline(always)] + fn simd_eq_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { + [ + -(i64::eq(&a[0usize], &b[0usize]) as i64), + -(i64::eq(&a[1usize], &b[1usize]) as i64), + ] + .simd_into(self) + } + #[inline(always)] + fn combine_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x4 { + let mut result = [0; 4usize]; + result[0..2usize].copy_from_slice(&a.val); + result[2usize..4usize].copy_from_slice(&b.val); + result.simd_into(self) + } + #[inline(always)] + fn splat_f32x8(self, a: f32) -> f32x8 { + let half = self.splat_f32x4(a); + self.combine_f32x4(half, half) + } + #[inline(always)] + fn abs_f32x8(self, a: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f32x4(self.abs_f32x4(a0), self.abs_f32x4(a1)) + } + #[inline(always)] + fn neg_f32x8(self, a: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f32x4(self.neg_f32x4(a0), self.neg_f32x4(a1)) + } + #[inline(always)] + fn sqrt_f32x8(self, a: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f32x4(self.sqrt_f32x4(a0), self.sqrt_f32x4(a1)) + } + #[inline(always)] + fn add_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4(self.add_f32x4(a0, b0), self.add_f32x4(a1, b1)) + } + #[inline(always)] + fn sub_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4(self.sub_f32x4(a0, b0), self.sub_f32x4(a1, b1)) + } + #[inline(always)] + fn mul_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4(self.mul_f32x4(a0, b0), self.mul_f32x4(a1, b1)) + } + #[inline(always)] + fn div_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4(self.div_f32x4(a0, b0), self.div_f32x4(a1, b1)) + } + #[inline(always)] + fn copysign_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4(self.copysign_f32x4(a0, b0), self.copysign_f32x4(a1, b1)) + } + #[inline(always)] + fn simd_eq_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + 
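// compare each 4-lane half separately, then stitch the two mask halves back together +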
self.combine_mask32x4(self.simd_eq_f32x4(a0, b0), self.simd_eq_f32x4(a1, b1)) + } + #[inline(always)] + fn simd_lt_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_mask32x4(self.simd_lt_f32x4(a0, b0), self.simd_lt_f32x4(a1, b1)) + } + #[inline(always)] + fn simd_le_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_mask32x4(self.simd_le_f32x4(a0, b0), self.simd_le_f32x4(a1, b1)) + } + #[inline(always)] + fn simd_ge_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_mask32x4(self.simd_ge_f32x4(a0, b0), self.simd_ge_f32x4(a1, b1)) + } + #[inline(always)] + fn simd_gt_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_mask32x4(self.simd_gt_f32x4(a0, b0), self.simd_gt_f32x4(a1, b1)) + } + #[inline(always)] + fn zip_low_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, _) = self.split_f32x8(a); + let (b0, _) = self.split_f32x8(b); + self.combine_f32x4(self.zip_low_f32x4(a0, b0), self.zip_high_f32x4(a0, b0)) + } + #[inline(always)] + fn zip_high_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (_, a1) = self.split_f32x8(a); + let (_, b1) = self.split_f32x8(b); + self.combine_f32x4(self.zip_low_f32x4(a1, b1), self.zip_high_f32x4(a1, b1)) + } + #[inline(always)] + fn unzip_low_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4(self.unzip_low_f32x4(a0, a1), self.unzip_low_f32x4(b0, b1)) + } + #[inline(always)] + fn unzip_high_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4(self.unzip_high_f32x4(a0, a1), self.unzip_high_f32x4(b0, b1)) + } + #[inline(always)] + fn max_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4(self.max_f32x4(a0, b0), self.max_f32x4(a1, b1)) + } + #[inline(always)] + fn max_precise_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4( + self.max_precise_f32x4(a0, b0), + self.max_precise_f32x4(a1, b1), + ) + } + #[inline(always)] + fn min_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4(self.min_f32x4(a0, b0), self.min_f32x4(a1, b1)) + } + #[inline(always)] + fn min_precise_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4( + self.min_precise_f32x4(a0, b0), + self.min_precise_f32x4(a1, b1), + ) + } + #[inline(always)] + fn madd_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + let (c0, c1) = self.split_f32x8(c); + self.combine_f32x4(self.madd_f32x4(a0, b0, c0), self.madd_f32x4(a1, b1, c1)) + } + #[inline(always)] + fn msub_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + let (c0, c1) = self.split_f32x8(c); + self.combine_f32x4(self.msub_f32x4(a0, b0, c0), self.msub_f32x4(a1, b1, c1)) + } + #[inline(always)] + fn floor_f32x8(self, a: f32x8) -> 
f32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f32x4(self.floor_f32x4(a0), self.floor_f32x4(a1)) + } + #[inline(always)] + fn fract_f32x8(self, a: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f32x4(self.fract_f32x4(a0), self.fract_f32x4(a1)) + } + #[inline(always)] + fn trunc_f32x8(self, a: f32x8) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f32x4(self.trunc_f32x4(a0), self.trunc_f32x4(a1)) + } + #[inline(always)] + fn select_f32x8(self, a: mask32x8, b: f32x8, c: f32x8) -> f32x8 { + let (a0, a1) = self.split_mask32x8(a); + let (b0, b1) = self.split_f32x8(b); + let (c0, c1) = self.split_f32x8(c); + self.combine_f32x4(self.select_f32x4(a0, b0, c0), self.select_f32x4(a1, b1, c1)) + } + #[inline(always)] + fn combine_f32x8(self, a: f32x8, b: f32x8) -> f32x16 { + let mut result = [0.0; 16usize]; + result[0..8usize].copy_from_slice(&a.val); + result[8usize..16usize].copy_from_slice(&b.val); + result.simd_into(self) + } + #[inline(always)] + fn split_f32x8(self, a: f32x8) -> (f32x4, f32x4) { + let mut b0 = [0.0; 4usize]; + let mut b1 = [0.0; 4usize]; + b0.copy_from_slice(&a.val[0..4usize]); + b1.copy_from_slice(&a.val[4usize..8usize]); + (b0.simd_into(self), b1.simd_into(self)) + } + #[inline(always)] + fn reinterpret_f64_f32x8(self, a: f32x8) -> f64x4 { + let (a0, a1) = self.split_f32x8(a); + self.combine_f64x2( + self.reinterpret_f64_f32x4(a0), + self.reinterpret_f64_f32x4(a1), + ) + } + #[inline(always)] + fn reinterpret_i32_f32x8(self, a: f32x8) -> i32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_i32x4( + self.reinterpret_i32_f32x4(a0), + self.reinterpret_i32_f32x4(a1), + ) + } + #[inline(always)] + fn reinterpret_u8_f32x8(self, a: f32x8) -> u8x32 { + let (a0, a1) = self.split_f32x8(a); + self.combine_u8x16(self.reinterpret_u8_f32x4(a0), self.reinterpret_u8_f32x4(a1)) + } + #[inline(always)] + fn reinterpret_u32_f32x8(self, a: f32x8) -> u32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_u32x4( + self.reinterpret_u32_f32x4(a0), + self.reinterpret_u32_f32x4(a1), + ) + } + #[inline(always)] + fn cvt_u32_f32x8(self, a: f32x8) -> u32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_u32x4(self.cvt_u32_f32x4(a0), self.cvt_u32_f32x4(a1)) + } + #[inline(always)] + fn cvt_i32_f32x8(self, a: f32x8) -> i32x8 { + let (a0, a1) = self.split_f32x8(a); + self.combine_i32x4(self.cvt_i32_f32x4(a0), self.cvt_i32_f32x4(a1)) + } + #[inline(always)] + fn splat_i8x32(self, a: i8) -> i8x32 { + let half = self.splat_i8x16(a); + self.combine_i8x16(half, half) + } + #[inline(always)] + fn not_i8x32(self, a: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + self.combine_i8x16(self.not_i8x16(a0), self.not_i8x16(a1)) + } + #[inline(always)] + fn add_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16(self.add_i8x16(a0, b0), self.add_i8x16(a1, b1)) + } + #[inline(always)] + fn sub_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16(self.sub_i8x16(a0, b0), self.sub_i8x16(a1, b1)) + } + #[inline(always)] + fn mul_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16(self.mul_i8x16(a0, b0), self.mul_i8x16(a1, b1)) + } + #[inline(always)] + fn and_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + 
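// the wider vector is processed as two 128-bit halves and the results recombined +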
self.combine_i8x16(self.and_i8x16(a0, b0), self.and_i8x16(a1, b1)) + } + #[inline(always)] + fn or_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16(self.or_i8x16(a0, b0), self.or_i8x16(a1, b1)) + } + #[inline(always)] + fn xor_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16(self.xor_i8x16(a0, b0), self.xor_i8x16(a1, b1)) + } + #[inline(always)] + fn shr_i8x32(self, a: i8x32, b: u32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + self.combine_i8x16(self.shr_i8x16(a0, b), self.shr_i8x16(a1, b)) + } + #[inline(always)] + fn shrv_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16(self.shrv_i8x16(a0, b0), self.shrv_i8x16(a1, b1)) + } + #[inline(always)] + fn shl_i8x32(self, a: i8x32, b: u32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + self.combine_i8x16(self.shl_i8x16(a0, b), self.shl_i8x16(a1, b)) + } + #[inline(always)] + fn simd_eq_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_mask8x16(self.simd_eq_i8x16(a0, b0), self.simd_eq_i8x16(a1, b1)) + } + #[inline(always)] + fn simd_lt_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_mask8x16(self.simd_lt_i8x16(a0, b0), self.simd_lt_i8x16(a1, b1)) + } + #[inline(always)] + fn simd_le_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_mask8x16(self.simd_le_i8x16(a0, b0), self.simd_le_i8x16(a1, b1)) + } + #[inline(always)] + fn simd_ge_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_mask8x16(self.simd_ge_i8x16(a0, b0), self.simd_ge_i8x16(a1, b1)) + } + #[inline(always)] + fn simd_gt_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_mask8x16(self.simd_gt_i8x16(a0, b0), self.simd_gt_i8x16(a1, b1)) + } + #[inline(always)] + fn zip_low_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, _) = self.split_i8x32(a); + let (b0, _) = self.split_i8x32(b); + self.combine_i8x16(self.zip_low_i8x16(a0, b0), self.zip_high_i8x16(a0, b0)) + } + #[inline(always)] + fn zip_high_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (_, a1) = self.split_i8x32(a); + let (_, b1) = self.split_i8x32(b); + self.combine_i8x16(self.zip_low_i8x16(a1, b1), self.zip_high_i8x16(a1, b1)) + } + #[inline(always)] + fn unzip_low_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16(self.unzip_low_i8x16(a0, a1), self.unzip_low_i8x16(b0, b1)) + } + #[inline(always)] + fn unzip_high_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16(self.unzip_high_i8x16(a0, a1), self.unzip_high_i8x16(b0, b1)) + } + #[inline(always)] + fn select_i8x32(self, a: mask8x32, b: i8x32, c: i8x32) -> i8x32 { + let (a0, a1) = self.split_mask8x32(a); + let (b0, b1) = self.split_i8x32(b); + let (c0, c1) = self.split_i8x32(c); + self.combine_i8x16(self.select_i8x16(a0, b0, c0), self.select_i8x16(a1, b1, c1)) + } + #[inline(always)] + fn 
min_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16(self.min_i8x16(a0, b0), self.min_i8x16(a1, b1)) + } + #[inline(always)] + fn max_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16(self.max_i8x16(a0, b0), self.max_i8x16(a1, b1)) + } + #[inline(always)] + fn combine_i8x32(self, a: i8x32, b: i8x32) -> i8x64 { + let mut result = [0; 64usize]; + result[0..32usize].copy_from_slice(&a.val); + result[32usize..64usize].copy_from_slice(&b.val); + result.simd_into(self) + } + #[inline(always)] + fn split_i8x32(self, a: i8x32) -> (i8x16, i8x16) { + let mut b0 = [0; 16usize]; + let mut b1 = [0; 16usize]; + b0.copy_from_slice(&a.val[0..16usize]); + b1.copy_from_slice(&a.val[16usize..32usize]); + (b0.simd_into(self), b1.simd_into(self)) + } + #[inline(always)] + fn neg_i8x32(self, a: i8x32) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + self.combine_i8x16(self.neg_i8x16(a0), self.neg_i8x16(a1)) + } + #[inline(always)] + fn reinterpret_u8_i8x32(self, a: i8x32) -> u8x32 { + let (a0, a1) = self.split_i8x32(a); + self.combine_u8x16(self.reinterpret_u8_i8x16(a0), self.reinterpret_u8_i8x16(a1)) + } + #[inline(always)] + fn reinterpret_u32_i8x32(self, a: i8x32) -> u32x8 { + let (a0, a1) = self.split_i8x32(a); + self.combine_u32x4( + self.reinterpret_u32_i8x16(a0), + self.reinterpret_u32_i8x16(a1), + ) + } + #[inline(always)] + fn splat_u8x32(self, a: u8) -> u8x32 { + let half = self.splat_u8x16(a); + self.combine_u8x16(half, half) + } + #[inline(always)] + fn not_u8x32(self, a: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + self.combine_u8x16(self.not_u8x16(a0), self.not_u8x16(a1)) + } + #[inline(always)] + fn add_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16(self.add_u8x16(a0, b0), self.add_u8x16(a1, b1)) + } + #[inline(always)] + fn sub_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16(self.sub_u8x16(a0, b0), self.sub_u8x16(a1, b1)) + } + #[inline(always)] + fn mul_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16(self.mul_u8x16(a0, b0), self.mul_u8x16(a1, b1)) + } + #[inline(always)] + fn and_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16(self.and_u8x16(a0, b0), self.and_u8x16(a1, b1)) + } + #[inline(always)] + fn or_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16(self.or_u8x16(a0, b0), self.or_u8x16(a1, b1)) + } + #[inline(always)] + fn xor_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16(self.xor_u8x16(a0, b0), self.xor_u8x16(a1, b1)) + } + #[inline(always)] + fn shr_u8x32(self, a: u8x32, b: u32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + self.combine_u8x16(self.shr_u8x16(a0, b), self.shr_u8x16(a1, b)) + } + #[inline(always)] + fn shrv_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16(self.shrv_u8x16(a0, b0), self.shrv_u8x16(a1, b1)) + } + #[inline(always)] + fn 
shl_u8x32(self, a: u8x32, b: u32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + self.combine_u8x16(self.shl_u8x16(a0, b), self.shl_u8x16(a1, b)) + } + #[inline(always)] + fn simd_eq_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_mask8x16(self.simd_eq_u8x16(a0, b0), self.simd_eq_u8x16(a1, b1)) + } + #[inline(always)] + fn simd_lt_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_mask8x16(self.simd_lt_u8x16(a0, b0), self.simd_lt_u8x16(a1, b1)) + } + #[inline(always)] + fn simd_le_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_mask8x16(self.simd_le_u8x16(a0, b0), self.simd_le_u8x16(a1, b1)) + } + #[inline(always)] + fn simd_ge_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_mask8x16(self.simd_ge_u8x16(a0, b0), self.simd_ge_u8x16(a1, b1)) + } + #[inline(always)] + fn simd_gt_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_mask8x16(self.simd_gt_u8x16(a0, b0), self.simd_gt_u8x16(a1, b1)) + } + #[inline(always)] + fn zip_low_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, _) = self.split_u8x32(a); + let (b0, _) = self.split_u8x32(b); + self.combine_u8x16(self.zip_low_u8x16(a0, b0), self.zip_high_u8x16(a0, b0)) + } + #[inline(always)] + fn zip_high_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (_, a1) = self.split_u8x32(a); + let (_, b1) = self.split_u8x32(b); + self.combine_u8x16(self.zip_low_u8x16(a1, b1), self.zip_high_u8x16(a1, b1)) + } + #[inline(always)] + fn unzip_low_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16(self.unzip_low_u8x16(a0, a1), self.unzip_low_u8x16(b0, b1)) + } + #[inline(always)] + fn unzip_high_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16(self.unzip_high_u8x16(a0, a1), self.unzip_high_u8x16(b0, b1)) + } + #[inline(always)] + fn select_u8x32(self, a: mask8x32, b: u8x32, c: u8x32) -> u8x32 { + let (a0, a1) = self.split_mask8x32(a); + let (b0, b1) = self.split_u8x32(b); + let (c0, c1) = self.split_u8x32(c); + self.combine_u8x16(self.select_u8x16(a0, b0, c0), self.select_u8x16(a1, b1, c1)) + } + #[inline(always)] + fn min_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16(self.min_u8x16(a0, b0), self.min_u8x16(a1, b1)) + } + #[inline(always)] + fn max_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16(self.max_u8x16(a0, b0), self.max_u8x16(a1, b1)) + } + #[inline(always)] + fn combine_u8x32(self, a: u8x32, b: u8x32) -> u8x64 { + let mut result = [0; 64usize]; + result[0..32usize].copy_from_slice(&a.val); + result[32usize..64usize].copy_from_slice(&b.val); + result.simd_into(self) + } + #[inline(always)] + fn split_u8x32(self, a: u8x32) -> (u8x16, u8x16) { + let mut b0 = [0; 16usize]; + let mut b1 = [0; 16usize]; + b0.copy_from_slice(&a.val[0..16usize]); + b1.copy_from_slice(&a.val[16usize..32usize]); + (b0.simd_into(self), b1.simd_into(self)) + } + #[inline(always)] + 
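// widen each u8 lane to a u16 lane, producing a 32-lane u16 vector +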
fn widen_u8x32(self, a: u8x32) -> u16x32 { + let (a0, a1) = self.split_u8x32(a); + self.combine_u16x16(self.widen_u8x16(a0), self.widen_u8x16(a1)) + } + #[inline(always)] + fn reinterpret_u32_u8x32(self, a: u8x32) -> u32x8 { + let (a0, a1) = self.split_u8x32(a); + self.combine_u32x4( + self.reinterpret_u32_u8x16(a0), + self.reinterpret_u32_u8x16(a1), + ) + } + #[inline(always)] + fn splat_mask8x32(self, a: i8) -> mask8x32 { + let half = self.splat_mask8x16(a); + self.combine_mask8x16(half, half) + } + #[inline(always)] + fn not_mask8x32(self, a: mask8x32) -> mask8x32 { + let (a0, a1) = self.split_mask8x32(a); + self.combine_mask8x16(self.not_mask8x16(a0), self.not_mask8x16(a1)) + } + #[inline(always)] + fn and_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { + let (a0, a1) = self.split_mask8x32(a); + let (b0, b1) = self.split_mask8x32(b); + self.combine_mask8x16(self.and_mask8x16(a0, b0), self.and_mask8x16(a1, b1)) + } + #[inline(always)] + fn or_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { + let (a0, a1) = self.split_mask8x32(a); + let (b0, b1) = self.split_mask8x32(b); + self.combine_mask8x16(self.or_mask8x16(a0, b0), self.or_mask8x16(a1, b1)) + } + #[inline(always)] + fn xor_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { + let (a0, a1) = self.split_mask8x32(a); + let (b0, b1) = self.split_mask8x32(b); + self.combine_mask8x16(self.xor_mask8x16(a0, b0), self.xor_mask8x16(a1, b1)) + } + #[inline(always)] + fn select_mask8x32( + self, + a: mask8x32, + b: mask8x32, + c: mask8x32, + ) -> mask8x32 { + let (a0, a1) = self.split_mask8x32(a); + let (b0, b1) = self.split_mask8x32(b); + let (c0, c1) = self.split_mask8x32(c); + self.combine_mask8x16( + self.select_mask8x16(a0, b0, c0), + self.select_mask8x16(a1, b1, c1), + ) + } + #[inline(always)] + fn simd_eq_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { + let (a0, a1) = self.split_mask8x32(a); + let (b0, b1) = self.split_mask8x32(b); + self.combine_mask8x16(self.simd_eq_mask8x16(a0, b0), self.simd_eq_mask8x16(a1, b1)) + } + #[inline(always)] + fn combine_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x64 { + let mut result = [0; 64usize]; + result[0..32usize].copy_from_slice(&a.val); + result[32usize..64usize].copy_from_slice(&b.val); + result.simd_into(self) + } + #[inline(always)] + fn split_mask8x32(self, a: mask8x32) -> (mask8x16, mask8x16) { + let mut b0 = [0; 16usize]; + let mut b1 = [0; 16usize]; + b0.copy_from_slice(&a.val[0..16usize]); + b1.copy_from_slice(&a.val[16usize..32usize]); + (b0.simd_into(self), b1.simd_into(self)) + } + #[inline(always)] + fn splat_i16x16(self, a: i16) -> i16x16 { + let half = self.splat_i16x8(a); + self.combine_i16x8(half, half) + } + #[inline(always)] + fn not_i16x16(self, a: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + self.combine_i16x8(self.not_i16x8(a0), self.not_i16x8(a1)) + } + #[inline(always)] + fn add_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8(self.add_i16x8(a0, b0), self.add_i16x8(a1, b1)) + } + #[inline(always)] + fn sub_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8(self.sub_i16x8(a0, b0), self.sub_i16x8(a1, b1)) + } + #[inline(always)] + fn mul_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8(self.mul_i16x8(a0, b0), self.mul_i16x8(a1, b1)) + } + 
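// The remaining i16x16 bitwise, shift, and comparison ops likewise delegate to the i16x8 halves. +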
#[inline(always)] + fn and_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8(self.and_i16x8(a0, b0), self.and_i16x8(a1, b1)) + } + #[inline(always)] + fn or_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8(self.or_i16x8(a0, b0), self.or_i16x8(a1, b1)) + } + #[inline(always)] + fn xor_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8(self.xor_i16x8(a0, b0), self.xor_i16x8(a1, b1)) + } + #[inline(always)] + fn shr_i16x16(self, a: i16x16, b: u32) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + self.combine_i16x8(self.shr_i16x8(a0, b), self.shr_i16x8(a1, b)) + } + #[inline(always)] + fn shrv_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8(self.shrv_i16x8(a0, b0), self.shrv_i16x8(a1, b1)) + } + #[inline(always)] + fn shl_i16x16(self, a: i16x16, b: u32) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + self.combine_i16x8(self.shl_i16x8(a0, b), self.shl_i16x8(a1, b)) + } + #[inline(always)] + fn simd_eq_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_mask16x8(self.simd_eq_i16x8(a0, b0), self.simd_eq_i16x8(a1, b1)) + } + #[inline(always)] + fn simd_lt_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_mask16x8(self.simd_lt_i16x8(a0, b0), self.simd_lt_i16x8(a1, b1)) + } + #[inline(always)] + fn simd_le_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_mask16x8(self.simd_le_i16x8(a0, b0), self.simd_le_i16x8(a1, b1)) + } + #[inline(always)] + fn simd_ge_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_mask16x8(self.simd_ge_i16x8(a0, b0), self.simd_ge_i16x8(a1, b1)) + } + #[inline(always)] + fn simd_gt_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_mask16x8(self.simd_gt_i16x8(a0, b0), self.simd_gt_i16x8(a1, b1)) + } + #[inline(always)] + fn zip_low_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, _) = self.split_i16x16(a); + let (b0, _) = self.split_i16x16(b); + self.combine_i16x8(self.zip_low_i16x8(a0, b0), self.zip_high_i16x8(a0, b0)) + } + #[inline(always)] + fn zip_high_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (_, a1) = self.split_i16x16(a); + let (_, b1) = self.split_i16x16(b); + self.combine_i16x8(self.zip_low_i16x8(a1, b1), self.zip_high_i16x8(a1, b1)) + } + #[inline(always)] + fn unzip_low_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8(self.unzip_low_i16x8(a0, a1), self.unzip_low_i16x8(b0, b1)) + } + #[inline(always)] + fn unzip_high_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8(self.unzip_high_i16x8(a0, a1), self.unzip_high_i16x8(b0, b1)) + } + #[inline(always)] + fn select_i16x16(self, a: mask16x16, b: i16x16, c: i16x16) -> 
i16x16 { + let (a0, a1) = self.split_mask16x16(a); + let (b0, b1) = self.split_i16x16(b); + let (c0, c1) = self.split_i16x16(c); + self.combine_i16x8(self.select_i16x8(a0, b0, c0), self.select_i16x8(a1, b1, c1)) + } + #[inline(always)] + fn min_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8(self.min_i16x8(a0, b0), self.min_i16x8(a1, b1)) + } + #[inline(always)] + fn max_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8(self.max_i16x8(a0, b0), self.max_i16x8(a1, b1)) + } + #[inline(always)] + fn combine_i16x16(self, a: i16x16, b: i16x16) -> i16x32 { + let mut result = [0; 32usize]; + result[0..16usize].copy_from_slice(&a.val); + result[16usize..32usize].copy_from_slice(&b.val); + result.simd_into(self) + } + #[inline(always)] + fn split_i16x16(self, a: i16x16) -> (i16x8, i16x8) { + let mut b0 = [0; 8usize]; + let mut b1 = [0; 8usize]; + b0.copy_from_slice(&a.val[0..8usize]); + b1.copy_from_slice(&a.val[8usize..16usize]); + (b0.simd_into(self), b1.simd_into(self)) + } + #[inline(always)] + fn neg_i16x16(self, a: i16x16) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + self.combine_i16x8(self.neg_i16x8(a0), self.neg_i16x8(a1)) + } + #[inline(always)] + fn reinterpret_u8_i16x16(self, a: i16x16) -> u8x32 { + let (a0, a1) = self.split_i16x16(a); + self.combine_u8x16(self.reinterpret_u8_i16x8(a0), self.reinterpret_u8_i16x8(a1)) + } + #[inline(always)] + fn reinterpret_u32_i16x16(self, a: i16x16) -> u32x8 { + let (a0, a1) = self.split_i16x16(a); + self.combine_u32x4( + self.reinterpret_u32_i16x8(a0), + self.reinterpret_u32_i16x8(a1), + ) + } + #[inline(always)] + fn splat_u16x16(self, a: u16) -> u16x16 { + let half = self.splat_u16x8(a); + self.combine_u16x8(half, half) + } + #[inline(always)] + fn not_u16x16(self, a: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + self.combine_u16x8(self.not_u16x8(a0), self.not_u16x8(a1)) + } + #[inline(always)] + fn add_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8(self.add_u16x8(a0, b0), self.add_u16x8(a1, b1)) + } + #[inline(always)] + fn sub_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8(self.sub_u16x8(a0, b0), self.sub_u16x8(a1, b1)) + } + #[inline(always)] + fn mul_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8(self.mul_u16x8(a0, b0), self.mul_u16x8(a1, b1)) + } + #[inline(always)] + fn and_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8(self.and_u16x8(a0, b0), self.and_u16x8(a1, b1)) + } + #[inline(always)] + fn or_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8(self.or_u16x8(a0, b0), self.or_u16x8(a1, b1)) + } + #[inline(always)] + fn xor_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8(self.xor_u16x8(a0, b0), self.xor_u16x8(a1, b1)) + } + #[inline(always)] + fn shr_u16x16(self, a: u16x16, b: u32) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + 
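// shift both 8-lane halves by the same scalar amount +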
self.combine_u16x8(self.shr_u16x8(a0, b), self.shr_u16x8(a1, b)) + } + #[inline(always)] + fn shrv_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8(self.shrv_u16x8(a0, b0), self.shrv_u16x8(a1, b1)) + } + #[inline(always)] + fn shl_u16x16(self, a: u16x16, b: u32) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + self.combine_u16x8(self.shl_u16x8(a0, b), self.shl_u16x8(a1, b)) + } + #[inline(always)] + fn simd_eq_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_mask16x8(self.simd_eq_u16x8(a0, b0), self.simd_eq_u16x8(a1, b1)) + } + #[inline(always)] + fn simd_lt_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_mask16x8(self.simd_lt_u16x8(a0, b0), self.simd_lt_u16x8(a1, b1)) + } + #[inline(always)] + fn simd_le_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_mask16x8(self.simd_le_u16x8(a0, b0), self.simd_le_u16x8(a1, b1)) + } + #[inline(always)] + fn simd_ge_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_mask16x8(self.simd_ge_u16x8(a0, b0), self.simd_ge_u16x8(a1, b1)) + } + #[inline(always)] + fn simd_gt_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_mask16x8(self.simd_gt_u16x8(a0, b0), self.simd_gt_u16x8(a1, b1)) + } + #[inline(always)] + fn zip_low_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, _) = self.split_u16x16(a); + let (b0, _) = self.split_u16x16(b); + self.combine_u16x8(self.zip_low_u16x8(a0, b0), self.zip_high_u16x8(a0, b0)) + } + #[inline(always)] + fn zip_high_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (_, a1) = self.split_u16x16(a); + let (_, b1) = self.split_u16x16(b); + self.combine_u16x8(self.zip_low_u16x8(a1, b1), self.zip_high_u16x8(a1, b1)) + } + #[inline(always)] + fn unzip_low_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8(self.unzip_low_u16x8(a0, a1), self.unzip_low_u16x8(b0, b1)) + } + #[inline(always)] + fn unzip_high_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8(self.unzip_high_u16x8(a0, a1), self.unzip_high_u16x8(b0, b1)) + } + #[inline(always)] + fn select_u16x16(self, a: mask16x16, b: u16x16, c: u16x16) -> u16x16 { + let (a0, a1) = self.split_mask16x16(a); + let (b0, b1) = self.split_u16x16(b); + let (c0, c1) = self.split_u16x16(c); + self.combine_u16x8(self.select_u16x8(a0, b0, c0), self.select_u16x8(a1, b1, c1)) + } + #[inline(always)] + fn min_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8(self.min_u16x8(a0, b0), self.min_u16x8(a1, b1)) + } + #[inline(always)] + fn max_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8(self.max_u16x8(a0, b0), self.max_u16x8(a1, b1)) + } + #[inline(always)] + fn combine_u16x16(self, a: u16x16, b: u16x16) -> u16x32 { + let mut result = [0; 32usize]; + 
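// a fills the low 16 lanes, b fills the high 16 lanes +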
result[0..16usize].copy_from_slice(&a.val); + result[16usize..32usize].copy_from_slice(&b.val); + result.simd_into(self) + } + #[inline(always)] + fn split_u16x16(self, a: u16x16) -> (u16x8, u16x8) { + let mut b0 = [0; 8usize]; + let mut b1 = [0; 8usize]; + b0.copy_from_slice(&a.val[0..8usize]); + b1.copy_from_slice(&a.val[8usize..16usize]); + (b0.simd_into(self), b1.simd_into(self)) + } + #[inline(always)] + fn narrow_u16x16(self, a: u16x16) -> u8x16 { + [ + a[0usize] as u8, + a[1usize] as u8, + a[2usize] as u8, + a[3usize] as u8, + a[4usize] as u8, + a[5usize] as u8, + a[6usize] as u8, + a[7usize] as u8, + a[8usize] as u8, + a[9usize] as u8, + a[10usize] as u8, + a[11usize] as u8, + a[12usize] as u8, + a[13usize] as u8, + a[14usize] as u8, + a[15usize] as u8, + ] + .simd_into(self) + } + #[inline(always)] + fn reinterpret_u8_u16x16(self, a: u16x16) -> u8x32 { + let (a0, a1) = self.split_u16x16(a); + self.combine_u8x16(self.reinterpret_u8_u16x8(a0), self.reinterpret_u8_u16x8(a1)) + } + #[inline(always)] + fn reinterpret_u32_u16x16(self, a: u16x16) -> u32x8 { + let (a0, a1) = self.split_u16x16(a); + self.combine_u32x4( + self.reinterpret_u32_u16x8(a0), + self.reinterpret_u32_u16x8(a1), + ) + } + #[inline(always)] + fn splat_mask16x16(self, a: i16) -> mask16x16 { + let half = self.splat_mask16x8(a); + self.combine_mask16x8(half, half) + } + #[inline(always)] + fn not_mask16x16(self, a: mask16x16) -> mask16x16 { + let (a0, a1) = self.split_mask16x16(a); + self.combine_mask16x8(self.not_mask16x8(a0), self.not_mask16x8(a1)) + } + #[inline(always)] + fn and_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { + let (a0, a1) = self.split_mask16x16(a); + let (b0, b1) = self.split_mask16x16(b); + self.combine_mask16x8(self.and_mask16x8(a0, b0), self.and_mask16x8(a1, b1)) + } + #[inline(always)] + fn or_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { + let (a0, a1) = self.split_mask16x16(a); + let (b0, b1) = self.split_mask16x16(b); + self.combine_mask16x8(self.or_mask16x8(a0, b0), self.or_mask16x8(a1, b1)) + } + #[inline(always)] + fn xor_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { + let (a0, a1) = self.split_mask16x16(a); + let (b0, b1) = self.split_mask16x16(b); + self.combine_mask16x8(self.xor_mask16x8(a0, b0), self.xor_mask16x8(a1, b1)) + } + #[inline(always)] + fn select_mask16x16( + self, + a: mask16x16, + b: mask16x16, + c: mask16x16, + ) -> mask16x16 { + let (a0, a1) = self.split_mask16x16(a); + let (b0, b1) = self.split_mask16x16(b); + let (c0, c1) = self.split_mask16x16(c); + self.combine_mask16x8( + self.select_mask16x8(a0, b0, c0), + self.select_mask16x8(a1, b1, c1), + ) + } + #[inline(always)] + fn simd_eq_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { + let (a0, a1) = self.split_mask16x16(a); + let (b0, b1) = self.split_mask16x16(b); + self.combine_mask16x8(self.simd_eq_mask16x8(a0, b0), self.simd_eq_mask16x8(a1, b1)) + } + #[inline(always)] + fn combine_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x32 { + let mut result = [0; 32usize]; + result[0..16usize].copy_from_slice(&a.val); + result[16usize..32usize].copy_from_slice(&b.val); + result.simd_into(self) + } + #[inline(always)] + fn split_mask16x16(self, a: mask16x16) -> (mask16x8, mask16x8) { + let mut b0 = [0; 8usize]; + let mut b1 = [0; 8usize]; + b0.copy_from_slice(&a.val[0..8usize]); + b1.copy_from_slice(&a.val[8usize..16usize]); + (b0.simd_into(self), b1.simd_into(self)) + } + #[inline(always)] + fn splat_i32x8(self, a: i32) -> i32x8 { + let half = self.splat_i32x4(a); + 
self.combine_i32x4(half, half) + } + #[inline(always)] + fn not_i32x8(self, a: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + self.combine_i32x4(self.not_i32x4(a0), self.not_i32x4(a1)) + } + #[inline(always)] + fn add_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.add_i32x4(a0, b0), self.add_i32x4(a1, b1)) + } + #[inline(always)] + fn sub_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.sub_i32x4(a0, b0), self.sub_i32x4(a1, b1)) + } + #[inline(always)] + fn mul_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.mul_i32x4(a0, b0), self.mul_i32x4(a1, b1)) + } + #[inline(always)] + fn and_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.and_i32x4(a0, b0), self.and_i32x4(a1, b1)) + } + #[inline(always)] + fn or_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.or_i32x4(a0, b0), self.or_i32x4(a1, b1)) + } + #[inline(always)] + fn xor_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.xor_i32x4(a0, b0), self.xor_i32x4(a1, b1)) + } + #[inline(always)] + fn shr_i32x8(self, a: i32x8, b: u32) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + self.combine_i32x4(self.shr_i32x4(a0, b), self.shr_i32x4(a1, b)) + } + #[inline(always)] + fn shrv_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.shrv_i32x4(a0, b0), self.shrv_i32x4(a1, b1)) + } + #[inline(always)] + fn shl_i32x8(self, a: i32x8, b: u32) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + self.combine_i32x4(self.shl_i32x4(a0, b), self.shl_i32x4(a1, b)) + } + #[inline(always)] + fn simd_eq_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_mask32x4(self.simd_eq_i32x4(a0, b0), self.simd_eq_i32x4(a1, b1)) + } + #[inline(always)] + fn simd_lt_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_mask32x4(self.simd_lt_i32x4(a0, b0), self.simd_lt_i32x4(a1, b1)) + } + #[inline(always)] + fn simd_le_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_mask32x4(self.simd_le_i32x4(a0, b0), self.simd_le_i32x4(a1, b1)) + } + #[inline(always)] + fn simd_ge_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_mask32x4(self.simd_ge_i32x4(a0, b0), self.simd_ge_i32x4(a1, b1)) + } + #[inline(always)] + fn simd_gt_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_mask32x4(self.simd_gt_i32x4(a0, b0), self.simd_gt_i32x4(a1, b1)) + } + #[inline(always)] + fn zip_low_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, _) = self.split_i32x8(a); + let (b0, _) = self.split_i32x8(b); + self.combine_i32x4(self.zip_low_i32x4(a0, b0), self.zip_high_i32x4(a0, b0)) + } + #[inline(always)] + 
fn zip_high_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (_, a1) = self.split_i32x8(a); + let (_, b1) = self.split_i32x8(b); + self.combine_i32x4(self.zip_low_i32x4(a1, b1), self.zip_high_i32x4(a1, b1)) + } + #[inline(always)] + fn unzip_low_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.unzip_low_i32x4(a0, a1), self.unzip_low_i32x4(b0, b1)) + } + #[inline(always)] + fn unzip_high_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.unzip_high_i32x4(a0, a1), self.unzip_high_i32x4(b0, b1)) + } + #[inline(always)] + fn select_i32x8(self, a: mask32x8, b: i32x8, c: i32x8) -> i32x8 { + let (a0, a1) = self.split_mask32x8(a); + let (b0, b1) = self.split_i32x8(b); + let (c0, c1) = self.split_i32x8(c); + self.combine_i32x4(self.select_i32x4(a0, b0, c0), self.select_i32x4(a1, b1, c1)) + } + #[inline(always)] + fn min_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.min_i32x4(a0, b0), self.min_i32x4(a1, b1)) + } + #[inline(always)] + fn max_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4(self.max_i32x4(a0, b0), self.max_i32x4(a1, b1)) + } + #[inline(always)] + fn combine_i32x8(self, a: i32x8, b: i32x8) -> i32x16 { + let mut result = [0; 16usize]; + result[0..8usize].copy_from_slice(&a.val); + result[8usize..16usize].copy_from_slice(&b.val); + result.simd_into(self) + } + #[inline(always)] + fn split_i32x8(self, a: i32x8) -> (i32x4, i32x4) { + let mut b0 = [0; 4usize]; + let mut b1 = [0; 4usize]; + b0.copy_from_slice(&a.val[0..4usize]); + b1.copy_from_slice(&a.val[4usize..8usize]); + (b0.simd_into(self), b1.simd_into(self)) + } + #[inline(always)] + fn neg_i32x8(self, a: i32x8) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + self.combine_i32x4(self.neg_i32x4(a0), self.neg_i32x4(a1)) + } + #[inline(always)] + fn reinterpret_u8_i32x8(self, a: i32x8) -> u8x32 { + let (a0, a1) = self.split_i32x8(a); + self.combine_u8x16(self.reinterpret_u8_i32x4(a0), self.reinterpret_u8_i32x4(a1)) + } + #[inline(always)] + fn reinterpret_u32_i32x8(self, a: i32x8) -> u32x8 { + let (a0, a1) = self.split_i32x8(a); + self.combine_u32x4( + self.reinterpret_u32_i32x4(a0), + self.reinterpret_u32_i32x4(a1), + ) + } + #[inline(always)] + fn cvt_f32_i32x8(self, a: i32x8) -> f32x8 { + let (a0, a1) = self.split_i32x8(a); + self.combine_f32x4(self.cvt_f32_i32x4(a0), self.cvt_f32_i32x4(a1)) + } + #[inline(always)] + fn splat_u32x8(self, a: u32) -> u32x8 { + let half = self.splat_u32x4(a); + self.combine_u32x4(half, half) + } + #[inline(always)] + fn not_u32x8(self, a: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + self.combine_u32x4(self.not_u32x4(a0), self.not_u32x4(a1)) + } + #[inline(always)] + fn add_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4(self.add_u32x4(a0, b0), self.add_u32x4(a1, b1)) + } + #[inline(always)] + fn sub_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4(self.sub_u32x4(a0, b0), self.sub_u32x4(a1, b1)) + } + #[inline(always)] + fn mul_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = 
self.split_u32x8(b); + self.combine_u32x4(self.mul_u32x4(a0, b0), self.mul_u32x4(a1, b1)) + } + #[inline(always)] + fn and_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4(self.and_u32x4(a0, b0), self.and_u32x4(a1, b1)) + } + #[inline(always)] + fn or_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4(self.or_u32x4(a0, b0), self.or_u32x4(a1, b1)) + } + #[inline(always)] + fn xor_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4(self.xor_u32x4(a0, b0), self.xor_u32x4(a1, b1)) + } + #[inline(always)] + fn shr_u32x8(self, a: u32x8, b: u32) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + self.combine_u32x4(self.shr_u32x4(a0, b), self.shr_u32x4(a1, b)) + } + #[inline(always)] + fn shrv_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4(self.shrv_u32x4(a0, b0), self.shrv_u32x4(a1, b1)) + } + #[inline(always)] + fn shl_u32x8(self, a: u32x8, b: u32) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + self.combine_u32x4(self.shl_u32x4(a0, b), self.shl_u32x4(a1, b)) + } + #[inline(always)] + fn simd_eq_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_mask32x4(self.simd_eq_u32x4(a0, b0), self.simd_eq_u32x4(a1, b1)) + } + #[inline(always)] + fn simd_lt_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_mask32x4(self.simd_lt_u32x4(a0, b0), self.simd_lt_u32x4(a1, b1)) + } + #[inline(always)] + fn simd_le_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_mask32x4(self.simd_le_u32x4(a0, b0), self.simd_le_u32x4(a1, b1)) + } + #[inline(always)] + fn simd_ge_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_mask32x4(self.simd_ge_u32x4(a0, b0), self.simd_ge_u32x4(a1, b1)) + } + #[inline(always)] + fn simd_gt_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_mask32x4(self.simd_gt_u32x4(a0, b0), self.simd_gt_u32x4(a1, b1)) + } + #[inline(always)] + fn zip_low_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, _) = self.split_u32x8(a); + let (b0, _) = self.split_u32x8(b); + self.combine_u32x4(self.zip_low_u32x4(a0, b0), self.zip_high_u32x4(a0, b0)) + } + #[inline(always)] + fn zip_high_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (_, a1) = self.split_u32x8(a); + let (_, b1) = self.split_u32x8(b); + self.combine_u32x4(self.zip_low_u32x4(a1, b1), self.zip_high_u32x4(a1, b1)) + } + #[inline(always)] + fn unzip_low_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4(self.unzip_low_u32x4(a0, a1), self.unzip_low_u32x4(b0, b1)) + } + #[inline(always)] + fn unzip_high_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4(self.unzip_high_u32x4(a0, a1), self.unzip_high_u32x4(b0, b1)) + } + #[inline(always)] + fn select_u32x8(self, a: mask32x8, b: u32x8, c: u32x8) 
-> u32x8 { + let (a0, a1) = self.split_mask32x8(a); + let (b0, b1) = self.split_u32x8(b); + let (c0, c1) = self.split_u32x8(c); + self.combine_u32x4(self.select_u32x4(a0, b0, c0), self.select_u32x4(a1, b1, c1)) + } + #[inline(always)] + fn min_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4(self.min_u32x4(a0, b0), self.min_u32x4(a1, b1)) + } + #[inline(always)] + fn max_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4(self.max_u32x4(a0, b0), self.max_u32x4(a1, b1)) + } + #[inline(always)] + fn combine_u32x8(self, a: u32x8, b: u32x8) -> u32x16 { + let mut result = [0; 16usize]; + result[0..8usize].copy_from_slice(&a.val); + result[8usize..16usize].copy_from_slice(&b.val); + result.simd_into(self) + } + #[inline(always)] + fn split_u32x8(self, a: u32x8) -> (u32x4, u32x4) { + let mut b0 = [0; 4usize]; + let mut b1 = [0; 4usize]; + b0.copy_from_slice(&a.val[0..4usize]); + b1.copy_from_slice(&a.val[4usize..8usize]); + (b0.simd_into(self), b1.simd_into(self)) + } + #[inline(always)] + fn reinterpret_u8_u32x8(self, a: u32x8) -> u8x32 { + let (a0, a1) = self.split_u32x8(a); + self.combine_u8x16(self.reinterpret_u8_u32x4(a0), self.reinterpret_u8_u32x4(a1)) + } + #[inline(always)] + fn cvt_f32_u32x8(self, a: u32x8) -> f32x8 { + let (a0, a1) = self.split_u32x8(a); + self.combine_f32x4(self.cvt_f32_u32x4(a0), self.cvt_f32_u32x4(a1)) + } + #[inline(always)] + fn splat_mask32x8(self, a: i32) -> mask32x8 { + let half = self.splat_mask32x4(a); + self.combine_mask32x4(half, half) + } + #[inline(always)] + fn not_mask32x8(self, a: mask32x8) -> mask32x8 { + let (a0, a1) = self.split_mask32x8(a); + self.combine_mask32x4(self.not_mask32x4(a0), self.not_mask32x4(a1)) + } + #[inline(always)] + fn and_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { + let (a0, a1) = self.split_mask32x8(a); + let (b0, b1) = self.split_mask32x8(b); + self.combine_mask32x4(self.and_mask32x4(a0, b0), self.and_mask32x4(a1, b1)) + } + #[inline(always)] + fn or_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { + let (a0, a1) = self.split_mask32x8(a); + let (b0, b1) = self.split_mask32x8(b); + self.combine_mask32x4(self.or_mask32x4(a0, b0), self.or_mask32x4(a1, b1)) + } + #[inline(always)] + fn xor_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { + let (a0, a1) = self.split_mask32x8(a); + let (b0, b1) = self.split_mask32x8(b); + self.combine_mask32x4(self.xor_mask32x4(a0, b0), self.xor_mask32x4(a1, b1)) + } + #[inline(always)] + fn select_mask32x8( + self, + a: mask32x8, + b: mask32x8, + c: mask32x8, + ) -> mask32x8 { + let (a0, a1) = self.split_mask32x8(a); + let (b0, b1) = self.split_mask32x8(b); + let (c0, c1) = self.split_mask32x8(c); + self.combine_mask32x4( + self.select_mask32x4(a0, b0, c0), + self.select_mask32x4(a1, b1, c1), + ) + } + #[inline(always)] + fn simd_eq_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { + let (a0, a1) = self.split_mask32x8(a); + let (b0, b1) = self.split_mask32x8(b); + self.combine_mask32x4(self.simd_eq_mask32x4(a0, b0), self.simd_eq_mask32x4(a1, b1)) + } + #[inline(always)] + fn combine_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x16 { + let mut result = [0; 16usize]; + result[0..8usize].copy_from_slice(&a.val); + result[8usize..16usize].copy_from_slice(&b.val); + result.simd_into(self) + } + #[inline(always)] + fn split_mask32x8(self, a: mask32x8) -> (mask32x4, mask32x4) { + let mut b0 = [0; 
4usize]; + let mut b1 = [0; 4usize]; + b0.copy_from_slice(&a.val[0..4usize]); + b1.copy_from_slice(&a.val[4usize..8usize]); + (b0.simd_into(self), b1.simd_into(self)) + } + #[inline(always)] + fn splat_f64x4(self, a: f64) -> f64x4 { + let half = self.splat_f64x2(a); + self.combine_f64x2(half, half) + } + #[inline(always)] + fn abs_f64x4(self, a: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + self.combine_f64x2(self.abs_f64x2(a0), self.abs_f64x2(a1)) + } + #[inline(always)] + fn neg_f64x4(self, a: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + self.combine_f64x2(self.neg_f64x2(a0), self.neg_f64x2(a1)) + } + #[inline(always)] + fn sqrt_f64x4(self, a: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + self.combine_f64x2(self.sqrt_f64x2(a0), self.sqrt_f64x2(a1)) + } + #[inline(always)] + fn add_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2(self.add_f64x2(a0, b0), self.add_f64x2(a1, b1)) + } + #[inline(always)] + fn sub_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2(self.sub_f64x2(a0, b0), self.sub_f64x2(a1, b1)) + } + #[inline(always)] + fn mul_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2(self.mul_f64x2(a0, b0), self.mul_f64x2(a1, b1)) + } + #[inline(always)] + fn div_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2(self.div_f64x2(a0, b0), self.div_f64x2(a1, b1)) + } + #[inline(always)] + fn copysign_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2(self.copysign_f64x2(a0, b0), self.copysign_f64x2(a1, b1)) + } + #[inline(always)] + fn simd_eq_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_mask64x2(self.simd_eq_f64x2(a0, b0), self.simd_eq_f64x2(a1, b1)) + } + #[inline(always)] + fn simd_lt_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_mask64x2(self.simd_lt_f64x2(a0, b0), self.simd_lt_f64x2(a1, b1)) + } + #[inline(always)] + fn simd_le_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_mask64x2(self.simd_le_f64x2(a0, b0), self.simd_le_f64x2(a1, b1)) + } + #[inline(always)] + fn simd_ge_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_mask64x2(self.simd_ge_f64x2(a0, b0), self.simd_ge_f64x2(a1, b1)) + } + #[inline(always)] + fn simd_gt_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_mask64x2(self.simd_gt_f64x2(a0, b0), self.simd_gt_f64x2(a1, b1)) + } + #[inline(always)] + fn zip_low_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, _) = self.split_f64x4(a); + let (b0, _) = self.split_f64x4(b); + self.combine_f64x2(self.zip_low_f64x2(a0, b0), self.zip_high_f64x2(a0, b0)) + } + #[inline(always)] + fn zip_high_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (_, a1) = self.split_f64x4(a); + let (_, b1) = self.split_f64x4(b); + self.combine_f64x2(self.zip_low_f64x2(a1, b1), 
self.zip_high_f64x2(a1, b1)) + } + #[inline(always)] + fn unzip_low_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2(self.unzip_low_f64x2(a0, a1), self.unzip_low_f64x2(b0, b1)) + } + #[inline(always)] + fn unzip_high_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2(self.unzip_high_f64x2(a0, a1), self.unzip_high_f64x2(b0, b1)) + } + #[inline(always)] + fn max_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2(self.max_f64x2(a0, b0), self.max_f64x2(a1, b1)) + } + #[inline(always)] + fn max_precise_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2( + self.max_precise_f64x2(a0, b0), + self.max_precise_f64x2(a1, b1), + ) + } + #[inline(always)] + fn min_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2(self.min_f64x2(a0, b0), self.min_f64x2(a1, b1)) + } + #[inline(always)] + fn min_precise_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2( + self.min_precise_f64x2(a0, b0), + self.min_precise_f64x2(a1, b1), + ) + } + #[inline(always)] + fn madd_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + let (c0, c1) = self.split_f64x4(c); + self.combine_f64x2(self.madd_f64x2(a0, b0, c0), self.madd_f64x2(a1, b1, c1)) + } + #[inline(always)] + fn msub_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + let (c0, c1) = self.split_f64x4(c); + self.combine_f64x2(self.msub_f64x2(a0, b0, c0), self.msub_f64x2(a1, b1, c1)) + } + #[inline(always)] + fn floor_f64x4(self, a: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + self.combine_f64x2(self.floor_f64x2(a0), self.floor_f64x2(a1)) + } + #[inline(always)] + fn fract_f64x4(self, a: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + self.combine_f64x2(self.fract_f64x2(a0), self.fract_f64x2(a1)) + } + #[inline(always)] + fn trunc_f64x4(self, a: f64x4) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + self.combine_f64x2(self.trunc_f64x2(a0), self.trunc_f64x2(a1)) + } + #[inline(always)] + fn select_f64x4(self, a: mask64x4, b: f64x4, c: f64x4) -> f64x4 { + let (a0, a1) = self.split_mask64x4(a); + let (b0, b1) = self.split_f64x4(b); + let (c0, c1) = self.split_f64x4(c); + self.combine_f64x2(self.select_f64x2(a0, b0, c0), self.select_f64x2(a1, b1, c1)) + } + #[inline(always)] + fn combine_f64x4(self, a: f64x4, b: f64x4) -> f64x8 { + let mut result = [0.0; 8usize]; + result[0..4usize].copy_from_slice(&a.val); + result[4usize..8usize].copy_from_slice(&b.val); + result.simd_into(self) + } + #[inline(always)] + fn split_f64x4(self, a: f64x4) -> (f64x2, f64x2) { + let mut b0 = [0.0; 2usize]; + let mut b1 = [0.0; 2usize]; + b0.copy_from_slice(&a.val[0..2usize]); + b1.copy_from_slice(&a.val[2usize..4usize]); + (b0.simd_into(self), b1.simd_into(self)) + } + #[inline(always)] + fn reinterpret_f32_f64x4(self, a: f64x4) -> f32x8 { + let (a0, a1) = self.split_f64x4(a); + self.combine_f32x4( + self.reinterpret_f32_f64x2(a0), + self.reinterpret_f32_f64x2(a1), + ) + } + 
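+ // The 256-bit f64x4 operations above all follow the same width-halving recursion used
+ // throughout this scalar backend: split the input into two f64x2 halves, apply the
+ // 128-bit implementation to each half, and recombine. A minimal usage sketch of this
+ // level (illustrative only; it assumes the splat/add methods defined above and uses
+ // made-up values):
+ //
+ //     let s = Scalar::new();
+ //     let a = s.splat_f64x4(1.5);
+ //     let b = s.splat_f64x4(2.0);
+ //     let sum = s.add_f64x4(a, b); // each lane is 3.5, computed as two f64x2 halves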
#[inline(always)] + fn splat_mask64x4(self, a: i64) -> mask64x4 { + let half = self.splat_mask64x2(a); + self.combine_mask64x2(half, half) + } + #[inline(always)] + fn not_mask64x4(self, a: mask64x4) -> mask64x4 { + let (a0, a1) = self.split_mask64x4(a); + self.combine_mask64x2(self.not_mask64x2(a0), self.not_mask64x2(a1)) + } + #[inline(always)] + fn and_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { + let (a0, a1) = self.split_mask64x4(a); + let (b0, b1) = self.split_mask64x4(b); + self.combine_mask64x2(self.and_mask64x2(a0, b0), self.and_mask64x2(a1, b1)) + } + #[inline(always)] + fn or_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { + let (a0, a1) = self.split_mask64x4(a); + let (b0, b1) = self.split_mask64x4(b); + self.combine_mask64x2(self.or_mask64x2(a0, b0), self.or_mask64x2(a1, b1)) + } + #[inline(always)] + fn xor_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { + let (a0, a1) = self.split_mask64x4(a); + let (b0, b1) = self.split_mask64x4(b); + self.combine_mask64x2(self.xor_mask64x2(a0, b0), self.xor_mask64x2(a1, b1)) + } + #[inline(always)] + fn select_mask64x4( + self, + a: mask64x4, + b: mask64x4, + c: mask64x4, + ) -> mask64x4 { + let (a0, a1) = self.split_mask64x4(a); + let (b0, b1) = self.split_mask64x4(b); + let (c0, c1) = self.split_mask64x4(c); + self.combine_mask64x2( + self.select_mask64x2(a0, b0, c0), + self.select_mask64x2(a1, b1, c1), + ) + } + #[inline(always)] + fn simd_eq_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { + let (a0, a1) = self.split_mask64x4(a); + let (b0, b1) = self.split_mask64x4(b); + self.combine_mask64x2(self.simd_eq_mask64x2(a0, b0), self.simd_eq_mask64x2(a1, b1)) + } + #[inline(always)] + fn combine_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x8 { + let mut result = [0; 8usize]; + result[0..4usize].copy_from_slice(&a.val); + result[4usize..8usize].copy_from_slice(&b.val); + result.simd_into(self) + } + #[inline(always)] + fn split_mask64x4(self, a: mask64x4) -> (mask64x2, mask64x2) { + let mut b0 = [0; 2usize]; + let mut b1 = [0; 2usize]; + b0.copy_from_slice(&a.val[0..2usize]); + b1.copy_from_slice(&a.val[2usize..4usize]); + (b0.simd_into(self), b1.simd_into(self)) + } + #[inline(always)] + fn splat_f32x16(self, a: f32) -> f32x16 { + let half = self.splat_f32x8(a); + self.combine_f32x8(half, half) + } + #[inline(always)] + fn abs_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8(self.abs_f32x8(a0), self.abs_f32x8(a1)) + } + #[inline(always)] + fn neg_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8(self.neg_f32x8(a0), self.neg_f32x8(a1)) + } + #[inline(always)] + fn sqrt_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1)) + } + #[inline(always)] + fn add_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8(self.add_f32x8(a0, b0), self.add_f32x8(a1, b1)) + } + #[inline(always)] + fn sub_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8(self.sub_f32x8(a0, b0), self.sub_f32x8(a1, b1)) + } + #[inline(always)] + fn mul_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8(self.mul_f32x8(a0, b0), self.mul_f32x8(a1, b1)) + } + #[inline(always)] + fn 
div_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8(self.div_f32x8(a0, b0), self.div_f32x8(a1, b1)) + } + #[inline(always)] + fn copysign_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8(self.copysign_f32x8(a0, b0), self.copysign_f32x8(a1, b1)) + } + #[inline(always)] + fn simd_eq_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_mask32x8(self.simd_eq_f32x8(a0, b0), self.simd_eq_f32x8(a1, b1)) + } + #[inline(always)] + fn simd_lt_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_mask32x8(self.simd_lt_f32x8(a0, b0), self.simd_lt_f32x8(a1, b1)) + } + #[inline(always)] + fn simd_le_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_mask32x8(self.simd_le_f32x8(a0, b0), self.simd_le_f32x8(a1, b1)) + } + #[inline(always)] + fn simd_ge_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_mask32x8(self.simd_ge_f32x8(a0, b0), self.simd_ge_f32x8(a1, b1)) + } + #[inline(always)] + fn simd_gt_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_mask32x8(self.simd_gt_f32x8(a0, b0), self.simd_gt_f32x8(a1, b1)) + } + #[inline(always)] + fn zip_low_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, _) = self.split_f32x16(a); + let (b0, _) = self.split_f32x16(b); + self.combine_f32x8(self.zip_low_f32x8(a0, b0), self.zip_high_f32x8(a0, b0)) + } + #[inline(always)] + fn zip_high_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (_, a1) = self.split_f32x16(a); + let (_, b1) = self.split_f32x16(b); + self.combine_f32x8(self.zip_low_f32x8(a1, b1), self.zip_high_f32x8(a1, b1)) + } + #[inline(always)] + fn unzip_low_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8(self.unzip_low_f32x8(a0, a1), self.unzip_low_f32x8(b0, b1)) + } + #[inline(always)] + fn unzip_high_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8(self.unzip_high_f32x8(a0, a1), self.unzip_high_f32x8(b0, b1)) + } + #[inline(always)] + fn max_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8(self.max_f32x8(a0, b0), self.max_f32x8(a1, b1)) + } + #[inline(always)] + fn max_precise_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8( + self.max_precise_f32x8(a0, b0), + self.max_precise_f32x8(a1, b1), + ) + } + #[inline(always)] + fn min_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8(self.min_f32x8(a0, b0), self.min_f32x8(a1, b1)) + } + #[inline(always)] + fn min_precise_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8( + 
self.min_precise_f32x8(a0, b0), + self.min_precise_f32x8(a1, b1), + ) + } + #[inline(always)] + fn madd_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + let (c0, c1) = self.split_f32x16(c); + self.combine_f32x8(self.madd_f32x8(a0, b0, c0), self.madd_f32x8(a1, b1, c1)) + } + #[inline(always)] + fn msub_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + let (c0, c1) = self.split_f32x16(c); + self.combine_f32x8(self.msub_f32x8(a0, b0, c0), self.msub_f32x8(a1, b1, c1)) + } + #[inline(always)] + fn floor_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1)) + } + #[inline(always)] + fn fract_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8(self.fract_f32x8(a0), self.fract_f32x8(a1)) + } + #[inline(always)] + fn trunc_f32x16(self, a: f32x16) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f32x8(self.trunc_f32x8(a0), self.trunc_f32x8(a1)) + } + #[inline(always)] + fn select_f32x16(self, a: mask32x16, b: f32x16, c: f32x16) -> f32x16 { + let (a0, a1) = self.split_mask32x16(a); + let (b0, b1) = self.split_f32x16(b); + let (c0, c1) = self.split_f32x16(c); + self.combine_f32x8(self.select_f32x8(a0, b0, c0), self.select_f32x8(a1, b1, c1)) + } + #[inline(always)] + fn split_f32x16(self, a: f32x16) -> (f32x8, f32x8) { + let mut b0 = [0.0; 8usize]; + let mut b1 = [0.0; 8usize]; + b0.copy_from_slice(&a.val[0..8usize]); + b1.copy_from_slice(&a.val[8usize..16usize]); + (b0.simd_into(self), b1.simd_into(self)) + } + #[inline(always)] + fn reinterpret_f64_f32x16(self, a: f32x16) -> f64x8 { + let (a0, a1) = self.split_f32x16(a); + self.combine_f64x4( + self.reinterpret_f64_f32x8(a0), + self.reinterpret_f64_f32x8(a1), + ) + } + #[inline(always)] + fn reinterpret_i32_f32x16(self, a: f32x16) -> i32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_i32x8( + self.reinterpret_i32_f32x8(a0), + self.reinterpret_i32_f32x8(a1), + ) + } + #[inline(always)] + fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16 { + [ + src[0usize], + src[4usize], + src[8usize], + src[12usize], + src[1usize], + src[5usize], + src[9usize], + src[13usize], + src[2usize], + src[6usize], + src[10usize], + src[14usize], + src[3usize], + src[7usize], + src[11usize], + src[15usize], + ] + .simd_into(self) + } + #[inline(always)] + fn store_interleaved_128_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> () { + *dest = [ + a[0usize], a[4usize], a[8usize], a[12usize], a[1usize], a[5usize], a[9usize], + a[13usize], a[2usize], a[6usize], a[10usize], a[14usize], a[3usize], a[7usize], + a[11usize], a[15usize], + ]; + } + #[inline(always)] + fn reinterpret_u8_f32x16(self, a: f32x16) -> u8x64 { + let (a0, a1) = self.split_f32x16(a); + self.combine_u8x32(self.reinterpret_u8_f32x8(a0), self.reinterpret_u8_f32x8(a1)) + } + #[inline(always)] + fn reinterpret_u32_f32x16(self, a: f32x16) -> u32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_u32x8( + self.reinterpret_u32_f32x8(a0), + self.reinterpret_u32_f32x8(a1), + ) + } + #[inline(always)] + fn cvt_u32_f32x16(self, a: f32x16) -> u32x16 { + let (a0, a1) = self.split_f32x16(a); + self.combine_u32x8(self.cvt_u32_f32x8(a0), self.cvt_u32_f32x8(a1)) + } + #[inline(always)] + fn cvt_i32_f32x16(self, a: f32x16) -> i32x16 { + let (a0, a1) = 
self.split_f32x16(a); + self.combine_i32x8(self.cvt_i32_f32x8(a0), self.cvt_i32_f32x8(a1)) + } + #[inline(always)] + fn splat_i8x64(self, a: i8) -> i8x64 { + let half = self.splat_i8x32(a); + self.combine_i8x32(half, half) + } + #[inline(always)] + fn not_i8x64(self, a: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + self.combine_i8x32(self.not_i8x32(a0), self.not_i8x32(a1)) + } + #[inline(always)] + fn add_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.add_i8x32(a0, b0), self.add_i8x32(a1, b1)) + } + #[inline(always)] + fn sub_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.sub_i8x32(a0, b0), self.sub_i8x32(a1, b1)) + } + #[inline(always)] + fn mul_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.mul_i8x32(a0, b0), self.mul_i8x32(a1, b1)) + } + #[inline(always)] + fn and_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.and_i8x32(a0, b0), self.and_i8x32(a1, b1)) + } + #[inline(always)] + fn or_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.or_i8x32(a0, b0), self.or_i8x32(a1, b1)) + } + #[inline(always)] + fn xor_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.xor_i8x32(a0, b0), self.xor_i8x32(a1, b1)) + } + #[inline(always)] + fn shr_i8x64(self, a: i8x64, b: u32) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + self.combine_i8x32(self.shr_i8x32(a0, b), self.shr_i8x32(a1, b)) + } + #[inline(always)] + fn shrv_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.shrv_i8x32(a0, b0), self.shrv_i8x32(a1, b1)) + } + #[inline(always)] + fn shl_i8x64(self, a: i8x64, b: u32) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + self.combine_i8x32(self.shl_i8x32(a0, b), self.shl_i8x32(a1, b)) + } + #[inline(always)] + fn simd_eq_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_mask8x32(self.simd_eq_i8x32(a0, b0), self.simd_eq_i8x32(a1, b1)) + } + #[inline(always)] + fn simd_lt_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_mask8x32(self.simd_lt_i8x32(a0, b0), self.simd_lt_i8x32(a1, b1)) + } + #[inline(always)] + fn simd_le_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_mask8x32(self.simd_le_i8x32(a0, b0), self.simd_le_i8x32(a1, b1)) + } + #[inline(always)] + fn simd_ge_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_mask8x32(self.simd_ge_i8x32(a0, b0), self.simd_ge_i8x32(a1, b1)) + } + #[inline(always)] + fn simd_gt_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_mask8x32(self.simd_gt_i8x32(a0, b0), self.simd_gt_i8x32(a1, b1)) + } + #[inline(always)] + fn zip_low_i8x64(self, a: i8x64, b: 
i8x64) -> i8x64 { + let (a0, _) = self.split_i8x64(a); + let (b0, _) = self.split_i8x64(b); + self.combine_i8x32(self.zip_low_i8x32(a0, b0), self.zip_high_i8x32(a0, b0)) + } + #[inline(always)] + fn zip_high_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (_, a1) = self.split_i8x64(a); + let (_, b1) = self.split_i8x64(b); + self.combine_i8x32(self.zip_low_i8x32(a1, b1), self.zip_high_i8x32(a1, b1)) + } + #[inline(always)] + fn unzip_low_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.unzip_low_i8x32(a0, a1), self.unzip_low_i8x32(b0, b1)) + } + #[inline(always)] + fn unzip_high_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.unzip_high_i8x32(a0, a1), self.unzip_high_i8x32(b0, b1)) + } + #[inline(always)] + fn select_i8x64(self, a: mask8x64, b: i8x64, c: i8x64) -> i8x64 { + let (a0, a1) = self.split_mask8x64(a); + let (b0, b1) = self.split_i8x64(b); + let (c0, c1) = self.split_i8x64(c); + self.combine_i8x32(self.select_i8x32(a0, b0, c0), self.select_i8x32(a1, b1, c1)) + } + #[inline(always)] + fn min_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.min_i8x32(a0, b0), self.min_i8x32(a1, b1)) + } + #[inline(always)] + fn max_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32(self.max_i8x32(a0, b0), self.max_i8x32(a1, b1)) + } + #[inline(always)] + fn split_i8x64(self, a: i8x64) -> (i8x32, i8x32) { + let mut b0 = [0; 32usize]; + let mut b1 = [0; 32usize]; + b0.copy_from_slice(&a.val[0..32usize]); + b1.copy_from_slice(&a.val[32usize..64usize]); + (b0.simd_into(self), b1.simd_into(self)) + } + #[inline(always)] + fn neg_i8x64(self, a: i8x64) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + self.combine_i8x32(self.neg_i8x32(a0), self.neg_i8x32(a1)) + } + #[inline(always)] + fn reinterpret_u8_i8x64(self, a: i8x64) -> u8x64 { + let (a0, a1) = self.split_i8x64(a); + self.combine_u8x32(self.reinterpret_u8_i8x32(a0), self.reinterpret_u8_i8x32(a1)) + } + #[inline(always)] + fn reinterpret_u32_i8x64(self, a: i8x64) -> u32x16 { + let (a0, a1) = self.split_i8x64(a); + self.combine_u32x8( + self.reinterpret_u32_i8x32(a0), + self.reinterpret_u32_i8x32(a1), + ) + } + #[inline(always)] + fn splat_u8x64(self, a: u8) -> u8x64 { + let half = self.splat_u8x32(a); + self.combine_u8x32(half, half) + } + #[inline(always)] + fn not_u8x64(self, a: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + self.combine_u8x32(self.not_u8x32(a0), self.not_u8x32(a1)) + } + #[inline(always)] + fn add_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.add_u8x32(a0, b0), self.add_u8x32(a1, b1)) + } + #[inline(always)] + fn sub_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.sub_u8x32(a0, b0), self.sub_u8x32(a1, b1)) + } + #[inline(always)] + fn mul_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.mul_u8x32(a0, b0), self.mul_u8x32(a1, b1)) + } + #[inline(always)] + fn and_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = 
self.split_u8x64(b); + self.combine_u8x32(self.and_u8x32(a0, b0), self.and_u8x32(a1, b1)) + } + #[inline(always)] + fn or_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.or_u8x32(a0, b0), self.or_u8x32(a1, b1)) + } + #[inline(always)] + fn xor_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.xor_u8x32(a0, b0), self.xor_u8x32(a1, b1)) + } + #[inline(always)] + fn shr_u8x64(self, a: u8x64, b: u32) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + self.combine_u8x32(self.shr_u8x32(a0, b), self.shr_u8x32(a1, b)) + } + #[inline(always)] + fn shrv_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.shrv_u8x32(a0, b0), self.shrv_u8x32(a1, b1)) + } + #[inline(always)] + fn shl_u8x64(self, a: u8x64, b: u32) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + self.combine_u8x32(self.shl_u8x32(a0, b), self.shl_u8x32(a1, b)) + } + #[inline(always)] + fn simd_eq_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_mask8x32(self.simd_eq_u8x32(a0, b0), self.simd_eq_u8x32(a1, b1)) + } + #[inline(always)] + fn simd_lt_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_mask8x32(self.simd_lt_u8x32(a0, b0), self.simd_lt_u8x32(a1, b1)) + } + #[inline(always)] + fn simd_le_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_mask8x32(self.simd_le_u8x32(a0, b0), self.simd_le_u8x32(a1, b1)) + } + #[inline(always)] + fn simd_ge_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_mask8x32(self.simd_ge_u8x32(a0, b0), self.simd_ge_u8x32(a1, b1)) + } + #[inline(always)] + fn simd_gt_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_mask8x32(self.simd_gt_u8x32(a0, b0), self.simd_gt_u8x32(a1, b1)) + } + #[inline(always)] + fn zip_low_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, _) = self.split_u8x64(a); + let (b0, _) = self.split_u8x64(b); + self.combine_u8x32(self.zip_low_u8x32(a0, b0), self.zip_high_u8x32(a0, b0)) + } + #[inline(always)] + fn zip_high_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (_, a1) = self.split_u8x64(a); + let (_, b1) = self.split_u8x64(b); + self.combine_u8x32(self.zip_low_u8x32(a1, b1), self.zip_high_u8x32(a1, b1)) + } + #[inline(always)] + fn unzip_low_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.unzip_low_u8x32(a0, a1), self.unzip_low_u8x32(b0, b1)) + } + #[inline(always)] + fn unzip_high_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.unzip_high_u8x32(a0, a1), self.unzip_high_u8x32(b0, b1)) + } + #[inline(always)] + fn select_u8x64(self, a: mask8x64, b: u8x64, c: u8x64) -> u8x64 { + let (a0, a1) = self.split_mask8x64(a); + let (b0, b1) = self.split_u8x64(b); + let (c0, c1) = self.split_u8x64(c); + self.combine_u8x32(self.select_u8x32(a0, b0, c0), self.select_u8x32(a1, b1, c1)) + } + 
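+ // Mask vectors in this backend store each lane as a signed integer that is either 0
+ // (false) or has all bits set (true), which is what the comparison base cases produce;
+ // `select_*` then chooses between its two value operands lane by lane according to the
+ // mask. Illustrative sketch only (made-up values, relying on the methods above):
+ //
+ //     let s = Scalar::new();
+ //     let ones = s.splat_u8x64(1);
+ //     let twos = s.splat_u8x64(2);
+ //     let m = s.simd_lt_u8x64(ones, twos); // every lane compares true => all bits set
+ //     let picked = s.select_u8x64(m, ones, twos); // per-lane choice driven by `m`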
#[inline(always)] + fn min_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.min_u8x32(a0, b0), self.min_u8x32(a1, b1)) + } + #[inline(always)] + fn max_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32(self.max_u8x32(a0, b0), self.max_u8x32(a1, b1)) + } + #[inline(always)] + fn split_u8x64(self, a: u8x64) -> (u8x32, u8x32) { + let mut b0 = [0; 32usize]; + let mut b1 = [0; 32usize]; + b0.copy_from_slice(&a.val[0..32usize]); + b1.copy_from_slice(&a.val[32usize..64usize]); + (b0.simd_into(self), b1.simd_into(self)) + } + #[inline(always)] + fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64 { + [ + src[0usize], + src[4usize], + src[8usize], + src[12usize], + src[16usize], + src[20usize], + src[24usize], + src[28usize], + src[32usize], + src[36usize], + src[40usize], + src[44usize], + src[48usize], + src[52usize], + src[56usize], + src[60usize], + src[1usize], + src[5usize], + src[9usize], + src[13usize], + src[17usize], + src[21usize], + src[25usize], + src[29usize], + src[33usize], + src[37usize], + src[41usize], + src[45usize], + src[49usize], + src[53usize], + src[57usize], + src[61usize], + src[2usize], + src[6usize], + src[10usize], + src[14usize], + src[18usize], + src[22usize], + src[26usize], + src[30usize], + src[34usize], + src[38usize], + src[42usize], + src[46usize], + src[50usize], + src[54usize], + src[58usize], + src[62usize], + src[3usize], + src[7usize], + src[11usize], + src[15usize], + src[19usize], + src[23usize], + src[27usize], + src[31usize], + src[35usize], + src[39usize], + src[43usize], + src[47usize], + src[51usize], + src[55usize], + src[59usize], + src[63usize], + ] + .simd_into(self) + } + #[inline(always)] + fn store_interleaved_128_u8x64(self, a: u8x64, dest: &mut [u8; 64usize]) -> () { + *dest = [ + a[0usize], a[16usize], a[32usize], a[48usize], a[1usize], a[17usize], a[33usize], + a[49usize], a[2usize], a[18usize], a[34usize], a[50usize], a[3usize], a[19usize], + a[35usize], a[51usize], a[4usize], a[20usize], a[36usize], a[52usize], a[5usize], + a[21usize], a[37usize], a[53usize], a[6usize], a[22usize], a[38usize], a[54usize], + a[7usize], a[23usize], a[39usize], a[55usize], a[8usize], a[24usize], a[40usize], + a[56usize], a[9usize], a[25usize], a[41usize], a[57usize], a[10usize], a[26usize], + a[42usize], a[58usize], a[11usize], a[27usize], a[43usize], a[59usize], a[12usize], + a[28usize], a[44usize], a[60usize], a[13usize], a[29usize], a[45usize], a[61usize], + a[14usize], a[30usize], a[46usize], a[62usize], a[15usize], a[31usize], a[47usize], + a[63usize], + ]; + } + #[inline(always)] + fn reinterpret_u32_u8x64(self, a: u8x64) -> u32x16 { + let (a0, a1) = self.split_u8x64(a); + self.combine_u32x8( + self.reinterpret_u32_u8x32(a0), + self.reinterpret_u32_u8x32(a1), + ) + } + #[inline(always)] + fn splat_mask8x64(self, a: i8) -> mask8x64 { + let half = self.splat_mask8x32(a); + self.combine_mask8x32(half, half) + } + #[inline(always)] + fn not_mask8x64(self, a: mask8x64) -> mask8x64 { + let (a0, a1) = self.split_mask8x64(a); + self.combine_mask8x32(self.not_mask8x32(a0), self.not_mask8x32(a1)) + } + #[inline(always)] + fn and_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { + let (a0, a1) = self.split_mask8x64(a); + let (b0, b1) = self.split_mask8x64(b); + self.combine_mask8x32(self.and_mask8x32(a0, b0), self.and_mask8x32(a1, b1)) + } + #[inline(always)] 
+ fn or_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { + let (a0, a1) = self.split_mask8x64(a); + let (b0, b1) = self.split_mask8x64(b); + self.combine_mask8x32(self.or_mask8x32(a0, b0), self.or_mask8x32(a1, b1)) + } + #[inline(always)] + fn xor_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { + let (a0, a1) = self.split_mask8x64(a); + let (b0, b1) = self.split_mask8x64(b); + self.combine_mask8x32(self.xor_mask8x32(a0, b0), self.xor_mask8x32(a1, b1)) + } + #[inline(always)] + fn select_mask8x64( + self, + a: mask8x64, + b: mask8x64, + c: mask8x64, + ) -> mask8x64 { + let (a0, a1) = self.split_mask8x64(a); + let (b0, b1) = self.split_mask8x64(b); + let (c0, c1) = self.split_mask8x64(c); + self.combine_mask8x32( + self.select_mask8x32(a0, b0, c0), + self.select_mask8x32(a1, b1, c1), + ) + } + #[inline(always)] + fn simd_eq_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { + let (a0, a1) = self.split_mask8x64(a); + let (b0, b1) = self.split_mask8x64(b); + self.combine_mask8x32(self.simd_eq_mask8x32(a0, b0), self.simd_eq_mask8x32(a1, b1)) + } + #[inline(always)] + fn split_mask8x64(self, a: mask8x64) -> (mask8x32, mask8x32) { + let mut b0 = [0; 32usize]; + let mut b1 = [0; 32usize]; + b0.copy_from_slice(&a.val[0..32usize]); + b1.copy_from_slice(&a.val[32usize..64usize]); + (b0.simd_into(self), b1.simd_into(self)) + } + #[inline(always)] + fn splat_i16x32(self, a: i16) -> i16x32 { + let half = self.splat_i16x16(a); + self.combine_i16x16(half, half) + } + #[inline(always)] + fn not_i16x32(self, a: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + self.combine_i16x16(self.not_i16x16(a0), self.not_i16x16(a1)) + } + #[inline(always)] + fn add_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.add_i16x16(a0, b0), self.add_i16x16(a1, b1)) + } + #[inline(always)] + fn sub_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.sub_i16x16(a0, b0), self.sub_i16x16(a1, b1)) + } + #[inline(always)] + fn mul_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.mul_i16x16(a0, b0), self.mul_i16x16(a1, b1)) + } + #[inline(always)] + fn and_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.and_i16x16(a0, b0), self.and_i16x16(a1, b1)) + } + #[inline(always)] + fn or_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.or_i16x16(a0, b0), self.or_i16x16(a1, b1)) + } + #[inline(always)] + fn xor_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.xor_i16x16(a0, b0), self.xor_i16x16(a1, b1)) + } + #[inline(always)] + fn shr_i16x32(self, a: i16x32, b: u32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + self.combine_i16x16(self.shr_i16x16(a0, b), self.shr_i16x16(a1, b)) + } + #[inline(always)] + fn shrv_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.shrv_i16x16(a0, b0), self.shrv_i16x16(a1, b1)) + } + #[inline(always)] + fn shl_i16x32(self, a: i16x32, b: u32) -> i16x32 { + let (a0, a1) = 
self.split_i16x32(a); + self.combine_i16x16(self.shl_i16x16(a0, b), self.shl_i16x16(a1, b)) + } + #[inline(always)] + fn simd_eq_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_mask16x16(self.simd_eq_i16x16(a0, b0), self.simd_eq_i16x16(a1, b1)) + } + #[inline(always)] + fn simd_lt_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_mask16x16(self.simd_lt_i16x16(a0, b0), self.simd_lt_i16x16(a1, b1)) + } + #[inline(always)] + fn simd_le_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_mask16x16(self.simd_le_i16x16(a0, b0), self.simd_le_i16x16(a1, b1)) + } + #[inline(always)] + fn simd_ge_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_mask16x16(self.simd_ge_i16x16(a0, b0), self.simd_ge_i16x16(a1, b1)) + } + #[inline(always)] + fn simd_gt_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_mask16x16(self.simd_gt_i16x16(a0, b0), self.simd_gt_i16x16(a1, b1)) + } + #[inline(always)] + fn zip_low_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, _) = self.split_i16x32(a); + let (b0, _) = self.split_i16x32(b); + self.combine_i16x16(self.zip_low_i16x16(a0, b0), self.zip_high_i16x16(a0, b0)) + } + #[inline(always)] + fn zip_high_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (_, a1) = self.split_i16x32(a); + let (_, b1) = self.split_i16x32(b); + self.combine_i16x16(self.zip_low_i16x16(a1, b1), self.zip_high_i16x16(a1, b1)) + } + #[inline(always)] + fn unzip_low_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.unzip_low_i16x16(a0, a1), self.unzip_low_i16x16(b0, b1)) + } + #[inline(always)] + fn unzip_high_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16( + self.unzip_high_i16x16(a0, a1), + self.unzip_high_i16x16(b0, b1), + ) + } + #[inline(always)] + fn select_i16x32(self, a: mask16x32, b: i16x32, c: i16x32) -> i16x32 { + let (a0, a1) = self.split_mask16x32(a); + let (b0, b1) = self.split_i16x32(b); + let (c0, c1) = self.split_i16x32(c); + self.combine_i16x16( + self.select_i16x16(a0, b0, c0), + self.select_i16x16(a1, b1, c1), + ) + } + #[inline(always)] + fn min_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.min_i16x16(a0, b0), self.min_i16x16(a1, b1)) + } + #[inline(always)] + fn max_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16(self.max_i16x16(a0, b0), self.max_i16x16(a1, b1)) + } + #[inline(always)] + fn split_i16x32(self, a: i16x32) -> (i16x16, i16x16) { + let mut b0 = [0; 16usize]; + let mut b1 = [0; 16usize]; + b0.copy_from_slice(&a.val[0..16usize]); + b1.copy_from_slice(&a.val[16usize..32usize]); + (b0.simd_into(self), b1.simd_into(self)) + } + #[inline(always)] + fn neg_i16x32(self, a: i16x32) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + self.combine_i16x16(self.neg_i16x16(a0), self.neg_i16x16(a1)) + } + 
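+ // The 512-bit i16x32 operations above sit two doubling levels above the 128-bit base
+ // case (i16x32 -> i16x16 -> i16x8), so every lane ultimately goes through plain scalar
+ // arithmetic. Illustrative sketch only (made-up values, relying on the methods above):
+ //
+ //     let s = Scalar::new();
+ //     let x = s.splat_i16x32(-3);
+ //     let y = s.neg_i16x32(x); // every lane becomes 3, via two split/combine levels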
#[inline(always)] + fn reinterpret_u8_i16x32(self, a: i16x32) -> u8x64 { + let (a0, a1) = self.split_i16x32(a); + self.combine_u8x32( + self.reinterpret_u8_i16x16(a0), + self.reinterpret_u8_i16x16(a1), + ) + } + #[inline(always)] + fn reinterpret_u32_i16x32(self, a: i16x32) -> u32x16 { + let (a0, a1) = self.split_i16x32(a); + self.combine_u32x8( + self.reinterpret_u32_i16x16(a0), + self.reinterpret_u32_i16x16(a1), + ) + } + #[inline(always)] + fn splat_u16x32(self, a: u16) -> u16x32 { + let half = self.splat_u16x16(a); + self.combine_u16x16(half, half) + } + #[inline(always)] + fn not_u16x32(self, a: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + self.combine_u16x16(self.not_u16x16(a0), self.not_u16x16(a1)) + } + #[inline(always)] + fn add_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.add_u16x16(a0, b0), self.add_u16x16(a1, b1)) + } + #[inline(always)] + fn sub_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.sub_u16x16(a0, b0), self.sub_u16x16(a1, b1)) + } + #[inline(always)] + fn mul_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.mul_u16x16(a0, b0), self.mul_u16x16(a1, b1)) + } + #[inline(always)] + fn and_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.and_u16x16(a0, b0), self.and_u16x16(a1, b1)) + } + #[inline(always)] + fn or_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.or_u16x16(a0, b0), self.or_u16x16(a1, b1)) + } + #[inline(always)] + fn xor_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.xor_u16x16(a0, b0), self.xor_u16x16(a1, b1)) + } + #[inline(always)] + fn shr_u16x32(self, a: u16x32, b: u32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + self.combine_u16x16(self.shr_u16x16(a0, b), self.shr_u16x16(a1, b)) + } + #[inline(always)] + fn shrv_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.shrv_u16x16(a0, b0), self.shrv_u16x16(a1, b1)) + } + #[inline(always)] + fn shl_u16x32(self, a: u16x32, b: u32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + self.combine_u16x16(self.shl_u16x16(a0, b), self.shl_u16x16(a1, b)) + } + #[inline(always)] + fn simd_eq_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_mask16x16(self.simd_eq_u16x16(a0, b0), self.simd_eq_u16x16(a1, b1)) + } + #[inline(always)] + fn simd_lt_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_mask16x16(self.simd_lt_u16x16(a0, b0), self.simd_lt_u16x16(a1, b1)) + } + #[inline(always)] + fn simd_le_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_mask16x16(self.simd_le_u16x16(a0, b0), self.simd_le_u16x16(a1, b1)) + } + #[inline(always)] + fn simd_ge_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { + let (a0, 
a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_mask16x16(self.simd_ge_u16x16(a0, b0), self.simd_ge_u16x16(a1, b1)) + } + #[inline(always)] + fn simd_gt_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_mask16x16(self.simd_gt_u16x16(a0, b0), self.simd_gt_u16x16(a1, b1)) + } + #[inline(always)] + fn zip_low_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, _) = self.split_u16x32(a); + let (b0, _) = self.split_u16x32(b); + self.combine_u16x16(self.zip_low_u16x16(a0, b0), self.zip_high_u16x16(a0, b0)) + } + #[inline(always)] + fn zip_high_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (_, a1) = self.split_u16x32(a); + let (_, b1) = self.split_u16x32(b); + self.combine_u16x16(self.zip_low_u16x16(a1, b1), self.zip_high_u16x16(a1, b1)) + } + #[inline(always)] + fn unzip_low_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.unzip_low_u16x16(a0, a1), self.unzip_low_u16x16(b0, b1)) + } + #[inline(always)] + fn unzip_high_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16( + self.unzip_high_u16x16(a0, a1), + self.unzip_high_u16x16(b0, b1), + ) + } + #[inline(always)] + fn select_u16x32(self, a: mask16x32, b: u16x32, c: u16x32) -> u16x32 { + let (a0, a1) = self.split_mask16x32(a); + let (b0, b1) = self.split_u16x32(b); + let (c0, c1) = self.split_u16x32(c); + self.combine_u16x16( + self.select_u16x16(a0, b0, c0), + self.select_u16x16(a1, b1, c1), + ) + } + #[inline(always)] + fn min_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.min_u16x16(a0, b0), self.min_u16x16(a1, b1)) + } + #[inline(always)] + fn max_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16(self.max_u16x16(a0, b0), self.max_u16x16(a1, b1)) + } + #[inline(always)] + fn split_u16x32(self, a: u16x32) -> (u16x16, u16x16) { + let mut b0 = [0; 16usize]; + let mut b1 = [0; 16usize]; + b0.copy_from_slice(&a.val[0..16usize]); + b1.copy_from_slice(&a.val[16usize..32usize]); + (b0.simd_into(self), b1.simd_into(self)) + } + #[inline(always)] + fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32 { + [ + src[0usize], + src[4usize], + src[8usize], + src[12usize], + src[16usize], + src[20usize], + src[24usize], + src[28usize], + src[1usize], + src[5usize], + src[9usize], + src[13usize], + src[17usize], + src[21usize], + src[25usize], + src[29usize], + src[2usize], + src[6usize], + src[10usize], + src[14usize], + src[18usize], + src[22usize], + src[26usize], + src[30usize], + src[3usize], + src[7usize], + src[11usize], + src[15usize], + src[19usize], + src[23usize], + src[27usize], + src[31usize], + ] + .simd_into(self) + } + #[inline(always)] + fn store_interleaved_128_u16x32(self, a: u16x32, dest: &mut [u16; 32usize]) -> () { + *dest = [ + a[0usize], a[8usize], a[16usize], a[24usize], a[1usize], a[9usize], a[17usize], + a[25usize], a[2usize], a[10usize], a[18usize], a[26usize], a[3usize], a[11usize], + a[19usize], a[27usize], a[4usize], a[12usize], a[20usize], a[28usize], a[5usize], + a[13usize], a[21usize], a[29usize], a[6usize], a[14usize], a[22usize], a[30usize], + a[7usize], a[15usize], a[23usize], 
a[31usize], + ]; + } + #[inline(always)] + fn narrow_u16x32(self, a: u16x32) -> u8x32 { + let (a0, a1) = self.split_u16x32(a); + self.combine_u8x16(self.narrow_u16x16(a0), self.narrow_u16x16(a1)) + } + #[inline(always)] + fn reinterpret_u8_u16x32(self, a: u16x32) -> u8x64 { + let (a0, a1) = self.split_u16x32(a); + self.combine_u8x32( + self.reinterpret_u8_u16x16(a0), + self.reinterpret_u8_u16x16(a1), + ) + } + #[inline(always)] + fn reinterpret_u32_u16x32(self, a: u16x32) -> u32x16 { + let (a0, a1) = self.split_u16x32(a); + self.combine_u32x8( + self.reinterpret_u32_u16x16(a0), + self.reinterpret_u32_u16x16(a1), + ) + } + #[inline(always)] + fn splat_mask16x32(self, a: i16) -> mask16x32 { + let half = self.splat_mask16x16(a); + self.combine_mask16x16(half, half) + } + #[inline(always)] + fn not_mask16x32(self, a: mask16x32) -> mask16x32 { + let (a0, a1) = self.split_mask16x32(a); + self.combine_mask16x16(self.not_mask16x16(a0), self.not_mask16x16(a1)) + } + #[inline(always)] + fn and_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { + let (a0, a1) = self.split_mask16x32(a); + let (b0, b1) = self.split_mask16x32(b); + self.combine_mask16x16(self.and_mask16x16(a0, b0), self.and_mask16x16(a1, b1)) + } + #[inline(always)] + fn or_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { + let (a0, a1) = self.split_mask16x32(a); + let (b0, b1) = self.split_mask16x32(b); + self.combine_mask16x16(self.or_mask16x16(a0, b0), self.or_mask16x16(a1, b1)) + } + #[inline(always)] + fn xor_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { + let (a0, a1) = self.split_mask16x32(a); + let (b0, b1) = self.split_mask16x32(b); + self.combine_mask16x16(self.xor_mask16x16(a0, b0), self.xor_mask16x16(a1, b1)) + } + #[inline(always)] + fn select_mask16x32( + self, + a: mask16x32, + b: mask16x32, + c: mask16x32, + ) -> mask16x32 { + let (a0, a1) = self.split_mask16x32(a); + let (b0, b1) = self.split_mask16x32(b); + let (c0, c1) = self.split_mask16x32(c); + self.combine_mask16x16( + self.select_mask16x16(a0, b0, c0), + self.select_mask16x16(a1, b1, c1), + ) + } + #[inline(always)] + fn simd_eq_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { + let (a0, a1) = self.split_mask16x32(a); + let (b0, b1) = self.split_mask16x32(b); + self.combine_mask16x16( + self.simd_eq_mask16x16(a0, b0), + self.simd_eq_mask16x16(a1, b1), + ) + } + #[inline(always)] + fn split_mask16x32(self, a: mask16x32) -> (mask16x16, mask16x16) { + let mut b0 = [0; 16usize]; + let mut b1 = [0; 16usize]; + b0.copy_from_slice(&a.val[0..16usize]); + b1.copy_from_slice(&a.val[16usize..32usize]); + (b0.simd_into(self), b1.simd_into(self)) + } + #[inline(always)] + fn splat_i32x16(self, a: i32) -> i32x16 { + let half = self.splat_i32x8(a); + self.combine_i32x8(half, half) + } + #[inline(always)] + fn not_i32x16(self, a: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + self.combine_i32x8(self.not_i32x8(a0), self.not_i32x8(a1)) + } + #[inline(always)] + fn add_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.add_i32x8(a0, b0), self.add_i32x8(a1, b1)) + } + #[inline(always)] + fn sub_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.sub_i32x8(a0, b0), self.sub_i32x8(a1, b1)) + } + #[inline(always)] + fn mul_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = 
self.split_i32x16(b); + self.combine_i32x8(self.mul_i32x8(a0, b0), self.mul_i32x8(a1, b1)) + } + #[inline(always)] + fn and_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.and_i32x8(a0, b0), self.and_i32x8(a1, b1)) + } + #[inline(always)] + fn or_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.or_i32x8(a0, b0), self.or_i32x8(a1, b1)) + } + #[inline(always)] + fn xor_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.xor_i32x8(a0, b0), self.xor_i32x8(a1, b1)) + } + #[inline(always)] + fn shr_i32x16(self, a: i32x16, b: u32) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + self.combine_i32x8(self.shr_i32x8(a0, b), self.shr_i32x8(a1, b)) + } + #[inline(always)] + fn shrv_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.shrv_i32x8(a0, b0), self.shrv_i32x8(a1, b1)) + } + #[inline(always)] + fn shl_i32x16(self, a: i32x16, b: u32) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + self.combine_i32x8(self.shl_i32x8(a0, b), self.shl_i32x8(a1, b)) + } + #[inline(always)] + fn simd_eq_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_mask32x8(self.simd_eq_i32x8(a0, b0), self.simd_eq_i32x8(a1, b1)) + } + #[inline(always)] + fn simd_lt_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_mask32x8(self.simd_lt_i32x8(a0, b0), self.simd_lt_i32x8(a1, b1)) + } + #[inline(always)] + fn simd_le_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_mask32x8(self.simd_le_i32x8(a0, b0), self.simd_le_i32x8(a1, b1)) + } + #[inline(always)] + fn simd_ge_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_mask32x8(self.simd_ge_i32x8(a0, b0), self.simd_ge_i32x8(a1, b1)) + } + #[inline(always)] + fn simd_gt_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_mask32x8(self.simd_gt_i32x8(a0, b0), self.simd_gt_i32x8(a1, b1)) + } + #[inline(always)] + fn zip_low_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, _) = self.split_i32x16(a); + let (b0, _) = self.split_i32x16(b); + self.combine_i32x8(self.zip_low_i32x8(a0, b0), self.zip_high_i32x8(a0, b0)) + } + #[inline(always)] + fn zip_high_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (_, a1) = self.split_i32x16(a); + let (_, b1) = self.split_i32x16(b); + self.combine_i32x8(self.zip_low_i32x8(a1, b1), self.zip_high_i32x8(a1, b1)) + } + #[inline(always)] + fn unzip_low_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.unzip_low_i32x8(a0, a1), self.unzip_low_i32x8(b0, b1)) + } + #[inline(always)] + fn unzip_high_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.unzip_high_i32x8(a0, a1), self.unzip_high_i32x8(b0, 
b1)) + } + #[inline(always)] + fn select_i32x16(self, a: mask32x16, b: i32x16, c: i32x16) -> i32x16 { + let (a0, a1) = self.split_mask32x16(a); + let (b0, b1) = self.split_i32x16(b); + let (c0, c1) = self.split_i32x16(c); + self.combine_i32x8(self.select_i32x8(a0, b0, c0), self.select_i32x8(a1, b1, c1)) + } + #[inline(always)] + fn min_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.min_i32x8(a0, b0), self.min_i32x8(a1, b1)) + } + #[inline(always)] + fn max_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8(self.max_i32x8(a0, b0), self.max_i32x8(a1, b1)) + } + #[inline(always)] + fn split_i32x16(self, a: i32x16) -> (i32x8, i32x8) { + let mut b0 = [0; 8usize]; + let mut b1 = [0; 8usize]; + b0.copy_from_slice(&a.val[0..8usize]); + b1.copy_from_slice(&a.val[8usize..16usize]); + (b0.simd_into(self), b1.simd_into(self)) + } + #[inline(always)] + fn neg_i32x16(self, a: i32x16) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + self.combine_i32x8(self.neg_i32x8(a0), self.neg_i32x8(a1)) + } + #[inline(always)] + fn reinterpret_u8_i32x16(self, a: i32x16) -> u8x64 { + let (a0, a1) = self.split_i32x16(a); + self.combine_u8x32(self.reinterpret_u8_i32x8(a0), self.reinterpret_u8_i32x8(a1)) + } + #[inline(always)] + fn reinterpret_u32_i32x16(self, a: i32x16) -> u32x16 { + let (a0, a1) = self.split_i32x16(a); + self.combine_u32x8( + self.reinterpret_u32_i32x8(a0), + self.reinterpret_u32_i32x8(a1), + ) + } + #[inline(always)] + fn cvt_f32_i32x16(self, a: i32x16) -> f32x16 { + let (a0, a1) = self.split_i32x16(a); + self.combine_f32x8(self.cvt_f32_i32x8(a0), self.cvt_f32_i32x8(a1)) + } + #[inline(always)] + fn splat_u32x16(self, a: u32) -> u32x16 { + let half = self.splat_u32x8(a); + self.combine_u32x8(half, half) + } + #[inline(always)] + fn not_u32x16(self, a: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + self.combine_u32x8(self.not_u32x8(a0), self.not_u32x8(a1)) + } + #[inline(always)] + fn add_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.add_u32x8(a0, b0), self.add_u32x8(a1, b1)) + } + #[inline(always)] + fn sub_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.sub_u32x8(a0, b0), self.sub_u32x8(a1, b1)) + } + #[inline(always)] + fn mul_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.mul_u32x8(a0, b0), self.mul_u32x8(a1, b1)) + } + #[inline(always)] + fn and_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.and_u32x8(a0, b0), self.and_u32x8(a1, b1)) + } + #[inline(always)] + fn or_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.or_u32x8(a0, b0), self.or_u32x8(a1, b1)) + } + #[inline(always)] + fn xor_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.xor_u32x8(a0, b0), self.xor_u32x8(a1, b1)) + } + #[inline(always)] + fn shr_u32x16(self, a: u32x16, b: u32) -> u32x16 { + let (a0, a1) = 
self.split_u32x16(a); + self.combine_u32x8(self.shr_u32x8(a0, b), self.shr_u32x8(a1, b)) + } + #[inline(always)] + fn shrv_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.shrv_u32x8(a0, b0), self.shrv_u32x8(a1, b1)) + } + #[inline(always)] + fn shl_u32x16(self, a: u32x16, b: u32) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + self.combine_u32x8(self.shl_u32x8(a0, b), self.shl_u32x8(a1, b)) + } + #[inline(always)] + fn simd_eq_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_mask32x8(self.simd_eq_u32x8(a0, b0), self.simd_eq_u32x8(a1, b1)) + } + #[inline(always)] + fn simd_lt_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_mask32x8(self.simd_lt_u32x8(a0, b0), self.simd_lt_u32x8(a1, b1)) + } + #[inline(always)] + fn simd_le_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_mask32x8(self.simd_le_u32x8(a0, b0), self.simd_le_u32x8(a1, b1)) + } + #[inline(always)] + fn simd_ge_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_mask32x8(self.simd_ge_u32x8(a0, b0), self.simd_ge_u32x8(a1, b1)) + } + #[inline(always)] + fn simd_gt_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_mask32x8(self.simd_gt_u32x8(a0, b0), self.simd_gt_u32x8(a1, b1)) + } + #[inline(always)] + fn zip_low_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, _) = self.split_u32x16(a); + let (b0, _) = self.split_u32x16(b); + self.combine_u32x8(self.zip_low_u32x8(a0, b0), self.zip_high_u32x8(a0, b0)) + } + #[inline(always)] + fn zip_high_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (_, a1) = self.split_u32x16(a); + let (_, b1) = self.split_u32x16(b); + self.combine_u32x8(self.zip_low_u32x8(a1, b1), self.zip_high_u32x8(a1, b1)) + } + #[inline(always)] + fn unzip_low_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.unzip_low_u32x8(a0, a1), self.unzip_low_u32x8(b0, b1)) + } + #[inline(always)] + fn unzip_high_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.unzip_high_u32x8(a0, a1), self.unzip_high_u32x8(b0, b1)) + } + #[inline(always)] + fn select_u32x16(self, a: mask32x16, b: u32x16, c: u32x16) -> u32x16 { + let (a0, a1) = self.split_mask32x16(a); + let (b0, b1) = self.split_u32x16(b); + let (c0, c1) = self.split_u32x16(c); + self.combine_u32x8(self.select_u32x8(a0, b0, c0), self.select_u32x8(a1, b1, c1)) + } + #[inline(always)] + fn min_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.min_u32x8(a0, b0), self.min_u32x8(a1, b1)) + } + #[inline(always)] + fn max_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8(self.max_u32x8(a0, b0), self.max_u32x8(a1, b1)) + } + #[inline(always)] + fn split_u32x16(self, a: u32x16) -> (u32x8, u32x8) { + let mut b0 = [0; 
8usize]; + let mut b1 = [0; 8usize]; + b0.copy_from_slice(&a.val[0..8usize]); + b1.copy_from_slice(&a.val[8usize..16usize]); + (b0.simd_into(self), b1.simd_into(self)) + } + #[inline(always)] + fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16 { + [ + src[0usize], + src[4usize], + src[8usize], + src[12usize], + src[1usize], + src[5usize], + src[9usize], + src[13usize], + src[2usize], + src[6usize], + src[10usize], + src[14usize], + src[3usize], + src[7usize], + src[11usize], + src[15usize], + ] + .simd_into(self) + } + #[inline(always)] + fn store_interleaved_128_u32x16(self, a: u32x16, dest: &mut [u32; 16usize]) -> () { + *dest = [ + a[0usize], a[4usize], a[8usize], a[12usize], a[1usize], a[5usize], a[9usize], + a[13usize], a[2usize], a[6usize], a[10usize], a[14usize], a[3usize], a[7usize], + a[11usize], a[15usize], + ]; + } + #[inline(always)] + fn reinterpret_u8_u32x16(self, a: u32x16) -> u8x64 { + let (a0, a1) = self.split_u32x16(a); + self.combine_u8x32(self.reinterpret_u8_u32x8(a0), self.reinterpret_u8_u32x8(a1)) + } + #[inline(always)] + fn cvt_f32_u32x16(self, a: u32x16) -> f32x16 { + let (a0, a1) = self.split_u32x16(a); + self.combine_f32x8(self.cvt_f32_u32x8(a0), self.cvt_f32_u32x8(a1)) + } + #[inline(always)] + fn splat_mask32x16(self, a: i32) -> mask32x16 { + let half = self.splat_mask32x8(a); + self.combine_mask32x8(half, half) + } + #[inline(always)] + fn not_mask32x16(self, a: mask32x16) -> mask32x16 { + let (a0, a1) = self.split_mask32x16(a); + self.combine_mask32x8(self.not_mask32x8(a0), self.not_mask32x8(a1)) + } + #[inline(always)] + fn and_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { + let (a0, a1) = self.split_mask32x16(a); + let (b0, b1) = self.split_mask32x16(b); + self.combine_mask32x8(self.and_mask32x8(a0, b0), self.and_mask32x8(a1, b1)) + } + #[inline(always)] + fn or_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { + let (a0, a1) = self.split_mask32x16(a); + let (b0, b1) = self.split_mask32x16(b); + self.combine_mask32x8(self.or_mask32x8(a0, b0), self.or_mask32x8(a1, b1)) + } + #[inline(always)] + fn xor_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { + let (a0, a1) = self.split_mask32x16(a); + let (b0, b1) = self.split_mask32x16(b); + self.combine_mask32x8(self.xor_mask32x8(a0, b0), self.xor_mask32x8(a1, b1)) + } + #[inline(always)] + fn select_mask32x16( + self, + a: mask32x16, + b: mask32x16, + c: mask32x16, + ) -> mask32x16 { + let (a0, a1) = self.split_mask32x16(a); + let (b0, b1) = self.split_mask32x16(b); + let (c0, c1) = self.split_mask32x16(c); + self.combine_mask32x8( + self.select_mask32x8(a0, b0, c0), + self.select_mask32x8(a1, b1, c1), + ) + } + #[inline(always)] + fn simd_eq_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { + let (a0, a1) = self.split_mask32x16(a); + let (b0, b1) = self.split_mask32x16(b); + self.combine_mask32x8(self.simd_eq_mask32x8(a0, b0), self.simd_eq_mask32x8(a1, b1)) + } + #[inline(always)] + fn split_mask32x16(self, a: mask32x16) -> (mask32x8, mask32x8) { + let mut b0 = [0; 8usize]; + let mut b1 = [0; 8usize]; + b0.copy_from_slice(&a.val[0..8usize]); + b1.copy_from_slice(&a.val[8usize..16usize]); + (b0.simd_into(self), b1.simd_into(self)) + } + #[inline(always)] + fn splat_f64x8(self, a: f64) -> f64x8 { + let half = self.splat_f64x4(a); + self.combine_f64x4(half, half) + } + #[inline(always)] + fn abs_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4(self.abs_f64x4(a0), self.abs_f64x4(a1)) + } + #[inline(always)] + fn 
neg_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4(self.neg_f64x4(a0), self.neg_f64x4(a1)) + } + #[inline(always)] + fn sqrt_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1)) + } + #[inline(always)] + fn add_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4(self.add_f64x4(a0, b0), self.add_f64x4(a1, b1)) + } + #[inline(always)] + fn sub_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4(self.sub_f64x4(a0, b0), self.sub_f64x4(a1, b1)) + } + #[inline(always)] + fn mul_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4(self.mul_f64x4(a0, b0), self.mul_f64x4(a1, b1)) + } + #[inline(always)] + fn div_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4(self.div_f64x4(a0, b0), self.div_f64x4(a1, b1)) + } + #[inline(always)] + fn copysign_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4(self.copysign_f64x4(a0, b0), self.copysign_f64x4(a1, b1)) + } + #[inline(always)] + fn simd_eq_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_mask64x4(self.simd_eq_f64x4(a0, b0), self.simd_eq_f64x4(a1, b1)) + } + #[inline(always)] + fn simd_lt_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_mask64x4(self.simd_lt_f64x4(a0, b0), self.simd_lt_f64x4(a1, b1)) + } + #[inline(always)] + fn simd_le_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_mask64x4(self.simd_le_f64x4(a0, b0), self.simd_le_f64x4(a1, b1)) + } + #[inline(always)] + fn simd_ge_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_mask64x4(self.simd_ge_f64x4(a0, b0), self.simd_ge_f64x4(a1, b1)) + } + #[inline(always)] + fn simd_gt_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_mask64x4(self.simd_gt_f64x4(a0, b0), self.simd_gt_f64x4(a1, b1)) + } + #[inline(always)] + fn zip_low_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, _) = self.split_f64x8(a); + let (b0, _) = self.split_f64x8(b); + self.combine_f64x4(self.zip_low_f64x4(a0, b0), self.zip_high_f64x4(a0, b0)) + } + #[inline(always)] + fn zip_high_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (_, a1) = self.split_f64x8(a); + let (_, b1) = self.split_f64x8(b); + self.combine_f64x4(self.zip_low_f64x4(a1, b1), self.zip_high_f64x4(a1, b1)) + } + #[inline(always)] + fn unzip_low_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4(self.unzip_low_f64x4(a0, a1), self.unzip_low_f64x4(b0, b1)) + } + #[inline(always)] + fn unzip_high_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4(self.unzip_high_f64x4(a0, a1), 
self.unzip_high_f64x4(b0, b1)) + } + #[inline(always)] + fn max_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4(self.max_f64x4(a0, b0), self.max_f64x4(a1, b1)) + } + #[inline(always)] + fn max_precise_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4( + self.max_precise_f64x4(a0, b0), + self.max_precise_f64x4(a1, b1), + ) + } + #[inline(always)] + fn min_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4(self.min_f64x4(a0, b0), self.min_f64x4(a1, b1)) + } + #[inline(always)] + fn min_precise_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4( + self.min_precise_f64x4(a0, b0), + self.min_precise_f64x4(a1, b1), + ) + } + #[inline(always)] + fn madd_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + let (c0, c1) = self.split_f64x8(c); + self.combine_f64x4(self.madd_f64x4(a0, b0, c0), self.madd_f64x4(a1, b1, c1)) + } + #[inline(always)] + fn msub_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + let (c0, c1) = self.split_f64x8(c); + self.combine_f64x4(self.msub_f64x4(a0, b0, c0), self.msub_f64x4(a1, b1, c1)) + } + #[inline(always)] + fn floor_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1)) + } + #[inline(always)] + fn fract_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4(self.fract_f64x4(a0), self.fract_f64x4(a1)) + } + #[inline(always)] + fn trunc_f64x8(self, a: f64x8) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f64x4(self.trunc_f64x4(a0), self.trunc_f64x4(a1)) + } + #[inline(always)] + fn select_f64x8(self, a: mask64x8, b: f64x8, c: f64x8) -> f64x8 { + let (a0, a1) = self.split_mask64x8(a); + let (b0, b1) = self.split_f64x8(b); + let (c0, c1) = self.split_f64x8(c); + self.combine_f64x4(self.select_f64x4(a0, b0, c0), self.select_f64x4(a1, b1, c1)) + } + #[inline(always)] + fn split_f64x8(self, a: f64x8) -> (f64x4, f64x4) { + let mut b0 = [0.0; 4usize]; + let mut b1 = [0.0; 4usize]; + b0.copy_from_slice(&a.val[0..4usize]); + b1.copy_from_slice(&a.val[4usize..8usize]); + (b0.simd_into(self), b1.simd_into(self)) + } + #[inline(always)] + fn reinterpret_f32_f64x8(self, a: f64x8) -> f32x16 { + let (a0, a1) = self.split_f64x8(a); + self.combine_f32x8( + self.reinterpret_f32_f64x4(a0), + self.reinterpret_f32_f64x4(a1), + ) + } + #[inline(always)] + fn splat_mask64x8(self, a: i64) -> mask64x8 { + let half = self.splat_mask64x4(a); + self.combine_mask64x4(half, half) + } + #[inline(always)] + fn not_mask64x8(self, a: mask64x8) -> mask64x8 { + let (a0, a1) = self.split_mask64x8(a); + self.combine_mask64x4(self.not_mask64x4(a0), self.not_mask64x4(a1)) + } + #[inline(always)] + fn and_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8 { + let (a0, a1) = self.split_mask64x8(a); + let (b0, b1) = self.split_mask64x8(b); + self.combine_mask64x4(self.and_mask64x4(a0, b0), self.and_mask64x4(a1, b1)) + } + #[inline(always)] + fn or_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8 { + let (a0, a1) = self.split_mask64x8(a); + let (b0, b1) = 
self.split_mask64x8(b); + self.combine_mask64x4(self.or_mask64x4(a0, b0), self.or_mask64x4(a1, b1)) + } + #[inline(always)] + fn xor_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8 { + let (a0, a1) = self.split_mask64x8(a); + let (b0, b1) = self.split_mask64x8(b); + self.combine_mask64x4(self.xor_mask64x4(a0, b0), self.xor_mask64x4(a1, b1)) + } + #[inline(always)] + fn select_mask64x8( + self, + a: mask64x8, + b: mask64x8, + c: mask64x8, + ) -> mask64x8 { + let (a0, a1) = self.split_mask64x8(a); + let (b0, b1) = self.split_mask64x8(b); + let (c0, c1) = self.split_mask64x8(c); + self.combine_mask64x4( + self.select_mask64x4(a0, b0, c0), + self.select_mask64x4(a1, b1, c1), + ) + } + #[inline(always)] + fn simd_eq_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8 { + let (a0, a1) = self.split_mask64x8(a); + let (b0, b1) = self.split_mask64x8(b); + self.combine_mask64x4(self.simd_eq_mask64x4(a0, b0), self.simd_eq_mask64x4(a1, b1)) + } + #[inline(always)] + fn split_mask64x8(self, a: mask64x8) -> (mask64x4, mask64x4) { + let mut b0 = [0; 4usize]; + let mut b1 = [0; 4usize]; + b0.copy_from_slice(&a.val[0..4usize]); + b1.copy_from_slice(&a.val[4usize..8usize]); + (b0.simd_into(self), b1.simd_into(self)) + } +} +impl Seal for Scalar {} diff --git a/fearless_simd/src/generated/simd_types.rs b/fearless_simd/src/generated/simd_types.rs index 108473d6..ff22104d 100644 --- a/fearless_simd/src/generated/simd_types.rs +++ b/fearless_simd/src/generated/simd_types.rs @@ -3,7 +3,7 @@ // This file is autogenerated by fearless_simd_gen -use crate::{Bytes, Select, Simd, SimdCvtFloat, SimdCvtTruncate, SimdFrom, SimdInto}; +use crate::{Bytes, Scalar, Select, Simd, SimdCvtFloat, SimdCvtTruncate, SimdFrom, SimdInto}; #[derive(Clone, Copy, Debug)] #[repr(C, align(16))] pub struct f32x4 { @@ -8052,3 +8052,614 @@ impl crate::SimdMask for mask64x8 { self.simd.simd_eq_mask64x8(self, rhs.simd_into(self.simd)) } } +impl crate::Bytes for u8 { + type Bytes = u8; + fn to_bytes(self) -> u8 { + self + } + fn from_bytes(value: u8) -> Self { + value + } +} +impl crate::SimdBase for u8 { + const N: usize = 1; + type Mask = i8; + type Block = u8x16; + #[inline(always)] + fn witness(&self) -> Scalar { + Scalar + } + #[inline(always)] + fn as_slice(&self) -> &[u8] { + core::slice::from_ref(self) + } + #[inline(always)] + fn as_mut_slice(&mut self) -> &mut [u8] { + core::slice::from_mut(self) + } + #[inline(always)] + fn from_slice(Scalar: Scalar, slice: &[u8]) -> Self { + slice[0] + } + #[inline(always)] + fn splat(Scalar: Scalar, val: u8) -> Self { + val + } + #[inline(always)] + fn block_splat(block: Self::Block) -> Self { + block.as_slice()[0] + } +} +impl crate::SimdInt for u8 { + fn simd_eq(self, rhs: impl SimdInto) -> Self::Mask { + -((self == rhs.simd_into(Scalar)) as i8) + } + fn simd_lt(self, rhs: impl SimdInto) -> Self::Mask { + -((self < rhs.simd_into(Scalar)) as i8) + } + fn simd_le(self, rhs: impl SimdInto) -> Self::Mask { + -((self <= rhs.simd_into(Scalar)) as i8) + } + fn simd_ge(self, rhs: impl SimdInto) -> Self::Mask { + -((self >= rhs.simd_into(Scalar)) as i8) + } + fn simd_gt(self, rhs: impl SimdInto) -> Self::Mask { + -((self > rhs.simd_into(Scalar)) as i8) + } + fn zip_low(self, _rhs: impl SimdInto) -> Self { + self + } + fn zip_high(self, _rhs: impl SimdInto) -> Self { + self + } + fn unzip_low(self, _rhs: impl SimdInto) -> Self { + self + } + fn unzip_high(self, _rhs: impl SimdInto) -> Self { + self + } + fn min(self, rhs: impl SimdInto) -> Self { + Ord::min(self, rhs.simd_into(Scalar)) + } + fn 
max(self, rhs: impl SimdInto) -> Self { + Ord::max(self, rhs.simd_into(Scalar)) + } +} +impl crate::Select for i8 { + fn select(self, if_true: u8, if_false: u8) -> u8 { + if self != 0 { if_true } else { if_false } + } +} +impl crate::Bytes for u16 { + type Bytes = u16; + fn to_bytes(self) -> u16 { + self + } + fn from_bytes(value: u16) -> Self { + value + } +} +impl crate::SimdBase for u16 { + const N: usize = 1; + type Mask = i16; + type Block = u16x8; + #[inline(always)] + fn witness(&self) -> Scalar { + Scalar + } + #[inline(always)] + fn as_slice(&self) -> &[u16] { + core::slice::from_ref(self) + } + #[inline(always)] + fn as_mut_slice(&mut self) -> &mut [u16] { + core::slice::from_mut(self) + } + #[inline(always)] + fn from_slice(Scalar: Scalar, slice: &[u16]) -> Self { + slice[0] + } + #[inline(always)] + fn splat(Scalar: Scalar, val: u16) -> Self { + val + } + #[inline(always)] + fn block_splat(block: Self::Block) -> Self { + block.as_slice()[0] + } +} +impl crate::SimdInt for u16 { + fn simd_eq(self, rhs: impl SimdInto) -> Self::Mask { + -((self == rhs.simd_into(Scalar)) as i16) + } + fn simd_lt(self, rhs: impl SimdInto) -> Self::Mask { + -((self < rhs.simd_into(Scalar)) as i16) + } + fn simd_le(self, rhs: impl SimdInto) -> Self::Mask { + -((self <= rhs.simd_into(Scalar)) as i16) + } + fn simd_ge(self, rhs: impl SimdInto) -> Self::Mask { + -((self >= rhs.simd_into(Scalar)) as i16) + } + fn simd_gt(self, rhs: impl SimdInto) -> Self::Mask { + -((self > rhs.simd_into(Scalar)) as i16) + } + fn zip_low(self, _rhs: impl SimdInto) -> Self { + self + } + fn zip_high(self, _rhs: impl SimdInto) -> Self { + self + } + fn unzip_low(self, _rhs: impl SimdInto) -> Self { + self + } + fn unzip_high(self, _rhs: impl SimdInto) -> Self { + self + } + fn min(self, rhs: impl SimdInto) -> Self { + Ord::min(self, rhs.simd_into(Scalar)) + } + fn max(self, rhs: impl SimdInto) -> Self { + Ord::max(self, rhs.simd_into(Scalar)) + } +} +impl crate::Select for i16 { + fn select(self, if_true: u16, if_false: u16) -> u16 { + if self != 0 { if_true } else { if_false } + } +} +impl crate::Bytes for u32 { + type Bytes = u32; + fn to_bytes(self) -> u32 { + self + } + fn from_bytes(value: u32) -> Self { + value + } +} +impl crate::SimdBase for u32 { + const N: usize = 1; + type Mask = i32; + type Block = u32x4; + #[inline(always)] + fn witness(&self) -> Scalar { + Scalar + } + #[inline(always)] + fn as_slice(&self) -> &[u32] { + core::slice::from_ref(self) + } + #[inline(always)] + fn as_mut_slice(&mut self) -> &mut [u32] { + core::slice::from_mut(self) + } + #[inline(always)] + fn from_slice(Scalar: Scalar, slice: &[u32]) -> Self { + slice[0] + } + #[inline(always)] + fn splat(Scalar: Scalar, val: u32) -> Self { + val + } + #[inline(always)] + fn block_splat(block: Self::Block) -> Self { + block.as_slice()[0] + } +} +impl crate::SimdInt for u32 { + fn simd_eq(self, rhs: impl SimdInto) -> Self::Mask { + -((self == rhs.simd_into(Scalar)) as i32) + } + fn simd_lt(self, rhs: impl SimdInto) -> Self::Mask { + -((self < rhs.simd_into(Scalar)) as i32) + } + fn simd_le(self, rhs: impl SimdInto) -> Self::Mask { + -((self <= rhs.simd_into(Scalar)) as i32) + } + fn simd_ge(self, rhs: impl SimdInto) -> Self::Mask { + -((self >= rhs.simd_into(Scalar)) as i32) + } + fn simd_gt(self, rhs: impl SimdInto) -> Self::Mask { + -((self > rhs.simd_into(Scalar)) as i32) + } + fn zip_low(self, _rhs: impl SimdInto) -> Self { + self + } + fn zip_high(self, _rhs: impl SimdInto) -> Self { + self + } + fn unzip_low(self, _rhs: impl SimdInto) -> 
Self { + self + } + fn unzip_high(self, _rhs: impl SimdInto) -> Self { + self + } + fn min(self, rhs: impl SimdInto) -> Self { + Ord::min(self, rhs.simd_into(Scalar)) + } + fn max(self, rhs: impl SimdInto) -> Self { + Ord::max(self, rhs.simd_into(Scalar)) + } +} +impl crate::Select for i32 { + fn select(self, if_true: u32, if_false: u32) -> u32 { + if self != 0 { if_true } else { if_false } + } +} +impl crate::SimdCvtTruncate for u32 { + fn truncate_from(x: f32) -> Self { + x as Self + } +} +impl crate::SimdCvtFloat for f32 { + fn float_from(x: u32) -> Self { + x as Self + } +} +impl crate::Bytes for i8 { + type Bytes = u8; + fn to_bytes(self) -> u8 { + self as u8 + } + fn from_bytes(value: u8) -> Self { + value as Self + } +} +impl crate::SimdBase for i8 { + const N: usize = 1; + type Mask = i8; + type Block = i8x16; + #[inline(always)] + fn witness(&self) -> Scalar { + Scalar + } + #[inline(always)] + fn as_slice(&self) -> &[i8] { + core::slice::from_ref(self) + } + #[inline(always)] + fn as_mut_slice(&mut self) -> &mut [i8] { + core::slice::from_mut(self) + } + #[inline(always)] + fn from_slice(Scalar: Scalar, slice: &[i8]) -> Self { + slice[0] + } + #[inline(always)] + fn splat(Scalar: Scalar, val: i8) -> Self { + val + } + #[inline(always)] + fn block_splat(block: Self::Block) -> Self { + block.as_slice()[0] + } +} +impl crate::SimdInt for i8 { + fn simd_eq(self, rhs: impl SimdInto) -> Self::Mask { + -((self == rhs.simd_into(Scalar)) as i8) + } + fn simd_lt(self, rhs: impl SimdInto) -> Self::Mask { + -((self < rhs.simd_into(Scalar)) as i8) + } + fn simd_le(self, rhs: impl SimdInto) -> Self::Mask { + -((self <= rhs.simd_into(Scalar)) as i8) + } + fn simd_ge(self, rhs: impl SimdInto) -> Self::Mask { + -((self >= rhs.simd_into(Scalar)) as i8) + } + fn simd_gt(self, rhs: impl SimdInto) -> Self::Mask { + -((self > rhs.simd_into(Scalar)) as i8) + } + fn zip_low(self, _rhs: impl SimdInto) -> Self { + self + } + fn zip_high(self, _rhs: impl SimdInto) -> Self { + self + } + fn unzip_low(self, _rhs: impl SimdInto) -> Self { + self + } + fn unzip_high(self, _rhs: impl SimdInto) -> Self { + self + } + fn min(self, rhs: impl SimdInto) -> Self { + Ord::min(self, rhs.simd_into(Scalar)) + } + fn max(self, rhs: impl SimdInto) -> Self { + Ord::max(self, rhs.simd_into(Scalar)) + } +} +impl crate::Select for i8 { + fn select(self, if_true: i8, if_false: i8) -> i8 { + if self != 0 { if_true } else { if_false } + } +} +impl crate::Bytes for i16 { + type Bytes = u16; + fn to_bytes(self) -> u16 { + self as u16 + } + fn from_bytes(value: u16) -> Self { + value as Self + } +} +impl crate::SimdBase for i16 { + const N: usize = 1; + type Mask = i16; + type Block = i16x8; + #[inline(always)] + fn witness(&self) -> Scalar { + Scalar + } + #[inline(always)] + fn as_slice(&self) -> &[i16] { + core::slice::from_ref(self) + } + #[inline(always)] + fn as_mut_slice(&mut self) -> &mut [i16] { + core::slice::from_mut(self) + } + #[inline(always)] + fn from_slice(Scalar: Scalar, slice: &[i16]) -> Self { + slice[0] + } + #[inline(always)] + fn splat(Scalar: Scalar, val: i16) -> Self { + val + } + #[inline(always)] + fn block_splat(block: Self::Block) -> Self { + block.as_slice()[0] + } +} +impl crate::SimdInt for i16 { + fn simd_eq(self, rhs: impl SimdInto) -> Self::Mask { + -((self == rhs.simd_into(Scalar)) as i16) + } + fn simd_lt(self, rhs: impl SimdInto) -> Self::Mask { + -((self < rhs.simd_into(Scalar)) as i16) + } + fn simd_le(self, rhs: impl SimdInto) -> Self::Mask { + -((self <= rhs.simd_into(Scalar)) as i16) + } + 
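// Editorial note (illustrative, not part of the generated code): these scalar
// comparisons use the same mask convention as the wide vector types. The
// expression `-((self < rhs) as i16)` maps a `true` comparison to `-1` (all
// bits set, i.e. 0xFFFF) and `false` to `0`, so a bare `i16` behaves like a
// one-lane `mask16`. For example, under this convention `2i16.simd_lt(3)`
// would yield `-1` and `3i16.simd_lt(2)` would yield `0`. The all-ones
// encoding keeps bitwise mask operations consistent with the vector masks,
// while the `Select` impls below only need the weaker `!= 0` test.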
fn simd_ge(self, rhs: impl SimdInto) -> Self::Mask { + -((self >= rhs.simd_into(Scalar)) as i16) + } + fn simd_gt(self, rhs: impl SimdInto) -> Self::Mask { + -((self > rhs.simd_into(Scalar)) as i16) + } + fn zip_low(self, _rhs: impl SimdInto) -> Self { + self + } + fn zip_high(self, _rhs: impl SimdInto) -> Self { + self + } + fn unzip_low(self, _rhs: impl SimdInto) -> Self { + self + } + fn unzip_high(self, _rhs: impl SimdInto) -> Self { + self + } + fn min(self, rhs: impl SimdInto) -> Self { + Ord::min(self, rhs.simd_into(Scalar)) + } + fn max(self, rhs: impl SimdInto) -> Self { + Ord::max(self, rhs.simd_into(Scalar)) + } +} +impl crate::Select for i16 { + fn select(self, if_true: i16, if_false: i16) -> i16 { + if self != 0 { if_true } else { if_false } + } +} +impl crate::Bytes for i32 { + type Bytes = u32; + fn to_bytes(self) -> u32 { + self as u32 + } + fn from_bytes(value: u32) -> Self { + value as Self + } +} +impl crate::SimdBase for i32 { + const N: usize = 1; + type Mask = i32; + type Block = i32x4; + #[inline(always)] + fn witness(&self) -> Scalar { + Scalar + } + #[inline(always)] + fn as_slice(&self) -> &[i32] { + core::slice::from_ref(self) + } + #[inline(always)] + fn as_mut_slice(&mut self) -> &mut [i32] { + core::slice::from_mut(self) + } + #[inline(always)] + fn from_slice(Scalar: Scalar, slice: &[i32]) -> Self { + slice[0] + } + #[inline(always)] + fn splat(Scalar: Scalar, val: i32) -> Self { + val + } + #[inline(always)] + fn block_splat(block: Self::Block) -> Self { + block.as_slice()[0] + } +} +impl crate::SimdInt for i32 { + fn simd_eq(self, rhs: impl SimdInto) -> Self::Mask { + -((self == rhs.simd_into(Scalar)) as i32) + } + fn simd_lt(self, rhs: impl SimdInto) -> Self::Mask { + -((self < rhs.simd_into(Scalar)) as i32) + } + fn simd_le(self, rhs: impl SimdInto) -> Self::Mask { + -((self <= rhs.simd_into(Scalar)) as i32) + } + fn simd_ge(self, rhs: impl SimdInto) -> Self::Mask { + -((self >= rhs.simd_into(Scalar)) as i32) + } + fn simd_gt(self, rhs: impl SimdInto) -> Self::Mask { + -((self > rhs.simd_into(Scalar)) as i32) + } + fn zip_low(self, _rhs: impl SimdInto) -> Self { + self + } + fn zip_high(self, _rhs: impl SimdInto) -> Self { + self + } + fn unzip_low(self, _rhs: impl SimdInto) -> Self { + self + } + fn unzip_high(self, _rhs: impl SimdInto) -> Self { + self + } + fn min(self, rhs: impl SimdInto) -> Self { + Ord::min(self, rhs.simd_into(Scalar)) + } + fn max(self, rhs: impl SimdInto) -> Self { + Ord::max(self, rhs.simd_into(Scalar)) + } +} +impl crate::Select for i32 { + fn select(self, if_true: i32, if_false: i32) -> i32 { + if self != 0 { if_true } else { if_false } + } +} +impl crate::SimdCvtTruncate for i32 { + fn truncate_from(x: f32) -> Self { + x as Self + } +} +impl crate::SimdCvtFloat for f32 { + fn float_from(x: i32) -> Self { + x as Self + } +} +impl crate::Bytes for f32 { + type Bytes = u32; + fn to_bytes(self) -> u32 { + self.to_bits() + } + fn from_bytes(value: u32) -> Self { + f32::from_bits(value) + } +} +impl crate::SimdBase for f32 { + const N: usize = 1; + type Mask = i32; + type Block = f32x4; + #[inline(always)] + fn witness(&self) -> Scalar { + Scalar + } + #[inline(always)] + fn as_slice(&self) -> &[f32] { + core::slice::from_ref(self) + } + #[inline(always)] + fn as_mut_slice(&mut self) -> &mut [f32] { + core::slice::from_mut(self) + } + #[inline(always)] + fn from_slice(Scalar: Scalar, slice: &[f32]) -> Self { + slice[0] + } + #[inline(always)] + fn splat(Scalar: Scalar, val: f32) -> Self { + val + } + #[inline(always)] + fn 
block_splat(block: Self::Block) -> Self { + block.as_slice()[0] + } +} +impl crate::SimdFloat for f32 { + fn abs(self) -> Self { + f32::abs(self) + } + fn sqrt(self) -> Self { + f32::sqrt(self) + } + fn copysign(self, rhs: impl SimdInto) -> Self { + f32::copysign(self, rhs.simd_into(Scalar)) + } + fn simd_eq(self, rhs: impl SimdInto) -> Self::Mask { + -((self == rhs.simd_into(Scalar)) as i32) + } + fn simd_lt(self, rhs: impl SimdInto) -> Self::Mask { + -((self < rhs.simd_into(Scalar)) as i32) + } + fn simd_le(self, rhs: impl SimdInto) -> Self::Mask { + -((self <= rhs.simd_into(Scalar)) as i32) + } + fn simd_ge(self, rhs: impl SimdInto) -> Self::Mask { + -((self >= rhs.simd_into(Scalar)) as i32) + } + fn simd_gt(self, rhs: impl SimdInto) -> Self::Mask { + -((self > rhs.simd_into(Scalar)) as i32) + } + fn zip_low(self, _rhs: impl SimdInto) -> Self { + self + } + fn zip_high(self, _rhs: impl SimdInto) -> Self { + self + } + fn unzip_low(self, _rhs: impl SimdInto) -> Self { + self + } + fn unzip_high(self, _rhs: impl SimdInto) -> Self { + self + } + fn max(self, rhs: impl SimdInto) -> Self { + f32::max(self, rhs.simd_into(Scalar)) + } + fn max_precise(self, rhs: impl SimdInto) -> Self { + f32::max(self, rhs.simd_into(Scalar)) + } + fn min(self, rhs: impl SimdInto) -> Self { + f32::min(self, rhs.simd_into(Scalar)) + } + fn min_precise(self, rhs: impl SimdInto) -> Self { + f32::min(self, rhs.simd_into(Scalar)) + } + fn madd(self, op1: impl SimdInto, op2: impl SimdInto) -> Self { + self.mul_add(op1.simd_into(Scalar), op2.simd_into(Scalar)) + } + fn msub(self, op1: impl SimdInto, op2: impl SimdInto) -> Self { + self.mul_add(op1.simd_into(Scalar), -op2.simd_into(Scalar)) + } + fn floor(self) -> Self { + f32::floor(self) + } + fn fract(self) -> Self { + f32::fract(self) + } + fn trunc(self) -> Self { + f32::trunc(self) + } +} +impl crate::Select for i32 { + fn select(self, if_true: f32, if_false: f32) -> f32 { + if self != 0 { if_true } else { if_false } + } +} +impl crate::SimdMask for i8 { + fn simd_eq(self, rhs: impl SimdInto) -> Self::Mask { + -((self == rhs.simd_into(Scalar)) as i8) + } +} +impl crate::SimdMask for i16 { + fn simd_eq(self, rhs: impl SimdInto) -> Self::Mask { + -((self == rhs.simd_into(Scalar)) as i16) + } +} +impl crate::SimdMask for i32 { + fn simd_eq(self, rhs: impl SimdInto) -> Self::Mask { + -((self == rhs.simd_into(Scalar)) as i32) + } +} diff --git a/fearless_simd/src/lib.rs b/fearless_simd/src/lib.rs index 17437000..31ad8583 100644 --- a/fearless_simd/src/lib.rs +++ b/fearless_simd/src/lib.rs @@ -186,6 +186,7 @@ pub enum Level { /// The AVX2 and FMA instruction set on (32 and 64 bit) x86. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Avx2(Avx2), + Scalar(Scalar), // If new variants are added, make sure to handle them in `Level::dispatch` // and `dispatch!()` } diff --git a/fearless_simd/src/macros.rs b/fearless_simd/src/macros.rs index 1c25948d..5c0e2b5f 100644 --- a/fearless_simd/src/macros.rs +++ b/fearless_simd/src/macros.rs @@ -124,6 +124,15 @@ macro_rules! dispatch { || $op, ) } + $crate::Level::Scalar(s) => { + let $simd = launder(s); + // This vectorize call does nothing, but it is reasonable to be consistent here. 
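// Editorial sketch (assumed usage, not taken from this patch): the new
// `Level::Scalar` variant can be reached directly, e.g.
//     let s = Scalar::new();
//     let four = Simd::vectorize(s, || 2.0f32 + 2.0);
// The generated `Scalar::vectorize` simply runs the closure, with no
// `#[target_feature]` wrapper, so the call below compiles to a plain closure
// invocation; routing through it here only keeps this arm shaped like the
// other level arms of `dispatch!`.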
+ $crate::Simd::vectorize( + s, + #[inline(always)] + || $op, + ) + } _ => unreachable!(), } }}; diff --git a/fearless_simd_gen/src/main.rs b/fearless_simd_gen/src/main.rs index 3aab3b38..45c50f9f 100644 --- a/fearless_simd_gen/src/main.rs +++ b/fearless_simd_gen/src/main.rs @@ -18,6 +18,7 @@ mod mk_avx2; mod mk_fallback; mod mk_neon; mod mk_ops; +mod mk_scalar; mod mk_simd_trait; mod mk_simd_types; mod mk_sse4_2; @@ -36,6 +37,7 @@ enum Module { Fallback, Sse4_2, Avx2, + Scalar, } #[derive(Parser)] @@ -66,6 +68,7 @@ impl Module { Module::Fallback => mk_fallback::mk_fallback_impl(), Module::Sse4_2 => mk_sse4_2::mk_sse4_2_impl(), Module::Avx2 => mk_avx2::mk_avx2_impl(), + Module::Scalar => mk_scalar::mk_scalar_impl(), } } @@ -105,6 +108,7 @@ impl Module { Module::Wasm => "wasm", Module::Sse4_2 => "sse4_2", Module::Avx2 => "avx2", + Module::Scalar => "scalar", } } } @@ -118,6 +122,7 @@ const MODULES: &[Module] = &[ Module::Wasm, Module::Sse4_2, Module::Avx2, + Module::Scalar, ]; const FILE_BASE: &str = "./fearless_simd/src/generated"; diff --git a/fearless_simd_gen/src/mk_fallback.rs b/fearless_simd_gen/src/mk_fallback.rs index 1a94b986..2512c945 100644 --- a/fearless_simd_gen/src/mk_fallback.rs +++ b/fearless_simd_gen/src/mk_fallback.rs @@ -109,6 +109,41 @@ pub fn mk_fallback_impl() -> TokenStream { fn mk_simd_impl() -> TokenStream { let level_tok = Level.token(); + let methods = fallback_methods(); + + // Note: the `vectorize` implementation is pretty boilerplate and should probably + // be factored out for DRY. + quote! { + impl Simd for #level_tok { + type f32s = f32x4; + type u8s = u8x16; + type i8s = i8x16; + type u16s = u16x8; + type i16s = i16x8; + type u32s = u32x4; + type i32s = i32x4; + type mask8s = mask8x16; + type mask16s = mask16x8; + type mask32s = mask32x4; + #[inline(always)] + fn level(self) -> Level { + #[cfg(feature = "force_support_fallback")] + return Level::#level_tok(self); + #[cfg(not(feature = "force_support_fallback"))] + Level::baseline() + } + + #[inline] + fn vectorize R, R>(self, f: F) -> R { + f() + } + + #( #methods )* + } + } +} + +pub(crate) fn fallback_methods() -> Vec { let mut methods = vec![]; for vec_ty in SIMD_TYPES { let scalar_bits = vec_ty.scalar_bits; @@ -390,37 +425,7 @@ fn mk_simd_impl() -> TokenStream { methods.push(method); } } - - // Note: the `vectorize` implementation is pretty boilerplate and should probably - // be factored out for DRY. - quote! { - impl Simd for #level_tok { - type f32s = f32x4; - type u8s = u8x16; - type i8s = i8x16; - type u16s = u16x8; - type i16s = i16x8; - type u32s = u32x4; - type i32s = i32x4; - type mask8s = mask8x16; - type mask16s = mask16x8; - type mask32s = mask32x4; - #[inline(always)] - fn level(self) -> Level { - #[cfg(feature = "force_support_fallback")] - return Level::#level_tok(self); - #[cfg(not(feature = "force_support_fallback"))] - Level::baseline() - } - - #[inline] - fn vectorize R, R>(self, f: F) -> R { - f() - } - - #( #methods )* - } - } + methods } fn interleave_indices( diff --git a/fearless_simd_gen/src/mk_scalar.rs b/fearless_simd_gen/src/mk_scalar.rs new file mode 100644 index 00000000..3d1da8e3 --- /dev/null +++ b/fearless_simd_gen/src/mk_scalar.rs @@ -0,0 +1,53 @@ +use proc_macro2::TokenStream; +use quote::quote; + +use crate::{mk_fallback::fallback_methods, types::type_imports}; + +pub(crate) fn mk_scalar_impl() -> TokenStream { + let imports = type_imports(); + let methods = fallback_methods(); + + quote! 
{ + use core::ops::*; + use crate::{seal::Seal, Level, Simd, SimdInto}; + + #imports + + #[derive(Debug, Copy, Clone)] + pub struct Scalar; + + impl Scalar { + #[inline] + pub const fn new() -> Self { + Scalar + } + } + + impl Simd for Scalar { + type f32s = f32; + type u8s = u8; + type i8s = i8; + type u16s = u16; + type i16s = i16; + type u32s = u32; + type i32s = i32; + type mask8s = i8; + type mask16s = i16; + type mask32s = i32; + + #[inline(always)] + fn level(self) -> Level { + Level::Scalar(self) + } + + #[inline] + fn vectorize R, R>(self, f: F) -> R { + f() + } + + #( #methods )* + } + + impl Seal for Scalar {} + } +} diff --git a/fearless_simd_gen/src/mk_simd_types.rs b/fearless_simd_gen/src/mk_simd_types.rs index a2879bf8..d9140992 100644 --- a/fearless_simd_gen/src/mk_simd_types.rs +++ b/fearless_simd_gen/src/mk_simd_types.rs @@ -16,7 +16,7 @@ use crate::{ pub fn mk_simd_types() -> TokenStream { let mut result = quote! { - use crate::{Bytes, Select, Simd, SimdFrom, SimdInto, SimdCvtFloat, SimdCvtTruncate}; + use crate::{Bytes, Select, Simd, SimdFrom, SimdInto, SimdCvtFloat, SimdCvtTruncate, Scalar}; }; for ty in SIMD_TYPES { let name = ty.rust(); @@ -170,9 +170,231 @@ pub fn mk_simd_types() -> TokenStream { #( #cvt_impls )* }); } + + for ty in [ScalarType::Unsigned, ScalarType::Int, ScalarType::Float] { + for bits in [8, 16, 32] { + if ty == ScalarType::Float && ![32, 64].contains(&bits) { + continue; + } + result.extend(scalar_impl(ty, bits)); + } + } + for bits in [8, 16, 32] { + let ty = ScalarType::Int; + let scalar = ty.rust(bits); + result.extend(quote! { + impl crate::SimdMask<#scalar, Scalar> for #scalar { + fn simd_eq(self, rhs: impl SimdInto) -> Self::Mask { + -((self == rhs.simd_into(Scalar)) as #scalar) + } + } + }); + } + result } +fn scalar_impl(ty: ScalarType, bits: usize) -> TokenStream { + let scalar = ty.rust(bits); + let block_ty = VecType::new(ty, bits, 128 / bits).rust(); + let mask = ScalarType::Int.rust(bits); + let bytes = ScalarType::Unsigned.rust(bits); + let to_bytes = match ty { + ScalarType::Float => quote! { self.to_bits() }, + ScalarType::Int => quote! { self as #bytes }, + ScalarType::Unsigned | ScalarType::Mask => quote! { self }, + }; + let from_bytes = match ty { + ScalarType::Float => quote! { #scalar::from_bits(value) }, + ScalarType::Int => quote! { value as Self }, + ScalarType::Unsigned | ScalarType::Mask => quote! { value }, + }; + let cvt_float = match (ty, bits) { + (ScalarType::Int | ScalarType::Unsigned, 32) => quote! { + impl crate::SimdCvtTruncate for #scalar { + fn truncate_from(x: f32) -> Self { x as Self } + } + impl crate::SimdCvtFloat<#scalar> for f32 { + fn float_from(x: #scalar) -> Self { + x as Self + } + } + }, + _ => quote!(), + }; + let common = quote! 
{ + #[inline(always)] + fn simd_eq(self, rhs: impl SimdInto) -> Self::Mask { + -((self == rhs.simd_into(Scalar)) as #mask) + } + #[inline(always)] + fn simd_lt(self, rhs: impl SimdInto) -> Self::Mask { + -((self < rhs.simd_into(Scalar)) as #mask) + } + #[inline(always)] + fn simd_le(self, rhs: impl SimdInto) -> Self::Mask { + -((self <= rhs.simd_into(Scalar)) as #mask) + } + #[inline(always)] + fn simd_ge(self, rhs: impl SimdInto) -> Self::Mask { + -((self >= rhs.simd_into(Scalar)) as #mask) + } + #[inline(always)] + fn simd_gt(self, rhs: impl SimdInto) -> Self::Mask { + -((self > rhs.simd_into(Scalar)) as #mask) + } + #[inline(always)] + fn zip_low(self, _rhs: impl SimdInto) -> Self { + self + } + #[inline(always)] + fn zip_high(self, _rhs: impl SimdInto) -> Self { + self + } + #[inline(always)] + fn unzip_low(self, _rhs: impl SimdInto) -> Self { + self + } + #[inline(always)] + fn unzip_high(self, _rhs: impl SimdInto) -> Self { + self + } + }; + let ty_impl = match ty { + ScalarType::Int | ScalarType::Unsigned => quote! { + impl crate::SimdInt<#scalar, Scalar> for #scalar { + #common + + #[inline(always)] + fn min(self, rhs: impl SimdInto) -> Self { + Ord::min(self, rhs.simd_into(Scalar)) + } + #[inline(always)] + fn max(self, rhs: impl SimdInto) -> Self { + Ord::max(self, rhs.simd_into(Scalar)) + } + } + }, + ScalarType::Float => quote! { + impl crate::SimdFloat<#scalar, Scalar> for #scalar { + #common + + #[inline(always)] + fn abs(self) -> Self { + #scalar::abs(self) + } + #[inline(always)] + fn sqrt(self) -> Self { + #scalar::sqrt(self) + } + #[inline(always)] + fn copysign(self, rhs: impl SimdInto) -> Self { + #scalar::copysign(self, rhs.simd_into(Scalar)) + } + + #[inline(always)] + fn max(self, rhs: impl SimdInto) -> Self { + #scalar::max(self, rhs.simd_into(Scalar)) + } + #[inline(always)] + fn max_precise(self, rhs: impl SimdInto) -> Self { + #scalar::max(self, rhs.simd_into(Scalar)) + } + #[inline(always)] + fn min(self, rhs: impl SimdInto) -> Self { + #scalar::min(self, rhs.simd_into(Scalar)) + } + #[inline(always)] + fn min_precise(self, rhs: impl SimdInto) -> Self { + #scalar::min(self, rhs.simd_into(Scalar)) + } + #[inline(always)] + fn madd( + self, + op1: impl SimdInto, + op2: impl SimdInto, + ) -> Self { + self.mul_add(op1.simd_into(Scalar), op2.simd_into(Scalar)) + } + #[inline(always)] + fn msub( + self, + op1: impl SimdInto, + op2: impl SimdInto, + ) -> Self { + self.mul_add(op1.simd_into(Scalar), -op2.simd_into(Scalar)) + } + #[inline(always)] + fn floor(self) -> Self { + #scalar::floor(self) + } + #[inline(always)] + fn fract(self) -> Self { + #scalar::fract(self) + } + #[inline(always)] + fn trunc(self) -> Self { + #scalar::trunc(self) + } + } + }, + _ => quote!(), + }; + quote! 
{ + impl crate::Bytes for #scalar { + type Bytes = #bytes; + fn to_bytes(self) -> #bytes { #to_bytes } + fn from_bytes(value: #bytes) -> Self { #from_bytes } + } + + impl crate::SimdBase<#scalar, Scalar> for #scalar { + const N: usize = 1; + type Mask = #mask; + type Block = #block_ty; + + #[inline(always)] + fn witness(&self) -> Scalar { + Scalar + } + + #[inline(always)] + fn as_slice(&self) -> &[#scalar] { + core::slice::from_ref(self) + } + + #[inline(always)] + fn as_mut_slice(&mut self) -> &mut [#scalar] { + core::slice::from_mut(self) + } + + #[inline(always)] + fn from_slice(Scalar: Scalar, slice: &[#scalar]) -> Self { + slice[0] + } + + #[inline(always)] + fn splat(Scalar: Scalar, val: #scalar) -> Self { + val + } + + #[inline(always)] + fn block_splat(block: Self::Block) -> Self { + block.as_slice()[0] + } + } + + #ty_impl + + impl crate::Select<#scalar> for #mask { + fn select(self, if_true: #scalar, if_false: #scalar) -> #scalar { + if self != 0 { if_true } else { if_false } + } + } + + #cvt_float + } +} + /// Create the impl block for the type /// /// This may go away, as possibly all methods will be subsumed by the `vec_impl`.
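// Editorial sketch (illustrative only, not generated output): with the impls
// emitted by `scalar_impl` above, a bare scalar is usable wherever a one-lane
// vector is expected. Assuming `SimdBase`, `SimdFloat` and `Select` are in
// scope and the usual `SimdInto` conversions apply, something like the
// following is expected to hold:
//     let x: f32 = SimdBase::splat(Scalar, 2.0);   // N == 1, Block = f32x4
//     let y = x.madd(3.0, 1.0);                    // 2.0 * 3.0 + 1.0 == 7.0
//     let m = y.simd_gt(5.0);                      // -1 (all bits set)
//     let z = m.select(y, 0.0);                    // picks `y` because m != 0
// i.e. the scalar path reuses the lane-wise semantics of the fallback
// implementation, just with a single lane per "vector".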