From 5b7866b8fe7f748c38f0fc815c4eee09e522ff68 Mon Sep 17 00:00:00 2001 From: Ulrich Weigand Date: Tue, 25 Nov 2025 17:23:10 -0600 Subject: [PATCH 1/3] s390x: Emit instructions from MIE4 & VXRS_EXT3 on z17 This emits & tests a bunch of instructions: * from Miscellaneous-Instruction-Extensions Facility 4: * CLZ, 64-bit * CTZ, 64-bit * from Vector-Enhancements Facility 3: * 32x4, 64x2 & 128x1 variants of the following: * Divide * Remainder * 64x2 & 128x1 multiply variants * 128x1 variants of: * Compare * CLZ * CTZ * Max * Min * Average * Negation * Evaluate Co-authored-by: Jimmy Brisson --- cranelift/codegen/src/isa/s390x/inst.isle | 180 +++- cranelift/codegen/src/isa/s390x/inst/emit.rs | 116 ++- .../codegen/src/isa/s390x/inst/emit_tests.rs | 461 ++++++++- cranelift/codegen/src/isa/s390x/inst/mod.rs | 138 ++- cranelift/codegen/src/isa/s390x/lower.isle | 430 +++++++- cranelift/codegen/src/isa/s390x/lower/isle.rs | 36 + .../isa/s390x/arithmetic-arch15.clif | 333 ++++++ .../filetests/isa/s390x/bitops-arch15.clif | 326 ++++++ .../filetests/isa/s390x/icmp-i128-arch15.clif | 243 +++++ .../isa/s390x/vec-arithmetic-arch15.clif | 51 + .../isa/s390x/vec-bitwise-arch15.clif | 946 ++++++++++++++++++ 11 files changed, 3178 insertions(+), 82 deletions(-) create mode 100644 cranelift/filetests/filetests/isa/s390x/arithmetic-arch15.clif create mode 100644 cranelift/filetests/filetests/isa/s390x/bitops-arch15.clif create mode 100644 cranelift/filetests/filetests/isa/s390x/icmp-i128-arch15.clif create mode 100644 cranelift/filetests/filetests/isa/s390x/vec-arithmetic-arch15.clif create mode 100644 cranelift/filetests/filetests/isa/s390x/vec-bitwise-arch15.clif diff --git a/cranelift/codegen/src/isa/s390x/inst.isle b/cranelift/codegen/src/isa/s390x/inst.isle index 7ac2e36f839e..db64ed797141 100644 --- a/cranelift/codegen/src/isa/s390x/inst.isle +++ b/cranelift/codegen/src/isa/s390x/inst.isle @@ -607,6 +607,14 @@ (rm Reg) (ra Reg)) + ;; Vector evaluate instruction. + (VecEvaluate + (imm u8) + (rd WritableReg) + (rn Reg) + (rm Reg) + (ra Reg)) + ;; Vector permute doubleword immediate instruction. (VecPermuteDWImm (rd WritableReg) @@ -645,6 +653,13 @@ (rn Reg) (rm Reg)) + ;; Vector integer element comparison with two register sources, + ;; setting the condition code. + (VecIntEltCmp + (op VecIntEltCmpOp) + (rn Reg) + (rm Reg)) + ;; Synthetic instruction to compare signed 128-bit values. ;; Sets CC 1 if rn > rm, sets a different CC otherwise. (VecInt128SCmpHi @@ -1117,6 +1132,8 @@ (PopcntReg) (BSwap32) (BSwap64) + (Clz64) + (Ctz64) )) ;; A shift operation. 
@@ -1170,53 +1187,82 @@ (Sub32x4) (Sub64x2) (Sub128) - ;; Multiplication (64-bit not supported) + ;; Multiplication (Mul8x16) (Mul16x8) (Mul32x4) + (Mul64x2) + (Mul128) (UMulHi8x16) (UMulHi16x8) (UMulHi32x4) + (UMulHi64x2) + (UMulHi128) (SMulHi8x16) (SMulHi16x8) (SMulHi32x4) + (SMulHi64x2) + (SMulHi128) (UMulEven8x16) (UMulEven16x8) (UMulEven32x4) + (UMulEven64x2) (SMulEven8x16) (SMulEven16x8) (SMulEven32x4) + (SMulEven64x2) (UMulOdd8x16) (UMulOdd16x8) (UMulOdd32x4) + (UMulOdd64x2) (SMulOdd8x16) (SMulOdd16x8) (SMulOdd32x4) + (SMulOdd64x2) + ;; Division and remainder + (UDiv32x4) + (UDiv64x2) + (UDiv128) + (URem32x4) + (URem64x2) + (URem128) + (SDiv32x4) + (SDiv64x2) + (SDiv128) + (SRem32x4) + (SRem64x2) + (SRem128) ;; Minimum, maximum, and average (UMax8x16) (UMax16x8) (UMax32x4) (UMax64x2) + (UMax128) (SMax8x16) (SMax16x8) (SMax32x4) (SMax64x2) + (SMax128) (UMin8x16) (UMin16x8) (UMin32x4) (UMin64x2) + (UMin128) (SMin8x16) (SMin16x8) (SMin32x4) (SMin64x2) + (SMin128) (UAvg8x16) (UAvg16x8) (UAvg32x4) (UAvg64x2) + (UAvg128) (SAvg8x16) (SAvg16x8) (SAvg32x4) (SAvg64x2) + (SAvg128) ;; Bitwise operations (And128) (Orr128) @@ -1266,10 +1312,12 @@ (Abs16x8) (Abs32x4) (Abs64x2) + (Abs128) (Neg8x16) (Neg16x8) (Neg32x4) (Neg64x2) + (Neg128) ;; Population count (Popcnt8x16) (Popcnt16x8) @@ -1280,23 +1328,29 @@ (Clz16x8) (Clz32x4) (Clz64x2) + (Clz128) (Ctz8x16) (Ctz16x8) (Ctz32x4) (Ctz64x2) + (Ctz128) ;; Unpack (UnpackULow8x16) (UnpackULow16x8) (UnpackULow32x4) + (UnpackULow64x2) (UnpackUHigh8x16) (UnpackUHigh16x8) (UnpackUHigh32x4) + (UnpackUHigh64x2) (UnpackSLow8x16) (UnpackSLow16x8) (UnpackSLow32x4) + (UnpackSLow64x2) (UnpackSHigh8x16) (UnpackSHigh16x8) (UnpackSHigh32x4) + (UnpackSHigh64x2) )) ;; A vector shift operation. @@ -1327,16 +1381,26 @@ (CmpEq16x8) (CmpEq32x4) (CmpEq64x2) + (CmpEq128) (SCmpHi8x16) (SCmpHi16x8) (SCmpHi32x4) (SCmpHi64x2) + (SCmpHi128) (UCmpHi8x16) (UCmpHi16x8) (UCmpHi32x4) (UCmpHi64x2) + (UCmpHi128) )) +;; An integer vector element comparion operation. +(type VecIntEltCmpOp + (enum + (SCmp128) + (UCmp128) +)); + ;; A floatint-point vector comparison operation. (type VecFloatCmpOp (enum @@ -1493,11 +1557,19 @@ (extern extractor mie3_enabled mie3_enabled) (decl mie3_disabled () Type) (extern extractor mie3_disabled mie3_disabled) +(decl mie4_enabled () Type) +(extern extractor mie4_enabled mie4_enabled) +(decl mie4_disabled () Type) +(extern extractor mie4_disabled mie4_disabled) (decl vxrs_ext2_enabled () Type) (extern extractor vxrs_ext2_enabled vxrs_ext2_enabled) (decl vxrs_ext2_disabled () Type) (extern extractor vxrs_ext2_disabled vxrs_ext2_disabled) +(decl vxrs_ext3_enabled () Type) +(extern extractor vxrs_ext3_enabled vxrs_ext3_enabled) +(decl vxrs_ext3_disabled () Type) +(extern extractor vxrs_ext3_disabled vxrs_ext3_disabled) ;; Helpers for SIMD lane number operations ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -2421,6 +2493,13 @@ (_ Unit (emit (MInst.VecPermute dst src1 src2 src3)))) dst)) +;; Helper for emitting `MInst.VecEvaluate` instructions. +(decl vec_eval (Type u8 Reg Reg Reg) Reg) +(rule (vec_eval ty op src1 src2 src3) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecEvaluate op dst src1 src2 src3)))) + dst)) + ;; Helper for emitting `MInst.VecPermuteDWImm` instructions. 
(decl vec_permute_dw_imm (Type Reg u8 Reg u8) Reg) (rule (vec_permute_dw_imm ty src1 idx1 src2 idx2) @@ -2454,6 +2533,11 @@ (let ((tmp WritableReg (temp_writable_reg ty))) (ProducesFlags.ProducesFlagsSideEffect (MInst.VecFloatCmpS op tmp src1 src2)))) +;; Helper for emitting `MInst.VecIntEltCmp` instructions. +(decl vec_int_elt_cmp (VecIntEltCmpOp Reg Reg) ProducesFlags) +(rule (vec_int_elt_cmp op src1 src2) + (ProducesFlags.ProducesFlagsSideEffect (MInst.VecIntEltCmp op src1 src2))) + ;; Helper for emitting `MInst.VecInt128SCmpHi` instructions. (decl vec_int128_scmphi (Reg Reg) ProducesBool) (rule (vec_int128_scmphi src1 src2) @@ -3618,28 +3702,37 @@ ;; Helpers for generating `clz` and `ctz` instructions ;;;;;;;;;;;;;;;;;;;;;;;;; -;; Count leading zeroes. For a zero input, return the specified value. -(decl clz_reg (i16 Reg) Reg) +;; Count leading zeroes via FLOGR. For a zero input, return the specified value. +(decl clz_flogr_reg (i16 Reg) Reg) ;; The flogr instruction returns 64 for zero input by default. -(rule (clz_reg 64 x) +(rule (clz_flogr_reg 64 x) (let ((dst WritableRegPair (temp_writable_regpair $I64)) (_ Unit (emit (MInst.Flogr dst x)))) (regpair_hi dst))) ;; If another zero return value was requested, we need to override the flogr result. -(rule -1 (clz_reg zeroval x) +(rule -1 (clz_flogr_reg zeroval x) (let ((tmp WritableRegPair (temp_writable_regpair $I64))) (with_flags_reg (ProducesFlags.ProducesFlagsSideEffect (MInst.Flogr tmp x)) (cmov_imm $I64 (intcc_as_cond (IntCC.Equal)) zeroval (regpair_hi tmp))))) +;; Count leading zeros (z17 instruction). +(decl clz_reg (Reg) Reg) +(rule (clz_reg x) (unary_rr $I64 (UnaryOp.Clz64) x)) + +;; Count trailing zeros (z17 instruction). +(decl ctz_reg (Reg) Reg) +(rule (ctz_reg x) (unary_rr $I64 (UnaryOp.Ctz64) x)) + ;; Vector count leading zeros. (decl vecop_clz (Type) VecUnaryOp) (rule (vecop_clz $I8X16) (VecUnaryOp.Clz8x16)) (rule (vecop_clz $I16X8) (VecUnaryOp.Clz16x8)) (rule (vecop_clz $I32X4) (VecUnaryOp.Clz32x4)) (rule (vecop_clz $I64X2) (VecUnaryOp.Clz64x2)) +(rule (vecop_clz $I128) (VecUnaryOp.Clz128)) (decl vec_clz (Type Reg) Reg) (rule (vec_clz ty x) (vec_rr ty (vecop_clz ty) x)) @@ -3650,6 +3743,7 @@ (rule (vecop_ctz $I16X8) (VecUnaryOp.Ctz16x8)) (rule (vecop_ctz $I32X4) (VecUnaryOp.Ctz32x4)) (rule (vecop_ctz $I64X2) (VecUnaryOp.Ctz64x2)) +(rule (vecop_ctz $I128) (VecUnaryOp.Ctz128)) (decl vec_ctz (Type Reg) Reg) (rule (vec_ctz ty x) (vec_rr ty (vecop_ctz ty) x)) @@ -3900,7 +3994,8 @@ (rule (vecop_mul $I8X16) (VecBinaryOp.Mul8x16)) (rule (vecop_mul $I16X8) (VecBinaryOp.Mul16x8)) (rule (vecop_mul $I32X4) (VecBinaryOp.Mul32x4)) -;; No support for $I64X2 multiplication. +(rule (vecop_mul $I64X2) (VecBinaryOp.Mul64x2)) +(rule (vecop_mul $I128) (VecBinaryOp.Mul128)) (decl vec_mul (Type Reg Reg) Reg) (rule (vec_mul ty x y) (vec_rrr ty (vecop_mul ty) x y)) @@ -3909,7 +4004,8 @@ (rule (vecop_umulhi $I8X16) (VecBinaryOp.UMulHi8x16)) (rule (vecop_umulhi $I16X8) (VecBinaryOp.UMulHi16x8)) (rule (vecop_umulhi $I32X4) (VecBinaryOp.UMulHi32x4)) -;; No support for $I64X2 multiplication. +(rule (vecop_umulhi $I64X2) (VecBinaryOp.UMulHi64x2)) +(rule (vecop_umulhi $I128) (VecBinaryOp.UMulHi128)) (decl vec_umulhi (Type Reg Reg) Reg) (rule (vec_umulhi ty x y) (vec_rrr ty (vecop_umulhi ty) x y)) @@ -3918,7 +4014,8 @@ (rule (vecop_smulhi $I8X16) (VecBinaryOp.SMulHi8x16)) (rule (vecop_smulhi $I16X8) (VecBinaryOp.SMulHi16x8)) (rule (vecop_smulhi $I32X4) (VecBinaryOp.SMulHi32x4)) -;; No support for $I64X2 multiplication. 
+(rule (vecop_smulhi $I64X2) (VecBinaryOp.SMulHi64x2)) +(rule (vecop_smulhi $I128) (VecBinaryOp.SMulHi128)) (decl vec_smulhi (Type Reg Reg) Reg) (rule (vec_smulhi ty x y) (vec_rrr ty (vecop_smulhi ty) x y)) @@ -3927,7 +4024,7 @@ (rule (vecop_umul_even $I8X16) (VecBinaryOp.UMulEven8x16)) (rule (vecop_umul_even $I16X8) (VecBinaryOp.UMulEven16x8)) (rule (vecop_umul_even $I32X4) (VecBinaryOp.UMulEven32x4)) -;; No support for $I64X2 multiplication. +(rule (vecop_umul_even $I64X2) (VecBinaryOp.UMulEven64x2)) (decl vec_umul_even (Type Reg Reg) Reg) (rule (vec_umul_even ty x y) (vec_rrr ty (vecop_umul_even ty) x y)) @@ -3936,7 +4033,7 @@ (rule (vecop_smul_even $I8X16) (VecBinaryOp.SMulEven8x16)) (rule (vecop_smul_even $I16X8) (VecBinaryOp.SMulEven16x8)) (rule (vecop_smul_even $I32X4) (VecBinaryOp.SMulEven32x4)) -;; No support for $I64X2 multiplication. +(rule (vecop_smul_even $I64X2) (VecBinaryOp.SMulEven64x2)) (decl vec_smul_even (Type Reg Reg) Reg) (rule (vec_smul_even ty x y) (vec_rrr ty (vecop_smul_even ty) x y)) @@ -3945,7 +4042,7 @@ (rule (vecop_umul_odd $I8X16) (VecBinaryOp.UMulOdd8x16)) (rule (vecop_umul_odd $I16X8) (VecBinaryOp.UMulOdd16x8)) (rule (vecop_umul_odd $I32X4) (VecBinaryOp.UMulOdd32x4)) -;; No support for $I64X2 multiplication. +(rule (vecop_umul_odd $I64X2) (VecBinaryOp.UMulOdd64x2)) (decl vec_umul_odd (Type Reg Reg) Reg) (rule (vec_umul_odd ty x y) (vec_rrr ty (vecop_umul_odd ty) x y)) @@ -3954,12 +4051,47 @@ (rule (vecop_smul_odd $I8X16) (VecBinaryOp.SMulOdd8x16)) (rule (vecop_smul_odd $I16X8) (VecBinaryOp.SMulOdd16x8)) (rule (vecop_smul_odd $I32X4) (VecBinaryOp.SMulOdd32x4)) -;; No support for $I64X2 multiplication. +(rule (vecop_smul_odd $I64X2) (VecBinaryOp.SMulOdd64x2)) (decl vec_smul_odd (Type Reg Reg) Reg) (rule (vec_smul_odd ty x y) (vec_rrr ty (vecop_smul_odd ty) x y)) +;; Helpers for generating vector divide instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(decl vecop_sdiv (Type) VecBinaryOp) +(rule (vecop_sdiv $I32X4) (VecBinaryOp.SDiv32x4)) +(rule (vecop_sdiv $I64X2) (VecBinaryOp.SDiv64x2)) +(rule (vecop_sdiv $I128) (VecBinaryOp.SDiv128)) + +(decl vec_sdiv (Type Reg Reg) Reg) +(rule (vec_sdiv ty x y) (vec_rrr ty (vecop_sdiv ty) x y)) + +(decl vecop_udiv (Type) VecBinaryOp) +(rule (vecop_udiv $I32X4) (VecBinaryOp.UDiv32x4)) +(rule (vecop_udiv $I64X2) (VecBinaryOp.UDiv64x2)) +(rule (vecop_udiv $I128) (VecBinaryOp.UDiv128)) + +(decl vec_udiv (Type Reg Reg) Reg) +(rule (vec_udiv ty x y) (vec_rrr ty (vecop_udiv ty) x y)) + +(decl vecop_srem (Type) VecBinaryOp) +(rule (vecop_srem $I32X4) (VecBinaryOp.SRem32x4)) +(rule (vecop_srem $I64X2) (VecBinaryOp.SRem64x2)) +(rule (vecop_srem $I128) (VecBinaryOp.SRem128)) + +(decl vec_srem (Type Reg Reg) Reg) +(rule (vec_srem ty x y) (vec_rrr ty (vecop_srem ty) x y)) + +(decl vecop_urem (Type) VecBinaryOp) +(rule (vecop_urem $I32X4) (VecBinaryOp.URem32x4)) +(rule (vecop_urem $I64X2) (VecBinaryOp.URem64x2)) +(rule (vecop_urem $I128) (VecBinaryOp.URem128)) + +(decl vec_urem (Type Reg Reg) Reg) +(rule (vec_urem ty x y) (vec_rrr ty (vecop_urem ty) x y)) + + ;; Helpers for generating `udivmod` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (decl udivmod (Type RegPair Reg) RegPair) @@ -3981,6 +4113,7 @@ (rule (vecop_umax $I16X8) (VecBinaryOp.UMax16x8)) (rule (vecop_umax $I32X4) (VecBinaryOp.UMax32x4)) (rule (vecop_umax $I64X2) (VecBinaryOp.UMax64x2)) +(rule (vecop_umax $I128) (VecBinaryOp.UMax128)) (decl vec_umax (Type Reg Reg) Reg) (rule (vec_umax ty x y) (vec_rrr ty (vecop_umax ty) x y)) @@ -3993,6 +4126,7 @@ (rule (vecop_smax $I16X8) 
(VecBinaryOp.SMax16x8)) (rule (vecop_smax $I32X4) (VecBinaryOp.SMax32x4)) (rule (vecop_smax $I64X2) (VecBinaryOp.SMax64x2)) +(rule (vecop_smax $I128) (VecBinaryOp.SMax128)) (decl vec_smax (Type Reg Reg) Reg) (rule (vec_smax ty x y) (vec_rrr ty (vecop_smax ty) x y)) @@ -4005,6 +4139,7 @@ (rule (vecop_umin $I16X8) (VecBinaryOp.UMin16x8)) (rule (vecop_umin $I32X4) (VecBinaryOp.UMin32x4)) (rule (vecop_umin $I64X2) (VecBinaryOp.UMin64x2)) +(rule (vecop_umin $I128) (VecBinaryOp.UMin128)) (decl vec_umin (Type Reg Reg) Reg) (rule (vec_umin ty x y) (vec_rrr ty (vecop_umin ty) x y)) @@ -4017,6 +4152,7 @@ (rule (vecop_smin $I16X8) (VecBinaryOp.SMin16x8)) (rule (vecop_smin $I32X4) (VecBinaryOp.SMin32x4)) (rule (vecop_smin $I64X2) (VecBinaryOp.SMin64x2)) +(rule (vecop_smin $I128) (VecBinaryOp.SMin128)) (decl vec_smin (Type Reg Reg) Reg) (rule (vec_smin ty x y) (vec_rrr ty (vecop_smin ty) x y)) @@ -4213,6 +4349,7 @@ (rule (vecop_abs $I16X8) (VecUnaryOp.Abs16x8)) (rule (vecop_abs $I32X4) (VecUnaryOp.Abs32x4)) (rule (vecop_abs $I64X2) (VecUnaryOp.Abs64x2)) +(rule (vecop_abs $I128) (VecUnaryOp.Abs128)) (decl vec_abs (Type Reg) Reg) (rule (vec_abs ty x) (vec_rr ty (vecop_abs ty) x)) @@ -4240,6 +4377,7 @@ (rule (vecop_neg $I16X8) (VecUnaryOp.Neg16x8)) (rule (vecop_neg $I32X4) (VecUnaryOp.Neg32x4)) (rule (vecop_neg $I64X2) (VecUnaryOp.Neg64x2)) +(rule (vecop_neg $I128) (VecUnaryOp.Neg128)) (decl vec_neg (Type Reg) Reg) (rule (vec_neg ty x) (vec_rr ty (vecop_neg ty) x)) @@ -4871,9 +5009,9 @@ (rule (vecop_int_cmpeq (multi_lane 64 2)) (VecIntCmpOp.CmpEq64x2)) (decl vec_cmpeq (Type Reg Reg) Reg) -(rule (vec_cmpeq (ty_vec128 ty) x y) (vec_int_cmp ty (vecop_int_cmpeq ty) x y)) +(rule (vec_cmpeq (vr128_ty ty) x y) (vec_int_cmp ty (vecop_int_cmpeq ty) x y)) (decl vec_cmpeqs (Type Reg Reg) ProducesFlags) -(rule (vec_cmpeqs (ty_vec128 ty) x y) (vec_int_cmps ty (vecop_int_cmpeq ty) x y)) +(rule (vec_cmpeqs (vr128_ty ty) x y) (vec_int_cmps ty (vecop_int_cmpeq ty) x y)) (decl vecop_int_cmph (Type) VecIntCmpOp) (rule (vecop_int_cmph (multi_lane 8 16)) (VecIntCmpOp.SCmpHi8x16)) @@ -4882,9 +5020,9 @@ (rule (vecop_int_cmph (multi_lane 64 2)) (VecIntCmpOp.SCmpHi64x2)) (decl vec_cmph (Type Reg Reg) Reg) -(rule (vec_cmph (ty_vec128 ty) x y) (vec_int_cmp ty (vecop_int_cmph ty) x y)) +(rule (vec_cmph (vr128_ty ty) x y) (vec_int_cmp ty (vecop_int_cmph ty) x y)) (decl vec_cmphs (Type Reg Reg) ProducesFlags) -(rule (vec_cmphs (ty_vec128 ty) x y) (vec_int_cmps ty (vecop_int_cmph ty) x y)) +(rule (vec_cmphs (vr128_ty ty) x y) (vec_int_cmps ty (vecop_int_cmph ty) x y)) (decl vecop_int_cmphl (Type) VecIntCmpOp) (rule (vecop_int_cmphl (multi_lane 8 16)) (VecIntCmpOp.UCmpHi8x16)) @@ -4893,9 +5031,15 @@ (rule (vecop_int_cmphl (multi_lane 64 2)) (VecIntCmpOp.UCmpHi64x2)) (decl vec_cmphl (Type Reg Reg) Reg) -(rule (vec_cmphl (ty_vec128 ty) x y) (vec_int_cmp ty (vecop_int_cmphl ty) x y)) +(rule (vec_cmphl (vr128_ty ty) x y) (vec_int_cmp ty (vecop_int_cmphl ty) x y)) (decl vec_cmphls (Type Reg Reg) ProducesFlags) -(rule (vec_cmphls (ty_vec128 ty) x y) (vec_int_cmps ty (vecop_int_cmphl ty) x y)) +(rule (vec_cmphls (vr128_ty ty) x y) (vec_int_cmps ty (vecop_int_cmphl ty) x y)) + +(decl vec_elt_icmps (Reg Reg) ProducesFlags) +(rule (vec_elt_icmps x y) (vec_int_elt_cmp (VecIntEltCmpOp.SCmp128) x y)) + +(decl vec_elt_icmpu (Reg Reg) ProducesFlags) +(rule (vec_elt_icmpu x y) (vec_int_elt_cmp (VecIntEltCmpOp.UCmp128) x y)) ;; Helpers for generating `fcmp` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git 
a/cranelift/codegen/src/isa/s390x/inst/emit.rs b/cranelift/codegen/src/isa/s390x/inst/emit.rs index 5baef0fede8a..944f63ed5842 100644 --- a/cranelift/codegen/src/isa/s390x/inst/emit.rs +++ b/cranelift/codegen/src/isa/s390x/inst/emit.rs @@ -1072,6 +1072,31 @@ fn enc_vri_c(opcode: u16, v1: Reg, i2: u16, v3: Reg, m4: u8) -> [u8; 6] { enc } +/// VRIk-type instructions. +/// +/// 47 39 35 31 27 23 15 11 7 +/// opcode1 v1 v2 v3 - i5 v4 rxb opcode2 +/// 40 36 32 28 24 16 12 8 0 +/// +fn enc_vri_k(opcode: u16, i5: u8, v1: Reg, v2: Reg, v3: Reg, v4: Reg) -> [u8; 6] { + let opcode1 = ((opcode >> 8) & 0xff) as u8; + let opcode2 = (opcode & 0xff) as u8; + let rxb = rxb(Some(v1), Some(v2), Some(v3), Some(v4)); + let v1 = machreg_to_vr(v1) & 0x0f; + let v2 = machreg_to_vr(v2) & 0x0f; + let v3 = machreg_to_vr(v3) & 0x0f; + let v4 = machreg_to_vr(v4) & 0x0f; + + let mut enc: [u8; 6] = [0; 6]; + enc[0] = opcode1; + enc[1] = v1 << 4 | v2; + enc[2] = v3 << 4; + enc[3] = i5; + enc[4] = v4 << 4 | rxb; + enc[5] = opcode2; + enc +} + /// VRRa-type instructions. /// /// 47 39 35 31 23 19 15 11 7 @@ -1437,8 +1462,12 @@ impl Inst { InstructionSet::Base => true, // Miscellaneous-Instruction-Extensions Facility 3 (z15) InstructionSet::MIE3 => emit_info.isa_flags.has_mie3(), + // Miscellaneous-Instruction-Extensions Facility 4 (z17) + InstructionSet::MIE4 => emit_info.isa_flags.has_mie4(), // Vector-Enhancements Facility 2 (z15) InstructionSet::VXRS_EXT2 => emit_info.isa_flags.has_vxrs_ext2(), + // Vector-Enhancements Facility 3 (z17) + InstructionSet::VXRS_EXT3 => emit_info.isa_flags.has_vxrs_ext3(), } }; let isa_requirements = self.available_in_isa(); @@ -1884,6 +1913,14 @@ impl Inst { let opcode = 0xb90f; // LRVRG put(sink, &enc_rre(opcode, rd.to_reg(), rn)); } + UnaryOp::Clz64 => { + let opcode = 0xb968; // CLZG + put(sink, &enc_rre(opcode, rd.to_reg(), rn)); + } + UnaryOp::Ctz64 => { + let opcode = 0xb969; // CTZG + put(sink, &enc_rre(opcode, rd.to_reg(), rn)); + } } } @@ -2663,48 +2700,76 @@ impl Inst { VecBinaryOp::Mul8x16 => (0xe7a2, 0), // VMLB VecBinaryOp::Mul16x8 => (0xe7a2, 1), // VMLHW VecBinaryOp::Mul32x4 => (0xe7a2, 2), // VMLF + VecBinaryOp::Mul64x2 => (0xe7a2, 3), // VMLG + VecBinaryOp::Mul128 => (0xe7a2, 4), // VMLQ VecBinaryOp::UMulHi8x16 => (0xe7a1, 0), // VMLHB VecBinaryOp::UMulHi16x8 => (0xe7a1, 1), // VMLHH VecBinaryOp::UMulHi32x4 => (0xe7a1, 2), // VMLHF + VecBinaryOp::UMulHi64x2 => (0xe7a1, 3), // VMLHG + VecBinaryOp::UMulHi128 => (0xe7a1, 4), // VMLHQ VecBinaryOp::SMulHi8x16 => (0xe7a3, 0), // VMHB VecBinaryOp::SMulHi16x8 => (0xe7a3, 1), // VMHH VecBinaryOp::SMulHi32x4 => (0xe7a3, 2), // VMHF + VecBinaryOp::SMulHi64x2 => (0xe7a3, 3), // VMHG + VecBinaryOp::SMulHi128 => (0xe7a3, 4), // VMHQ VecBinaryOp::UMulEven8x16 => (0xe7a4, 0), // VMLEB VecBinaryOp::UMulEven16x8 => (0xe7a4, 1), // VMLEH VecBinaryOp::UMulEven32x4 => (0xe7a4, 2), // VMLEF + VecBinaryOp::UMulEven64x2 => (0xe7a4, 3), // VMLEG VecBinaryOp::SMulEven8x16 => (0xe7a6, 0), // VMEB VecBinaryOp::SMulEven16x8 => (0xe7a6, 1), // VMEH VecBinaryOp::SMulEven32x4 => (0xe7a6, 2), // VMEF + VecBinaryOp::SMulEven64x2 => (0xe7a6, 3), // VMEG VecBinaryOp::UMulOdd8x16 => (0xe7a5, 0), // VMLOB VecBinaryOp::UMulOdd16x8 => (0xe7a5, 1), // VMLOH VecBinaryOp::UMulOdd32x4 => (0xe7a5, 2), // VMLOF + VecBinaryOp::UMulOdd64x2 => (0xe7a5, 3), // VMLOG VecBinaryOp::SMulOdd8x16 => (0xe7a7, 0), // VMOB VecBinaryOp::SMulOdd16x8 => (0xe7a7, 1), // VMOH VecBinaryOp::SMulOdd32x4 => (0xe7a7, 2), // VMOF + VecBinaryOp::SMulOdd64x2 => (0xe7a7, 3), // VMOG + 
VecBinaryOp::UDiv32x4 => (0xe7b0, 2), // VDLF + VecBinaryOp::UDiv64x2 => (0xe7b0, 3), // VDLG + VecBinaryOp::UDiv128 => (0xe7b0, 4), // VDLQ + VecBinaryOp::SDiv32x4 => (0xe7b2, 2), // VDF + VecBinaryOp::SDiv64x2 => (0xe7b2, 3), // VDG + VecBinaryOp::SDiv128 => (0xe7b2, 4), // VDQ + VecBinaryOp::URem32x4 => (0xe7b1, 2), // VRLF + VecBinaryOp::URem64x2 => (0xe7b1, 3), // VRLG + VecBinaryOp::URem128 => (0xe7b1, 4), // VRLQ + VecBinaryOp::SRem32x4 => (0xe7b3, 2), // VRF + VecBinaryOp::SRem64x2 => (0xe7b3, 3), // VRG + VecBinaryOp::SRem128 => (0xe7b3, 4), // VRQ VecBinaryOp::UMax8x16 => (0xe7fd, 0), // VMXLB VecBinaryOp::UMax16x8 => (0xe7fd, 1), // VMXLH VecBinaryOp::UMax32x4 => (0xe7fd, 2), // VMXLF VecBinaryOp::UMax64x2 => (0xe7fd, 3), // VMXLG + VecBinaryOp::UMax128 => (0xe7fd, 4), // VMXLQ VecBinaryOp::SMax8x16 => (0xe7ff, 0), // VMXB VecBinaryOp::SMax16x8 => (0xe7ff, 1), // VMXH VecBinaryOp::SMax32x4 => (0xe7ff, 2), // VMXF VecBinaryOp::SMax64x2 => (0xe7ff, 3), // VMXG + VecBinaryOp::SMax128 => (0xe7ff, 4), // VMXQ VecBinaryOp::UMin8x16 => (0xe7fc, 0), // VMNLB VecBinaryOp::UMin16x8 => (0xe7fc, 1), // VMNLH VecBinaryOp::UMin32x4 => (0xe7fc, 2), // VMNLF VecBinaryOp::UMin64x2 => (0xe7fc, 3), // VMNLG + VecBinaryOp::UMin128 => (0xe7fc, 4), // VMNLQ VecBinaryOp::SMin8x16 => (0xe7fe, 0), // VMNB VecBinaryOp::SMin16x8 => (0xe7fe, 1), // VMNH VecBinaryOp::SMin32x4 => (0xe7fe, 2), // VMNF VecBinaryOp::SMin64x2 => (0xe7fe, 3), // VMNG + VecBinaryOp::SMin128 => (0xe7fe, 4), // VMNQ VecBinaryOp::UAvg8x16 => (0xe7f0, 0), // VAVGLB VecBinaryOp::UAvg16x8 => (0xe7f0, 1), // VAVGLH VecBinaryOp::UAvg32x4 => (0xe7f0, 2), // VAVGLF VecBinaryOp::UAvg64x2 => (0xe7f0, 3), // VAVGLG + VecBinaryOp::UAvg128 => (0xe7f0, 4), // VAVGLQ VecBinaryOp::SAvg8x16 => (0xe7f2, 0), // VAVGB VecBinaryOp::SAvg16x8 => (0xe7f2, 1), // VAVGH VecBinaryOp::SAvg32x4 => (0xe7f2, 2), // VAVGF VecBinaryOp::SAvg64x2 => (0xe7f2, 3), // VAVGG + VecBinaryOp::SAvg128 => (0xe7f2, 4), // VAVGQ VecBinaryOp::And128 => (0xe768, 0), // VN VecBinaryOp::Orr128 => (0xe76a, 0), // VO VecBinaryOp::Xor128 => (0xe76d, 0), // VX @@ -2739,7 +2804,27 @@ impl Inst { VecBinaryOp::MergeHigh64x2 => (0xe761, 3), // VMRHG }; - put(sink, &enc_vrr_c(opcode, rd.to_reg(), rn, rm, m4, 0, 0)); + let enc = &enc_vrr_c(opcode, rd.to_reg(), rn, rm, m4, 0, 0); + let may_trap = match op { + VecBinaryOp::UDiv32x4 + | VecBinaryOp::UDiv64x2 + | VecBinaryOp::UDiv128 + | VecBinaryOp::SDiv32x4 + | VecBinaryOp::SDiv64x2 + | VecBinaryOp::SDiv128 + | VecBinaryOp::URem32x4 + | VecBinaryOp::URem64x2 + | VecBinaryOp::URem128 + | VecBinaryOp::SRem32x4 + | VecBinaryOp::SRem64x2 + | VecBinaryOp::SRem128 => true, + _ => false, + }; + if may_trap { + put_with_trap(sink, enc, TrapCode::INTEGER_DIVISION_BY_ZERO); + } else { + put(sink, enc); + } } &Inst::VecRR { op, rd, rn } => { let (opcode, m3) = match op { @@ -2747,10 +2832,12 @@ impl Inst { VecUnaryOp::Abs16x8 => (0xe7df, 1), // VLPH VecUnaryOp::Abs32x4 => (0xe7df, 2), // VLPF VecUnaryOp::Abs64x2 => (0xe7df, 3), // VLPG + VecUnaryOp::Abs128 => (0xe7df, 4), // VLPQ VecUnaryOp::Neg8x16 => (0xe7de, 0), // VLCB VecUnaryOp::Neg16x8 => (0xe7de, 1), // VLCH VecUnaryOp::Neg32x4 => (0xe7de, 2), // VLCF VecUnaryOp::Neg64x2 => (0xe7de, 3), // VLCG + VecUnaryOp::Neg128 => (0xe7de, 4), // VLCQ VecUnaryOp::Popcnt8x16 => (0xe750, 0), // VPOPCTB VecUnaryOp::Popcnt16x8 => (0xe750, 1), // VPOPCTH VecUnaryOp::Popcnt32x4 => (0xe750, 2), // VPOPCTF @@ -2759,22 +2846,28 @@ impl Inst { VecUnaryOp::Clz16x8 => (0xe753, 1), // VCLZH VecUnaryOp::Clz32x4 => (0xe753, 
2), // VCLZF VecUnaryOp::Clz64x2 => (0xe753, 3), // VCLZG + VecUnaryOp::Clz128 => (0xe753, 4), // VCLZQ VecUnaryOp::Ctz8x16 => (0xe752, 0), // VCTZB VecUnaryOp::Ctz16x8 => (0xe752, 1), // VCTZH VecUnaryOp::Ctz32x4 => (0xe752, 2), // VCTZF VecUnaryOp::Ctz64x2 => (0xe752, 3), // VCTZG + VecUnaryOp::Ctz128 => (0xe752, 4), // VCTZQ VecUnaryOp::UnpackULow8x16 => (0xe7d4, 0), // VUPLLB VecUnaryOp::UnpackULow16x8 => (0xe7d4, 1), // VUPLLH VecUnaryOp::UnpackULow32x4 => (0xe7d4, 2), // VUPLLF + VecUnaryOp::UnpackULow64x2 => (0xe7d4, 3), // VUPLLG VecUnaryOp::UnpackUHigh8x16 => (0xe7d5, 0), // VUPLHB VecUnaryOp::UnpackUHigh16x8 => (0xe7d5, 1), // VUPLHH VecUnaryOp::UnpackUHigh32x4 => (0xe7d5, 2), // VUPLHF + VecUnaryOp::UnpackUHigh64x2 => (0xe7d5, 3), // VUPLHG VecUnaryOp::UnpackSLow8x16 => (0xe7d6, 0), // VUPLB VecUnaryOp::UnpackSLow16x8 => (0xe7d6, 1), // VUPLH VecUnaryOp::UnpackSLow32x4 => (0xe7d6, 2), // VUPLF + VecUnaryOp::UnpackSLow64x2 => (0xe7d6, 3), // VUPLG VecUnaryOp::UnpackSHigh8x16 => (0xe7d7, 0), // VUPHB VecUnaryOp::UnpackSHigh16x8 => (0xe7d7, 1), // VUPHH VecUnaryOp::UnpackSHigh32x4 => (0xe7d7, 2), // VUPHF + VecUnaryOp::UnpackSHigh64x2 => (0xe7d7, 3), // VUPHG }; put(sink, &enc_vrr_a(opcode, rd.to_reg(), rn, m3, 0, 0)); @@ -2817,6 +2910,16 @@ impl Inst { let opcode = 0xe78c; // VPERM put(sink, &enc_vrr_e(opcode, rd.to_reg(), rn, rm, ra, 0, 0)); } + &Inst::VecEvaluate { + imm, + rd, + rn, + rm, + ra, + } => { + let opcode = 0xe788; //VEVAL + put(sink, &enc_vri_k(opcode, imm, rd.to_reg(), rn, rm, ra)); + } &Inst::VecPermuteDWImm { rd, rn, @@ -2835,14 +2938,17 @@ impl Inst { VecIntCmpOp::CmpEq16x8 => (0xe7f8, 1), // VCEQH VecIntCmpOp::CmpEq32x4 => (0xe7f8, 2), // VCEQF VecIntCmpOp::CmpEq64x2 => (0xe7f8, 3), // VCEQG + VecIntCmpOp::CmpEq128 => (0xe7f8, 4), // VCEQQ VecIntCmpOp::SCmpHi8x16 => (0xe7fb, 0), // VCHB VecIntCmpOp::SCmpHi16x8 => (0xe7fb, 1), // VCHH VecIntCmpOp::SCmpHi32x4 => (0xe7fb, 2), // VCHG VecIntCmpOp::SCmpHi64x2 => (0xe7fb, 3), // VCHG + VecIntCmpOp::SCmpHi128 => (0xe7fb, 4), // VCHQ VecIntCmpOp::UCmpHi8x16 => (0xe7f9, 0), // VCHLB VecIntCmpOp::UCmpHi16x8 => (0xe7f9, 1), // VCHLH VecIntCmpOp::UCmpHi32x4 => (0xe7f9, 2), // VCHLG VecIntCmpOp::UCmpHi64x2 => (0xe7f9, 3), // VCHLG + VecIntCmpOp::UCmpHi128 => (0xe7f9, 4), // VCHLQ }; let m5 = match self { &Inst::VecIntCmp { .. } => 0, @@ -2869,6 +2975,14 @@ impl Inst { put(sink, &enc_vrr_c(opcode, rd.to_reg(), rn, rm, m4, 0, m6)); } + &Inst::VecIntEltCmp { op, rn, rm } => { + let (opcode, m3) = match op { + VecIntEltCmpOp::SCmp128 => (0xe7db, 4), // VECQ + VecIntEltCmpOp::UCmp128 => (0xe7d9, 4), // VECLQ + }; + + put(sink, &enc_vrr_a(opcode, rn, rm, m3, 0, 0)); + } &Inst::VecInt128SCmpHi { tmp, rn, rm } | &Inst::VecInt128UCmpHi { tmp, rn, rm } => { // Synthetic instruction to compare 128-bit values. // Sets CC 1 if rn > rm, sets a different CC otherwise. 
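As a cross-check of the VRI-k layout implemented by enc_vri_k above, here is a standalone sketch (not part of the patch) that re-derives the byte packing and verifies it against the veval test vector added to emit_tests.rs below; it uses raw 5-bit vector register numbers in place of Reg values, and the name enc_vri_k_sketch is made up for illustration:

// Standalone sketch: VRI-k packing with registers given as plain 5-bit numbers.
fn enc_vri_k_sketch(opcode: u16, i5: u8, v1: u8, v2: u8, v3: u8, v4: u8) -> [u8; 6] {
    // RXB collects the high (fifth) bit of each vector register field.
    let rxb = ((v1 >> 4) & 1) << 3 | ((v2 >> 4) & 1) << 2 | ((v3 >> 4) & 1) << 1 | ((v4 >> 4) & 1);
    [
        (opcode >> 8) as u8,           // opcode1
        (v1 & 0xf) << 4 | (v2 & 0xf),  // v1 | v2
        (v3 & 0xf) << 4,               // v3 | reserved
        i5,                            // the 8-bit immediate (function code for VEVAL)
        (v4 & 0xf) << 4 | rxb,         // v4 | rxb
        (opcode & 0xff) as u8,         // opcode2
    ]
}

fn main() {
    // "veval %v12, %v20, %v21, %v22, 2" from the new emit tests.
    assert_eq!(
        enc_vri_k_sketch(0xe788, 0x02, 12, 20, 21, 22),
        [0xE7, 0xC4, 0x50, 0x02, 0x67, 0x88],
    );
}

The RXB nibble is what lets the four 4-bit register fields address all 32 vector registers, which is why the test vector uses registers both below and above %v15.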
diff --git a/cranelift/codegen/src/isa/s390x/inst/emit_tests.rs b/cranelift/codegen/src/isa/s390x/inst/emit_tests.rs index aa6f9e6b19c9..d69ccbd91c2a 100644 --- a/cranelift/codegen/src/isa/s390x/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/s390x/inst/emit_tests.rs @@ -1643,6 +1643,24 @@ fn test_s390x_binemit() { "B90F001A", "lrvgr %r1, %r10", )); + insns.push(( + Inst::UnaryRR { + op: UnaryOp::Clz64, + rd: writable_gpr(1), + rn: gpr(10), + }, + "B968001A", + "clzg %r1, %r10", + )); + insns.push(( + Inst::UnaryRR { + op: UnaryOp::Ctz64, + rd: writable_gpr(1), + rn: gpr(10), + }, + "B969001A", + "ctzg %r1, %r10", + )); insns.push(( Inst::CmpRR { @@ -8522,6 +8540,26 @@ fn test_s390x_binemit() { "E748C00028A2", "vmlf %v20, %v8, %v12", )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::Mul64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038A2", + "vmlg %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::Mul128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00048A2", + "vmlq %v20, %v8, %v12", + )); insns.push(( Inst::VecRRR { op: VecBinaryOp::UMulHi8x16, @@ -8552,6 +8590,26 @@ fn test_s390x_binemit() { "E748C00028A1", "vmlhf %v20, %v8, %v12", )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UMulHi64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038A1", + "vmlhg %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UMulHi128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00048A1", + "vmlhq %v20, %v8, %v12", + )); insns.push(( Inst::VecRRR { op: VecBinaryOp::SMulHi8x16, @@ -8582,6 +8640,26 @@ fn test_s390x_binemit() { "E748C00028A3", "vmhf %v20, %v8, %v12", )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SMulHi64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038A3", + "vmhg %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SMulHi128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00048A3", + "vmhq %v20, %v8, %v12", + )); insns.push(( Inst::VecRRR { op: VecBinaryOp::UMulEven8x16, @@ -8612,6 +8690,16 @@ fn test_s390x_binemit() { "E748C00028A4", "vmlef %v20, %v8, %v12", )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UMulEven64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038A4", + "vmleg %v20, %v8, %v12", + )); insns.push(( Inst::VecRRR { op: VecBinaryOp::SMulEven8x16, @@ -8642,6 +8730,16 @@ fn test_s390x_binemit() { "E748C00028A6", "vmef %v20, %v8, %v12", )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SMulEven64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038A6", + "vmeg %v20, %v8, %v12", + )); insns.push(( Inst::VecRRR { op: VecBinaryOp::UMulOdd8x16, @@ -8672,6 +8770,16 @@ fn test_s390x_binemit() { "E748C00028A5", "vmlof %v20, %v8, %v12", )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UMulOdd64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038A5", + "vmlog %v20, %v8, %v12", + )); insns.push(( Inst::VecRRR { op: VecBinaryOp::SMulOdd8x16, @@ -8702,6 +8810,136 @@ fn test_s390x_binemit() { "E748C00028A7", "vmof %v20, %v8, %v12", )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SMulOdd64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038A7", + "vmog %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UDiv32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028B0", + "vdlf %v20, %v8, %v12, 0", + )); + insns.push(( + Inst::VecRRR { + op: 
VecBinaryOp::UDiv64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038B0", + "vdlg %v20, %v8, %v12, 0", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UDiv128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00048B0", + "vdlq %v20, %v8, %v12, 0", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SDiv32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028B2", + "vdf %v20, %v8, %v12, 0", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SDiv64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038B2", + "vdg %v20, %v8, %v12, 0", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SDiv128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00048B2", + "vdq %v20, %v8, %v12, 0", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::URem32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028B1", + "vrlf %v20, %v8, %v12, 0", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::URem64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038B1", + "vrlg %v20, %v8, %v12, 0", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::URem128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00048B1", + "vrlq %v20, %v8, %v12, 0", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SRem32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028B3", + "vrf %v20, %v8, %v12, 0", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SRem64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038B3", + "vrg %v20, %v8, %v12, 0", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SRem128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00048B3", + "vrq %v20, %v8, %v12, 0", + )); insns.push(( Inst::VecRRR { op: VecBinaryOp::UMax8x16, @@ -8742,6 +8980,16 @@ fn test_s390x_binemit() { "E748C00038FD", "vmxlg %v20, %v8, %v12", )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UMax128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00048FD", + "vmxlq %v20, %v8, %v12", + )); insns.push(( Inst::VecRRR { op: VecBinaryOp::SMax8x16, @@ -8782,6 +9030,16 @@ fn test_s390x_binemit() { "E748C00038FF", "vmxg %v20, %v8, %v12", )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SMax128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00048FF", + "vmxq %v20, %v8, %v12", + )); insns.push(( Inst::VecRRR { op: VecBinaryOp::UMin8x16, @@ -8822,6 +9080,16 @@ fn test_s390x_binemit() { "E748C00038FC", "vmnlg %v20, %v8, %v12", )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UMin128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00048FC", + "vmnlq %v20, %v8, %v12", + )); insns.push(( Inst::VecRRR { op: VecBinaryOp::SMin8x16, @@ -8862,6 +9130,16 @@ fn test_s390x_binemit() { "E748C00038FE", "vmng %v20, %v8, %v12", )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SMin128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00048FE", + "vmnq %v20, %v8, %v12", + )); insns.push(( Inst::VecRRR { op: VecBinaryOp::UAvg8x16, @@ -8902,6 +9180,16 @@ fn test_s390x_binemit() { "E748C00038F0", "vavglg %v20, %v8, %v12", )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UAvg128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00048F0", + "vavglq %v20, %v8, %v12", + )); insns.push(( Inst::VecRRR { op: VecBinaryOp::SAvg8x16, @@ -8942,6 +9230,16 @@ fn test_s390x_binemit() { "E748C00038F2", "vavgg %v20, %v8, %v12", )); + insns.push(( + 
Inst::VecRRR { + op: VecBinaryOp::SAvg128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00048F2", + "vavgq %v20, %v8, %v12", + )); insns.push(( Inst::VecRRR { op: VecBinaryOp::And128, @@ -9299,6 +9597,15 @@ fn test_s390x_binemit() { "E748000038DF", "vlpg %v20, %v8", )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::Abs128, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000048DF", + "vlpq %v20, %v8", + )); insns.push(( Inst::VecRR { op: VecUnaryOp::Neg8x16, @@ -9335,6 +9642,15 @@ fn test_s390x_binemit() { "E748000038DE", "vlcg %v20, %v8", )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::Neg128, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000048DE", + "vlcq %v20, %v8", + )); insns.push(( Inst::VecRR { op: VecUnaryOp::Popcnt8x16, @@ -9407,6 +9723,15 @@ fn test_s390x_binemit() { "E74800003853", "vclzg %v20, %v8", )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::Clz128, + rd: writable_vr(20), + rn: vr(8), + }, + "E74800004853", + "vclzq %v20, %v8", + )); insns.push(( Inst::VecRR { op: VecUnaryOp::Ctz8x16, @@ -9443,6 +9768,15 @@ fn test_s390x_binemit() { "E74800003852", "vctzg %v20, %v8", )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::Ctz128, + rd: writable_vr(20), + rn: vr(8), + }, + "E74800004852", + "vctzq %v20, %v8", + )); insns.push(( Inst::VecRR { op: VecUnaryOp::UnpackULow8x16, @@ -9470,6 +9804,15 @@ fn test_s390x_binemit() { "E748000028D4", "vupllf %v20, %v8", )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::UnpackULow64x2, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000038D4", + "vupllg %v20, %v8", + )); insns.push(( Inst::VecRR { op: VecUnaryOp::UnpackUHigh8x16, @@ -9497,6 +9840,15 @@ fn test_s390x_binemit() { "E748000028D5", "vuplhf %v20, %v8", )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::UnpackUHigh64x2, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000038D5", + "vuplhg %v20, %v8", + )); insns.push(( Inst::VecRR { op: VecUnaryOp::UnpackSLow8x16, @@ -9524,6 +9876,15 @@ fn test_s390x_binemit() { "E748000028D6", "vuplf %v20, %v8", )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::UnpackSLow64x2, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000038D6", + "vuplg %v20, %v8", + )); insns.push(( Inst::VecRR { op: VecUnaryOp::UnpackSHigh8x16, @@ -9551,6 +9912,15 @@ fn test_s390x_binemit() { "E748000028D7", "vuphf %v20, %v8", )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::UnpackSHigh64x2, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000038D7", + "vuphg %v20, %v8", + )); insns.push(( Inst::VecShiftRR { @@ -9934,6 +10304,16 @@ fn test_s390x_binemit() { "E748C00038F8", "vceqg %v20, %v8, %v12", )); + insns.push(( + Inst::VecIntCmp { + op: VecIntCmpOp::CmpEq128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00048F8", + "vceqq %v20, %v8, %v12", + )); insns.push(( Inst::VecIntCmp { op: VecIntCmpOp::SCmpHi8x16, @@ -9974,6 +10354,16 @@ fn test_s390x_binemit() { "E748C00038FB", "vchg %v20, %v8, %v12", )); + insns.push(( + Inst::VecIntCmp { + op: VecIntCmpOp::SCmpHi128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00048FB", + "vchq %v20, %v8, %v12", + )); insns.push(( Inst::VecIntCmp { op: VecIntCmpOp::UCmpHi8x16, @@ -10014,6 +10404,16 @@ fn test_s390x_binemit() { "E748C00038F9", "vchlg %v20, %v8, %v12", )); + insns.push(( + Inst::VecIntCmp { + op: VecIntCmpOp::UCmpHi128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00048F9", + "vchlq %v20, %v8, %v12", + )); insns.push(( Inst::VecIntCmpS { op: VecIntCmpOp::CmpEq8x16, @@ -10054,6 +10454,16 @@ fn test_s390x_binemit() { 
"E748C01038F8", "vceqgs %v20, %v8, %v12", )); + insns.push(( + Inst::VecIntCmpS { + op: VecIntCmpOp::CmpEq128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C01048F8", + "vceqqs %v20, %v8, %v12", + )); insns.push(( Inst::VecIntCmpS { op: VecIntCmpOp::SCmpHi8x16, @@ -10094,6 +10504,16 @@ fn test_s390x_binemit() { "E748C01038FB", "vchgs %v20, %v8, %v12", )); + insns.push(( + Inst::VecIntCmpS { + op: VecIntCmpOp::SCmpHi128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C01048FB", + "vchqs %v20, %v8, %v12", + )); insns.push(( Inst::VecIntCmpS { op: VecIntCmpOp::UCmpHi8x16, @@ -10134,6 +10554,45 @@ fn test_s390x_binemit() { "E748C01038F9", "vchlgs %v20, %v8, %v12", )); + insns.push(( + Inst::VecIntCmpS { + op: VecIntCmpOp::UCmpHi128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C01048F9", + "vchlqs %v20, %v8, %v12", + )); + insns.push(( + Inst::VecIntEltCmp { + op: VecIntEltCmpOp::SCmp128, + rn: vr(20), + rm: vr(12), + }, + "E74C000048DB", + "vecq %v20, %v12", + )); + insns.push(( + Inst::VecIntEltCmp { + op: VecIntEltCmpOp::UCmp128, + rn: vr(20), + rm: vr(12), + }, + "E74C000048D9", + "veclq %v20, %v12", + )); + insns.push(( + Inst::VecEvaluate { + imm: 0x02, + rd: writable_vr(12), + rn: vr(20), + rm: vr(21), + ra: vr(22), + }, + "E7C450026788", + "veval %v12, %v20, %v21, %v22, 2", + )); insns.push(( Inst::VecInt128SCmpHi { tmp: writable_vr(20), @@ -13465,7 +13924,7 @@ fn test_s390x_binemit() { use crate::settings::Configurable; let mut isa_flag_builder = s390x_settings::builder(); - isa_flag_builder.enable("arch13").unwrap(); + isa_flag_builder.enable("arch15").unwrap(); let isa_flags = s390x_settings::Flags::new(&flags, &isa_flag_builder); let ctrl_plane = &mut Default::default(); let constants = Default::default(); diff --git a/cranelift/codegen/src/isa/s390x/inst/mod.rs b/cranelift/codegen/src/isa/s390x/inst/mod.rs index 762f6c58d27e..6247e966cde6 100644 --- a/cranelift/codegen/src/isa/s390x/inst/mod.rs +++ b/cranelift/codegen/src/isa/s390x/inst/mod.rs @@ -30,7 +30,7 @@ mod emit_tests; pub use crate::isa::s390x::lower::isle::generated_code::{ ALUOp, CmpOp, FPUOp1, FPUOp2, FPUOp3, FpuConv128Op, FpuRoundMode, FpuRoundOp, LaneOrder, MInst as Inst, RxSBGOp, ShiftOp, SymbolReloc, UnaryOp, VecBinaryOp, VecFloatCmpOp, VecIntCmpOp, - VecShiftOp, VecUnaryOp, + VecIntEltCmpOp, VecShiftOp, VecUnaryOp, }; /// The destination of a call instruction. @@ -89,8 +89,12 @@ pub(crate) enum InstructionSet { Base, /// Miscellaneous-Instruction-Extensions Facility 3 (z15) MIE3, + /// Miscellaneous-Instruction-Extensions Facility 4 (z17) + MIE4, /// Vector-Enhancements Facility 2 (z15) VXRS_EXT2, + /// Vector-Enhancements Facility 3 (z17) + VXRS_EXT3, } impl Inst { @@ -188,14 +192,10 @@ impl Inst { | Inst::FpuCmp32 { .. } | Inst::FpuCmp64 { .. } | Inst::FpuCmp128 { .. } - | Inst::VecRRR { .. } - | Inst::VecRR { .. } | Inst::VecShiftRR { .. } | Inst::VecSelect { .. } | Inst::VecPermute { .. } | Inst::VecPermuteDWImm { .. } - | Inst::VecIntCmp { .. } - | Inst::VecIntCmpS { .. } | Inst::VecFloatCmp { .. } | Inst::VecFloatCmpS { .. } | Inst::VecInt128SCmpHi { .. } @@ -251,6 +251,7 @@ impl Inst { }, Inst::UnaryRR { op, .. } => match op { UnaryOp::PopcntReg => InstructionSet::MIE3, + UnaryOp::Clz64 | UnaryOp::Ctz64 => InstructionSet::MIE4, _ => InstructionSet::Base, }, Inst::FpuRound { op, .. 
} => match op { @@ -260,6 +261,43 @@ impl Inst { FpuRoundOp::ToUInt32x4 | FpuRoundOp::FromUInt32x4 => InstructionSet::VXRS_EXT2, _ => InstructionSet::Base, }, + Inst::VecRRR { op, .. } => match op { + VecBinaryOp::Mul64x2 | VecBinaryOp::Mul128 => InstructionSet::VXRS_EXT3, + VecBinaryOp::UMulHi64x2 | VecBinaryOp::UMulHi128 => InstructionSet::VXRS_EXT3, + VecBinaryOp::SMulHi64x2 | VecBinaryOp::SMulHi128 => InstructionSet::VXRS_EXT3, + VecBinaryOp::UMulEven64x2 | VecBinaryOp::SMulEven64x2 => InstructionSet::VXRS_EXT3, + VecBinaryOp::UMulOdd64x2 | VecBinaryOp::SMulOdd64x2 => InstructionSet::VXRS_EXT3, + VecBinaryOp::UDiv32x4 | VecBinaryOp::SDiv32x4 => InstructionSet::VXRS_EXT3, + VecBinaryOp::UDiv64x2 | VecBinaryOp::SDiv64x2 => InstructionSet::VXRS_EXT3, + VecBinaryOp::UDiv128 | VecBinaryOp::SDiv128 => InstructionSet::VXRS_EXT3, + VecBinaryOp::URem32x4 | VecBinaryOp::SRem32x4 => InstructionSet::VXRS_EXT3, + VecBinaryOp::URem64x2 | VecBinaryOp::SRem64x2 => InstructionSet::VXRS_EXT3, + VecBinaryOp::URem128 | VecBinaryOp::SRem128 => InstructionSet::VXRS_EXT3, + VecBinaryOp::UMax128 | VecBinaryOp::SMax128 => InstructionSet::VXRS_EXT3, + VecBinaryOp::UMin128 | VecBinaryOp::SMin128 => InstructionSet::VXRS_EXT3, + VecBinaryOp::UAvg128 | VecBinaryOp::SAvg128 => InstructionSet::VXRS_EXT3, + _ => InstructionSet::Base, + }, + &Inst::VecRR { op, .. } => match op { + VecUnaryOp::Abs128 | VecUnaryOp::Neg128 => InstructionSet::VXRS_EXT3, + VecUnaryOp::Clz128 | VecUnaryOp::Ctz128 => InstructionSet::VXRS_EXT3, + VecUnaryOp::UnpackULow64x2 => InstructionSet::VXRS_EXT3, + VecUnaryOp::UnpackUHigh64x2 => InstructionSet::VXRS_EXT3, + VecUnaryOp::UnpackSLow64x2 => InstructionSet::VXRS_EXT3, + VecUnaryOp::UnpackSHigh64x2 => InstructionSet::VXRS_EXT3, + _ => InstructionSet::Base, + }, + &Inst::VecIntCmp { op, .. } | &Inst::VecIntCmpS { op, .. } => match op { + VecIntCmpOp::CmpEq128 => InstructionSet::VXRS_EXT3, + VecIntCmpOp::SCmpHi128 => InstructionSet::VXRS_EXT3, + VecIntCmpOp::UCmpHi128 => InstructionSet::VXRS_EXT3, + _ => InstructionSet::Base, + }, + &Inst::VecIntEltCmp { op, .. } => match op { + VecIntEltCmpOp::SCmp128 => InstructionSet::VXRS_EXT3, + VecIntEltCmpOp::UCmp128 => InstructionSet::VXRS_EXT3, + // We do not use any of the pre-z17 variants of these instructions. + }, // These are all part of VXRS_EXT2 Inst::VecLoadRev { .. } @@ -281,6 +319,8 @@ impl Inst { | Inst::VecLoadLaneRevUndef { .. } | Inst::VecStoreLaneRev { .. } => InstructionSet::VXRS_EXT2, + Inst::VecEvaluate { .. } => InstructionSet::VXRS_EXT3, + Inst::DummyUse { .. } => InstructionSet::Base, Inst::LabelAddress { .. } => InstructionSet::Base, @@ -700,13 +740,9 @@ fn s390x_get_operands(inst: &mut Inst, collector: &mut DenyReuseVisitor { - collector.reg_def(rd); - collector.reg_use(rn); - collector.reg_use(rm); - collector.reg_use(ra); - } - Inst::VecPermute { rd, rn, rm, ra, .. } => { + Inst::VecSelect { rd, rn, rm, ra, .. } + | Inst::VecPermute { rd, rn, rm, ra, .. } + | Inst::VecEvaluate { rd, rn, rm, ra, .. } => { collector.reg_def(rd); collector.reg_use(rn); collector.reg_use(rm); @@ -727,6 +763,10 @@ fn s390x_get_operands(inst: &mut Inst, collector: &mut DenyReuseVisitor { + collector.reg_use(rn); + collector.reg_use(rm); + } Inst::VecInt128SCmpHi { tmp, rn, rm, .. } | Inst::VecInt128UCmpHi { tmp, rn, rm, .. 
} => { collector.reg_def(tmp); collector.reg_use(rn); @@ -1627,6 +1667,8 @@ impl Inst { UnaryOp::PopcntReg => ("popcnt", ", 8"), UnaryOp::BSwap32 => ("lrvr", ""), UnaryOp::BSwap64 => ("lrvgr", ""), + UnaryOp::Clz64 => ("clzg", ""), + UnaryOp::Ctz64 => ("ctzg", ""), }; let rd = pretty_print_reg(rd.to_reg()); let rn = pretty_print_reg(rn); @@ -2453,6 +2495,15 @@ impl Inst { } &Inst::VecRRR { op, rd, rn, rm } => { + let m5 = match op { + VecBinaryOp::UDiv32x4 | VecBinaryOp::SDiv32x4 => ", 0", + VecBinaryOp::UDiv64x2 | VecBinaryOp::SDiv64x2 => ", 0", + VecBinaryOp::UDiv128 | VecBinaryOp::SDiv128 => ", 0", + VecBinaryOp::URem32x4 | VecBinaryOp::SRem32x4 => ", 0", + VecBinaryOp::URem64x2 | VecBinaryOp::SRem64x2 => ", 0", + VecBinaryOp::URem128 | VecBinaryOp::SRem128 => ", 0", + _ => "", + }; let op = match op { VecBinaryOp::Add8x16 => "vab", VecBinaryOp::Add16x8 => "vah", @@ -2467,48 +2518,76 @@ impl Inst { VecBinaryOp::Mul8x16 => "vmlb", VecBinaryOp::Mul16x8 => "vmlhw", VecBinaryOp::Mul32x4 => "vmlf", + VecBinaryOp::Mul64x2 => "vmlg", + VecBinaryOp::Mul128 => "vmlq", VecBinaryOp::UMulHi8x16 => "vmlhb", VecBinaryOp::UMulHi16x8 => "vmlhh", VecBinaryOp::UMulHi32x4 => "vmlhf", + VecBinaryOp::UMulHi64x2 => "vmlhg", + VecBinaryOp::UMulHi128 => "vmlhq", VecBinaryOp::SMulHi8x16 => "vmhb", VecBinaryOp::SMulHi16x8 => "vmhh", VecBinaryOp::SMulHi32x4 => "vmhf", + VecBinaryOp::SMulHi64x2 => "vmhg", + VecBinaryOp::SMulHi128 => "vmhq", VecBinaryOp::UMulEven8x16 => "vmleb", VecBinaryOp::UMulEven16x8 => "vmleh", VecBinaryOp::UMulEven32x4 => "vmlef", + VecBinaryOp::UMulEven64x2 => "vmleg", VecBinaryOp::SMulEven8x16 => "vmeb", VecBinaryOp::SMulEven16x8 => "vmeh", VecBinaryOp::SMulEven32x4 => "vmef", + VecBinaryOp::SMulEven64x2 => "vmeg", VecBinaryOp::UMulOdd8x16 => "vmlob", VecBinaryOp::UMulOdd16x8 => "vmloh", VecBinaryOp::UMulOdd32x4 => "vmlof", + VecBinaryOp::UMulOdd64x2 => "vmlog", VecBinaryOp::SMulOdd8x16 => "vmob", VecBinaryOp::SMulOdd16x8 => "vmoh", VecBinaryOp::SMulOdd32x4 => "vmof", + VecBinaryOp::SMulOdd64x2 => "vmog", + VecBinaryOp::SDiv32x4 => "vdf", + VecBinaryOp::SDiv64x2 => "vdg", + VecBinaryOp::SDiv128 => "vdq", + VecBinaryOp::UDiv32x4 => "vdlf", + VecBinaryOp::UDiv64x2 => "vdlg", + VecBinaryOp::UDiv128 => "vdlq", + VecBinaryOp::SRem32x4 => "vrf", + VecBinaryOp::SRem64x2 => "vrg", + VecBinaryOp::SRem128 => "vrq", + VecBinaryOp::URem32x4 => "vrlf", + VecBinaryOp::URem64x2 => "vrlg", + VecBinaryOp::URem128 => "vrlq", VecBinaryOp::UMax8x16 => "vmxlb", VecBinaryOp::UMax16x8 => "vmxlh", VecBinaryOp::UMax32x4 => "vmxlf", VecBinaryOp::UMax64x2 => "vmxlg", + VecBinaryOp::UMax128 => "vmxlq", VecBinaryOp::SMax8x16 => "vmxb", VecBinaryOp::SMax16x8 => "vmxh", VecBinaryOp::SMax32x4 => "vmxf", VecBinaryOp::SMax64x2 => "vmxg", + VecBinaryOp::SMax128 => "vmxq", VecBinaryOp::UMin8x16 => "vmnlb", VecBinaryOp::UMin16x8 => "vmnlh", VecBinaryOp::UMin32x4 => "vmnlf", VecBinaryOp::UMin64x2 => "vmnlg", + VecBinaryOp::UMin128 => "vmnlq", VecBinaryOp::SMin8x16 => "vmnb", VecBinaryOp::SMin16x8 => "vmnh", VecBinaryOp::SMin32x4 => "vmnf", VecBinaryOp::SMin64x2 => "vmng", + VecBinaryOp::SMin128 => "vmnq", VecBinaryOp::UAvg8x16 => "vavglb", VecBinaryOp::UAvg16x8 => "vavglh", VecBinaryOp::UAvg32x4 => "vavglf", VecBinaryOp::UAvg64x2 => "vavglg", + VecBinaryOp::UAvg128 => "vavglq", VecBinaryOp::SAvg8x16 => "vavgb", VecBinaryOp::SAvg16x8 => "vavgh", VecBinaryOp::SAvg32x4 => "vavgf", VecBinaryOp::SAvg64x2 => "vavgg", + VecBinaryOp::SAvg128 => "vavgq", VecBinaryOp::And128 => "vn", VecBinaryOp::Orr128 => "vo", VecBinaryOp::Xor128 => 
"vx", @@ -2545,7 +2624,7 @@ impl Inst { let rd = pretty_print_reg(rd.to_reg()); let rn = pretty_print_reg(rn); let rm = pretty_print_reg(rm); - format!("{op} {rd}, {rn}, {rm}") + format!("{op} {rd}, {rn}, {rm}{m5}") } &Inst::VecRR { op, rd, rn } => { let op = match op { @@ -2553,10 +2632,12 @@ impl Inst { VecUnaryOp::Abs16x8 => "vlph", VecUnaryOp::Abs32x4 => "vlpf", VecUnaryOp::Abs64x2 => "vlpg", + VecUnaryOp::Abs128 => "vlpq", VecUnaryOp::Neg8x16 => "vlcb", VecUnaryOp::Neg16x8 => "vlch", VecUnaryOp::Neg32x4 => "vlcf", VecUnaryOp::Neg64x2 => "vlcg", + VecUnaryOp::Neg128 => "vlcq", VecUnaryOp::Popcnt8x16 => "vpopctb", VecUnaryOp::Popcnt16x8 => "vpopcth", VecUnaryOp::Popcnt32x4 => "vpopctf", @@ -2565,22 +2646,28 @@ impl Inst { VecUnaryOp::Clz16x8 => "vclzh", VecUnaryOp::Clz32x4 => "vclzf", VecUnaryOp::Clz64x2 => "vclzg", + VecUnaryOp::Clz128 => "vclzq", VecUnaryOp::Ctz8x16 => "vctzb", VecUnaryOp::Ctz16x8 => "vctzh", VecUnaryOp::Ctz32x4 => "vctzf", VecUnaryOp::Ctz64x2 => "vctzg", + VecUnaryOp::Ctz128 => "vctzq", VecUnaryOp::UnpackULow8x16 => "vupllb", VecUnaryOp::UnpackULow16x8 => "vupllh", VecUnaryOp::UnpackULow32x4 => "vupllf", + VecUnaryOp::UnpackULow64x2 => "vupllg", VecUnaryOp::UnpackUHigh8x16 => "vuplhb", VecUnaryOp::UnpackUHigh16x8 => "vuplhh", VecUnaryOp::UnpackUHigh32x4 => "vuplhf", + VecUnaryOp::UnpackUHigh64x2 => "vuplhg", VecUnaryOp::UnpackSLow8x16 => "vuplb", VecUnaryOp::UnpackSLow16x8 => "vuplh", VecUnaryOp::UnpackSLow32x4 => "vuplf", + VecUnaryOp::UnpackSLow64x2 => "vuplg", VecUnaryOp::UnpackSHigh8x16 => "vuphb", VecUnaryOp::UnpackSHigh16x8 => "vuphh", VecUnaryOp::UnpackSHigh32x4 => "vuphf", + VecUnaryOp::UnpackSHigh64x2 => "vuphg", }; let rd = pretty_print_reg(rd.to_reg()); let rn = pretty_print_reg(rn); @@ -2634,6 +2721,19 @@ impl Inst { let ra = pretty_print_reg(ra); format!("vperm {rd}, {rn}, {rm}, {ra}") } + &Inst::VecEvaluate { + imm, + rd, + rn, + rm, + ra, + } => { + let rd = pretty_print_reg(rd.to_reg()); + let rn = pretty_print_reg(rn); + let rm = pretty_print_reg(rm); + let ra = pretty_print_reg(ra); + format!("veval {rd}, {rn}, {rm}, {ra}, {imm}") + } &Inst::VecPermuteDWImm { rd, rn, @@ -2653,14 +2753,17 @@ impl Inst { VecIntCmpOp::CmpEq16x8 => "vceqh", VecIntCmpOp::CmpEq32x4 => "vceqf", VecIntCmpOp::CmpEq64x2 => "vceqg", + VecIntCmpOp::CmpEq128 => "vceqq", VecIntCmpOp::SCmpHi8x16 => "vchb", VecIntCmpOp::SCmpHi16x8 => "vchh", VecIntCmpOp::SCmpHi32x4 => "vchf", VecIntCmpOp::SCmpHi64x2 => "vchg", + VecIntCmpOp::SCmpHi128 => "vchq", VecIntCmpOp::UCmpHi8x16 => "vchlb", VecIntCmpOp::UCmpHi16x8 => "vchlh", VecIntCmpOp::UCmpHi32x4 => "vchlf", VecIntCmpOp::UCmpHi64x2 => "vchlg", + VecIntCmpOp::UCmpHi128 => "vchlq", }; let s = match self { &Inst::VecIntCmp { .. } => "", @@ -2691,6 +2794,15 @@ impl Inst { let rm = pretty_print_reg(rm); format!("{op}{s} {rd}, {rn}, {rm}") } + &Inst::VecIntEltCmp { op, rn, rm } => { + let op = match op { + VecIntEltCmpOp::SCmp128 => "vecq", + VecIntEltCmpOp::UCmp128 => "veclq", + }; + let rn = pretty_print_reg(rn); + let rm = pretty_print_reg(rm); + format!("{op} {rn}, {rm}") + } &Inst::VecInt128SCmpHi { tmp, rn, rm } | &Inst::VecInt128UCmpHi { tmp, rn, rm } => { let op = match self { &Inst::VecInt128SCmpHi { .. 
} => "vecg", diff --git a/cranelift/codegen/src/isa/s390x/lower.isle b/cranelift/codegen/src/isa/s390x/lower.isle index 875469fae073..3bf616b6229d 100644 --- a/cranelift/codegen/src/isa/s390x/lower.isle +++ b/cranelift/codegen/src/isa/s390x/lower.isle @@ -221,8 +221,12 @@ (rule 1 (lower (has_type (ty_vec128 ty) (iabs x))) (vec_abs ty x)) -;; Absolute value of a 128-bit integer. -(rule 0 (lower (has_type $I128 (iabs x))) +;; Absolute value of a 128-bit integer on z17. +(rule 4 (lower (has_type (and (vxrs_ext3_enabled) $I128) (iabs x))) + (vec_abs $I128 x)) + +;; Absolute value of a 128-bit integer pre-z17. +(rule 0 (lower (has_type (and (vxrs_ext3_disabled) $I128) (iabs x))) (let ((zero Reg (vec_imm $I128 0)) (pos Reg x) (neg Reg (vec_sub $I128 zero pos)) @@ -245,8 +249,12 @@ (rule 1 (lower (has_type (ty_vec128 ty) (ineg x))) (vec_neg ty x)) -;; Negate a 128-bit integer. -(rule 0 (lower (has_type $I128 (ineg x))) +;; Negate a 128-bit integer on z17. +(rule 4 (lower (has_type (and (vxrs_ext3_enabled) $I128) (ineg x))) + (vec_neg $I128 x)) + +;; Negate a 128-bit integer pre-z17. +(rule 0 (lower (has_type (and (vxrs_ext3_disabled) $I128) (ineg x))) (vec_sub $I128 (vec_imm $I128 0) x)) @@ -260,13 +268,17 @@ (intcc_as_cond (IntCC.UnsignedLessThan))))) (select_bool_reg ty cond y_ext x_ext))) -;; Unsigned maximum of two 128-bit integers - expand to icmp + select. -(rule 1 (lower (has_type $I128 (umax x y))) +;; Unsigned maximum of two 128-bit integers pre-z17 - expand to icmp + select. +(rule 1 (lower (has_type (and (vxrs_ext3_disabled) $I128) (umax x y))) (let ((x_reg Reg (put_in_reg x)) (y_reg Reg (put_in_reg y)) (cond ProducesBool (vec_int128_ucmphi y_reg x_reg))) (select_bool_reg $I128 cond y_reg x_reg))) +;; Unsigned maximum of two 128-bit integers on z17. +(rule 3 (lower (has_type (and (vxrs_ext3_enabled) $I128) (umax x y))) + (vec_umax $I128 x y)) + ;; Unsigned maximum of two vector registers. (rule 0 (lower (has_type (ty_vec128 ty) (umax x y))) (vec_umax ty x y)) @@ -282,13 +294,17 @@ (intcc_as_cond (IntCC.UnsignedGreaterThan))))) (select_bool_reg ty cond y_ext x_ext))) -;; Unsigned maximum of two 128-bit integers - expand to icmp + select. -(rule 1 (lower (has_type $I128 (umin x y))) +;; Unsigned maximum of two 128-bit integers pre-z17 - expand to icmp + select. +(rule 1 (lower (has_type (and (vxrs_ext3_disabled) $I128) (umin x y))) (let ((x_reg Reg (put_in_reg x)) (y_reg Reg (put_in_reg y)) (cond ProducesBool (vec_int128_ucmphi x_reg y_reg))) (select_bool_reg $I128 cond y_reg x_reg))) +;; Unsigned minimum of two 128-bit integers on z17. +(rule 3 (lower (has_type (and (vxrs_ext3_enabled) $I128) (umin x y))) + (vec_umin $I128 x y)) + ;; Unsigned minimum of two vector registers. (rule 0 (lower (has_type (ty_vec128 ty) (umin x y))) (vec_umin ty x y)) @@ -304,13 +320,17 @@ (intcc_as_cond (IntCC.SignedLessThan))))) (select_bool_reg ty cond y_ext x_ext))) -;; Signed maximum of two 128-bit integers - expand to icmp + select. -(rule 1 (lower (has_type $I128 (smax x y))) +;; Signed maximum of two 128-bit integers pre-z17 - expand to icmp + select. +(rule 1 (lower (has_type (and (vxrs_ext3_disabled) $I128) (smax x y))) (let ((x_reg Reg (put_in_reg x)) (y_reg Reg (put_in_reg y)) (cond ProducesBool (vec_int128_scmphi y_reg x_reg))) (select_bool_reg $I128 cond y_reg x_reg))) +;; Signed maximum of two 128-bit integers on z17. +(rule 3 (lower (has_type (and (vxrs_ext3_enabled) $I128) (smax x y))) + (vec_smax $I128 x y)) + ;; Signed maximum of two vector registers. 
(rule (lower (has_type (ty_vec128 ty) (smax x y))) (vec_smax ty x y)) @@ -326,13 +346,17 @@ (intcc_as_cond (IntCC.SignedGreaterThan))))) (select_bool_reg ty cond y_ext x_ext))) -;; Signed maximum of two 128-bit integers - expand to icmp + select. -(rule 1 (lower (has_type $I128 (smin x y))) +;; Signed maximum of two 128-bit integers pre-z17 - expand to icmp + select. +(rule 1 (lower (has_type (and (vxrs_ext3_disabled) $I128) (smin x y))) (let ((x_reg Reg (put_in_reg x)) (y_reg Reg (put_in_reg y)) (cond ProducesBool (vec_int128_scmphi x_reg y_reg))) (select_bool_reg $I128 cond y_reg x_reg))) +;; Signed minimum of two 128-bit integers on z17. +(rule 3 (lower (has_type (and (vxrs_ext3_enabled) $I128) (smin x y))) + (vec_smin $I128 x y)) + ;; Signed minimum of two vector registers. (rule (lower (has_type (ty_vec128 ty) (smin x y))) (vec_smin ty x y)) @@ -399,16 +423,22 @@ (rule (vec_mul_impl $I16X8 x y) (vec_mul $I16X8 x y)) (rule (vec_mul_impl $I32X4 x y) (vec_mul $I32X4 x y)) -;; Multiply two vector registers - doubleword. Has to be scalarized. -(rule (vec_mul_impl $I64X2 x y) +;; Multiply two vector registers - doubleword on z17. +(rule 1 (vec_mul_impl (and (vxrs_ext3_enabled) $I64X2) x y) (vec_mul $I64X2 x y)) + +;; Multiply two vector registers - doubleword pre-z17. Has to be scalarized. +(rule (vec_mul_impl (and (vxrs_ext3_disabled) $I64X2) x y) (mov_to_vec128 $I64X2 (mul_reg $I64 (vec_extract_lane $I64X2 x 0 (zero_reg)) (vec_extract_lane $I64X2 y 0 (zero_reg))) (mul_reg $I64 (vec_extract_lane $I64X2 x 1 (zero_reg)) (vec_extract_lane $I64X2 y 1 (zero_reg))))) -;; Multiply two vector registers - quadword. -(rule (vec_mul_impl $I128 x y) +;; Multiply two vector registers - quadword on z17. +(rule 1 (vec_mul_impl (and (vxrs_ext3_enabled) $I128) x y) (vec_mul $I128 x y)) + +;; Multiply two vector registers - quadword pre-z17. +(rule (vec_mul_impl (and (vxrs_ext3_disabled) $I128) x y) (let ((x_hi Reg (vec_extract_lane $I64X2 x 0 (zero_reg))) (x_lo Reg (vec_extract_lane $I64X2 x 1 (zero_reg))) (y_hi Reg (vec_extract_lane $I64X2 y 0 (zero_reg))) @@ -457,9 +487,12 @@ (rule (lower (has_type $I16X8 (umulhi x y))) (vec_umulhi $I16X8 x y)) (rule (lower (has_type $I32X4 (umulhi x y))) (vec_umulhi $I32X4 x y)) -;; Multiply high part unsigned, vector types with 64-bit elements. +;; Multiply high part unsigned, vector types with 64-bit elements on z17. +(rule 1 (lower (has_type (and (vxrs_ext3_enabled) $I64X2) (umulhi x y))) (vec_umulhi $I64X2 x y)) + +;; Multiply high part unsigned, vector types with 64-bit elements pre-z17. ;; Has to be scalarized. -(rule (lower (has_type $I64X2 (umulhi x y))) +(rule (lower (has_type (and (vxrs_ext3_disabled) $I64X2) (umulhi x y))) (let ((pair_0 RegPair (umul_wide (vec_extract_lane $I64X2 x 0 (zero_reg)) (vec_extract_lane $I64X2 y 0 (zero_reg)))) (res_0 Reg (regpair_hi pair_0)) @@ -495,9 +528,12 @@ (rule (lower (has_type $I16X8 (smulhi x y))) (vec_smulhi $I16X8 x y)) (rule (lower (has_type $I32X4 (smulhi x y))) (vec_smulhi $I32X4 x y)) -;; Multiply high part unsigned, vector types with 64-bit elements. +;; Multiply high part signed, vector types with 64-bit elements on z17. +(rule 1 (lower (has_type (and (vxrs_ext3_enabled) $I64X2) (smulhi x y))) (vec_smulhi $I64X2 x y)) + +;; Multiply high part signed, vector types with 64-bit elements pre-z17. ;; Has to be scalarized. 
-(rule (lower (has_type $I64X2 (smulhi x y))) +(rule (lower (has_type (and (vxrs_ext3_disabled) $I64X2) (smulhi x y))) (let ((pair_0 RegPair (smul_wide (vec_extract_lane $I64X2 x 0 (zero_reg)) (vec_extract_lane $I64X2 y 0 (zero_reg)))) (res_0 Reg (copy_reg $I64 (regpair_hi pair_0))) @@ -575,6 +611,14 @@ (pair RegPair (udivmod ext_ty ext_x ext_y))) (regpair_hi pair))) +;; Implement `udiv` for 128-bit integers on z17 (only). +(rule 1 (lower (has_type (and (vxrs_ext3_enabled) $I128) (udiv x y))) + (vec_udiv $I128 x y)) + +;; Implement `urem` for 128-bit integers on z17 (only). +(rule 1 (lower (has_type (and (vxrs_ext3_enabled) $I128) (urem x y))) + (vec_urem $I128 x y)) + ;;;; Rules for `sdiv` and `srem` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -660,6 +704,22 @@ (icmps_simm16_and_trap ext_ty reg -1 (intcc_as_cond (IntCC.Equal)) (trap_code_integer_overflow)))) +(rule 1 (maybe_trap_if_sdiv_overflow true $I128 $I128 x y) + (let ( + ;; We need to trap when y == INT_MIN && x == -1 + ;; y == INT_MIN is implemented as y == -y, as -INT_MIN == INT_MIN. + ;; This checks that y == -y, by using Not-Xor for bitwise + ;; equality, producing all 0b1's (-1u128) when y == -y. + ;; Then it uses band to include the x == -1 check as well, + ;; using the (band x (bnot (bxor y neg_divisor))) variant of vec_eval. + (neg_divisor Reg (vec_neg $I128 y)) + (reg Reg (vec_eval $I128 0b00001001 x y neg_divisor)) + ;; Finally, we check that the combination of x & (y == -y) is -1. + (flags ProducesFlags (vec_elt_icmps reg (vec_imm $I128 -1)))) + (trap_if flags + (intcc_as_cond (IntCC.Equal)) + (trap_code_integer_overflow)))) + (decl int_max (Type) u64) (rule (int_max $I8) 0x7f) (rule (int_max $I16) 0x7fff) @@ -687,6 +747,18 @@ (with_flags_reg (icmps_simm16 $I64 y -1) (cmov_imm $I64 (intcc_as_cond (IntCC.Equal)) 0 x))) +;; Implement `sdiv` for 128-bit integers on z17 (only). +(rule 1 (lower (has_type (and (vxrs_ext3_enabled) $I128) (sdiv x y))) + (let ((OFcheck bool (div_overflow_check_needed y)) + (_ Reg (maybe_trap_if_sdiv_overflow OFcheck $I128 $I128 x y))) + (vec_sdiv $I128 x y))) + +;; Implement `srem` for 128-bit integers on z17 (only). +(rule 1 (lower (has_type (and (vxrs_ext3_enabled) $I128) (srem x y))) + (let ((OFcheck bool (div_overflow_check_needed y)) + (_ Reg (maybe_trap_if_sdiv_overflow OFcheck $I128 $I128 x y))) + (vec_srem $I128 x y))) + ;;;; Rules for `ishl` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -1041,6 +1113,11 @@ (rule 11 (lower (has_type (ty_scalar_float _) (band x y))) (vec_and $F64X2 x y)) +(rule 12 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (band (band x y) z))) + (vec_eval ty 0b00000001 x y z)) +(rule 13 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (band x (band y z)))) + (vec_eval ty 0b00000001 x y z)) + ;; Specialized lowerings for `(band x (bnot y))` which is additionally produced ;; by Cranelift's `band_not` instruction that is legalized into the simpler ;; forms early on. @@ -1057,6 +1134,56 @@ (rule 10 (lower (has_type (vr128_ty ty) (band (bnot y) x))) (vec_and_not ty x y)) +;; And-not three vector registers.
+(rule 14 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (band (band (bnot x) y) z))) + (vec_eval ty 0b00010000 x y z)) +(rule 15 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (band (bnot x) (band y z)))) + (vec_eval ty 0b00010000 x y z)) +(rule 16 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (band (band x y) (bnot z)))) + (vec_eval ty 0b00000010 x y z)) +(rule 17 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (band x (band y (bnot z))))) + (vec_eval ty 0b00000010 x y z)) +(rule 18 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (band (band x (bnot y)) z))) + (vec_eval ty 0b00000100 y x z)) +(rule 19 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (band x (band (bnot y) z)))) + (vec_eval ty 0b00000100 z x y)) + +;; Not-and three vector registers +(rule 20 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bnot (band (band x y) z)))) + (vec_eval ty 0b11111110 x y z)) +(rule 21 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bnot (band x (band y z))))) + (vec_eval ty 0b11111110 x y z)) + +;; And-Nand three vector registers +(rule 20 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (band (bnot (band x y)) z))) + (vec_eval ty 0b01010100 x y z)) +(rule 21 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (band x (bnot (band y z))))) + (vec_eval ty 0b00001110 x y z)) + +;; And-Or 3 vector registers +(rule 22 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (band x (bor y z)))) + (vec_eval ty 0b00000111 x y z)) +(rule 23 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (band (bor x y) z))) + (vec_eval ty 0b00010101 x y z)) + +;; And-Nor 3 vector registers +(rule 24 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (band x (bnot (bor y z))))) + (vec_eval ty 0b00001000 x y z)) +(rule 25 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (band (bnot (bor x y)) z))) + (vec_eval ty 0b01000000 x y z)) + +;; And-Xor 3 vector registers +(rule 26 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (band x (bxor y z)))) + (vec_eval ty 0b00000110 x y z)) +(rule 27 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (band (bxor x y) z))) + (vec_eval ty 0b00010100 x y z)) + +;; And-Nxor 3 vector registers +(rule 28 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (band x (bnot (bxor y z))))) + (vec_eval ty 0b00001001 x y z)) +(rule 29 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (band (bnot (bxor x y)) z))) + (vec_eval ty 0b01000001 x y z)) + ;;;; Rules for `bor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Or two registers. @@ -1087,6 +1214,12 @@ (rule 11 (lower (has_type (ty_scalar_float _) (bor x y))) (vec_or $F64X2 x y)) +;; Or three vector registers. +(rule 12 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bor (bor x y) z))) + (vec_eval ty 0b01111111 x y z)) +(rule 13 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bor x (bor y z)))) + (vec_eval ty 0b01111111 x y z)) + ;; Specialized lowerings for `(bor x (bnot y))` which is additionally produced ;; by Cranelift's `bor_not` instruction that is legalized into the simpler ;; forms early on. 
@@ -1103,6 +1236,79 @@ (rule 10 (lower (has_type (vr128_ty ty) (bor (bnot y) x))) (vec_or_not ty x y)) +;; 3-input bor with a single not +(rule 14 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bor (bor (bnot x) y) z))) + (vec_eval ty 0b11110111 x y z)) +(rule 15 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bor (bnot x) (bor y z)))) + (vec_eval ty 0b11110111 x y z)) +(rule 16 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bor (bor x (bnot y)) z))) + (vec_eval ty 0b11011111 x y z)) +(rule 17 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bor x (bor (bnot y) z)))) + (vec_eval ty 0b11011111 x y z)) +(rule 18 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bor (bor x y) (bnot z)))) + (vec_eval ty 0b10111111 x y z)) +(rule 19 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bor x (bor y (bnot z))))) + (vec_eval ty 0b10111111 x y z)) + +;; 3-input bnor +(rule 20 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bnot (bor (bor x y) z)))) + (vec_eval ty 0b10000000 x y z)) +(rule 21 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bnot (bor x (bor y z))))) + (vec_eval ty 0b10000000 x y z)) + +;; Or-Nor +(rule 20 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bor (bnot (bor x y)) z))) + (vec_eval ty 0b11010101 x y z)) +(rule 21 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bor x (bnot (bor y z))))) + (vec_eval ty 0b10001111 x y z)) + +;; Or-And +(rule 22 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bor (band x y) z))) + (vec_eval ty 0b01010111 x y z)) +(rule 23 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bor x (band y z)))) + (vec_eval ty 0b00011111 x y z)) + +;; Or-Nand +(rule 24 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bor (bnot (band x y)) z))) + (vec_eval ty 0b11111101 x y z)) +(rule 25 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bor x (bnot (band y z))))) + (vec_eval ty 0b11101111 x y z)) + +;; Or-Xor +(rule 26 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bor (bxor x y) z))) + (vec_eval ty 0b01111101 x y z)) +(rule 27 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bor x (bxor y z)))) + (vec_eval ty 0b01101111 x y z)) + +;; Or-Nxor +(rule 28 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bor (bnot (bxor x y)) z))) + (vec_eval ty 0b11010111 x y z)) +(rule 29 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bor x (bnot (bxor y z))))) + (vec_eval ty 0b10011111 x y z)) + +;; Nor-And +(rule 30 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bnot (bor (band x y) z)))) + (vec_eval ty 0b10101000 x y z)) +(rule 31 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bnot (bor x (band y z))))) + (vec_eval ty 0b11100000 x y z)) + +;; Nor-Nand +(rule 32 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bnot (bor (bnot (band x y)) z)))) + (vec_eval ty 0b00000010 x y z)) +(rule 33 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bnot (bor x (bnot (band y z)))))) + (vec_eval ty 0b00010000 x y z)) + +;; Nor-Xor +(rule 34 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bnot (bor (bxor x y) z)))) + (vec_eval ty 0b10000010 x y z)) +(rule 35 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bnot (bor x (bxor y z))))) + (vec_eval ty 0b10010000 x y z)) + +;; Nor-Nxor +(rule 36 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bnot (bor (bnot (bxor x y)) z)))) + (vec_eval ty 0b00101000 x y z)) +(rule 37 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty 
ty)) (bnot (bor x (bnot (bxor y z)))))) + (vec_eval ty 0b01100000 x y z)) ;;;; Rules for `bxor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -1146,6 +1352,77 @@ (rule 8 (lower (has_type (vr128_ty ty) (bxor (bnot y) x))) (vec_not_xor ty x y)) +;; 3-input Xor +(rule 10 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bxor (bxor x y) z))) + (vec_eval ty 0b01101001 x y z)) +(rule 11 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bxor x (bxor y z)))) + (vec_eval ty 0b01101001 x y z)) + +;; Xor-And +(rule 12 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bxor (band x y) z))) + (vec_eval ty 0b01010110 x y z)) +(rule 13 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bxor x (band y z)))) + (vec_eval ty 0b00011110 x y z)) + +;; Xor-Nand +(rule 14 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bxor (bnot (band x y)) z))) + (vec_eval ty 0b10101001 x y z)) +(rule 15 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bxor x (bnot (band y z))))) + (vec_eval ty 0b11100001 x y z)) + +;; Xor-Or +(rule 16 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bxor (bor x y) z))) + (vec_eval ty 0b01101010 x y z)) +(rule 17 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bxor x (bor y z)))) + (vec_eval ty 0b01111000 x y z)) + +;; Xor-Nor +(rule 18 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bxor (bnot (bor x y)) z))) + (vec_eval ty 0b10010101 x y z)) +(rule 19 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bxor x (bnot (bor y z))))) + (vec_eval ty 0b10000111 x y z)) + +;; Xor-Nxor +(rule 18 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bxor (bnot (bxor x y)) z))) + (vec_eval ty 0b10010110 x y z)) +(rule 19 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bxor x (bnot (bxor y z))))) + (vec_eval ty 0b10010110 x y z)) + +;; 3-input Nxor +(rule 20 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bnot (bxor (bxor x y) z)))) + (vec_eval ty 0b10010110 x y z)) +(rule 21 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bnot (bxor x (bxor y z))))) + (vec_eval ty 0b10010110 x y z)) + +;; Nxor-And +(rule 22 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bnot (bxor (band x y) z)))) + (vec_eval ty 0b10101001 x y z)) +(rule 23 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bnot (bxor x (band y z))))) + (vec_eval ty 0b11100001 x y z)) + +;; Nxor-Nand +(rule 24 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bnot (bxor (bnot (band x y)) z)))) + (vec_eval ty 0b01010110 x y z)) +(rule 25 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bnot (bxor x (bnot (band y z)))))) + (vec_eval ty 0b00011110 x y z)) + +;; Nxor-Or +(rule 26 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bnot (bxor (bor x y) z)))) + (vec_eval ty 0b10010101 x y z)) +(rule 27 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bnot (bxor x (bor y z))))) + (vec_eval ty 0b10000111 x y z)) + +;; Nxor-Nor +(rule 28 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bnot (bxor (bnot (bor x y)) z)))) + (vec_eval ty 0b01101010 x y z)) +(rule 29 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bnot (bxor x (bnot (bor y z)))))) + (vec_eval ty 0b01111000 x y z)) + +;; Nxor-Nxor +(rule 18 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bnot (bxor (bnot (bxor x y)) z)))) + (vec_eval ty 0b01101001 x y z)) +(rule 19 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bnot (bxor x (bnot (bxor y z)))))) + (vec_eval ty 0b01101001 x y z))
;;;; Rules for `bitselect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -1167,6 +1444,14 @@ (rule (lower (has_type (vr128_ty ty) (bitselect x y z))) (vec_select ty y z x)) +;; Bitselect-not vector registers. +(rule 5 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bitselect x (bnot y) z))) + (vec_eval ty 0b01011100 x y z)) +(rule 6 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bitselect x y (bnot z)))) + (vec_eval ty 0b10100011 x y z)) +(rule 7 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bitselect x (bnot y) (bnot z)))) + (vec_eval ty 0b10101100 x y z)) + ;; Special-case some float-selection instructions for min/max (rule 3 (lower (has_type (ty_vec128 ty) (bitselect (bitcast _ (fcmp (FloatCC.LessThan) x y)) x y))) (fmin_pseudo_reg ty y x)) @@ -1232,17 +1517,17 @@ (rule (clz_offset $I32 x) (add_simm16 $I32 x -32)) (rule (clz_offset $I64 x) x) -;; Count leading zeros, via FLOGR on an input zero-extended to 64 bits, +;; Count leading zeros pre-z17, via FLOGR on an input zero-extended to 64 bits, ;; with the result compensated for the extra bits. -(rule 1 (lower (has_type (fits_in_64 ty) (clz x))) +(rule 1 (lower (has_type (and (mie4_disabled) (fits_in_64 ty)) (clz x))) (let ((ext_reg Reg (put_in_reg_zext64 x)) ;; Ask for a value of 64 in the all-zero 64-bit input case. ;; After compensation this will match the expected semantics. - (clz Reg (clz_reg 64 ext_reg))) + (clz Reg (clz_flogr_reg 64 ext_reg))) (clz_offset ty clz))) -;; Count leading zeros, 128-bit full vector. -(rule (lower (has_type $I128 (clz x))) +;; Count leading zeros, 128-bit full vector pre-z17. +(rule (lower (has_type (and (vxrs_ext3_disabled) $I128) (clz x))) (let ((clz_vec Reg (vec_clz $I64X2 x)) (zero Reg (vec_imm $I64X2 0)) (clz_hi Reg (vec_permute_dw_imm $I64X2 zero 0 clz_vec 0)) @@ -1251,6 +1536,16 @@ (mask Reg (vec_cmpeq $I64X2 clz_hi (vec_imm_splat $I64X2 64)))) (vec_select $I128 clz_sum clz_hi mask))) +;; Count leading zeros on z17, via CLZG on an input zero-extended to 64 bits, +;; with the result compensated for the extra bits. +(rule 3 (lower (has_type (and (mie4_enabled) (fits_in_64 ty)) (clz x))) + (let ((ext_reg Reg (put_in_reg_zext64 x)) + (clz Reg (clz_reg ext_reg))) + (clz_offset ty clz))) + +;; Count leading zeros, 128-bit full vector on z17. +(rule 2 (lower (has_type (and (vxrs_ext3_enabled) $I128) (clz x))) + (vec_clz $I128 x)) ;;;; Rules for `cls` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -1263,20 +1558,20 @@ (rule (cls_offset $I32 x) (add_simm16 $I32 x -33)) (rule (cls_offset $I64 x) (add_simm16 $I64 x -1)) -;; Count leading sign-bit copies. We don't have any instruction for that, +;; Count leading sign-bit copies pre-z17. We don't have any instruction for that, ;; so we instead count the leading zeros after inverting the input if negative, ;; i.e. computing ;; cls(x) == clz(x ^ (x >> 63)) - 1 ;; where x is the sign-extended input. -(rule 1 (lower (has_type (fits_in_64 ty) (cls x))) +(rule 1 (lower (has_type (and (mie4_disabled) (fits_in_64 ty)) (cls x))) (let ((ext_reg Reg (put_in_reg_sext64 x)) (signbit_copies Reg (ashr_imm $I64 ext_reg 63)) (inv_reg Reg (xor_reg $I64 ext_reg signbit_copies)) - (clz Reg (clz_reg 64 inv_reg))) + (clz Reg (clz_flogr_reg 64 inv_reg))) (cls_offset ty clz))) -;; Count leading sign-bit copies, 128-bit full vector. -(rule (lower (has_type $I128 (cls x))) +;; Count leading sign-bit copies, 128-bit full vector pre-z17. 
+(rule (lower (has_type (and (vxrs_ext3_disabled) $I128) (cls x))) (let ((x_reg Reg x) (ones Reg (vec_imm_splat $I8X16 255)) (signbit_copies Reg (vec_ashr_by_bit (vec_ashr_by_byte x_reg ones) ones)) @@ -1289,6 +1584,22 @@ (mask Reg (vec_cmpeq $I64X2 clz_hi (vec_imm_splat $I64X2 64)))) (vec_add $I128 (vec_select $I128 clz_sum clz_hi mask) ones))) +;; Count leading sign-bit copies on z17, similar to above. +(rule 3 (lower (has_type (and (mie4_enabled) (fits_in_64 ty)) (cls x))) + (let ((ext_reg Reg (put_in_reg_sext64 x)) + (signbit_copies Reg (ashr_imm $I64 ext_reg 63)) + (inv_reg Reg (xor_reg $I64 ext_reg signbit_copies)) + (clz Reg (clz_reg inv_reg))) + (cls_offset ty clz))) + +;; Count leading sign-bit copies, 128-bit full vector on z17. +(rule 2 (lower (has_type (and (vxrs_ext3_enabled) $I128) (cls x))) + (let ((x_reg Reg x) + (ones Reg (vec_imm_splat $I8X16 255)) + (signbit_copies Reg (vec_ashr_by_bit (vec_ashr_by_byte x_reg ones) ones)) + (inv_reg Reg (vec_xor $I128 x_reg signbit_copies)) + (clz_vec Reg (vec_clz $I128 inv_reg))) + (vec_add $I128 clz_vec ones))) ;;;; Rules for `ctz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -1305,10 +1616,10 @@ ;; never zero by setting a "guard bit" in the position corresponding to ;; the input type size. This way the 64-bit algorithm above will handle ;; that case correctly automatically. -(rule 2 (lower (has_type (gpr32_ty ty) (ctz x))) +(rule 2 (lower (has_type (and (mie4_disabled) (gpr32_ty ty)) (ctz x))) (let ((rx Reg (or_uimm16shifted $I64 x (ctz_guardbit ty))) (lastbit Reg (and_reg $I64 rx (neg_reg $I64 rx))) - (clz Reg (clz_reg 64 lastbit))) + (clz Reg (clz_flogr_reg 64 lastbit))) (sub_reg ty (imm ty 63) clz))) (decl ctz_guardbit (Type) UImm16Shifted) @@ -1320,14 +1631,14 @@ ;; via its condition code. We check for that and replace the instruction ;; result with the value -1 via a conditional move, which will then lead to ;; the correct result after the final subtraction from 63. -(rule 1 (lower (has_type (gpr64_ty _ty) (ctz x))) +(rule 1 (lower (has_type (and (mie4_disabled) (gpr64_ty _ty)) (ctz x))) (let ((rx Reg x) (lastbit Reg (and_reg $I64 rx (neg_reg $I64 rx))) - (clz Reg (clz_reg -1 lastbit))) + (clz Reg (clz_flogr_reg -1 lastbit))) (sub_reg $I64 (imm $I64 63) clz))) -;; Count trailing zeros, 128-bit full vector. -(rule 0 (lower (has_type $I128 (ctz x))) +;; Count trailing zeros, 128-bit full vector pre-z17. +(rule 0 (lower (has_type (and (vxrs_ext3_disabled) $I128) (ctz x))) (let ((ctz_vec Reg (vec_ctz $I64X2 x)) (zero Reg (vec_imm $I64X2 0)) (ctz_hi Reg (vec_permute_dw_imm $I64X2 zero 0 ctz_vec 0)) @@ -1336,6 +1647,19 @@ (mask Reg (vec_cmpeq $I64X2 ctz_lo (vec_imm_splat $I64X2 64)))) (vec_select $I128 ctz_sum ctz_lo mask))) +;; Count trailing zeros on z17, via CTZG on types smaller than 64-bit, +;; using the same guard bit mechanism as above. +(rule 5 (lower (has_type (and (mie4_enabled) (gpr32_ty ty)) (ctz x))) + (ctz_reg (or_uimm16shifted $I64 x (ctz_guardbit ty)))) + +;; Count trailing zeros on z17, via CTZG directly on 64-bit types. +(rule 4 (lower (has_type (and (mie4_enabled) (gpr64_ty _ty)) (ctz x))) + (ctz_reg x)) + +;; Count trailing zeros, 128-bit full vector on z17. +(rule 3 (lower (has_type (and (vxrs_ext3_enabled) $I128) (ctz x))) + (vec_ctz $I128 x)) + ;;;; Rules for `popcnt` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -3477,35 +3801,43 @@ (icmpu_mem_zext32 ty x (sink_uload32 y))) -;; Compare 128-bit integers for equality. +;; Compare (signed) 128-bit integers on z17.
+(rule 2 (icmp_val _ int_cc @ (signed) x @ (value_type (and (vxrs_ext3_enabled) (vr128_ty _))) y) + (bool (vec_elt_icmps x y) (intcc_as_cond int_cc))) + +;; Compare (unsigned) 128-bit integers on z17. +(rule 1 (icmp_val _ int_cc @ (unsigned) x @ (value_type (and (vxrs_ext3_enabled) (vr128_ty _))) y) + (bool (vec_elt_icmpu x y) (intcc_as_cond int_cc))) + +;; Compare 128-bit integers for equality pre-z17. ;; Implemented via element-wise comparison using the all-element true CC flag. -(rule (icmp_val _ (IntCC.Equal) x @ (value_type (vr128_ty _)) y) +(rule (icmp_val _ (IntCC.Equal) x @ (value_type (and (vxrs_ext3_disabled) (vr128_ty _))) y) (bool (vec_cmpeqs $I64X2 x y) (floatcc_as_cond (FloatCC.Equal)))) -(rule (icmp_val _ (IntCC.NotEqual) x @ (value_type (vr128_ty _)) y) +(rule (icmp_val _ (IntCC.NotEqual) x @ (value_type (and (vxrs_ext3_disabled) (vr128_ty _))) y) (bool (vec_cmpeqs $I64X2 x y) (floatcc_as_cond (FloatCC.NotEqual)))) -;; Compare (signed) 128-bit integers for relational inequality. +;; Compare (signed) 128-bit integers for relational inequality pre-z17. ;; Implemented via synthetic instruction using VECG and VCHLGS. -(rule (icmp_val _ (IntCC.SignedGreaterThan) x @ (value_type (vr128_ty ty)) y) +(rule (icmp_val _ (IntCC.SignedGreaterThan) x @ (value_type (and (vxrs_ext3_disabled) (vr128_ty ty))) y) (vec_int128_scmphi x y)) -(rule (icmp_val _ (IntCC.SignedLessThan) x @ (value_type (vr128_ty ty)) y) +(rule (icmp_val _ (IntCC.SignedLessThan) x @ (value_type (and (vxrs_ext3_disabled) (vr128_ty ty))) y) (vec_int128_scmphi y x)) -(rule (icmp_val _ (IntCC.SignedGreaterThanOrEqual) x @ (value_type (vr128_ty ty)) y) +(rule (icmp_val _ (IntCC.SignedGreaterThanOrEqual) x @ (value_type (and (vxrs_ext3_disabled) (vr128_ty ty))) y) (invert_bool (vec_int128_scmphi y x))) -(rule (icmp_val _ (IntCC.SignedLessThanOrEqual) x @ (value_type (vr128_ty ty)) y) +(rule (icmp_val _ (IntCC.SignedLessThanOrEqual) x @ (value_type (and (vxrs_ext3_disabled) (vr128_ty ty))) y) (invert_bool (vec_int128_scmphi x y))) -;; Compare (unsigned) 128-bit integers for relational inequality. +;; Compare (unsigned) 128-bit integers for relational inequality pre-z17. ;; Implemented via synthetic instruction using VECLG and VCHLGS. 
-(rule (icmp_val _ (IntCC.UnsignedGreaterThan) x @ (value_type (vr128_ty ty)) y) +(rule (icmp_val _ (IntCC.UnsignedGreaterThan) x @ (value_type (and (vxrs_ext3_disabled) (vr128_ty ty))) y) (vec_int128_ucmphi x y)) -(rule (icmp_val _ (IntCC.UnsignedLessThan) x @ (value_type (vr128_ty ty)) y) +(rule (icmp_val _ (IntCC.UnsignedLessThan) x @ (value_type (and (vxrs_ext3_disabled) (vr128_ty ty))) y) (vec_int128_ucmphi y x)) -(rule (icmp_val _ (IntCC.UnsignedGreaterThanOrEqual) x @ (value_type (vr128_ty ty)) y) +(rule (icmp_val _ (IntCC.UnsignedGreaterThanOrEqual) x @ (value_type (and (vxrs_ext3_disabled) (vr128_ty ty))) y) (invert_bool (vec_int128_ucmphi y x))) -(rule (icmp_val _ (IntCC.UnsignedLessThanOrEqual) x @ (value_type (vr128_ty ty)) y) +(rule (icmp_val _ (IntCC.UnsignedLessThanOrEqual) x @ (value_type (and (vxrs_ext3_disabled) (vr128_ty ty))) y) (invert_bool (vec_int128_ucmphi x y))) diff --git a/cranelift/codegen/src/isa/s390x/lower/isle.rs b/cranelift/codegen/src/isa/s390x/lower/isle.rs index 561fabd9561e..1e53aef7d664 100644 --- a/cranelift/codegen/src/isa/s390x/lower/isle.rs +++ b/cranelift/codegen/src/isa/s390x/lower/isle.rs @@ -229,6 +229,24 @@ impl generated_code::Context for IsleContext<'_, '_, MInst, S390xBackend> { } } + #[inline] + fn mie4_enabled(&mut self, _: Type) -> Option<()> { + if self.backend.isa_flags.has_mie4() { + Some(()) + } else { + None + } + } + + #[inline] + fn mie4_disabled(&mut self, _: Type) -> Option<()> { + if !self.backend.isa_flags.has_mie4() { + Some(()) + } else { + None + } + } + #[inline] fn vxrs_ext2_enabled(&mut self, _: Type) -> Option<()> { if self.backend.isa_flags.has_vxrs_ext2() { @@ -247,6 +265,24 @@ impl generated_code::Context for IsleContext<'_, '_, MInst, S390xBackend> { } } + #[inline] + fn vxrs_ext3_enabled(&mut self, _: Type) -> Option<()> { + if self.backend.isa_flags.has_vxrs_ext3() { + Some(()) + } else { + None + } + } + + #[inline] + fn vxrs_ext3_disabled(&mut self, _: Type) -> Option<()> { + if !self.backend.isa_flags.has_vxrs_ext3() { + Some(()) + } else { + None + } + } + #[inline] fn writable_gpr(&mut self, regno: u8) -> WritableReg { writable_gpr(regno) diff --git a/cranelift/filetests/filetests/isa/s390x/arithmetic-arch15.clif b/cranelift/filetests/filetests/isa/s390x/arithmetic-arch15.clif new file mode 100644 index 000000000000..55499a1a19d3 --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/arithmetic-arch15.clif @@ -0,0 +1,333 @@ +test compile precise-output +set enable_multi_ret_implicit_sret +target s390x arch15 + +function %imul_i128(i128, i128) -> i128 { +block0(v0: i128, v1: i128): + v2 = imul.i128 v0, v1 + return v2 +} + +; VCode: +; block0: +; vl %v1, 0(%r3) +; vl %v3, 0(%r4) +; vmlq %v6, %v1, %v3 +; vst %v6, 0(%r2) +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r3) +; vl %v3, 0(%r4) +; vml %v6, %v1, %v3, 4 +; vst %v6, 0(%r2) +; br %r14 + +function %mul_uextend_i64(i64, i64) -> i128 { +block0(v0: i64, v1: i64): + v2 = uextend.i128 v0 + v3 = uextend.i128 v1 + v4 = imul v2, v3 + return v4 +} + +; VCode: +; block0: +; lgr %r5, %r2 +; mlgr %r2, %r4 +; vlvgp %v7, %r2, %r3 +; lgr %r2, %r5 +; vst %v7, 0(%r2) +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; lgr %r5, %r2 +; mlgr %r2, %r4 +; vlvgp %v7, %r2, %r3 +; lgr %r2, %r5 +; vst %v7, 0(%r2) +; br %r14 + +function %mul_sextend_i64(i64, i64) -> i128 { +block0(v0: i64, v1: i64): + v2 = sextend.i128 v0 + v3 = sextend.i128 v1 + v4 = imul v2, v3 + return v4 +} + +; VCode: +; block0: +; lgr %r5, %r2 +; mgrk %r2, %r3, %r4 +; vlvgp %v7, 
%r2, %r3 +; lgr %r2, %r5 +; vst %v7, 0(%r2) +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; lgr %r5, %r2 +; mgrk %r2, %r3, %r4 +; vlvgp %v7, %r2, %r3 +; lgr %r2, %r5 +; vst %v7, 0(%r2) +; br %r14 + +function %sdiv_i128(i128, i128) -> i128 { +block0(v0: i128, v1: i128): + v2 = sdiv.i128 v0, v1 + return v2 +} + +; VCode: +; block0: +; vl %v1, 0(%r3) +; vl %v3, 0(%r4) +; vlcq %v6, %v3 +; veval %v16, %v1, %v3, %v6, 9 +; vrepib %v18, 255 +; vecq %v16, %v18 +; jge .+2 # trap=int_ovf +; vdq %v22, %v1, %v3, 0 +; vst %v22, 0(%r2) +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r3) +; vl %v3, 0(%r4) +; vlc %v6, %v3, 4 +; .byte 0xe7, 0x01 +; lper %f0, %f9 +; ld %f8, 0x720(%r8, %r14) +; .byte 0x00, 0xff +; .byte 0x08, 0x45 +; vec %v16, %v18, 4 +; jge 0x26 ; trap: int_ovf +; .byte 0xe7, 0x61 +; lper %f0, %f0 +; lh %r11, 0x760(%r2, %r14) ; trap: int_divz +; lpdr %f0, %f0 +; .byte 0x08, 0x0e +; br %r14 + +function %udiv_i128(i128, i128) -> i128 { +block0(v0: i128, v1: i128): + v2 = udiv.i128 v0, v1 + return v2 +} + +; VCode: +; block0: +; vl %v1, 0(%r3) +; vl %v3, 0(%r4) +; vdlq %v6, %v1, %v3, 0 +; vst %v6, 0(%r2) +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r3) +; vl %v3, 0(%r4) +; .byte 0xe7, 0x61 +; lper %f0, %f0 +; sth %r11, 0x760(%r14) ; trap: int_divz +; lpdr %f0, %f0 +; .byte 0x00, 0x0e +; br %r14 + +function %srem_i128(i128, i128) -> i128 { +block0(v0: i128, v1: i128): + v2 = srem.i128 v0, v1 + return v2 +} + +; VCode: +; block0: +; vl %v1, 0(%r3) +; vl %v3, 0(%r4) +; vlcq %v6, %v3 +; veval %v16, %v1, %v3, %v6, 9 +; vrepib %v18, 255 +; vecq %v16, %v18 +; jge .+2 # trap=int_ovf +; vrq %v22, %v1, %v3, 0 +; vst %v22, 0(%r2) +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r3) +; vl %v3, 0(%r4) +; vlc %v6, %v3, 4 +; .byte 0xe7, 0x01 +; lper %f0, %f9 +; ld %f8, 0x720(%r8, %r14) +; .byte 0x00, 0xff +; .byte 0x08, 0x45 +; vec %v16, %v18, 4 +; jge 0x26 ; trap: int_ovf +; .byte 0xe7, 0x61 +; lper %f0, %f0 +; lh %r11, 0x760(%r3, %r14) ; trap: int_divz +; lpdr %f0, %f0 +; .byte 0x08, 0x0e +; br %r14 + +function %urem_i128(i128, i128) -> i128 { +block0(v0: i128, v1: i128): + v2 = urem.i128 v0, v1 + return v2 +} + +; VCode: +; block0: +; vl %v1, 0(%r3) +; vl %v3, 0(%r4) +; vrlq %v6, %v1, %v3, 0 +; vst %v6, 0(%r2) +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r3) +; vl %v3, 0(%r4) +; .byte 0xe7, 0x61 +; lper %f0, %f0 +; sth %r11, 0x760(%r1, %r14) ; trap: int_divz +; lpdr %f0, %f0 +; .byte 0x00, 0x0e +; br %r14 + +function %umax_i128(i128, i128) -> i128 { +block0(v0: i128, v1: i128): + v2 = umax.i128 v0, v1 + return v2 +} + +; VCode: +; block0: +; vl %v1, 0(%r3) +; vl %v3, 0(%r4) +; vmxlq %v6, %v1, %v3 +; vst %v6, 0(%r2) +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r3) +; vl %v3, 0(%r4) +; vmxl %v6, %v1, %v3, 4 +; vst %v6, 0(%r2) +; br %r14 + +function %umin_i128(i128, i128) -> i128 { +block0(v0: i128, v1: i128): + v2 = umin.i128 v0, v1 + return v2 +} + +; VCode: +; block0: +; vl %v1, 0(%r3) +; vl %v3, 0(%r4) +; vmnlq %v6, %v1, %v3 +; vst %v6, 0(%r2) +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r3) +; vl %v3, 0(%r4) +; vmnl %v6, %v1, %v3, 4 +; vst %v6, 0(%r2) +; br %r14 + +function %smax_i128(i128, i128) -> i128 { +block0(v0: i128, v1: i128): + v2 = smax.i128 v0, v1 + return v2 +} + +; VCode: +; block0: +; vl %v1, 0(%r3) +; vl %v3, 0(%r4) +; vmxq %v6, %v1, %v3 +; vst %v6, 0(%r2) +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r3) +; vl %v3, 0(%r4) 
+; vmx %v6, %v1, %v3, 4 +; vst %v6, 0(%r2) +; br %r14 + +function %smin_i128(i128, i128) -> i128 { +block0(v0: i128, v1: i128): + v2 = smin.i128 v0, v1 + return v2 +} + +; VCode: +; block0: +; vl %v1, 0(%r3) +; vl %v3, 0(%r4) +; vmnq %v6, %v1, %v3 +; vst %v6, 0(%r2) +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r3) +; vl %v3, 0(%r4) +; vmn %v6, %v1, %v3, 4 +; vst %v6, 0(%r2) +; br %r14 + + +function %iabs_i128(i128) -> i128 { +block0(v0: i128): + v1 = iabs.i128 v0 + return v1 +} + +; VCode: +; block0: +; vl %v1, 0(%r3) +; vlpq %v4, %v1 +; vst %v4, 0(%r2) +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r3) +; vlp %v4, %v1, 4 +; vst %v4, 0(%r2) +; br %r14 + +function %ineg_i128(i128) -> i128 { +block0(v0: i128): + v1 = ineg.i128 v0 + return v1 +} + +; VCode: +; block0: +; vl %v1, 0(%r3) +; vlcq %v4, %v1 +; vst %v4, 0(%r2) +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r3) +; vlc %v4, %v1, 4 +; vst %v4, 0(%r2) +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/bitops-arch15.clif b/cranelift/filetests/filetests/isa/s390x/bitops-arch15.clif new file mode 100644 index 000000000000..8f9b3b30ea89 --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/bitops-arch15.clif @@ -0,0 +1,326 @@ +test compile precise-output +set enable_multi_ret_implicit_sret +target s390x arch15 + +function %clz_i128(i128) -> i128 { +block0(v0: i128): + v1 = clz v0 + return v1 +} + +; VCode: +; block0: +; vl %v1, 0(%r3) +; vclzq %v4, %v1 +; vst %v4, 0(%r2) +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r3) +; vclz %v4, %v1, 4 +; vst %v4, 0(%r2) +; br %r14 + +function %clz_i64(i64) -> i64 { +block0(v0: i64): + v1 = clz v0 + return v1 +} + +; VCode: +; block0: +; clzg %r2, %r2 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xb9, 0x68 +; .byte 0x00, 0x22 +; br %r14 + +function %clz_i32(i32) -> i32 { +block0(v0: i32): + v1 = clz v0 + return v1 +} + +; VCode: +; block0: +; llgfr %r4, %r2 +; clzg %r2, %r4 +; ahi %r2, -32 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; llgfr %r4, %r2 +; .byte 0xb9, 0x68 +; .byte 0x00, 0x24 +; ahi %r2, -0x20 +; br %r14 + +function %clz_i16(i16) -> i16 { +block0(v0: i16): + v1 = clz v0 + return v1 +} + +; VCode: +; block0: +; llghr %r4, %r2 +; clzg %r2, %r4 +; ahi %r2, -48 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; llghr %r4, %r2 +; .byte 0xb9, 0x68 +; .byte 0x00, 0x24 +; ahi %r2, -0x30 +; br %r14 + +function %clz_i8(i8) -> i8 { +block0(v0: i8): + v1 = clz v0 + return v1 +} + +; VCode: +; block0: +; llgcr %r4, %r2 +; clzg %r2, %r4 +; ahi %r2, -56 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; llgcr %r4, %r2 +; .byte 0xb9, 0x68 +; .byte 0x00, 0x24 +; ahi %r2, -0x38 +; br %r14 + +function %cls_i128(i128) -> i128 { +block0(v0: i128): + v1 = cls v0 + return v1 +} + +; VCode: +; block0: +; vl %v1, 0(%r3) +; vrepib %v4, 255 +; vsrab %v6, %v1, %v4 +; vsra %v16, %v6, %v4 +; vx %v18, %v1, %v16 +; vclzq %v20, %v18 +; vaq %v22, %v20, %v4 +; vst %v22, 0(%r2) +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r3) +; vrepib %v4, 0xff +; vsrab %v6, %v1, %v4 +; vsra %v16, %v6, %v4 +; vx %v18, %v1, %v16 +; vclz %v20, %v18, 4 +; vaq %v22, %v20, %v4 +; vst %v22, 0(%r2) +; br %r14 + +function %cls_i64(i64) -> i64 { +block0(v0: i64): + v1 = cls v0 + return v1 +} + +; VCode: +; block0: +; srag %r4, %r2, 63 +; xgr %r2, %r4 +; clzg %r4, %r2 +; aghik %r2, %r4, -1 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; srag %r4, %r2, 0x3f +; xgr 
%r2, %r4 +; .byte 0xb9, 0x68 +; .byte 0x00, 0x42 +; aghik %r2, %r4, -1 +; br %r14 + +function %cls_i32(i32) -> i32 { +block0(v0: i32): + v1 = cls v0 + return v1 +} + +; VCode: +; block0: +; lgfr %r4, %r2 +; srag %r2, %r4, 63 +; xgr %r4, %r2 +; clzg %r2, %r4 +; ahi %r2, -33 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; lgfr %r4, %r2 +; srag %r2, %r4, 0x3f +; xgr %r4, %r2 +; .byte 0xb9, 0x68 +; .byte 0x00, 0x24 +; ahi %r2, -0x21 +; br %r14 + +function %cls_i16(i16) -> i16 { +block0(v0: i16): + v1 = cls v0 + return v1 +} + +; VCode: +; block0: +; lghr %r4, %r2 +; srag %r2, %r4, 63 +; xgr %r4, %r2 +; clzg %r2, %r4 +; ahi %r2, -49 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; lghr %r4, %r2 +; srag %r2, %r4, 0x3f +; xgr %r4, %r2 +; .byte 0xb9, 0x68 +; .byte 0x00, 0x24 +; ahi %r2, -0x31 +; br %r14 + +function %cls_i8(i8) -> i8 { +block0(v0: i8): + v1 = cls v0 + return v1 +} + +; VCode: +; block0: +; lgbr %r4, %r2 +; srag %r2, %r4, 63 +; xgr %r4, %r2 +; clzg %r2, %r4 +; ahi %r2, -57 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; lgbr %r4, %r2 +; srag %r2, %r4, 0x3f +; xgr %r4, %r2 +; .byte 0xb9, 0x68 +; .byte 0x00, 0x24 +; ahi %r2, -0x39 +; br %r14 + +function %ctz_i128(i128) -> i128 { +block0(v0: i128): + v1 = ctz v0 + return v1 +} + +; VCode: +; block0: +; vl %v1, 0(%r3) +; vctzq %v4, %v1 +; vst %v4, 0(%r2) +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r3) +; vctz %v4, %v1, 4 +; vst %v4, 0(%r2) +; br %r14 + +function %ctz_i64(i64) -> i64 { +block0(v0: i64): + v1 = ctz v0 + return v1 +} + +; VCode: +; block0: +; ctzg %r2, %r2 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xb9, 0x69 +; .byte 0x00, 0x22 +; br %r14 + +function %ctz_i32(i32) -> i32 { +block0(v0: i32): + v1 = ctz v0 + return v1 +} + +; VCode: +; block0: +; oihl %r2, 1 +; ctzg %r2, %r2 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; oihl %r2, 1 +; .byte 0xb9, 0x69 +; .byte 0x00, 0x22 +; br %r14 + +function %ctz_i16(i16) -> i16 { +block0(v0: i16): + v1 = ctz v0 + return v1 +} + +; VCode: +; block0: +; oilh %r2, 1 +; ctzg %r2, %r2 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; oilh %r2, 1 +; .byte 0xb9, 0x69 +; .byte 0x00, 0x22 +; br %r14 + +function %ctz_i8(i8) -> i8 { +block0(v0: i8): + v1 = ctz v0 + return v1 +} + +; VCode: +; block0: +; oill %r2, 256 +; ctzg %r2, %r2 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; oill %r2, 0x100 +; .byte 0xb9, 0x69 +; .byte 0x00, 0x22 +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/icmp-i128-arch15.clif b/cranelift/filetests/filetests/isa/s390x/icmp-i128-arch15.clif new file mode 100644 index 000000000000..93f3de14e972 --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/icmp-i128-arch15.clif @@ -0,0 +1,243 @@ +test compile precise-output +target s390x arch15 + +function %icmp_eq_i128(i128, i128) -> i8 { +block0(v0: i128, v1: i128): + v2 = icmp.i128 eq v0, v1 + return v2 +} + +; VCode: +; block0: +; vl %v1, 0(%r2) +; vl %v3, 0(%r3) +; veclq %v1, %v3 +; lhi %r2, 0 +; lochie %r2, 1 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r2) +; vl %v3, 0(%r3) +; vecl %v1, %v3, 4 +; lhi %r2, 0 +; lochie %r2, 1 +; br %r14 + +function %icmp_ne_i128(i128, i128) -> i8 { +block0(v0: i128, v1: i128): + v2 = icmp.i128 ne v0, v1 + return v2 +} + +; VCode: +; block0: +; vl %v1, 0(%r2) +; vl %v3, 0(%r3) +; veclq %v1, %v3 +; lhi %r2, 0 +; lochilh %r2, 1 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r2) +; vl %v3, 0(%r3) +; vecl %v1, %v3, 4 +; lhi 
%r2, 0 +; lochilh %r2, 1 +; br %r14 + +function %icmp_slt_i128(i128, i128) -> i8 { +block0(v0: i128, v1: i128): + v2 = icmp.i128 slt v0, v1 + return v2 +} + +; VCode: +; block0: +; vl %v1, 0(%r2) +; vl %v3, 0(%r3) +; vecq %v1, %v3 +; lhi %r2, 0 +; lochil %r2, 1 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r2) +; vl %v3, 0(%r3) +; vec %v1, %v3, 4 +; lhi %r2, 0 +; lochil %r2, 1 +; br %r14 + +function %icmp_sgt_i128(i128, i128) -> i8 { +block0(v0: i128, v1: i128): + v2 = icmp.i128 sgt v0, v1 + return v2 +} + +; VCode: +; block0: +; vl %v1, 0(%r2) +; vl %v3, 0(%r3) +; vecq %v1, %v3 +; lhi %r2, 0 +; lochih %r2, 1 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r2) +; vl %v3, 0(%r3) +; vec %v1, %v3, 4 +; lhi %r2, 0 +; lochih %r2, 1 +; br %r14 + +function %icmp_sle_i128(i128, i128) -> i8 { +block0(v0: i128, v1: i128): + v2 = icmp.i128 sle v0, v1 + return v2 +} + +; VCode: +; block0: +; vl %v1, 0(%r2) +; vl %v3, 0(%r3) +; vecq %v1, %v3 +; lhi %r2, 0 +; lochile %r2, 1 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r2) +; vl %v3, 0(%r3) +; vec %v1, %v3, 4 +; lhi %r2, 0 +; lochile %r2, 1 +; br %r14 + +function %icmp_sge_i128(i128, i128) -> i8 { +block0(v0: i128, v1: i128): + v2 = icmp.i128 sge v0, v1 + return v2 +} + +; VCode: +; block0: +; vl %v1, 0(%r2) +; vl %v3, 0(%r3) +; vecq %v1, %v3 +; lhi %r2, 0 +; lochihe %r2, 1 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r2) +; vl %v3, 0(%r3) +; vec %v1, %v3, 4 +; lhi %r2, 0 +; lochihe %r2, 1 +; br %r14 + +function %icmp_ult_i128(i128, i128) -> i8 { +block0(v0: i128, v1: i128): + v2 = icmp.i128 ult v0, v1 + return v2 +} + +; VCode: +; block0: +; vl %v1, 0(%r2) +; vl %v3, 0(%r3) +; veclq %v1, %v3 +; lhi %r2, 0 +; lochil %r2, 1 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r2) +; vl %v3, 0(%r3) +; vecl %v1, %v3, 4 +; lhi %r2, 0 +; lochil %r2, 1 +; br %r14 + +function %icmp_ugt_i128(i128, i128) -> i8 { +block0(v0: i128, v1: i128): + v2 = icmp.i128 ugt v0, v1 + return v2 +} + +; VCode: +; block0: +; vl %v1, 0(%r2) +; vl %v3, 0(%r3) +; veclq %v1, %v3 +; lhi %r2, 0 +; lochih %r2, 1 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r2) +; vl %v3, 0(%r3) +; vecl %v1, %v3, 4 +; lhi %r2, 0 +; lochih %r2, 1 +; br %r14 + +function %icmp_ule_i128(i128, i128) -> i8 { +block0(v0: i128, v1: i128): + v2 = icmp.i128 ule v0, v1 + return v2 +} + +; VCode: +; block0: +; vl %v1, 0(%r2) +; vl %v3, 0(%r3) +; veclq %v1, %v3 +; lhi %r2, 0 +; lochile %r2, 1 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r2) +; vl %v3, 0(%r3) +; vecl %v1, %v3, 4 +; lhi %r2, 0 +; lochile %r2, 1 +; br %r14 + +function %icmp_uge_i128(i128, i128) -> i8 { +block0(v0: i128, v1: i128): + v2 = icmp.i128 uge v0, v1 + return v2 +} + +; VCode: +; block0: +; vl %v1, 0(%r2) +; vl %v3, 0(%r3) +; veclq %v1, %v3 +; lhi %r2, 0 +; lochihe %r2, 1 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r2) +; vl %v3, 0(%r3) +; vecl %v1, %v3, 4 +; lhi %r2, 0 +; lochihe %r2, 1 +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/vec-arithmetic-arch15.clif b/cranelift/filetests/filetests/isa/s390x/vec-arithmetic-arch15.clif new file mode 100644 index 000000000000..381ec6c32d94 --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/vec-arithmetic-arch15.clif @@ -0,0 +1,51 @@ +test compile precise-output +target s390x arch15 + +function %imul_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = imul.i64x2 v0, v1 + return v2 +} 
+ +; VCode: +; block0: +; vmlg %v24, %v24, %v25 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vml %v24, %v24, %v25, 3 +; br %r14 + +function %umulhi_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = umulhi.i64x2 v0, v1 + return v2 +} + +; VCode: +; block0: +; vmlhg %v24, %v24, %v25 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vmlh %v24, %v24, %v25, 3 +; br %r14 + +function %smulhi_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = smulhi.i64x2 v0, v1 + return v2 +} + +; VCode: +; block0: +; vmhg %v24, %v24, %v25 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vmh %v24, %v24, %v25, 3 +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/vec-bitwise-arch15.clif b/cranelift/filetests/filetests/isa/s390x/vec-bitwise-arch15.clif new file mode 100644 index 000000000000..1c91bb392755 --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/vec-bitwise-arch15.clif @@ -0,0 +1,946 @@ +test compile precise-output +target s390x arch15 + +function %band_band_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = band.i64x2 v0, v1 + v4 = band.i64x2 v3, v2 + return v4 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 1 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r0, %r1, 0xf88(%r10) +; br %r14 + +function %band_band_rev_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = band.i64x2 v0, v1 + v4 = band.i64x2 v2, v3 + return v4 +} + +; VCode: +; block0: +; veval %v24, %v26, %v24, %v25, 1 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x8a +; .byte 0x80, 0x01 +; .byte 0x9f, 0x88 +; br %r14 + +function %band_band_nota_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bnot v0 + v4 = band.i64x2 v3, v1 + v5 = band.i64x2 v4, v2 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 16 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r1, %r0, 0xf88(%r10) +; br %r14 + +function %band_band_notb_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bnot v1 + v4 = band.i64x2 v0, v3 + v5 = band.i64x2 v4, v2 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v25, %v24, %v26, 4 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x89 +; .byte 0x80, 0x04 +; mc 0x7fe, 0x88 + +function %band_band_notc_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bnot v2 + v4 = band.i64x2 v0, v1 + v5 = band.i64x2 v4, v3 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 2 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r0, %r2, 0xf88(%r10) +; br %r14 + +function %band_bnandab_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = band.i64x2 v0, v1 + v4 = bnot v3 + v5 = band.i64x2 v4, v2 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 84 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r5, %r4, 0xf88(%r10) +; br %r14 + +function %band_bnandbc_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = band.i64x2 v1, v2 + v4 = bnot v3 + v5 = band.i64x2 v0, v4 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 14 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r0, %r14, 0xf88(%r10) +; br %r14 + +function 
%band_borab_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bor.i64x2 v0, v1 + v4 = band.i64x2 v3, v2 + return v4 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 21 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r1, %r5, 0xf88(%r10) +; br %r14 + +function %band_borbc_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bor.i64x2 v1, v2 + v4 = band.i64x2 v0, v3 + return v4 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 7 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r0, %r7, 0xf88(%r10) +; br %r14 + +function %band_bnorab_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bor.i64x2 v0, v1 + v4 = bnot.i64x2 v3 + v5 = band.i64x2 v4, v2 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 64 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r4, %r0, 0xf88(%r10) +; br %r14 + +function %band_bnorbc_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bor.i64x2 v1, v2 + v4 = bnot.i64x2 v3 + v5 = band.i64x2 v0, v4 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 8 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r0, %r8, 0xf88(%r10) +; br %r14 + +function %band_bxorab_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bxor.i64x2 v0, v1 + v4 = band.i64x2 v3, v2 + return v4 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 20 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r1, %r4, 0xf88(%r10) +; br %r14 + +function %band_bxorbc_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bxor.i64x2 v1, v2 + v4 = band.i64x2 v0, v3 + return v4 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 6 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r0, %r6, 0xf88(%r10) +; br %r14 + +function %band_bnxorab_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bxor.i64x2 v0, v1 + v4 = bnot.i64x2 v3 + v5 = band.i64x2 v4, v2 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 65 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r4, %r1, 0xf88(%r10) +; br %r14 + +function %band_bnxorbc_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bxor.i64x2 v1, v2 + v4 = bnot.i64x2 v3 + v5 = band.i64x2 v0, v4 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 9 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r0, %r9, 0xf88(%r10) +; br %r14 + +function %bor_bor_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bor v0, v1 + v4 = bor v3, v2 + return v4 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 127 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r7, %r15, 0xf88(%r10) +; br %r14 + +function %bor_bor_rev_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bor v0, v1 + v4 = bor v2, v3 + return v4 +} + +; VCode: +; block0: +; veval %v24, %v26, %v24, %v25, 127 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x8a +; .byte 0x80, 0x7f +; .byte 0x9f, 0x88 +; br %r14 + +function %bor_bor_nota_i64x2(i64x2, i64x2, i64x2) -> i64x2 { 
+block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bnot v0 + v4 = bor v3, v1 + v5 = bor v4, v2 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 247 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r15, %r7, 0xf88(%r10) +; br %r14 + +function %bor_bor_notb_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bnot v1 + v4 = bor v0, v3 + v5 = bor v4, v2 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 223 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r13, %r15, 0xf88(%r10) +; br %r14 + +function %bor_bor_notc_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bnot v2 + v4 = bor.i64x2 v0, v1 + v5 = bor.i64x2 v4, v3 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 191 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r11, %r15, 0xf88(%r10) +; br %r14 + +function %bor_bnandab_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = band v0, v1 + v4 = bnot v3 + v5 = bor v4, v2 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 253 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r15, %r13, 0xf88(%r10) +; br %r14 + +function %bor_bnandbc_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = band v1, v2 + v4 = bnot v3 + v5 = bor v0, v4 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 239 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r14, %r15, 0xf88(%r10) +; br %r14 + +function %bor_borab_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bor v0, v1 + v4 = bor v3, v2 + return v4 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 127 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r7, %r15, 0xf88(%r10) +; br %r14 + +function %bor_borbc_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bor v1, v2 + v4 = bor v0, v3 + return v4 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 127 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r7, %r15, 0xf88(%r10) +; br %r14 + +function %bor_bnorab_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bor v0, v1 + v4 = bnot v3 + v5 = bor v4, v2 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 213 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r13, %r5, 0xf88(%r10) +; br %r14 + +function %bor_bnorbc_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bor v1, v2 + v4 = bnot v3 + v5 = bor v0, v4 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 143 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r8, %r15, 0xf88(%r10) +; br %r14 + +function %bor_bxorab_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bxor v0, v1 + v4 = bor v3, v2 + return v4 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 125 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r7, %r13, 0xf88(%r10) +; br %r14 + +function %bor_bxorbc_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bxor v1, v2 + v4 = bor v0, v3 + return v4 +} + +; VCode: +; block0: 
+; veval %v24, %v24, %v25, %v26, 111 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r6, %r15, 0xf88(%r10) +; br %r14 + +function %bor_bnxorab_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bxor v0, v1 + v4 = bnot v3 + v5 = bor v4, v2 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 215 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r13, %r7, 0xf88(%r10) +; br %r14 + +function %bor_bnxorbc_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bxor v1, v2 + v4 = bnot v3 + v5 = bor v0, v4 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 159 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r9, %r15, 0xf88(%r10) +; br %r14 + +function %bxor_bxor_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bxor v0, v1 + v4 = bxor v3, v2 + return v4 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 105 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r6, %r9, 0xf88(%r10) +; br %r14 + +function %bxor_bxor_rev_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bxor v0, v1 + v4 = bxor v2, v3 + return v4 +} + +; VCode: +; block0: +; veval %v24, %v26, %v24, %v25, 105 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x8a +; .byte 0x80, 0x69 +; .byte 0x9f, 0x88 +; br %r14 + +function %bxor_bnandab_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = band v0, v1 + v4 = bnot v3 + v5 = bxor v4, v2 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 169 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r10, %r9, 0xf88(%r10) +; br %r14 + +function %bxor_bnandbc_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = band v1, v2 + v4 = bnot v3 + v5 = bxor v0, v4 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 225 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r14, %r1, 0xf88(%r10) +; br %r14 + +function %bxor_borab_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bor v0, v1 + v4 = bxor v3, v2 + return v4 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 106 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r6, %r10, 0xf88(%r10) +; br %r14 + +function %bxor_borbc_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bor v1, v2 + v4 = bxor v0, v3 + return v4 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 120 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r7, %r8, 0xf88(%r10) +; br %r14 + +function %bxor_bnorab_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bor v0, v1 + v4 = bnot v3 + v5 = bxor v4, v2 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 149 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r9, %r5, 0xf88(%r10) +; br %r14 + +function %bxor_bnorbc_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bor v1, v2 + v4 = bnot v3 + v5 = bxor v0, v4 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 135 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r8, 
%r7, 0xf88(%r10) +; br %r14 + +function %bxor_bnxorab_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bxor v0, v1 + v4 = bnot v3 + v5 = bxor v4, v2 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 150 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r9, %r6, 0xf88(%r10) +; br %r14 + +function %bxor_bnxorbc_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bxor v1, v2 + v4 = bnot v3 + v5 = bxor v0, v4 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 150 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r9, %r6, 0xf88(%r10) +; br %r14 + +function %bnxor_bxor_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bxor v0, v1 + v4 = bxor v3, v2 + v5 = bnot v4 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 150 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r9, %r6, 0xf88(%r10) +; br %r14 + +function %bnxor_bxor_rev_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bxor v0, v1 + v4 = bxor v2, v3 + v5 = bnot v4 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v26, %v24, %v25, 150 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x8a +; .byte 0x80, 0x96 +; .byte 0x9f, 0x88 +; br %r14 + +function %bnxor_bnandab_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = band v0, v1 + v4 = bnot v3 + v5 = bxor v4, v2 + v6 = bnot v5 + return v6 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 86 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r5, %r6, 0xf88(%r10) +; br %r14 + +function %bnxor_bnandbc_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = band v1, v2 + v4 = bnot v3 + v5 = bxor v0, v4 + v6 = bnot v5 + return v6 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 30 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r1, %r14, 0xf88(%r10) +; br %r14 + +function %bnxor_borab_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bor v0, v1 + v4 = bxor v3, v2 + v5 = bnot v4 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 149 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r9, %r5, 0xf88(%r10) +; br %r14 + +function %bnxor_borbc_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bor v1, v2 + v4 = bxor v0, v3 + v5 = bnot v4 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 135 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r8, %r7, 0xf88(%r10) +; br %r14 + +function %bnxor_bnorab_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bor v0, v1 + v4 = bnot v3 + v5 = bxor v4, v2 + v6 = bnot v5 + return v6 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 106 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r6, %r10, 0xf88(%r10) +; br %r14 + +function %bnxor_bnorbc_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bor v1, v2 + v4 = bnot v3 + v5 = bxor v0, v4 + v6 = bnot v5 + return v6 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 120 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm 
%r7, %r8, 0xf88(%r10) +; br %r14 + +function %bnxor_bnxorab_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bxor v0, v1 + v4 = bnot v3 + v5 = bxor v4, v2 + v6 = bnot v5 + return v6 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 105 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r6, %r9, 0xf88(%r10) +; br %r14 + +function %bnxor_bnxorbc_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bxor v1, v2 + v4 = bnot v3 + v5 = bxor v0, v4 + v6 = bnot v5 + return v6 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 105 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r6, %r9, 0xf88(%r10) +; br %r14 From da628431e4b007b4a303cffe29345f64ebee4536 Mon Sep 17 00:00:00 2001 From: Jimmy Brisson Date: Tue, 25 Nov 2025 17:23:34 -0600 Subject: [PATCH 2/3] s390x: Emit vector blend on z17 --- cranelift/codegen/src/isa/s390x/inst.isle | 14 +++++++++ cranelift/codegen/src/isa/s390x/inst/emit.rs | 31 +++++++++++++++++++ .../codegen/src/isa/s390x/inst/emit_tests.rs | 10 ++++++ cranelift/codegen/src/isa/s390x/inst/mod.rs | 10 +++++- cranelift/codegen/src/isa/s390x/lower.isle | 5 +++ cranelift/codegen/src/isa/s390x/mod.rs | 2 +- .../filetests/isa/s390x/icmp-i128-arch15.clif | 17 ++++++++++ 7 files changed, 87 insertions(+), 2 deletions(-) diff --git a/cranelift/codegen/src/isa/s390x/inst.isle b/cranelift/codegen/src/isa/s390x/inst.isle index db64ed797141..388af8f7cbd1 100644 --- a/cranelift/codegen/src/isa/s390x/inst.isle +++ b/cranelift/codegen/src/isa/s390x/inst.isle @@ -600,6 +600,13 @@ (rm Reg) (ra Reg)) + ;; Vector merge instruction. + (VecBlend + (rd WritableReg) + (rn Reg) + (rm Reg) + (ra Reg)) + ;; Vector permute instruction. (VecPermute (rd WritableReg) @@ -2493,6 +2500,13 @@ (_ Unit (emit (MInst.VecPermute dst src1 src2 src3)))) dst)) +;; Helper for emitting `MInst.VecBlend` instructions. +(decl vec_blend (Type Reg Reg Reg) Reg) +(rule (vec_blend ty src1 src2 src3) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecBlend dst src1 src2 src3)))) + dst)) + ;; Helper for emitting `MInst.VecEvaluate` instructions. (decl vec_eval (Type u8 Reg Reg Reg) Reg) (rule (vec_eval ty op src1 src2 src3) diff --git a/cranelift/codegen/src/isa/s390x/inst/emit.rs b/cranelift/codegen/src/isa/s390x/inst/emit.rs index 944f63ed5842..2222faeff1b5 100644 --- a/cranelift/codegen/src/isa/s390x/inst/emit.rs +++ b/cranelift/codegen/src/isa/s390x/inst/emit.rs @@ -1176,6 +1176,33 @@ fn enc_vrr_c(opcode: u16, v1: Reg, v2: Reg, v3: Reg, m4: u8, m5: u8, m6: u8) -> enc } +/// VRRd-type instructions. +/// +/// 47 39 35 31 27 23 19 15 11 7 +/// opcode1 v1 v2 v3 m5 m6 - v4 rxb opcode2 +/// 40 36 32 28 24 20 16 12 8 0 +/// +fn enc_vrr_d(opcode: u16, v1: Reg, v2: Reg, v3: Reg, v4: Reg, m5: u8, m6: u8) -> [u8; 6] { + let opcode1 = ((opcode >> 8) & 0xff) as u8; + let opcode2 = (opcode & 0xff) as u8; + let rxb = rxb(Some(v1), Some(v2), Some(v3), Some(v4)); + let v1 = machreg_to_vr(v1) & 0x0f; + let v2 = machreg_to_vr(v2) & 0x0f; + let v3 = machreg_to_vr(v3) & 0x0f; + let v4 = machreg_to_vr(v4) & 0x0f; + let m5 = m5 & 0x0f; + let m6 = m6 & 0x0f; + + let mut enc: [u8; 6] = [0; 6]; + enc[0] = opcode1; + enc[1] = v1 << 4 | v2; + enc[2] = v3 << 4 | m5; + enc[3] = m6 << 4; + enc[4] = v4 << 4 | rxb; + enc[5] = opcode2; + enc +} + /// VRRe-type instructions. 
/// /// 47 39 35 31 27 23 19 15 11 7 @@ -2910,6 +2937,10 @@ impl Inst { let opcode = 0xe78c; // VPERM put(sink, &enc_vrr_e(opcode, rd.to_reg(), rn, rm, ra, 0, 0)); } + &Inst::VecBlend { rd, rn, rm, ra } => { + let opcode = 0xe789; // VBLEND + put(sink, &enc_vrr_d(opcode, rd.to_reg(), rn, rm, ra, 0, 0)); + } &Inst::VecEvaluate { imm, rd, diff --git a/cranelift/codegen/src/isa/s390x/inst/emit_tests.rs b/cranelift/codegen/src/isa/s390x/inst/emit_tests.rs index d69ccbd91c2a..701b50feef03 100644 --- a/cranelift/codegen/src/isa/s390x/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/s390x/inst/emit_tests.rs @@ -10593,6 +10593,16 @@ fn test_s390x_binemit() { "E7C450026788", "veval %v12, %v20, %v21, %v22, 2", )); + insns.push(( + Inst::VecBlend { + rd: writable_vr(12), + rn: vr(20), + rm: vr(21), + ra: vr(22), + }, + "E7C450006789", + "vblend %v12, %v20, %v21, %v22", + )); insns.push(( Inst::VecInt128SCmpHi { tmp: writable_vr(20), diff --git a/cranelift/codegen/src/isa/s390x/inst/mod.rs b/cranelift/codegen/src/isa/s390x/inst/mod.rs index 6247e966cde6..421cfde58fef 100644 --- a/cranelift/codegen/src/isa/s390x/inst/mod.rs +++ b/cranelift/codegen/src/isa/s390x/inst/mod.rs @@ -319,7 +319,7 @@ impl Inst { | Inst::VecLoadLaneRevUndef { .. } | Inst::VecStoreLaneRev { .. } => InstructionSet::VXRS_EXT2, - Inst::VecEvaluate { .. } => InstructionSet::VXRS_EXT3, + Inst::VecBlend { .. } | Inst::VecEvaluate { .. } => InstructionSet::VXRS_EXT3, Inst::DummyUse { .. } => InstructionSet::Base, @@ -741,6 +741,7 @@ fn s390x_get_operands(inst: &mut Inst, collector: &mut DenyReuseVisitor { collector.reg_def(rd); @@ -2714,6 +2715,13 @@ impl Inst { let ra = pretty_print_reg(ra); format!("vsel {rd}, {rn}, {rm}, {ra}") } + &Inst::VecBlend { rd, rn, rm, ra } => { + let rd = pretty_print_reg(rd.to_reg()); + let rn = pretty_print_reg(rn); + let rm = pretty_print_reg(rm); + let ra = pretty_print_reg(ra); + format!("vblend {rd}, {rn}, {rm}, {ra}") + } &Inst::VecPermute { rd, rn, rm, ra } => { let rd = pretty_print_reg(rd.to_reg()); let rn = pretty_print_reg(rn); diff --git a/cranelift/codegen/src/isa/s390x/lower.isle b/cranelift/codegen/src/isa/s390x/lower.isle index 3bf616b6229d..03e3a8dceb56 100644 --- a/cranelift/codegen/src/isa/s390x/lower.isle +++ b/cranelift/codegen/src/isa/s390x/lower.isle @@ -2596,6 +2596,11 @@ (rule (lower (shuffle x y (shuffle_mask (imm8x16 24 25 26 27 28 29 30 31 16 17 18 19 20 21 22 23) 65535))) (vec_permute_dw_imm $I8X16 y 1 y 0)) +;;;; Rules for `blendv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule 1 (lower (has_type (and (ty_vec128 ty) (vxrs_ext3_enabled)) (x86_blendv p x y))) + (vec_blend ty p x y)) + ;;;; Rules for `swizzle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/cranelift/codegen/src/isa/s390x/mod.rs b/cranelift/codegen/src/isa/s390x/mod.rs index 6bc40484153d..5af31576855f 100644 --- a/cranelift/codegen/src/isa/s390x/mod.rs +++ b/cranelift/codegen/src/isa/s390x/mod.rs @@ -186,7 +186,7 @@ impl TargetIsa for S390xBackend { } fn has_x86_blendv_lowering(&self, _: Type) -> bool { - false + self.isa_flags.has_vxrs_ext3() } fn has_x86_pshufb_lowering(&self) -> bool { diff --git a/cranelift/filetests/filetests/isa/s390x/icmp-i128-arch15.clif b/cranelift/filetests/filetests/isa/s390x/icmp-i128-arch15.clif index 93f3de14e972..2c04cdf8ce7f 100644 --- a/cranelift/filetests/filetests/isa/s390x/icmp-i128-arch15.clif +++ b/cranelift/filetests/filetests/isa/s390x/icmp-i128-arch15.clif @@ -241,3 +241,20 @@ block0(v0: i128, v1: i128): ; lochihe %r2, 1 ; br %r14 
+function %f4(i8x16, i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16, v2: i8x16): + v3 = x86_blendv v0, v1, v2 + return v3 +} + +; VCode: +; block0: +; vblend %v24, %v24, %v25, %v26 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r0, %r0, 0xf89(%r10) +; br %r14 + From 898cc0e123ae8eac0a9a4f492add46b8d26b9454 Mon Sep 17 00:00:00 2001 From: Jimmy Brisson Date: Fri, 2 Jan 2026 10:11:18 -0600 Subject: [PATCH 3/3] Rename x86_blendv to blendv Now that s390x implements blendv as well, we should refer to the instruction without the x86 prefix. --- cranelift/codegen/meta/src/shared/instructions.rs | 2 +- cranelift/codegen/src/isa/aarch64/mod.rs | 2 +- cranelift/codegen/src/isa/mod.rs | 4 ++-- cranelift/codegen/src/isa/pulley_shared/mod.rs | 2 +- cranelift/codegen/src/isa/riscv64/mod.rs | 2 +- cranelift/codegen/src/isa/s390x/lower.isle | 2 +- cranelift/codegen/src/isa/s390x/mod.rs | 2 +- cranelift/codegen/src/isa/x64/lower.isle | 8 ++++---- cranelift/codegen/src/isa/x64/mod.rs | 2 +- .../filetests/filetests/isa/s390x/icmp-i128-arch15.clif | 2 +- cranelift/fuzzgen/src/function_generator.rs | 2 +- cranelift/interpreter/src/step.rs | 2 +- crates/cranelift/src/func_environ.rs | 4 ++-- crates/cranelift/src/translate/code_translator.rs | 4 ++-- 14 files changed, 20 insertions(+), 20 deletions(-) diff --git a/cranelift/codegen/meta/src/shared/instructions.rs b/cranelift/codegen/meta/src/shared/instructions.rs index 9604c6904bc0..a27ca30aab4a 100644 --- a/cranelift/codegen/meta/src/shared/instructions.rs +++ b/cranelift/codegen/meta/src/shared/instructions.rs @@ -1667,7 +1667,7 @@ pub(crate) fn define( ig.push( Inst::new( - "x86_blendv", + "blendv", r#" A bitselect-lookalike instruction except with the semantics of `blendv`-related instructions on x86. diff --git a/cranelift/codegen/src/isa/aarch64/mod.rs b/cranelift/codegen/src/isa/aarch64/mod.rs index 2f80ad9fa29b..e90a8aecfa57 100644 --- a/cranelift/codegen/src/isa/aarch64/mod.rs +++ b/cranelift/codegen/src/isa/aarch64/mod.rs @@ -227,7 +227,7 @@ impl TargetIsa for AArch64Backend { true } - fn has_x86_blendv_lowering(&self, _: Type) -> bool { + fn has_blendv_lowering(&self, _: Type) -> bool { false } diff --git a/cranelift/codegen/src/isa/mod.rs b/cranelift/codegen/src/isa/mod.rs index 2bc2033983ef..a33e69dee809 100644 --- a/cranelift/codegen/src/isa/mod.rs +++ b/cranelift/codegen/src/isa/mod.rs @@ -391,9 +391,9 @@ pub trait TargetIsa: fmt::Display + Send + Sync { /// Returns whether this ISA has instructions for `ceil`, `floor`, etc. fn has_round(&self) -> bool; - /// Returns whether the CLIF `x86_blendv` instruction is implemented for + /// Returns whether the CLIF `blendv` instruction is implemented for /// this ISA for the specified type. - fn has_x86_blendv_lowering(&self, ty: Type) -> bool; + fn has_blendv_lowering(&self, ty: Type) -> bool; /// Returns whether the CLIF `x86_pshufb` instruction is implemented for /// this ISA. 
diff --git a/cranelift/codegen/src/isa/pulley_shared/mod.rs b/cranelift/codegen/src/isa/pulley_shared/mod.rs index 0b781e467a08..66d0051e9e62 100644 --- a/cranelift/codegen/src/isa/pulley_shared/mod.rs +++ b/cranelift/codegen/src/isa/pulley_shared/mod.rs @@ -232,7 +232,7 @@ where true } - fn has_x86_blendv_lowering(&self, _ty: ir::Type) -> bool { + fn has_blendv_lowering(&self, _ty: ir::Type) -> bool { false } diff --git a/cranelift/codegen/src/isa/riscv64/mod.rs b/cranelift/codegen/src/isa/riscv64/mod.rs index ca0a1a13e2e9..f41700825df2 100644 --- a/cranelift/codegen/src/isa/riscv64/mod.rs +++ b/cranelift/codegen/src/isa/riscv64/mod.rs @@ -205,7 +205,7 @@ impl TargetIsa for Riscv64Backend { true } - fn has_x86_blendv_lowering(&self, _: Type) -> bool { + fn has_blendv_lowering(&self, _: Type) -> bool { false } diff --git a/cranelift/codegen/src/isa/s390x/lower.isle b/cranelift/codegen/src/isa/s390x/lower.isle index 03e3a8dceb56..f92bdd01199f 100644 --- a/cranelift/codegen/src/isa/s390x/lower.isle +++ b/cranelift/codegen/src/isa/s390x/lower.isle @@ -2598,7 +2598,7 @@ ;;;; Rules for `blendv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(rule 1 (lower (has_type (and (ty_vec128 ty) (vxrs_ext3_enabled)) (x86_blendv p x y))) +(rule 1 (lower (has_type (and (ty_vec128 ty) (vxrs_ext3_enabled)) (blendv p x y))) (vec_blend ty p x y)) diff --git a/cranelift/codegen/src/isa/s390x/mod.rs b/cranelift/codegen/src/isa/s390x/mod.rs index 5af31576855f..1f2c2461ee02 100644 --- a/cranelift/codegen/src/isa/s390x/mod.rs +++ b/cranelift/codegen/src/isa/s390x/mod.rs @@ -185,7 +185,7 @@ impl TargetIsa for S390xBackend { true } - fn has_x86_blendv_lowering(&self, _: Type) -> bool { + fn has_blendv_lowering(&self, _: Type) -> bool { self.isa_flags.has_vxrs_ext3() } diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index e8922537bdd2..ca96b7830fa1 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -1558,20 +1558,20 @@ (b Xmm (sse_and ty c_neg f))) (sse_or ty a b))) -;;;; Rules for `x86_blendv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; Rules for `blendv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type $I8X16 - (x86_blendv condition if_true if_false))) + (blendv condition if_true if_false))) (if-let true (has_sse41)) (x64_pblendvb if_false if_true condition)) (rule (lower (has_type $I32X4 - (x86_blendv condition if_true if_false))) + (blendv condition if_true if_false))) (if-let true (has_sse41)) (x64_blendvps if_false if_true condition)) (rule (lower (has_type $I64X2 - (x86_blendv condition if_true if_false))) + (blendv condition if_true if_false))) (if-let true (has_sse41)) (x64_blendvpd if_false if_true condition)) diff --git a/cranelift/codegen/src/isa/x64/mod.rs b/cranelift/codegen/src/isa/x64/mod.rs index 2559a97b6863..b47b33178431 100644 --- a/cranelift/codegen/src/isa/x64/mod.rs +++ b/cranelift/codegen/src/isa/x64/mod.rs @@ -179,7 +179,7 @@ impl TargetIsa for X64Backend { self.x64_flags.has_sse41() } - fn has_x86_blendv_lowering(&self, ty: Type) -> bool { + fn has_blendv_lowering(&self, ty: Type) -> bool { // The `blendvpd`, `blendvps`, and `pblendvb` instructions are all only // available from SSE 4.1 and onwards. 
Otherwise the i16x8 type has no // equivalent instruction which only looks at the top bit for a select diff --git a/cranelift/filetests/filetests/isa/s390x/icmp-i128-arch15.clif b/cranelift/filetests/filetests/isa/s390x/icmp-i128-arch15.clif index 2c04cdf8ce7f..4a97ff856868 100644 --- a/cranelift/filetests/filetests/isa/s390x/icmp-i128-arch15.clif +++ b/cranelift/filetests/filetests/isa/s390x/icmp-i128-arch15.clif @@ -243,7 +243,7 @@ block0(v0: i128, v1: i128): function %f4(i8x16, i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16, v2: i8x16): - v3 = x86_blendv v0, v1, v2 + v3 = blendv v0, v1, v2 return v3 } diff --git a/cranelift/fuzzgen/src/function_generator.rs b/cranelift/fuzzgen/src/function_generator.rs index d53db22a8f52..4758af801b1d 100644 --- a/cranelift/fuzzgen/src/function_generator.rs +++ b/cranelift/fuzzgen/src/function_generator.rs @@ -912,7 +912,7 @@ static OPCODE_SIGNATURES: LazyLock> = LazyLock::new(|| { (Opcode::GetFramePointer), (Opcode::GetStackPointer), (Opcode::GetReturnAddress), - (Opcode::X86Blendv), + (Opcode::Blendv), (Opcode::IcmpImm), (Opcode::X86Pmulhrsw), (Opcode::IaddImm), diff --git a/cranelift/interpreter/src/step.rs b/cranelift/interpreter/src/step.rs index 8a7fad2323d9..3476305c9f83 100644 --- a/cranelift/interpreter/src/step.rs +++ b/cranelift/interpreter/src/step.rs @@ -1311,7 +1311,7 @@ where Opcode::GetStackPointer => unimplemented!("GetStackPointer"), Opcode::GetReturnAddress => unimplemented!("GetReturnAddress"), Opcode::X86Pshufb => unimplemented!("X86Pshufb"), - Opcode::X86Blendv => unimplemented!("X86Blendv"), + Opcode::Blendv => unimplemented!("Blendv"), Opcode::X86Pmulhrsw => unimplemented!("X86Pmulhrsw"), Opcode::X86Pmaddubsw => unimplemented!("X86Pmaddubsw"), Opcode::X86Cvtt2dq => unimplemented!("X86Cvtt2dq"), diff --git a/crates/cranelift/src/func_environ.rs b/crates/cranelift/src/func_environ.rs index eb9581f9e3de..028e7bd40489 100644 --- a/crates/cranelift/src/func_environ.rs +++ b/crates/cranelift/src/func_environ.rs @@ -3994,8 +3994,8 @@ impl FuncEnvironment<'_> { .returns() } - pub fn use_x86_blendv_for_relaxed_laneselect(&self, ty: Type) -> bool { - self.isa.has_x86_blendv_lowering(ty) + pub fn use_blendv_for_relaxed_laneselect(&self, ty: Type) -> bool { + self.isa.has_blendv_lowering(ty) } pub fn use_x86_pmulhrsw_for_relaxed_q15mul(&self) -> bool { diff --git a/crates/cranelift/src/translate/code_translator.rs b/crates/cranelift/src/translate/code_translator.rs index e2dc4b539c8c..b7565bd24e12 100644 --- a/crates/cranelift/src/translate/code_translator.rs +++ b/crates/cranelift/src/translate/code_translator.rs @@ -2481,13 +2481,13 @@ pub fn translate_operator( // op. environ.stacks.push1( if environ.relaxed_simd_deterministic() - || !environ.use_x86_blendv_for_relaxed_laneselect(ty) + || !environ.use_blendv_for_relaxed_laneselect(ty) { // Deterministic semantics are a `bitselect` along the lines // of the wasm `v128.bitselect` instruction. builder.ins().bitselect(c, a, b) } else { - builder.ins().x86_blendv(c, a, b) + builder.ins().blendv(c, a, b) }, ); }
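
A note on the two semantics being chosen between in the final hunk: `bitselect` consumes every bit of the condition vector, while `blendv` keeps the x86 behaviour of consulting only the most significant bit of each condition lane. Below is a minimal stand-alone sketch of that difference on 8-bit lanes, under the semantics described in the instruction documentation above; it is an illustrative model only, not Cranelift or Wasmtime code, and the helper names are invented for the example.

// Illustrative model only -- not Cranelift code.
// bitselect: every bit of `c` picks the corresponding bit of `a` or `b`.
// blendv:    only the top bit of each lane of `c` picks the whole lane.

fn bitselect_i8x16(c: [u8; 16], a: [u8; 16], b: [u8; 16]) -> [u8; 16] {
    core::array::from_fn(|i| (a[i] & c[i]) | (b[i] & !c[i]))
}

fn blendv_i8x16(c: [u8; 16], a: [u8; 16], b: [u8; 16]) -> [u8; 16] {
    core::array::from_fn(|i| if c[i] & 0x80 != 0 { a[i] } else { b[i] })
}

fn main() {
    // A mask with the top bit clear in every lane: blendv takes `b` wholesale,
    // while bitselect still mixes the low seven bits of `a` in.
    let (c, a, b) = ([0x7f; 16], [0xaa; 16], [0x55; 16]);
    assert_eq!(blendv_i8x16(c, a, b), [0x55; 16]);
    assert_eq!(bitselect_i8x16(c, a, b), [0x2a; 16]); // (0xaa & 0x7f) | (0x55 & 0x80)
}

Either result is acceptable for Wasm relaxed laneselect, which is why the deterministic path in translate_operator keeps using `bitselect` and the `blendv` path is only taken when the target reports `has_blendv_lowering`.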