From 5b7866b8fe7f748c38f0fc815c4eee09e522ff68 Mon Sep 17 00:00:00 2001 From: Ulrich Weigand Date: Tue, 25 Nov 2025 17:23:10 -0600 Subject: [PATCH 1/3] s390x: Emit instructions from MIE4 & VXRS_EXT3 on z17 This emits & tests a bunch of instructions: * from Miscellaneous-Instruction-Extensions Facility 4: * CLZ, 64-bit * CTZ, 64-bit * from Vector-Enhancements Facility 3: * 32x4, 64x2 & 128x1 variants of the following: * Divide * Remainder * 64x2 & 128x1 multiply variants * 128x1 variants of: * Compare * CLZ * CTZ * Max * Min * Average * Negation * Evaluate Co-authored-by: Jimmy Brisson --- cranelift/codegen/src/isa/s390x/inst.isle | 180 +++- cranelift/codegen/src/isa/s390x/inst/emit.rs | 116 ++- .../codegen/src/isa/s390x/inst/emit_tests.rs | 461 ++++++++- cranelift/codegen/src/isa/s390x/inst/mod.rs | 138 ++- cranelift/codegen/src/isa/s390x/lower.isle | 430 +++++++- cranelift/codegen/src/isa/s390x/lower/isle.rs | 36 + .../isa/s390x/arithmetic-arch15.clif | 333 ++++++ .../filetests/isa/s390x/bitops-arch15.clif | 326 ++++++ .../filetests/isa/s390x/icmp-i128-arch15.clif | 243 +++++ .../isa/s390x/vec-arithmetic-arch15.clif | 51 + .../isa/s390x/vec-bitwise-arch15.clif | 946 ++++++++++++++++++ 11 files changed, 3178 insertions(+), 82 deletions(-) create mode 100644 cranelift/filetests/filetests/isa/s390x/arithmetic-arch15.clif create mode 100644 cranelift/filetests/filetests/isa/s390x/bitops-arch15.clif create mode 100644 cranelift/filetests/filetests/isa/s390x/icmp-i128-arch15.clif create mode 100644 cranelift/filetests/filetests/isa/s390x/vec-arithmetic-arch15.clif create mode 100644 cranelift/filetests/filetests/isa/s390x/vec-bitwise-arch15.clif diff --git a/cranelift/codegen/src/isa/s390x/inst.isle b/cranelift/codegen/src/isa/s390x/inst.isle index 7ac2e36f839e..db64ed797141 100644 --- a/cranelift/codegen/src/isa/s390x/inst.isle +++ b/cranelift/codegen/src/isa/s390x/inst.isle @@ -607,6 +607,14 @@ (rm Reg) (ra Reg)) + ;; Vector evaluate instruction. + (VecEvaluate + (imm u8) + (rd WritableReg) + (rn Reg) + (rm Reg) + (ra Reg)) + ;; Vector permute doubleword immediate instruction. (VecPermuteDWImm (rd WritableReg) @@ -645,6 +653,13 @@ (rn Reg) (rm Reg)) + ;; Vector integer element comparison with two register sources, + ;; setting the condition code. + (VecIntEltCmp + (op VecIntEltCmpOp) + (rn Reg) + (rm Reg)) + ;; Synthetic instruction to compare signed 128-bit values. ;; Sets CC 1 if rn > rm, sets a different CC otherwise. (VecInt128SCmpHi @@ -1117,6 +1132,8 @@ (PopcntReg) (BSwap32) (BSwap64) + (Clz64) + (Ctz64) )) ;; A shift operation. 
@@ -1170,53 +1187,82 @@ (Sub32x4) (Sub64x2) (Sub128) - ;; Multiplication (64-bit not supported) + ;; Multiplication (Mul8x16) (Mul16x8) (Mul32x4) + (Mul64x2) + (Mul128) (UMulHi8x16) (UMulHi16x8) (UMulHi32x4) + (UMulHi64x2) + (UMulHi128) (SMulHi8x16) (SMulHi16x8) (SMulHi32x4) + (SMulHi64x2) + (SMulHi128) (UMulEven8x16) (UMulEven16x8) (UMulEven32x4) + (UMulEven64x2) (SMulEven8x16) (SMulEven16x8) (SMulEven32x4) + (SMulEven64x2) (UMulOdd8x16) (UMulOdd16x8) (UMulOdd32x4) + (UMulOdd64x2) (SMulOdd8x16) (SMulOdd16x8) (SMulOdd32x4) + (SMulOdd64x2) + ;; Division and remainder + (UDiv32x4) + (UDiv64x2) + (UDiv128) + (URem32x4) + (URem64x2) + (URem128) + (SDiv32x4) + (SDiv64x2) + (SDiv128) + (SRem32x4) + (SRem64x2) + (SRem128) ;; Minimum, maximum, and average (UMax8x16) (UMax16x8) (UMax32x4) (UMax64x2) + (UMax128) (SMax8x16) (SMax16x8) (SMax32x4) (SMax64x2) + (SMax128) (UMin8x16) (UMin16x8) (UMin32x4) (UMin64x2) + (UMin128) (SMin8x16) (SMin16x8) (SMin32x4) (SMin64x2) + (SMin128) (UAvg8x16) (UAvg16x8) (UAvg32x4) (UAvg64x2) + (UAvg128) (SAvg8x16) (SAvg16x8) (SAvg32x4) (SAvg64x2) + (SAvg128) ;; Bitwise operations (And128) (Orr128) @@ -1266,10 +1312,12 @@ (Abs16x8) (Abs32x4) (Abs64x2) + (Abs128) (Neg8x16) (Neg16x8) (Neg32x4) (Neg64x2) + (Neg128) ;; Population count (Popcnt8x16) (Popcnt16x8) @@ -1280,23 +1328,29 @@ (Clz16x8) (Clz32x4) (Clz64x2) + (Clz128) (Ctz8x16) (Ctz16x8) (Ctz32x4) (Ctz64x2) + (Ctz128) ;; Unpack (UnpackULow8x16) (UnpackULow16x8) (UnpackULow32x4) + (UnpackULow64x2) (UnpackUHigh8x16) (UnpackUHigh16x8) (UnpackUHigh32x4) + (UnpackUHigh64x2) (UnpackSLow8x16) (UnpackSLow16x8) (UnpackSLow32x4) + (UnpackSLow64x2) (UnpackSHigh8x16) (UnpackSHigh16x8) (UnpackSHigh32x4) + (UnpackSHigh64x2) )) ;; A vector shift operation. @@ -1327,16 +1381,26 @@ (CmpEq16x8) (CmpEq32x4) (CmpEq64x2) + (CmpEq128) (SCmpHi8x16) (SCmpHi16x8) (SCmpHi32x4) (SCmpHi64x2) + (SCmpHi128) (UCmpHi8x16) (UCmpHi16x8) (UCmpHi32x4) (UCmpHi64x2) + (UCmpHi128) )) +;; An integer vector element comparion operation. +(type VecIntEltCmpOp + (enum + (SCmp128) + (UCmp128) +)); + ;; A floatint-point vector comparison operation. (type VecFloatCmpOp (enum @@ -1493,11 +1557,19 @@ (extern extractor mie3_enabled mie3_enabled) (decl mie3_disabled () Type) (extern extractor mie3_disabled mie3_disabled) +(decl mie4_enabled () Type) +(extern extractor mie4_enabled mie4_enabled) +(decl mie4_disabled () Type) +(extern extractor mie4_disabled mie4_disabled) (decl vxrs_ext2_enabled () Type) (extern extractor vxrs_ext2_enabled vxrs_ext2_enabled) (decl vxrs_ext2_disabled () Type) (extern extractor vxrs_ext2_disabled vxrs_ext2_disabled) +(decl vxrs_ext3_enabled () Type) +(extern extractor vxrs_ext3_enabled vxrs_ext3_enabled) +(decl vxrs_ext3_disabled () Type) +(extern extractor vxrs_ext3_disabled vxrs_ext3_disabled) ;; Helpers for SIMD lane number operations ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -2421,6 +2493,13 @@ (_ Unit (emit (MInst.VecPermute dst src1 src2 src3)))) dst)) +;; Helper for emitting `MInst.VecEvaluate` instructions. +(decl vec_eval (Type u8 Reg Reg Reg) Reg) +(rule (vec_eval ty op src1 src2 src3) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecEvaluate op dst src1 src2 src3)))) + dst)) + ;; Helper for emitting `MInst.VecPermuteDWImm` instructions. 
(decl vec_permute_dw_imm (Type Reg u8 Reg u8) Reg) (rule (vec_permute_dw_imm ty src1 idx1 src2 idx2) @@ -2454,6 +2533,11 @@ (let ((tmp WritableReg (temp_writable_reg ty))) (ProducesFlags.ProducesFlagsSideEffect (MInst.VecFloatCmpS op tmp src1 src2)))) +;; Helper for emitting `MInst.VecIntEltCmp` instructions. +(decl vec_int_elt_cmp (VecIntEltCmpOp Reg Reg) ProducesFlags) +(rule (vec_int_elt_cmp op src1 src2) + (ProducesFlags.ProducesFlagsSideEffect (MInst.VecIntEltCmp op src1 src2))) + ;; Helper for emitting `MInst.VecInt128SCmpHi` instructions. (decl vec_int128_scmphi (Reg Reg) ProducesBool) (rule (vec_int128_scmphi src1 src2) @@ -3618,28 +3702,37 @@ ;; Helpers for generating `clz` and `ctz` instructions ;;;;;;;;;;;;;;;;;;;;;;;;; -;; Count leading zeroes. For a zero input, return the specified value. -(decl clz_reg (i16 Reg) Reg) +;; Count leading zeroes via FLOGR. For a zero input, return the specified value. +(decl clz_flogr_reg (i16 Reg) Reg) ;; The flogr instruction returns 64 for zero input by default. -(rule (clz_reg 64 x) +(rule (clz_flogr_reg 64 x) (let ((dst WritableRegPair (temp_writable_regpair $I64)) (_ Unit (emit (MInst.Flogr dst x)))) (regpair_hi dst))) ;; If another zero return value was requested, we need to override the flogr result. -(rule -1 (clz_reg zeroval x) +(rule -1 (clz_flogr_reg zeroval x) (let ((tmp WritableRegPair (temp_writable_regpair $I64))) (with_flags_reg (ProducesFlags.ProducesFlagsSideEffect (MInst.Flogr tmp x)) (cmov_imm $I64 (intcc_as_cond (IntCC.Equal)) zeroval (regpair_hi tmp))))) +;; Count leading zeros (z17 instruction). +(decl clz_reg (Reg) Reg) +(rule (clz_reg x) (unary_rr $I64 (UnaryOp.Clz64) x)) + +;; Count trailing zeros (z17 instruction). +(decl ctz_reg (Reg) Reg) +(rule (ctz_reg x) (unary_rr $I64 (UnaryOp.Ctz64) x)) + ;; Vector count leading zeros. (decl vecop_clz (Type) VecUnaryOp) (rule (vecop_clz $I8X16) (VecUnaryOp.Clz8x16)) (rule (vecop_clz $I16X8) (VecUnaryOp.Clz16x8)) (rule (vecop_clz $I32X4) (VecUnaryOp.Clz32x4)) (rule (vecop_clz $I64X2) (VecUnaryOp.Clz64x2)) +(rule (vecop_clz $I128) (VecUnaryOp.Clz128)) (decl vec_clz (Type Reg) Reg) (rule (vec_clz ty x) (vec_rr ty (vecop_clz ty) x)) @@ -3650,6 +3743,7 @@ (rule (vecop_ctz $I16X8) (VecUnaryOp.Ctz16x8)) (rule (vecop_ctz $I32X4) (VecUnaryOp.Ctz32x4)) (rule (vecop_ctz $I64X2) (VecUnaryOp.Ctz64x2)) +(rule (vecop_ctz $I128) (VecUnaryOp.Ctz128)) (decl vec_ctz (Type Reg) Reg) (rule (vec_ctz ty x) (vec_rr ty (vecop_ctz ty) x)) @@ -3900,7 +3994,8 @@ (rule (vecop_mul $I8X16) (VecBinaryOp.Mul8x16)) (rule (vecop_mul $I16X8) (VecBinaryOp.Mul16x8)) (rule (vecop_mul $I32X4) (VecBinaryOp.Mul32x4)) -;; No support for $I64X2 multiplication. +(rule (vecop_mul $I64X2) (VecBinaryOp.Mul64x2)) +(rule (vecop_mul $I128) (VecBinaryOp.Mul128)) (decl vec_mul (Type Reg Reg) Reg) (rule (vec_mul ty x y) (vec_rrr ty (vecop_mul ty) x y)) @@ -3909,7 +4004,8 @@ (rule (vecop_umulhi $I8X16) (VecBinaryOp.UMulHi8x16)) (rule (vecop_umulhi $I16X8) (VecBinaryOp.UMulHi16x8)) (rule (vecop_umulhi $I32X4) (VecBinaryOp.UMulHi32x4)) -;; No support for $I64X2 multiplication. +(rule (vecop_umulhi $I64X2) (VecBinaryOp.UMulHi64x2)) +(rule (vecop_umulhi $I128) (VecBinaryOp.UMulHi128)) (decl vec_umulhi (Type Reg Reg) Reg) (rule (vec_umulhi ty x y) (vec_rrr ty (vecop_umulhi ty) x y)) @@ -3918,7 +4014,8 @@ (rule (vecop_smulhi $I8X16) (VecBinaryOp.SMulHi8x16)) (rule (vecop_smulhi $I16X8) (VecBinaryOp.SMulHi16x8)) (rule (vecop_smulhi $I32X4) (VecBinaryOp.SMulHi32x4)) -;; No support for $I64X2 multiplication. 
+(rule (vecop_smulhi $I64X2) (VecBinaryOp.SMulHi64x2)) +(rule (vecop_smulhi $I128) (VecBinaryOp.SMulHi128)) (decl vec_smulhi (Type Reg Reg) Reg) (rule (vec_smulhi ty x y) (vec_rrr ty (vecop_smulhi ty) x y)) @@ -3927,7 +4024,7 @@ (rule (vecop_umul_even $I8X16) (VecBinaryOp.UMulEven8x16)) (rule (vecop_umul_even $I16X8) (VecBinaryOp.UMulEven16x8)) (rule (vecop_umul_even $I32X4) (VecBinaryOp.UMulEven32x4)) -;; No support for $I64X2 multiplication. +(rule (vecop_umul_even $I64X2) (VecBinaryOp.UMulEven64x2)) (decl vec_umul_even (Type Reg Reg) Reg) (rule (vec_umul_even ty x y) (vec_rrr ty (vecop_umul_even ty) x y)) @@ -3936,7 +4033,7 @@ (rule (vecop_smul_even $I8X16) (VecBinaryOp.SMulEven8x16)) (rule (vecop_smul_even $I16X8) (VecBinaryOp.SMulEven16x8)) (rule (vecop_smul_even $I32X4) (VecBinaryOp.SMulEven32x4)) -;; No support for $I64X2 multiplication. +(rule (vecop_smul_even $I64X2) (VecBinaryOp.SMulEven64x2)) (decl vec_smul_even (Type Reg Reg) Reg) (rule (vec_smul_even ty x y) (vec_rrr ty (vecop_smul_even ty) x y)) @@ -3945,7 +4042,7 @@ (rule (vecop_umul_odd $I8X16) (VecBinaryOp.UMulOdd8x16)) (rule (vecop_umul_odd $I16X8) (VecBinaryOp.UMulOdd16x8)) (rule (vecop_umul_odd $I32X4) (VecBinaryOp.UMulOdd32x4)) -;; No support for $I64X2 multiplication. +(rule (vecop_umul_odd $I64X2) (VecBinaryOp.UMulOdd64x2)) (decl vec_umul_odd (Type Reg Reg) Reg) (rule (vec_umul_odd ty x y) (vec_rrr ty (vecop_umul_odd ty) x y)) @@ -3954,12 +4051,47 @@ (rule (vecop_smul_odd $I8X16) (VecBinaryOp.SMulOdd8x16)) (rule (vecop_smul_odd $I16X8) (VecBinaryOp.SMulOdd16x8)) (rule (vecop_smul_odd $I32X4) (VecBinaryOp.SMulOdd32x4)) -;; No support for $I64X2 multiplication. +(rule (vecop_smul_odd $I64X2) (VecBinaryOp.SMulOdd64x2)) (decl vec_smul_odd (Type Reg Reg) Reg) (rule (vec_smul_odd ty x y) (vec_rrr ty (vecop_smul_odd ty) x y)) +;; Helpers for generating vector divide instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(decl vecop_sdiv (Type) VecBinaryOp) +(rule (vecop_sdiv $I32X4) (VecBinaryOp.SDiv32x4)) +(rule (vecop_sdiv $I64X2) (VecBinaryOp.SDiv64x2)) +(rule (vecop_sdiv $I128) (VecBinaryOp.SDiv128)) + +(decl vec_sdiv (Type Reg Reg) Reg) +(rule (vec_sdiv ty x y) (vec_rrr ty (vecop_sdiv ty) x y)) + +(decl vecop_udiv (Type) VecBinaryOp) +(rule (vecop_udiv $I32X4) (VecBinaryOp.UDiv32x4)) +(rule (vecop_udiv $I64X2) (VecBinaryOp.UDiv64x2)) +(rule (vecop_udiv $I128) (VecBinaryOp.UDiv128)) + +(decl vec_udiv (Type Reg Reg) Reg) +(rule (vec_udiv ty x y) (vec_rrr ty (vecop_udiv ty) x y)) + +(decl vecop_srem (Type) VecBinaryOp) +(rule (vecop_srem $I32X4) (VecBinaryOp.SRem32x4)) +(rule (vecop_srem $I64X2) (VecBinaryOp.SRem64x2)) +(rule (vecop_srem $I128) (VecBinaryOp.SRem128)) + +(decl vec_srem (Type Reg Reg) Reg) +(rule (vec_srem ty x y) (vec_rrr ty (vecop_srem ty) x y)) + +(decl vecop_urem (Type) VecBinaryOp) +(rule (vecop_urem $I32X4) (VecBinaryOp.URem32x4)) +(rule (vecop_urem $I64X2) (VecBinaryOp.URem64x2)) +(rule (vecop_urem $I128) (VecBinaryOp.URem128)) + +(decl vec_urem (Type Reg Reg) Reg) +(rule (vec_urem ty x y) (vec_rrr ty (vecop_urem ty) x y)) + + ;; Helpers for generating `udivmod` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (decl udivmod (Type RegPair Reg) RegPair) @@ -3981,6 +4113,7 @@ (rule (vecop_umax $I16X8) (VecBinaryOp.UMax16x8)) (rule (vecop_umax $I32X4) (VecBinaryOp.UMax32x4)) (rule (vecop_umax $I64X2) (VecBinaryOp.UMax64x2)) +(rule (vecop_umax $I128) (VecBinaryOp.UMax128)) (decl vec_umax (Type Reg Reg) Reg) (rule (vec_umax ty x y) (vec_rrr ty (vecop_umax ty) x y)) @@ -3993,6 +4126,7 @@ (rule (vecop_smax $I16X8) 
(VecBinaryOp.SMax16x8)) (rule (vecop_smax $I32X4) (VecBinaryOp.SMax32x4)) (rule (vecop_smax $I64X2) (VecBinaryOp.SMax64x2)) +(rule (vecop_smax $I128) (VecBinaryOp.SMax128)) (decl vec_smax (Type Reg Reg) Reg) (rule (vec_smax ty x y) (vec_rrr ty (vecop_smax ty) x y)) @@ -4005,6 +4139,7 @@ (rule (vecop_umin $I16X8) (VecBinaryOp.UMin16x8)) (rule (vecop_umin $I32X4) (VecBinaryOp.UMin32x4)) (rule (vecop_umin $I64X2) (VecBinaryOp.UMin64x2)) +(rule (vecop_umin $I128) (VecBinaryOp.UMin128)) (decl vec_umin (Type Reg Reg) Reg) (rule (vec_umin ty x y) (vec_rrr ty (vecop_umin ty) x y)) @@ -4017,6 +4152,7 @@ (rule (vecop_smin $I16X8) (VecBinaryOp.SMin16x8)) (rule (vecop_smin $I32X4) (VecBinaryOp.SMin32x4)) (rule (vecop_smin $I64X2) (VecBinaryOp.SMin64x2)) +(rule (vecop_smin $I128) (VecBinaryOp.SMin128)) (decl vec_smin (Type Reg Reg) Reg) (rule (vec_smin ty x y) (vec_rrr ty (vecop_smin ty) x y)) @@ -4213,6 +4349,7 @@ (rule (vecop_abs $I16X8) (VecUnaryOp.Abs16x8)) (rule (vecop_abs $I32X4) (VecUnaryOp.Abs32x4)) (rule (vecop_abs $I64X2) (VecUnaryOp.Abs64x2)) +(rule (vecop_abs $I128) (VecUnaryOp.Abs128)) (decl vec_abs (Type Reg) Reg) (rule (vec_abs ty x) (vec_rr ty (vecop_abs ty) x)) @@ -4240,6 +4377,7 @@ (rule (vecop_neg $I16X8) (VecUnaryOp.Neg16x8)) (rule (vecop_neg $I32X4) (VecUnaryOp.Neg32x4)) (rule (vecop_neg $I64X2) (VecUnaryOp.Neg64x2)) +(rule (vecop_neg $I128) (VecUnaryOp.Neg128)) (decl vec_neg (Type Reg) Reg) (rule (vec_neg ty x) (vec_rr ty (vecop_neg ty) x)) @@ -4871,9 +5009,9 @@ (rule (vecop_int_cmpeq (multi_lane 64 2)) (VecIntCmpOp.CmpEq64x2)) (decl vec_cmpeq (Type Reg Reg) Reg) -(rule (vec_cmpeq (ty_vec128 ty) x y) (vec_int_cmp ty (vecop_int_cmpeq ty) x y)) +(rule (vec_cmpeq (vr128_ty ty) x y) (vec_int_cmp ty (vecop_int_cmpeq ty) x y)) (decl vec_cmpeqs (Type Reg Reg) ProducesFlags) -(rule (vec_cmpeqs (ty_vec128 ty) x y) (vec_int_cmps ty (vecop_int_cmpeq ty) x y)) +(rule (vec_cmpeqs (vr128_ty ty) x y) (vec_int_cmps ty (vecop_int_cmpeq ty) x y)) (decl vecop_int_cmph (Type) VecIntCmpOp) (rule (vecop_int_cmph (multi_lane 8 16)) (VecIntCmpOp.SCmpHi8x16)) @@ -4882,9 +5020,9 @@ (rule (vecop_int_cmph (multi_lane 64 2)) (VecIntCmpOp.SCmpHi64x2)) (decl vec_cmph (Type Reg Reg) Reg) -(rule (vec_cmph (ty_vec128 ty) x y) (vec_int_cmp ty (vecop_int_cmph ty) x y)) +(rule (vec_cmph (vr128_ty ty) x y) (vec_int_cmp ty (vecop_int_cmph ty) x y)) (decl vec_cmphs (Type Reg Reg) ProducesFlags) -(rule (vec_cmphs (ty_vec128 ty) x y) (vec_int_cmps ty (vecop_int_cmph ty) x y)) +(rule (vec_cmphs (vr128_ty ty) x y) (vec_int_cmps ty (vecop_int_cmph ty) x y)) (decl vecop_int_cmphl (Type) VecIntCmpOp) (rule (vecop_int_cmphl (multi_lane 8 16)) (VecIntCmpOp.UCmpHi8x16)) @@ -4893,9 +5031,15 @@ (rule (vecop_int_cmphl (multi_lane 64 2)) (VecIntCmpOp.UCmpHi64x2)) (decl vec_cmphl (Type Reg Reg) Reg) -(rule (vec_cmphl (ty_vec128 ty) x y) (vec_int_cmp ty (vecop_int_cmphl ty) x y)) +(rule (vec_cmphl (vr128_ty ty) x y) (vec_int_cmp ty (vecop_int_cmphl ty) x y)) (decl vec_cmphls (Type Reg Reg) ProducesFlags) -(rule (vec_cmphls (ty_vec128 ty) x y) (vec_int_cmps ty (vecop_int_cmphl ty) x y)) +(rule (vec_cmphls (vr128_ty ty) x y) (vec_int_cmps ty (vecop_int_cmphl ty) x y)) + +(decl vec_elt_icmps (Reg Reg) ProducesFlags) +(rule (vec_elt_icmps x y) (vec_int_elt_cmp (VecIntEltCmpOp.SCmp128) x y)) + +(decl vec_elt_icmpu (Reg Reg) ProducesFlags) +(rule (vec_elt_icmpu x y) (vec_int_elt_cmp (VecIntEltCmpOp.UCmp128) x y)) ;; Helpers for generating `fcmp` instructions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git 
a/cranelift/codegen/src/isa/s390x/inst/emit.rs b/cranelift/codegen/src/isa/s390x/inst/emit.rs index 5baef0fede8a..944f63ed5842 100644 --- a/cranelift/codegen/src/isa/s390x/inst/emit.rs +++ b/cranelift/codegen/src/isa/s390x/inst/emit.rs @@ -1072,6 +1072,31 @@ fn enc_vri_c(opcode: u16, v1: Reg, i2: u16, v3: Reg, m4: u8) -> [u8; 6] { enc } +/// VRIk-type instructions. +/// +/// 47 39 35 31 27 23 15 11 7 +/// opcode1 v1 v2 v3 - i5 v4 rxb opcode2 +/// 40 36 32 28 24 16 12 8 0 +/// +fn enc_vri_k(opcode: u16, i5: u8, v1: Reg, v2: Reg, v3: Reg, v4: Reg) -> [u8; 6] { + let opcode1 = ((opcode >> 8) & 0xff) as u8; + let opcode2 = (opcode & 0xff) as u8; + let rxb = rxb(Some(v1), Some(v2), Some(v3), Some(v4)); + let v1 = machreg_to_vr(v1) & 0x0f; + let v2 = machreg_to_vr(v2) & 0x0f; + let v3 = machreg_to_vr(v3) & 0x0f; + let v4 = machreg_to_vr(v4) & 0x0f; + + let mut enc: [u8; 6] = [0; 6]; + enc[0] = opcode1; + enc[1] = v1 << 4 | v2; + enc[2] = v3 << 4; + enc[3] = i5; + enc[4] = v4 << 4 | rxb; + enc[5] = opcode2; + enc +} + /// VRRa-type instructions. /// /// 47 39 35 31 23 19 15 11 7 @@ -1437,8 +1462,12 @@ impl Inst { InstructionSet::Base => true, // Miscellaneous-Instruction-Extensions Facility 3 (z15) InstructionSet::MIE3 => emit_info.isa_flags.has_mie3(), + // Miscellaneous-Instruction-Extensions Facility 4 (z17) + InstructionSet::MIE4 => emit_info.isa_flags.has_mie4(), // Vector-Enhancements Facility 2 (z15) InstructionSet::VXRS_EXT2 => emit_info.isa_flags.has_vxrs_ext2(), + // Vector-Enhancements Facility 3 (z17) + InstructionSet::VXRS_EXT3 => emit_info.isa_flags.has_vxrs_ext3(), } }; let isa_requirements = self.available_in_isa(); @@ -1884,6 +1913,14 @@ impl Inst { let opcode = 0xb90f; // LRVRG put(sink, &enc_rre(opcode, rd.to_reg(), rn)); } + UnaryOp::Clz64 => { + let opcode = 0xb968; // CLZG + put(sink, &enc_rre(opcode, rd.to_reg(), rn)); + } + UnaryOp::Ctz64 => { + let opcode = 0xb969; // CTZG + put(sink, &enc_rre(opcode, rd.to_reg(), rn)); + } } } @@ -2663,48 +2700,76 @@ impl Inst { VecBinaryOp::Mul8x16 => (0xe7a2, 0), // VMLB VecBinaryOp::Mul16x8 => (0xe7a2, 1), // VMLHW VecBinaryOp::Mul32x4 => (0xe7a2, 2), // VMLF + VecBinaryOp::Mul64x2 => (0xe7a2, 3), // VMLG + VecBinaryOp::Mul128 => (0xe7a2, 4), // VMLQ VecBinaryOp::UMulHi8x16 => (0xe7a1, 0), // VMLHB VecBinaryOp::UMulHi16x8 => (0xe7a1, 1), // VMLHH VecBinaryOp::UMulHi32x4 => (0xe7a1, 2), // VMLHF + VecBinaryOp::UMulHi64x2 => (0xe7a1, 3), // VMLHG + VecBinaryOp::UMulHi128 => (0xe7a1, 4), // VMLHQ VecBinaryOp::SMulHi8x16 => (0xe7a3, 0), // VMHB VecBinaryOp::SMulHi16x8 => (0xe7a3, 1), // VMHH VecBinaryOp::SMulHi32x4 => (0xe7a3, 2), // VMHF + VecBinaryOp::SMulHi64x2 => (0xe7a3, 3), // VMHG + VecBinaryOp::SMulHi128 => (0xe7a3, 4), // VMHQ VecBinaryOp::UMulEven8x16 => (0xe7a4, 0), // VMLEB VecBinaryOp::UMulEven16x8 => (0xe7a4, 1), // VMLEH VecBinaryOp::UMulEven32x4 => (0xe7a4, 2), // VMLEF + VecBinaryOp::UMulEven64x2 => (0xe7a4, 3), // VMLEG VecBinaryOp::SMulEven8x16 => (0xe7a6, 0), // VMEB VecBinaryOp::SMulEven16x8 => (0xe7a6, 1), // VMEH VecBinaryOp::SMulEven32x4 => (0xe7a6, 2), // VMEF + VecBinaryOp::SMulEven64x2 => (0xe7a6, 3), // VMEG VecBinaryOp::UMulOdd8x16 => (0xe7a5, 0), // VMLOB VecBinaryOp::UMulOdd16x8 => (0xe7a5, 1), // VMLOH VecBinaryOp::UMulOdd32x4 => (0xe7a5, 2), // VMLOF + VecBinaryOp::UMulOdd64x2 => (0xe7a5, 3), // VMLOG VecBinaryOp::SMulOdd8x16 => (0xe7a7, 0), // VMOB VecBinaryOp::SMulOdd16x8 => (0xe7a7, 1), // VMOH VecBinaryOp::SMulOdd32x4 => (0xe7a7, 2), // VMOF + VecBinaryOp::SMulOdd64x2 => (0xe7a7, 3), // VMOG + 
VecBinaryOp::UDiv32x4 => (0xe7b0, 2), // VDLF + VecBinaryOp::UDiv64x2 => (0xe7b0, 3), // VDLG + VecBinaryOp::UDiv128 => (0xe7b0, 4), // VDLQ + VecBinaryOp::SDiv32x4 => (0xe7b2, 2), // VDF + VecBinaryOp::SDiv64x2 => (0xe7b2, 3), // VDG + VecBinaryOp::SDiv128 => (0xe7b2, 4), // VDQ + VecBinaryOp::URem32x4 => (0xe7b1, 2), // VRLF + VecBinaryOp::URem64x2 => (0xe7b1, 3), // VRLG + VecBinaryOp::URem128 => (0xe7b1, 4), // VRLQ + VecBinaryOp::SRem32x4 => (0xe7b3, 2), // VRF + VecBinaryOp::SRem64x2 => (0xe7b3, 3), // VRG + VecBinaryOp::SRem128 => (0xe7b3, 4), // VRQ VecBinaryOp::UMax8x16 => (0xe7fd, 0), // VMXLB VecBinaryOp::UMax16x8 => (0xe7fd, 1), // VMXLH VecBinaryOp::UMax32x4 => (0xe7fd, 2), // VMXLF VecBinaryOp::UMax64x2 => (0xe7fd, 3), // VMXLG + VecBinaryOp::UMax128 => (0xe7fd, 4), // VMXLQ VecBinaryOp::SMax8x16 => (0xe7ff, 0), // VMXB VecBinaryOp::SMax16x8 => (0xe7ff, 1), // VMXH VecBinaryOp::SMax32x4 => (0xe7ff, 2), // VMXF VecBinaryOp::SMax64x2 => (0xe7ff, 3), // VMXG + VecBinaryOp::SMax128 => (0xe7ff, 4), // VMXQ VecBinaryOp::UMin8x16 => (0xe7fc, 0), // VMNLB VecBinaryOp::UMin16x8 => (0xe7fc, 1), // VMNLH VecBinaryOp::UMin32x4 => (0xe7fc, 2), // VMNLF VecBinaryOp::UMin64x2 => (0xe7fc, 3), // VMNLG + VecBinaryOp::UMin128 => (0xe7fc, 4), // VMNLQ VecBinaryOp::SMin8x16 => (0xe7fe, 0), // VMNB VecBinaryOp::SMin16x8 => (0xe7fe, 1), // VMNH VecBinaryOp::SMin32x4 => (0xe7fe, 2), // VMNF VecBinaryOp::SMin64x2 => (0xe7fe, 3), // VMNG + VecBinaryOp::SMin128 => (0xe7fe, 4), // VMNQ VecBinaryOp::UAvg8x16 => (0xe7f0, 0), // VAVGLB VecBinaryOp::UAvg16x8 => (0xe7f0, 1), // VAVGLH VecBinaryOp::UAvg32x4 => (0xe7f0, 2), // VAVGLF VecBinaryOp::UAvg64x2 => (0xe7f0, 3), // VAVGLG + VecBinaryOp::UAvg128 => (0xe7f0, 4), // VAVGLQ VecBinaryOp::SAvg8x16 => (0xe7f2, 0), // VAVGB VecBinaryOp::SAvg16x8 => (0xe7f2, 1), // VAVGH VecBinaryOp::SAvg32x4 => (0xe7f2, 2), // VAVGF VecBinaryOp::SAvg64x2 => (0xe7f2, 3), // VAVGG + VecBinaryOp::SAvg128 => (0xe7f2, 4), // VAVGQ VecBinaryOp::And128 => (0xe768, 0), // VN VecBinaryOp::Orr128 => (0xe76a, 0), // VO VecBinaryOp::Xor128 => (0xe76d, 0), // VX @@ -2739,7 +2804,27 @@ impl Inst { VecBinaryOp::MergeHigh64x2 => (0xe761, 3), // VMRHG }; - put(sink, &enc_vrr_c(opcode, rd.to_reg(), rn, rm, m4, 0, 0)); + let enc = &enc_vrr_c(opcode, rd.to_reg(), rn, rm, m4, 0, 0); + let may_trap = match op { + VecBinaryOp::UDiv32x4 + | VecBinaryOp::UDiv64x2 + | VecBinaryOp::UDiv128 + | VecBinaryOp::SDiv32x4 + | VecBinaryOp::SDiv64x2 + | VecBinaryOp::SDiv128 + | VecBinaryOp::URem32x4 + | VecBinaryOp::URem64x2 + | VecBinaryOp::URem128 + | VecBinaryOp::SRem32x4 + | VecBinaryOp::SRem64x2 + | VecBinaryOp::SRem128 => true, + _ => false, + }; + if may_trap { + put_with_trap(sink, enc, TrapCode::INTEGER_DIVISION_BY_ZERO); + } else { + put(sink, enc); + } } &Inst::VecRR { op, rd, rn } => { let (opcode, m3) = match op { @@ -2747,10 +2832,12 @@ impl Inst { VecUnaryOp::Abs16x8 => (0xe7df, 1), // VLPH VecUnaryOp::Abs32x4 => (0xe7df, 2), // VLPF VecUnaryOp::Abs64x2 => (0xe7df, 3), // VLPG + VecUnaryOp::Abs128 => (0xe7df, 4), // VLPQ VecUnaryOp::Neg8x16 => (0xe7de, 0), // VLCB VecUnaryOp::Neg16x8 => (0xe7de, 1), // VLCH VecUnaryOp::Neg32x4 => (0xe7de, 2), // VLCF VecUnaryOp::Neg64x2 => (0xe7de, 3), // VLCG + VecUnaryOp::Neg128 => (0xe7de, 4), // VLCQ VecUnaryOp::Popcnt8x16 => (0xe750, 0), // VPOPCTB VecUnaryOp::Popcnt16x8 => (0xe750, 1), // VPOPCTH VecUnaryOp::Popcnt32x4 => (0xe750, 2), // VPOPCTF @@ -2759,22 +2846,28 @@ impl Inst { VecUnaryOp::Clz16x8 => (0xe753, 1), // VCLZH VecUnaryOp::Clz32x4 => (0xe753, 
2), // VCLZF VecUnaryOp::Clz64x2 => (0xe753, 3), // VCLZG + VecUnaryOp::Clz128 => (0xe753, 4), // VCLZQ VecUnaryOp::Ctz8x16 => (0xe752, 0), // VCTZB VecUnaryOp::Ctz16x8 => (0xe752, 1), // VCTZH VecUnaryOp::Ctz32x4 => (0xe752, 2), // VCTZF VecUnaryOp::Ctz64x2 => (0xe752, 3), // VCTZG + VecUnaryOp::Ctz128 => (0xe752, 4), // VCTZQ VecUnaryOp::UnpackULow8x16 => (0xe7d4, 0), // VUPLLB VecUnaryOp::UnpackULow16x8 => (0xe7d4, 1), // VUPLLH VecUnaryOp::UnpackULow32x4 => (0xe7d4, 2), // VUPLLF + VecUnaryOp::UnpackULow64x2 => (0xe7d4, 3), // VUPLLG VecUnaryOp::UnpackUHigh8x16 => (0xe7d5, 0), // VUPLHB VecUnaryOp::UnpackUHigh16x8 => (0xe7d5, 1), // VUPLHH VecUnaryOp::UnpackUHigh32x4 => (0xe7d5, 2), // VUPLHF + VecUnaryOp::UnpackUHigh64x2 => (0xe7d5, 3), // VUPLHG VecUnaryOp::UnpackSLow8x16 => (0xe7d6, 0), // VUPLB VecUnaryOp::UnpackSLow16x8 => (0xe7d6, 1), // VUPLH VecUnaryOp::UnpackSLow32x4 => (0xe7d6, 2), // VUPLF + VecUnaryOp::UnpackSLow64x2 => (0xe7d6, 3), // VUPLG VecUnaryOp::UnpackSHigh8x16 => (0xe7d7, 0), // VUPHB VecUnaryOp::UnpackSHigh16x8 => (0xe7d7, 1), // VUPHH VecUnaryOp::UnpackSHigh32x4 => (0xe7d7, 2), // VUPHF + VecUnaryOp::UnpackSHigh64x2 => (0xe7d7, 3), // VUPHG }; put(sink, &enc_vrr_a(opcode, rd.to_reg(), rn, m3, 0, 0)); @@ -2817,6 +2910,16 @@ impl Inst { let opcode = 0xe78c; // VPERM put(sink, &enc_vrr_e(opcode, rd.to_reg(), rn, rm, ra, 0, 0)); } + &Inst::VecEvaluate { + imm, + rd, + rn, + rm, + ra, + } => { + let opcode = 0xe788; //VEVAL + put(sink, &enc_vri_k(opcode, imm, rd.to_reg(), rn, rm, ra)); + } &Inst::VecPermuteDWImm { rd, rn, @@ -2835,14 +2938,17 @@ impl Inst { VecIntCmpOp::CmpEq16x8 => (0xe7f8, 1), // VCEQH VecIntCmpOp::CmpEq32x4 => (0xe7f8, 2), // VCEQF VecIntCmpOp::CmpEq64x2 => (0xe7f8, 3), // VCEQG + VecIntCmpOp::CmpEq128 => (0xe7f8, 4), // VCEQQ VecIntCmpOp::SCmpHi8x16 => (0xe7fb, 0), // VCHB VecIntCmpOp::SCmpHi16x8 => (0xe7fb, 1), // VCHH VecIntCmpOp::SCmpHi32x4 => (0xe7fb, 2), // VCHG VecIntCmpOp::SCmpHi64x2 => (0xe7fb, 3), // VCHG + VecIntCmpOp::SCmpHi128 => (0xe7fb, 4), // VCHQ VecIntCmpOp::UCmpHi8x16 => (0xe7f9, 0), // VCHLB VecIntCmpOp::UCmpHi16x8 => (0xe7f9, 1), // VCHLH VecIntCmpOp::UCmpHi32x4 => (0xe7f9, 2), // VCHLG VecIntCmpOp::UCmpHi64x2 => (0xe7f9, 3), // VCHLG + VecIntCmpOp::UCmpHi128 => (0xe7f9, 4), // VCHLQ }; let m5 = match self { &Inst::VecIntCmp { .. } => 0, @@ -2869,6 +2975,14 @@ impl Inst { put(sink, &enc_vrr_c(opcode, rd.to_reg(), rn, rm, m4, 0, m6)); } + &Inst::VecIntEltCmp { op, rn, rm } => { + let (opcode, m3) = match op { + VecIntEltCmpOp::SCmp128 => (0xe7db, 4), // VECQ + VecIntEltCmpOp::UCmp128 => (0xe7d9, 4), // VECLQ + }; + + put(sink, &enc_vrr_a(opcode, rn, rm, m3, 0, 0)); + } &Inst::VecInt128SCmpHi { tmp, rn, rm } | &Inst::VecInt128UCmpHi { tmp, rn, rm } => { // Synthetic instruction to compare 128-bit values. // Sets CC 1 if rn > rm, sets a different CC otherwise. 
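As a cross-check of the VRI-k layout implemented by enc_vri_k above, here is a standalone sketch (not part of the patch) that re-derives the byte packing and verifies it against the veval test vector added to emit_tests.rs below; it uses raw 5-bit vector register numbers in place of Reg values, and the name enc_vri_k_sketch is made up for illustration:

// Standalone sketch: VRI-k packing with registers given as plain 5-bit numbers.
fn enc_vri_k_sketch(opcode: u16, i5: u8, v1: u8, v2: u8, v3: u8, v4: u8) -> [u8; 6] {
    // RXB collects the high (fifth) bit of each vector register field.
    let rxb = ((v1 >> 4) & 1) << 3 | ((v2 >> 4) & 1) << 2 | ((v3 >> 4) & 1) << 1 | ((v4 >> 4) & 1);
    [
        (opcode >> 8) as u8,           // opcode1
        (v1 & 0xf) << 4 | (v2 & 0xf),  // v1 | v2
        (v3 & 0xf) << 4,               // v3 | reserved
        i5,                            // the 8-bit immediate (function code for VEVAL)
        (v4 & 0xf) << 4 | rxb,         // v4 | rxb
        (opcode & 0xff) as u8,         // opcode2
    ]
}

fn main() {
    // "veval %v12, %v20, %v21, %v22, 2" from the new emit tests.
    assert_eq!(
        enc_vri_k_sketch(0xe788, 0x02, 12, 20, 21, 22),
        [0xE7, 0xC4, 0x50, 0x02, 0x67, 0x88],
    );
}

The RXB nibble is what lets the four 4-bit register fields address all 32 vector registers, which is why the test vector uses registers both below and above %v15.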
diff --git a/cranelift/codegen/src/isa/s390x/inst/emit_tests.rs b/cranelift/codegen/src/isa/s390x/inst/emit_tests.rs index aa6f9e6b19c9..d69ccbd91c2a 100644 --- a/cranelift/codegen/src/isa/s390x/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/s390x/inst/emit_tests.rs @@ -1643,6 +1643,24 @@ fn test_s390x_binemit() { "B90F001A", "lrvgr %r1, %r10", )); + insns.push(( + Inst::UnaryRR { + op: UnaryOp::Clz64, + rd: writable_gpr(1), + rn: gpr(10), + }, + "B968001A", + "clzg %r1, %r10", + )); + insns.push(( + Inst::UnaryRR { + op: UnaryOp::Ctz64, + rd: writable_gpr(1), + rn: gpr(10), + }, + "B969001A", + "ctzg %r1, %r10", + )); insns.push(( Inst::CmpRR { @@ -8522,6 +8540,26 @@ fn test_s390x_binemit() { "E748C00028A2", "vmlf %v20, %v8, %v12", )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::Mul64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038A2", + "vmlg %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::Mul128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00048A2", + "vmlq %v20, %v8, %v12", + )); insns.push(( Inst::VecRRR { op: VecBinaryOp::UMulHi8x16, @@ -8552,6 +8590,26 @@ fn test_s390x_binemit() { "E748C00028A1", "vmlhf %v20, %v8, %v12", )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UMulHi64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038A1", + "vmlhg %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UMulHi128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00048A1", + "vmlhq %v20, %v8, %v12", + )); insns.push(( Inst::VecRRR { op: VecBinaryOp::SMulHi8x16, @@ -8582,6 +8640,26 @@ fn test_s390x_binemit() { "E748C00028A3", "vmhf %v20, %v8, %v12", )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SMulHi64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038A3", + "vmhg %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SMulHi128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00048A3", + "vmhq %v20, %v8, %v12", + )); insns.push(( Inst::VecRRR { op: VecBinaryOp::UMulEven8x16, @@ -8612,6 +8690,16 @@ fn test_s390x_binemit() { "E748C00028A4", "vmlef %v20, %v8, %v12", )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UMulEven64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038A4", + "vmleg %v20, %v8, %v12", + )); insns.push(( Inst::VecRRR { op: VecBinaryOp::SMulEven8x16, @@ -8642,6 +8730,16 @@ fn test_s390x_binemit() { "E748C00028A6", "vmef %v20, %v8, %v12", )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SMulEven64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038A6", + "vmeg %v20, %v8, %v12", + )); insns.push(( Inst::VecRRR { op: VecBinaryOp::UMulOdd8x16, @@ -8672,6 +8770,16 @@ fn test_s390x_binemit() { "E748C00028A5", "vmlof %v20, %v8, %v12", )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UMulOdd64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038A5", + "vmlog %v20, %v8, %v12", + )); insns.push(( Inst::VecRRR { op: VecBinaryOp::SMulOdd8x16, @@ -8702,6 +8810,136 @@ fn test_s390x_binemit() { "E748C00028A7", "vmof %v20, %v8, %v12", )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SMulOdd64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038A7", + "vmog %v20, %v8, %v12", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UDiv32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028B0", + "vdlf %v20, %v8, %v12, 0", + )); + insns.push(( + Inst::VecRRR { + op: 
VecBinaryOp::UDiv64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038B0", + "vdlg %v20, %v8, %v12, 0", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UDiv128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00048B0", + "vdlq %v20, %v8, %v12, 0", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SDiv32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028B2", + "vdf %v20, %v8, %v12, 0", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SDiv64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038B2", + "vdg %v20, %v8, %v12, 0", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SDiv128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00048B2", + "vdq %v20, %v8, %v12, 0", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::URem32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028B1", + "vrlf %v20, %v8, %v12, 0", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::URem64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038B1", + "vrlg %v20, %v8, %v12, 0", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::URem128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00048B1", + "vrlq %v20, %v8, %v12, 0", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SRem32x4, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00028B3", + "vrf %v20, %v8, %v12, 0", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SRem64x2, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00038B3", + "vrg %v20, %v8, %v12, 0", + )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SRem128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00048B3", + "vrq %v20, %v8, %v12, 0", + )); insns.push(( Inst::VecRRR { op: VecBinaryOp::UMax8x16, @@ -8742,6 +8980,16 @@ fn test_s390x_binemit() { "E748C00038FD", "vmxlg %v20, %v8, %v12", )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UMax128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00048FD", + "vmxlq %v20, %v8, %v12", + )); insns.push(( Inst::VecRRR { op: VecBinaryOp::SMax8x16, @@ -8782,6 +9030,16 @@ fn test_s390x_binemit() { "E748C00038FF", "vmxg %v20, %v8, %v12", )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SMax128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00048FF", + "vmxq %v20, %v8, %v12", + )); insns.push(( Inst::VecRRR { op: VecBinaryOp::UMin8x16, @@ -8822,6 +9080,16 @@ fn test_s390x_binemit() { "E748C00038FC", "vmnlg %v20, %v8, %v12", )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UMin128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00048FC", + "vmnlq %v20, %v8, %v12", + )); insns.push(( Inst::VecRRR { op: VecBinaryOp::SMin8x16, @@ -8862,6 +9130,16 @@ fn test_s390x_binemit() { "E748C00038FE", "vmng %v20, %v8, %v12", )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::SMin128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00048FE", + "vmnq %v20, %v8, %v12", + )); insns.push(( Inst::VecRRR { op: VecBinaryOp::UAvg8x16, @@ -8902,6 +9180,16 @@ fn test_s390x_binemit() { "E748C00038F0", "vavglg %v20, %v8, %v12", )); + insns.push(( + Inst::VecRRR { + op: VecBinaryOp::UAvg128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00048F0", + "vavglq %v20, %v8, %v12", + )); insns.push(( Inst::VecRRR { op: VecBinaryOp::SAvg8x16, @@ -8942,6 +9230,16 @@ fn test_s390x_binemit() { "E748C00038F2", "vavgg %v20, %v8, %v12", )); + insns.push(( + 
Inst::VecRRR { + op: VecBinaryOp::SAvg128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00048F2", + "vavgq %v20, %v8, %v12", + )); insns.push(( Inst::VecRRR { op: VecBinaryOp::And128, @@ -9299,6 +9597,15 @@ fn test_s390x_binemit() { "E748000038DF", "vlpg %v20, %v8", )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::Abs128, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000048DF", + "vlpq %v20, %v8", + )); insns.push(( Inst::VecRR { op: VecUnaryOp::Neg8x16, @@ -9335,6 +9642,15 @@ fn test_s390x_binemit() { "E748000038DE", "vlcg %v20, %v8", )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::Neg128, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000048DE", + "vlcq %v20, %v8", + )); insns.push(( Inst::VecRR { op: VecUnaryOp::Popcnt8x16, @@ -9407,6 +9723,15 @@ fn test_s390x_binemit() { "E74800003853", "vclzg %v20, %v8", )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::Clz128, + rd: writable_vr(20), + rn: vr(8), + }, + "E74800004853", + "vclzq %v20, %v8", + )); insns.push(( Inst::VecRR { op: VecUnaryOp::Ctz8x16, @@ -9443,6 +9768,15 @@ fn test_s390x_binemit() { "E74800003852", "vctzg %v20, %v8", )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::Ctz128, + rd: writable_vr(20), + rn: vr(8), + }, + "E74800004852", + "vctzq %v20, %v8", + )); insns.push(( Inst::VecRR { op: VecUnaryOp::UnpackULow8x16, @@ -9470,6 +9804,15 @@ fn test_s390x_binemit() { "E748000028D4", "vupllf %v20, %v8", )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::UnpackULow64x2, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000038D4", + "vupllg %v20, %v8", + )); insns.push(( Inst::VecRR { op: VecUnaryOp::UnpackUHigh8x16, @@ -9497,6 +9840,15 @@ fn test_s390x_binemit() { "E748000028D5", "vuplhf %v20, %v8", )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::UnpackUHigh64x2, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000038D5", + "vuplhg %v20, %v8", + )); insns.push(( Inst::VecRR { op: VecUnaryOp::UnpackSLow8x16, @@ -9524,6 +9876,15 @@ fn test_s390x_binemit() { "E748000028D6", "vuplf %v20, %v8", )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::UnpackSLow64x2, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000038D6", + "vuplg %v20, %v8", + )); insns.push(( Inst::VecRR { op: VecUnaryOp::UnpackSHigh8x16, @@ -9551,6 +9912,15 @@ fn test_s390x_binemit() { "E748000028D7", "vuphf %v20, %v8", )); + insns.push(( + Inst::VecRR { + op: VecUnaryOp::UnpackSHigh64x2, + rd: writable_vr(20), + rn: vr(8), + }, + "E748000038D7", + "vuphg %v20, %v8", + )); insns.push(( Inst::VecShiftRR { @@ -9934,6 +10304,16 @@ fn test_s390x_binemit() { "E748C00038F8", "vceqg %v20, %v8, %v12", )); + insns.push(( + Inst::VecIntCmp { + op: VecIntCmpOp::CmpEq128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00048F8", + "vceqq %v20, %v8, %v12", + )); insns.push(( Inst::VecIntCmp { op: VecIntCmpOp::SCmpHi8x16, @@ -9974,6 +10354,16 @@ fn test_s390x_binemit() { "E748C00038FB", "vchg %v20, %v8, %v12", )); + insns.push(( + Inst::VecIntCmp { + op: VecIntCmpOp::SCmpHi128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00048FB", + "vchq %v20, %v8, %v12", + )); insns.push(( Inst::VecIntCmp { op: VecIntCmpOp::UCmpHi8x16, @@ -10014,6 +10404,16 @@ fn test_s390x_binemit() { "E748C00038F9", "vchlg %v20, %v8, %v12", )); + insns.push(( + Inst::VecIntCmp { + op: VecIntCmpOp::UCmpHi128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C00048F9", + "vchlq %v20, %v8, %v12", + )); insns.push(( Inst::VecIntCmpS { op: VecIntCmpOp::CmpEq8x16, @@ -10054,6 +10454,16 @@ fn test_s390x_binemit() { 
"E748C01038F8", "vceqgs %v20, %v8, %v12", )); + insns.push(( + Inst::VecIntCmpS { + op: VecIntCmpOp::CmpEq128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C01048F8", + "vceqqs %v20, %v8, %v12", + )); insns.push(( Inst::VecIntCmpS { op: VecIntCmpOp::SCmpHi8x16, @@ -10094,6 +10504,16 @@ fn test_s390x_binemit() { "E748C01038FB", "vchgs %v20, %v8, %v12", )); + insns.push(( + Inst::VecIntCmpS { + op: VecIntCmpOp::SCmpHi128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C01048FB", + "vchqs %v20, %v8, %v12", + )); insns.push(( Inst::VecIntCmpS { op: VecIntCmpOp::UCmpHi8x16, @@ -10134,6 +10554,45 @@ fn test_s390x_binemit() { "E748C01038F9", "vchlgs %v20, %v8, %v12", )); + insns.push(( + Inst::VecIntCmpS { + op: VecIntCmpOp::UCmpHi128, + rd: writable_vr(20), + rn: vr(8), + rm: vr(12), + }, + "E748C01048F9", + "vchlqs %v20, %v8, %v12", + )); + insns.push(( + Inst::VecIntEltCmp { + op: VecIntEltCmpOp::SCmp128, + rn: vr(20), + rm: vr(12), + }, + "E74C000048DB", + "vecq %v20, %v12", + )); + insns.push(( + Inst::VecIntEltCmp { + op: VecIntEltCmpOp::UCmp128, + rn: vr(20), + rm: vr(12), + }, + "E74C000048D9", + "veclq %v20, %v12", + )); + insns.push(( + Inst::VecEvaluate { + imm: 0x02, + rd: writable_vr(12), + rn: vr(20), + rm: vr(21), + ra: vr(22), + }, + "E7C450026788", + "veval %v12, %v20, %v21, %v22, 2", + )); insns.push(( Inst::VecInt128SCmpHi { tmp: writable_vr(20), @@ -13465,7 +13924,7 @@ fn test_s390x_binemit() { use crate::settings::Configurable; let mut isa_flag_builder = s390x_settings::builder(); - isa_flag_builder.enable("arch13").unwrap(); + isa_flag_builder.enable("arch15").unwrap(); let isa_flags = s390x_settings::Flags::new(&flags, &isa_flag_builder); let ctrl_plane = &mut Default::default(); let constants = Default::default(); diff --git a/cranelift/codegen/src/isa/s390x/inst/mod.rs b/cranelift/codegen/src/isa/s390x/inst/mod.rs index 762f6c58d27e..6247e966cde6 100644 --- a/cranelift/codegen/src/isa/s390x/inst/mod.rs +++ b/cranelift/codegen/src/isa/s390x/inst/mod.rs @@ -30,7 +30,7 @@ mod emit_tests; pub use crate::isa::s390x::lower::isle::generated_code::{ ALUOp, CmpOp, FPUOp1, FPUOp2, FPUOp3, FpuConv128Op, FpuRoundMode, FpuRoundOp, LaneOrder, MInst as Inst, RxSBGOp, ShiftOp, SymbolReloc, UnaryOp, VecBinaryOp, VecFloatCmpOp, VecIntCmpOp, - VecShiftOp, VecUnaryOp, + VecIntEltCmpOp, VecShiftOp, VecUnaryOp, }; /// The destination of a call instruction. @@ -89,8 +89,12 @@ pub(crate) enum InstructionSet { Base, /// Miscellaneous-Instruction-Extensions Facility 3 (z15) MIE3, + /// Miscellaneous-Instruction-Extensions Facility 4 (z17) + MIE4, /// Vector-Enhancements Facility 2 (z15) VXRS_EXT2, + /// Vector-Enhancements Facility 3 (z17) + VXRS_EXT3, } impl Inst { @@ -188,14 +192,10 @@ impl Inst { | Inst::FpuCmp32 { .. } | Inst::FpuCmp64 { .. } | Inst::FpuCmp128 { .. } - | Inst::VecRRR { .. } - | Inst::VecRR { .. } | Inst::VecShiftRR { .. } | Inst::VecSelect { .. } | Inst::VecPermute { .. } | Inst::VecPermuteDWImm { .. } - | Inst::VecIntCmp { .. } - | Inst::VecIntCmpS { .. } | Inst::VecFloatCmp { .. } | Inst::VecFloatCmpS { .. } | Inst::VecInt128SCmpHi { .. } @@ -251,6 +251,7 @@ impl Inst { }, Inst::UnaryRR { op, .. } => match op { UnaryOp::PopcntReg => InstructionSet::MIE3, + UnaryOp::Clz64 | UnaryOp::Ctz64 => InstructionSet::MIE4, _ => InstructionSet::Base, }, Inst::FpuRound { op, .. 
} => match op { @@ -260,6 +261,43 @@ impl Inst { FpuRoundOp::ToUInt32x4 | FpuRoundOp::FromUInt32x4 => InstructionSet::VXRS_EXT2, _ => InstructionSet::Base, }, + Inst::VecRRR { op, .. } => match op { + VecBinaryOp::Mul64x2 | VecBinaryOp::Mul128 => InstructionSet::VXRS_EXT3, + VecBinaryOp::UMulHi64x2 | VecBinaryOp::UMulHi128 => InstructionSet::VXRS_EXT3, + VecBinaryOp::SMulHi64x2 | VecBinaryOp::SMulHi128 => InstructionSet::VXRS_EXT3, + VecBinaryOp::UMulEven64x2 | VecBinaryOp::SMulEven64x2 => InstructionSet::VXRS_EXT3, + VecBinaryOp::UMulOdd64x2 | VecBinaryOp::SMulOdd64x2 => InstructionSet::VXRS_EXT3, + VecBinaryOp::UDiv32x4 | VecBinaryOp::SDiv32x4 => InstructionSet::VXRS_EXT3, + VecBinaryOp::UDiv64x2 | VecBinaryOp::SDiv64x2 => InstructionSet::VXRS_EXT3, + VecBinaryOp::UDiv128 | VecBinaryOp::SDiv128 => InstructionSet::VXRS_EXT3, + VecBinaryOp::URem32x4 | VecBinaryOp::SRem32x4 => InstructionSet::VXRS_EXT3, + VecBinaryOp::URem64x2 | VecBinaryOp::SRem64x2 => InstructionSet::VXRS_EXT3, + VecBinaryOp::URem128 | VecBinaryOp::SRem128 => InstructionSet::VXRS_EXT3, + VecBinaryOp::UMax128 | VecBinaryOp::SMax128 => InstructionSet::VXRS_EXT3, + VecBinaryOp::UMin128 | VecBinaryOp::SMin128 => InstructionSet::VXRS_EXT3, + VecBinaryOp::UAvg128 | VecBinaryOp::SAvg128 => InstructionSet::VXRS_EXT3, + _ => InstructionSet::Base, + }, + &Inst::VecRR { op, .. } => match op { + VecUnaryOp::Abs128 | VecUnaryOp::Neg128 => InstructionSet::VXRS_EXT3, + VecUnaryOp::Clz128 | VecUnaryOp::Ctz128 => InstructionSet::VXRS_EXT3, + VecUnaryOp::UnpackULow64x2 => InstructionSet::VXRS_EXT3, + VecUnaryOp::UnpackUHigh64x2 => InstructionSet::VXRS_EXT3, + VecUnaryOp::UnpackSLow64x2 => InstructionSet::VXRS_EXT3, + VecUnaryOp::UnpackSHigh64x2 => InstructionSet::VXRS_EXT3, + _ => InstructionSet::Base, + }, + &Inst::VecIntCmp { op, .. } | &Inst::VecIntCmpS { op, .. } => match op { + VecIntCmpOp::CmpEq128 => InstructionSet::VXRS_EXT3, + VecIntCmpOp::SCmpHi128 => InstructionSet::VXRS_EXT3, + VecIntCmpOp::UCmpHi128 => InstructionSet::VXRS_EXT3, + _ => InstructionSet::Base, + }, + &Inst::VecIntEltCmp { op, .. } => match op { + VecIntEltCmpOp::SCmp128 => InstructionSet::VXRS_EXT3, + VecIntEltCmpOp::UCmp128 => InstructionSet::VXRS_EXT3, + // We do not use any of the pre-z17 variants of these instructions. + }, // These are all part of VXRS_EXT2 Inst::VecLoadRev { .. } @@ -281,6 +319,8 @@ impl Inst { | Inst::VecLoadLaneRevUndef { .. } | Inst::VecStoreLaneRev { .. } => InstructionSet::VXRS_EXT2, + Inst::VecEvaluate { .. } => InstructionSet::VXRS_EXT3, + Inst::DummyUse { .. } => InstructionSet::Base, Inst::LabelAddress { .. } => InstructionSet::Base, @@ -700,13 +740,9 @@ fn s390x_get_operands(inst: &mut Inst, collector: &mut DenyReuseVisitor { - collector.reg_def(rd); - collector.reg_use(rn); - collector.reg_use(rm); - collector.reg_use(ra); - } - Inst::VecPermute { rd, rn, rm, ra, .. } => { + Inst::VecSelect { rd, rn, rm, ra, .. } + | Inst::VecPermute { rd, rn, rm, ra, .. } + | Inst::VecEvaluate { rd, rn, rm, ra, .. } => { collector.reg_def(rd); collector.reg_use(rn); collector.reg_use(rm); @@ -727,6 +763,10 @@ fn s390x_get_operands(inst: &mut Inst, collector: &mut DenyReuseVisitor { + collector.reg_use(rn); + collector.reg_use(rm); + } Inst::VecInt128SCmpHi { tmp, rn, rm, .. } | Inst::VecInt128UCmpHi { tmp, rn, rm, .. 
} => { collector.reg_def(tmp); collector.reg_use(rn); @@ -1627,6 +1667,8 @@ impl Inst { UnaryOp::PopcntReg => ("popcnt", ", 8"), UnaryOp::BSwap32 => ("lrvr", ""), UnaryOp::BSwap64 => ("lrvgr", ""), + UnaryOp::Clz64 => ("clzg", ""), + UnaryOp::Ctz64 => ("ctzg", ""), }; let rd = pretty_print_reg(rd.to_reg()); let rn = pretty_print_reg(rn); @@ -2453,6 +2495,15 @@ impl Inst { } &Inst::VecRRR { op, rd, rn, rm } => { + let m5 = match op { + VecBinaryOp::UDiv32x4 | VecBinaryOp::SDiv32x4 => ", 0", + VecBinaryOp::UDiv64x2 | VecBinaryOp::SDiv64x2 => ", 0", + VecBinaryOp::UDiv128 | VecBinaryOp::SDiv128 => ", 0", + VecBinaryOp::URem32x4 | VecBinaryOp::SRem32x4 => ", 0", + VecBinaryOp::URem64x2 | VecBinaryOp::SRem64x2 => ", 0", + VecBinaryOp::URem128 | VecBinaryOp::SRem128 => ", 0", + _ => "", + }; let op = match op { VecBinaryOp::Add8x16 => "vab", VecBinaryOp::Add16x8 => "vah", @@ -2467,48 +2518,76 @@ impl Inst { VecBinaryOp::Mul8x16 => "vmlb", VecBinaryOp::Mul16x8 => "vmlhw", VecBinaryOp::Mul32x4 => "vmlf", + VecBinaryOp::Mul64x2 => "vmlg", + VecBinaryOp::Mul128 => "vmlq", VecBinaryOp::UMulHi8x16 => "vmlhb", VecBinaryOp::UMulHi16x8 => "vmlhh", VecBinaryOp::UMulHi32x4 => "vmlhf", + VecBinaryOp::UMulHi64x2 => "vmlhg", + VecBinaryOp::UMulHi128 => "vmlhq", VecBinaryOp::SMulHi8x16 => "vmhb", VecBinaryOp::SMulHi16x8 => "vmhh", VecBinaryOp::SMulHi32x4 => "vmhf", + VecBinaryOp::SMulHi64x2 => "vmhg", + VecBinaryOp::SMulHi128 => "vmhq", VecBinaryOp::UMulEven8x16 => "vmleb", VecBinaryOp::UMulEven16x8 => "vmleh", VecBinaryOp::UMulEven32x4 => "vmlef", + VecBinaryOp::UMulEven64x2 => "vmleg", VecBinaryOp::SMulEven8x16 => "vmeb", VecBinaryOp::SMulEven16x8 => "vmeh", VecBinaryOp::SMulEven32x4 => "vmef", + VecBinaryOp::SMulEven64x2 => "vmeg", VecBinaryOp::UMulOdd8x16 => "vmlob", VecBinaryOp::UMulOdd16x8 => "vmloh", VecBinaryOp::UMulOdd32x4 => "vmlof", + VecBinaryOp::UMulOdd64x2 => "vmlog", VecBinaryOp::SMulOdd8x16 => "vmob", VecBinaryOp::SMulOdd16x8 => "vmoh", VecBinaryOp::SMulOdd32x4 => "vmof", + VecBinaryOp::SMulOdd64x2 => "vmog", + VecBinaryOp::SDiv32x4 => "vdf", + VecBinaryOp::SDiv64x2 => "vdg", + VecBinaryOp::SDiv128 => "vdq", + VecBinaryOp::UDiv32x4 => "vdlf", + VecBinaryOp::UDiv64x2 => "vdlg", + VecBinaryOp::UDiv128 => "vdlq", + VecBinaryOp::SRem32x4 => "vrf", + VecBinaryOp::SRem64x2 => "vrg", + VecBinaryOp::SRem128 => "vrq", + VecBinaryOp::URem32x4 => "vrlf", + VecBinaryOp::URem64x2 => "vrlg", + VecBinaryOp::URem128 => "vrlq", VecBinaryOp::UMax8x16 => "vmxlb", VecBinaryOp::UMax16x8 => "vmxlh", VecBinaryOp::UMax32x4 => "vmxlf", VecBinaryOp::UMax64x2 => "vmxlg", + VecBinaryOp::UMax128 => "vmxlq", VecBinaryOp::SMax8x16 => "vmxb", VecBinaryOp::SMax16x8 => "vmxh", VecBinaryOp::SMax32x4 => "vmxf", VecBinaryOp::SMax64x2 => "vmxg", + VecBinaryOp::SMax128 => "vmxq", VecBinaryOp::UMin8x16 => "vmnlb", VecBinaryOp::UMin16x8 => "vmnlh", VecBinaryOp::UMin32x4 => "vmnlf", VecBinaryOp::UMin64x2 => "vmnlg", + VecBinaryOp::UMin128 => "vmnlq", VecBinaryOp::SMin8x16 => "vmnb", VecBinaryOp::SMin16x8 => "vmnh", VecBinaryOp::SMin32x4 => "vmnf", VecBinaryOp::SMin64x2 => "vmng", + VecBinaryOp::SMin128 => "vmnq", VecBinaryOp::UAvg8x16 => "vavglb", VecBinaryOp::UAvg16x8 => "vavglh", VecBinaryOp::UAvg32x4 => "vavglf", VecBinaryOp::UAvg64x2 => "vavglg", + VecBinaryOp::UAvg128 => "vavglq", VecBinaryOp::SAvg8x16 => "vavgb", VecBinaryOp::SAvg16x8 => "vavgh", VecBinaryOp::SAvg32x4 => "vavgf", VecBinaryOp::SAvg64x2 => "vavgg", + VecBinaryOp::SAvg128 => "vavgq", VecBinaryOp::And128 => "vn", VecBinaryOp::Orr128 => "vo", VecBinaryOp::Xor128 => 
"vx", @@ -2545,7 +2624,7 @@ impl Inst { let rd = pretty_print_reg(rd.to_reg()); let rn = pretty_print_reg(rn); let rm = pretty_print_reg(rm); - format!("{op} {rd}, {rn}, {rm}") + format!("{op} {rd}, {rn}, {rm}{m5}") } &Inst::VecRR { op, rd, rn } => { let op = match op { @@ -2553,10 +2632,12 @@ impl Inst { VecUnaryOp::Abs16x8 => "vlph", VecUnaryOp::Abs32x4 => "vlpf", VecUnaryOp::Abs64x2 => "vlpg", + VecUnaryOp::Abs128 => "vlpq", VecUnaryOp::Neg8x16 => "vlcb", VecUnaryOp::Neg16x8 => "vlch", VecUnaryOp::Neg32x4 => "vlcf", VecUnaryOp::Neg64x2 => "vlcg", + VecUnaryOp::Neg128 => "vlcq", VecUnaryOp::Popcnt8x16 => "vpopctb", VecUnaryOp::Popcnt16x8 => "vpopcth", VecUnaryOp::Popcnt32x4 => "vpopctf", @@ -2565,22 +2646,28 @@ impl Inst { VecUnaryOp::Clz16x8 => "vclzh", VecUnaryOp::Clz32x4 => "vclzf", VecUnaryOp::Clz64x2 => "vclzg", + VecUnaryOp::Clz128 => "vclzq", VecUnaryOp::Ctz8x16 => "vctzb", VecUnaryOp::Ctz16x8 => "vctzh", VecUnaryOp::Ctz32x4 => "vctzf", VecUnaryOp::Ctz64x2 => "vctzg", + VecUnaryOp::Ctz128 => "vctzq", VecUnaryOp::UnpackULow8x16 => "vupllb", VecUnaryOp::UnpackULow16x8 => "vupllh", VecUnaryOp::UnpackULow32x4 => "vupllf", + VecUnaryOp::UnpackULow64x2 => "vupllg", VecUnaryOp::UnpackUHigh8x16 => "vuplhb", VecUnaryOp::UnpackUHigh16x8 => "vuplhh", VecUnaryOp::UnpackUHigh32x4 => "vuplhf", + VecUnaryOp::UnpackUHigh64x2 => "vuplhg", VecUnaryOp::UnpackSLow8x16 => "vuplb", VecUnaryOp::UnpackSLow16x8 => "vuplh", VecUnaryOp::UnpackSLow32x4 => "vuplf", + VecUnaryOp::UnpackSLow64x2 => "vuplg", VecUnaryOp::UnpackSHigh8x16 => "vuphb", VecUnaryOp::UnpackSHigh16x8 => "vuphh", VecUnaryOp::UnpackSHigh32x4 => "vuphf", + VecUnaryOp::UnpackSHigh64x2 => "vuphg", }; let rd = pretty_print_reg(rd.to_reg()); let rn = pretty_print_reg(rn); @@ -2634,6 +2721,19 @@ impl Inst { let ra = pretty_print_reg(ra); format!("vperm {rd}, {rn}, {rm}, {ra}") } + &Inst::VecEvaluate { + imm, + rd, + rn, + rm, + ra, + } => { + let rd = pretty_print_reg(rd.to_reg()); + let rn = pretty_print_reg(rn); + let rm = pretty_print_reg(rm); + let ra = pretty_print_reg(ra); + format!("veval {rd}, {rn}, {rm}, {ra}, {imm}") + } &Inst::VecPermuteDWImm { rd, rn, @@ -2653,14 +2753,17 @@ impl Inst { VecIntCmpOp::CmpEq16x8 => "vceqh", VecIntCmpOp::CmpEq32x4 => "vceqf", VecIntCmpOp::CmpEq64x2 => "vceqg", + VecIntCmpOp::CmpEq128 => "vceqq", VecIntCmpOp::SCmpHi8x16 => "vchb", VecIntCmpOp::SCmpHi16x8 => "vchh", VecIntCmpOp::SCmpHi32x4 => "vchf", VecIntCmpOp::SCmpHi64x2 => "vchg", + VecIntCmpOp::SCmpHi128 => "vchq", VecIntCmpOp::UCmpHi8x16 => "vchlb", VecIntCmpOp::UCmpHi16x8 => "vchlh", VecIntCmpOp::UCmpHi32x4 => "vchlf", VecIntCmpOp::UCmpHi64x2 => "vchlg", + VecIntCmpOp::UCmpHi128 => "vchlq", }; let s = match self { &Inst::VecIntCmp { .. } => "", @@ -2691,6 +2794,15 @@ impl Inst { let rm = pretty_print_reg(rm); format!("{op}{s} {rd}, {rn}, {rm}") } + &Inst::VecIntEltCmp { op, rn, rm } => { + let op = match op { + VecIntEltCmpOp::SCmp128 => "vecq", + VecIntEltCmpOp::UCmp128 => "veclq", + }; + let rn = pretty_print_reg(rn); + let rm = pretty_print_reg(rm); + format!("{op} {rn}, {rm}") + } &Inst::VecInt128SCmpHi { tmp, rn, rm } | &Inst::VecInt128UCmpHi { tmp, rn, rm } => { let op = match self { &Inst::VecInt128SCmpHi { .. 
} => "vecg", diff --git a/cranelift/codegen/src/isa/s390x/lower.isle b/cranelift/codegen/src/isa/s390x/lower.isle index 875469fae073..3bf616b6229d 100644 --- a/cranelift/codegen/src/isa/s390x/lower.isle +++ b/cranelift/codegen/src/isa/s390x/lower.isle @@ -221,8 +221,12 @@ (rule 1 (lower (has_type (ty_vec128 ty) (iabs x))) (vec_abs ty x)) -;; Absolute value of a 128-bit integer. -(rule 0 (lower (has_type $I128 (iabs x))) +;; Absolute value of a 128-bit integer on z17. +(rule 4 (lower (has_type (and (vxrs_ext3_enabled) $I128) (iabs x))) + (vec_abs $I128 x)) + +;; Absolute value of a 128-bit integer pre-z17. +(rule 0 (lower (has_type (and (vxrs_ext3_disabled) $I128) (iabs x))) (let ((zero Reg (vec_imm $I128 0)) (pos Reg x) (neg Reg (vec_sub $I128 zero pos)) @@ -245,8 +249,12 @@ (rule 1 (lower (has_type (ty_vec128 ty) (ineg x))) (vec_neg ty x)) -;; Negate a 128-bit integer. -(rule 0 (lower (has_type $I128 (ineg x))) +;; Negate a 128-bit integer on z17. +(rule 4 (lower (has_type (and (vxrs_ext3_enabled) $I128) (ineg x))) + (vec_neg $I128 x)) + +;; Negate a 128-bit integer pre-z17. +(rule 0 (lower (has_type (and (vxrs_ext3_disabled) $I128) (ineg x))) (vec_sub $I128 (vec_imm $I128 0) x)) @@ -260,13 +268,17 @@ (intcc_as_cond (IntCC.UnsignedLessThan))))) (select_bool_reg ty cond y_ext x_ext))) -;; Unsigned maximum of two 128-bit integers - expand to icmp + select. -(rule 1 (lower (has_type $I128 (umax x y))) +;; Unsigned maximum of two 128-bit integers pre-z17 - expand to icmp + select. +(rule 1 (lower (has_type (and (vxrs_ext3_disabled) $I128) (umax x y))) (let ((x_reg Reg (put_in_reg x)) (y_reg Reg (put_in_reg y)) (cond ProducesBool (vec_int128_ucmphi y_reg x_reg))) (select_bool_reg $I128 cond y_reg x_reg))) +;; Unsigned maximum of two 128-bit integers on z17. +(rule 3 (lower (has_type (and (vxrs_ext3_enabled) $I128) (umax x y))) + (vec_umax $I128 x y)) + ;; Unsigned maximum of two vector registers. (rule 0 (lower (has_type (ty_vec128 ty) (umax x y))) (vec_umax ty x y)) @@ -282,13 +294,17 @@ (intcc_as_cond (IntCC.UnsignedGreaterThan))))) (select_bool_reg ty cond y_ext x_ext))) -;; Unsigned maximum of two 128-bit integers - expand to icmp + select. -(rule 1 (lower (has_type $I128 (umin x y))) +;; Unsigned maximum of two 128-bit integers pre-z17 - expand to icmp + select. +(rule 1 (lower (has_type (and (vxrs_ext3_disabled) $I128) (umin x y))) (let ((x_reg Reg (put_in_reg x)) (y_reg Reg (put_in_reg y)) (cond ProducesBool (vec_int128_ucmphi x_reg y_reg))) (select_bool_reg $I128 cond y_reg x_reg))) +;; Unsigned minimum of two 128-bit integers on z17. +(rule 3 (lower (has_type (and (vxrs_ext3_enabled) $I128) (umin x y))) + (vec_umin $I128 x y)) + ;; Unsigned minimum of two vector registers. (rule 0 (lower (has_type (ty_vec128 ty) (umin x y))) (vec_umin ty x y)) @@ -304,13 +320,17 @@ (intcc_as_cond (IntCC.SignedLessThan))))) (select_bool_reg ty cond y_ext x_ext))) -;; Signed maximum of two 128-bit integers - expand to icmp + select. -(rule 1 (lower (has_type $I128 (smax x y))) +;; Signed maximum of two 128-bit integers pre-z17 - expand to icmp + select. +(rule 1 (lower (has_type (and (vxrs_ext3_disabled) $I128) (smax x y))) (let ((x_reg Reg (put_in_reg x)) (y_reg Reg (put_in_reg y)) (cond ProducesBool (vec_int128_scmphi y_reg x_reg))) (select_bool_reg $I128 cond y_reg x_reg))) +;; Signed maximum of two 128-bit integers on z17. +(rule 3 (lower (has_type (and (vxrs_ext3_enabled) $I128) (smax x y))) + (vec_smax $I128 x y)) + ;; Signed maximum of two vector registers. 
(rule (lower (has_type (ty_vec128 ty) (smax x y))) (vec_smax ty x y)) @@ -326,13 +346,17 @@ (intcc_as_cond (IntCC.SignedGreaterThan))))) (select_bool_reg ty cond y_ext x_ext))) -;; Signed maximum of two 128-bit integers - expand to icmp + select. -(rule 1 (lower (has_type $I128 (smin x y))) +;; Signed maximum of two 128-bit integers pre-z17 - expand to icmp + select. +(rule 1 (lower (has_type (and (vxrs_ext3_disabled) $I128) (smin x y))) (let ((x_reg Reg (put_in_reg x)) (y_reg Reg (put_in_reg y)) (cond ProducesBool (vec_int128_scmphi x_reg y_reg))) (select_bool_reg $I128 cond y_reg x_reg))) +;; Signed minimum of two 128-bit integers on z17. +(rule 3 (lower (has_type (and (vxrs_ext3_enabled) $I128) (smin x y))) + (vec_smin $I128 x y)) + ;; Signed minimum of two vector registers. (rule (lower (has_type (ty_vec128 ty) (smin x y))) (vec_smin ty x y)) @@ -399,16 +423,22 @@ (rule (vec_mul_impl $I16X8 x y) (vec_mul $I16X8 x y)) (rule (vec_mul_impl $I32X4 x y) (vec_mul $I32X4 x y)) -;; Multiply two vector registers - doubleword. Has to be scalarized. -(rule (vec_mul_impl $I64X2 x y) +;; Multiply two vector registers - doubleword on z17. +(rule 1 (vec_mul_impl (and (vxrs_ext3_enabled) $I64X2) x y) (vec_mul $I64X2 x y)) + +;; Multiply two vector registers - doubleword pre-z17. Has to be scalarized. +(rule (vec_mul_impl (and (vxrs_ext3_disabled) $I64X2) x y) (mov_to_vec128 $I64X2 (mul_reg $I64 (vec_extract_lane $I64X2 x 0 (zero_reg)) (vec_extract_lane $I64X2 y 0 (zero_reg))) (mul_reg $I64 (vec_extract_lane $I64X2 x 1 (zero_reg)) (vec_extract_lane $I64X2 y 1 (zero_reg))))) -;; Multiply two vector registers - quadword. -(rule (vec_mul_impl $I128 x y) +;; Multiply two vector registers - quadword on z17. +(rule 1 (vec_mul_impl (and (vxrs_ext3_enabled) $I128) x y) (vec_mul $I128 x y)) + +;; Multiply two vector registers - quadword pre-z17. +(rule (vec_mul_impl (and (vxrs_ext3_disabled) $I128) x y) (let ((x_hi Reg (vec_extract_lane $I64X2 x 0 (zero_reg))) (x_lo Reg (vec_extract_lane $I64X2 x 1 (zero_reg))) (y_hi Reg (vec_extract_lane $I64X2 y 0 (zero_reg))) @@ -457,9 +487,12 @@ (rule (lower (has_type $I16X8 (umulhi x y))) (vec_umulhi $I16X8 x y)) (rule (lower (has_type $I32X4 (umulhi x y))) (vec_umulhi $I32X4 x y)) -;; Multiply high part unsigned, vector types with 64-bit elements. +;; Multiply high part unsigned, vector types with 64-bit elements on z17. +(rule 1 (lower (has_type (and (vxrs_ext3_enabled) $I64X2) (umulhi x y))) (vec_umulhi $I64X2 x y)) + +;; Multiply high part unsigned, vector types with 64-bit elements pre-z17. ;; Has to be scalarized. -(rule (lower (has_type $I64X2 (umulhi x y))) +(rule (lower (has_type (and (vxrs_ext3_disabled) $I64X2) (umulhi x y))) (let ((pair_0 RegPair (umul_wide (vec_extract_lane $I64X2 x 0 (zero_reg)) (vec_extract_lane $I64X2 y 0 (zero_reg)))) (res_0 Reg (regpair_hi pair_0)) @@ -495,9 +528,12 @@ (rule (lower (has_type $I16X8 (smulhi x y))) (vec_smulhi $I16X8 x y)) (rule (lower (has_type $I32X4 (smulhi x y))) (vec_smulhi $I32X4 x y)) -;; Multiply high part unsigned, vector types with 64-bit elements. +;; Multiply high part signed, vector types with 64-bit elements on z17. +(rule 1 (lower (has_type (and (vxrs_ext3_enabled) $I64X2) (smulhi x y))) (vec_smulhi $I64X2 x y)) + +;; Multiply high part signed, vector types with 64-bit elements pre-z17. ;; Has to be scalarized. 
-(rule (lower (has_type $I64X2 (smulhi x y))) +(rule (lower (has_type (and (vxrs_ext3_disabled) $I64X2) (smulhi x y))) (let ((pair_0 RegPair (smul_wide (vec_extract_lane $I64X2 x 0 (zero_reg)) (vec_extract_lane $I64X2 y 0 (zero_reg)))) (res_0 Reg (copy_reg $I64 (regpair_hi pair_0))) @@ -575,6 +611,14 @@ (pair RegPair (udivmod ext_ty ext_x ext_y))) (regpair_hi pair))) +;; Implement `udiv` for 128-bit integers on z17 (only). +(rule 1 (lower (has_type (and (vxrs_ext3_enabled) $I128) (udiv x y))) + (vec_udiv $I128 x y)) + +;; Implement `urem` for 128-bit integers on z17 (only). +(rule 1 (lower (has_type (and (vxrs_ext3_enabled) $I128) (urem x y))) + (vec_urem $I128 x y)) + ;;;; Rules for `sdiv` and `srem` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -660,6 +704,22 @@ (icmps_simm16_and_trap ext_ty reg -1 (intcc_as_cond (IntCC.Equal)) (trap_code_integer_overflow)))) +(rule 1 (maybe_trap_if_sdiv_overflow true $I128 $I128 x y) + (let ( + ;; We need to trap when y == INT_MIN && x == -1 + ;; y == INT_MIN is implemented as y == -y, as -INT_MIN == INT_MIN. + ;; This checks that y == -y, by using Not-Xor for bitwise + ;; equality, producing all 0b1's (-1u128) when y == -y. + ;; Then it uses band to include the x == -1 check as well, + ;; using the (band x (bnot (bxor y neg_divisor))) variant of vec_eval. + (neg_divisor Reg (vec_neg $I128 y)) + (reg Reg (vec_eval $I128 0b00001001 x y neg_divisor)) + ;; Finally, we check that the combination of x & (y == -y) is -1. + (flags ProducesFlags (vec_elt_icmps reg (vec_imm $I128 -1)))) + (trap_if flags + (intcc_as_cond (IntCC.Equal)) + (trap_code_integer_overflow)))) + (decl int_max (Type) u64) (rule (int_max $I8) 0x7f) (rule (int_max $I16) 0x7fff) @@ -687,6 +747,18 @@ (with_flags_reg (icmps_simm16 $I64 y -1) (cmov_imm $I64 (intcc_as_cond (IntCC.Equal)) 0 x))) +;; Implement `sdiv` for 128-bit integers on z17 (only). +(rule 1 (lower (has_type (and (vxrs_ext3_enabled) $I128) (sdiv x y))) + (let ((OFcheck bool (div_overflow_check_needed y)) + (_ Reg (maybe_trap_if_sdiv_overflow OFcheck $I128 $I128 x y))) + (vec_sdiv $I128 x y))) + +;; Implement `srem` for 128-bit integers on z17 (only). +(rule 1 (lower (has_type (and (vxrs_ext3_enabled) $I128) (srem x y))) + (let ((OFcheck bool (div_overflow_check_needed y)) + (_ Reg (maybe_trap_if_sdiv_overflow OFcheck $I128 $I128 x y))) + (vec_srem $I128 x y))) + ;;;; Rules for `ishl` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -1041,6 +1113,11 @@ (rule 11 (lower (has_type (ty_scalar_float _) (band x y))) (vec_and $F64X2 x y)) +(rule 12 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (band (band x y) z))) + (vec_eval ty 0b00000001 x y z)) +(rule 13 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (band x (band y z)))) + (vec_eval ty 0b00000001 x y z)) + ;; Specialized lowerings for `(band x (bnot y))` which is additionally produced ;; by Cranelift's `band_not` instruction that is legalized into the simpler ;; forms early on. @@ -1057,6 +1134,56 @@ (rule 10 (lower (has_type (vr128_ty ty) (band (bnot y) x))) (vec_and_not ty x y)) +;; And-not three vector registers.
+(rule 14 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (band (band (bnot x) y) z))) + (vec_eval ty 0b00010000 x y z)) +(rule 15 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (band (bnot x) (band y z)))) + (vec_eval ty 0b00010000 x y z)) +(rule 16 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (band (band x y) (bnot z)))) + (vec_eval ty 0b00000010 x y z)) +(rule 17 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (band x (band y (bnot z))))) + (vec_eval ty 0b00000010 x y z)) +(rule 18 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (band (band x (bnot y)) z))) + (vec_eval ty 0b00000100 y x z)) +(rule 19 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (band x (band (bnot y) z)))) + (vec_eval ty 0b00000100 z x y)) + +;; Not-and three vector registers +(rule 20 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bnot (band (band x y) z)))) + (vec_eval ty 0b11111110 x y z)) +(rule 21 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bnot (band x (band y z))))) + (vec_eval ty 0b11111110 x y z)) + +;; And-Nand three vector registers +(rule 20 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (band (bnot (band x y)) z))) + (vec_eval ty 0b01010100 x y z)) +(rule 21 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (band x (bnot (band y z))))) + (vec_eval ty 0b00001110 x y z)) + +;; And-Or 3 vector registers +(rule 22 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (band x (bor y z)))) + (vec_eval ty 0b00000111 x y z)) +(rule 23 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (band (bor x y) z))) + (vec_eval ty 0b00010101 x y z)) + +;; And-Nor 3 vector registers +(rule 24 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (band x (bnot (bor y z))))) + (vec_eval ty 0b00001000 x y z)) +(rule 25 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (band (bnot (bor x y)) z))) + (vec_eval ty 0b01000000 x y z)) + +;; And-Xor 3 vector registers +(rule 26 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (band x (bxor y z)))) + (vec_eval ty 0b00000110 x y z)) +(rule 27 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (band (bxor x y) z))) + (vec_eval ty 0b00010100 x y z)) + +;; And-Nxor 3 vector registers +(rule 28 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (band x (bnot (bxor y z))))) + (vec_eval ty 0b00001001 x y z)) +(rule 29 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (band (bnot (bxor x y)) z))) + (vec_eval ty 0b01000001 x y z)) + ;;;; Rules for `bor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Or two registers. @@ -1087,6 +1214,12 @@ (rule 11 (lower (has_type (ty_scalar_float _) (bor x y))) (vec_or $F64X2 x y)) +;; Or three vector registers. +(rule 12 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bor (bor x y) z))) + (vec_eval ty 0b01111111 x y z)) +(rule 13 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bor x (bor y z)))) + (vec_eval ty 0b01111111 x y z)) + ;; Specialized lowerings for `(bor x (bnot y))` which is additionally produced ;; by Cranelift's `bor_not` instruction that is legalized into the simpler ;; forms early on. 
@@ -1103,6 +1236,79 @@ (rule 10 (lower (has_type (vr128_ty ty) (bor (bnot y) x))) (vec_or_not ty x y)) +;; 3-input bor with a single not +(rule 14 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bor (bor (bnot x) y) z))) + (vec_eval ty 0b11110111 x y z)) +(rule 15 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bor (bnot x) (bor y z)))) + (vec_eval ty 0b11110111 x y z)) +(rule 16 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bor (bor x (bnot y)) z))) + (vec_eval ty 0b11011111 x y z)) +(rule 17 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bor x (bor (bnot y) z)))) + (vec_eval ty 0b11011111 x y z)) +(rule 18 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bor (bor x y) (bnot z)))) + (vec_eval ty 0b10111111 x y z)) +(rule 19 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bor x (bor y (bnot z))))) + (vec_eval ty 0b10111111 x y z)) + +;; 3-input bnor +(rule 20 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bnot (bor (bor x y) z)))) + (vec_eval ty 0b10000000 x y z)) +(rule 21 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bnot (bor x (bor y z))))) + (vec_eval ty 0b10000000 x y z)) + +;; Or-Nor +(rule 20 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bor (bnot (bor x y)) z))) + (vec_eval ty 0b11010101 x y z)) +(rule 21 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bor x (bnot (bor y z))))) + (vec_eval ty 0b10001111 x y z)) + +;; Or-And +(rule 22 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bor (band x y) z))) + (vec_eval ty 0b01010111 x y z)) +(rule 23 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bor x (band y z)))) + (vec_eval ty 0b00011111 x y z)) + +;; Or-Nand +(rule 24 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bor (bnot (band x y)) z))) + (vec_eval ty 0b11111101 x y z)) +(rule 25 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bor x (bnot (band y z))))) + (vec_eval ty 0b11101111 x y z)) + +;; Or-Xor +(rule 26 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bor (bxor x y) z))) + (vec_eval ty 0b01111101 x y z)) +(rule 27 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bor x (bxor y z)))) + (vec_eval ty 0b01101111 x y z)) + +;; Or-Nxor +(rule 28 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bor (bnot (bxor x y)) z))) + (vec_eval ty 0b11010111 x y z)) +(rule 29 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bor x (bnot (bxor y z))))) + (vec_eval ty 0b10011111 x y z)) + +;; Nor-And +(rule 30 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bnot (bor (band x y) z)))) + (vec_eval ty 0b10101000 x y z)) +(rule 31 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bnot (bor x (band y z))))) + (vec_eval ty 0b11100000 x y z)) + +;; Nor-Nand +(rule 32 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bnot (bor (bnot (band x y)) z)))) + (vec_eval ty 0b00000010 x y z)) +(rule 33 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bnot (bor x (bnot (band y z)))))) + (vec_eval ty 0b00010000 x y z)) + +;; Nor-Xor +(rule 34 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bnot (bor (bxor x y) z)))) + (vec_eval ty 0b10000010 x y z)) +(rule 35 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bnot (bor x (bxor y z))))) + (vec_eval ty 0b10010000 x y z)) + +;; Nor-Nxor +(rule 36 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bnot (bor (bnot (bxor x y)) z)))) + (vec_eval ty 0b00101000 x y z)) +(rule 37 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty 
ty)) (bnot (bor x (bnot (bxor y z)))))) + (vec_eval ty 0b01100000 x y z)) ;;;; Rules for `bxor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -1146,6 +1352,77 @@ (rule 8 (lower (has_type (vr128_ty ty) (bxor (bnot y) x))) (vec_not_xor ty x y)) +;; 3-input Xor +(rule 10 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bxor (bxor x y) z))) + (vec_eval ty 0b01101001 x y z)) +(rule 11 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bxor x (bxor y z)))) + (vec_eval ty 0b01101001 x y z)) + +;; Xor-And +(rule 12 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bxor (band x y) z))) + (vec_eval ty 0b01010110 x y z)) +(rule 13 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bxor x (band y z)))) + (vec_eval ty 0b00011110 x y z)) + +;; Xor-Nand +(rule 14 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bxor (bnot (band x y)) z))) + (vec_eval ty 0b10101001 x y z)) +(rule 15 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bxor x (bnot (band y z))))) + (vec_eval ty 0b11100001 x y z)) + +;; Xor-Or +(rule 16 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bxor (bor x y) z))) + (vec_eval ty 0b01101010 x y z)) +(rule 17 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bxor x (bor y z)))) + (vec_eval ty 0b01111000 x y z)) + +;; Xor-Nor +(rule 18 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bxor (bnot (bor x y)) z))) + (vec_eval ty 0b10010101 x y z)) +(rule 19 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bxor x (bnot (bor y z))))) + (vec_eval ty 0b10000111 x y z)) + +;; Xor-Nxor +(rule 18 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bxor (bnot (bxor x y)) z))) + (vec_eval ty 0b10010110 x y z)) +(rule 19 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bxor x (bnot (bxor y z))))) + (vec_eval ty 0b10010110 x y z)) + +;; 3-input Nxor +(rule 20 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bnot (bxor (bxor x y) z)))) + (vec_eval ty 0b10010110 x y z)) +(rule 21 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bnot (bxor x (bxor y z))))) + (vec_eval ty 0b10010110 x y z)) + +;; Nxor-And +(rule 22 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bnot (bxor (band x y) z)))) + (vec_eval ty 0b10101001 x y z)) +(rule 23 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bnot (bxor x (band y z))))) + (vec_eval ty 0b11100001 x y z)) + +;; Nxor-Nand +(rule 24 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bnot (bxor (bnot (band x y)) z)))) + (vec_eval ty 0b01010110 x y z)) +(rule 25 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bnot (bxor x (bnot (band y z)))))) + (vec_eval ty 0b00011110 x y z)) + +;; Nxor-Or +(rule 26 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bnot (bxor (bor x y) z)))) + (vec_eval ty 0b10010101 x y z)) +(rule 27 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bnot (bxor x (bor y z))))) + (vec_eval ty 0b10000111 x y z)) + +;; Nxor-Nor +(rule 28 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bnot (bxor (bnot (bor x y)) z)))) + (vec_eval ty 0b01101010 x y z)) +(rule 29 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bnot (bxor x (bnot (bor y z)))))) + (vec_eval ty 0b01111000 x y z)) + +;; Nxor-Nxor +(rule 18 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bnot (bxor (bnot (bxor x y)) z)))) + (vec_eval ty 0b01101001 x y z)) +(rule 19 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bnot (bxor x (bnot (bxor y z)))))) + (vec_eval ty 0b01101001 x y z))
;;;; Rules for `bitselect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -1167,6 +1444,14 @@ (rule (lower (has_type (vr128_ty ty) (bitselect x y z))) (vec_select ty y z x)) +;; Bitselect-not vector registers. +(rule 5 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bitselect x (bnot y) z))) + (vec_eval ty 0b01011100 x y z)) +(rule 6 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bitselect x y (bnot z)))) + (vec_eval ty 0b10100011 x y z)) +(rule 7 (lower (has_type (and (vxrs_ext3_enabled) (vr128_ty ty)) (bitselect x (bnot y) (bnot z)))) + (vec_eval ty 0b10101100 x y z)) + ;; Special-case some float-selection instructions for min/max (rule 3 (lower (has_type (ty_vec128 ty) (bitselect (bitcast _ (fcmp (FloatCC.LessThan) x y)) x y))) (fmin_pseudo_reg ty y x)) @@ -1232,17 +1517,17 @@ (rule (clz_offset $I32 x) (add_simm16 $I32 x -32)) (rule (clz_offset $I64 x) x) -;; Count leading zeros, via FLOGR on an input zero-extended to 64 bits, +;; Count leading zeros pre-z17, via FLOGR on an input zero-extended to 64 bits, ;; with the result compensated for the extra bits. -(rule 1 (lower (has_type (fits_in_64 ty) (clz x))) +(rule 1 (lower (has_type (and (mie4_disabled) (fits_in_64 ty)) (clz x))) (let ((ext_reg Reg (put_in_reg_zext64 x)) ;; Ask for a value of 64 in the all-zero 64-bit input case. ;; After compensation this will match the expected semantics. - (clz Reg (clz_reg 64 ext_reg))) + (clz Reg (clz_flogr_reg 64 ext_reg))) (clz_offset ty clz))) -;; Count leading zeros, 128-bit full vector. -(rule (lower (has_type $I128 (clz x))) +;; Count leading zeros, 128-bit full vector pre-z17. +(rule (lower (has_type (and (vxrs_ext3_disabled) $I128) (clz x))) (let ((clz_vec Reg (vec_clz $I64X2 x)) (zero Reg (vec_imm $I64X2 0)) (clz_hi Reg (vec_permute_dw_imm $I64X2 zero 0 clz_vec 0)) @@ -1251,6 +1536,16 @@ (mask Reg (vec_cmpeq $I64X2 clz_hi (vec_imm_splat $I64X2 64)))) (vec_select $I128 clz_sum clz_hi mask))) +;; Count leading zeros on z17, via CLZG on an input zero-extended to 64 bits, +;; with the result compensated for the extra bits. +(rule 3 (lower (has_type (and (mie4_enabled) (fits_in_64 ty)) (clz x))) + (let ((ext_reg Reg (put_in_reg_zext64 x)) + (clz Reg (clz_reg ext_reg))) + (clz_offset ty clz))) + +;; Count leading zeros, 128-bit full vector on z17. +(rule 2 (lower (has_type (and (vxrs_ext3_enabled) $I128) (clz x))) + (vec_clz $I128 x)) ;;;; Rules for `cls` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -1263,20 +1558,20 @@ (rule (cls_offset $I32 x) (add_simm16 $I32 x -33)) (rule (cls_offset $I64 x) (add_simm16 $I64 x -1)) -;; Count leading sign-bit copies. We don't have any instruction for that, +;; Count leading sign-bit copies pre-z17. We don't have any instruction for that, ;; so we instead count the leading zeros after inverting the input if negative, ;; i.e. computing ;; cls(x) == clz(x ^ (x >> 63)) - 1 ;; where x is the sign-extended input. -(rule 1 (lower (has_type (fits_in_64 ty) (cls x))) +(rule 1 (lower (has_type (and (mie4_disabled) (fits_in_64 ty)) (cls x))) (let ((ext_reg Reg (put_in_reg_sext64 x)) (signbit_copies Reg (ashr_imm $I64 ext_reg 63)) (inv_reg Reg (xor_reg $I64 ext_reg signbit_copies)) - (clz Reg (clz_reg 64 inv_reg))) + (clz Reg (clz_flogr_reg 64 inv_reg))) (cls_offset ty clz))) -;; Count leading sign-bit copies, 128-bit full vector. -(rule (lower (has_type $I128 (cls x))) +;; Count leading sign-bit copies, 128-bit full vector pre-z17. 
+(rule (lower (has_type (and (vxrs_ext3_disabled) $I128) (cls x))) (let ((x_reg Reg x) (ones Reg (vec_imm_splat $I8X16 255)) (signbit_copies Reg (vec_ashr_by_bit (vec_ashr_by_byte x_reg ones) ones)) @@ -1289,6 +1584,22 @@ (mask Reg (vec_cmpeq $I64X2 clz_hi (vec_imm_splat $I64X2 64)))) (vec_add $I128 (vec_select $I128 clz_sum clz_hi mask) ones))) +;; Count leading sign-bit copies on z17, similar to above. +(rule 3 (lower (has_type (and (mie4_enabled) (fits_in_64 ty)) (cls x))) + (let ((ext_reg Reg (put_in_reg_sext64 x)) + (signbit_copies Reg (ashr_imm $I64 ext_reg 63)) + (inv_reg Reg (xor_reg $I64 ext_reg signbit_copies)) + (clz Reg (clz_reg inv_reg))) + (cls_offset ty clz))) + +;; Count leading sign-bit copies, 128-bit full vector on z17. +(rule 2 (lower (has_type (and (vxrs_ext3_enabled) $I128) (cls x))) + (let ((x_reg Reg x) + (ones Reg (vec_imm_splat $I8X16 255)) + (signbit_copies Reg (vec_ashr_by_bit (vec_ashr_by_byte x_reg ones) ones)) + (inv_reg Reg (vec_xor $I128 x_reg signbit_copies)) + (clz_vec Reg (vec_clz $I128 inv_reg))) + (vec_add $I128 clz_vec ones))) ;;;; Rules for `ctz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -1305,10 +1616,10 @@ ;; never zero by setting a "guard bit" in the position corresponding to ;; the input type size. This way the 64-bit algorithm above will handle ;; that case correctly automatically. -(rule 2 (lower (has_type (gpr32_ty ty) (ctz x))) +(rule 2 (lower (has_type (and (mie4_disabled) (gpr32_ty ty)) (ctz x))) (let ((rx Reg (or_uimm16shifted $I64 x (ctz_guardbit ty))) (lastbit Reg (and_reg $I64 rx (neg_reg $I64 rx))) - (clz Reg (clz_reg 64 lastbit))) + (clz Reg (clz_flogr_reg 64 lastbit))) (sub_reg ty (imm ty 63) clz))) (decl ctz_guardbit (Type) UImm16Shifted) @@ -1320,14 +1631,14 @@ ;; via its condition code. We check for that and replace the instruction ;; result with the value -1 via a conditional move, which will then lead to ;; the correct result after the final subtraction from 63. -(rule 1 (lower (has_type (gpr64_ty _ty) (ctz x))) +(rule 1 (lower (has_type (and (mie4_disabled) (gpr64_ty _ty)) (ctz x))) (let ((rx Reg x) (lastbit Reg (and_reg $I64 rx (neg_reg $I64 rx))) - (clz Reg (clz_reg -1 lastbit))) + (clz Reg (clz_flogr_reg -1 lastbit))) (sub_reg $I64 (imm $I64 63) clz))) -;; Count trailing zeros, 128-bit full vector. -(rule 0 (lower (has_type $I128 (ctz x))) +;; Count trailing zeros, 128-bit full vector pre-z17. +(rule 0 (lower (has_type (and (vxrs_ext3_disabled) $I128) (ctz x))) (let ((ctz_vec Reg (vec_ctz $I64X2 x)) (zero Reg (vec_imm $I64X2 0)) (ctz_hi Reg (vec_permute_dw_imm $I64X2 zero 0 ctz_vec 0)) @@ -1336,6 +1647,19 @@ (mask Reg (vec_cmpeq $I64X2 ctz_lo (vec_imm_splat $I64X2 64)))) (vec_select $I128 ctz_sum ctz_lo mask))) +;; Count trailing zeros on z17, via CTZG on types smaller than 64-bit, +;; using the same guard bit mechanism as above. +(rule 5 (lower (has_type (and (mie4_enabled) (gpr32_ty ty)) (ctz x))) + (ctz_reg (or_uimm16shifted $I64 x (ctz_guardbit ty)))) + +;; Count trailing zeros on z17, via CTZG directly on 64-bit types. +(rule 4 (lower (has_type (and (mie4_enabled) (gpr64_ty _ty)) (ctz x))) + (ctz_reg x)) + +;; Count trailing zeros, 128-bit full vector on z17. +(rule 3 (lower (has_type (and (vxrs_ext3_enabled) $I128) (ctz x))) + (vec_ctz $I128 x)) + ;;;; Rules for `popcnt` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -3477,35 +3801,43 @@ (icmpu_mem_zext32 ty x (sink_uload32 y))) -;; Compare 128-bit integers for equality. +;; Compare (signed) 128-bit integers on z17.
+(rule 2 (icmp_val _ int_cc @ (signed) x @ (value_type (and (vxrs_ext3_enabled) (vr128_ty _))) y) + (bool (vec_elt_icmps x y) (intcc_as_cond int_cc))) + +;; Compare (unsigned) 128-bit integers on z17. +(rule 1 (icmp_val _ int_cc @ (unsigned) x @ (value_type (and (vxrs_ext3_enabled) (vr128_ty _))) y) + (bool (vec_elt_icmpu x y) (intcc_as_cond int_cc))) + +;; Compare 128-bit integers for equality pre-z17. ;; Implemented via element-wise comparison using the all-element true CC flag. -(rule (icmp_val _ (IntCC.Equal) x @ (value_type (vr128_ty _)) y) +(rule (icmp_val _ (IntCC.Equal) x @ (value_type (and (vxrs_ext3_disabled) (vr128_ty _))) y) (bool (vec_cmpeqs $I64X2 x y) (floatcc_as_cond (FloatCC.Equal)))) -(rule (icmp_val _ (IntCC.NotEqual) x @ (value_type (vr128_ty _)) y) +(rule (icmp_val _ (IntCC.NotEqual) x @ (value_type (and (vxrs_ext3_disabled) (vr128_ty _))) y) (bool (vec_cmpeqs $I64X2 x y) (floatcc_as_cond (FloatCC.NotEqual)))) -;; Compare (signed) 128-bit integers for relational inequality. +;; Compare (signed) 128-bit integers for relational inequality pre-z17. ;; Implemented via synthetic instruction using VECG and VCHLGS. -(rule (icmp_val _ (IntCC.SignedGreaterThan) x @ (value_type (vr128_ty ty)) y) +(rule (icmp_val _ (IntCC.SignedGreaterThan) x @ (value_type (and (vxrs_ext3_disabled) (vr128_ty ty))) y) (vec_int128_scmphi x y)) -(rule (icmp_val _ (IntCC.SignedLessThan) x @ (value_type (vr128_ty ty)) y) +(rule (icmp_val _ (IntCC.SignedLessThan) x @ (value_type (and (vxrs_ext3_disabled) (vr128_ty ty))) y) (vec_int128_scmphi y x)) -(rule (icmp_val _ (IntCC.SignedGreaterThanOrEqual) x @ (value_type (vr128_ty ty)) y) +(rule (icmp_val _ (IntCC.SignedGreaterThanOrEqual) x @ (value_type (and (vxrs_ext3_disabled) (vr128_ty ty))) y) (invert_bool (vec_int128_scmphi y x))) -(rule (icmp_val _ (IntCC.SignedLessThanOrEqual) x @ (value_type (vr128_ty ty)) y) +(rule (icmp_val _ (IntCC.SignedLessThanOrEqual) x @ (value_type (and (vxrs_ext3_disabled) (vr128_ty ty))) y) (invert_bool (vec_int128_scmphi x y))) -;; Compare (unsigned) 128-bit integers for relational inequality. +;; Compare (unsigned) 128-bit integers for relational inequality pre-z17. ;; Implemented via synthetic instruction using VECLG and VCHLGS. 
-(rule (icmp_val _ (IntCC.UnsignedGreaterThan) x @ (value_type (vr128_ty ty)) y) +(rule (icmp_val _ (IntCC.UnsignedGreaterThan) x @ (value_type (and (vxrs_ext3_disabled) (vr128_ty ty))) y) (vec_int128_ucmphi x y)) -(rule (icmp_val _ (IntCC.UnsignedLessThan) x @ (value_type (vr128_ty ty)) y) +(rule (icmp_val _ (IntCC.UnsignedLessThan) x @ (value_type (and (vxrs_ext3_disabled) (vr128_ty ty))) y) (vec_int128_ucmphi y x)) -(rule (icmp_val _ (IntCC.UnsignedGreaterThanOrEqual) x @ (value_type (vr128_ty ty)) y) +(rule (icmp_val _ (IntCC.UnsignedGreaterThanOrEqual) x @ (value_type (and (vxrs_ext3_disabled) (vr128_ty ty))) y) (invert_bool (vec_int128_ucmphi y x))) -(rule (icmp_val _ (IntCC.UnsignedLessThanOrEqual) x @ (value_type (vr128_ty ty)) y) +(rule (icmp_val _ (IntCC.UnsignedLessThanOrEqual) x @ (value_type (and (vxrs_ext3_disabled) (vr128_ty ty))) y) (invert_bool (vec_int128_ucmphi x y))) diff --git a/cranelift/codegen/src/isa/s390x/lower/isle.rs b/cranelift/codegen/src/isa/s390x/lower/isle.rs index 561fabd9561e..1e53aef7d664 100644 --- a/cranelift/codegen/src/isa/s390x/lower/isle.rs +++ b/cranelift/codegen/src/isa/s390x/lower/isle.rs @@ -229,6 +229,24 @@ impl generated_code::Context for IsleContext<'_, '_, MInst, S390xBackend> { } } + #[inline] + fn mie4_enabled(&mut self, _: Type) -> Option<()> { + if self.backend.isa_flags.has_mie4() { + Some(()) + } else { + None + } + } + + #[inline] + fn mie4_disabled(&mut self, _: Type) -> Option<()> { + if !self.backend.isa_flags.has_mie4() { + Some(()) + } else { + None + } + } + #[inline] fn vxrs_ext2_enabled(&mut self, _: Type) -> Option<()> { if self.backend.isa_flags.has_vxrs_ext2() { @@ -247,6 +265,24 @@ impl generated_code::Context for IsleContext<'_, '_, MInst, S390xBackend> { } } + #[inline] + fn vxrs_ext3_enabled(&mut self, _: Type) -> Option<()> { + if self.backend.isa_flags.has_vxrs_ext3() { + Some(()) + } else { + None + } + } + + #[inline] + fn vxrs_ext3_disabled(&mut self, _: Type) -> Option<()> { + if !self.backend.isa_flags.has_vxrs_ext3() { + Some(()) + } else { + None + } + } + #[inline] fn writable_gpr(&mut self, regno: u8) -> WritableReg { writable_gpr(regno) diff --git a/cranelift/filetests/filetests/isa/s390x/arithmetic-arch15.clif b/cranelift/filetests/filetests/isa/s390x/arithmetic-arch15.clif new file mode 100644 index 000000000000..55499a1a19d3 --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/arithmetic-arch15.clif @@ -0,0 +1,333 @@ +test compile precise-output +set enable_multi_ret_implicit_sret +target s390x arch15 + +function %imul_i128(i128, i128) -> i128 { +block0(v0: i128, v1: i128): + v2 = imul.i128 v0, v1 + return v2 +} + +; VCode: +; block0: +; vl %v1, 0(%r3) +; vl %v3, 0(%r4) +; vmlq %v6, %v1, %v3 +; vst %v6, 0(%r2) +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r3) +; vl %v3, 0(%r4) +; vml %v6, %v1, %v3, 4 +; vst %v6, 0(%r2) +; br %r14 + +function %mul_uextend_i64(i64, i64) -> i128 { +block0(v0: i64, v1: i64): + v2 = uextend.i128 v0 + v3 = uextend.i128 v1 + v4 = imul v2, v3 + return v4 +} + +; VCode: +; block0: +; lgr %r5, %r2 +; mlgr %r2, %r4 +; vlvgp %v7, %r2, %r3 +; lgr %r2, %r5 +; vst %v7, 0(%r2) +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; lgr %r5, %r2 +; mlgr %r2, %r4 +; vlvgp %v7, %r2, %r3 +; lgr %r2, %r5 +; vst %v7, 0(%r2) +; br %r14 + +function %mul_sextend_i64(i64, i64) -> i128 { +block0(v0: i64, v1: i64): + v2 = sextend.i128 v0 + v3 = sextend.i128 v1 + v4 = imul v2, v3 + return v4 +} + +; VCode: +; block0: +; lgr %r5, %r2 +; mgrk %r2, %r3, %r4 +; vlvgp %v7, 
%r2, %r3 +; lgr %r2, %r5 +; vst %v7, 0(%r2) +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; lgr %r5, %r2 +; mgrk %r2, %r3, %r4 +; vlvgp %v7, %r2, %r3 +; lgr %r2, %r5 +; vst %v7, 0(%r2) +; br %r14 + +function %sdiv_i128(i128, i128) -> i128 { +block0(v0: i128, v1: i128): + v2 = sdiv.i128 v0, v1 + return v2 +} + +; VCode: +; block0: +; vl %v1, 0(%r3) +; vl %v3, 0(%r4) +; vlcq %v6, %v3 +; veval %v16, %v1, %v3, %v6, 9 +; vrepib %v18, 255 +; vecq %v16, %v18 +; jge .+2 # trap=int_ovf +; vdq %v22, %v1, %v3, 0 +; vst %v22, 0(%r2) +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r3) +; vl %v3, 0(%r4) +; vlc %v6, %v3, 4 +; .byte 0xe7, 0x01 +; lper %f0, %f9 +; ld %f8, 0x720(%r8, %r14) +; .byte 0x00, 0xff +; .byte 0x08, 0x45 +; vec %v16, %v18, 4 +; jge 0x26 ; trap: int_ovf +; .byte 0xe7, 0x61 +; lper %f0, %f0 +; lh %r11, 0x760(%r2, %r14) ; trap: int_divz +; lpdr %f0, %f0 +; .byte 0x08, 0x0e +; br %r14 + +function %udiv_i128(i128, i128) -> i128 { +block0(v0: i128, v1: i128): + v2 = udiv.i128 v0, v1 + return v2 +} + +; VCode: +; block0: +; vl %v1, 0(%r3) +; vl %v3, 0(%r4) +; vdlq %v6, %v1, %v3, 0 +; vst %v6, 0(%r2) +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r3) +; vl %v3, 0(%r4) +; .byte 0xe7, 0x61 +; lper %f0, %f0 +; sth %r11, 0x760(%r14) ; trap: int_divz +; lpdr %f0, %f0 +; .byte 0x00, 0x0e +; br %r14 + +function %srem_i128(i128, i128) -> i128 { +block0(v0: i128, v1: i128): + v2 = srem.i128 v0, v1 + return v2 +} + +; VCode: +; block0: +; vl %v1, 0(%r3) +; vl %v3, 0(%r4) +; vlcq %v6, %v3 +; veval %v16, %v1, %v3, %v6, 9 +; vrepib %v18, 255 +; vecq %v16, %v18 +; jge .+2 # trap=int_ovf +; vrq %v22, %v1, %v3, 0 +; vst %v22, 0(%r2) +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r3) +; vl %v3, 0(%r4) +; vlc %v6, %v3, 4 +; .byte 0xe7, 0x01 +; lper %f0, %f9 +; ld %f8, 0x720(%r8, %r14) +; .byte 0x00, 0xff +; .byte 0x08, 0x45 +; vec %v16, %v18, 4 +; jge 0x26 ; trap: int_ovf +; .byte 0xe7, 0x61 +; lper %f0, %f0 +; lh %r11, 0x760(%r3, %r14) ; trap: int_divz +; lpdr %f0, %f0 +; .byte 0x08, 0x0e +; br %r14 + +function %urem_i128(i128, i128) -> i128 { +block0(v0: i128, v1: i128): + v2 = urem.i128 v0, v1 + return v2 +} + +; VCode: +; block0: +; vl %v1, 0(%r3) +; vl %v3, 0(%r4) +; vrlq %v6, %v1, %v3, 0 +; vst %v6, 0(%r2) +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r3) +; vl %v3, 0(%r4) +; .byte 0xe7, 0x61 +; lper %f0, %f0 +; sth %r11, 0x760(%r1, %r14) ; trap: int_divz +; lpdr %f0, %f0 +; .byte 0x00, 0x0e +; br %r14 + +function %umax_i128(i128, i128) -> i128 { +block0(v0: i128, v1: i128): + v2 = umax.i128 v0, v1 + return v2 +} + +; VCode: +; block0: +; vl %v1, 0(%r3) +; vl %v3, 0(%r4) +; vmxlq %v6, %v1, %v3 +; vst %v6, 0(%r2) +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r3) +; vl %v3, 0(%r4) +; vmxl %v6, %v1, %v3, 4 +; vst %v6, 0(%r2) +; br %r14 + +function %umin_i128(i128, i128) -> i128 { +block0(v0: i128, v1: i128): + v2 = umin.i128 v0, v1 + return v2 +} + +; VCode: +; block0: +; vl %v1, 0(%r3) +; vl %v3, 0(%r4) +; vmnlq %v6, %v1, %v3 +; vst %v6, 0(%r2) +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r3) +; vl %v3, 0(%r4) +; vmnl %v6, %v1, %v3, 4 +; vst %v6, 0(%r2) +; br %r14 + +function %smax_i128(i128, i128) -> i128 { +block0(v0: i128, v1: i128): + v2 = smax.i128 v0, v1 + return v2 +} + +; VCode: +; block0: +; vl %v1, 0(%r3) +; vl %v3, 0(%r4) +; vmxq %v6, %v1, %v3 +; vst %v6, 0(%r2) +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r3) +; vl %v3, 0(%r4) 
+; vmx %v6, %v1, %v3, 4 +; vst %v6, 0(%r2) +; br %r14 + +function %smin_i128(i128, i128) -> i128 { +block0(v0: i128, v1: i128): + v2 = smin.i128 v0, v1 + return v2 +} + +; VCode: +; block0: +; vl %v1, 0(%r3) +; vl %v3, 0(%r4) +; vmnq %v6, %v1, %v3 +; vst %v6, 0(%r2) +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r3) +; vl %v3, 0(%r4) +; vmn %v6, %v1, %v3, 4 +; vst %v6, 0(%r2) +; br %r14 + + +function %iabs_i128(i128) -> i128 { +block0(v0: i128): + v1 = iabs.i128 v0 + return v1 +} + +; VCode: +; block0: +; vl %v1, 0(%r3) +; vlpq %v4, %v1 +; vst %v4, 0(%r2) +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r3) +; vlp %v4, %v1, 4 +; vst %v4, 0(%r2) +; br %r14 + +function %ineg_i128(i128) -> i128 { +block0(v0: i128): + v1 = ineg.i128 v0 + return v1 +} + +; VCode: +; block0: +; vl %v1, 0(%r3) +; vlcq %v4, %v1 +; vst %v4, 0(%r2) +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r3) +; vlc %v4, %v1, 4 +; vst %v4, 0(%r2) +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/bitops-arch15.clif b/cranelift/filetests/filetests/isa/s390x/bitops-arch15.clif new file mode 100644 index 000000000000..8f9b3b30ea89 --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/bitops-arch15.clif @@ -0,0 +1,326 @@ +test compile precise-output +set enable_multi_ret_implicit_sret +target s390x arch15 + +function %clz_i128(i128) -> i128 { +block0(v0: i128): + v1 = clz v0 + return v1 +} + +; VCode: +; block0: +; vl %v1, 0(%r3) +; vclzq %v4, %v1 +; vst %v4, 0(%r2) +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r3) +; vclz %v4, %v1, 4 +; vst %v4, 0(%r2) +; br %r14 + +function %clz_i64(i64) -> i64 { +block0(v0: i64): + v1 = clz v0 + return v1 +} + +; VCode: +; block0: +; clzg %r2, %r2 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xb9, 0x68 +; .byte 0x00, 0x22 +; br %r14 + +function %clz_i32(i32) -> i32 { +block0(v0: i32): + v1 = clz v0 + return v1 +} + +; VCode: +; block0: +; llgfr %r4, %r2 +; clzg %r2, %r4 +; ahi %r2, -32 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; llgfr %r4, %r2 +; .byte 0xb9, 0x68 +; .byte 0x00, 0x24 +; ahi %r2, -0x20 +; br %r14 + +function %clz_i16(i16) -> i16 { +block0(v0: i16): + v1 = clz v0 + return v1 +} + +; VCode: +; block0: +; llghr %r4, %r2 +; clzg %r2, %r4 +; ahi %r2, -48 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; llghr %r4, %r2 +; .byte 0xb9, 0x68 +; .byte 0x00, 0x24 +; ahi %r2, -0x30 +; br %r14 + +function %clz_i8(i8) -> i8 { +block0(v0: i8): + v1 = clz v0 + return v1 +} + +; VCode: +; block0: +; llgcr %r4, %r2 +; clzg %r2, %r4 +; ahi %r2, -56 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; llgcr %r4, %r2 +; .byte 0xb9, 0x68 +; .byte 0x00, 0x24 +; ahi %r2, -0x38 +; br %r14 + +function %cls_i128(i128) -> i128 { +block0(v0: i128): + v1 = cls v0 + return v1 +} + +; VCode: +; block0: +; vl %v1, 0(%r3) +; vrepib %v4, 255 +; vsrab %v6, %v1, %v4 +; vsra %v16, %v6, %v4 +; vx %v18, %v1, %v16 +; vclzq %v20, %v18 +; vaq %v22, %v20, %v4 +; vst %v22, 0(%r2) +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r3) +; vrepib %v4, 0xff +; vsrab %v6, %v1, %v4 +; vsra %v16, %v6, %v4 +; vx %v18, %v1, %v16 +; vclz %v20, %v18, 4 +; vaq %v22, %v20, %v4 +; vst %v22, 0(%r2) +; br %r14 + +function %cls_i64(i64) -> i64 { +block0(v0: i64): + v1 = cls v0 + return v1 +} + +; VCode: +; block0: +; srag %r4, %r2, 63 +; xgr %r2, %r4 +; clzg %r4, %r2 +; aghik %r2, %r4, -1 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; srag %r4, %r2, 0x3f +; xgr 
%r2, %r4 +; .byte 0xb9, 0x68 +; .byte 0x00, 0x42 +; aghik %r2, %r4, -1 +; br %r14 + +function %cls_i32(i32) -> i32 { +block0(v0: i32): + v1 = cls v0 + return v1 +} + +; VCode: +; block0: +; lgfr %r4, %r2 +; srag %r2, %r4, 63 +; xgr %r4, %r2 +; clzg %r2, %r4 +; ahi %r2, -33 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; lgfr %r4, %r2 +; srag %r2, %r4, 0x3f +; xgr %r4, %r2 +; .byte 0xb9, 0x68 +; .byte 0x00, 0x24 +; ahi %r2, -0x21 +; br %r14 + +function %cls_i16(i16) -> i16 { +block0(v0: i16): + v1 = cls v0 + return v1 +} + +; VCode: +; block0: +; lghr %r4, %r2 +; srag %r2, %r4, 63 +; xgr %r4, %r2 +; clzg %r2, %r4 +; ahi %r2, -49 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; lghr %r4, %r2 +; srag %r2, %r4, 0x3f +; xgr %r4, %r2 +; .byte 0xb9, 0x68 +; .byte 0x00, 0x24 +; ahi %r2, -0x31 +; br %r14 + +function %cls_i8(i8) -> i8 { +block0(v0: i8): + v1 = cls v0 + return v1 +} + +; VCode: +; block0: +; lgbr %r4, %r2 +; srag %r2, %r4, 63 +; xgr %r4, %r2 +; clzg %r2, %r4 +; ahi %r2, -57 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; lgbr %r4, %r2 +; srag %r2, %r4, 0x3f +; xgr %r4, %r2 +; .byte 0xb9, 0x68 +; .byte 0x00, 0x24 +; ahi %r2, -0x39 +; br %r14 + +function %ctz_i128(i128) -> i128 { +block0(v0: i128): + v1 = ctz v0 + return v1 +} + +; VCode: +; block0: +; vl %v1, 0(%r3) +; vctzq %v4, %v1 +; vst %v4, 0(%r2) +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r3) +; vctz %v4, %v1, 4 +; vst %v4, 0(%r2) +; br %r14 + +function %ctz_i64(i64) -> i64 { +block0(v0: i64): + v1 = ctz v0 + return v1 +} + +; VCode: +; block0: +; ctzg %r2, %r2 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xb9, 0x69 +; .byte 0x00, 0x22 +; br %r14 + +function %ctz_i32(i32) -> i32 { +block0(v0: i32): + v1 = ctz v0 + return v1 +} + +; VCode: +; block0: +; oihl %r2, 1 +; ctzg %r2, %r2 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; oihl %r2, 1 +; .byte 0xb9, 0x69 +; .byte 0x00, 0x22 +; br %r14 + +function %ctz_i16(i16) -> i16 { +block0(v0: i16): + v1 = ctz v0 + return v1 +} + +; VCode: +; block0: +; oilh %r2, 1 +; ctzg %r2, %r2 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; oilh %r2, 1 +; .byte 0xb9, 0x69 +; .byte 0x00, 0x22 +; br %r14 + +function %ctz_i8(i8) -> i8 { +block0(v0: i8): + v1 = ctz v0 + return v1 +} + +; VCode: +; block0: +; oill %r2, 256 +; ctzg %r2, %r2 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; oill %r2, 0x100 +; .byte 0xb9, 0x69 +; .byte 0x00, 0x22 +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/icmp-i128-arch15.clif b/cranelift/filetests/filetests/isa/s390x/icmp-i128-arch15.clif new file mode 100644 index 000000000000..93f3de14e972 --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/icmp-i128-arch15.clif @@ -0,0 +1,243 @@ +test compile precise-output +target s390x arch15 + +function %icmp_eq_i128(i128, i128) -> i8 { +block0(v0: i128, v1: i128): + v2 = icmp.i128 eq v0, v1 + return v2 +} + +; VCode: +; block0: +; vl %v1, 0(%r2) +; vl %v3, 0(%r3) +; veclq %v1, %v3 +; lhi %r2, 0 +; lochie %r2, 1 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r2) +; vl %v3, 0(%r3) +; vecl %v1, %v3, 4 +; lhi %r2, 0 +; lochie %r2, 1 +; br %r14 + +function %icmp_ne_i128(i128, i128) -> i8 { +block0(v0: i128, v1: i128): + v2 = icmp.i128 ne v0, v1 + return v2 +} + +; VCode: +; block0: +; vl %v1, 0(%r2) +; vl %v3, 0(%r3) +; veclq %v1, %v3 +; lhi %r2, 0 +; lochilh %r2, 1 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r2) +; vl %v3, 0(%r3) +; vecl %v1, %v3, 4 +; lhi 
%r2, 0 +; lochilh %r2, 1 +; br %r14 + +function %icmp_slt_i128(i128, i128) -> i8 { +block0(v0: i128, v1: i128): + v2 = icmp.i128 slt v0, v1 + return v2 +} + +; VCode: +; block0: +; vl %v1, 0(%r2) +; vl %v3, 0(%r3) +; vecq %v1, %v3 +; lhi %r2, 0 +; lochil %r2, 1 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r2) +; vl %v3, 0(%r3) +; vec %v1, %v3, 4 +; lhi %r2, 0 +; lochil %r2, 1 +; br %r14 + +function %icmp_sgt_i128(i128, i128) -> i8 { +block0(v0: i128, v1: i128): + v2 = icmp.i128 sgt v0, v1 + return v2 +} + +; VCode: +; block0: +; vl %v1, 0(%r2) +; vl %v3, 0(%r3) +; vecq %v1, %v3 +; lhi %r2, 0 +; lochih %r2, 1 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r2) +; vl %v3, 0(%r3) +; vec %v1, %v3, 4 +; lhi %r2, 0 +; lochih %r2, 1 +; br %r14 + +function %icmp_sle_i128(i128, i128) -> i8 { +block0(v0: i128, v1: i128): + v2 = icmp.i128 sle v0, v1 + return v2 +} + +; VCode: +; block0: +; vl %v1, 0(%r2) +; vl %v3, 0(%r3) +; vecq %v1, %v3 +; lhi %r2, 0 +; lochile %r2, 1 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r2) +; vl %v3, 0(%r3) +; vec %v1, %v3, 4 +; lhi %r2, 0 +; lochile %r2, 1 +; br %r14 + +function %icmp_sge_i128(i128, i128) -> i8 { +block0(v0: i128, v1: i128): + v2 = icmp.i128 sge v0, v1 + return v2 +} + +; VCode: +; block0: +; vl %v1, 0(%r2) +; vl %v3, 0(%r3) +; vecq %v1, %v3 +; lhi %r2, 0 +; lochihe %r2, 1 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r2) +; vl %v3, 0(%r3) +; vec %v1, %v3, 4 +; lhi %r2, 0 +; lochihe %r2, 1 +; br %r14 + +function %icmp_ult_i128(i128, i128) -> i8 { +block0(v0: i128, v1: i128): + v2 = icmp.i128 ult v0, v1 + return v2 +} + +; VCode: +; block0: +; vl %v1, 0(%r2) +; vl %v3, 0(%r3) +; veclq %v1, %v3 +; lhi %r2, 0 +; lochil %r2, 1 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r2) +; vl %v3, 0(%r3) +; vecl %v1, %v3, 4 +; lhi %r2, 0 +; lochil %r2, 1 +; br %r14 + +function %icmp_ugt_i128(i128, i128) -> i8 { +block0(v0: i128, v1: i128): + v2 = icmp.i128 ugt v0, v1 + return v2 +} + +; VCode: +; block0: +; vl %v1, 0(%r2) +; vl %v3, 0(%r3) +; veclq %v1, %v3 +; lhi %r2, 0 +; lochih %r2, 1 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r2) +; vl %v3, 0(%r3) +; vecl %v1, %v3, 4 +; lhi %r2, 0 +; lochih %r2, 1 +; br %r14 + +function %icmp_ule_i128(i128, i128) -> i8 { +block0(v0: i128, v1: i128): + v2 = icmp.i128 ule v0, v1 + return v2 +} + +; VCode: +; block0: +; vl %v1, 0(%r2) +; vl %v3, 0(%r3) +; veclq %v1, %v3 +; lhi %r2, 0 +; lochile %r2, 1 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r2) +; vl %v3, 0(%r3) +; vecl %v1, %v3, 4 +; lhi %r2, 0 +; lochile %r2, 1 +; br %r14 + +function %icmp_uge_i128(i128, i128) -> i8 { +block0(v0: i128, v1: i128): + v2 = icmp.i128 uge v0, v1 + return v2 +} + +; VCode: +; block0: +; vl %v1, 0(%r2) +; vl %v3, 0(%r3) +; veclq %v1, %v3 +; lhi %r2, 0 +; lochihe %r2, 1 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vl %v1, 0(%r2) +; vl %v3, 0(%r3) +; vecl %v1, %v3, 4 +; lhi %r2, 0 +; lochihe %r2, 1 +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/vec-arithmetic-arch15.clif b/cranelift/filetests/filetests/isa/s390x/vec-arithmetic-arch15.clif new file mode 100644 index 000000000000..381ec6c32d94 --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/vec-arithmetic-arch15.clif @@ -0,0 +1,51 @@ +test compile precise-output +target s390x arch15 + +function %imul_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = imul.i64x2 v0, v1 + return v2 +} 
+ +; VCode: +; block0: +; vmlg %v24, %v24, %v25 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vml %v24, %v24, %v25, 3 +; br %r14 + +function %umulhi_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = umulhi.i64x2 v0, v1 + return v2 +} + +; VCode: +; block0: +; vmlhg %v24, %v24, %v25 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vmlh %v24, %v24, %v25, 3 +; br %r14 + +function %smulhi_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = smulhi.i64x2 v0, v1 + return v2 +} + +; VCode: +; block0: +; vmhg %v24, %v24, %v25 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; vmh %v24, %v24, %v25, 3 +; br %r14 + diff --git a/cranelift/filetests/filetests/isa/s390x/vec-bitwise-arch15.clif b/cranelift/filetests/filetests/isa/s390x/vec-bitwise-arch15.clif new file mode 100644 index 000000000000..1c91bb392755 --- /dev/null +++ b/cranelift/filetests/filetests/isa/s390x/vec-bitwise-arch15.clif @@ -0,0 +1,946 @@ +test compile precise-output +target s390x arch15 + +function %band_band_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = band.i64x2 v0, v1 + v4 = band.i64x2 v3, v2 + return v4 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 1 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r0, %r1, 0xf88(%r10) +; br %r14 + +function %band_band_rev_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = band.i64x2 v0, v1 + v4 = band.i64x2 v2, v3 + return v4 +} + +; VCode: +; block0: +; veval %v24, %v26, %v24, %v25, 1 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x8a +; .byte 0x80, 0x01 +; .byte 0x9f, 0x88 +; br %r14 + +function %band_band_nota_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bnot v0 + v4 = band.i64x2 v3, v1 + v5 = band.i64x2 v4, v2 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 16 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r1, %r0, 0xf88(%r10) +; br %r14 + +function %band_band_notb_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bnot v1 + v4 = band.i64x2 v0, v3 + v5 = band.i64x2 v4, v2 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v25, %v24, %v26, 4 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x89 +; .byte 0x80, 0x04 +; mc 0x7fe, 0x88 + +function %band_band_notc_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bnot v2 + v4 = band.i64x2 v0, v1 + v5 = band.i64x2 v4, v3 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 2 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r0, %r2, 0xf88(%r10) +; br %r14 + +function %band_bnandab_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = band.i64x2 v0, v1 + v4 = bnot v3 + v5 = band.i64x2 v4, v2 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 84 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r5, %r4, 0xf88(%r10) +; br %r14 + +function %band_bnandbc_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = band.i64x2 v1, v2 + v4 = bnot v3 + v5 = band.i64x2 v0, v4 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 14 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r0, %r14, 0xf88(%r10) +; br %r14 + +function 
%band_borab_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bor.i64x2 v0, v1 + v4 = band.i64x2 v3, v2 + return v4 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 21 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r1, %r5, 0xf88(%r10) +; br %r14 + +function %band_borbc_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bor.i64x2 v1, v2 + v4 = band.i64x2 v0, v3 + return v4 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 7 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r0, %r7, 0xf88(%r10) +; br %r14 + +function %band_bnorab_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bor.i64x2 v0, v1 + v4 = bnot.i64x2 v3 + v5 = band.i64x2 v4, v2 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 64 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r4, %r0, 0xf88(%r10) +; br %r14 + +function %band_bnorbc_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bor.i64x2 v1, v2 + v4 = bnot.i64x2 v3 + v5 = band.i64x2 v0, v4 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 8 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r0, %r8, 0xf88(%r10) +; br %r14 + +function %band_bxorab_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bxor.i64x2 v0, v1 + v4 = band.i64x2 v3, v2 + return v4 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 20 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r1, %r4, 0xf88(%r10) +; br %r14 + +function %band_bxorbc_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bxor.i64x2 v1, v2 + v4 = band.i64x2 v0, v3 + return v4 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 6 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r0, %r6, 0xf88(%r10) +; br %r14 + +function %band_bnxorab_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bxor.i64x2 v0, v1 + v4 = bnot.i64x2 v3 + v5 = band.i64x2 v4, v2 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 65 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r4, %r1, 0xf88(%r10) +; br %r14 + +function %band_bnxorbc_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bxor.i64x2 v1, v2 + v4 = bnot.i64x2 v3 + v5 = band.i64x2 v0, v4 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 9 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r0, %r9, 0xf88(%r10) +; br %r14 + +function %bor_bor_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bor v0, v1 + v4 = bor v3, v2 + return v4 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 127 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r7, %r15, 0xf88(%r10) +; br %r14 + +function %bor_bor_rev_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bor v0, v1 + v4 = bor v2, v3 + return v4 +} + +; VCode: +; block0: +; veval %v24, %v26, %v24, %v25, 127 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x8a +; .byte 0x80, 0x7f +; .byte 0x9f, 0x88 +; br %r14 + +function %bor_bor_nota_i64x2(i64x2, i64x2, i64x2) -> i64x2 { 
+block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bnot v0 + v4 = bor v3, v1 + v5 = bor v4, v2 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 247 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r15, %r7, 0xf88(%r10) +; br %r14 + +function %bor_bor_notb_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bnot v1 + v4 = bor v0, v3 + v5 = bor v4, v2 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 223 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r13, %r15, 0xf88(%r10) +; br %r14 + +function %bor_bor_notc_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bnot v2 + v4 = bor.i64x2 v0, v1 + v5 = bor.i64x2 v4, v3 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 191 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r11, %r15, 0xf88(%r10) +; br %r14 + +function %bor_bnandab_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = band v0, v1 + v4 = bnot v3 + v5 = bor v4, v2 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 253 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r15, %r13, 0xf88(%r10) +; br %r14 + +function %bor_bnandbc_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = band v1, v2 + v4 = bnot v3 + v5 = bor v0, v4 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 239 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r14, %r15, 0xf88(%r10) +; br %r14 + +function %bor_borab_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bor v0, v1 + v4 = bor v3, v2 + return v4 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 127 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r7, %r15, 0xf88(%r10) +; br %r14 + +function %bor_borbc_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bor v1, v2 + v4 = bor v0, v3 + return v4 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 127 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r7, %r15, 0xf88(%r10) +; br %r14 + +function %bor_bnorab_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bor v0, v1 + v4 = bnot v3 + v5 = bor v4, v2 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 213 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r13, %r5, 0xf88(%r10) +; br %r14 + +function %bor_bnorbc_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bor v1, v2 + v4 = bnot v3 + v5 = bor v0, v4 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 143 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r8, %r15, 0xf88(%r10) +; br %r14 + +function %bor_bxorab_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bxor v0, v1 + v4 = bor v3, v2 + return v4 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 125 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r7, %r13, 0xf88(%r10) +; br %r14 + +function %bor_bxorbc_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bxor v1, v2 + v4 = bor v0, v3 + return v4 +} + +; VCode: +; block0: 
+; veval %v24, %v24, %v25, %v26, 111 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r6, %r15, 0xf88(%r10) +; br %r14 + +function %bor_bnxorab_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bxor v0, v1 + v4 = bnot v3 + v5 = bor v4, v2 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 215 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r13, %r7, 0xf88(%r10) +; br %r14 + +function %bor_bnxorbc_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bxor v1, v2 + v4 = bnot v3 + v5 = bor v0, v4 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 159 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r9, %r15, 0xf88(%r10) +; br %r14 + +function %bxor_bxor_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bxor v0, v1 + v4 = bxor v3, v2 + return v4 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 105 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r6, %r9, 0xf88(%r10) +; br %r14 + +function %bxor_bxor_rev_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bxor v0, v1 + v4 = bxor v2, v3 + return v4 +} + +; VCode: +; block0: +; veval %v24, %v26, %v24, %v25, 105 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x8a +; .byte 0x80, 0x69 +; .byte 0x9f, 0x88 +; br %r14 + +function %bxor_bnandab_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = band v0, v1 + v4 = bnot v3 + v5 = bxor v4, v2 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 169 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r10, %r9, 0xf88(%r10) +; br %r14 + +function %bxor_bnandbc_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = band v1, v2 + v4 = bnot v3 + v5 = bxor v0, v4 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 225 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r14, %r1, 0xf88(%r10) +; br %r14 + +function %bxor_borab_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bor v0, v1 + v4 = bxor v3, v2 + return v4 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 106 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r6, %r10, 0xf88(%r10) +; br %r14 + +function %bxor_borbc_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bor v1, v2 + v4 = bxor v0, v3 + return v4 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 120 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r7, %r8, 0xf88(%r10) +; br %r14 + +function %bxor_bnorab_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bor v0, v1 + v4 = bnot v3 + v5 = bxor v4, v2 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 149 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r9, %r5, 0xf88(%r10) +; br %r14 + +function %bxor_bnorbc_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bor v1, v2 + v4 = bnot v3 + v5 = bxor v0, v4 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 135 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r8, 
%r7, 0xf88(%r10) +; br %r14 + +function %bxor_bnxorab_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bxor v0, v1 + v4 = bnot v3 + v5 = bxor v4, v2 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 150 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r9, %r6, 0xf88(%r10) +; br %r14 + +function %bxor_bnxorbc_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bxor v1, v2 + v4 = bnot v3 + v5 = bxor v0, v4 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 150 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r9, %r6, 0xf88(%r10) +; br %r14 + +function %bnxor_bxor_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bxor v0, v1 + v4 = bxor v3, v2 + v5 = bnot v4 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 150 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r9, %r6, 0xf88(%r10) +; br %r14 + +function %bnxor_bxor_rev_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bxor v0, v1 + v4 = bxor v2, v3 + v5 = bnot v4 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v26, %v24, %v25, 150 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x8a +; .byte 0x80, 0x96 +; .byte 0x9f, 0x88 +; br %r14 + +function %bnxor_bnandab_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = band v0, v1 + v4 = bnot v3 + v5 = bxor v4, v2 + v6 = bnot v5 + return v6 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 86 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r5, %r6, 0xf88(%r10) +; br %r14 + +function %bnxor_bnandbc_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = band v1, v2 + v4 = bnot v3 + v5 = bxor v0, v4 + v6 = bnot v5 + return v6 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 30 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r1, %r14, 0xf88(%r10) +; br %r14 + +function %bnxor_borab_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bor v0, v1 + v4 = bxor v3, v2 + v5 = bnot v4 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 149 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r9, %r5, 0xf88(%r10) +; br %r14 + +function %bnxor_borbc_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bor v1, v2 + v4 = bxor v0, v3 + v5 = bnot v4 + return v5 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 135 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r8, %r7, 0xf88(%r10) +; br %r14 + +function %bnxor_bnorab_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bor v0, v1 + v4 = bnot v3 + v5 = bxor v4, v2 + v6 = bnot v5 + return v6 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 106 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r6, %r10, 0xf88(%r10) +; br %r14 + +function %bnxor_bnorbc_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bor v1, v2 + v4 = bnot v3 + v5 = bxor v0, v4 + v6 = bnot v5 + return v6 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 120 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm 
%r7, %r8, 0xf88(%r10) +; br %r14 + +function %bnxor_bnxorab_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bxor v0, v1 + v4 = bnot v3 + v5 = bxor v4, v2 + v6 = bnot v5 + return v6 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 105 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r6, %r9, 0xf88(%r10) +; br %r14 + +function %bnxor_bnxorbc_i64x2(i64x2, i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2, v2: i64x2): + v3 = bxor v1, v2 + v4 = bnot v3 + v5 = bxor v0, v4 + v6 = bnot v5 + return v6 +} + +; VCode: +; block0: +; veval %v24, %v24, %v25, %v26, 105 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r6, %r9, 0xf88(%r10) +; br %r14 From da628431e4b007b4a303cffe29345f64ebee4536 Mon Sep 17 00:00:00 2001 From: Jimmy Brisson Date: Tue, 25 Nov 2025 17:23:34 -0600 Subject: [PATCH 2/3] s390x: Emit vector blend on z17 --- cranelift/codegen/src/isa/s390x/inst.isle | 14 +++++++++ cranelift/codegen/src/isa/s390x/inst/emit.rs | 31 +++++++++++++++++++ .../codegen/src/isa/s390x/inst/emit_tests.rs | 10 ++++++ cranelift/codegen/src/isa/s390x/inst/mod.rs | 10 +++++- cranelift/codegen/src/isa/s390x/lower.isle | 5 +++ cranelift/codegen/src/isa/s390x/mod.rs | 2 +- .../filetests/isa/s390x/icmp-i128-arch15.clif | 17 ++++++++++ 7 files changed, 87 insertions(+), 2 deletions(-) diff --git a/cranelift/codegen/src/isa/s390x/inst.isle b/cranelift/codegen/src/isa/s390x/inst.isle index db64ed797141..388af8f7cbd1 100644 --- a/cranelift/codegen/src/isa/s390x/inst.isle +++ b/cranelift/codegen/src/isa/s390x/inst.isle @@ -600,6 +600,13 @@ (rm Reg) (ra Reg)) + ;; Vector merge instruction. + (VecBlend + (rd WritableReg) + (rn Reg) + (rm Reg) + (ra Reg)) + ;; Vector permute instruction. (VecPermute (rd WritableReg) @@ -2493,6 +2500,13 @@ (_ Unit (emit (MInst.VecPermute dst src1 src2 src3)))) dst)) +;; Helper for emitting `MInst.VecBlend` instructions. +(decl vec_blend (Type Reg Reg Reg) Reg) +(rule (vec_blend ty src1 src2 src3) + (let ((dst WritableReg (temp_writable_reg ty)) + (_ Unit (emit (MInst.VecBlend dst src1 src2 src3)))) + dst)) + ;; Helper for emitting `MInst.VecEvaluate` instructions. (decl vec_eval (Type u8 Reg Reg Reg) Reg) (rule (vec_eval ty op src1 src2 src3) diff --git a/cranelift/codegen/src/isa/s390x/inst/emit.rs b/cranelift/codegen/src/isa/s390x/inst/emit.rs index 944f63ed5842..2222faeff1b5 100644 --- a/cranelift/codegen/src/isa/s390x/inst/emit.rs +++ b/cranelift/codegen/src/isa/s390x/inst/emit.rs @@ -1176,6 +1176,33 @@ fn enc_vrr_c(opcode: u16, v1: Reg, v2: Reg, v3: Reg, m4: u8, m5: u8, m6: u8) -> enc } +/// VRRd-type instructions. +/// +/// 47 39 35 31 27 23 19 15 11 7 +/// opcode1 v1 v2 v3 m5 m6 - v4 rxb opcode2 +/// 40 36 32 28 24 20 16 12 8 0 +/// +fn enc_vrr_d(opcode: u16, v1: Reg, v2: Reg, v3: Reg, v4: Reg, m5: u8, m6: u8) -> [u8; 6] { + let opcode1 = ((opcode >> 8) & 0xff) as u8; + let opcode2 = (opcode & 0xff) as u8; + let rxb = rxb(Some(v1), Some(v2), Some(v3), Some(v4)); + let v1 = machreg_to_vr(v1) & 0x0f; + let v2 = machreg_to_vr(v2) & 0x0f; + let v3 = machreg_to_vr(v3) & 0x0f; + let v4 = machreg_to_vr(v4) & 0x0f; + let m5 = m5 & 0x0f; + let m6 = m6 & 0x0f; + + let mut enc: [u8; 6] = [0; 6]; + enc[0] = opcode1; + enc[1] = v1 << 4 | v2; + enc[2] = v3 << 4 | m5; + enc[3] = m6 << 4; + enc[4] = v4 << 4 | rxb; + enc[5] = opcode2; + enc +} + /// VRRe-type instructions. 
/// /// 47 39 35 31 27 23 19 15 11 7 @@ -2910,6 +2937,10 @@ impl Inst { let opcode = 0xe78c; // VPERM put(sink, &enc_vrr_e(opcode, rd.to_reg(), rn, rm, ra, 0, 0)); } + &Inst::VecBlend { rd, rn, rm, ra } => { + let opcode = 0xe789; // VBLEND + put(sink, &enc_vrr_d(opcode, rd.to_reg(), rn, rm, ra, 0, 0)); + } &Inst::VecEvaluate { imm, rd, diff --git a/cranelift/codegen/src/isa/s390x/inst/emit_tests.rs b/cranelift/codegen/src/isa/s390x/inst/emit_tests.rs index d69ccbd91c2a..701b50feef03 100644 --- a/cranelift/codegen/src/isa/s390x/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/s390x/inst/emit_tests.rs @@ -10593,6 +10593,16 @@ fn test_s390x_binemit() { "E7C450026788", "veval %v12, %v20, %v21, %v22, 2", )); + insns.push(( + Inst::VecBlend { + rd: writable_vr(12), + rn: vr(20), + rm: vr(21), + ra: vr(22), + }, + "E7C450006789", + "vblend %v12, %v20, %v21, %v22", + )); insns.push(( Inst::VecInt128SCmpHi { tmp: writable_vr(20), diff --git a/cranelift/codegen/src/isa/s390x/inst/mod.rs b/cranelift/codegen/src/isa/s390x/inst/mod.rs index 6247e966cde6..421cfde58fef 100644 --- a/cranelift/codegen/src/isa/s390x/inst/mod.rs +++ b/cranelift/codegen/src/isa/s390x/inst/mod.rs @@ -319,7 +319,7 @@ impl Inst { | Inst::VecLoadLaneRevUndef { .. } | Inst::VecStoreLaneRev { .. } => InstructionSet::VXRS_EXT2, - Inst::VecEvaluate { .. } => InstructionSet::VXRS_EXT3, + Inst::VecBlend { .. } | Inst::VecEvaluate { .. } => InstructionSet::VXRS_EXT3, Inst::DummyUse { .. } => InstructionSet::Base, @@ -741,6 +741,7 @@ fn s390x_get_operands(inst: &mut Inst, collector: &mut DenyReuseVisitor { collector.reg_def(rd); @@ -2714,6 +2715,13 @@ impl Inst { let ra = pretty_print_reg(ra); format!("vsel {rd}, {rn}, {rm}, {ra}") } + &Inst::VecBlend { rd, rn, rm, ra } => { + let rd = pretty_print_reg(rd.to_reg()); + let rn = pretty_print_reg(rn); + let rm = pretty_print_reg(rm); + let ra = pretty_print_reg(ra); + format!("vblend {rd}, {rn}, {rm}, {ra}") + } &Inst::VecPermute { rd, rn, rm, ra } => { let rd = pretty_print_reg(rd.to_reg()); let rn = pretty_print_reg(rn); diff --git a/cranelift/codegen/src/isa/s390x/lower.isle b/cranelift/codegen/src/isa/s390x/lower.isle index 3bf616b6229d..03e3a8dceb56 100644 --- a/cranelift/codegen/src/isa/s390x/lower.isle +++ b/cranelift/codegen/src/isa/s390x/lower.isle @@ -2596,6 +2596,11 @@ (rule (lower (shuffle x y (shuffle_mask (imm8x16 24 25 26 27 28 29 30 31 16 17 18 19 20 21 22 23) 65535))) (vec_permute_dw_imm $I8X16 y 1 y 0)) +;;;; Rules for `blendv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule 1 (lower (has_type (and (ty_vec128 ty) (vxrs_ext3_enabled)) (x86_blendv p x y))) + (vec_blend ty p x y)) + ;;;; Rules for `swizzle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/cranelift/codegen/src/isa/s390x/mod.rs b/cranelift/codegen/src/isa/s390x/mod.rs index 6bc40484153d..5af31576855f 100644 --- a/cranelift/codegen/src/isa/s390x/mod.rs +++ b/cranelift/codegen/src/isa/s390x/mod.rs @@ -186,7 +186,7 @@ impl TargetIsa for S390xBackend { } fn has_x86_blendv_lowering(&self, _: Type) -> bool { - false + self.isa_flags.has_vxrs_ext3() } fn has_x86_pshufb_lowering(&self) -> bool { diff --git a/cranelift/filetests/filetests/isa/s390x/icmp-i128-arch15.clif b/cranelift/filetests/filetests/isa/s390x/icmp-i128-arch15.clif index 93f3de14e972..2c04cdf8ce7f 100644 --- a/cranelift/filetests/filetests/isa/s390x/icmp-i128-arch15.clif +++ b/cranelift/filetests/filetests/isa/s390x/icmp-i128-arch15.clif @@ -241,3 +241,20 @@ block0(v0: i128, v1: i128): ; lochihe %r2, 1 ; br %r14 
+function %f4(i8x16, i8x16, i8x16) -> i8x16 { +block0(v0: i8x16, v1: i8x16, v2: i8x16): + v3 = x86_blendv v0, v1, v2 + return v3 +} + +; VCode: +; block0: +; vblend %v24, %v24, %v25, %v26 +; br %r14 +; +; Disassembled: +; block0: ; offset 0x0 +; .byte 0xe7, 0x88 +; stm %r0, %r0, 0xf89(%r10) +; br %r14 + From 898cc0e123ae8eac0a9a4f492add46b8d26b9454 Mon Sep 17 00:00:00 2001 From: Jimmy Brisson Date: Fri, 2 Jan 2026 10:11:18 -0600 Subject: [PATCH 3/3] Rename x86_blendv to blendv Now that s390x implements blendv as well, we should refer to the instruction without the x86 prefix. --- cranelift/codegen/meta/src/shared/instructions.rs | 2 +- cranelift/codegen/src/isa/aarch64/mod.rs | 2 +- cranelift/codegen/src/isa/mod.rs | 4 ++-- cranelift/codegen/src/isa/pulley_shared/mod.rs | 2 +- cranelift/codegen/src/isa/riscv64/mod.rs | 2 +- cranelift/codegen/src/isa/s390x/lower.isle | 2 +- cranelift/codegen/src/isa/s390x/mod.rs | 2 +- cranelift/codegen/src/isa/x64/lower.isle | 8 ++++---- cranelift/codegen/src/isa/x64/mod.rs | 2 +- .../filetests/filetests/isa/s390x/icmp-i128-arch15.clif | 2 +- cranelift/fuzzgen/src/function_generator.rs | 2 +- cranelift/interpreter/src/step.rs | 2 +- crates/cranelift/src/func_environ.rs | 4 ++-- crates/cranelift/src/translate/code_translator.rs | 4 ++-- 14 files changed, 20 insertions(+), 20 deletions(-) diff --git a/cranelift/codegen/meta/src/shared/instructions.rs b/cranelift/codegen/meta/src/shared/instructions.rs index 9604c6904bc0..a27ca30aab4a 100644 --- a/cranelift/codegen/meta/src/shared/instructions.rs +++ b/cranelift/codegen/meta/src/shared/instructions.rs @@ -1667,7 +1667,7 @@ pub(crate) fn define( ig.push( Inst::new( - "x86_blendv", + "blendv", r#" A bitselect-lookalike instruction except with the semantics of `blendv`-related instructions on x86. diff --git a/cranelift/codegen/src/isa/aarch64/mod.rs b/cranelift/codegen/src/isa/aarch64/mod.rs index 2f80ad9fa29b..e90a8aecfa57 100644 --- a/cranelift/codegen/src/isa/aarch64/mod.rs +++ b/cranelift/codegen/src/isa/aarch64/mod.rs @@ -227,7 +227,7 @@ impl TargetIsa for AArch64Backend { true } - fn has_x86_blendv_lowering(&self, _: Type) -> bool { + fn has_blendv_lowering(&self, _: Type) -> bool { false } diff --git a/cranelift/codegen/src/isa/mod.rs b/cranelift/codegen/src/isa/mod.rs index 2bc2033983ef..a33e69dee809 100644 --- a/cranelift/codegen/src/isa/mod.rs +++ b/cranelift/codegen/src/isa/mod.rs @@ -391,9 +391,9 @@ pub trait TargetIsa: fmt::Display + Send + Sync { /// Returns whether this ISA has instructions for `ceil`, `floor`, etc. fn has_round(&self) -> bool; - /// Returns whether the CLIF `x86_blendv` instruction is implemented for + /// Returns whether the CLIF `blendv` instruction is implemented for /// this ISA for the specified type. - fn has_x86_blendv_lowering(&self, ty: Type) -> bool; + fn has_blendv_lowering(&self, ty: Type) -> bool; /// Returns whether the CLIF `x86_pshufb` instruction is implemented for /// this ISA. 
diff --git a/cranelift/codegen/src/isa/pulley_shared/mod.rs b/cranelift/codegen/src/isa/pulley_shared/mod.rs index 0b781e467a08..66d0051e9e62 100644 --- a/cranelift/codegen/src/isa/pulley_shared/mod.rs +++ b/cranelift/codegen/src/isa/pulley_shared/mod.rs @@ -232,7 +232,7 @@ where true } - fn has_x86_blendv_lowering(&self, _ty: ir::Type) -> bool { + fn has_blendv_lowering(&self, _ty: ir::Type) -> bool { false } diff --git a/cranelift/codegen/src/isa/riscv64/mod.rs b/cranelift/codegen/src/isa/riscv64/mod.rs index ca0a1a13e2e9..f41700825df2 100644 --- a/cranelift/codegen/src/isa/riscv64/mod.rs +++ b/cranelift/codegen/src/isa/riscv64/mod.rs @@ -205,7 +205,7 @@ impl TargetIsa for Riscv64Backend { true } - fn has_x86_blendv_lowering(&self, _: Type) -> bool { + fn has_blendv_lowering(&self, _: Type) -> bool { false } diff --git a/cranelift/codegen/src/isa/s390x/lower.isle b/cranelift/codegen/src/isa/s390x/lower.isle index 03e3a8dceb56..f92bdd01199f 100644 --- a/cranelift/codegen/src/isa/s390x/lower.isle +++ b/cranelift/codegen/src/isa/s390x/lower.isle @@ -2598,7 +2598,7 @@ ;;;; Rules for `blendv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(rule 1 (lower (has_type (and (ty_vec128 ty) (vxrs_ext3_enabled)) (x86_blendv p x y))) +(rule 1 (lower (has_type (and (ty_vec128 ty) (vxrs_ext3_enabled)) (blendv p x y))) (vec_blend ty p x y)) diff --git a/cranelift/codegen/src/isa/s390x/mod.rs b/cranelift/codegen/src/isa/s390x/mod.rs index 5af31576855f..1f2c2461ee02 100644 --- a/cranelift/codegen/src/isa/s390x/mod.rs +++ b/cranelift/codegen/src/isa/s390x/mod.rs @@ -185,7 +185,7 @@ impl TargetIsa for S390xBackend { true } - fn has_x86_blendv_lowering(&self, _: Type) -> bool { + fn has_blendv_lowering(&self, _: Type) -> bool { self.isa_flags.has_vxrs_ext3() } diff --git a/cranelift/codegen/src/isa/x64/lower.isle b/cranelift/codegen/src/isa/x64/lower.isle index e8922537bdd2..ca96b7830fa1 100644 --- a/cranelift/codegen/src/isa/x64/lower.isle +++ b/cranelift/codegen/src/isa/x64/lower.isle @@ -1558,20 +1558,20 @@ (b Xmm (sse_and ty c_neg f))) (sse_or ty a b))) -;;;; Rules for `x86_blendv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;; Rules for `blendv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (rule (lower (has_type $I8X16 - (x86_blendv condition if_true if_false))) + (blendv condition if_true if_false))) (if-let true (has_sse41)) (x64_pblendvb if_false if_true condition)) (rule (lower (has_type $I32X4 - (x86_blendv condition if_true if_false))) + (blendv condition if_true if_false))) (if-let true (has_sse41)) (x64_blendvps if_false if_true condition)) (rule (lower (has_type $I64X2 - (x86_blendv condition if_true if_false))) + (blendv condition if_true if_false))) (if-let true (has_sse41)) (x64_blendvpd if_false if_true condition)) diff --git a/cranelift/codegen/src/isa/x64/mod.rs b/cranelift/codegen/src/isa/x64/mod.rs index 2559a97b6863..b47b33178431 100644 --- a/cranelift/codegen/src/isa/x64/mod.rs +++ b/cranelift/codegen/src/isa/x64/mod.rs @@ -179,7 +179,7 @@ impl TargetIsa for X64Backend { self.x64_flags.has_sse41() } - fn has_x86_blendv_lowering(&self, ty: Type) -> bool { + fn has_blendv_lowering(&self, ty: Type) -> bool { // The `blendvpd`, `blendvps`, and `pblendvb` instructions are all only // available from SSE 4.1 and onwards. 
Otherwise the i16x8 type has no // equivalent instruction which only looks at the top bit for a select diff --git a/cranelift/filetests/filetests/isa/s390x/icmp-i128-arch15.clif b/cranelift/filetests/filetests/isa/s390x/icmp-i128-arch15.clif index 2c04cdf8ce7f..4a97ff856868 100644 --- a/cranelift/filetests/filetests/isa/s390x/icmp-i128-arch15.clif +++ b/cranelift/filetests/filetests/isa/s390x/icmp-i128-arch15.clif @@ -243,7 +243,7 @@ block0(v0: i128, v1: i128): function %f4(i8x16, i8x16, i8x16) -> i8x16 { block0(v0: i8x16, v1: i8x16, v2: i8x16): - v3 = x86_blendv v0, v1, v2 + v3 = blendv v0, v1, v2 return v3 } diff --git a/cranelift/fuzzgen/src/function_generator.rs b/cranelift/fuzzgen/src/function_generator.rs index d53db22a8f52..4758af801b1d 100644 --- a/cranelift/fuzzgen/src/function_generator.rs +++ b/cranelift/fuzzgen/src/function_generator.rs @@ -912,7 +912,7 @@ static OPCODE_SIGNATURES: LazyLock> = LazyLock::new(|| { (Opcode::GetFramePointer), (Opcode::GetStackPointer), (Opcode::GetReturnAddress), - (Opcode::X86Blendv), + (Opcode::Blendv), (Opcode::IcmpImm), (Opcode::X86Pmulhrsw), (Opcode::IaddImm), diff --git a/cranelift/interpreter/src/step.rs b/cranelift/interpreter/src/step.rs index 8a7fad2323d9..3476305c9f83 100644 --- a/cranelift/interpreter/src/step.rs +++ b/cranelift/interpreter/src/step.rs @@ -1311,7 +1311,7 @@ where Opcode::GetStackPointer => unimplemented!("GetStackPointer"), Opcode::GetReturnAddress => unimplemented!("GetReturnAddress"), Opcode::X86Pshufb => unimplemented!("X86Pshufb"), - Opcode::X86Blendv => unimplemented!("X86Blendv"), + Opcode::Blendv => unimplemented!("Blendv"), Opcode::X86Pmulhrsw => unimplemented!("X86Pmulhrsw"), Opcode::X86Pmaddubsw => unimplemented!("X86Pmaddubsw"), Opcode::X86Cvtt2dq => unimplemented!("X86Cvtt2dq"), diff --git a/crates/cranelift/src/func_environ.rs b/crates/cranelift/src/func_environ.rs index eb9581f9e3de..028e7bd40489 100644 --- a/crates/cranelift/src/func_environ.rs +++ b/crates/cranelift/src/func_environ.rs @@ -3994,8 +3994,8 @@ impl FuncEnvironment<'_> { .returns() } - pub fn use_x86_blendv_for_relaxed_laneselect(&self, ty: Type) -> bool { - self.isa.has_x86_blendv_lowering(ty) + pub fn use_blendv_for_relaxed_laneselect(&self, ty: Type) -> bool { + self.isa.has_blendv_lowering(ty) } pub fn use_x86_pmulhrsw_for_relaxed_q15mul(&self) -> bool { diff --git a/crates/cranelift/src/translate/code_translator.rs b/crates/cranelift/src/translate/code_translator.rs index e2dc4b539c8c..b7565bd24e12 100644 --- a/crates/cranelift/src/translate/code_translator.rs +++ b/crates/cranelift/src/translate/code_translator.rs @@ -2481,13 +2481,13 @@ pub fn translate_operator( // op. environ.stacks.push1( if environ.relaxed_simd_deterministic() - || !environ.use_x86_blendv_for_relaxed_laneselect(ty) + || !environ.use_blendv_for_relaxed_laneselect(ty) { // Deterministic semantics are a `bitselect` along the lines // of the wasm `v128.bitselect` instruction. builder.ins().bitselect(c, a, b) } else { - builder.ins().x86_blendv(c, a, b) + builder.ins().blendv(c, a, b) }, ); }
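
A note on the two semantics being chosen between in the final hunk: `bitselect` consumes every bit of the condition vector, while `blendv` keeps the x86 behaviour of consulting only the most significant bit of each condition lane. Below is a minimal stand-alone sketch of that difference on 8-bit lanes, under the semantics described in the instruction documentation above; it is an illustrative model only, not Cranelift or Wasmtime code, and the helper names are invented for the example.

// Illustrative model only -- not Cranelift code.
// bitselect: every bit of `c` picks the corresponding bit of `a` or `b`.
// blendv:    only the top bit of each lane of `c` picks the whole lane.

fn bitselect_i8x16(c: [u8; 16], a: [u8; 16], b: [u8; 16]) -> [u8; 16] {
    core::array::from_fn(|i| (a[i] & c[i]) | (b[i] & !c[i]))
}

fn blendv_i8x16(c: [u8; 16], a: [u8; 16], b: [u8; 16]) -> [u8; 16] {
    core::array::from_fn(|i| if c[i] & 0x80 != 0 { a[i] } else { b[i] })
}

fn main() {
    // A mask with the top bit clear in every lane: blendv takes `b` wholesale,
    // while bitselect still mixes the low seven bits of `a` in.
    let (c, a, b) = ([0x7f; 16], [0xaa; 16], [0x55; 16]);
    assert_eq!(blendv_i8x16(c, a, b), [0x55; 16]);
    assert_eq!(bitselect_i8x16(c, a, b), [0x2a; 16]); // (0xaa & 0x7f) | (0x55 & 0x80)
}

Either result is acceptable for Wasm relaxed laneselect, which is why the deterministic path in translate_operator keeps using `bitselect` and the `blendv` path is only taken when the target reports `has_blendv_lowering`.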