From d8c97ca6310be6a97cc2d193ce2ea58e2ed67189 Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Thu, 31 Jul 2025 12:01:20 +0000 Subject: [PATCH 1/3] Add tests --- .../AArch64/fixed-vector-interleave.ll | 82 ++++- .../CodeGen/AArch64/sve-vector-interleave.ll | 170 +++++++++- .../RISCV/rvv/vector-interleave-fixed.ll | 226 +++++++++++++ .../CodeGen/RISCV/rvv/vector-interleave.ll | 300 ++++++++++++++++++ 4 files changed, 776 insertions(+), 2 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll index a9618fdc2dec3..8ccc9f97e5133 100644 --- a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll +++ b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll @@ -131,6 +131,85 @@ define <4 x i64> @interleave2_v4i64(<2 x i64> %vec0, <2 x i64> %vec1) { ret <4 x i64> %retval } +define <4 x i16> @interleave2_same_const_splat_v4i16() { +; CHECK-SD-LABEL: interleave2_same_const_splat_v4i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: movi v0.4h, #3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: interleave2_same_const_splat_v4i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #3 // =0x3 +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: zip1 v0.4h, v0.4h, v0.4h +; CHECK-GI-NEXT: ret + %retval = call <4 x i16> @llvm.vector.interleave2.v4i16(<2 x i16> splat(i16 3), <2 x i16> splat(i16 3)) + ret <4 x i16> %retval +} + +define <4 x i16> @interleave2_diff_const_splat_v4i16() { +; CHECK-SD-LABEL: interleave2_diff_const_splat_v4i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: adrp x8, .LCPI11_0 +; CHECK-SD-NEXT: ldr d0, [x8, :lo12:.LCPI11_0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: interleave2_diff_const_splat_v4i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #3 // =0x3 +; CHECK-GI-NEXT: mov w9, #4 // =0x4 +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: fmov s1, w9 +; CHECK-GI-NEXT: mov v0.h[1], w8 +; CHECK-GI-NEXT: mov v1.h[1], w9 +; CHECK-GI-NEXT: zip1 v0.4h, v0.4h, v1.4h +; CHECK-GI-NEXT: ret + %retval = call <4 x i16> @llvm.vector.interleave2.v4i16(<2 x i16> splat(i16 3), <2 x i16> splat(i16 4)) + ret <4 x i16> %retval +} + +define <4 x i16> @interleave2_same_nonconst_splat_v4i16(i16 %a) { +; CHECK-SD-LABEL: interleave2_same_nonconst_splat_v4i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: dup v0.4h, w0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: interleave2_same_nonconst_splat_v4i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: dup v0.4h, w0 +; CHECK-GI-NEXT: zip1 v0.4h, v0.4h, v0.4h +; CHECK-GI-NEXT: ret + %ins = insertelement <2 x i16> poison, i16 %a, i32 0 + %splat = shufflevector <2 x i16> %ins, <2 x i16> poison, <2 x i32> + %retval = call <4 x i16> @llvm.vector.interleave2.v4i16(<2 x i16> %splat, <2 x i16> %splat) + ret <4 x i16> %retval +} + +define <4 x i16> @interleave2_diff_nonconst_splat_v4i16(i16 %a, i16 %b) { +; CHECK-SD-LABEL: interleave2_diff_nonconst_splat_v4i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: mov v0.h[1], w0 +; CHECK-SD-NEXT: mov v0.h[2], w1 +; CHECK-SD-NEXT: mov v0.h[3], w1 +; CHECK-SD-NEXT: rev32 v1.4h, v0.4h +; CHECK-SD-NEXT: uzp1 v0.4h, v0.4h, v1.4h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: interleave2_diff_nonconst_splat_v4i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: dup v0.4h, w0 +; CHECK-GI-NEXT: dup v1.4h, w1 +; CHECK-GI-NEXT: zip1 v0.4h, v0.4h, v1.4h +; CHECK-GI-NEXT: ret + %ins1 = insertelement <2 x i16> poison, i16 %a, i32 0 + %splat1 = shufflevector <2 x i16> %ins1, <2 x i16> poison, <2 x i32> + %ins2 = insertelement <2 x i16> 
poison, i16 %b, i32 0 + %splat2 = shufflevector <2 x i16> %ins2, <2 x i16> poison, <2 x i32> + %retval = call <4 x i16> @llvm.vector.interleave2.v4i16(<2 x i16> %splat1, <2 x i16> %splat2) + ret <4 x i16> %retval +} ; Float declarations declare <4 x half> @llvm.vector.interleave2.v4f16(<2 x half>, <2 x half>) @@ -145,4 +224,5 @@ declare <32 x i8> @llvm.vector.interleave2.v32i8(<16 x i8>, <16 x i8>) declare <16 x i16> @llvm.vector.interleave2.v16i16(<8 x i16>, <8 x i16>) declare <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32>, <4 x i32>) declare <4 x i64> @llvm.vector.interleave2.v4i64(<2 x i64>, <2 x i64>) - +declare <4 x i16> @llvm.vector.interleave2.v4i16(<2 x i16>, <2 x i16>) +declare <8 x i16> @llvm.vector.interleave4.v8i16(<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>) diff --git a/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll index 52cb2d9ebe343..b954863560899 100644 --- a/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll +++ b/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll @@ -267,7 +267,7 @@ define @interleave4_nxv8i16( %vec0, @llvm.vector.interleave4.nxv8i16( %vec0, %vec1, %vec2, %vec3) + %retval = call @llvm.vector.interleave4.nxv32i16( %vec0, %vec1, %vec2, %vec3) ret %retval } @@ -540,6 +540,172 @@ define @interleave2_nxv2i32( %vec0, %retval } +define @interleave2_same_const_splat_nxv4i16() { +; SVE-LABEL: interleave2_same_const_splat_nxv4i16: +; SVE: // %bb.0: +; SVE-NEXT: mov z0.d, #3 // =0x3 +; SVE-NEXT: zip2 z1.d, z0.d, z0.d +; SVE-NEXT: zip1 z0.d, z0.d, z0.d +; SVE-NEXT: uzp1 z0.s, z0.s, z1.s +; SVE-NEXT: ret +; +; SME2-LABEL: interleave2_same_const_splat_nxv4i16: +; SME2: // %bb.0: +; SME2-NEXT: mov z0.d, #3 // =0x3 +; SME2-NEXT: zip { z0.d, z1.d }, z0.d, z0.d +; SME2-NEXT: uzp1 z0.s, z0.s, z1.s +; SME2-NEXT: ret + %retval = call @llvm.vector.interleave2.nxv4i16( splat(i16 3), splat(i16 3)) + ret %retval +} + +define @interleave2_diff_const_splat_nxv4i16() { +; SVE-LABEL: interleave2_diff_const_splat_nxv4i16: +; SVE: // %bb.0: +; SVE-NEXT: mov z0.d, #4 // =0x4 +; SVE-NEXT: mov z1.d, #3 // =0x3 +; SVE-NEXT: zip2 z2.d, z1.d, z0.d +; SVE-NEXT: zip1 z0.d, z1.d, z0.d +; SVE-NEXT: uzp1 z0.s, z0.s, z2.s +; SVE-NEXT: ret +; +; SME2-LABEL: interleave2_diff_const_splat_nxv4i16: +; SME2: // %bb.0: +; SME2-NEXT: mov z0.d, #4 // =0x4 +; SME2-NEXT: mov z1.d, #3 // =0x3 +; SME2-NEXT: zip { z0.d, z1.d }, z1.d, z0.d +; SME2-NEXT: uzp1 z0.s, z0.s, z1.s +; SME2-NEXT: ret + %retval = call @llvm.vector.interleave2.v4i16( splat(i16 3), splat(i16 4)) + ret %retval +} + +define @interleave2_same_nonconst_splat_nxv4i16(i16 %a) { +; SVE-LABEL: interleave2_same_nonconst_splat_nxv4i16: +; SVE: // %bb.0: +; SVE-NEXT: // kill: def $w0 killed $w0 def $x0 +; SVE-NEXT: mov z0.d, x0 +; SVE-NEXT: zip2 z1.d, z0.d, z0.d +; SVE-NEXT: zip1 z0.d, z0.d, z0.d +; SVE-NEXT: uzp1 z0.s, z0.s, z1.s +; SVE-NEXT: ret +; +; SME2-LABEL: interleave2_same_nonconst_splat_nxv4i16: +; SME2: // %bb.0: +; SME2-NEXT: // kill: def $w0 killed $w0 def $x0 +; SME2-NEXT: mov z0.d, x0 +; SME2-NEXT: zip { z0.d, z1.d }, z0.d, z0.d +; SME2-NEXT: uzp1 z0.s, z0.s, z1.s +; SME2-NEXT: ret + %ins = insertelement poison, i16 %a, i32 0 + %splat = shufflevector %ins, poison, zeroinitializer + %retval = call @llvm.vector.interleave2.nxv4i16( %splat, %splat) + ret %retval +} + +define @interleave2_diff_nonconst_splat_nxv4i16(i16 %a, i16 %b) { +; SVE-LABEL: interleave2_diff_nonconst_splat_nxv4i16: +; SVE: // %bb.0: +; SVE-NEXT: // kill: def $w1 killed $w1 def $x1 +; SVE-NEXT: // kill: def 
$w0 killed $w0 def $x0 +; SVE-NEXT: mov z0.d, x0 +; SVE-NEXT: mov z1.d, x1 +; SVE-NEXT: zip2 z2.d, z0.d, z1.d +; SVE-NEXT: zip1 z0.d, z0.d, z1.d +; SVE-NEXT: uzp1 z0.s, z0.s, z2.s +; SVE-NEXT: ret +; +; SME2-LABEL: interleave2_diff_nonconst_splat_nxv4i16: +; SME2: // %bb.0: +; SME2-NEXT: // kill: def $w1 killed $w1 def $x1 +; SME2-NEXT: // kill: def $w0 killed $w0 def $x0 +; SME2-NEXT: mov z0.d, x0 +; SME2-NEXT: mov z1.d, x1 +; SME2-NEXT: zip { z0.d, z1.d }, z0.d, z1.d +; SME2-NEXT: uzp1 z0.s, z0.s, z1.s +; SME2-NEXT: ret + %ins1 = insertelement poison, i16 %a, i32 0 + %splat1 = shufflevector %ins1, poison, zeroinitializer + %ins2 = insertelement poison, i16 %b, i32 0 + %splat2 = shufflevector %ins2, poison, zeroinitializer + %retval = call @llvm.vector.interleave2.nxv4i16( %splat1, %splat2) + ret %retval +} + +define @interleave4_same_const_splat_nxv8i16() { +; SVE-LABEL: interleave4_same_const_splat_nxv8i16: +; SVE: // %bb.0: +; SVE-NEXT: mov z0.d, #3 // =0x3 +; SVE-NEXT: zip1 z1.d, z0.d, z0.d +; SVE-NEXT: zip1 z2.d, z1.d, z1.d +; SVE-NEXT: zip2 z1.d, z1.d, z1.d +; SVE-NEXT: uzp1 z2.s, z2.s, z0.s +; SVE-NEXT: uzp1 z2.h, z2.h, z0.h +; SVE-NEXT: uunpklo z2.s, z2.h +; SVE-NEXT: uunpklo z2.d, z2.s +; SVE-NEXT: uzp1 z1.s, z2.s, z1.s +; SVE-NEXT: uzp1 z2.h, z1.h, z0.h +; SVE-NEXT: zip2 z0.d, z0.d, z0.d +; SVE-NEXT: uunpkhi z2.s, z2.h +; SVE-NEXT: zip1 z3.d, z0.d, z0.d +; SVE-NEXT: zip2 z0.d, z0.d, z0.d +; SVE-NEXT: uunpkhi z2.d, z2.s +; SVE-NEXT: uzp1 z2.s, z3.s, z2.s +; SVE-NEXT: uzp1 z2.h, z1.h, z2.h +; SVE-NEXT: uunpkhi z2.s, z2.h +; SVE-NEXT: uunpklo z2.d, z2.s +; SVE-NEXT: uzp1 z0.s, z2.s, z0.s +; SVE-NEXT: uzp1 z0.h, z1.h, z0.h +; SVE-NEXT: ret +; +; SME-ALL-LABEL: interleave4_same_const_splat_nxv8i16: +; SME-ALL: // %bb.0: +; SME-ALL-NEXT: mov z0.d, #3 // =0x3 +; SME-ALL-NEXT: zip { z0.d, z1.d }, z0.d, z0.d +; SME-ALL-NEXT: zip { z2.d, z3.d }, z0.d, z0.d +; SME-ALL-NEXT: uzp1 z4.s, z2.s, z0.s +; SME-ALL-NEXT: uzp1 z4.h, z4.h, z0.h +; SME-ALL-NEXT: uunpklo z4.s, z4.h +; SME-ALL-NEXT: uunpklo z4.d, z4.s +; SME-ALL-NEXT: uzp1 z2.s, z4.s, z3.s +; SME-ALL-NEXT: uzp1 z3.h, z2.h, z0.h +; SME-ALL-NEXT: zip { z0.d, z1.d }, z1.d, z1.d +; SME-ALL-NEXT: uunpkhi z3.s, z3.h +; SME-ALL-NEXT: uunpkhi z3.d, z3.s +; SME-ALL-NEXT: uzp1 z3.s, z0.s, z3.s +; SME-ALL-NEXT: uzp1 z3.h, z2.h, z3.h +; SME-ALL-NEXT: uunpkhi z3.s, z3.h +; SME-ALL-NEXT: uunpklo z3.d, z3.s +; SME-ALL-NEXT: uzp1 z0.s, z3.s, z1.s +; SME-ALL-NEXT: uzp1 z0.h, z2.h, z0.h +; SME-ALL-NEXT: ret +; +; SME2-256-LABEL: interleave4_same_const_splat_nxv8i16: +; SME2-256: // %bb.0: +; SME2-256-NEXT: mov z0.d, #3 // =0x3 +; SME2-256-NEXT: mov z1.d, z0.d +; SME2-256-NEXT: mov z2.d, z0.d +; SME2-256-NEXT: mov z3.d, z0.d +; SME2-256-NEXT: zip { z0.d - z3.d }, { z0.d - z3.d } +; SME2-256-NEXT: uzp1 z4.s, z0.s, z0.s +; SME2-256-NEXT: uzp1 z4.h, z4.h, z0.h +; SME2-256-NEXT: uunpklo z4.s, z4.h +; SME2-256-NEXT: uunpklo z4.d, z4.s +; SME2-256-NEXT: uzp1 z4.s, z4.s, z1.s +; SME2-256-NEXT: uzp1 z5.h, z4.h, z0.h +; SME2-256-NEXT: uunpkhi z5.s, z5.h +; SME2-256-NEXT: uunpkhi z5.d, z5.s +; SME2-256-NEXT: uzp1 z5.s, z2.s, z5.s +; SME2-256-NEXT: uzp1 z5.h, z4.h, z5.h +; SME2-256-NEXT: uunpkhi z5.s, z5.h +; SME2-256-NEXT: uunpklo z5.d, z5.s +; SME2-256-NEXT: uzp1 z0.s, z5.s, z3.s +; SME2-256-NEXT: uzp1 z0.h, z4.h, z0.h +; SME2-256-NEXT: ret + %retval = call @llvm.vector.interleave4.nxv8i16( splat(i16 3), splat(i16 3), splat(i16 3), splat(i16 3)) + ret %retval +} + ; Float declarations declare @llvm.vector.interleave2.nxv4f16(, ) declare 
@llvm.vector.interleave2.nxv8f16(, ) @@ -567,3 +733,5 @@ declare @llvm.vector.interleave2.nxv8i64(, declare @llvm.vector.interleave2.nxv16i8(, ) declare @llvm.vector.interleave2.nxv8i16(, ) declare @llvm.vector.interleave2.nxv4i32(, ) +declare @llvm.vector.interleave2.nxv4i16(, ) +declare @llvm.vector.interleave4.nxv8i16(, , , ) diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll index 3dc83d50ee3f3..f3ba7fe33fa48 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll @@ -1636,3 +1636,229 @@ define <8 x half> @vector_interleave8_v8f16_v1f16(<1 x half> %a, <1 x half> %b, %res = call <8 x half> @llvm.vector.interleave8.v8f16(<1 x half> %a, <1 x half> %b, <1 x half> %c, <1 x half> %d, <1 x half> %e, <1 x half> %f, <1 x half> %g, <1 x half> %h) ret <8 x half> %res } + +define <8 x i16> @interleave4_const_splat_v8i16(<2 x i16> %a) { +; CHECK-LABEL: interleave4_const_splat_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.i v8, 3 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: srli a1, a1, 2 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: add a2, a0, a1 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vsetvli a3, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsseg4e16.v v8, (a0) +; CHECK-NEXT: add a3, a2, a1 +; CHECK-NEXT: add a1, a3, a1 +; CHECK-NEXT: vle16.v v9, (a3) +; CHECK-NEXT: vle16.v v10, (a2) +; CHECK-NEXT: vle16.v v11, (a1) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vslideup.vi v9, v11, 2 +; CHECK-NEXT: vslideup.vi v8, v10, 2 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vi v8, v9, 4 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: interleave4_const_splat_v8i16: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: .cfi_def_cfa_offset 16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb +; ZVBB-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVBB-NEXT: vmv.v.i v8, 3 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: vmv1r.v v9, v8 +; ZVBB-NEXT: srli a1, a1, 2 +; ZVBB-NEXT: vmv1r.v v10, v8 +; ZVBB-NEXT: add a2, a0, a1 +; ZVBB-NEXT: vmv1r.v v11, v8 +; ZVBB-NEXT: vsetvli a3, zero, e16, mf4, ta, ma +; ZVBB-NEXT: vsseg4e16.v v8, (a0) +; ZVBB-NEXT: add a3, a2, a1 +; ZVBB-NEXT: add a1, a3, a1 +; ZVBB-NEXT: vle16.v v9, (a3) +; ZVBB-NEXT: vle16.v v10, (a2) +; ZVBB-NEXT: vle16.v v11, (a1) +; ZVBB-NEXT: vle16.v v8, (a0) +; ZVBB-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVBB-NEXT: vslideup.vi v9, v11, 2 +; ZVBB-NEXT: vslideup.vi v8, v10, 2 +; ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVBB-NEXT: vslideup.vi v8, v9, 4 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: .cfi_def_cfa sp, 16 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: .cfi_def_cfa_offset 0 +; ZVBB-NEXT: ret +; +; 
ZIP-LABEL: interleave4_const_splat_v8i16: +; ZIP: # %bb.0: +; ZIP-NEXT: addi sp, sp, -16 +; ZIP-NEXT: .cfi_def_cfa_offset 16 +; ZIP-NEXT: csrr a0, vlenb +; ZIP-NEXT: sub sp, sp, a0 +; ZIP-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb +; ZIP-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZIP-NEXT: vmv.v.i v8, 3 +; ZIP-NEXT: addi a0, sp, 16 +; ZIP-NEXT: csrr a1, vlenb +; ZIP-NEXT: vmv1r.v v9, v8 +; ZIP-NEXT: srli a1, a1, 2 +; ZIP-NEXT: vmv1r.v v10, v8 +; ZIP-NEXT: add a2, a0, a1 +; ZIP-NEXT: vmv1r.v v11, v8 +; ZIP-NEXT: vsetvli a3, zero, e16, mf4, ta, ma +; ZIP-NEXT: vsseg4e16.v v8, (a0) +; ZIP-NEXT: add a3, a2, a1 +; ZIP-NEXT: add a1, a3, a1 +; ZIP-NEXT: vle16.v v9, (a3) +; ZIP-NEXT: vle16.v v10, (a2) +; ZIP-NEXT: vle16.v v11, (a1) +; ZIP-NEXT: vle16.v v8, (a0) +; ZIP-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZIP-NEXT: vslideup.vi v9, v11, 2 +; ZIP-NEXT: vslideup.vi v8, v10, 2 +; ZIP-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZIP-NEXT: vslideup.vi v8, v9, 4 +; ZIP-NEXT: csrr a0, vlenb +; ZIP-NEXT: add sp, sp, a0 +; ZIP-NEXT: .cfi_def_cfa sp, 16 +; ZIP-NEXT: addi sp, sp, 16 +; ZIP-NEXT: .cfi_def_cfa_offset 0 +; ZIP-NEXT: ret + %retval = call <8 x i16> @llvm.vector.interleave4.v8i16(<2 x i16> splat(i16 3), <2 x i16> splat(i16 3), <2 x i16> splat(i16 3), <2 x i16> splat(i16 3)) + ret <8 x i16> %retval +} + +define <8 x i16> @interleave4_same_nonconst_splat_v8i16(i16 %a) { +; CHECK-LABEL: interleave4_same_nonconst_splat_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: srli a1, a1, 2 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: add a2, a0, a1 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vsetvli a3, zero, e16, mf4, ta, ma +; CHECK-NEXT: vsseg4e16.v v8, (a0) +; CHECK-NEXT: add a3, a2, a1 +; CHECK-NEXT: add a1, a3, a1 +; CHECK-NEXT: vle16.v v9, (a3) +; CHECK-NEXT: vle16.v v10, (a2) +; CHECK-NEXT: vle16.v v11, (a1) +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vslideup.vi v9, v11, 2 +; CHECK-NEXT: vslideup.vi v8, v10, 2 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vi v8, v9, 4 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: interleave4_same_nonconst_splat_v8i16: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: .cfi_def_cfa_offset 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: sub sp, sp, a1 +; ZVBB-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb +; ZVBB-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVBB-NEXT: vmv.v.x v8, a0 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: vmv1r.v v9, v8 +; ZVBB-NEXT: srli a1, a1, 2 +; ZVBB-NEXT: vmv1r.v v10, v8 +; ZVBB-NEXT: add a2, a0, a1 +; ZVBB-NEXT: vmv1r.v v11, v8 +; ZVBB-NEXT: vsetvli a3, zero, e16, mf4, ta, ma +; ZVBB-NEXT: vsseg4e16.v v8, (a0) +; ZVBB-NEXT: add a3, a2, a1 +; ZVBB-NEXT: add a1, a3, a1 +; 
ZVBB-NEXT: vle16.v v9, (a3) +; ZVBB-NEXT: vle16.v v10, (a2) +; ZVBB-NEXT: vle16.v v11, (a1) +; ZVBB-NEXT: vle16.v v8, (a0) +; ZVBB-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVBB-NEXT: vslideup.vi v9, v11, 2 +; ZVBB-NEXT: vslideup.vi v8, v10, 2 +; ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVBB-NEXT: vslideup.vi v8, v9, 4 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: .cfi_def_cfa sp, 16 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: .cfi_def_cfa_offset 0 +; ZVBB-NEXT: ret +; +; ZIP-LABEL: interleave4_same_nonconst_splat_v8i16: +; ZIP: # %bb.0: +; ZIP-NEXT: addi sp, sp, -16 +; ZIP-NEXT: .cfi_def_cfa_offset 16 +; ZIP-NEXT: csrr a1, vlenb +; ZIP-NEXT: sub sp, sp, a1 +; ZIP-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb +; ZIP-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZIP-NEXT: vmv.v.x v8, a0 +; ZIP-NEXT: addi a0, sp, 16 +; ZIP-NEXT: csrr a1, vlenb +; ZIP-NEXT: vmv1r.v v9, v8 +; ZIP-NEXT: srli a1, a1, 2 +; ZIP-NEXT: vmv1r.v v10, v8 +; ZIP-NEXT: add a2, a0, a1 +; ZIP-NEXT: vmv1r.v v11, v8 +; ZIP-NEXT: vsetvli a3, zero, e16, mf4, ta, ma +; ZIP-NEXT: vsseg4e16.v v8, (a0) +; ZIP-NEXT: add a3, a2, a1 +; ZIP-NEXT: add a1, a3, a1 +; ZIP-NEXT: vle16.v v9, (a3) +; ZIP-NEXT: vle16.v v10, (a2) +; ZIP-NEXT: vle16.v v11, (a1) +; ZIP-NEXT: vle16.v v8, (a0) +; ZIP-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZIP-NEXT: vslideup.vi v9, v11, 2 +; ZIP-NEXT: vslideup.vi v8, v10, 2 +; ZIP-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZIP-NEXT: vslideup.vi v8, v9, 4 +; ZIP-NEXT: csrr a0, vlenb +; ZIP-NEXT: add sp, sp, a0 +; ZIP-NEXT: .cfi_def_cfa sp, 16 +; ZIP-NEXT: addi sp, sp, 16 +; ZIP-NEXT: .cfi_def_cfa_offset 0 +; ZIP-NEXT: ret + %ins = insertelement <2 x i16> poison, i16 %a, i32 0 + %splat = shufflevector <2 x i16> %ins, <2 x i16> poison, <2 x i32> zeroinitializer + %retval = call <8 x i16> @llvm.vector.interleave4.v8i16(<2 x i16> %splat, <2 x i16> %splat, <2 x i16> %splat, <2 x i16> %splat) + ret <8 x i16> %retval +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll index 01cc5c58b24ce..7a977ff9b4e3a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll @@ -14947,3 +14947,303 @@ define @vector_interleave_nxv16f64_nxv2f64( @llvm.vector.interleave8.nxv16f64( %v0, %v1, %v2, %v3, %v4, %v5, %v6, %v7) ret %res } + +define @interleave2_same_const_splat_nxv4i16() { +; V-LABEL: interleave2_same_const_splat_nxv4i16: +; V: # %bb.0: +; V-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; V-NEXT: vmv.v.i v9, 3 +; V-NEXT: li a0, 3 +; V-NEXT: vmv.v.i v10, -1 +; V-NEXT: vwaddu.vx v8, v9, a0 +; V-NEXT: vwmaccu.vx v8, a0, v10 +; V-NEXT: csrr a0, vlenb +; V-NEXT: srli a0, a0, 2 +; V-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; V-NEXT: vslidedown.vx v9, v8, a0 +; V-NEXT: vslideup.vx v8, v9, a0 +; V-NEXT: ret +; +; ZVBB-LABEL: interleave2_same_const_splat_nxv4i16: +; ZVBB: # %bb.0: +; ZVBB-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVBB-NEXT: vmv.v.i v8, 3 +; ZVBB-NEXT: li a0, 3 +; ZVBB-NEXT: vwsll.vi v9, v8, 16 +; ZVBB-NEXT: vwaddu.wx v8, v9, a0 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: srli a0, a0, 2 +; ZVBB-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; ZVBB-NEXT: vslidedown.vx v9, v8, a0 +; ZVBB-NEXT: vslideup.vx v8, v9, a0 +; ZVBB-NEXT: ret +; +; ZIP-LABEL: interleave2_same_const_splat_nxv4i16: +; ZIP: # %bb.0: +; ZIP-NEXT: csrr a0, vlenb +; ZIP-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; ZIP-NEXT: 
vmv.v.i v9, 3 +; ZIP-NEXT: srli a0, a0, 2 +; ZIP-NEXT: ri.vzip2b.vv v10, v9, v9 +; ZIP-NEXT: ri.vzip2a.vv v8, v9, v9 +; ZIP-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; ZIP-NEXT: vslideup.vx v8, v10, a0 +; ZIP-NEXT: ret + %retval = call @llvm.vector.interleave2.nxv4i16( splat(i16 3), splat(i16 3)) + ret %retval +} + +define @interleave2_diff_const_splat_nxv4i16() { +; SVE-LABEL: interleave2_diff_const_splat_nxv4i16: +; SVE: // %bb.0: +; SVE-NEXT: mov z0.d, #4 // =0x4 +; SVE-NEXT: mov z1.d, #3 // =0x3 +; SVE-NEXT: zip2 z2.d, z1.d, z0.d +; SVE-NEXT: zip1 z0.d, z1.d, z0.d +; SVE-NEXT: uzp1 z0.s, z0.s, z2.s +; SVE-NEXT: ret +; +; SME2-LABEL: interleave2_diff_const_splat_nxv4i16: +; SME2: // %bb.0: +; SME2-NEXT: mov z0.d, #4 // =0x4 +; SME2-NEXT: mov z1.d, #3 // =0x3 +; SME2-NEXT: zip { z0.d, z1.d }, z1.d, z0.d +; SME2-NEXT: uzp1 z0.s, z0.s, z1.s +; SME2-NEXT: ret +; V-LABEL: interleave2_diff_const_splat_nxv4i16: +; V: # %bb.0: +; V-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; V-NEXT: vmv.v.i v9, 3 +; V-NEXT: li a0, 4 +; V-NEXT: vmv.v.i v10, -1 +; V-NEXT: vwaddu.vx v8, v9, a0 +; V-NEXT: vwmaccu.vx v8, a0, v10 +; V-NEXT: csrr a0, vlenb +; V-NEXT: srli a0, a0, 2 +; V-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; V-NEXT: vslidedown.vx v9, v8, a0 +; V-NEXT: vslideup.vx v8, v9, a0 +; V-NEXT: ret +; +; ZVBB-LABEL: interleave2_diff_const_splat_nxv4i16: +; ZVBB: # %bb.0: +; ZVBB-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVBB-NEXT: vmv.v.i v8, 4 +; ZVBB-NEXT: li a0, 3 +; ZVBB-NEXT: vwsll.vi v9, v8, 16 +; ZVBB-NEXT: vwaddu.wx v8, v9, a0 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: srli a0, a0, 2 +; ZVBB-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; ZVBB-NEXT: vslidedown.vx v9, v8, a0 +; ZVBB-NEXT: vslideup.vx v8, v9, a0 +; ZVBB-NEXT: ret +; +; ZIP-LABEL: interleave2_diff_const_splat_nxv4i16: +; ZIP: # %bb.0: +; ZIP-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZIP-NEXT: vmv.v.i v9, 4 +; ZIP-NEXT: vmv.v.i v10, 3 +; ZIP-NEXT: csrr a0, vlenb +; ZIP-NEXT: ri.vzip2b.vv v11, v10, v9 +; ZIP-NEXT: ri.vzip2a.vv v8, v10, v9 +; ZIP-NEXT: srli a0, a0, 2 +; ZIP-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; ZIP-NEXT: vslideup.vx v8, v11, a0 +; ZIP-NEXT: ret + %retval = call @llvm.vector.interleave2.v4i16( splat(i16 3), splat(i16 4)) + ret %retval +} + +define @interleave2_same_nonconst_splat_nxv4i16(i16 %a) { +; V-LABEL: interleave2_same_nonconst_splat_nxv4i16: +; V: # %bb.0: +; V-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; V-NEXT: vmv.v.x v9, a0 +; V-NEXT: vmv.v.i v10, -1 +; V-NEXT: vwaddu.vx v8, v9, a0 +; V-NEXT: vwmaccu.vx v8, a0, v10 +; V-NEXT: csrr a0, vlenb +; V-NEXT: srli a0, a0, 2 +; V-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; V-NEXT: vslidedown.vx v9, v8, a0 +; V-NEXT: vslideup.vx v8, v9, a0 +; V-NEXT: ret +; +; ZVBB-LABEL: interleave2_same_nonconst_splat_nxv4i16: +; ZVBB: # %bb.0: +; ZVBB-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; ZVBB-NEXT: vmv.v.x v8, a0 +; ZVBB-NEXT: vwsll.vi v9, v8, 16 +; ZVBB-NEXT: vwaddu.wx v8, v9, a0 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: srli a0, a0, 2 +; ZVBB-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; ZVBB-NEXT: vslidedown.vx v9, v8, a0 +; ZVBB-NEXT: vslideup.vx v8, v9, a0 +; ZVBB-NEXT: ret +; +; ZIP-LABEL: interleave2_same_nonconst_splat_nxv4i16: +; ZIP: # %bb.0: +; ZIP-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; ZIP-NEXT: vmv.v.x v9, a0 +; ZIP-NEXT: csrr a0, vlenb +; ZIP-NEXT: srli a0, a0, 2 +; ZIP-NEXT: ri.vzip2b.vv v10, v9, v9 +; ZIP-NEXT: ri.vzip2a.vv v8, v9, v9 +; ZIP-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; ZIP-NEXT: vslideup.vx v8, v10, a0 +; ZIP-NEXT: ret + %ins = insertelement 
poison, i16 %a, i32 0 + %splat = shufflevector %ins, poison, zeroinitializer + %retval = call @llvm.vector.interleave2.nxv4i16( %splat, %splat) + ret %retval +} + +define @interleave2_diff_nonconst_splat_nxv4i16(i16 %a, i16 %b) { +; SVE-LABEL: interleave2_diff_nonconst_splat_nxv4i16: +; SVE: // %bb.0: +; SVE-NEXT: // kill: def $w1 killed $w1 def $x1 +; SVE-NEXT: // kill: def $w0 killed $w0 def $x0 +; SVE-NEXT: mov z0.d, x0 +; SVE-NEXT: mov z1.d, x1 +; SVE-NEXT: zip2 z2.d, z0.d, z1.d +; SVE-NEXT: zip1 z0.d, z0.d, z1.d +; SVE-NEXT: uzp1 z0.s, z0.s, z2.s +; SVE-NEXT: ret +; +; SME2-LABEL: interleave2_diff_nonconst_splat_nxv4i16: +; SME2: // %bb.0: +; SME2-NEXT: // kill: def $w1 killed $w1 def $x1 +; SME2-NEXT: // kill: def $w0 killed $w0 def $x0 +; SME2-NEXT: mov z0.d, x0 +; SME2-NEXT: mov z1.d, x1 +; SME2-NEXT: zip { z0.d, z1.d }, z0.d, z1.d +; SME2-NEXT: uzp1 z0.s, z0.s, z1.s +; SME2-NEXT: ret +; V-LABEL: interleave2_diff_nonconst_splat_nxv4i16: +; V: # %bb.0: +; V-NEXT: vsetvli a2, zero, e16, mf2, ta, ma +; V-NEXT: vmv.v.x v9, a0 +; V-NEXT: vmv.v.i v10, -1 +; V-NEXT: csrr a0, vlenb +; V-NEXT: vwaddu.vx v8, v9, a1 +; V-NEXT: vwmaccu.vx v8, a1, v10 +; V-NEXT: srli a0, a0, 2 +; V-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; V-NEXT: vslidedown.vx v9, v8, a0 +; V-NEXT: vslideup.vx v8, v9, a0 +; V-NEXT: ret +; +; ZVBB-LABEL: interleave2_diff_nonconst_splat_nxv4i16: +; ZVBB: # %bb.0: +; ZVBB-NEXT: vsetvli a2, zero, e16, mf2, ta, ma +; ZVBB-NEXT: vmv.v.x v8, a1 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: vwsll.vi v9, v8, 16 +; ZVBB-NEXT: vwaddu.wx v8, v9, a0 +; ZVBB-NEXT: srli a1, a1, 2 +; ZVBB-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVBB-NEXT: vslidedown.vx v9, v8, a1 +; ZVBB-NEXT: vslideup.vx v8, v9, a1 +; ZVBB-NEXT: ret +; +; ZIP-LABEL: interleave2_diff_nonconst_splat_nxv4i16: +; ZIP: # %bb.0: +; ZIP-NEXT: vsetvli a2, zero, e16, mf2, ta, ma +; ZIP-NEXT: vmv.v.x v9, a0 +; ZIP-NEXT: vmv.v.x v10, a1 +; ZIP-NEXT: csrr a0, vlenb +; ZIP-NEXT: ri.vzip2b.vv v11, v9, v10 +; ZIP-NEXT: ri.vzip2a.vv v8, v9, v10 +; ZIP-NEXT: srli a0, a0, 2 +; ZIP-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; ZIP-NEXT: vslideup.vx v8, v11, a0 +; ZIP-NEXT: ret + %ins1 = insertelement poison, i16 %a, i32 0 + %splat1 = shufflevector %ins1, poison, zeroinitializer + %ins2 = insertelement poison, i16 %b, i32 0 + %splat2 = shufflevector %ins2, poison, zeroinitializer + %retval = call @llvm.vector.interleave2.nxv4i16( %splat1, %splat2) + ret %retval +} + +define @interleave4_same_const_splat_nxv8i16() { +; CHECK-LABEL: interleave4_same_const_splat_nxv8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v8, 3 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: srli a2, a1, 1 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: add a3, a0, a2 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vsseg4e16.v v8, (a0) +; CHECK-NEXT: add a4, a3, a2 +; CHECK-NEXT: add a2, a4, a2 +; CHECK-NEXT: vle16.v v9, (a4) +; CHECK-NEXT: vle16.v v8, (a2) +; CHECK-NEXT: srli a1, a1, 2 +; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v9, v8, a1 +; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma +; CHECK-NEXT: vle16.v v10, (a3) +; CHECK-NEXT: vle16.v v8, 
(a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v10, a1 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret +; +; ZVBB-LABEL: interleave4_same_const_splat_nxv8i16: +; ZVBB: # %bb.0: +; ZVBB-NEXT: addi sp, sp, -16 +; ZVBB-NEXT: .cfi_def_cfa_offset 16 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 1 +; ZVBB-NEXT: sub sp, sp, a0 +; ZVBB-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb +; ZVBB-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVBB-NEXT: vmv.v.i v8, 3 +; ZVBB-NEXT: addi a0, sp, 16 +; ZVBB-NEXT: csrr a1, vlenb +; ZVBB-NEXT: vmv1r.v v9, v8 +; ZVBB-NEXT: srli a2, a1, 1 +; ZVBB-NEXT: vmv1r.v v10, v8 +; ZVBB-NEXT: add a3, a0, a2 +; ZVBB-NEXT: vmv1r.v v11, v8 +; ZVBB-NEXT: vsseg4e16.v v8, (a0) +; ZVBB-NEXT: add a4, a3, a2 +; ZVBB-NEXT: add a2, a4, a2 +; ZVBB-NEXT: vle16.v v9, (a4) +; ZVBB-NEXT: vle16.v v8, (a2) +; ZVBB-NEXT: srli a1, a1, 2 +; ZVBB-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; ZVBB-NEXT: vslideup.vx v9, v8, a1 +; ZVBB-NEXT: vsetvli a2, zero, e16, mf2, ta, ma +; ZVBB-NEXT: vle16.v v10, (a3) +; ZVBB-NEXT: vle16.v v8, (a0) +; ZVBB-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVBB-NEXT: vslideup.vx v8, v10, a1 +; ZVBB-NEXT: csrr a0, vlenb +; ZVBB-NEXT: slli a0, a0, 1 +; ZVBB-NEXT: add sp, sp, a0 +; ZVBB-NEXT: .cfi_def_cfa sp, 16 +; ZVBB-NEXT: addi sp, sp, 16 +; ZVBB-NEXT: .cfi_def_cfa_offset 0 +; ZVBB-NEXT: ret + %retval = call @llvm.vector.interleave4.nxv8i16( splat(i16 3), splat(i16 3), splat(i16 3), splat(i16 3)) + ret %retval +} From cb095081f3bd89e9dcc12bb9e8a658eecd11f23f Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Thu, 31 Jul 2025 12:02:06 +0000 Subject: [PATCH 2/3] [DAGCombiner] Add combine for vector interleave of splats This patch adds two DAG combines: 1. vector_interleave(splat, splat, ...) -> {splat,splat,...} 2. concat_vectors(splat, splat, ...) -> wide_splat where all the input splats are identical. Both of these together enable us to fold concat_vectors(vector_interleave(splat, splat, ...)) into a wide splat. Post-legalisation we must only do the concat_vector combine if the wider type and splat operation is legal. For fixed-width vectors the DAG combine only occurs for interleave factors of 3 or more, however it's not currently safe to test this for AArch64 since there isn't any lowering support for fixed-width interleaves. I've only added fixed-width tests for RISCV. 
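As a minimal end-to-end sketch (illustrative only; it simply restates what the
interleave2_same_const_splat_nxv4i16 test below exercises):

  define <vscale x 4 x i16> @interleave2_same_const_splat_nxv4i16() {
    ; Both interleave operands are the same splat, so the interleave is a no-op
    ; and the concatenated result folds to a single wide splat.
    %retval = call <vscale x 4 x i16> @llvm.vector.interleave2.nxv4i16(<vscale x 2 x i16> splat(i16 3), <vscale x 2 x i16> splat(i16 3))
    ret <vscale x 4 x i16> %retval
  }

With these combines the SVE output for this function reduces to a single splat,
as the updated CHECK lines below show:

  mov z0.s, #3 // =0x3
  ret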
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  49 +++++
 ...rleaving-reductions-predicated-scalable.ll |  55 +++--
 ...plex-deinterleaving-reductions-scalable.ll |  52 +++--
 .../AArch64/fixed-vector-interleave.ll        |  15 --
 .../CodeGen/AArch64/sve-vector-interleave.ll  | 141 ++-----------
 .../RISCV/rvv/vector-interleave-fixed.ll      | 192 +-----------------
 .../CodeGen/RISCV/rvv/vector-interleave.ll    | 184 ++---------------
 7 files changed, 132 insertions(+), 556 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 251682a5abbb0..dcd8f98c267cd 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -331,6 +331,11 @@ namespace {
       return CombineTo(N, To, 2, AddTo);
     }
 
+    SDValue CombineTo(SDNode *N, SmallVectorImpl<SDValue> *To,
+                      bool AddTo = true) {
+      return CombineTo(N, To->data(), To->size(), AddTo);
+    }
+
     void CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO);
 
   private:
@@ -541,6 +546,7 @@ namespace {
     SDValue visitEXTRACT_VECTOR_ELT(SDNode *N);
     SDValue visitBUILD_VECTOR(SDNode *N);
     SDValue visitCONCAT_VECTORS(SDNode *N);
+    SDValue visitVECTOR_INTERLEAVE(SDNode *N);
    SDValue visitEXTRACT_SUBVECTOR(SDNode *N);
     SDValue visitVECTOR_SHUFFLE(SDNode *N);
     SDValue visitSCALAR_TO_VECTOR(SDNode *N);
@@ -2021,6 +2027,7 @@ SDValue DAGCombiner::visit(SDNode *N) {
   case ISD::EXTRACT_VECTOR_ELT: return visitEXTRACT_VECTOR_ELT(N);
   case ISD::BUILD_VECTOR: return visitBUILD_VECTOR(N);
   case ISD::CONCAT_VECTORS: return visitCONCAT_VECTORS(N);
+  case ISD::VECTOR_INTERLEAVE: return visitVECTOR_INTERLEAVE(N);
   case ISD::EXTRACT_SUBVECTOR: return visitEXTRACT_SUBVECTOR(N);
   case ISD::VECTOR_SHUFFLE: return visitVECTOR_SHUFFLE(N);
   case ISD::SCALAR_TO_VECTOR: return visitSCALAR_TO_VECTOR(N);
@@ -25274,6 +25281,28 @@ static SDValue combineConcatVectorOfShuffleAndItsOperands(
   return DAG.getVectorShuffle(VT, dl, ShufOps[0], ShufOps[1], Mask);
 }
 
+static SDValue combineConcatVectorOfSplats(SDNode *N, SelectionDAG &DAG,
+                                           const TargetLowering &TLI,
+                                           bool LegalTypes,
+                                           bool LegalOperations) {
+  EVT VT = N->getValueType(0);
+
+  // Post-legalization we can only create wider SPLAT_VECTOR operations if both
+  // the type and operation is legal. The Hexagon target has custom
+  // legalization for SPLAT_VECTOR that splits the operation into two parts and
+  // concatenates them. Therefore, custom lowering must also be rejected in
+  // order to avoid an infinite loop.
+  if ((LegalTypes && !TLI.isTypeLegal(VT)) ||
+      (LegalOperations && !TLI.isOperationLegal(ISD::SPLAT_VECTOR, VT)))
+    return SDValue();
+
+  SDValue Op0 = N->getOperand(0);
+  if (!llvm::all_equal(N->op_values()) || Op0.getOpcode() != ISD::SPLAT_VECTOR)
+    return SDValue();
+
+  return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, Op0.getOperand(0));
+}
+
 SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
   // If we only have one input vector, we don't need to do any concatenation.
   if (N->getNumOperands() == 1)
@@ -25397,6 +25426,10 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
     return DAG.getBuildVector(VT, SDLoc(N), Opnds);
   }
 
+  if (SDValue V =
+          combineConcatVectorOfSplats(N, DAG, TLI, LegalTypes, LegalOperations))
+    return V;
+
   // Fold CONCAT_VECTORS of only bitcast scalars (or undef) to BUILD_VECTOR.
   // FIXME: Add support for concat_vectors(bitcast(vec0),bitcast(vec1),...).
if (SDValue V = combineConcatVectorOfScalars(N, DAG)) @@ -25465,6 +25498,22 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitVECTOR_INTERLEAVE(SDNode *N) { + // Check to see if all operands are identical. + if (!llvm::all_equal(N->op_values())) + return SDValue(); + + // Check to see if the identical operand is a splat. + SDValue Splat = DAG.getSplatValue(N->getOperand(0)); + if (!Splat) + return SDValue(); + + // Simply replace all results with the first operand. + SmallVector Ops; + Ops.append(N->op_values().begin(), N->op_values().end()); + return CombineTo(N, &Ops); +} + // Helper that peeks through INSERT_SUBVECTOR/CONCAT_VECTORS to find // if the subvector can be sourced for free. static SDValue getSubVectorSrc(SDValue V, unsigned Index, EVT SubVT) { diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll index 880bd2904154c..d67aa08125f74 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll @@ -14,20 +14,19 @@ target triple = "aarch64" define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) { ; CHECK-LABEL: complex_mul_v2f64: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: mov w8, #100 // =0x64 -; CHECK-NEXT: cntd x9 ; CHECK-NEXT: whilelo p1.d, xzr, x8 +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: rdvl x10, #2 -; CHECK-NEXT: mov x11, x9 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: zip2 z0.d, z1.d, z1.d -; CHECK-NEXT: zip1 z1.d, z1.d, z1.d +; CHECK-NEXT: mov x11, x9 ; CHECK-NEXT: .LBB0_1: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: zip2 p2.d, p1.d, p1.d -; CHECK-NEXT: mov z6.d, z1.d -; CHECK-NEXT: mov z7.d, z0.d +; CHECK-NEXT: mov z6.d, z0.d +; CHECK-NEXT: mov z7.d, z1.d ; CHECK-NEXT: zip1 p1.d, p1.d, p1.d ; CHECK-NEXT: ld1d { z2.d }, p2/z, [x0, #1, mul vl] ; CHECK-NEXT: ld1d { z4.d }, p2/z, [x1, #1, mul vl] @@ -39,14 +38,14 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) { ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0 ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90 ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90 -; CHECK-NEXT: mov z0.d, p2/m, z7.d -; CHECK-NEXT: mov z1.d, p1/m, z6.d +; CHECK-NEXT: mov z1.d, p2/m, z7.d +; CHECK-NEXT: mov z0.d, p1/m, z6.d ; CHECK-NEXT: whilelo p1.d, x11, x8 ; CHECK-NEXT: add x11, x11, x9 ; CHECK-NEXT: b.mi .LBB0_1 ; CHECK-NEXT: // %bb.2: // %exit.block -; CHECK-NEXT: uzp1 z2.d, z1.d, z0.d -; CHECK-NEXT: uzp2 z1.d, z1.d, z0.d +; CHECK-NEXT: uzp1 z2.d, z0.d, z1.d +; CHECK-NEXT: uzp2 z1.d, z0.d, z1.d ; CHECK-NEXT: faddv d0, p0, z2.d ; CHECK-NEXT: faddv d1, p0, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -111,21 +110,20 @@ exit.block: ; preds = %vector.body define %"class.std::complex" @complex_mul_predicated_v2f64(ptr %a, ptr %b, ptr %cond) { ; CHECK-LABEL: complex_mul_predicated_v2f64: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: cntd x9 -; CHECK-NEXT: mov w11, #100 // =0x64 ; CHECK-NEXT: neg x10, x9 +; CHECK-NEXT: mov w11, #100 // =0x64 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: and x10, x10, x11 ; CHECK-NEXT: rdvl x11, #2 -; CHECK-NEXT: zip2 z0.d, z1.d, z1.d -; CHECK-NEXT: zip1 z1.d, z1.d, 
z1.d ; CHECK-NEXT: .LBB1_1: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ld1w { z2.d }, p0/z, [x2, x8, lsl #2] -; CHECK-NEXT: mov z6.d, z1.d -; CHECK-NEXT: mov z7.d, z0.d +; CHECK-NEXT: mov z6.d, z0.d +; CHECK-NEXT: mov z7.d, z1.d ; CHECK-NEXT: add x8, x8, x9 ; CHECK-NEXT: cmpne p1.d, p0/z, z2.d, #0 ; CHECK-NEXT: cmp x10, x8 @@ -141,12 +139,12 @@ define %"class.std::complex" @complex_mul_predicated_v2f64(ptr %a, ptr %b, ptr % ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0 ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90 ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90 -; CHECK-NEXT: mov z0.d, p2/m, z7.d -; CHECK-NEXT: mov z1.d, p1/m, z6.d +; CHECK-NEXT: mov z1.d, p2/m, z7.d +; CHECK-NEXT: mov z0.d, p1/m, z6.d ; CHECK-NEXT: b.ne .LBB1_1 ; CHECK-NEXT: // %bb.2: // %exit.block -; CHECK-NEXT: uzp1 z2.d, z1.d, z0.d -; CHECK-NEXT: uzp2 z1.d, z1.d, z0.d +; CHECK-NEXT: uzp1 z2.d, z0.d, z1.d +; CHECK-NEXT: uzp2 z1.d, z0.d, z1.d ; CHECK-NEXT: faddv d0, p0, z2.d ; CHECK-NEXT: faddv d1, p0, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -213,21 +211,20 @@ exit.block: ; preds = %vector.body define %"class.std::complex" @complex_mul_predicated_x2_v2f64(ptr %a, ptr %b, ptr %cond) { ; CHECK-LABEL: complex_mul_predicated_x2_v2f64: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: mov w8, #100 // =0x64 -; CHECK-NEXT: cntd x9 ; CHECK-NEXT: whilelo p1.d, xzr, x8 +; CHECK-NEXT: cntd x9 ; CHECK-NEXT: rdvl x10, #2 -; CHECK-NEXT: cnth x11 ; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: cnth x11 ; CHECK-NEXT: mov x12, x9 -; CHECK-NEXT: zip2 z0.d, z1.d, z1.d -; CHECK-NEXT: zip1 z1.d, z1.d, z1.d ; CHECK-NEXT: .LBB2_1: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ld1w { z2.d }, p1/z, [x2] -; CHECK-NEXT: mov z6.d, z1.d -; CHECK-NEXT: mov z7.d, z0.d +; CHECK-NEXT: mov z6.d, z0.d +; CHECK-NEXT: mov z7.d, z1.d ; CHECK-NEXT: add x2, x2, x11 ; CHECK-NEXT: and z2.d, z2.d, #0xffffffff ; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0 @@ -243,14 +240,14 @@ define %"class.std::complex" @complex_mul_predicated_x2_v2f64(ptr %a, ptr %b, pt ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0 ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90 ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90 -; CHECK-NEXT: mov z0.d, p2/m, z7.d -; CHECK-NEXT: mov z1.d, p1/m, z6.d +; CHECK-NEXT: mov z1.d, p2/m, z7.d +; CHECK-NEXT: mov z0.d, p1/m, z6.d ; CHECK-NEXT: whilelo p1.d, x12, x8 ; CHECK-NEXT: add x12, x12, x9 ; CHECK-NEXT: b.mi .LBB2_1 ; CHECK-NEXT: // %bb.2: // %exit.block -; CHECK-NEXT: uzp1 z2.d, z1.d, z0.d -; CHECK-NEXT: uzp2 z1.d, z1.d, z0.d +; CHECK-NEXT: uzp1 z2.d, z0.d, z1.d +; CHECK-NEXT: uzp2 z1.d, z0.d, z1.d ; CHECK-NEXT: faddv d0, p0, z2.d ; CHECK-NEXT: faddv d1, p0, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll index 29be231920305..0646ca4948e1d 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll @@ -14,15 +14,14 @@ target triple = "aarch64" define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) { ; CHECK-LABEL: complex_mul_v2f64: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: cntd x8 -; CHECK-NEXT: mov w10, #100 // =0x64 ; 
CHECK-NEXT: neg x9, x8 +; CHECK-NEXT: mov w10, #100 // =0x64 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and x9, x9, x10 ; CHECK-NEXT: rdvl x10, #2 -; CHECK-NEXT: zip2 z0.d, z1.d, z1.d -; CHECK-NEXT: zip1 z1.d, z1.d, z1.d ; CHECK-NEXT: .LBB0_1: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr z2, [x0, #1, mul vl] @@ -32,14 +31,14 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) { ; CHECK-NEXT: ldr z5, [x1] ; CHECK-NEXT: add x1, x1, x10 ; CHECK-NEXT: add x0, x0, x10 -; CHECK-NEXT: fcmla z1.d, p0/m, z5.d, z3.d, #0 -; CHECK-NEXT: fcmla z0.d, p0/m, z4.d, z2.d, #0 -; CHECK-NEXT: fcmla z1.d, p0/m, z5.d, z3.d, #90 -; CHECK-NEXT: fcmla z0.d, p0/m, z4.d, z2.d, #90 +; CHECK-NEXT: fcmla z0.d, p0/m, z5.d, z3.d, #0 +; CHECK-NEXT: fcmla z1.d, p0/m, z4.d, z2.d, #0 +; CHECK-NEXT: fcmla z0.d, p0/m, z5.d, z3.d, #90 +; CHECK-NEXT: fcmla z1.d, p0/m, z4.d, z2.d, #90 ; CHECK-NEXT: b.ne .LBB0_1 ; CHECK-NEXT: // %bb.2: // %exit.block -; CHECK-NEXT: uzp1 z2.d, z1.d, z0.d -; CHECK-NEXT: uzp2 z1.d, z1.d, z0.d +; CHECK-NEXT: uzp1 z2.d, z0.d, z1.d +; CHECK-NEXT: uzp2 z1.d, z0.d, z1.d ; CHECK-NEXT: faddv d0, p0, z2.d ; CHECK-NEXT: faddv d1, p0, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -183,17 +182,16 @@ exit.block: ; preds = %vector.body define %"class.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) { ; CHECK-LABEL: complex_mul_v2f64_unrolled: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: cntw x8 -; CHECK-NEXT: mov w10, #1000 // =0x3e8 +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: movi v3.2d, #0000000000000000 ; CHECK-NEXT: neg x9, x8 +; CHECK-NEXT: mov w10, #1000 // =0x3e8 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and x9, x9, x10 ; CHECK-NEXT: rdvl x10, #4 -; CHECK-NEXT: zip2 z0.d, z1.d, z1.d -; CHECK-NEXT: zip1 z1.d, z1.d, z1.d -; CHECK-NEXT: mov z2.d, z1.d -; CHECK-NEXT: mov z3.d, z0.d ; CHECK-NEXT: .LBB2_1: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr z4, [x0, #1, mul vl] @@ -207,20 +205,20 @@ define %"class.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) { ; CHECK-NEXT: ldr z18, [x1, #3, mul vl] ; CHECK-NEXT: ldr z19, [x1, #2, mul vl] ; CHECK-NEXT: add x1, x1, x10 -; CHECK-NEXT: fcmla z1.d, p0/m, z16.d, z5.d, #0 -; CHECK-NEXT: fcmla z0.d, p0/m, z7.d, z4.d, #0 +; CHECK-NEXT: fcmla z0.d, p0/m, z16.d, z5.d, #0 +; CHECK-NEXT: fcmla z1.d, p0/m, z7.d, z4.d, #0 ; CHECK-NEXT: fcmla z3.d, p0/m, z18.d, z6.d, #0 ; CHECK-NEXT: fcmla z2.d, p0/m, z19.d, z17.d, #0 -; CHECK-NEXT: fcmla z1.d, p0/m, z16.d, z5.d, #90 -; CHECK-NEXT: fcmla z0.d, p0/m, z7.d, z4.d, #90 +; CHECK-NEXT: fcmla z0.d, p0/m, z16.d, z5.d, #90 +; CHECK-NEXT: fcmla z1.d, p0/m, z7.d, z4.d, #90 ; CHECK-NEXT: fcmla z3.d, p0/m, z18.d, z6.d, #90 ; CHECK-NEXT: fcmla z2.d, p0/m, z19.d, z17.d, #90 ; CHECK-NEXT: b.ne .LBB2_1 ; CHECK-NEXT: // %bb.2: // %exit.block ; CHECK-NEXT: uzp1 z4.d, z2.d, z3.d -; CHECK-NEXT: uzp1 z5.d, z1.d, z0.d +; CHECK-NEXT: uzp1 z5.d, z0.d, z1.d ; CHECK-NEXT: uzp2 z2.d, z2.d, z3.d -; CHECK-NEXT: uzp2 z0.d, z1.d, z0.d +; CHECK-NEXT: uzp2 z0.d, z0.d, z1.d ; CHECK-NEXT: fadd z1.d, z4.d, z5.d ; CHECK-NEXT: fadd z2.d, z2.d, z0.d ; CHECK-NEXT: faddv d0, p0, z1.d @@ -310,15 +308,15 @@ define dso_local %"class.std::complex" @reduction_mix(ptr %a, ptr %b, ptr noalia ; CHECK-LABEL: reduction_mix: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: cntd x9 
-; CHECK-NEXT: mov w11, #100 // =0x64 +; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: neg x10, x9 +; CHECK-NEXT: mov w11, #100 // =0x64 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: and x10, x10, x11 ; CHECK-NEXT: rdvl x11, #2 -; CHECK-NEXT: zip2 z0.d, z2.d, z2.d -; CHECK-NEXT: zip1 z1.d, z2.d, z2.d ; CHECK-NEXT: .LBB3_1: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr z3, [x0] @@ -327,13 +325,13 @@ define dso_local %"class.std::complex" @reduction_mix(ptr %a, ptr %b, ptr noalia ; CHECK-NEXT: ld1w { z5.d }, p0/z, [x3, x8, lsl #2] ; CHECK-NEXT: add x8, x8, x9 ; CHECK-NEXT: cmp x10, x8 -; CHECK-NEXT: fadd z0.d, z4.d, z0.d -; CHECK-NEXT: fadd z1.d, z3.d, z1.d +; CHECK-NEXT: fadd z1.d, z4.d, z1.d +; CHECK-NEXT: fadd z0.d, z3.d, z0.d ; CHECK-NEXT: add z2.d, z5.d, z2.d ; CHECK-NEXT: b.ne .LBB3_1 ; CHECK-NEXT: // %bb.2: // %middle.block -; CHECK-NEXT: uzp2 z3.d, z1.d, z0.d -; CHECK-NEXT: uzp1 z1.d, z1.d, z0.d +; CHECK-NEXT: uzp2 z3.d, z0.d, z1.d +; CHECK-NEXT: uzp1 z1.d, z0.d, z1.d ; CHECK-NEXT: uaddv d2, p0, z2.d ; CHECK-NEXT: faddv d0, p0, z3.d ; CHECK-NEXT: faddv d1, p0, z1.d diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll index 8ccc9f97e5133..05ecc9e7b49d4 100644 --- a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll +++ b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll @@ -211,18 +211,3 @@ define <4 x i16> @interleave2_diff_nonconst_splat_v4i16(i16 %a, i16 %b) { ret <4 x i16> %retval } -; Float declarations -declare <4 x half> @llvm.vector.interleave2.v4f16(<2 x half>, <2 x half>) -declare <8 x half> @llvm.vector.interleave2.v8f16(<4 x half>, <4 x half>) -declare <16 x half> @llvm.vector.interleave2.v16f16(<8 x half>, <8 x half>) -declare <4 x float> @llvm.vector.interleave2.v4f32(<2 x float>, <2 x float>) -declare <8 x float> @llvm.vector.interleave2.v8f32(<4 x float>, <4 x float>) -declare <4 x double> @llvm.vector.interleave2.v4f64(<2 x double>, <2 x double>) - -; Integer declarations -declare <32 x i8> @llvm.vector.interleave2.v32i8(<16 x i8>, <16 x i8>) -declare <16 x i16> @llvm.vector.interleave2.v16i16(<8 x i16>, <8 x i16>) -declare <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32>, <4 x i32>) -declare <4 x i64> @llvm.vector.interleave2.v4i64(<2 x i64>, <2 x i64>) -declare <4 x i16> @llvm.vector.interleave2.v4i16(<2 x i16>, <2 x i16>) -declare <8 x i16> @llvm.vector.interleave4.v8i16(<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>) diff --git a/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll index b954863560899..c7fb2db53d2a3 100644 --- a/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll +++ b/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll @@ -541,20 +541,10 @@ define @interleave2_nxv2i32( %vec0, @interleave2_same_const_splat_nxv4i16() { -; SVE-LABEL: interleave2_same_const_splat_nxv4i16: -; SVE: // %bb.0: -; SVE-NEXT: mov z0.d, #3 // =0x3 -; SVE-NEXT: zip2 z1.d, z0.d, z0.d -; SVE-NEXT: zip1 z0.d, z0.d, z0.d -; SVE-NEXT: uzp1 z0.s, z0.s, z1.s -; SVE-NEXT: ret -; -; SME2-LABEL: interleave2_same_const_splat_nxv4i16: -; SME2: // %bb.0: -; SME2-NEXT: mov z0.d, #3 // =0x3 -; SME2-NEXT: zip { z0.d, z1.d }, z0.d, z0.d -; SME2-NEXT: uzp1 z0.s, z0.s, z1.s -; SME2-NEXT: ret +; CHECK-LABEL: interleave2_same_const_splat_nxv4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.s, #3 // =0x3 +; CHECK-NEXT: ret %retval = call @llvm.vector.interleave2.nxv4i16( splat(i16 3), splat(i16 3)) ret %retval } @@ 
-581,22 +571,10 @@ define @interleave2_diff_const_splat_nxv4i16() { } define @interleave2_same_nonconst_splat_nxv4i16(i16 %a) { -; SVE-LABEL: interleave2_same_nonconst_splat_nxv4i16: -; SVE: // %bb.0: -; SVE-NEXT: // kill: def $w0 killed $w0 def $x0 -; SVE-NEXT: mov z0.d, x0 -; SVE-NEXT: zip2 z1.d, z0.d, z0.d -; SVE-NEXT: zip1 z0.d, z0.d, z0.d -; SVE-NEXT: uzp1 z0.s, z0.s, z1.s -; SVE-NEXT: ret -; -; SME2-LABEL: interleave2_same_nonconst_splat_nxv4i16: -; SME2: // %bb.0: -; SME2-NEXT: // kill: def $w0 killed $w0 def $x0 -; SME2-NEXT: mov z0.d, x0 -; SME2-NEXT: zip { z0.d, z1.d }, z0.d, z0.d -; SME2-NEXT: uzp1 z0.s, z0.s, z1.s -; SME2-NEXT: ret +; CHECK-LABEL: interleave2_same_nonconst_splat_nxv4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.s, w0 +; CHECK-NEXT: ret %ins = insertelement poison, i16 %a, i32 0 %splat = shufflevector %ins, poison, zeroinitializer %retval = call @llvm.vector.interleave2.nxv4i16( %splat, %splat) @@ -633,105 +611,10 @@ define @interleave2_diff_nonconst_splat_nxv4i16(i16 %a, i16 % } define @interleave4_same_const_splat_nxv8i16() { -; SVE-LABEL: interleave4_same_const_splat_nxv8i16: -; SVE: // %bb.0: -; SVE-NEXT: mov z0.d, #3 // =0x3 -; SVE-NEXT: zip1 z1.d, z0.d, z0.d -; SVE-NEXT: zip1 z2.d, z1.d, z1.d -; SVE-NEXT: zip2 z1.d, z1.d, z1.d -; SVE-NEXT: uzp1 z2.s, z2.s, z0.s -; SVE-NEXT: uzp1 z2.h, z2.h, z0.h -; SVE-NEXT: uunpklo z2.s, z2.h -; SVE-NEXT: uunpklo z2.d, z2.s -; SVE-NEXT: uzp1 z1.s, z2.s, z1.s -; SVE-NEXT: uzp1 z2.h, z1.h, z0.h -; SVE-NEXT: zip2 z0.d, z0.d, z0.d -; SVE-NEXT: uunpkhi z2.s, z2.h -; SVE-NEXT: zip1 z3.d, z0.d, z0.d -; SVE-NEXT: zip2 z0.d, z0.d, z0.d -; SVE-NEXT: uunpkhi z2.d, z2.s -; SVE-NEXT: uzp1 z2.s, z3.s, z2.s -; SVE-NEXT: uzp1 z2.h, z1.h, z2.h -; SVE-NEXT: uunpkhi z2.s, z2.h -; SVE-NEXT: uunpklo z2.d, z2.s -; SVE-NEXT: uzp1 z0.s, z2.s, z0.s -; SVE-NEXT: uzp1 z0.h, z1.h, z0.h -; SVE-NEXT: ret -; -; SME-ALL-LABEL: interleave4_same_const_splat_nxv8i16: -; SME-ALL: // %bb.0: -; SME-ALL-NEXT: mov z0.d, #3 // =0x3 -; SME-ALL-NEXT: zip { z0.d, z1.d }, z0.d, z0.d -; SME-ALL-NEXT: zip { z2.d, z3.d }, z0.d, z0.d -; SME-ALL-NEXT: uzp1 z4.s, z2.s, z0.s -; SME-ALL-NEXT: uzp1 z4.h, z4.h, z0.h -; SME-ALL-NEXT: uunpklo z4.s, z4.h -; SME-ALL-NEXT: uunpklo z4.d, z4.s -; SME-ALL-NEXT: uzp1 z2.s, z4.s, z3.s -; SME-ALL-NEXT: uzp1 z3.h, z2.h, z0.h -; SME-ALL-NEXT: zip { z0.d, z1.d }, z1.d, z1.d -; SME-ALL-NEXT: uunpkhi z3.s, z3.h -; SME-ALL-NEXT: uunpkhi z3.d, z3.s -; SME-ALL-NEXT: uzp1 z3.s, z0.s, z3.s -; SME-ALL-NEXT: uzp1 z3.h, z2.h, z3.h -; SME-ALL-NEXT: uunpkhi z3.s, z3.h -; SME-ALL-NEXT: uunpklo z3.d, z3.s -; SME-ALL-NEXT: uzp1 z0.s, z3.s, z1.s -; SME-ALL-NEXT: uzp1 z0.h, z2.h, z0.h -; SME-ALL-NEXT: ret -; -; SME2-256-LABEL: interleave4_same_const_splat_nxv8i16: -; SME2-256: // %bb.0: -; SME2-256-NEXT: mov z0.d, #3 // =0x3 -; SME2-256-NEXT: mov z1.d, z0.d -; SME2-256-NEXT: mov z2.d, z0.d -; SME2-256-NEXT: mov z3.d, z0.d -; SME2-256-NEXT: zip { z0.d - z3.d }, { z0.d - z3.d } -; SME2-256-NEXT: uzp1 z4.s, z0.s, z0.s -; SME2-256-NEXT: uzp1 z4.h, z4.h, z0.h -; SME2-256-NEXT: uunpklo z4.s, z4.h -; SME2-256-NEXT: uunpklo z4.d, z4.s -; SME2-256-NEXT: uzp1 z4.s, z4.s, z1.s -; SME2-256-NEXT: uzp1 z5.h, z4.h, z0.h -; SME2-256-NEXT: uunpkhi z5.s, z5.h -; SME2-256-NEXT: uunpkhi z5.d, z5.s -; SME2-256-NEXT: uzp1 z5.s, z2.s, z5.s -; SME2-256-NEXT: uzp1 z5.h, z4.h, z5.h -; SME2-256-NEXT: uunpkhi z5.s, z5.h -; SME2-256-NEXT: uunpklo z5.d, z5.s -; SME2-256-NEXT: uzp1 z0.s, z5.s, z3.s -; SME2-256-NEXT: uzp1 z0.h, z4.h, z0.h -; SME2-256-NEXT: ret +; CHECK-LABEL: 
interleave4_same_const_splat_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.h, #3 // =0x3 +; CHECK-NEXT: ret %retval = call @llvm.vector.interleave4.nxv8i16( splat(i16 3), splat(i16 3), splat(i16 3), splat(i16 3)) ret %retval } - -; Float declarations -declare @llvm.vector.interleave2.nxv4f16(, ) -declare @llvm.vector.interleave2.nxv8f16(, ) -declare @llvm.vector.interleave2.nxv16f16(, ) -declare @llvm.vector.interleave2.nxv4f32(, ) -declare @llvm.vector.interleave2.nxv8f32(, ) -declare @llvm.vector.interleave2.nxv4f64(, ) - -; Integer declarations -declare @llvm.vector.interleave2.nxv32i8(, ) -declare @llvm.vector.interleave2.nxv16i16(, ) -declare @llvm.vector.interleave2.nxv8i32(, ) -declare @llvm.vector.interleave2.nxv4i64(, ) - -; Predicated -declare @llvm.vector.interleave2.nxv32i1(, ) -declare @llvm.vector.interleave2.nxv16i1(, ) -declare @llvm.vector.interleave2.nxv8i1(, ) -declare @llvm.vector.interleave2.nxv4i1(, ) - -; Illegal type size -declare @llvm.vector.interleave2.nxv16i32(, ) -declare @llvm.vector.interleave2.nxv8i64(, ) - -declare @llvm.vector.interleave2.nxv16i8(, ) -declare @llvm.vector.interleave2.nxv8i16(, ) -declare @llvm.vector.interleave2.nxv4i32(, ) -declare @llvm.vector.interleave2.nxv4i16(, ) -declare @llvm.vector.interleave4.nxv8i16(, , , ) diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll index f3ba7fe33fa48..38d38f78c6054 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll @@ -1640,110 +1640,20 @@ define <8 x half> @vector_interleave8_v8f16_v1f16(<1 x half> %a, <1 x half> %b, define <8 x i16> @interleave4_const_splat_v8i16(<2 x i16> %a) { ; CHECK-LABEL: interleave4_const_splat_v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vmv.v.i v8, 3 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: vmv1r.v v9, v8 -; CHECK-NEXT: srli a1, a1, 2 -; CHECK-NEXT: vmv1r.v v10, v8 -; CHECK-NEXT: add a2, a0, a1 -; CHECK-NEXT: vmv1r.v v11, v8 -; CHECK-NEXT: vsetvli a3, zero, e16, mf4, ta, ma -; CHECK-NEXT: vsseg4e16.v v8, (a0) -; CHECK-NEXT: add a3, a2, a1 -; CHECK-NEXT: add a1, a3, a1 -; CHECK-NEXT: vle16.v v9, (a3) -; CHECK-NEXT: vle16.v v10, (a2) -; CHECK-NEXT: vle16.v v11, (a1) -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vslideup.vi v9, v11, 2 -; CHECK-NEXT: vslideup.vi v8, v10, 2 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vslideup.vi v8, v9, 4 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: vmv.v.i v8, 3 ; CHECK-NEXT: ret ; ; ZVBB-LABEL: interleave4_const_splat_v8i16: ; ZVBB: # %bb.0: -; ZVBB-NEXT: addi sp, sp, -16 -; ZVBB-NEXT: .cfi_def_cfa_offset 16 -; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: sub sp, sp, a0 -; ZVBB-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb -; ZVBB-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVBB-NEXT: vmv.v.i v8, 3 -; ZVBB-NEXT: addi a0, sp, 16 -; ZVBB-NEXT: csrr a1, vlenb -; ZVBB-NEXT: 
vmv1r.v v9, v8 -; ZVBB-NEXT: srli a1, a1, 2 -; ZVBB-NEXT: vmv1r.v v10, v8 -; ZVBB-NEXT: add a2, a0, a1 -; ZVBB-NEXT: vmv1r.v v11, v8 -; ZVBB-NEXT: vsetvli a3, zero, e16, mf4, ta, ma -; ZVBB-NEXT: vsseg4e16.v v8, (a0) -; ZVBB-NEXT: add a3, a2, a1 -; ZVBB-NEXT: add a1, a3, a1 -; ZVBB-NEXT: vle16.v v9, (a3) -; ZVBB-NEXT: vle16.v v10, (a2) -; ZVBB-NEXT: vle16.v v11, (a1) -; ZVBB-NEXT: vle16.v v8, (a0) -; ZVBB-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVBB-NEXT: vslideup.vi v9, v11, 2 -; ZVBB-NEXT: vslideup.vi v8, v10, 2 ; ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVBB-NEXT: vslideup.vi v8, v9, 4 -; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: add sp, sp, a0 -; ZVBB-NEXT: .cfi_def_cfa sp, 16 -; ZVBB-NEXT: addi sp, sp, 16 -; ZVBB-NEXT: .cfi_def_cfa_offset 0 +; ZVBB-NEXT: vmv.v.i v8, 3 ; ZVBB-NEXT: ret ; ; ZIP-LABEL: interleave4_const_splat_v8i16: ; ZIP: # %bb.0: -; ZIP-NEXT: addi sp, sp, -16 -; ZIP-NEXT: .cfi_def_cfa_offset 16 -; ZIP-NEXT: csrr a0, vlenb -; ZIP-NEXT: sub sp, sp, a0 -; ZIP-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb -; ZIP-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZIP-NEXT: vmv.v.i v8, 3 -; ZIP-NEXT: addi a0, sp, 16 -; ZIP-NEXT: csrr a1, vlenb -; ZIP-NEXT: vmv1r.v v9, v8 -; ZIP-NEXT: srli a1, a1, 2 -; ZIP-NEXT: vmv1r.v v10, v8 -; ZIP-NEXT: add a2, a0, a1 -; ZIP-NEXT: vmv1r.v v11, v8 -; ZIP-NEXT: vsetvli a3, zero, e16, mf4, ta, ma -; ZIP-NEXT: vsseg4e16.v v8, (a0) -; ZIP-NEXT: add a3, a2, a1 -; ZIP-NEXT: add a1, a3, a1 -; ZIP-NEXT: vle16.v v9, (a3) -; ZIP-NEXT: vle16.v v10, (a2) -; ZIP-NEXT: vle16.v v11, (a1) -; ZIP-NEXT: vle16.v v8, (a0) -; ZIP-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZIP-NEXT: vslideup.vi v9, v11, 2 -; ZIP-NEXT: vslideup.vi v8, v10, 2 ; ZIP-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZIP-NEXT: vslideup.vi v8, v9, 4 -; ZIP-NEXT: csrr a0, vlenb -; ZIP-NEXT: add sp, sp, a0 -; ZIP-NEXT: .cfi_def_cfa sp, 16 -; ZIP-NEXT: addi sp, sp, 16 -; ZIP-NEXT: .cfi_def_cfa_offset 0 +; ZIP-NEXT: vmv.v.i v8, 3 ; ZIP-NEXT: ret %retval = call <8 x i16> @llvm.vector.interleave4.v8i16(<2 x i16> splat(i16 3), <2 x i16> splat(i16 3), <2 x i16> splat(i16 3), <2 x i16> splat(i16 3)) ret <8 x i16> %retval @@ -1752,110 +1662,20 @@ define <8 x i16> @interleave4_const_splat_v8i16(<2 x i16> %a) { define <8 x i16> @interleave4_same_nonconst_splat_v8i16(i16 %a) { ; CHECK-LABEL: interleave4_same_nonconst_splat_v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vmv.v.x v8, a0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: vmv1r.v v9, v8 -; CHECK-NEXT: srli a1, a1, 2 -; CHECK-NEXT: vmv1r.v v10, v8 -; CHECK-NEXT: add a2, a0, a1 -; CHECK-NEXT: vmv1r.v v11, v8 -; CHECK-NEXT: vsetvli a3, zero, e16, mf4, ta, ma -; CHECK-NEXT: vsseg4e16.v v8, (a0) -; CHECK-NEXT: add a3, a2, a1 -; CHECK-NEXT: add a1, a3, a1 -; CHECK-NEXT: vle16.v v9, (a3) -; CHECK-NEXT: vle16.v v10, (a2) -; CHECK-NEXT: vle16.v v11, (a1) -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vslideup.vi v9, v11, 2 -; CHECK-NEXT: vslideup.vi v8, v10, 2 ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vslideup.vi v8, v9, 4 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: 
add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: ret ; ; ZVBB-LABEL: interleave4_same_nonconst_splat_v8i16: ; ZVBB: # %bb.0: -; ZVBB-NEXT: addi sp, sp, -16 -; ZVBB-NEXT: .cfi_def_cfa_offset 16 -; ZVBB-NEXT: csrr a1, vlenb -; ZVBB-NEXT: sub sp, sp, a1 -; ZVBB-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb -; ZVBB-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZVBB-NEXT: vmv.v.x v8, a0 -; ZVBB-NEXT: addi a0, sp, 16 -; ZVBB-NEXT: csrr a1, vlenb -; ZVBB-NEXT: vmv1r.v v9, v8 -; ZVBB-NEXT: srli a1, a1, 2 -; ZVBB-NEXT: vmv1r.v v10, v8 -; ZVBB-NEXT: add a2, a0, a1 -; ZVBB-NEXT: vmv1r.v v11, v8 -; ZVBB-NEXT: vsetvli a3, zero, e16, mf4, ta, ma -; ZVBB-NEXT: vsseg4e16.v v8, (a0) -; ZVBB-NEXT: add a3, a2, a1 -; ZVBB-NEXT: add a1, a3, a1 -; ZVBB-NEXT: vle16.v v9, (a3) -; ZVBB-NEXT: vle16.v v10, (a2) -; ZVBB-NEXT: vle16.v v11, (a1) -; ZVBB-NEXT: vle16.v v8, (a0) -; ZVBB-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVBB-NEXT: vslideup.vi v9, v11, 2 -; ZVBB-NEXT: vslideup.vi v8, v10, 2 ; ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZVBB-NEXT: vslideup.vi v8, v9, 4 -; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: add sp, sp, a0 -; ZVBB-NEXT: .cfi_def_cfa sp, 16 -; ZVBB-NEXT: addi sp, sp, 16 -; ZVBB-NEXT: .cfi_def_cfa_offset 0 +; ZVBB-NEXT: vmv.v.x v8, a0 ; ZVBB-NEXT: ret ; ; ZIP-LABEL: interleave4_same_nonconst_splat_v8i16: ; ZIP: # %bb.0: -; ZIP-NEXT: addi sp, sp, -16 -; ZIP-NEXT: .cfi_def_cfa_offset 16 -; ZIP-NEXT: csrr a1, vlenb -; ZIP-NEXT: sub sp, sp, a1 -; ZIP-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb -; ZIP-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; ZIP-NEXT: vmv.v.x v8, a0 -; ZIP-NEXT: addi a0, sp, 16 -; ZIP-NEXT: csrr a1, vlenb -; ZIP-NEXT: vmv1r.v v9, v8 -; ZIP-NEXT: srli a1, a1, 2 -; ZIP-NEXT: vmv1r.v v10, v8 -; ZIP-NEXT: add a2, a0, a1 -; ZIP-NEXT: vmv1r.v v11, v8 -; ZIP-NEXT: vsetvli a3, zero, e16, mf4, ta, ma -; ZIP-NEXT: vsseg4e16.v v8, (a0) -; ZIP-NEXT: add a3, a2, a1 -; ZIP-NEXT: add a1, a3, a1 -; ZIP-NEXT: vle16.v v9, (a3) -; ZIP-NEXT: vle16.v v10, (a2) -; ZIP-NEXT: vle16.v v11, (a1) -; ZIP-NEXT: vle16.v v8, (a0) -; ZIP-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZIP-NEXT: vslideup.vi v9, v11, 2 -; ZIP-NEXT: vslideup.vi v8, v10, 2 ; ZIP-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; ZIP-NEXT: vslideup.vi v8, v9, 4 -; ZIP-NEXT: csrr a0, vlenb -; ZIP-NEXT: add sp, sp, a0 -; ZIP-NEXT: .cfi_def_cfa sp, 16 -; ZIP-NEXT: addi sp, sp, 16 -; ZIP-NEXT: .cfi_def_cfa_offset 0 +; ZIP-NEXT: vmv.v.x v8, a0 ; ZIP-NEXT: ret %ins = insertelement <2 x i16> poison, i16 %a, i32 0 %splat = shufflevector <2 x i16> %ins, <2 x i16> poison, <2 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll index 7a977ff9b4e3a..ee38257f09cd5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll @@ -14949,67 +14949,22 @@ define @vector_interleave_nxv16f64_nxv2f64( @interleave2_same_const_splat_nxv4i16() { -; V-LABEL: interleave2_same_const_splat_nxv4i16: -; V: # %bb.0: -; V-NEXT: vsetvli a0, zero, e16, mf2, ta, ma -; V-NEXT: vmv.v.i v9, 3 -; V-NEXT: li a0, 3 -; V-NEXT: vmv.v.i v10, -1 -; V-NEXT: vwaddu.vx v8, v9, a0 -; V-NEXT: vwmaccu.vx v8, a0, v10 -; V-NEXT: csrr a0, vlenb -; V-NEXT: srli a0, a0, 2 -; V-NEXT: 
vsetvli a1, zero, e16, m1, ta, ma -; V-NEXT: vslidedown.vx v9, v8, a0 -; V-NEXT: vslideup.vx v8, v9, a0 -; V-NEXT: ret +; CHECK-LABEL: interleave2_same_const_splat_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.i v8, 3 +; CHECK-NEXT: ret ; ; ZVBB-LABEL: interleave2_same_const_splat_nxv4i16: ; ZVBB: # %bb.0: -; ZVBB-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVBB-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; ZVBB-NEXT: vmv.v.i v8, 3 -; ZVBB-NEXT: li a0, 3 -; ZVBB-NEXT: vwsll.vi v9, v8, 16 -; ZVBB-NEXT: vwaddu.wx v8, v9, a0 -; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: srli a0, a0, 2 -; ZVBB-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; ZVBB-NEXT: vslidedown.vx v9, v8, a0 -; ZVBB-NEXT: vslideup.vx v8, v9, a0 ; ZVBB-NEXT: ret -; -; ZIP-LABEL: interleave2_same_const_splat_nxv4i16: -; ZIP: # %bb.0: -; ZIP-NEXT: csrr a0, vlenb -; ZIP-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; ZIP-NEXT: vmv.v.i v9, 3 -; ZIP-NEXT: srli a0, a0, 2 -; ZIP-NEXT: ri.vzip2b.vv v10, v9, v9 -; ZIP-NEXT: ri.vzip2a.vv v8, v9, v9 -; ZIP-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; ZIP-NEXT: vslideup.vx v8, v10, a0 -; ZIP-NEXT: ret %retval = call @llvm.vector.interleave2.nxv4i16( splat(i16 3), splat(i16 3)) ret %retval } define @interleave2_diff_const_splat_nxv4i16() { -; SVE-LABEL: interleave2_diff_const_splat_nxv4i16: -; SVE: // %bb.0: -; SVE-NEXT: mov z0.d, #4 // =0x4 -; SVE-NEXT: mov z1.d, #3 // =0x3 -; SVE-NEXT: zip2 z2.d, z1.d, z0.d -; SVE-NEXT: zip1 z0.d, z1.d, z0.d -; SVE-NEXT: uzp1 z0.s, z0.s, z2.s -; SVE-NEXT: ret -; -; SME2-LABEL: interleave2_diff_const_splat_nxv4i16: -; SME2: // %bb.0: -; SME2-NEXT: mov z0.d, #4 // =0x4 -; SME2-NEXT: mov z1.d, #3 // =0x3 -; SME2-NEXT: zip { z0.d, z1.d }, z1.d, z0.d -; SME2-NEXT: uzp1 z0.s, z0.s, z1.s -; SME2-NEXT: ret ; V-LABEL: interleave2_diff_const_splat_nxv4i16: ; V: # %bb.0: ; V-NEXT: vsetvli a0, zero, e16, mf2, ta, ma @@ -15056,44 +15011,17 @@ define @interleave2_diff_const_splat_nxv4i16() { } define @interleave2_same_nonconst_splat_nxv4i16(i16 %a) { -; V-LABEL: interleave2_same_nonconst_splat_nxv4i16: -; V: # %bb.0: -; V-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; V-NEXT: vmv.v.x v9, a0 -; V-NEXT: vmv.v.i v10, -1 -; V-NEXT: vwaddu.vx v8, v9, a0 -; V-NEXT: vwmaccu.vx v8, a0, v10 -; V-NEXT: csrr a0, vlenb -; V-NEXT: srli a0, a0, 2 -; V-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; V-NEXT: vslidedown.vx v9, v8, a0 -; V-NEXT: vslideup.vx v8, v9, a0 -; V-NEXT: ret +; CHECK-LABEL: interleave2_same_nonconst_splat_nxv4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: ret ; ; ZVBB-LABEL: interleave2_same_nonconst_splat_nxv4i16: ; ZVBB: # %bb.0: -; ZVBB-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; ZVBB-NEXT: vmv.v.x v8, a0 -; ZVBB-NEXT: vwsll.vi v9, v8, 16 -; ZVBB-NEXT: vwaddu.wx v8, v9, a0 -; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: srli a0, a0, 2 ; ZVBB-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; ZVBB-NEXT: vslidedown.vx v9, v8, a0 -; ZVBB-NEXT: vslideup.vx v8, v9, a0 +; ZVBB-NEXT: vmv.v.x v8, a0 ; ZVBB-NEXT: ret -; -; ZIP-LABEL: interleave2_same_nonconst_splat_nxv4i16: -; ZIP: # %bb.0: -; ZIP-NEXT: vsetvli a1, zero, e16, mf2, ta, ma -; ZIP-NEXT: vmv.v.x v9, a0 -; ZIP-NEXT: csrr a0, vlenb -; ZIP-NEXT: srli a0, a0, 2 -; ZIP-NEXT: ri.vzip2b.vv v10, v9, v9 -; ZIP-NEXT: ri.vzip2a.vv v8, v9, v9 -; ZIP-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; ZIP-NEXT: vslideup.vx v8, v10, a0 -; ZIP-NEXT: ret %ins = insertelement poison, i16 %a, i32 0 %splat = shufflevector %ins, poison, 
zeroinitializer %retval = call @llvm.vector.interleave2.nxv4i16( %splat, %splat) @@ -15101,26 +15029,6 @@ define @interleave2_same_nonconst_splat_nxv4i16(i16 %a) { } define @interleave2_diff_nonconst_splat_nxv4i16(i16 %a, i16 %b) { -; SVE-LABEL: interleave2_diff_nonconst_splat_nxv4i16: -; SVE: // %bb.0: -; SVE-NEXT: // kill: def $w1 killed $w1 def $x1 -; SVE-NEXT: // kill: def $w0 killed $w0 def $x0 -; SVE-NEXT: mov z0.d, x0 -; SVE-NEXT: mov z1.d, x1 -; SVE-NEXT: zip2 z2.d, z0.d, z1.d -; SVE-NEXT: zip1 z0.d, z0.d, z1.d -; SVE-NEXT: uzp1 z0.s, z0.s, z2.s -; SVE-NEXT: ret -; -; SME2-LABEL: interleave2_diff_nonconst_splat_nxv4i16: -; SME2: // %bb.0: -; SME2-NEXT: // kill: def $w1 killed $w1 def $x1 -; SME2-NEXT: // kill: def $w0 killed $w0 def $x0 -; SME2-NEXT: mov z0.d, x0 -; SME2-NEXT: mov z1.d, x1 -; SME2-NEXT: zip { z0.d, z1.d }, z0.d, z1.d -; SME2-NEXT: uzp1 z0.s, z0.s, z1.s -; SME2-NEXT: ret ; V-LABEL: interleave2_diff_nonconst_splat_nxv4i16: ; V: # %bb.0: ; V-NEXT: vsetvli a2, zero, e16, mf2, ta, ma @@ -15171,78 +15079,14 @@ define @interleave2_diff_nonconst_splat_nxv4i16(i16 %a, i16 % define @interleave4_same_const_splat_nxv8i16() { ; CHECK-LABEL: interleave4_same_const_splat_nxv8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-NEXT: vmv.v.i v8, 3 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: vmv1r.v v9, v8 -; CHECK-NEXT: srli a2, a1, 1 -; CHECK-NEXT: vmv1r.v v10, v8 -; CHECK-NEXT: add a3, a0, a2 -; CHECK-NEXT: vmv1r.v v11, v8 -; CHECK-NEXT: vsseg4e16.v v8, (a0) -; CHECK-NEXT: add a4, a3, a2 -; CHECK-NEXT: add a2, a4, a2 -; CHECK-NEXT: vle16.v v9, (a4) -; CHECK-NEXT: vle16.v v8, (a2) -; CHECK-NEXT: srli a1, a1, 2 -; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; CHECK-NEXT: vslideup.vx v9, v8, a1 -; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v10, (a3) -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma -; CHECK-NEXT: vslideup.vx v8, v10, a1 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: ret ; ; ZVBB-LABEL: interleave4_same_const_splat_nxv8i16: ; ZVBB: # %bb.0: -; ZVBB-NEXT: addi sp, sp, -16 -; ZVBB-NEXT: .cfi_def_cfa_offset 16 -; ZVBB-NEXT: csrr a0, vlenb -; ZVBB-NEXT: slli a0, a0, 1 -; ZVBB-NEXT: sub sp, sp, a0 -; ZVBB-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb -; ZVBB-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVBB-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; ZVBB-NEXT: vmv.v.i v8, 3 -; ZVBB-NEXT: addi a0, sp, 16 -; ZVBB-NEXT: csrr a1, vlenb -; ZVBB-NEXT: vmv1r.v v9, v8 -; ZVBB-NEXT: srli a2, a1, 1 -; ZVBB-NEXT: vmv1r.v v10, v8 -; ZVBB-NEXT: add a3, a0, a2 -; ZVBB-NEXT: vmv1r.v v11, v8 -; ZVBB-NEXT: vsseg4e16.v v8, (a0) -; ZVBB-NEXT: add a4, a3, a2 -; ZVBB-NEXT: add a2, a4, a2 -; ZVBB-NEXT: vle16.v v9, (a4) -; ZVBB-NEXT: vle16.v v8, (a2) -; ZVBB-NEXT: srli a1, a1, 2 -; ZVBB-NEXT: vsetvli a2, zero, e16, m1, ta, ma -; ZVBB-NEXT: vslideup.vx v9, v8, a1 -; ZVBB-NEXT: vsetvli a2, zero, e16, 
mf2, ta, ma
-; ZVBB-NEXT:    vle16.v v10, (a3)
-; ZVBB-NEXT:    vle16.v v8, (a0)
-; ZVBB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; ZVBB-NEXT:    vslideup.vx v8, v10, a1
-; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    slli a0, a0, 1
-; ZVBB-NEXT:    add sp, sp, a0
-; ZVBB-NEXT:    .cfi_def_cfa sp, 16
-; ZVBB-NEXT:    addi sp, sp, 16
-; ZVBB-NEXT:    .cfi_def_cfa_offset 0
+; ZVBB-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; ZVBB-NEXT:    vmv.v.i v8, 3
 ; ZVBB-NEXT:    ret
   %retval = call <vscale x 8 x i16> @llvm.vector.interleave4.nxv8i16(<vscale x 2 x i16> splat(i16 3), <vscale x 2 x i16> splat(i16 3), <vscale x 2 x i16> splat(i16 3), <vscale x 2 x i16> splat(i16 3))
   ret <vscale x 8 x i16> %retval

From 777e72eb680379b3051ba78cc5fae6f6337878e7 Mon Sep 17 00:00:00 2001
From: David Sherwood
Date: Thu, 31 Jul 2025 13:45:13 +0000
Subject: [PATCH 3/3] Address review comments

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index dcd8f98c267cd..af77277a17bde 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -25504,11 +25504,10 @@ SDValue DAGCombiner::visitVECTOR_INTERLEAVE(SDNode *N) {
     return SDValue();
 
   // Check to see if the identical operand is a splat.
-  SDValue Splat = DAG.getSplatValue(N->getOperand(0));
-  if (!Splat)
+  if (!DAG.isSplatValue(N->getOperand(0)))
     return SDValue();
 
-  // Simply replace all results with the first operand.
+  // interleave splat(X), splat(X).... --> splat(X), splat(X)....
   SmallVector<SDValue> Ops;
   Ops.append(N->op_values().begin(), N->op_values().end());
   return CombineTo(N, &Ops);
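
Note for reviewers (not part of the patch): the combine only fires when every operand of the ISD::VECTOR_INTERLEAVE node is the same splat, in which case each result is replaced by its corresponding operand and the interleave node disappears. A minimal standalone IR sketch of the pattern follows; the function name is made up, while the intrinsic and types mirror the tests above.

; Illustrative only: interleaving two identical splats of 3 just yields a
; wider splat of 3, so with this combine the call lowers to a single splat
; instruction (e.g. vmv.v.i on RISC-V) instead of zip/slide sequences.
define <vscale x 8 x i16> @example_interleave_of_identical_splats() {
  %r = call <vscale x 8 x i16> @llvm.vector.interleave2.nxv8i16(<vscale x 4 x i16> splat(i16 3), <vscale x 4 x i16> splat(i16 3))
  ret <vscale x 8 x i16> %r
}

declare <vscale x 8 x i16> @llvm.vector.interleave2.nxv8i16(<vscale x 4 x i16>, <vscale x 4 x i16>)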