From ebcb4929004ae3f08b2ca3d5d246f29aa73600e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=ABtan=20Bossu?= Date: Thu, 31 Jul 2025 13:07:28 +0000 Subject: [PATCH] [AArch64][ISel] Subvector extracts can use undef for second EXT input This will later allow us to use the SVE2 constructive variant of EXT without requiring a MOV. That is because that variant of EXT requires two consecutive Z registers as input. As a consequence, extracting a subvector from e.g. z2 into z0 would require: z3 = MOV z2 z0 = EXT_ZZI_B { z2, z3 }, idx With this change, the z3 part of the { z2, z3 } tuple will be marked as undef, allowing the MOV to be simplified. We just need to add patterns for EXT_ZZI_B now; currently only the destructive EXT_ZZI variant is selected. --- .../Target/AArch64/AArch64ISelLowering.cpp | 4 +- .../AArch64/get-active-lane-mask-extract.ll | 8 +- .../AArch64/sve-fixed-length-int-div.ll | 52 +-- .../AArch64/sve-fixed-length-int-rem.ll | 44 +-- .../sve-fixed-length-masked-scatter.ll | 12 +- .../AArch64/sve-fixed-length-shuffles.ll | 2 +- ...aming-mode-fixed-length-fp-extend-trunc.ll | 2 +- ...e-streaming-mode-fixed-length-fp-to-int.ll | 20 +- ...sve-streaming-mode-fixed-length-int-div.ll | 120 +++--- ...streaming-mode-fixed-length-int-extends.ll | 208 +++++------ ...sve-streaming-mode-fixed-length-int-rem.ll | 88 ++--- ...e-streaming-mode-fixed-length-int-to-fp.ll | 116 +++--- ...-streaming-mode-fixed-length-reductions.ll | 40 +- .../sve2-fixed-length-extract-subvector.ll | 341 ++++++++++++++++++ 14 files changed, 700 insertions(+), 357 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/sve2-fixed-length-extract-subvector.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 7c9fc67bb0119..16f02293c78f3 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -15544,7 +15544,9 @@ SDValue 
AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, assert(InVT.isScalableVector() && "Unexpected vector type!"); // Move requested subvector to the start of the vector and try again. - SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, InVT, Vec, Vec, Idx); + // There's no need for a second input to vector_splice, so use undef there. + SDValue Splice = + DAG.getNode(ISD::VECTOR_SPLICE, DL, InVT, Vec, DAG.getUNDEF(InVT), Idx); return convertFromScalableVector(DAG, VT, Splice); } diff --git a/llvm/test/CodeGen/AArch64/get-active-lane-mask-extract.ll b/llvm/test/CodeGen/AArch64/get-active-lane-mask-extract.ll index 5e01612e3881a..50975d16c7e9e 100644 --- a/llvm/test/CodeGen/AArch64/get-active-lane-mask-extract.ll +++ b/llvm/test/CodeGen/AArch64/get-active-lane-mask-extract.ll @@ -180,7 +180,7 @@ define void @test_fixed_extract(i64 %i, i64 %n) #0 { ; CHECK-SVE-NEXT: mov z1.s, p0/z, #1 // =0x1 ; CHECK-SVE-NEXT: fmov s0, w8 ; CHECK-SVE-NEXT: mov v0.s[1], v1.s[1] -; CHECK-SVE-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-SVE-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-SVE-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-SVE-NEXT: // kill: def $d1 killed $d1 killed $z1 ; CHECK-SVE-NEXT: b use @@ -192,7 +192,7 @@ define void @test_fixed_extract(i64 %i, i64 %n) #0 { ; CHECK-SVE2p1-NEXT: mov z1.s, p0/z, #1 // =0x1 ; CHECK-SVE2p1-NEXT: fmov s0, w8 ; CHECK-SVE2p1-NEXT: mov v0.s[1], v1.s[1] -; CHECK-SVE2p1-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-SVE2p1-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-SVE2p1-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-SVE2p1-NEXT: // kill: def $d1 killed $d1 killed $z1 ; CHECK-SVE2p1-NEXT: b use @@ -204,10 +204,10 @@ define void @test_fixed_extract(i64 %i, i64 %n) #0 { ; CHECK-SME2-NEXT: mov z1.s, p0/z, #1 // =0x1 ; CHECK-SME2-NEXT: fmov s2, w8 ; CHECK-SME2-NEXT: mov z0.s, z1.s[1] -; CHECK-SME2-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-SME2-NEXT: // kill: def $d1 killed $d1 killed $z1 ; CHECK-SME2-NEXT: zip1 z0.s, z2.s, z0.s +; CHECK-SME2-NEXT: 
ext z1.b, z1.b, z0.b, #8 ; CHECK-SME2-NEXT: // kill: def $d0 killed $d0 killed $z0 +; CHECK-SME2-NEXT: // kill: def $d1 killed $d1 killed $z1 ; CHECK-SME2-NEXT: b use %r = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 %i, i64 %n) %v0 = call <2 x i1> @llvm.vector.extract.v2i1.nxv4i1.i64( %r, i64 0) diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll index 0ddf434eff930..4d29b1724a8dd 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll @@ -125,7 +125,7 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { ; VBITS_GE_256-NEXT: sunpklo z0.h, z0.b ; VBITS_GE_256-NEXT: sunpklo z2.s, z1.h ; VBITS_GE_256-NEXT: sunpklo z3.s, z0.h -; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z0.b, #16 ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 ; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h @@ -210,7 +210,7 @@ define void @sdiv_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: sunpklo z0.h, z0.b ; CHECK-NEXT: sunpklo z2.s, z1.h ; CHECK-NEXT: sunpklo z3.s, z0.h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #128 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #128 ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128 ; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: sunpklo z0.s, z0.h @@ -239,24 +239,24 @@ define void @sdiv_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] ; CHECK-NEXT: sunpklo z2.h, z1.b ; CHECK-NEXT: sunpklo z3.h, z0.b -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #128 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #128 ; CHECK-NEXT: sunpklo z1.h, z1.b ; CHECK-NEXT: sunpklo z4.s, z2.h ; CHECK-NEXT: sunpklo z5.s, z3.h -; CHECK-NEXT: ext z2.b, z2.b, z2.b, #128 -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128 -; CHECK-NEXT: sunpklo z0.h, z0.b +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #128 +; CHECK-NEXT: ext z3.b, z3.b, 
z0.b, #128 +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128 ; CHECK-NEXT: sunpklo z2.s, z2.h ; CHECK-NEXT: sunpklo z3.s, z3.h +; CHECK-NEXT: sunpklo z0.h, z0.b ; CHECK-NEXT: sdivr z4.s, p1/m, z4.s, z5.s ; CHECK-NEXT: sunpklo z5.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128 -; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s ; CHECK-NEXT: sunpklo z3.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #128 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #128 +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128 ; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: sdivr z3.s, p1/m, z3.s, z5.s ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: sdiv z0.s, p1/m, z0.s, z1.s @@ -398,7 +398,7 @@ define void @sdiv_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1] ; VBITS_GE_256-NEXT: sunpklo z2.s, z1.h ; VBITS_GE_256-NEXT: sunpklo z3.s, z0.h -; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z0.b, #16 ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 ; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h @@ -476,7 +476,7 @@ define void @sdiv_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] ; CHECK-NEXT: sunpklo z2.s, z1.h ; CHECK-NEXT: sunpklo z3.s, z0.h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #128 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #128 ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128 ; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: sunpklo z0.s, z0.h @@ -858,7 +858,7 @@ define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { ; VBITS_GE_256-NEXT: uunpklo z0.h, z0.b ; VBITS_GE_256-NEXT: uunpklo z2.s, z1.h ; VBITS_GE_256-NEXT: uunpklo z3.s, z0.h -; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z0.b, #16 ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 ; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h @@ -930,12 +930,12 @@ define void @udiv_v128i8(ptr %a, ptr %b) 
vscale_range(16,0) #0 { ; CHECK-NEXT: ld1b { z0.h }, p0/z, [x1] ; CHECK-NEXT: ld1b { z1.h }, p0/z, [x0] ; CHECK-NEXT: uunpklo z2.s, z0.h -; CHECK-NEXT: uunpklo z3.s, z1.h ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128 -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #128 +; CHECK-NEXT: uunpklo z3.s, z1.h +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #128 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s +; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: udivr z0.s, p1/m, z0.s, z1.s ; CHECK-NEXT: ptrue p1.h, vl64 ; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h @@ -959,24 +959,24 @@ define void @udiv_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] ; CHECK-NEXT: uunpklo z2.h, z1.b ; CHECK-NEXT: uunpklo z3.h, z0.b -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #128 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #128 ; CHECK-NEXT: uunpklo z1.h, z1.b ; CHECK-NEXT: uunpklo z4.s, z2.h ; CHECK-NEXT: uunpklo z5.s, z3.h -; CHECK-NEXT: ext z2.b, z2.b, z2.b, #128 -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128 -; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #128 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #128 +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128 ; CHECK-NEXT: uunpklo z2.s, z2.h ; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: uunpklo z0.h, z0.b ; CHECK-NEXT: udivr z4.s, p1/m, z4.s, z5.s ; CHECK-NEXT: uunpklo z5.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128 -; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s ; CHECK-NEXT: uunpklo z3.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #128 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #128 +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128 ; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: udivr z3.s, p1/m, z3.s, z5.s ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: udiv z0.s, p1/m, z0.s, z1.s @@ -1118,7 +1118,7 @@ define void @udiv_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, 
[x1] ; VBITS_GE_256-NEXT: uunpklo z2.s, z1.h ; VBITS_GE_256-NEXT: uunpklo z3.s, z0.h -; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z0.b, #16 ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 ; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h @@ -1187,7 +1187,7 @@ define void @udiv_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] ; CHECK-NEXT: uunpklo z2.s, z1.h ; CHECK-NEXT: uunpklo z3.s, z0.h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #128 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #128 ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #128 ; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: uunpklo z0.s, z0.h diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll index 2d78945399176..9ce174b228dd5 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll @@ -129,8 +129,8 @@ define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: sunpklo z4.s, z2.h ; VBITS_GE_256-NEXT: sunpklo z5.s, z3.h -; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16 -; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_GE_256-NEXT: ext z2.b, z2.b, z0.b, #16 +; VBITS_GE_256-NEXT: ext z3.b, z3.b, z0.b, #16 ; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h ; VBITS_GE_256-NEXT: sunpklo z3.s, z3.h ; VBITS_GE_256-NEXT: sdivr z4.s, p0/m, z4.s, z5.s @@ -222,8 +222,8 @@ define void @srem_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: sunpklo z3.h, z0.b ; CHECK-NEXT: sunpklo z4.s, z2.h ; CHECK-NEXT: sunpklo z5.s, z3.h -; CHECK-NEXT: ext z2.b, z2.b, z2.b, #128 -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128 +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #128 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #128 ; CHECK-NEXT: sunpklo z2.s, z2.h ; CHECK-NEXT: sunpklo z3.s, z3.h ; CHECK-NEXT: sdivr z4.s, p1/m, z4.s, z5.s @@ -254,8 +254,8 @@ define void 
@srem_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: sunpklo z3.h, z0.b ; CHECK-NEXT: sunpklo z4.s, z2.h ; CHECK-NEXT: sunpklo z5.s, z3.h -; CHECK-NEXT: ext z2.b, z2.b, z2.b, #128 -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128 +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #128 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #128 ; CHECK-NEXT: sunpklo z2.s, z2.h ; CHECK-NEXT: sunpklo z3.s, z3.h ; CHECK-NEXT: sdivr z4.s, p1/m, z4.s, z5.s @@ -263,15 +263,15 @@ define void @srem_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ext z5.b, z5.b, z0.b, #128 ; CHECK-NEXT: sunpklo z5.h, z5.b ; CHECK-NEXT: sunpklo z7.s, z5.h -; CHECK-NEXT: ext z5.b, z5.b, z5.b, #128 +; CHECK-NEXT: ext z5.b, z5.b, z0.b, #128 ; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s ; CHECK-NEXT: mov z3.d, z1.d ; CHECK-NEXT: sunpklo z5.s, z5.h -; CHECK-NEXT: ext z3.b, z3.b, z1.b, #128 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #128 ; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h ; CHECK-NEXT: sunpklo z3.h, z3.b ; CHECK-NEXT: sunpklo z6.s, z3.h -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #128 ; CHECK-NEXT: sunpklo z3.s, z3.h ; CHECK-NEXT: sdivr z6.s, p1/m, z6.s, z7.s ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h @@ -425,7 +425,7 @@ define void @srem_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: sdivr z2.s, p1/m, z2.s, z3.s ; VBITS_GE_256-NEXT: mov z3.d, z1.d ; VBITS_GE_256-NEXT: sunpklo z4.s, z4.h -; VBITS_GE_256-NEXT: ext z3.b, z3.b, z1.b, #16 +; VBITS_GE_256-NEXT: ext z3.b, z3.b, z0.b, #16 ; VBITS_GE_256-NEXT: sunpklo z3.s, z3.h ; VBITS_GE_256-NEXT: sdivr z3.s, p1/m, z3.s, z4.s ; VBITS_GE_256-NEXT: ptrue p1.h, vl8 @@ -512,7 +512,7 @@ define void @srem_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: sdivr z2.s, p1/m, z2.s, z3.s ; CHECK-NEXT: mov z3.d, z1.d ; CHECK-NEXT: sunpklo z4.s, z4.h -; CHECK-NEXT: ext z3.b, z3.b, z1.b, #128 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #128 ; CHECK-NEXT: sunpklo z3.s, z3.h ; CHECK-NEXT: sdivr z3.s, p1/m, z3.s, z4.s ; CHECK-NEXT: ptrue p1.h, vl64 @@ 
-947,8 +947,8 @@ define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: uunpklo z4.s, z2.h ; VBITS_GE_256-NEXT: uunpklo z5.s, z3.h -; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16 -; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16 +; VBITS_GE_256-NEXT: ext z2.b, z2.b, z0.b, #16 +; VBITS_GE_256-NEXT: ext z3.b, z3.b, z0.b, #16 ; VBITS_GE_256-NEXT: uunpklo z2.s, z2.h ; VBITS_GE_256-NEXT: uunpklo z3.s, z3.h ; VBITS_GE_256-NEXT: udivr z4.s, p0/m, z4.s, z5.s @@ -1040,8 +1040,8 @@ define void @urem_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: uunpklo z3.h, z0.b ; CHECK-NEXT: uunpklo z4.s, z2.h ; CHECK-NEXT: uunpklo z5.s, z3.h -; CHECK-NEXT: ext z2.b, z2.b, z2.b, #128 -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128 +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #128 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #128 ; CHECK-NEXT: uunpklo z2.s, z2.h ; CHECK-NEXT: uunpklo z3.s, z3.h ; CHECK-NEXT: udivr z4.s, p1/m, z4.s, z5.s @@ -1072,8 +1072,8 @@ define void @urem_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: uunpklo z3.h, z0.b ; CHECK-NEXT: uunpklo z4.s, z2.h ; CHECK-NEXT: uunpklo z5.s, z3.h -; CHECK-NEXT: ext z2.b, z2.b, z2.b, #128 -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128 +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #128 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #128 ; CHECK-NEXT: uunpklo z2.s, z2.h ; CHECK-NEXT: uunpklo z3.s, z3.h ; CHECK-NEXT: udivr z4.s, p1/m, z4.s, z5.s @@ -1081,15 +1081,15 @@ define void @urem_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ext z5.b, z5.b, z0.b, #128 ; CHECK-NEXT: uunpklo z5.h, z5.b ; CHECK-NEXT: uunpklo z7.s, z5.h -; CHECK-NEXT: ext z5.b, z5.b, z5.b, #128 +; CHECK-NEXT: ext z5.b, z5.b, z0.b, #128 ; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s ; CHECK-NEXT: mov z3.d, z1.d ; CHECK-NEXT: uunpklo z5.s, z5.h -; CHECK-NEXT: ext z3.b, z3.b, z1.b, #128 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #128 ; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h ; CHECK-NEXT: uunpklo z3.h, z3.b ; CHECK-NEXT: 
uunpklo z6.s, z3.h -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #128 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #128 ; CHECK-NEXT: uunpklo z3.s, z3.h ; CHECK-NEXT: udivr z6.s, p1/m, z6.s, z7.s ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h @@ -1243,7 +1243,7 @@ define void @urem_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: udivr z2.s, p1/m, z2.s, z3.s ; VBITS_GE_256-NEXT: mov z3.d, z1.d ; VBITS_GE_256-NEXT: uunpklo z4.s, z4.h -; VBITS_GE_256-NEXT: ext z3.b, z3.b, z1.b, #16 +; VBITS_GE_256-NEXT: ext z3.b, z3.b, z0.b, #16 ; VBITS_GE_256-NEXT: uunpklo z3.s, z3.h ; VBITS_GE_256-NEXT: udivr z3.s, p1/m, z3.s, z4.s ; VBITS_GE_256-NEXT: ptrue p1.h, vl8 @@ -1330,7 +1330,7 @@ define void @urem_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: udivr z2.s, p1/m, z2.s, z3.s ; CHECK-NEXT: mov z3.d, z1.d ; CHECK-NEXT: uunpklo z4.s, z4.h -; CHECK-NEXT: ext z3.b, z3.b, z1.b, #128 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #128 ; CHECK-NEXT: uunpklo z3.s, z3.h ; CHECK-NEXT: udivr z3.s, p1/m, z3.s, z4.s ; CHECK-NEXT: ptrue p1.h, vl64 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll index ed03f9b322432..1cbe97a54e900 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll @@ -338,14 +338,14 @@ define void @masked_scatter_v8i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: cmpeq p0.s, p0/z, z0.s, #0 ; VBITS_GE_256-NEXT: uunpklo z2.d, z0.s -; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 -; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s ; VBITS_GE_256-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff ; VBITS_GE_256-NEXT: punpklo p0.h, p0.b ; VBITS_GE_256-NEXT: and p0.b, p0/z, p0.b, p1.b -; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z0.b, #16 +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 ; VBITS_GE_256-NEXT: st1w { z2.d }, p0, [z3.d] ; 
VBITS_GE_256-NEXT: sunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s ; VBITS_GE_256-NEXT: cmpne p0.d, p1/z, z1.d, #0 ; VBITS_GE_256-NEXT: st1w { z0.d }, p0, [z4.d] ; VBITS_GE_256-NEXT: ret @@ -715,14 +715,14 @@ define void @masked_scatter_v8f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 ; VBITS_GE_256-NEXT: uunpklo z2.d, z0.s -; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 -; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s ; VBITS_GE_256-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff ; VBITS_GE_256-NEXT: punpklo p0.h, p0.b ; VBITS_GE_256-NEXT: and p0.b, p0/z, p0.b, p1.b -; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z0.b, #16 +; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 ; VBITS_GE_256-NEXT: st1w { z2.d }, p0, [z3.d] ; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s +; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s ; VBITS_GE_256-NEXT: cmpne p0.d, p1/z, z1.d, #0 ; VBITS_GE_256-NEXT: st1w { z0.d }, p0, [z4.d] ; VBITS_GE_256-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll index c48ee3939bd2e..2560f45a153b4 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll @@ -13,7 +13,7 @@ define void @hang_when_merging_stores_after_legalisation(ptr %a, <2 x i32> %b) v ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov z0.s, s0 ; CHECK-NEXT: mov z1.d, z0.d -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #16 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16 ; CHECK-NEXT: st2 { v0.4s, v1.4s }, [x0] ; CHECK-NEXT: ret %splat = shufflevector <2 x i32> %b, <2 x i32> poison, <8 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll index 56149e99b15f8..e3d0a72c74b87 100644 --- 
a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll @@ -127,7 +127,7 @@ define void @fcvt_v16f16_to_v16f32(<16 x half> %a, ptr %b) { ; CHECK-NEXT: uunpklo z2.s, z1.h ; CHECK-NEXT: uunpklo z3.s, z0.h ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: uunpklo z0.s, z0.h diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll index 94d756a36ab92..86de930cecca9 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll @@ -323,11 +323,11 @@ define void @fcvtzu_v16f16_v16i32(ptr %a, ptr %b) { ; CHECK-NEXT: uunpklo z2.s, z0.h ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: uunpklo z3.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: fcvtzu z2.s, p0/m, z2.h ; CHECK-NEXT: fcvtzu z3.s, p0/m, z3.h +; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.h ; CHECK-NEXT: fcvtzu z1.s, p0/m, z1.h ; CHECK-NEXT: stp q2, q0, [x1, #32] @@ -606,7 +606,7 @@ define void @fcvtzu_v16f16_v16i64(ptr %a, ptr %b) { ; CHECK-NEXT: mov z4.h, z1.h[3] ; CHECK-NEXT: mov z7.h, z1.h[2] ; CHECK-NEXT: mov z17.h, z0.h[1] -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: fcvtzu z3.d, p0/m, z3.h ; CHECK-NEXT: fcvtzu z5.d, p0/m, z5.h ; CHECK-NEXT: fcvtzu z6.d, p0/m, z6.h @@ -1109,11 +1109,11 @@ define void @fcvtzu_v8f32_v8i64(ptr %a, ptr %b) { ; CHECK-NEXT: uunpklo z2.d, z0.s ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: uunpklo z3.d, z1.s -; CHECK-NEXT: ext z1.b, z1.b, 
z1.b, #8 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: uunpklo z1.d, z1.s ; CHECK-NEXT: fcvtzu z2.d, p0/m, z2.s ; CHECK-NEXT: fcvtzu z3.d, p0/m, z3.s +; CHECK-NEXT: uunpklo z1.d, z1.s ; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.s ; CHECK-NEXT: fcvtzu z1.d, p0/m, z1.s ; CHECK-NEXT: stp q2, q0, [x1, #32] @@ -2018,11 +2018,11 @@ define void @fcvtzs_v16f16_v16i32(ptr %a, ptr %b) { ; CHECK-NEXT: uunpklo z2.s, z0.h ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: uunpklo z3.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: fcvtzs z2.s, p0/m, z2.h ; CHECK-NEXT: fcvtzs z3.s, p0/m, z3.h +; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.h ; CHECK-NEXT: fcvtzs z1.s, p0/m, z1.h ; CHECK-NEXT: stp q2, q0, [x1, #32] @@ -2302,7 +2302,7 @@ define void @fcvtzs_v16f16_v16i64(ptr %a, ptr %b) { ; CHECK-NEXT: mov z4.h, z1.h[3] ; CHECK-NEXT: mov z7.h, z1.h[2] ; CHECK-NEXT: mov z17.h, z0.h[1] -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: fcvtzs z3.d, p0/m, z3.h ; CHECK-NEXT: fcvtzs z5.d, p0/m, z5.h ; CHECK-NEXT: fcvtzs z6.d, p0/m, z6.h @@ -2805,11 +2805,11 @@ define void @fcvtzs_v8f32_v8i64(ptr %a, ptr %b) { ; CHECK-NEXT: uunpklo z2.d, z0.s ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: uunpklo z3.d, z1.s -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: uunpklo z1.d, z1.s ; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.s ; CHECK-NEXT: fcvtzs z3.d, p0/m, z3.s +; CHECK-NEXT: uunpklo z1.d, z1.s ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.s ; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.s ; CHECK-NEXT: stp q2, q0, [x1, #32] diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll index 1fdcd4f826870..ae7c676172867 100644 --- 
a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll @@ -63,7 +63,7 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: sunpklo z0.h, z0.b ; CHECK-NEXT: sunpklo z2.s, z1.h ; CHECK-NEXT: sunpklo z3.s, z0.h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: sunpklo z0.s, z0.h @@ -129,24 +129,24 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: sunpklo z2.h, z1.b ; CHECK-NEXT: sunpklo z3.h, z0.b ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: sunpklo z4.s, z2.h ; CHECK-NEXT: sunpklo z5.s, z3.h -; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: sunpklo z1.h, z1.b -; CHECK-NEXT: sunpklo z0.h, z0.b ; CHECK-NEXT: sunpklo z2.s, z2.h ; CHECK-NEXT: sunpklo z3.s, z3.h +; CHECK-NEXT: sunpklo z0.h, z0.b ; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s ; CHECK-NEXT: sunpklo z5.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: sunpklo z3.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z5.s ; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h ; CHECK-NEXT: uzp1 z5.h, z2.h, z2.h @@ -246,45 +246,45 @@ define void @sdiv_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: ldr q2, [x0, #16] ; CHECK-NEXT: sunpklo z1.h, z3.b ; CHECK-NEXT: sunpklo z4.h, z2.b -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 -; CHECK-NEXT: ext z2.b, 
z2.b, z2.b, #8 ; CHECK-NEXT: sunpklo z7.h, z6.b -; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 -; CHECK-NEXT: sunpklo z3.h, z3.b ; CHECK-NEXT: sunpklo z0.s, z1.h ; CHECK-NEXT: sunpklo z5.s, z4.h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 ; CHECK-NEXT: sunpklo z17.s, z7.h -; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8 -; CHECK-NEXT: sunpklo z6.h, z6.b +; CHECK-NEXT: sdivr z0.s, p0/m, z0.s, z5.s +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 +; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 +; CHECK-NEXT: ext z7.b, z7.b, z0.b, #8 +; CHECK-NEXT: ext z6.b, z6.b, z0.b, #8 ; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: sunpklo z4.s, z4.h +; CHECK-NEXT: sunpklo z3.h, z3.b ; CHECK-NEXT: sunpklo z7.s, z7.h -; CHECK-NEXT: sdivr z0.s, p0/m, z0.s, z5.s +; CHECK-NEXT: sunpklo z6.h, z6.b ; CHECK-NEXT: sdivr z1.s, p0/m, z1.s, z4.s ; CHECK-NEXT: sunpklo z4.h, z2.b ; CHECK-NEXT: sunpklo z2.s, z3.h -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 ; CHECK-NEXT: sunpklo z5.s, z4.h -; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 +; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 ; CHECK-NEXT: sunpklo z3.s, z3.h ; CHECK-NEXT: sunpklo z4.s, z4.h ; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z5.s ; CHECK-NEXT: ldr q5, [x0] ; CHECK-NEXT: sunpklo z16.h, z5.b -; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 +; CHECK-NEXT: ext z5.b, z5.b, z0.b, #8 ; CHECK-NEXT: sunpklo z5.h, z5.b ; CHECK-NEXT: sunpklo z18.s, z16.h -; CHECK-NEXT: ext z16.b, z16.b, z16.b, #8 +; CHECK-NEXT: ext z16.b, z16.b, z0.b, #8 ; CHECK-NEXT: sunpklo z16.s, z16.h ; CHECK-NEXT: sdivr z17.s, p0/m, z17.s, z18.s ; CHECK-NEXT: sunpklo z18.s, z5.h -; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 +; CHECK-NEXT: ext z5.b, z5.b, z0.b, #8 ; CHECK-NEXT: sunpklo z5.s, z5.h ; CHECK-NEXT: sdivr z7.s, p0/m, z7.s, z16.s ; CHECK-NEXT: sunpklo z16.s, z6.h -; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 +; CHECK-NEXT: ext z6.b, z6.b, z0.b, #8 ; CHECK-NEXT: sunpklo z6.s, z6.h ; 
CHECK-NEXT: uzp1 z20.h, z17.h, z17.h ; CHECK-NEXT: sdivr z16.s, p0/m, z16.s, z18.s @@ -539,7 +539,7 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: sunpklo z2.s, z1.h ; CHECK-NEXT: sunpklo z3.s, z0.h ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: sunpklo z0.s, z0.h @@ -604,17 +604,17 @@ define void @sdiv_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: sunpklo z2.s, z1.h ; CHECK-NEXT: sunpklo z3.s, z0.h ; CHECK-NEXT: sunpklo z5.s, z4.h -; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: ldr q3, [x0] ; CHECK-NEXT: sunpklo z4.s, z4.h ; CHECK-NEXT: sunpklo z1.s, z1.h -; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: sunpklo z6.s, z3.h -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: sunpklo z3.s, z3.h +; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: sdivr z5.s, p0/m, z5.s, z6.s ; CHECK-NEXT: sdiv z3.s, p0/m, z3.s, z4.s ; CHECK-NEXT: uzp1 z4.h, z5.h, z5.h @@ -977,7 +977,7 @@ define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: uunpklo z0.h, z0.b ; CHECK-NEXT: uunpklo z2.s, z1.h ; CHECK-NEXT: uunpklo z3.s, z0.h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: uunpklo z0.s, z0.h @@ -1043,24 +1043,24 @@ define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: uunpklo z2.h, z1.b ; CHECK-NEXT: uunpklo z3.h, z0.b ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; 
CHECK-NEXT: uunpklo z4.s, z2.h ; CHECK-NEXT: uunpklo z5.s, z3.h -; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: uunpklo z1.h, z1.b -; CHECK-NEXT: uunpklo z0.h, z0.b ; CHECK-NEXT: uunpklo z2.s, z2.h ; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: uunpklo z0.h, z0.b ; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s ; CHECK-NEXT: uunpklo z5.s, z0.h -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: uunpklo z3.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z5.s ; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h ; CHECK-NEXT: uzp1 z5.h, z2.h, z2.h @@ -1160,45 +1160,45 @@ define void @udiv_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: ldr q2, [x0, #16] ; CHECK-NEXT: uunpklo z1.h, z3.b ; CHECK-NEXT: uunpklo z4.h, z2.b -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 -; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 ; CHECK-NEXT: uunpklo z7.h, z6.b -; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 -; CHECK-NEXT: uunpklo z3.h, z3.b ; CHECK-NEXT: uunpklo z0.s, z1.h ; CHECK-NEXT: uunpklo z5.s, z4.h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 ; CHECK-NEXT: uunpklo z17.s, z7.h -; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8 -; CHECK-NEXT: uunpklo z6.h, z6.b +; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z5.s +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 +; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 +; CHECK-NEXT: ext z7.b, z7.b, z0.b, #8 +; CHECK-NEXT: ext z6.b, z6.b, z0.b, #8 ; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: uunpklo z4.s, z4.h +; CHECK-NEXT: uunpklo z3.h, z3.b ; CHECK-NEXT: uunpklo z7.s, z7.h -; CHECK-NEXT: 
udivr z0.s, p0/m, z0.s, z5.s +; CHECK-NEXT: uunpklo z6.h, z6.b ; CHECK-NEXT: udivr z1.s, p0/m, z1.s, z4.s ; CHECK-NEXT: uunpklo z4.h, z2.b ; CHECK-NEXT: uunpklo z2.s, z3.h -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 ; CHECK-NEXT: uunpklo z5.s, z4.h -; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 +; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 ; CHECK-NEXT: uunpklo z3.s, z3.h ; CHECK-NEXT: uunpklo z4.s, z4.h ; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z5.s ; CHECK-NEXT: ldr q5, [x0] ; CHECK-NEXT: uunpklo z16.h, z5.b -; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 +; CHECK-NEXT: ext z5.b, z5.b, z0.b, #8 ; CHECK-NEXT: uunpklo z5.h, z5.b ; CHECK-NEXT: uunpklo z18.s, z16.h -; CHECK-NEXT: ext z16.b, z16.b, z16.b, #8 +; CHECK-NEXT: ext z16.b, z16.b, z0.b, #8 ; CHECK-NEXT: uunpklo z16.s, z16.h ; CHECK-NEXT: udivr z17.s, p0/m, z17.s, z18.s ; CHECK-NEXT: uunpklo z18.s, z5.h -; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 +; CHECK-NEXT: ext z5.b, z5.b, z0.b, #8 ; CHECK-NEXT: uunpklo z5.s, z5.h ; CHECK-NEXT: udivr z7.s, p0/m, z7.s, z16.s ; CHECK-NEXT: uunpklo z16.s, z6.h -; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 +; CHECK-NEXT: ext z6.b, z6.b, z0.b, #8 ; CHECK-NEXT: uunpklo z6.s, z6.h ; CHECK-NEXT: uzp1 z20.h, z17.h, z17.h ; CHECK-NEXT: udivr z16.s, p0/m, z16.s, z18.s @@ -1453,7 +1453,7 @@ define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: uunpklo z2.s, z1.h ; CHECK-NEXT: uunpklo z3.s, z0.h ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: uunpklo z0.s, z0.h @@ -1518,17 +1518,17 @@ define void @udiv_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: uunpklo z2.s, z1.h ; CHECK-NEXT: uunpklo z3.s, z0.h ; CHECK-NEXT: uunpklo z5.s, z4.h -; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 +; CHECK-NEXT: ext z1.b, z1.b, 
z0.b, #8 ; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: ldr q3, [x0] ; CHECK-NEXT: uunpklo z4.s, z4.h ; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: uunpklo z6.s, z3.h -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: udivr z5.s, p0/m, z5.s, z6.s ; CHECK-NEXT: udiv z3.s, p0/m, z3.s, z4.s ; CHECK-NEXT: uzp1 z4.h, z5.h, z5.h diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll index 75911e5ff1569..b022c19363ed6 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll @@ -179,7 +179,7 @@ define void @sext_v32i8_v32i16(ptr %in, ptr %out) { ; CHECK-NEXT: sunpklo z2.h, z0.b ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: sunpklo z3.h, z1.b -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.h, z0.b ; CHECK-NEXT: sunpklo z1.h, z1.b ; CHECK-NEXT: stp q2, q0, [x1, #32] @@ -409,10 +409,10 @@ define void @sext_v16i8_v16i32(<16 x i8> %a, ptr %out) { ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.h, z0.b ; CHECK-NEXT: sunpklo z2.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: sunpklo z3.s, z0.h ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: stp q2, q1, [x0] ; CHECK-NEXT: stp q3, q0, [x0, #32] @@ -468,23 +468,23 @@ define void @sext_v32i8_v32i32(ptr %in, ptr %out) { ; CHECK-NEXT: sunpklo z2.h, z0.b ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: sunpklo z3.h, z1.b -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, 
#8 ; CHECK-NEXT: sunpklo z0.h, z0.b -; CHECK-NEXT: sunpklo z1.h, z1.b ; CHECK-NEXT: sunpklo z4.s, z2.h -; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 ; CHECK-NEXT: sunpklo z5.s, z3.h -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: sunpklo z1.h, z1.b +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 ; CHECK-NEXT: sunpklo z6.s, z0.h ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: sunpklo z2.s, z2.h ; CHECK-NEXT: sunpklo z7.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: sunpklo z2.s, z2.h ; CHECK-NEXT: sunpklo z3.s, z3.h +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: sunpklo z1.s, z1.h -; CHECK-NEXT: stp q4, q2, [x1, #64] ; CHECK-NEXT: stp q5, q3, [x1] +; CHECK-NEXT: stp q4, q2, [x1, #64] ; CHECK-NEXT: stp q6, q0, [x1, #96] ; CHECK-NEXT: stp q7, q1, [x1, #32] ; CHECK-NEXT: ret @@ -703,7 +703,7 @@ define void @sext_v8i8_v8i64(<8 x i8> %a, ptr %out) { ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: sunpklo z2.d, z1.s -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: sunpklo z3.d, z0.s ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: sunpklo z1.d, z1.s @@ -748,22 +748,22 @@ define void @sext_v16i8_v16i64(<16 x i8> %a, ptr %out) { ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.h, z0.b ; CHECK-NEXT: sunpklo z2.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: sunpklo z3.s, z0.h ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: sunpklo z4.d, z2.s -; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sunpklo z6.d, z1.s -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 ; CHECK-NEXT: sunpklo z5.d, z3.s -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 ; CHECK-NEXT: sunpklo z2.d, z2.s -; CHECK-NEXT: sunpklo z1.d, 
z1.s +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 +; CHECK-NEXT: sunpklo z6.d, z1.s +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: sunpklo z7.d, z0.s ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: sunpklo z3.d, z3.s +; CHECK-NEXT: sunpklo z1.d, z1.s ; CHECK-NEXT: stp q4, q2, [x0] ; CHECK-NEXT: sunpklo z0.d, z0.s ; CHECK-NEXT: stp q6, q1, [x0, #32] @@ -824,56 +824,56 @@ define void @sext_v32i8_v32i64(ptr %in, ptr %out) { ; CHECK-NEXT: add z1.b, z1.b, z1.b ; CHECK-NEXT: mov z2.d, z0.d ; CHECK-NEXT: sunpklo z3.h, z1.b -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.h, z0.b ; CHECK-NEXT: sunpklo z1.h, z1.b ; CHECK-NEXT: sunpklo z4.s, z3.h -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 ; CHECK-NEXT: sunpklo z2.h, z2.b +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 ; CHECK-NEXT: sunpklo z5.s, z0.h +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: mov z7.d, z1.d -; CHECK-NEXT: sunpklo z3.s, z3.h ; CHECK-NEXT: sunpklo z16.d, z4.s -; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: sunpklo z6.s, z2.h -; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 -; CHECK-NEXT: ext z7.b, z7.b, z1.b, #8 +; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 +; CHECK-NEXT: ext z7.b, z7.b, z0.b, #8 +; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: mov z17.d, z5.d -; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: sunpklo z3.s, z3.h ; CHECK-NEXT: sunpklo z5.d, z5.s +; CHECK-NEXT: sunpklo z20.d, z1.s ; CHECK-NEXT: sunpklo z4.d, z4.s -; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sunpklo z19.d, z3.s ; CHECK-NEXT: sunpklo z2.s, z2.h ; CHECK-NEXT: sunpklo z7.s, z7.h -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 -; CHECK-NEXT: ext z17.b, z17.b, z17.b, #8 ; CHECK-NEXT: sunpklo z18.d, z6.s -; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 -; CHECK-NEXT: sunpklo z20.d, z1.s -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext 
z17.b, z17.b, z0.b, #8 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 +; CHECK-NEXT: ext z6.b, z6.b, z0.b, #8 +; CHECK-NEXT: sunpklo z19.d, z3.s +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 ; CHECK-NEXT: stp q16, q4, [x1, #128] -; CHECK-NEXT: sunpklo z3.d, z3.s ; CHECK-NEXT: sunpklo z16.d, z0.s +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: sunpklo z17.d, z17.s ; CHECK-NEXT: mov z4.d, z7.d -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: sunpklo z1.d, z1.s -; CHECK-NEXT: ext z4.b, z4.b, z7.b, #8 -; CHECK-NEXT: stp q19, q3, [x1, #160] +; CHECK-NEXT: sunpklo z3.d, z3.s +; CHECK-NEXT: sunpklo z7.d, z7.s ; CHECK-NEXT: sunpklo z0.d, z0.s ; CHECK-NEXT: stp q5, q17, [x1] ; CHECK-NEXT: sunpklo z5.d, z6.s ; CHECK-NEXT: mov z6.d, z2.d -; CHECK-NEXT: stp q20, q1, [x1, #192] -; CHECK-NEXT: sunpklo z7.d, z7.s -; CHECK-NEXT: sunpklo z1.d, z4.s -; CHECK-NEXT: ext z6.b, z6.b, z2.b, #8 +; CHECK-NEXT: stp q19, q3, [x1, #160] ; CHECK-NEXT: sunpklo z2.d, z2.s +; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 ; CHECK-NEXT: stp q16, q0, [x1, #32] +; CHECK-NEXT: ext z6.b, z6.b, z0.b, #8 +; CHECK-NEXT: stp q20, q1, [x1, #192] ; CHECK-NEXT: stp q18, q5, [x1, #64] +; CHECK-NEXT: sunpklo z1.d, z4.s ; CHECK-NEXT: sunpklo z3.d, z6.s ; CHECK-NEXT: stp q7, q1, [x1, #224] ; CHECK-NEXT: stp q2, q3, [x1, #96] @@ -1099,7 +1099,7 @@ define void @sext_v16i16_v16i32(ptr %in, ptr %out) { ; CHECK-NEXT: sunpklo z2.s, z0.h ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: sunpklo z3.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: stp q2, q0, [x1, #32] @@ -1223,10 +1223,10 @@ define void @sext_v8i16_v8i64(<8 x i16> %a, ptr %out) { ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: sunpklo z2.d, z1.s -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: sunpklo z1.d, z1.s +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: sunpklo z3.d, z0.s ; CHECK-NEXT: ext 
z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: sunpklo z1.d, z1.s ; CHECK-NEXT: sunpklo z0.d, z0.s ; CHECK-NEXT: stp q2, q1, [x0] ; CHECK-NEXT: stp q3, q0, [x0, #32] @@ -1270,23 +1270,23 @@ define void @sext_v16i16_v16i64(ptr %in, ptr %out) { ; CHECK-NEXT: sunpklo z2.s, z0.h ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: sunpklo z3.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: sunpklo z4.d, z2.s -; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 ; CHECK-NEXT: sunpklo z5.d, z3.s -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: sunpklo z1.s, z1.h +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 ; CHECK-NEXT: sunpklo z6.d, z0.s ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: sunpklo z2.d, z2.s ; CHECK-NEXT: sunpklo z7.d, z1.s -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: sunpklo z2.d, z2.s ; CHECK-NEXT: sunpklo z3.d, z3.s +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.d, z0.s ; CHECK-NEXT: sunpklo z1.d, z1.s -; CHECK-NEXT: stp q4, q2, [x1, #64] ; CHECK-NEXT: stp q5, q3, [x1] +; CHECK-NEXT: stp q4, q2, [x1, #64] ; CHECK-NEXT: stp q6, q0, [x1, #96] ; CHECK-NEXT: stp q7, q1, [x1, #32] ; CHECK-NEXT: ret @@ -1412,7 +1412,7 @@ define void @sext_v8i32_v8i64(ptr %in, ptr %out) { ; CHECK-NEXT: sunpklo z2.d, z0.s ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: sunpklo z3.d, z1.s -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.d, z0.s ; CHECK-NEXT: sunpklo z1.d, z1.s ; CHECK-NEXT: stp q2, q0, [x1, #32] @@ -1532,7 +1532,7 @@ define void @zext_v32i8_v32i16(ptr %in, ptr %out) { ; CHECK-NEXT: uunpklo z2.h, z0.b ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: uunpklo z3.h, z1.b -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.h, z0.b ; CHECK-NEXT: uunpklo z1.h, z1.b ; CHECK-NEXT: stp q2, 
q0, [x1, #32] @@ -1762,10 +1762,10 @@ define void @zext_v16i8_v16i32(<16 x i8> %a, ptr %out) { ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.h, z0.b ; CHECK-NEXT: uunpklo z2.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: uunpklo z3.s, z0.h ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: stp q2, q1, [x0] ; CHECK-NEXT: stp q3, q0, [x0, #32] @@ -1821,23 +1821,23 @@ define void @zext_v32i8_v32i32(ptr %in, ptr %out) { ; CHECK-NEXT: uunpklo z2.h, z0.b ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: uunpklo z3.h, z1.b -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.h, z0.b -; CHECK-NEXT: uunpklo z1.h, z1.b ; CHECK-NEXT: uunpklo z4.s, z2.h -; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 ; CHECK-NEXT: uunpklo z5.s, z3.h -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: uunpklo z1.h, z1.b +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 ; CHECK-NEXT: uunpklo z6.s, z0.h ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z2.s, z2.h ; CHECK-NEXT: uunpklo z7.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: uunpklo z2.s, z2.h ; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: uunpklo z1.s, z1.h -; CHECK-NEXT: stp q4, q2, [x1, #64] ; CHECK-NEXT: stp q5, q3, [x1] +; CHECK-NEXT: stp q4, q2, [x1, #64] ; CHECK-NEXT: stp q6, q0, [x1, #96] ; CHECK-NEXT: stp q7, q1, [x1, #32] ; CHECK-NEXT: ret @@ -2054,7 +2054,7 @@ define void @zext_v8i8_v8i64(<8 x i8> %a, ptr %out) { ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: uunpklo z2.d, z1.s -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: uunpklo z3.d, z0.s ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; 
CHECK-NEXT: uunpklo z1.d, z1.s @@ -2103,22 +2103,22 @@ define void @zext_v16i8_v16i64(<16 x i8> %a, ptr %out) { ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.h, z0.b ; CHECK-NEXT: uunpklo z2.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: uunpklo z3.s, z0.h ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: uunpklo z4.d, z2.s -; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z6.d, z1.s -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 ; CHECK-NEXT: uunpklo z5.d, z3.s -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 ; CHECK-NEXT: uunpklo z2.d, z2.s -; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 +; CHECK-NEXT: uunpklo z6.d, z1.s +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: uunpklo z7.d, z0.s ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: uunpklo z3.d, z3.s +; CHECK-NEXT: uunpklo z1.d, z1.s ; CHECK-NEXT: stp q4, q2, [x0] ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: stp q6, q1, [x0, #32] @@ -2187,56 +2187,56 @@ define void @zext_v32i8_v32i64(ptr %in, ptr %out) { ; CHECK-NEXT: add z1.b, z1.b, z1.b ; CHECK-NEXT: mov z2.d, z0.d ; CHECK-NEXT: uunpklo z3.h, z1.b -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.h, z0.b ; CHECK-NEXT: uunpklo z1.h, z1.b ; CHECK-NEXT: uunpklo z4.s, z3.h -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 ; CHECK-NEXT: uunpklo z2.h, z2.b +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 ; CHECK-NEXT: uunpklo z5.s, z0.h +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: mov z7.d, z1.d -; CHECK-NEXT: uunpklo z3.s, z3.h ; CHECK-NEXT: uunpklo z16.d, z4.s -; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: uunpklo z6.s, z2.h -; CHECK-NEXT: ext z2.b, 
z2.b, z2.b, #8 -; CHECK-NEXT: ext z7.b, z7.b, z1.b, #8 +; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 +; CHECK-NEXT: ext z7.b, z7.b, z0.b, #8 +; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: mov z17.d, z5.d -; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z3.s, z3.h ; CHECK-NEXT: uunpklo z5.d, z5.s +; CHECK-NEXT: uunpklo z20.d, z1.s ; CHECK-NEXT: uunpklo z4.d, z4.s -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z19.d, z3.s ; CHECK-NEXT: uunpklo z2.s, z2.h ; CHECK-NEXT: uunpklo z7.s, z7.h -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 -; CHECK-NEXT: ext z17.b, z17.b, z17.b, #8 ; CHECK-NEXT: uunpklo z18.d, z6.s -; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8 -; CHECK-NEXT: uunpklo z20.d, z1.s -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z17.b, z17.b, z0.b, #8 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 +; CHECK-NEXT: ext z6.b, z6.b, z0.b, #8 +; CHECK-NEXT: uunpklo z19.d, z3.s +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 ; CHECK-NEXT: stp q16, q4, [x1, #128] -; CHECK-NEXT: uunpklo z3.d, z3.s ; CHECK-NEXT: uunpklo z16.d, z0.s +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: uunpklo z17.d, z17.s ; CHECK-NEXT: mov z4.d, z7.d -; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: uunpklo z1.d, z1.s -; CHECK-NEXT: ext z4.b, z4.b, z7.b, #8 -; CHECK-NEXT: stp q19, q3, [x1, #160] +; CHECK-NEXT: uunpklo z3.d, z3.s +; CHECK-NEXT: uunpklo z7.d, z7.s ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: stp q5, q17, [x1] ; CHECK-NEXT: uunpklo z5.d, z6.s ; CHECK-NEXT: mov z6.d, z2.d -; CHECK-NEXT: stp q20, q1, [x1, #192] -; CHECK-NEXT: uunpklo z7.d, z7.s -; CHECK-NEXT: uunpklo z1.d, z4.s -; CHECK-NEXT: ext z6.b, z6.b, z2.b, #8 +; CHECK-NEXT: stp q19, q3, [x1, #160] ; CHECK-NEXT: uunpklo z2.d, z2.s +; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 ; CHECK-NEXT: stp q16, q0, [x1, #32] +; CHECK-NEXT: ext z6.b, z6.b, z0.b, #8 +; CHECK-NEXT: stp q20, q1, [x1, #192] ; CHECK-NEXT: stp q18, q5, [x1, #64] +; CHECK-NEXT: uunpklo z1.d, z4.s ; CHECK-NEXT: uunpklo 
z3.d, z6.s ; CHECK-NEXT: stp q7, q1, [x1, #224] ; CHECK-NEXT: stp q2, q3, [x1, #96] @@ -2485,7 +2485,7 @@ define void @zext_v16i16_v16i32(ptr %in, ptr %out) { ; CHECK-NEXT: uunpklo z2.s, z0.h ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: uunpklo z3.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: stp q2, q0, [x1, #32] @@ -2611,10 +2611,10 @@ define void @zext_v8i16_v8i64(<8 x i16> %a, ptr %out) { ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: uunpklo z2.d, z1.s -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 -; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: uunpklo z3.d, z0.s ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: uunpklo z1.d, z1.s ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: stp q2, q1, [x0] ; CHECK-NEXT: stp q3, q0, [x0, #32] @@ -2662,23 +2662,23 @@ define void @zext_v16i16_v16i64(ptr %in, ptr %out) { ; CHECK-NEXT: uunpklo z2.s, z0.h ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: uunpklo z3.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: uunpklo z4.d, z2.s -; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 ; CHECK-NEXT: uunpklo z5.d, z3.s -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 ; CHECK-NEXT: uunpklo z6.d, z0.s ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z2.d, z2.s ; CHECK-NEXT: uunpklo z7.d, z1.s -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: uunpklo z2.d, z2.s ; CHECK-NEXT: uunpklo z3.d, z3.s +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: uunpklo z1.d, z1.s -; CHECK-NEXT: stp q4, q2, [x1, #64] ; CHECK-NEXT: stp q5, q3, [x1] +; CHECK-NEXT: stp q4, q2, [x1, #64] ; 
CHECK-NEXT: stp q6, q0, [x1, #96] ; CHECK-NEXT: stp q7, q1, [x1, #32] ; CHECK-NEXT: ret @@ -2816,7 +2816,7 @@ define void @zext_v8i32_v8i64(ptr %in, ptr %out) { ; CHECK-NEXT: uunpklo z2.d, z0.s ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: uunpklo z3.d, z1.s -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: uunpklo z1.d, z1.s ; CHECK-NEXT: stp q2, q0, [x1, #32] diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll index 9497ec88e57b4..bffef1352e44f 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll @@ -68,8 +68,8 @@ define <8 x i8> @srem_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: sunpklo z4.s, z2.h ; CHECK-NEXT: sunpklo z5.s, z3.h -; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 ; CHECK-NEXT: sunpklo z2.s, z2.h ; CHECK-NEXT: sunpklo z3.s, z3.h ; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s @@ -146,8 +146,8 @@ define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: sunpklo z4.s, z2.h ; CHECK-NEXT: sunpklo z5.s, z3.h -; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 ; CHECK-NEXT: sunpklo z2.s, z2.h ; CHECK-NEXT: sunpklo z3.s, z3.h ; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s @@ -155,14 +155,14 @@ define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: ext z5.b, z5.b, z0.b, #8 ; CHECK-NEXT: sunpklo z5.h, z5.b ; CHECK-NEXT: sunpklo z7.s, z5.h -; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 +; CHECK-NEXT: ext z5.b, z5.b, z0.b, #8 ; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s ; 
CHECK-NEXT: mov z3.d, z1.d ; CHECK-NEXT: sunpklo z5.s, z5.h -; CHECK-NEXT: ext z3.b, z3.b, z1.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 ; CHECK-NEXT: sunpklo z3.h, z3.b ; CHECK-NEXT: sunpklo z6.s, z3.h -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 ; CHECK-NEXT: sunpklo z3.s, z3.h ; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z7.s ; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z5.s @@ -283,8 +283,8 @@ define void @srem_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: sunpklo z4.h, z0.b ; CHECK-NEXT: sunpklo z2.s, z3.h ; CHECK-NEXT: sunpklo z5.s, z4.h -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 -; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 +; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 ; CHECK-NEXT: sunpklo z3.s, z3.h ; CHECK-NEXT: sunpklo z4.s, z4.h ; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z5.s @@ -292,14 +292,14 @@ define void @srem_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: sdiv z5.s, p0/m, z5.s, z3.s ; CHECK-NEXT: mov z3.d, z1.d ; CHECK-NEXT: mov z4.d, z0.d -; CHECK-NEXT: ext z3.b, z3.b, z1.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 ; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 ; CHECK-NEXT: sunpklo z7.h, z3.b ; CHECK-NEXT: sunpklo z16.h, z4.b ; CHECK-NEXT: sunpklo z3.s, z7.h ; CHECK-NEXT: sunpklo z4.s, z16.h -; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8 -; CHECK-NEXT: ext z16.b, z16.b, z16.b, #8 +; CHECK-NEXT: ext z7.b, z7.b, z0.b, #8 +; CHECK-NEXT: ext z16.b, z16.b, z0.b, #8 ; CHECK-NEXT: sunpklo z7.s, z7.h ; CHECK-NEXT: movprfx z6, z4 ; CHECK-NEXT: sdiv z6.s, p0/m, z6.s, z3.s @@ -311,23 +311,23 @@ define void @srem_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: sdivr z7.s, p0/m, z7.s, z16.s ; CHECK-NEXT: sunpklo z19.s, z17.h ; CHECK-NEXT: sunpklo z20.s, z18.h -; CHECK-NEXT: ext z17.b, z17.b, z17.b, #8 -; CHECK-NEXT: ext z18.b, z18.b, z18.b, #8 +; CHECK-NEXT: ext z17.b, z17.b, z0.b, #8 +; CHECK-NEXT: ext z18.b, z18.b, z0.b, #8 ; CHECK-NEXT: sunpklo z17.s, z17.h ; CHECK-NEXT: sunpklo z18.s, z18.h ; CHECK-NEXT: sdivr z19.s, p0/m, z19.s, z20.s ; CHECK-NEXT: 
mov z20.d, z3.d -; CHECK-NEXT: ext z20.b, z20.b, z3.b, #8 +; CHECK-NEXT: ext z20.b, z20.b, z0.b, #8 ; CHECK-NEXT: sunpklo z20.h, z20.b ; CHECK-NEXT: sunpklo z22.s, z20.h -; CHECK-NEXT: ext z20.b, z20.b, z20.b, #8 +; CHECK-NEXT: ext z20.b, z20.b, z0.b, #8 ; CHECK-NEXT: sdivr z17.s, p0/m, z17.s, z18.s ; CHECK-NEXT: mov z18.d, z4.d ; CHECK-NEXT: sunpklo z20.s, z20.h -; CHECK-NEXT: ext z18.b, z18.b, z4.b, #8 +; CHECK-NEXT: ext z18.b, z18.b, z0.b, #8 ; CHECK-NEXT: sunpklo z18.h, z18.b ; CHECK-NEXT: sunpklo z21.s, z18.h -; CHECK-NEXT: ext z18.b, z18.b, z18.b, #8 +; CHECK-NEXT: ext z18.b, z18.b, z0.b, #8 ; CHECK-NEXT: sunpklo z18.s, z18.h ; CHECK-NEXT: sdivr z21.s, p0/m, z21.s, z22.s ; CHECK-NEXT: uzp1 z22.h, z2.h, z2.h @@ -595,7 +595,7 @@ define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: mov z3.d, z1.d ; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 -; CHECK-NEXT: ext z3.b, z3.b, z1.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 ; CHECK-NEXT: sunpklo z4.s, z4.h ; CHECK-NEXT: sunpklo z3.s, z3.h ; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z4.s @@ -675,15 +675,15 @@ define void @srem_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: sunpklo z6.s, z3.h ; CHECK-NEXT: mov z7.d, z3.d ; CHECK-NEXT: sunpklo z16.s, z16.h -; CHECK-NEXT: ext z7.b, z7.b, z3.b, #8 +; CHECK-NEXT: ext z7.b, z7.b, z0.b, #8 ; CHECK-NEXT: sunpklo z7.s, z7.h ; CHECK-NEXT: sdivr z5.s, p0/m, z5.s, z6.s ; CHECK-NEXT: mov z6.d, z4.d -; CHECK-NEXT: ext z6.b, z6.b, z4.b, #8 +; CHECK-NEXT: ext z6.b, z6.b, z0.b, #8 ; CHECK-NEXT: sunpklo z6.s, z6.h ; CHECK-NEXT: sdivr z6.s, p0/m, z6.s, z7.s ; CHECK-NEXT: mov z7.d, z1.d -; CHECK-NEXT: ext z7.b, z7.b, z1.b, #8 +; CHECK-NEXT: ext z7.b, z7.b, z0.b, #8 ; CHECK-NEXT: sunpklo z7.s, z7.h ; CHECK-NEXT: sdivr z7.s, p0/m, z7.s, z16.s ; CHECK-NEXT: uzp1 z16.h, z5.h, z5.h @@ -1118,8 +1118,8 @@ define <8 x i8> @urem_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: uunpklo z4.s, z2.h ; CHECK-NEXT: uunpklo 
z5.s, z3.h -; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 ; CHECK-NEXT: uunpklo z2.s, z2.h ; CHECK-NEXT: uunpklo z3.s, z3.h ; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s @@ -1196,8 +1196,8 @@ define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: uunpklo z4.s, z2.h ; CHECK-NEXT: uunpklo z5.s, z3.h -; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8 -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 ; CHECK-NEXT: uunpklo z2.s, z2.h ; CHECK-NEXT: uunpklo z3.s, z3.h ; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s @@ -1205,14 +1205,14 @@ define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: ext z5.b, z5.b, z0.b, #8 ; CHECK-NEXT: uunpklo z5.h, z5.b ; CHECK-NEXT: uunpklo z7.s, z5.h -; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8 +; CHECK-NEXT: ext z5.b, z5.b, z0.b, #8 ; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: mov z3.d, z1.d ; CHECK-NEXT: uunpklo z5.s, z5.h -; CHECK-NEXT: ext z3.b, z3.b, z1.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 ; CHECK-NEXT: uunpklo z3.h, z3.b ; CHECK-NEXT: uunpklo z6.s, z3.h -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 ; CHECK-NEXT: uunpklo z3.s, z3.h ; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z7.s ; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z5.s @@ -1333,8 +1333,8 @@ define void @urem_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: uunpklo z4.h, z0.b ; CHECK-NEXT: uunpklo z2.s, z3.h ; CHECK-NEXT: uunpklo z5.s, z4.h -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 -; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 +; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 ; CHECK-NEXT: uunpklo z3.s, z3.h ; CHECK-NEXT: uunpklo z4.s, z4.h ; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z5.s @@ -1342,14 +1342,14 @@ define void @urem_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: udiv z5.s, p0/m, z5.s, z3.s ; 
CHECK-NEXT: mov z3.d, z1.d ; CHECK-NEXT: mov z4.d, z0.d -; CHECK-NEXT: ext z3.b, z3.b, z1.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 ; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 ; CHECK-NEXT: uunpklo z7.h, z3.b ; CHECK-NEXT: uunpklo z16.h, z4.b ; CHECK-NEXT: uunpklo z3.s, z7.h ; CHECK-NEXT: uunpklo z4.s, z16.h -; CHECK-NEXT: ext z7.b, z7.b, z7.b, #8 -; CHECK-NEXT: ext z16.b, z16.b, z16.b, #8 +; CHECK-NEXT: ext z7.b, z7.b, z0.b, #8 +; CHECK-NEXT: ext z16.b, z16.b, z0.b, #8 ; CHECK-NEXT: uunpklo z7.s, z7.h ; CHECK-NEXT: movprfx z6, z4 ; CHECK-NEXT: udiv z6.s, p0/m, z6.s, z3.s @@ -1361,23 +1361,23 @@ define void @urem_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: udivr z7.s, p0/m, z7.s, z16.s ; CHECK-NEXT: uunpklo z19.s, z17.h ; CHECK-NEXT: uunpklo z20.s, z18.h -; CHECK-NEXT: ext z17.b, z17.b, z17.b, #8 -; CHECK-NEXT: ext z18.b, z18.b, z18.b, #8 +; CHECK-NEXT: ext z17.b, z17.b, z0.b, #8 +; CHECK-NEXT: ext z18.b, z18.b, z0.b, #8 ; CHECK-NEXT: uunpklo z17.s, z17.h ; CHECK-NEXT: uunpklo z18.s, z18.h ; CHECK-NEXT: udivr z19.s, p0/m, z19.s, z20.s ; CHECK-NEXT: mov z20.d, z3.d -; CHECK-NEXT: ext z20.b, z20.b, z3.b, #8 +; CHECK-NEXT: ext z20.b, z20.b, z0.b, #8 ; CHECK-NEXT: uunpklo z20.h, z20.b ; CHECK-NEXT: uunpklo z22.s, z20.h -; CHECK-NEXT: ext z20.b, z20.b, z20.b, #8 +; CHECK-NEXT: ext z20.b, z20.b, z0.b, #8 ; CHECK-NEXT: udivr z17.s, p0/m, z17.s, z18.s ; CHECK-NEXT: mov z18.d, z4.d ; CHECK-NEXT: uunpklo z20.s, z20.h -; CHECK-NEXT: ext z18.b, z18.b, z4.b, #8 +; CHECK-NEXT: ext z18.b, z18.b, z0.b, #8 ; CHECK-NEXT: uunpklo z18.h, z18.b ; CHECK-NEXT: uunpklo z21.s, z18.h -; CHECK-NEXT: ext z18.b, z18.b, z18.b, #8 +; CHECK-NEXT: ext z18.b, z18.b, z0.b, #8 ; CHECK-NEXT: uunpklo z18.s, z18.h ; CHECK-NEXT: udivr z21.s, p0/m, z21.s, z22.s ; CHECK-NEXT: uzp1 z22.h, z2.h, z2.h @@ -1645,7 +1645,7 @@ define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: mov z3.d, z1.d ; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 -; CHECK-NEXT: ext 
z3.b, z3.b, z1.b, #8 +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 ; CHECK-NEXT: uunpklo z4.s, z4.h ; CHECK-NEXT: uunpklo z3.s, z3.h ; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z4.s @@ -1725,15 +1725,15 @@ define void @urem_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: uunpklo z6.s, z3.h ; CHECK-NEXT: mov z7.d, z3.d ; CHECK-NEXT: uunpklo z16.s, z16.h -; CHECK-NEXT: ext z7.b, z7.b, z3.b, #8 +; CHECK-NEXT: ext z7.b, z7.b, z0.b, #8 ; CHECK-NEXT: uunpklo z7.s, z7.h ; CHECK-NEXT: udivr z5.s, p0/m, z5.s, z6.s ; CHECK-NEXT: mov z6.d, z4.d -; CHECK-NEXT: ext z6.b, z6.b, z4.b, #8 +; CHECK-NEXT: ext z6.b, z6.b, z0.b, #8 ; CHECK-NEXT: uunpklo z6.s, z6.h ; CHECK-NEXT: udivr z6.s, p0/m, z6.s, z7.s ; CHECK-NEXT: mov z7.d, z1.d -; CHECK-NEXT: ext z7.b, z7.b, z1.b, #8 +; CHECK-NEXT: ext z7.b, z7.b, z0.b, #8 ; CHECK-NEXT: uunpklo z7.s, z7.h ; CHECK-NEXT: udivr z7.s, p0/m, z7.s, z16.s ; CHECK-NEXT: uzp1 z16.h, z5.h, z5.h diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll index 7df362826d052..8b0bb7a7d4e15 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll @@ -309,11 +309,11 @@ define void @ucvtf_v16i16_v16f32(ptr %a, ptr %b) { ; CHECK-NEXT: uunpklo z2.s, z0.h ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: uunpklo z3.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: ucvtf z2.s, p0/m, z2.s ; CHECK-NEXT: ucvtf z3.s, p0/m, z3.s +; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s ; CHECK-NEXT: ucvtf z1.s, p0/m, z1.s ; CHECK-NEXT: stp q2, q0, [x1, #32] @@ -485,14 +485,14 @@ define void @ucvtf_v8i16_v8f64(ptr %a, ptr %b) { ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: uunpklo z2.d, z1.s -; CHECK-NEXT: ext z1.b, 
z1.b, z1.b, #8 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: uunpklo z3.d, z0.s ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: uunpklo z1.d, z1.s ; CHECK-NEXT: ucvtf z2.d, p0/m, z2.d +; CHECK-NEXT: uunpklo z1.d, z1.s ; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: ucvtf z1.d, p0/m, z1.d ; CHECK-NEXT: ucvtf z3.d, p0/m, z3.d +; CHECK-NEXT: ucvtf z1.d, p0/m, z1.d ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d ; CHECK-NEXT: stp q2, q1, [x1] ; CHECK-NEXT: stp q3, q0, [x1, #32] @@ -546,39 +546,39 @@ define void @ucvtf_v16i16_v16f64(ptr %a, ptr %b) { ; CHECK-NEXT: mov z3.d, z1.d ; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: uunpklo z1.d, z1.s ; CHECK-NEXT: uunpklo z2.s, z2.h -; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 ; CHECK-NEXT: uunpklo z5.d, z0.s ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: ext z4.b, z4.b, z1.b, #8 -; CHECK-NEXT: uunpklo z1.d, z1.s -; CHECK-NEXT: mov z6.d, z2.d -; CHECK-NEXT: mov z7.d, z3.d -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: uunpklo z4.d, z4.s -; CHECK-NEXT: ucvtf z5.d, p0/m, z5.d ; CHECK-NEXT: ucvtf z1.d, p0/m, z1.d -; CHECK-NEXT: ext z6.b, z6.b, z2.b, #8 -; CHECK-NEXT: ext z7.b, z7.b, z3.b, #8 +; CHECK-NEXT: uunpklo z3.s, z3.h +; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: mov z6.d, z2.d ; CHECK-NEXT: uunpklo z2.d, z2.s +; CHECK-NEXT: ucvtf z5.d, p0/m, z5.d +; CHECK-NEXT: uunpklo z4.d, z4.s +; CHECK-NEXT: mov z7.d, z3.d +; CHECK-NEXT: ext z6.b, z6.b, z0.b, #8 ; CHECK-NEXT: uunpklo z3.d, z3.s +; CHECK-NEXT: ucvtf z2.d, p0/m, z2.d +; CHECK-NEXT: ext z7.b, z7.b, z0.b, #8 +; CHECK-NEXT: uunpklo z6.d, z6.s ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d ; CHECK-NEXT: ucvtf z4.d, p0/m, z4.d -; CHECK-NEXT: uunpklo z6.d, z6.s +; CHECK-NEXT: ucvtf z3.d, p0/m, z3.d ; CHECK-NEXT: uunpklo z7.d, z7.s -; CHECK-NEXT: ucvtf z2.d, p0/m, 
z2.d ; CHECK-NEXT: stp q5, q0, [x1, #64] -; CHECK-NEXT: ucvtf z3.d, p0/m, z3.d ; CHECK-NEXT: stp q1, q4, [x1] ; CHECK-NEXT: movprfx z1, z6 ; CHECK-NEXT: ucvtf z1.d, p0/m, z6.d ; CHECK-NEXT: movprfx z0, z7 ; CHECK-NEXT: ucvtf z0.d, p0/m, z7.d -; CHECK-NEXT: stp q3, q0, [x1, #32] ; CHECK-NEXT: stp q2, q1, [x1, #96] +; CHECK-NEXT: stp q3, q0, [x1, #32] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: ucvtf_v16i16_v16f64: @@ -1038,11 +1038,11 @@ define void @ucvtf_v8i32_v8f64(ptr %a, ptr %b) { ; CHECK-NEXT: uunpklo z2.d, z0.s ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: uunpklo z3.d, z1.s -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: uunpklo z1.d, z1.s ; CHECK-NEXT: ucvtf z2.d, p0/m, z2.d ; CHECK-NEXT: ucvtf z3.d, p0/m, z3.d +; CHECK-NEXT: uunpklo z1.d, z1.s ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d ; CHECK-NEXT: ucvtf z1.d, p0/m, z1.d ; CHECK-NEXT: stp q2, q0, [x1, #32] @@ -1722,11 +1722,11 @@ define void @scvtf_v16i16_v16f32(ptr %a, ptr %b) { ; CHECK-NEXT: sunpklo z2.s, z0.h ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: sunpklo z3.s, z1.h -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: scvtf z2.s, p0/m, z2.s ; CHECK-NEXT: scvtf z3.s, p0/m, z3.s +; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: scvtf z0.s, p0/m, z0.s ; CHECK-NEXT: scvtf z1.s, p0/m, z1.s ; CHECK-NEXT: stp q2, q0, [x1, #32] @@ -1876,14 +1876,14 @@ define void @scvtf_v8i16_v8f64(ptr %a, ptr %b) { ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: sunpklo z2.d, z1.s -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: sunpklo z3.d, z0.s ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: sunpklo z1.d, z1.s ; CHECK-NEXT: scvtf z2.d, p0/m, z2.d +; CHECK-NEXT: sunpklo z1.d, z1.s ; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: scvtf z1.d, 
p0/m, z1.d ; CHECK-NEXT: scvtf z3.d, p0/m, z3.d +; CHECK-NEXT: scvtf z1.d, p0/m, z1.d ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d ; CHECK-NEXT: stp q2, q1, [x1] ; CHECK-NEXT: stp q3, q0, [x1, #32] @@ -1937,39 +1937,39 @@ define void @scvtf_v16i16_v16f64(ptr %a, ptr %b) { ; CHECK-NEXT: mov z3.d, z1.d ; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 -; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8 ; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: sunpklo z1.d, z1.s ; CHECK-NEXT: sunpklo z2.s, z2.h -; CHECK-NEXT: sunpklo z3.s, z3.h +; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 ; CHECK-NEXT: sunpklo z5.d, z0.s ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 -; CHECK-NEXT: ext z4.b, z4.b, z1.b, #8 -; CHECK-NEXT: sunpklo z1.d, z1.s -; CHECK-NEXT: mov z6.d, z2.d -; CHECK-NEXT: mov z7.d, z3.d -; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: sunpklo z4.d, z4.s -; CHECK-NEXT: scvtf z5.d, p0/m, z5.d ; CHECK-NEXT: scvtf z1.d, p0/m, z1.d -; CHECK-NEXT: ext z6.b, z6.b, z2.b, #8 -; CHECK-NEXT: ext z7.b, z7.b, z3.b, #8 +; CHECK-NEXT: sunpklo z3.s, z3.h +; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 +; CHECK-NEXT: sunpklo z0.d, z0.s +; CHECK-NEXT: mov z6.d, z2.d ; CHECK-NEXT: sunpklo z2.d, z2.s +; CHECK-NEXT: scvtf z5.d, p0/m, z5.d +; CHECK-NEXT: sunpklo z4.d, z4.s +; CHECK-NEXT: mov z7.d, z3.d +; CHECK-NEXT: ext z6.b, z6.b, z0.b, #8 ; CHECK-NEXT: sunpklo z3.d, z3.s +; CHECK-NEXT: scvtf z2.d, p0/m, z2.d +; CHECK-NEXT: ext z7.b, z7.b, z0.b, #8 +; CHECK-NEXT: sunpklo z6.d, z6.s ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d ; CHECK-NEXT: scvtf z4.d, p0/m, z4.d -; CHECK-NEXT: sunpklo z6.d, z6.s +; CHECK-NEXT: scvtf z3.d, p0/m, z3.d ; CHECK-NEXT: sunpklo z7.d, z7.s -; CHECK-NEXT: scvtf z2.d, p0/m, z2.d ; CHECK-NEXT: stp q5, q0, [x1, #64] -; CHECK-NEXT: scvtf z3.d, p0/m, z3.d ; CHECK-NEXT: stp q1, q4, [x1] ; CHECK-NEXT: movprfx z1, z6 ; CHECK-NEXT: scvtf z1.d, p0/m, z6.d ; CHECK-NEXT: movprfx z0, z7 ; CHECK-NEXT: scvtf z0.d, p0/m, z7.d -; CHECK-NEXT: stp q3, q0, [x1, #32] ; 
CHECK-NEXT: stp q2, q1, [x1, #96] +; CHECK-NEXT: stp q3, q0, [x1, #32] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: scvtf_v16i16_v16f64: @@ -2334,11 +2334,11 @@ define void @scvtf_v8i32_v8f64(ptr %a, ptr %b) { ; CHECK-NEXT: sunpklo z2.d, z0.s ; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 ; CHECK-NEXT: sunpklo z3.d, z1.s -; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8 +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #8 ; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: sunpklo z1.d, z1.s ; CHECK-NEXT: scvtf z2.d, p0/m, z2.d ; CHECK-NEXT: scvtf z3.d, p0/m, z3.d +; CHECK-NEXT: sunpklo z1.d, z1.s ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d ; CHECK-NEXT: scvtf z1.d, p0/m, z1.d ; CHECK-NEXT: stp q2, q0, [x1, #32] @@ -2390,35 +2390,35 @@ define void @scvtf_v16i32_v16f64(ptr %a, ptr %b) { ; CHECK-NEXT: ldp q5, q3, [x0] ; CHECK-NEXT: mov z2.d, z0.d ; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: sunpklo z1.d, z1.s ; CHECK-NEXT: mov z6.d, z3.d ; CHECK-NEXT: mov z7.d, z5.d -; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 -; CHECK-NEXT: ext z4.b, z4.b, z1.b, #8 -; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: sunpklo z1.d, z1.s -; CHECK-NEXT: ext z6.b, z6.b, z3.b, #8 -; CHECK-NEXT: ext z7.b, z7.b, z5.b, #8 ; CHECK-NEXT: sunpklo z3.d, z3.s ; CHECK-NEXT: sunpklo z5.d, z5.s +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #8 +; CHECK-NEXT: sunpklo z0.d, z0.s +; CHECK-NEXT: scvtf z1.d, p0/m, z1.d +; CHECK-NEXT: scvtf z3.d, p0/m, z3.d ; CHECK-NEXT: sunpklo z2.d, z2.s -; CHECK-NEXT: sunpklo z4.d, z4.s +; CHECK-NEXT: ext z4.b, z4.b, z0.b, #8 +; CHECK-NEXT: ext z6.b, z6.b, z0.b, #8 +; CHECK-NEXT: ext z7.b, z7.b, z0.b, #8 ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d +; CHECK-NEXT: sunpklo z4.d, z4.s ; CHECK-NEXT: sunpklo z6.d, z6.s -; CHECK-NEXT: sunpklo z7.d, z7.s -; CHECK-NEXT: scvtf z1.d, p0/m, z1.d -; CHECK-NEXT: scvtf z3.d, p0/m, z3.d ; CHECK-NEXT: scvtf z2.d, p0/m, z2.d +; CHECK-NEXT: sunpklo z7.d, z7.s ; CHECK-NEXT: scvtf z4.d, p0/m, z4.d -; CHECK-NEXT: stp q1, q4, [x1, #64] -; CHECK-NEXT: movprfx z1, z5 -; CHECK-NEXT: scvtf z1.d, p0/m, z5.d ; 
CHECK-NEXT: stp q0, q2, [x1, #96] ; CHECK-NEXT: movprfx z0, z6 ; CHECK-NEXT: scvtf z0.d, p0/m, z6.d ; CHECK-NEXT: movprfx z2, z7 ; CHECK-NEXT: scvtf z2.d, p0/m, z7.d -; CHECK-NEXT: stp q1, q2, [x1] ; CHECK-NEXT: stp q3, q0, [x1, #32] +; CHECK-NEXT: stp q1, q4, [x1, #64] +; CHECK-NEXT: movprfx z1, z5 +; CHECK-NEXT: scvtf z1.d, p0/m, z5.d +; CHECK-NEXT: stp q1, q2, [x1] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: scvtf_v16i32_v16f64: diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll index 688537704a6f7..93d6da19c0c33 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-reductions.ll @@ -37,32 +37,32 @@ define i32 @reduce_uaddv_v16i8(<32 x i8> %a) { ; STREAMING-SVE-LABEL: reduce_uaddv_v16i8: ; STREAMING-SVE: // %bb.0: ; STREAMING-SVE-NEXT: // kill: def $q1 killed $q1 def $z1 -; STREAMING-SVE-NEXT: uunpklo z2.h, z1.b ; STREAMING-SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; STREAMING-SVE-NEXT: uunpklo z2.h, z1.b ; STREAMING-SVE-NEXT: uunpklo z3.h, z0.b ; STREAMING-SVE-NEXT: ptrue p0.s, vl4 -; STREAMING-SVE-NEXT: ext z1.b, z1.b, z1.b, #8 +; STREAMING-SVE-NEXT: ext z1.b, z1.b, z0.b, #8 ; STREAMING-SVE-NEXT: ext z0.b, z0.b, z0.b, #8 ; STREAMING-SVE-NEXT: uunpklo z1.h, z1.b ; STREAMING-SVE-NEXT: uunpklo z0.h, z0.b ; STREAMING-SVE-NEXT: uunpklo z4.s, z2.h -; STREAMING-SVE-NEXT: ext z2.b, z2.b, z2.b, #8 ; STREAMING-SVE-NEXT: uunpklo z6.s, z3.h -; STREAMING-SVE-NEXT: ext z3.b, z3.b, z3.b, #8 ; STREAMING-SVE-NEXT: mov z5.d, z1.d +; STREAMING-SVE-NEXT: ext z2.b, z2.b, z0.b, #8 +; STREAMING-SVE-NEXT: ext z3.b, z3.b, z0.b, #8 ; STREAMING-SVE-NEXT: uunpklo z7.s, z0.h +; STREAMING-SVE-NEXT: uunpklo z1.s, z1.h +; STREAMING-SVE-NEXT: add z4.s, z6.s, z4.s +; STREAMING-SVE-NEXT: ext z5.b, z5.b, z0.b, #8 ; STREAMING-SVE-NEXT: ext z0.b, z0.b, z0.b, #8 ; STREAMING-SVE-NEXT: uunpklo 
z2.s, z2.h ; STREAMING-SVE-NEXT: uunpklo z3.s, z3.h -; STREAMING-SVE-NEXT: add z4.s, z6.s, z4.s -; STREAMING-SVE-NEXT: ext z5.b, z5.b, z1.b, #8 -; STREAMING-SVE-NEXT: uunpklo z1.s, z1.h +; STREAMING-SVE-NEXT: add z1.s, z7.s, z1.s +; STREAMING-SVE-NEXT: uunpklo z5.s, z5.h ; STREAMING-SVE-NEXT: uunpklo z0.s, z0.h ; STREAMING-SVE-NEXT: add z2.s, z3.s, z2.s -; STREAMING-SVE-NEXT: uunpklo z5.s, z5.h -; STREAMING-SVE-NEXT: add z1.s, z7.s, z1.s -; STREAMING-SVE-NEXT: add z0.s, z0.s, z5.s ; STREAMING-SVE-NEXT: add z1.s, z4.s, z1.s +; STREAMING-SVE-NEXT: add z0.s, z0.s, z5.s ; STREAMING-SVE-NEXT: add z0.s, z2.s, z0.s ; STREAMING-SVE-NEXT: add z0.s, z1.s, z0.s ; STREAMING-SVE-NEXT: uaddv d0, p0, z0.s @@ -104,32 +104,32 @@ define i32 @reduce_saddv_v16i8(<32 x i8> %a) { ; STREAMING-SVE-LABEL: reduce_saddv_v16i8: ; STREAMING-SVE: // %bb.0: ; STREAMING-SVE-NEXT: // kill: def $q1 killed $q1 def $z1 -; STREAMING-SVE-NEXT: sunpklo z2.h, z1.b ; STREAMING-SVE-NEXT: // kill: def $q0 killed $q0 def $z0 +; STREAMING-SVE-NEXT: sunpklo z2.h, z1.b ; STREAMING-SVE-NEXT: sunpklo z3.h, z0.b ; STREAMING-SVE-NEXT: ptrue p0.s, vl4 -; STREAMING-SVE-NEXT: ext z1.b, z1.b, z1.b, #8 +; STREAMING-SVE-NEXT: ext z1.b, z1.b, z0.b, #8 ; STREAMING-SVE-NEXT: ext z0.b, z0.b, z0.b, #8 ; STREAMING-SVE-NEXT: sunpklo z1.h, z1.b ; STREAMING-SVE-NEXT: sunpklo z0.h, z0.b ; STREAMING-SVE-NEXT: sunpklo z4.s, z2.h -; STREAMING-SVE-NEXT: ext z2.b, z2.b, z2.b, #8 ; STREAMING-SVE-NEXT: sunpklo z6.s, z3.h -; STREAMING-SVE-NEXT: ext z3.b, z3.b, z3.b, #8 ; STREAMING-SVE-NEXT: mov z5.d, z1.d +; STREAMING-SVE-NEXT: ext z2.b, z2.b, z0.b, #8 +; STREAMING-SVE-NEXT: ext z3.b, z3.b, z0.b, #8 ; STREAMING-SVE-NEXT: sunpklo z7.s, z0.h +; STREAMING-SVE-NEXT: sunpklo z1.s, z1.h +; STREAMING-SVE-NEXT: add z4.s, z6.s, z4.s +; STREAMING-SVE-NEXT: ext z5.b, z5.b, z0.b, #8 ; STREAMING-SVE-NEXT: ext z0.b, z0.b, z0.b, #8 ; STREAMING-SVE-NEXT: sunpklo z2.s, z2.h ; STREAMING-SVE-NEXT: sunpklo z3.s, z3.h -; STREAMING-SVE-NEXT: add z4.s, z6.s, 
z4.s -; STREAMING-SVE-NEXT: ext z5.b, z5.b, z1.b, #8 -; STREAMING-SVE-NEXT: sunpklo z1.s, z1.h +; STREAMING-SVE-NEXT: add z1.s, z7.s, z1.s +; STREAMING-SVE-NEXT: sunpklo z5.s, z5.h ; STREAMING-SVE-NEXT: sunpklo z0.s, z0.h ; STREAMING-SVE-NEXT: add z2.s, z3.s, z2.s -; STREAMING-SVE-NEXT: sunpklo z5.s, z5.h -; STREAMING-SVE-NEXT: add z1.s, z7.s, z1.s -; STREAMING-SVE-NEXT: add z0.s, z0.s, z5.s ; STREAMING-SVE-NEXT: add z1.s, z4.s, z1.s +; STREAMING-SVE-NEXT: add z0.s, z0.s, z5.s ; STREAMING-SVE-NEXT: add z0.s, z2.s, z0.s ; STREAMING-SVE-NEXT: add z0.s, z1.s, z0.s ; STREAMING-SVE-NEXT: uaddv d0, p0, z0.s diff --git a/llvm/test/CodeGen/AArch64/sve2-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve2-fixed-length-extract-subvector.ll new file mode 100644 index 0000000000000..b96fad8239190 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2-fixed-length-extract-subvector.ll @@ -0,0 +1,341 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=aarch64 -mattr=+sve2,+bf16 -verify-machineinstrs %s -o - | FileCheck %s + +; This is a similar test to sve-fixed-length-extract-subvector.ll, but this one +; uses SVE2 and extracts multiple subvectors at once to ensure that the ext +; instruction is used (instead of just using smaller ld/st instructions with an +; offset). + +; Test the patterns selecting EXT_ZZI and EXT_ZZI_B for fixed-length vectors +; when SVE2 is available. 
+ +; +; Use NEON for 128-bit vectors +; + +define void @extract_v4i32_halves(ptr %in, ptr %out, ptr %out2) { +; CHECK-LABEL: extract_v4i32_halves: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: str d1, [x1] +; CHECK-NEXT: str d0, [x2] +; CHECK-NEXT: ret +entry: + %b = load <4 x i32>, ptr %in + %hi = shufflevector <4 x i32> %b, <4 x i32> poison, <2 x i32> + store <2 x i32> %hi, ptr %out + %lo = shufflevector <4 x i32> %b, <4 x i32> poison, <2 x i32> + store <2 x i32> %lo, ptr %out2 + ret void +} + +define void @extract_v4i32_half_unaligned(ptr %in, ptr %out) { +; CHECK-LABEL: extract_v4i32_half_unaligned: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #4 +; CHECK-NEXT: str d0, [x1] +; CHECK-NEXT: ret +entry: + %b = load <4 x i32>, ptr %in + %d = shufflevector <4 x i32> %b, <4 x i32> poison, <2 x i32> + store <2 x i32> %d, ptr %out + ret void +} + +; +; Use SVE for 256-bit vectors +; + +define void @extract_v4i64_halves(ptr %in, ptr %out, ptr %out2) vscale_range(2,2) { +; CHECK-LABEL: extract_v4i64_halves: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16 +; CHECK-NEXT: str q1, [x1] +; CHECK-NEXT: str q0, [x2] +; CHECK-NEXT: ret +entry: + %b = load <4 x i64>, ptr %in + %hi = shufflevector <4 x i64> %b, <4 x i64> poison, <2 x i32> + store <2 x i64> %hi, ptr %out + %lo = shufflevector <4 x i64> %b, <4 x i64> poison, <2 x i32> + store <2 x i64> %lo, ptr %out2 + ret void +} + +define void @extract_v4double_halves(ptr %in, ptr %out, ptr %out2) vscale_range(2,2) { +; CHECK-LABEL: extract_v4double_halves: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16 +; CHECK-NEXT: str q1, [x1] +; CHECK-NEXT: str q0, [x2] +; CHECK-NEXT: ret +entry: + %b = load <4 x double>, ptr %in + %hi = shufflevector <4 x 
double> %b, <4 x double> poison, <2 x i32> + store <2 x double> %hi, ptr %out + %lo = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> + store <2 x double> %lo, ptr %out2 + ret void +} + +define void @extract_v8i32_halves(ptr %in, ptr %out, ptr %out2) vscale_range(2,2) { +; CHECK-LABEL: extract_v8i32_halves: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16 +; CHECK-NEXT: str q1, [x1] +; CHECK-NEXT: str q0, [x2] +; CHECK-NEXT: ret +entry: + %b = load <8 x i32>, ptr %in + %hi = shufflevector <8 x i32> %b, <8 x i32> poison, <4 x i32> + store <4 x i32> %hi, ptr %out + %lo = shufflevector <8 x i32> %b, <8 x i32> poison, <4 x i32> + store <4 x i32> %lo, ptr %out2 + ret void +} + +; Note that both the vector.extract intrinsic and the shufflevector from +; the previous example get detected as an extract_subvector ISD node in +; SelectionDAG. We'll test both cases for v8i32 for the sake of completeness, +; but other types will just be tested using shufflevector. 
+define void @extract_v8i32_halves_intrinsic(ptr %in, ptr %out, ptr %out2) vscale_range(2,2) { +; CHECK-LABEL: extract_v8i32_halves_intrinsic: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16 +; CHECK-NEXT: str q1, [x1] +; CHECK-NEXT: str q0, [x2] +; CHECK-NEXT: ret +entry: + %b = load <8 x i32>, ptr %in + %hi = call <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32> %b, i64 4) + store <4 x i32> %hi, ptr %out + %lo = call <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32> %b, i64 0) + store <4 x i32> %lo, ptr %out2 + ret void +} + +define void @extract_v8float_halves(ptr %in, ptr %out, ptr %out2) vscale_range(2,2) { +; CHECK-LABEL: extract_v8float_halves: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16 +; CHECK-NEXT: str q1, [x1] +; CHECK-NEXT: str q0, [x2] +; CHECK-NEXT: ret +entry: + %b = load <8 x float>, ptr %in + %hi = shufflevector <8 x float> %b, <8 x float> poison, <4 x i32> + store <4 x float> %hi, ptr %out + %lo = shufflevector <8 x float> %b, <8 x float> poison, <4 x i32> + store <4 x float> %lo, ptr %out2 + ret void +} + +define void @extract_v8i32_half_unaligned(<8 x i32> %unused, ptr %in, ptr %out) vscale_range(2,2) { +; CHECK-LABEL: extract_v8i32_half_unaligned: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16 +; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #8 +; CHECK-NEXT: str q0, [x1] +; CHECK-NEXT: ret +entry: + %b = load <8 x i32>, ptr %in + %d = shufflevector <8 x i32> %b, <8 x i32> poison, <4 x i32> + store <4 x i32> %d, ptr %out + ret void +} + +define void @extract_v8i32_quarters(ptr %in, ptr %out, ptr %out2, ptr %out3, ptr %out4) vscale_range(2,2) { +; CHECK-LABEL: extract_v8i32_quarters: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z2.d, z0.d 
+; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16 +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #24 +; CHECK-NEXT: str d1, [x1] +; CHECK-NEXT: str d2, [x2] +; CHECK-NEXT: str d0, [x3] +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: str d0, [x4] +; CHECK-NEXT: ret +entry: + %b = load <8 x i32>, ptr %in + %hilo = shufflevector <8 x i32> %b, <8 x i32> poison, <2 x i32> + store <2 x i32> %hilo, ptr %out + %hihi = shufflevector <8 x i32> %b, <8 x i32> poison, <2 x i32> + store <2 x i32> %hihi, ptr %out2 + %lolo = shufflevector <8 x i32> %b, <8 x i32> poison, <2 x i32> + store <2 x i32> %lolo, ptr %out3 + %lohi = shufflevector <8 x i32> %b, <8 x i32> poison, <2 x i32> + store <2 x i32> %lohi, ptr %out4 + ret void +} + +define void @extract_v16i16_halves(ptr %in, ptr %out, ptr %out2) vscale_range(2,2) { +; CHECK-LABEL: extract_v16i16_halves: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16 +; CHECK-NEXT: str q1, [x1] +; CHECK-NEXT: str q0, [x2] +; CHECK-NEXT: ret +entry: + %b = load <16 x i16>, ptr %in + %hi = shufflevector <16 x i16> %b, <16 x i16> poison, <8 x i32> + store <8 x i16> %hi, ptr %out + %lo = shufflevector <16 x i16> %b, <16 x i16> poison, <8 x i32> + store <8 x i16> %lo, ptr %out2 + ret void +} + +define void @extract_v16bfloat_halves(ptr %in, ptr %out, ptr %out2) vscale_range(2,2) { +; CHECK-LABEL: extract_v16bfloat_halves: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: str q0, [x1] +; CHECK-NEXT: str q1, [x2] +; CHECK-NEXT: ret +entry: + %b = load <16 x bfloat>, ptr %in + %hi = shufflevector <16 x bfloat> %b, <16 x bfloat> poison, <8 x i32> + store <8 x bfloat> %hi, ptr %out + %lo = shufflevector <16 x bfloat> %b, <16 x bfloat> poison, <8 x i32> + store <8 x bfloat> %lo, ptr %out2 + ret void +} + +define void @extract_v16half_halves(ptr %in, ptr %out, ptr %out2) vscale_range(2,2) { +; CHECK-LABEL: extract_v16half_halves: +; CHECK: // %bb.0: // %entry +; 
CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16 +; CHECK-NEXT: str q1, [x1] +; CHECK-NEXT: str q0, [x2] +; CHECK-NEXT: ret +entry: + %b = load <16 x half>, ptr %in + %hi = shufflevector <16 x half> %b, <16 x half> poison, <8 x i32> + store <8 x half> %hi, ptr %out + %lo = shufflevector <16 x half> %b, <16 x half> poison, <8 x i32> + store <8 x half> %lo, ptr %out2 + ret void +} + +define void @extract_v32i8_halves(ptr %in, ptr %out, ptr %out2) vscale_range(2,2) { +; CHECK-LABEL: extract_v32i8_halves: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16 +; CHECK-NEXT: str q1, [x1] +; CHECK-NEXT: str q0, [x2] +; CHECK-NEXT: ret +entry: + %b = load <32 x i8>, ptr %in + %hi = shufflevector <32 x i8> %b, <32 x i8> poison, <16 x i32> + store <16 x i8> %hi, ptr %out + %lo = shufflevector <32 x i8> %b, <32 x i8> poison, <16 x i32> + store <16 x i8> %lo, ptr %out2 + ret void +} + +; +; Use SVE for 512-bit vectors +; + +define void @extract_v8i64_halves(ptr %in, ptr %out, ptr %out2) vscale_range(4,4) { +; CHECK-LABEL: extract_v8i64_halves: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ptrue p0.d, vl4 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #32 +; CHECK-NEXT: st1d { z1.d }, p0, [x1] +; CHECK-NEXT: st1d { z0.d }, p0, [x2] +; CHECK-NEXT: ret +entry: + %b = load <8 x i64>, ptr %in + %hi = shufflevector <8 x i64> %b, <8 x i64> poison, <4 x i32> + store <4 x i64> %hi, ptr %out + %lo = shufflevector <8 x i64> %b, <8 x i64> poison, <4 x i32> + store <4 x i64> %lo, ptr %out2 + ret void +} + +define void @extract_v16i32_halves(ptr %in, ptr %out, ptr %out2) vscale_range(4,4) { +; CHECK-LABEL: extract_v16i32_halves: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ptrue p0.s, vl8 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #32 +; CHECK-NEXT: st1w { z1.s }, p0, [x1] 
+; CHECK-NEXT: st1w { z0.s }, p0, [x2] +; CHECK-NEXT: ret +entry: + %b = load <16 x i32>, ptr %in + %hi = shufflevector <16 x i32> %b, <16 x i32> poison, <8 x i32> + store <8 x i32> %hi, ptr %out + %lo = shufflevector <16 x i32> %b, <16 x i32> poison, <8 x i32> + store <8 x i32> %lo, ptr %out2 + ret void +} + +define void @extract_v32i16_halves(ptr %in, ptr %out, ptr %out2) vscale_range(4,4) { +; CHECK-LABEL: extract_v32i16_halves: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ptrue p0.h, vl16 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #32 +; CHECK-NEXT: st1h { z1.h }, p0, [x1] +; CHECK-NEXT: st1h { z0.h }, p0, [x2] +; CHECK-NEXT: ret +entry: + %b = load <32 x i16>, ptr %in + %hi = shufflevector <32 x i16> %b, <32 x i16> poison, <16 x i32> + store <16 x i16> %hi, ptr %out + %lo = shufflevector <32 x i16> %b, <32 x i16> poison, <16 x i32> + store <16 x i16> %lo, ptr %out2 + ret void +} + + +define void @extract_v64i8_halves(ptr %in, ptr %out, ptr %out2) vscale_range(4,4) { +; CHECK-LABEL: extract_v64i8_halves: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ptrue p0.b, vl32 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #32 +; CHECK-NEXT: st1b { z1.b }, p0, [x1] +; CHECK-NEXT: st1b { z0.b }, p0, [x2] +; CHECK-NEXT: ret +entry: + %b = load <64 x i8>, ptr %in + %hi = shufflevector <64 x i8> %b, <64 x i8> poison, <32 x i32> + store <32 x i8> %hi, ptr %out + %lo = shufflevector <64 x i8> %b, <64 x i8> poison, <32 x i32> + store <32 x i8> %lo, ptr %out2 + ret void +} + +declare <2 x i32> @llvm.vector.extract.v2i32.v4i32(<4 x i32>, i64) +declare <2 x i32> @llvm.vector.extract.v2i32.v8i32(<8 x i32>, i64) +declare <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32>, i64)