From adb217d04d41392688ebfbd829cf6c079cddf4f5 Mon Sep 17 00:00:00 2001
From: Jonathan Cohen
Date: Sun, 1 Jun 2025 11:10:48 +0300
Subject: [PATCH 1/3] Initial commit from #142941

---
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp  | 265 +++++++++++++
 llvm/lib/Target/AArch64/AArch64InstrInfo.h    |   4 +
 .../AArch64/aarch64-combine-gather-lanes.mir  | 364 ++++++++++++++++++
 .../complex-deinterleaving-uniform-cases.ll   | 134 +++----
 llvm/test/CodeGen/AArch64/concat-vector.ll    |   5 +-
 .../AArch64/fp-maximumnum-minimumnum.ll       |  50 +--
 llvm/test/CodeGen/AArch64/fsh.ll              | 113 +++---
 llvm/test/CodeGen/AArch64/neon-dotreduce.ll   | 345 +++++++++--------
 llvm/test/CodeGen/AArch64/nontemporal.ll      |  48 +--
 9 files changed, 988 insertions(+), 340 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes.mir

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 59d4fd26f6f91..06bcaa1698aab 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -20,6 +20,7 @@
 #include "Utils/AArch64BaseInfo.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/CFIInstBuilder.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
@@ -35,6 +36,7 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
 #include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/IR/DebugInfoMetadata.h"
@@ -7349,6 +7351,9 @@ bool AArch64InstrInfo::isThroughputPattern(unsigned Pattern) const {
   case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
   case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
   case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
+  case AArch64MachineCombinerPattern::GATHER_LANE_i32:
+  case AArch64MachineCombinerPattern::GATHER_LANE_i16:
+  case AArch64MachineCombinerPattern::GATHER_LANE_i8:
     return true;
   } // end switch (Pattern)
   return false;
@@ -7389,11 +7394,252 @@ static bool getMiscPatterns(MachineInstr &Root,
   return false;
 }
 
+static bool getGatherPattern(MachineInstr &Root,
+                             SmallVectorImpl<unsigned> &Patterns,
+                             unsigned LoadLaneOpCode, unsigned NumLanes) {
+  const MachineFunction *MF = Root.getMF();
+
+  // Early exit if optimizing for size.
+  if (MF->getFunction().hasMinSize())
+    return false;
+
+  const MachineRegisterInfo &MRI = MF->getRegInfo();
+  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+
+  // The root of the pattern must load into the last lane of the vector.
+  if (Root.getOperand(2).getImm() != NumLanes - 1)
+    return false;
+
+  // Check that we have loads into all lanes except lane 0.
+  // For each load we also want to check that:
+  // 1. It has a single non-debug use (since we will be replacing the virtual
+  //    register)
+  // 2. That the addressing mode only uses a single offset register.
+  auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
+  auto Range = llvm::seq<unsigned>(1, NumLanes - 1);
+  SmallSet<unsigned, 16> RemainingLanes(Range.begin(), Range.end());
+  while (!RemainingLanes.empty() && CurrInstr &&
+         CurrInstr->getOpcode() == LoadLaneOpCode &&
+         MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg()) &&
+         CurrInstr->getNumOperands() == 4) {
+    RemainingLanes.erase(CurrInstr->getOperand(2).getImm());
+    CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
+  }
+
+  if (!RemainingLanes.empty())
+    return false;
+
+  // Match the SUBREG_TO_REG sequence.
+  if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG)
+    return false;
+
+  // Verify that the subreg to reg loads an integer into the first lane.
+  auto Lane0LoadReg = CurrInstr->getOperand(2).getReg();
+  unsigned SingleLaneSizeInBits = 128 / NumLanes;
+  if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != SingleLaneSizeInBits)
+    return false;
+
+  // Verify that it also has a single non-debug use.
+  if (!MRI.hasOneNonDBGUse(Lane0LoadReg))
+    return false;
+
+  switch (NumLanes) {
+  case 4:
+    Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i32);
+    break;
+  case 8:
+    Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i16);
+    break;
+  case 16:
+    Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i8);
+    break;
+  default:
+    llvm_unreachable("Got bad number of lanes for gather pattern.");
+  }
+
+  return true;
+}
+
+/// Search for patterns where we use LD1 instructions to load into
+/// separate lanes of a 128-bit Neon register. We can increase Memory Level
+/// Parallelism by loading into 2 Neon registers instead.
+static bool getLoadPatterns(MachineInstr &Root,
+                            SmallVectorImpl<unsigned> &Patterns) {
+
+  // The pattern searches for loads into single lanes.
+  switch (Root.getOpcode()) {
+  case AArch64::LD1i32:
+    return getGatherPattern(Root, Patterns, Root.getOpcode(), 4);
+  case AArch64::LD1i16:
+    return getGatherPattern(Root, Patterns, Root.getOpcode(), 8);
+  case AArch64::LD1i8:
+    return getGatherPattern(Root, Patterns, Root.getOpcode(), 16);
+  default:
+    return false;
+  }
+}
+
+static void
+generateGatherPattern(MachineInstr &Root,
+                      SmallVectorImpl<MachineInstr *> &InsInstrs,
+                      SmallVectorImpl<MachineInstr *> &DelInstrs,
+                      DenseMap<Register, unsigned> &InstrIdxForVirtReg,
+                      unsigned Pattern, unsigned NumLanes) {
+
+  MachineFunction &MF = *Root.getParent()->getParent();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+
+  // Gather the initial load instructions to build the pattern.
+  SmallVector<MachineInstr *> LoadToLaneInstrs;
+  MachineInstr *CurrInstr = &Root;
+  for (unsigned i = 0; i < NumLanes - 1; ++i) {
+    LoadToLaneInstrs.push_back(CurrInstr);
+    CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
+  }
+
+  // Sort the load instructions according to the lane.
+ llvm::sort(LoadToLaneInstrs, + [](const MachineInstr *A, const MachineInstr *B) { + return A->getOperand(2).getImm() > B->getOperand(2).getImm(); + }); + + MachineInstr *SubregToReg = CurrInstr; + LoadToLaneInstrs.push_back( + MRI.getUniqueVRegDef(SubregToReg->getOperand(2).getReg())); + auto LoadToLaneInstrsAscending = llvm::reverse(LoadToLaneInstrs); + + const TargetRegisterClass *FPR128RegClass = + MRI.getRegClass(Root.getOperand(0).getReg()); + + auto LoadLaneToRegister = [&](MachineInstr *OriginalInstr, + Register SrcRegister, unsigned Lane, + Register OffsetRegister) { + auto NewRegister = MRI.createVirtualRegister(FPR128RegClass); + MachineInstrBuilder LoadIndexIntoRegister = + BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()), + NewRegister) + .addReg(SrcRegister) + .addImm(Lane) + .addReg(OffsetRegister, getKillRegState(true)); + InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size())); + InsInstrs.push_back(LoadIndexIntoRegister); + return NewRegister; + }; + + // Helper to create load instruction based on opcode + auto CreateLoadInstruction = [&](unsigned NumLanes, Register DestReg, + Register OffsetReg) -> MachineInstrBuilder { + unsigned Opcode; + switch (NumLanes) { + case 4: + Opcode = AArch64::LDRSui; + break; + case 8: + Opcode = AArch64::LDRHui; + break; + case 16: + Opcode = AArch64::LDRBui; + break; + default: + llvm_unreachable( + "Got unsupported number of lanes in machine-combiner gather pattern"); + } + // Immediate offset load + return BuildMI(MF, MIMetadata(Root), TII->get(Opcode), DestReg) + .addReg(OffsetReg) + .addImm(0); // immediate offset + }; + + // Load the remaining lanes into register 0. + auto LanesToLoadToReg0 = + llvm::make_range(LoadToLaneInstrsAscending.begin() + 1, + LoadToLaneInstrsAscending.begin() + NumLanes / 2); + auto PrevReg = SubregToReg->getOperand(0).getReg(); + for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg0)) { + PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1, + LoadInstr->getOperand(3).getReg()); + DelInstrs.push_back(LoadInstr); + } + auto LastLoadReg0 = PrevReg; + + // First load into register 1. Perform a LDRSui to zero out the upper lanes in + // a single instruction. + auto Lane0Load = *LoadToLaneInstrsAscending.begin(); + auto OriginalSplitLoad = + *std::next(LoadToLaneInstrsAscending.begin(), NumLanes / 2); + auto DestRegForMiddleIndex = MRI.createVirtualRegister( + MRI.getRegClass(Lane0Load->getOperand(0).getReg())); + + MachineInstrBuilder MiddleIndexLoadInstr = + CreateLoadInstruction(NumLanes, DestRegForMiddleIndex, + OriginalSplitLoad->getOperand(3).getReg()); + + InstrIdxForVirtReg.insert( + std::make_pair(DestRegForMiddleIndex, InsInstrs.size())); + InsInstrs.push_back(MiddleIndexLoadInstr); + DelInstrs.push_back(OriginalSplitLoad); + + // Subreg To Reg instruction for register 1. 
+ auto DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass); + unsigned SubregType; + switch (NumLanes) { + case 4: + SubregType = AArch64::ssub; + break; + case 8: + SubregType = AArch64::hsub; + break; + case 16: + SubregType = AArch64::bsub; + break; + default: + llvm_unreachable( + "Got invalid NumLanes for machine-combiner gather pattern"); + } + + auto SubRegToRegInstr = + BuildMI(MF, MIMetadata(Root), TII->get(SubregToReg->getOpcode()), + DestRegForSubregToReg) + .addImm(0) + .addReg(DestRegForMiddleIndex, getKillRegState(true)) + .addImm(SubregType); + InstrIdxForVirtReg.insert( + std::make_pair(DestRegForSubregToReg, InsInstrs.size())); + InsInstrs.push_back(SubRegToRegInstr); + + // Load remaining lanes into register 1. + auto LanesToLoadToReg1 = + llvm::make_range(LoadToLaneInstrsAscending.begin() + NumLanes / 2 + 1, + LoadToLaneInstrsAscending.end()); + PrevReg = SubRegToRegInstr->getOperand(0).getReg(); + for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg1)) { + PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1, + LoadInstr->getOperand(3).getReg()); + if (Index == NumLanes / 2 - 2) { + break; + } + DelInstrs.push_back(LoadInstr); + } + auto LastLoadReg1 = PrevReg; + + // Create the final zip instruction to combine the results. + MachineInstrBuilder ZipInstr = + BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64), + Root.getOperand(0).getReg()) + .addReg(LastLoadReg0) + .addReg(LastLoadReg1); + InsInstrs.push_back(ZipInstr); +} + CombinerObjective AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const { switch (Pattern) { case AArch64MachineCombinerPattern::SUBADD_OP1: case AArch64MachineCombinerPattern::SUBADD_OP2: + case AArch64MachineCombinerPattern::GATHER_LANE_i32: + case AArch64MachineCombinerPattern::GATHER_LANE_i16: + case AArch64MachineCombinerPattern::GATHER_LANE_i8: return CombinerObjective::MustReduceDepth; default: return TargetInstrInfo::getCombinerObjective(Pattern); @@ -7423,6 +7669,10 @@ bool AArch64InstrInfo::getMachineCombinerPatterns( if (getMiscPatterns(Root, Patterns)) return true; + // Load patterns + if (getLoadPatterns(Root, Patterns)) + return true; + return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns, DoRegPressureReduce); } @@ -8678,6 +8928,21 @@ void AArch64InstrInfo::genAlternativeCodeSequence( MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs); break; } + case AArch64MachineCombinerPattern::GATHER_LANE_i32: { + generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg, + Pattern, 4); + break; + } + case AArch64MachineCombinerPattern::GATHER_LANE_i16: { + generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg, + Pattern, 8); + break; + } + case AArch64MachineCombinerPattern::GATHER_LANE_i8: { + generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg, + Pattern, 16); + break; + } } // end switch (Pattern) // Record MUL and ADD/SUB for deletion diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h index 7c255da333e4b..02734866e7122 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -172,6 +172,10 @@ enum AArch64MachineCombinerPattern : unsigned { FMULv8i16_indexed_OP2, FNMADD, + + GATHER_LANE_i32, + GATHER_LANE_i16, + GATHER_LANE_i8 }; class AArch64InstrInfo final : public AArch64GenInstrInfo { const AArch64RegisterInfo RI; diff --git a/llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes.mir 
b/llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes.mir new file mode 100644 index 0000000000000..09eb18b0e3574 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes.mir @@ -0,0 +1,364 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -run-pass=machine-combiner -mcpu=neoverse-n2 -mtriple=aarch64-none-linux-gnu -verify-machineinstrs %s -o - | FileCheck %s + +--- +name: split_loads_to_fpr128 +body: | + bb.0.entry: + liveins: $x0, $x1, $x2, $x3, $x4 + + ; CHECK-LABEL: name: split_loads_to_fpr128 + ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4 + ; CHECK-NEXT: [[LD_i32:%[0-9]+]]:fpr32 = LDRSroX [[COPY]], killed [[COPY1]], 0, 1 + ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i32]], %subreg.ssub + ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i32 [[FIRST_REG]], 1, killed [[COPY2]] + ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr32 = LDRSui [[COPY3]], 0 + ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.ssub + ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i32 [[SECOND_REG]], 1, killed [[COPY4]] + ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_1]], [[LD1_1]] + ; CHECK-NEXT: $q0 = COPY [[ZIP]] + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:gpr64common = COPY $x0 + %1:gpr64common = COPY $x1 + %2:gpr64common = COPY $x2 + %3:gpr64common = COPY $x3 + %4:gpr64common = COPY $x4 + %5:fpr32 = LDRSroX %0, killed %1, 0, 1 + %6:fpr128 = SUBREG_TO_REG 0, killed %5, %subreg.ssub + %7:fpr128 = LD1i32 %6, 1, killed %2 + %8:fpr128 = LD1i32 %7, 2, killed %3 + %9:fpr128 = LD1i32 %8, 3, killed %4 + $q0 = COPY %9 + RET_ReallyLR implicit $q0 + +--- +name: split_loads_to_fpr128_ui +body: | + bb.0.entry: + liveins: $x0, $x1, $x2, $x3, $x4 + + ; CHECK-LABEL: name: split_loads_to_fpr128_ui + ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4 + ; CHECK-NEXT: [[LD_i32:%[0-9]+]]:fpr32 = LDRSui [[COPY]], 0 + ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i32]], %subreg.ssub + ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i32 [[FIRST_REG]], 1, killed [[COPY1]] + ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr32 = LDRSui [[COPY2]], 0 + ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.ssub + ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i32 [[SECOND_REG]], 1, killed [[COPY3]] + ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_1]], [[LD1_1]] + ; CHECK-NEXT: $q0 = COPY [[ZIP]] + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:gpr64common = COPY $x0 + %1:gpr64common = COPY $x1 + %2:gpr64common = COPY $x2 + %3:gpr64common = COPY $x3 + %4:gpr64common = COPY $x4 + %5:fpr32 = LDRSui %0, 0 + %6:fpr128 = SUBREG_TO_REG 0, killed %5, %subreg.ssub + %7:fpr128 = LD1i32 %6, 1, killed %1 + %8:fpr128 = LD1i32 %7, 2, killed %2 + %9:fpr128 = LD1i32 %8, 3, killed %3 + $q0 = COPY %9 + RET_ReallyLR implicit $q0 + +--- +name: split_loads_to_fpr128_i16 +body: | + bb.0.entry: + liveins: $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8 + + ; CHECK-LABEL: name: split_loads_to_fpr128_i16 + ; CHECK: 
[[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr64common = COPY $x5 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr64common = COPY $x6 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gpr64common = COPY $x7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gpr64common = COPY $x8 + ; CHECK-NEXT: [[LD_i16:%[0-9]+]]:fpr16 = LDRHroX [[COPY]], killed [[COPY1]], 0, 1 + ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i16]], %subreg.hsub + ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i16 [[FIRST_REG]], 1, killed [[COPY2]] + ; CHECK-NEXT: [[LD0_2:%[0-9]+]]:fpr128 = LD1i16 [[LD0_1]], 2, killed [[COPY3]] + ; CHECK-NEXT: [[LD0_3:%[0-9]+]]:fpr128 = LD1i16 [[LD0_2]], 3, killed [[COPY4]] + ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr16 = LDRHui [[COPY5]], 0 + ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.hsub + ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i16 [[SECOND_REG]], 1, killed [[COPY6]] + ; CHECK-NEXT: [[LD1_2:%[0-9]+]]:fpr128 = LD1i16 [[LD1_1]], 2, killed [[COPY7]] + ; CHECK-NEXT: [[LD1_3:%[0-9]+]]:fpr128 = LD1i16 [[LD1_2]], 3, killed [[COPY8]] + ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_3]], [[LD1_3]] + ; CHECK-NEXT: $q0 = COPY [[ZIP]] + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:gpr64common = COPY $x0 + %1:gpr64common = COPY $x1 + %2:gpr64common = COPY $x2 + %3:gpr64common = COPY $x3 + %4:gpr64common = COPY $x4 + %5:gpr64common = COPY $x5 + %6:gpr64common = COPY $x6 + %7:gpr64common = COPY $x7 + %8:gpr64common = COPY $x8 + %9:fpr16 = LDRHroX %0, killed %1, 0, 1 + %10:fpr128 = SUBREG_TO_REG 0, killed %9, %subreg.hsub + %11:fpr128 = LD1i16 %10, 1, killed %2 + %12:fpr128 = LD1i16 %11, 2, killed %3 + %13:fpr128 = LD1i16 %12, 3, killed %4 + %14:fpr128 = LD1i16 %13, 4, killed %5 + %15:fpr128 = LD1i16 %14, 5, killed %6 + %16:fpr128 = LD1i16 %15, 6, killed %7 + %17:fpr128 = LD1i16 %16, 7, killed %8 + $q0 = COPY %17 + RET_ReallyLR implicit $q0 + +--- +name: split_loads_to_fpr128_i16_ui +body: | + bb.0.entry: + liveins: $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8 + + ; CHECK-LABEL: name: split_loads_to_fpr128_i16_ui + ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr64common = COPY $x5 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr64common = COPY $x6 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gpr64common = COPY $x7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gpr64common = COPY $x8 + ; CHECK-NEXT: [[LD_i16:%[0-9]+]]:fpr16 = LDRHui [[COPY]], 0 + ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i16]], %subreg.hsub + ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i16 [[FIRST_REG]], 1, killed [[COPY1]] + ; CHECK-NEXT: [[LD0_2:%[0-9]+]]:fpr128 = LD1i16 [[LD0_1]], 2, killed [[COPY2]] + ; CHECK-NEXT: [[LD0_3:%[0-9]+]]:fpr128 = LD1i16 [[LD0_2]], 3, killed [[COPY3]] + ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr16 = LDRHui [[COPY4]], 0 + ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.hsub + ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i16 [[SECOND_REG]], 1, killed [[COPY5]] + ; CHECK-NEXT: [[LD1_2:%[0-9]+]]:fpr128 = LD1i16 [[LD1_1]], 2, killed 
[[COPY6]] + ; CHECK-NEXT: [[LD1_3:%[0-9]+]]:fpr128 = LD1i16 [[LD1_2]], 3, killed [[COPY7]] + ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_3]], [[LD1_3]] + ; CHECK-NEXT: $q0 = COPY [[ZIP]] + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:gpr64common = COPY $x0 + %1:gpr64common = COPY $x1 + %2:gpr64common = COPY $x2 + %3:gpr64common = COPY $x3 + %4:gpr64common = COPY $x4 + %5:gpr64common = COPY $x5 + %6:gpr64common = COPY $x6 + %7:gpr64common = COPY $x7 + %8:gpr64common = COPY $x8 + %9:fpr16 = LDRHui %0, 0 + %10:fpr128 = SUBREG_TO_REG 0, killed %9, %subreg.hsub + %11:fpr128 = LD1i16 %10, 1, killed %1 + %12:fpr128 = LD1i16 %11, 2, killed %2 + %13:fpr128 = LD1i16 %12, 3, killed %3 + %14:fpr128 = LD1i16 %13, 4, killed %4 + %15:fpr128 = LD1i16 %14, 5, killed %5 + %16:fpr128 = LD1i16 %15, 6, killed %6 + %17:fpr128 = LD1i16 %16, 7, killed %7 + $q0 = COPY %17 + RET_ReallyLR implicit $q0 + +--- +name: split_loads_to_fpr128_i8 +body: | + bb.0.entry: + liveins: $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x16 + + ; CHECK-LABEL: name: split_loads_to_fpr128_i8 + ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr64common = COPY $x5 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr64common = COPY $x6 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gpr64common = COPY $x7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gpr64common = COPY $x8 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gpr64common = COPY $x9 + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:gpr64common = COPY $x10 + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:gpr64common = COPY $x11 + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:gpr64common = COPY $x12 + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:gpr64common = COPY $x13 + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:gpr64common = COPY $x14 + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:gpr64common = COPY $x15 + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:gpr64common = COPY $x16 + ; CHECK-NEXT: [[LD_i8:%[0-9]+]]:fpr8 = LDRBroX [[COPY]], killed [[COPY1]], 0, 0 + ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i8]], %subreg.bsub + ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i8 [[FIRST_REG]], 1, killed [[COPY2]] + ; CHECK-NEXT: [[LD0_2:%[0-9]+]]:fpr128 = LD1i8 [[LD0_1]], 2, killed [[COPY3]] + ; CHECK-NEXT: [[LD0_3:%[0-9]+]]:fpr128 = LD1i8 [[LD0_2]], 3, killed [[COPY4]] + ; CHECK-NEXT: [[LD0_4:%[0-9]+]]:fpr128 = LD1i8 [[LD0_3]], 4, killed [[COPY5]] + ; CHECK-NEXT: [[LD0_5:%[0-9]+]]:fpr128 = LD1i8 [[LD0_4]], 5, killed [[COPY6]] + ; CHECK-NEXT: [[LD0_6:%[0-9]+]]:fpr128 = LD1i8 [[LD0_5]], 6, killed [[COPY7]] + ; CHECK-NEXT: [[LD0_7:%[0-9]+]]:fpr128 = LD1i8 [[LD0_6]], 7, killed [[COPY8]] + ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr8 = LDRBui [[COPY9]], 0 + ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.bsub + ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i8 [[SECOND_REG]], 1, killed [[COPY10]] + ; CHECK-NEXT: [[LD1_2:%[0-9]+]]:fpr128 = LD1i8 [[LD1_1]], 2, killed [[COPY11]] + ; CHECK-NEXT: [[LD1_3:%[0-9]+]]:fpr128 = LD1i8 [[LD1_2]], 3, killed [[COPY12]] + ; CHECK-NEXT: [[LD1_4:%[0-9]+]]:fpr128 = LD1i8 [[LD1_3]], 4, killed [[COPY13]] + ; CHECK-NEXT: [[LD1_5:%[0-9]+]]:fpr128 = LD1i8 [[LD1_4]], 5, killed [[COPY14]] + ; CHECK-NEXT: [[LD1_6:%[0-9]+]]:fpr128 = LD1i8 [[LD1_5]], 6, killed [[COPY15]] + ; CHECK-NEXT: [[LD1_7:%[0-9]+]]:fpr128 = LD1i8 [[LD1_6]], 7, killed 
[[COPY16]] + ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_7]], [[LD1_7]] + ; CHECK-NEXT: $q0 = COPY [[ZIP]] + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:gpr64common = COPY $x0 + %1:gpr64common = COPY $x1 + %2:gpr64common = COPY $x2 + %3:gpr64common = COPY $x3 + %4:gpr64common = COPY $x4 + %5:gpr64common = COPY $x5 + %6:gpr64common = COPY $x6 + %7:gpr64common = COPY $x7 + %8:gpr64common = COPY $x8 + %9:gpr64common = COPY $x9 + %10:gpr64common = COPY $x10 + %11:gpr64common = COPY $x11 + %12:gpr64common = COPY $x12 + %13:gpr64common = COPY $x13 + %14:gpr64common = COPY $x14 + %15:gpr64common = COPY $x15 + %16:gpr64common = COPY $x16 + %17:fpr8 = LDRBroX %0, killed %1, 0, 0 + %18:fpr128 = SUBREG_TO_REG 0, killed %17, %subreg.bsub + %19:fpr128 = LD1i8 %18, 1, killed %2 + %20:fpr128 = LD1i8 %19, 2, killed %3 + %21:fpr128 = LD1i8 %20, 3, killed %4 + %22:fpr128 = LD1i8 %21, 4, killed %5 + %23:fpr128 = LD1i8 %22, 5, killed %6 + %24:fpr128 = LD1i8 %23, 6, killed %7 + %25:fpr128 = LD1i8 %24, 7, killed %8 + %26:fpr128 = LD1i8 %25, 8, killed %9 + %27:fpr128 = LD1i8 %26, 9, killed %10 + %28:fpr128 = LD1i8 %27, 10, killed %11 + %29:fpr128 = LD1i8 %28, 11, killed %12 + %30:fpr128 = LD1i8 %29, 12, killed %13 + %31:fpr128 = LD1i8 %30, 13, killed %14 + %32:fpr128 = LD1i8 %31, 14, killed %15 + %33:fpr128 = LD1i8 %32, 15, killed %16 + $q0 = COPY %33 + RET_ReallyLR implicit $q0 + +--- +name: negative_pattern_missing_lanes +body: | + bb.0.entry: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: negative_pattern_missing_lanes + ; CHECK: [[LD1:%.*]]:fpr128 = LDRQui $x1, 0 + ; CHECK-NEXT: [[LD2:%.*]]:fpr128 = LD1i32 [[LD1]] + + %0:gpr64common = COPY $x0 + %1:fpr128 = LDRQui $x1, 0 + %2:fpr128 = LD1i32 %1, 3, %0 + $q0 = COPY %2 + RET_ReallyLR implicit $q0 + +--- +name: out_of_order_lanes +body: | + bb.0.entry: + liveins: $x0, $x1, $x2, $x3, $x4 + + ; CHECK-LABEL: name: out_of_order_lanes + ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4 + ; CHECK-NEXT: [[LD_i32:%[0-9]+]]:fpr32 = LDRSroX [[COPY]], killed [[COPY1]], 0, 1 + ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i32]], %subreg.ssub + ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i32 [[FIRST_REG]], 1, killed [[COPY3]] + ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr32 = LDRSui [[COPY2]], 0 + ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.ssub + ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i32 [[SECOND_REG]], 1, killed [[COPY4]] + ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_1]], [[LD1_1]] + ; CHECK-NEXT: $q0 = COPY [[ZIP]] + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:gpr64common = COPY $x0 + %1:gpr64common = COPY $x1 + %2:gpr64common = COPY $x2 + %3:gpr64common = COPY $x3 + %4:gpr64common = COPY $x4 + %5:fpr32 = LDRSroX %0, killed %1, 0, 1 + %6:fpr128 = SUBREG_TO_REG 0, killed %5, %subreg.ssub + %7:fpr128 = LD1i32 %6, 2, killed %2 + %8:fpr128 = LD1i32 %7, 1, killed %3 + %9:fpr128 = LD1i32 %8, 3, killed %4 + $q0 = COPY %9 + RET_ReallyLR implicit $q0 + +--- +name: negative_pattern_no_subreg_to_reg +body: | + bb.0.entry: + liveins: $x0, $x1, $x2, $x3 + + ; CHECK-LABEL: name: negative_pattern_no_subreg_to_reg + ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY 
$x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3 + ; CHECK-NEXT: [[INITIAL_VEC:%[0-9]+]]:fpr128 = LDRQui [[COPY]], 0 + ; CHECK-NEXT: [[LD_LANE_1:%[0-9]+]]:fpr128 = LD1i32 [[INITIAL_VEC]], 1, killed [[COPY1]] + ; CHECK-NEXT: [[LD_LANE_2:%[0-9]+]]:fpr128 = LD1i32 [[LD_LANE_1]], 2, killed [[COPY2]] + ; CHECK-NEXT: [[LD_LANE_3:%[0-9]+]]:fpr128 = LD1i32 [[LD_LANE_2]], 3, killed [[COPY3]] + ; CHECK-NEXT: $q0 = COPY [[LD_LANE_3]] + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:gpr64common = COPY $x0 + %1:gpr64common = COPY $x1 + %2:gpr64common = COPY $x2 + %3:gpr64common = COPY $x3 + %4:fpr128 = LDRQui %0, 0 + %5:fpr128 = LD1i32 %4, 1, killed %1 + %6:fpr128 = LD1i32 %5, 2, killed %2 + %7:fpr128 = LD1i32 %6, 3, killed %3 + $q0 = COPY %7 + RET_ReallyLR implicit $q0 + +--- +name: negative_pattern_multiple_users +body: | + bb.0.entry: + liveins: $x0, $x1, $x2, $x3, $x4 + + ; CHECK-LABEL: name: negative_pattern_multiple_users + ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4 + ; CHECK-NEXT: [[LD_i32:%[0-9]+]]:fpr32 = LDRSroX [[COPY]], killed [[COPY1]], 0, 1 + ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i32]], %subreg.ssub + ; CHECK-NEXT: [[LD_LANE_1:%[0-9]+]]:fpr128 = LD1i32 [[FIRST_REG]], 1, killed [[COPY2]] + ; CHECK-NEXT: [[LD_LANE_2:%[0-9]+]]:fpr128 = LD1i32 [[LD_LANE_1]], 2, killed [[COPY3]] + ; CHECK-NEXT: [[LD_LANE_3:%[0-9]+]]:fpr128 = LD1i32 [[LD_LANE_2]], 3, killed [[COPY4]] + ; CHECK-NEXT: $q0 = COPY [[LD_LANE_3]] + ; CHECK-NEXT: $q1 = COPY [[LD_LANE_2]] + ; CHECK-NEXT: RET_ReallyLR implicit $q0, implicit $q1 + %0:gpr64common = COPY $x0 + %1:gpr64common = COPY $x1 + %2:gpr64common = COPY $x2 + %3:gpr64common = COPY $x3 + %4:gpr64common = COPY $x4 + %5:fpr32 = LDRSroX %0, killed %1, 0, 1 + %6:fpr128 = SUBREG_TO_REG 0, killed %5, %subreg.ssub + %7:fpr128 = LD1i32 %6, 1, killed %2 + %8:fpr128 = LD1i32 %7, 2, killed %3 + %9:fpr128 = LD1i32 %8, 3, killed %4 + $q0 = COPY %9 + $q1 = COPY %8 + RET_ReallyLR implicit $q0, implicit $q1 diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll index 7686740aec302..13434fabefa78 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll @@ -203,89 +203,93 @@ define <12 x float> @abp90c12(<12 x float> %a, <12 x float> %b, <12 x float> %c) ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1 ; CHECK-NEXT: // kill: def $s3 killed $s3 def $q3 -; CHECK-NEXT: ldr s17, [sp, #40] -; CHECK-NEXT: add x10, sp, #56 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2 +; CHECK-NEXT: ldr s17, [sp, #32] +; CHECK-NEXT: // kill: def $s5 killed $s5 def $q5 ; CHECK-NEXT: add x9, sp, #48 +; CHECK-NEXT: add x10, sp, #64 ; CHECK-NEXT: mov v1.s[1], v3.s[0] -; CHECK-NEXT: ldr s3, [sp, #32] -; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2 ; CHECK-NEXT: mov v0.s[1], v2.s[0] -; CHECK-NEXT: ld1 { v17.s }[1], [x10] -; CHECK-NEXT: // kill: def $s5 killed $s5 def $q5 -; CHECK-NEXT: ldr s16, [sp, #8] ; CHECK-NEXT: // kill: def $s4 killed $s4 def $q4 -; CHECK-NEXT: add x10, sp, #24 -; CHECK-NEXT: ld1 { v3.s }[1], [x9] -; CHECK-NEXT: add x9, sp, #72 -; CHECK-NEXT: // 
kill: def $s7 killed $s7 def $q7 +; CHECK-NEXT: add x11, sp, #72 +; CHECK-NEXT: ld1 { v17.s }[1], [x9] +; CHECK-NEXT: ldr s18, [x10] +; CHECK-NEXT: add x9, sp, #80 +; CHECK-NEXT: add x10, sp, #56 ; CHECK-NEXT: // kill: def $s6 killed $s6 def $q6 +; CHECK-NEXT: // kill: def $s7 killed $s7 def $q7 +; CHECK-NEXT: ldr s16, [sp, #8] +; CHECK-NEXT: ldr s3, [sp, #96] +; CHECK-NEXT: ld1 { v18.s }[1], [x9] +; CHECK-NEXT: add x9, sp, #88 ; CHECK-NEXT: ldr s2, [sp] -; CHECK-NEXT: ld1 { v16.s }[1], [x10] -; CHECK-NEXT: add x10, sp, #112 -; CHECK-NEXT: ldr s20, [sp, #136] ; CHECK-NEXT: mov v1.s[2], v5.s[0] -; CHECK-NEXT: ld1 { v17.s }[2], [x9] -; CHECK-NEXT: add x9, sp, #64 -; CHECK-NEXT: ldr s5, [sp, #96] -; CHECK-NEXT: ld1 { v3.s }[2], [x9] +; CHECK-NEXT: ldr s5, [sp, #40] ; CHECK-NEXT: mov v0.s[2], v4.s[0] -; CHECK-NEXT: add x9, sp, #88 -; CHECK-NEXT: ldr s4, [sp, #104] -; CHECK-NEXT: ldr s19, [sp, #192] ; CHECK-NEXT: ld1 { v5.s }[1], [x10] -; CHECK-NEXT: add x10, sp, #80 -; CHECK-NEXT: ld1 { v17.s }[3], [x9] -; CHECK-NEXT: mov v1.s[3], v7.s[0] -; CHECK-NEXT: add x9, sp, #120 -; CHECK-NEXT: ld1 { v3.s }[3], [x10] -; CHECK-NEXT: ld1 { v4.s }[1], [x9] -; CHECK-NEXT: ldr s7, [sp, #128] +; CHECK-NEXT: ldr s19, [x11] ; CHECK-NEXT: add x10, sp, #144 +; CHECK-NEXT: zip1 v4.2d, v17.2d, v18.2d +; CHECK-NEXT: add x11, sp, #160 +; CHECK-NEXT: ldr s18, [sp, #136] +; CHECK-NEXT: ld1 { v19.s }[1], [x9] ; CHECK-NEXT: mov v0.s[3], v6.s[0] -; CHECK-NEXT: add x9, sp, #16 +; CHECK-NEXT: ldr s6, [sp, #128] +; CHECK-NEXT: mov v1.s[3], v7.s[0] +; CHECK-NEXT: add x9, sp, #24 +; CHECK-NEXT: ldr s7, [sp, #104] +; CHECK-NEXT: ld1 { v16.s }[1], [x9] +; CHECK-NEXT: add x9, sp, #112 +; CHECK-NEXT: ld1 { v6.s }[1], [x10] +; CHECK-NEXT: zip1 v5.2d, v5.2d, v19.2d +; CHECK-NEXT: add x10, sp, #120 +; CHECK-NEXT: ld1 { v3.s }[1], [x9] ; CHECK-NEXT: ld1 { v7.s }[1], [x10] -; CHECK-NEXT: ld1 { v2.s }[1], [x9] -; CHECK-NEXT: add x9, sp, #160 -; CHECK-NEXT: fmul v6.4s, v17.4s, v1.4s -; CHECK-NEXT: fmul v18.4s, v4.4s, v16.4s -; CHECK-NEXT: fmul v16.4s, v5.4s, v16.4s -; CHECK-NEXT: fmul v1.4s, v3.4s, v1.4s -; CHECK-NEXT: add x10, sp, #208 -; CHECK-NEXT: ld1 { v7.s }[2], [x9] -; CHECK-NEXT: add x9, sp, #152 -; CHECK-NEXT: ld1 { v19.s }[1], [x10] -; CHECK-NEXT: ld1 { v20.s }[1], [x9] +; CHECK-NEXT: ldr s17, [x11] ; CHECK-NEXT: add x9, sp, #176 -; CHECK-NEXT: add x10, sp, #184 -; CHECK-NEXT: fneg v6.4s, v6.4s -; CHECK-NEXT: fneg v18.4s, v18.4s -; CHECK-NEXT: fmla v16.4s, v2.4s, v4.4s -; CHECK-NEXT: fmla v1.4s, v0.4s, v17.4s -; CHECK-NEXT: ld1 { v7.s }[3], [x9] -; CHECK-NEXT: add x9, sp, #168 -; CHECK-NEXT: ld1 { v20.s }[2], [x9] -; CHECK-NEXT: ldr s4, [sp, #200] +; CHECK-NEXT: add x10, sp, #16 +; CHECK-NEXT: add x11, sp, #168 +; CHECK-NEXT: ld1 { v17.s }[1], [x9] +; CHECK-NEXT: ld1 { v2.s }[1], [x10] +; CHECK-NEXT: add x9, sp, #152 +; CHECK-NEXT: fmul v19.4s, v5.4s, v1.4s +; CHECK-NEXT: fmul v20.4s, v7.4s, v16.4s +; CHECK-NEXT: fmul v16.4s, v3.4s, v16.4s +; CHECK-NEXT: fmul v1.4s, v4.4s, v1.4s +; CHECK-NEXT: ld1 { v18.s }[1], [x9] +; CHECK-NEXT: ldr s21, [x11] +; CHECK-NEXT: zip1 v6.2d, v6.2d, v17.2d +; CHECK-NEXT: ldr s17, [sp, #192] +; CHECK-NEXT: add x9, sp, #184 +; CHECK-NEXT: add x10, sp, #208 +; CHECK-NEXT: ld1 { v21.s }[1], [x9] ; CHECK-NEXT: add x9, sp, #216 -; CHECK-NEXT: fmla v6.4s, v0.4s, v3.4s -; CHECK-NEXT: fmla v18.4s, v2.4s, v5.4s -; CHECK-NEXT: ld1 { v4.s }[1], [x9] -; CHECK-NEXT: fsub v0.4s, v7.4s, v1.4s -; CHECK-NEXT: fsub v1.4s, v19.4s, v16.4s -; CHECK-NEXT: ld1 { v20.s }[3], [x10] -; CHECK-NEXT: fadd v2.4s, v4.4s, v18.4s 
-; CHECK-NEXT: fadd v3.4s, v20.4s, v6.4s +; CHECK-NEXT: fneg v19.4s, v19.4s +; CHECK-NEXT: fneg v20.4s, v20.4s +; CHECK-NEXT: fmla v16.4s, v2.4s, v7.4s +; CHECK-NEXT: fmla v1.4s, v0.4s, v5.4s +; CHECK-NEXT: ld1 { v17.s }[1], [x10] +; CHECK-NEXT: ldr s5, [sp, #200] +; CHECK-NEXT: zip1 v7.2d, v18.2d, v21.2d +; CHECK-NEXT: ld1 { v5.s }[1], [x9] +; CHECK-NEXT: fmla v19.4s, v0.4s, v4.4s +; CHECK-NEXT: fmla v20.4s, v2.4s, v3.4s +; CHECK-NEXT: fsub v0.4s, v6.4s, v1.4s +; CHECK-NEXT: fsub v1.4s, v17.4s, v16.4s +; CHECK-NEXT: fadd v2.4s, v7.4s, v19.4s +; CHECK-NEXT: fadd v3.4s, v5.4s, v20.4s ; CHECK-NEXT: ext v4.16b, v0.16b, v1.16b, #12 -; CHECK-NEXT: ext v5.16b, v3.16b, v2.16b, #12 -; CHECK-NEXT: trn2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ext v5.16b, v2.16b, v3.16b, #12 +; CHECK-NEXT: trn2 v1.4s, v1.4s, v3.4s ; CHECK-NEXT: ext v4.16b, v0.16b, v4.16b, #12 -; CHECK-NEXT: ext v5.16b, v3.16b, v5.16b, #8 +; CHECK-NEXT: ext v5.16b, v2.16b, v5.16b, #8 ; CHECK-NEXT: rev64 v4.4s, v4.4s -; CHECK-NEXT: trn2 v2.4s, v4.4s, v5.4s -; CHECK-NEXT: zip2 v4.4s, v0.4s, v3.4s -; CHECK-NEXT: zip1 v0.4s, v0.4s, v3.4s -; CHECK-NEXT: ext v1.16b, v2.16b, v1.16b, #8 -; CHECK-NEXT: mov v4.d[1], v2.d[0] +; CHECK-NEXT: trn2 v3.4s, v4.4s, v5.4s +; CHECK-NEXT: zip2 v4.4s, v0.4s, v2.4s +; CHECK-NEXT: zip1 v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ext v1.16b, v3.16b, v1.16b, #8 +; CHECK-NEXT: mov v4.d[1], v3.d[0] ; CHECK-NEXT: str q0, [x8] ; CHECK-NEXT: stp q4, q1, [x8, #16] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/concat-vector.ll b/llvm/test/CodeGen/AArch64/concat-vector.ll index acf15f1bd1178..e6f27b95d92c8 100644 --- a/llvm/test/CodeGen/AArch64/concat-vector.ll +++ b/llvm/test/CodeGen/AArch64/concat-vector.ll @@ -186,8 +186,9 @@ define <16 x i8> @concat_v16s8_v4s8_load(ptr %ptrA, ptr %ptrB, ptr %ptrC, ptr %p ; CHECK: // %bb.0: ; CHECK-NEXT: ldr s0, [x0] ; CHECK-NEXT: ld1 { v0.s }[1], [x1] -; CHECK-NEXT: ld1 { v0.s }[2], [x2] -; CHECK-NEXT: ld1 { v0.s }[3], [x3] +; CHECK-NEXT: ldr s1, [x2] +; CHECK-NEXT: ld1 { v1.s }[1], [x3] +; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %A = load <4 x i8>, ptr %ptrA %B = load <4 x i8>, ptr %ptrB diff --git a/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll b/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll index c6b8e41f9bdfd..4906e2e15e51c 100644 --- a/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll +++ b/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll @@ -1431,6 +1431,7 @@ define <9 x half> @max_v9f16(<9 x half> %a, <9 x half> %b) { ; FULLFP16-NEXT: add x9, sp, #16 ; FULLFP16-NEXT: // kill: def $h3 killed $h3 def $q3 ; FULLFP16-NEXT: // kill: def $h4 killed $h4 def $q4 +; FULLFP16-NEXT: add x10, sp, #40 ; FULLFP16-NEXT: // kill: def $h5 killed $h5 def $q5 ; FULLFP16-NEXT: // kill: def $h6 killed $h6 def $q6 ; FULLFP16-NEXT: // kill: def $h7 killed $h7 def $q7 @@ -1439,30 +1440,30 @@ define <9 x half> @max_v9f16(<9 x half> %a, <9 x half> %b) { ; FULLFP16-NEXT: ld1 { v1.h }[1], [x9] ; FULLFP16-NEXT: add x9, sp, #24 ; FULLFP16-NEXT: mov v0.h[2], v2.h[0] -; FULLFP16-NEXT: ldr h2, [sp] ; FULLFP16-NEXT: ld1 { v1.h }[2], [x9] ; FULLFP16-NEXT: add x9, sp, #32 -; FULLFP16-NEXT: fminnm v2.8h, v2.8h, v2.8h ; FULLFP16-NEXT: mov v0.h[3], v3.h[0] ; FULLFP16-NEXT: ld1 { v1.h }[3], [x9] -; FULLFP16-NEXT: add x9, sp, #40 -; FULLFP16-NEXT: ldr h3, [sp, #72] -; FULLFP16-NEXT: ld1 { v1.h }[4], [x9] +; FULLFP16-NEXT: ldr h2, [x10] ; FULLFP16-NEXT: add x9, sp, #48 +; FULLFP16-NEXT: ldr h3, [sp, #72] +; FULLFP16-NEXT: ld1 { v2.h }[1], [x9] +; FULLFP16-NEXT: add x9, sp, #56 ; 
FULLFP16-NEXT: fminnm v3.8h, v3.8h, v3.8h ; FULLFP16-NEXT: mov v0.h[4], v4.h[0] -; FULLFP16-NEXT: ld1 { v1.h }[5], [x9] -; FULLFP16-NEXT: add x9, sp, #56 -; FULLFP16-NEXT: fmaxnm v2.8h, v2.8h, v3.8h -; FULLFP16-NEXT: mov v0.h[5], v5.h[0] -; FULLFP16-NEXT: ld1 { v1.h }[6], [x9] +; FULLFP16-NEXT: ld1 { v2.h }[2], [x9] ; FULLFP16-NEXT: add x9, sp, #64 -; FULLFP16-NEXT: str h2, [x8, #16] +; FULLFP16-NEXT: mov v0.h[5], v5.h[0] +; FULLFP16-NEXT: ld1 { v2.h }[3], [x9] +; FULLFP16-NEXT: zip1 v1.2d, v1.2d, v2.2d +; FULLFP16-NEXT: ldr h2, [sp] ; FULLFP16-NEXT: mov v0.h[6], v6.h[0] -; FULLFP16-NEXT: ld1 { v1.h }[7], [x9] +; FULLFP16-NEXT: fminnm v2.8h, v2.8h, v2.8h ; FULLFP16-NEXT: fminnm v1.8h, v1.8h, v1.8h ; FULLFP16-NEXT: mov v0.h[7], v7.h[0] +; FULLFP16-NEXT: fmaxnm v2.8h, v2.8h, v3.8h ; FULLFP16-NEXT: fminnm v0.8h, v0.8h, v0.8h +; FULLFP16-NEXT: str h2, [x8, #16] ; FULLFP16-NEXT: fmaxnm v0.8h, v0.8h, v1.8h ; FULLFP16-NEXT: str q0, [x8] ; FULLFP16-NEXT: ret @@ -2012,6 +2013,7 @@ define <9 x half> @min_v9f16(<9 x half> %a, <9 x half> %b) { ; FULLFP16-NEXT: add x9, sp, #16 ; FULLFP16-NEXT: // kill: def $h3 killed $h3 def $q3 ; FULLFP16-NEXT: // kill: def $h4 killed $h4 def $q4 +; FULLFP16-NEXT: add x10, sp, #40 ; FULLFP16-NEXT: // kill: def $h5 killed $h5 def $q5 ; FULLFP16-NEXT: // kill: def $h6 killed $h6 def $q6 ; FULLFP16-NEXT: // kill: def $h7 killed $h7 def $q7 @@ -2020,30 +2022,30 @@ define <9 x half> @min_v9f16(<9 x half> %a, <9 x half> %b) { ; FULLFP16-NEXT: ld1 { v1.h }[1], [x9] ; FULLFP16-NEXT: add x9, sp, #24 ; FULLFP16-NEXT: mov v0.h[2], v2.h[0] -; FULLFP16-NEXT: ldr h2, [sp] ; FULLFP16-NEXT: ld1 { v1.h }[2], [x9] ; FULLFP16-NEXT: add x9, sp, #32 -; FULLFP16-NEXT: fminnm v2.8h, v2.8h, v2.8h ; FULLFP16-NEXT: mov v0.h[3], v3.h[0] ; FULLFP16-NEXT: ld1 { v1.h }[3], [x9] -; FULLFP16-NEXT: add x9, sp, #40 -; FULLFP16-NEXT: ldr h3, [sp, #72] -; FULLFP16-NEXT: ld1 { v1.h }[4], [x9] +; FULLFP16-NEXT: ldr h2, [x10] ; FULLFP16-NEXT: add x9, sp, #48 +; FULLFP16-NEXT: ldr h3, [sp, #72] +; FULLFP16-NEXT: ld1 { v2.h }[1], [x9] +; FULLFP16-NEXT: add x9, sp, #56 ; FULLFP16-NEXT: fminnm v3.8h, v3.8h, v3.8h ; FULLFP16-NEXT: mov v0.h[4], v4.h[0] -; FULLFP16-NEXT: ld1 { v1.h }[5], [x9] -; FULLFP16-NEXT: add x9, sp, #56 -; FULLFP16-NEXT: fminnm v2.8h, v2.8h, v3.8h -; FULLFP16-NEXT: mov v0.h[5], v5.h[0] -; FULLFP16-NEXT: ld1 { v1.h }[6], [x9] +; FULLFP16-NEXT: ld1 { v2.h }[2], [x9] ; FULLFP16-NEXT: add x9, sp, #64 -; FULLFP16-NEXT: str h2, [x8, #16] +; FULLFP16-NEXT: mov v0.h[5], v5.h[0] +; FULLFP16-NEXT: ld1 { v2.h }[3], [x9] +; FULLFP16-NEXT: zip1 v1.2d, v1.2d, v2.2d +; FULLFP16-NEXT: ldr h2, [sp] ; FULLFP16-NEXT: mov v0.h[6], v6.h[0] -; FULLFP16-NEXT: ld1 { v1.h }[7], [x9] +; FULLFP16-NEXT: fminnm v2.8h, v2.8h, v2.8h ; FULLFP16-NEXT: fminnm v1.8h, v1.8h, v1.8h ; FULLFP16-NEXT: mov v0.h[7], v7.h[0] +; FULLFP16-NEXT: fminnm v2.8h, v2.8h, v3.8h ; FULLFP16-NEXT: fminnm v0.8h, v0.8h, v0.8h +; FULLFP16-NEXT: str h2, [x8, #16] ; FULLFP16-NEXT: fminnm v0.8h, v0.8h, v1.8h ; FULLFP16-NEXT: str q0, [x8] ; FULLFP16-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/fsh.ll b/llvm/test/CodeGen/AArch64/fsh.ll index 4c28c90824028..ae2ef2649102e 100644 --- a/llvm/test/CodeGen/AArch64/fsh.ll +++ b/llvm/test/CodeGen/AArch64/fsh.ll @@ -2509,87 +2509,88 @@ define <7 x i32> @fshl_v7i32(<7 x i32> %a, <7 x i32> %b, <7 x i32> %c) { ; ; CHECK-GI-LABEL: fshl_v7i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldr s3, [sp, #48] -; CHECK-GI-NEXT: ldr s20, [sp, #56] -; CHECK-GI-NEXT: add x9, sp, #56 +; CHECK-GI-NEXT: ldr s17, [sp, 
#48] +; CHECK-GI-NEXT: add x8, sp, #56 +; CHECK-GI-NEXT: add x9, sp, #64 ; CHECK-GI-NEXT: ldr s4, [sp, #48] -; CHECK-GI-NEXT: ldr s7, [sp, #80] -; CHECK-GI-NEXT: mov w12, #-1 // =0xffffffff -; CHECK-GI-NEXT: ldr s21, [sp, #88] -; CHECK-GI-NEXT: mov v3.s[1], v20.s[0] -; CHECK-GI-NEXT: fmov s20, w12 -; CHECK-GI-NEXT: ld1 { v4.s }[1], [x9] -; CHECK-GI-NEXT: ldr s17, [sp] -; CHECK-GI-NEXT: add x13, sp, #64 -; CHECK-GI-NEXT: mov v7.s[1], v21.s[0] +; CHECK-GI-NEXT: ldr s21, [sp, #56] +; CHECK-GI-NEXT: mov w10, #-1 // =0xffffffff +; CHECK-GI-NEXT: ld1 { v17.s }[1], [x8] +; CHECK-GI-NEXT: ldr s20, [x9] +; CHECK-GI-NEXT: add x8, sp, #72 +; CHECK-GI-NEXT: mov v4.s[1], v21.s[0] ; CHECK-GI-NEXT: fmov s21, w7 +; CHECK-GI-NEXT: ldr s6, [sp] +; CHECK-GI-NEXT: ld1 { v20.s }[1], [x8] ; CHECK-GI-NEXT: ldr s19, [sp, #64] -; CHECK-GI-NEXT: mov w11, #31 // =0x1f -; CHECK-GI-NEXT: mov v20.s[1], w12 +; CHECK-GI-NEXT: ldr s7, [sp, #80] +; CHECK-GI-NEXT: ldr s22, [sp, #88] +; CHECK-GI-NEXT: mov w9, #31 // =0x1f +; CHECK-GI-NEXT: mov w11, #1 // =0x1 +; CHECK-GI-NEXT: mov v21.s[1], v6.s[0] +; CHECK-GI-NEXT: fmov s6, w9 ; CHECK-GI-NEXT: ldr s18, [sp, #96] -; CHECK-GI-NEXT: ld1 { v4.s }[2], [x13] -; CHECK-GI-NEXT: mov w13, #1 // =0x1 -; CHECK-GI-NEXT: mov v3.s[2], v19.s[0] -; CHECK-GI-NEXT: mov v21.s[1], v17.s[0] -; CHECK-GI-NEXT: fmov s17, w11 -; CHECK-GI-NEXT: fmov s19, w13 +; CHECK-GI-NEXT: zip1 v17.2d, v17.2d, v20.2d +; CHECK-GI-NEXT: fmov s20, w10 +; CHECK-GI-NEXT: mov v7.s[1], v22.s[0] +; CHECK-GI-NEXT: mov v4.s[2], v19.s[0] +; CHECK-GI-NEXT: fmov s19, w11 ; CHECK-GI-NEXT: fmov s23, w0 -; CHECK-GI-NEXT: fmov s24, w11 -; CHECK-GI-NEXT: ldr s6, [sp, #8] +; CHECK-GI-NEXT: mov v6.s[1], w9 +; CHECK-GI-NEXT: fmov s24, w9 +; CHECK-GI-NEXT: ldr s2, [sp, #8] +; CHECK-GI-NEXT: mov v20.s[1], w10 ; CHECK-GI-NEXT: ldr s0, [sp, #24] ; CHECK-GI-NEXT: ldr s5, [sp, #32] +; CHECK-GI-NEXT: mov v19.s[1], w11 ; CHECK-GI-NEXT: mov v7.s[2], v18.s[0] -; CHECK-GI-NEXT: mov v17.s[1], w11 -; CHECK-GI-NEXT: mov v19.s[1], w13 -; CHECK-GI-NEXT: mov v20.s[2], w12 ; CHECK-GI-NEXT: ldr s16, [sp, #72] ; CHECK-GI-NEXT: mov v23.s[1], w1 ; CHECK-GI-NEXT: ldr s18, [sp, #80] -; CHECK-GI-NEXT: mov v21.s[2], v6.s[0] -; CHECK-GI-NEXT: mov v24.s[1], w11 +; CHECK-GI-NEXT: mov v21.s[2], v2.s[0] +; CHECK-GI-NEXT: mov v24.s[1], w9 ; CHECK-GI-NEXT: mov v0.s[1], v5.s[0] -; CHECK-GI-NEXT: fmov s6, w4 -; CHECK-GI-NEXT: add x10, sp, #88 +; CHECK-GI-NEXT: fmov s5, w4 +; CHECK-GI-NEXT: mov v20.s[2], w10 +; CHECK-GI-NEXT: add x8, sp, #88 ; CHECK-GI-NEXT: movi v22.4s, #31 -; CHECK-GI-NEXT: mov v3.s[3], v16.s[0] -; CHECK-GI-NEXT: mov v17.s[2], w11 -; CHECK-GI-NEXT: mov v19.s[2], w13 -; CHECK-GI-NEXT: ldr s2, [sp, #16] -; CHECK-GI-NEXT: ldr s1, [sp, #40] -; CHECK-GI-NEXT: ld1 { v18.s }[1], [x10] -; CHECK-GI-NEXT: eor v5.16b, v7.16b, v20.16b +; CHECK-GI-NEXT: mov v4.s[3], v16.s[0] +; CHECK-GI-NEXT: mov v6.s[2], w9 +; CHECK-GI-NEXT: mov v19.s[2], w11 +; CHECK-GI-NEXT: ldr s1, [sp, #16] +; CHECK-GI-NEXT: ldr s3, [sp, #40] +; CHECK-GI-NEXT: ld1 { v18.s }[1], [x8] ; CHECK-GI-NEXT: mov v23.s[2], w2 -; CHECK-GI-NEXT: mov v6.s[1], w5 -; CHECK-GI-NEXT: add x8, sp, #72 -; CHECK-GI-NEXT: add x9, sp, #96 -; CHECK-GI-NEXT: mov v21.s[3], v2.s[0] -; CHECK-GI-NEXT: mov v24.s[2], w11 -; CHECK-GI-NEXT: mov v0.s[2], v1.s[0] -; CHECK-GI-NEXT: ld1 { v4.s }[3], [x8] -; CHECK-GI-NEXT: bic v2.16b, v22.16b, v3.16b -; CHECK-GI-NEXT: ld1 { v18.s }[2], [x9] -; CHECK-GI-NEXT: and v1.16b, v5.16b, v17.16b +; CHECK-GI-NEXT: mov v5.s[1], w5 +; CHECK-GI-NEXT: add x8, sp, #96 +; CHECK-GI-NEXT: eor 
v2.16b, v7.16b, v20.16b +; CHECK-GI-NEXT: mov v21.s[3], v1.s[0] +; CHECK-GI-NEXT: mov v24.s[2], w9 +; CHECK-GI-NEXT: mov v0.s[2], v3.s[0] +; CHECK-GI-NEXT: bic v1.16b, v22.16b, v4.16b +; CHECK-GI-NEXT: ld1 { v18.s }[2], [x8] ; CHECK-GI-NEXT: neg v3.4s, v19.4s +; CHECK-GI-NEXT: and v4.16b, v17.16b, v22.16b +; CHECK-GI-NEXT: and v2.16b, v2.16b, v6.16b ; CHECK-GI-NEXT: mov v23.s[3], w3 -; CHECK-GI-NEXT: mov v6.s[2], w6 -; CHECK-GI-NEXT: and v4.16b, v4.16b, v22.16b -; CHECK-GI-NEXT: ushr v5.4s, v21.4s, #1 -; CHECK-GI-NEXT: neg v2.4s, v2.4s -; CHECK-GI-NEXT: and v7.16b, v18.16b, v24.16b +; CHECK-GI-NEXT: mov v5.s[2], w6 +; CHECK-GI-NEXT: ushr v6.4s, v21.4s, #1 ; CHECK-GI-NEXT: neg v1.4s, v1.4s +; CHECK-GI-NEXT: and v7.16b, v18.16b, v24.16b ; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v3.4s +; CHECK-GI-NEXT: neg v2.4s, v2.4s ; CHECK-GI-NEXT: ushl v3.4s, v23.4s, v4.4s -; CHECK-GI-NEXT: ushl v2.4s, v5.4s, v2.4s -; CHECK-GI-NEXT: ushl v4.4s, v6.4s, v7.4s -; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v1.4s -; CHECK-GI-NEXT: orr v1.16b, v3.16b, v2.16b +; CHECK-GI-NEXT: ushl v1.4s, v6.4s, v1.4s +; CHECK-GI-NEXT: ushl v4.4s, v5.4s, v7.4s +; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-GI-NEXT: orr v1.16b, v3.16b, v1.16b ; CHECK-GI-NEXT: orr v0.16b, v4.16b, v0.16b ; CHECK-GI-NEXT: mov s2, v1.s[1] ; CHECK-GI-NEXT: mov s3, v1.s[2] ; CHECK-GI-NEXT: mov s4, v1.s[3] +; CHECK-GI-NEXT: fmov w0, s1 ; CHECK-GI-NEXT: mov s5, v0.s[1] ; CHECK-GI-NEXT: mov s6, v0.s[2] -; CHECK-GI-NEXT: fmov w0, s1 ; CHECK-GI-NEXT: fmov w4, s0 ; CHECK-GI-NEXT: fmov w1, s2 ; CHECK-GI-NEXT: fmov w2, s3 diff --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll index 4f0c4080aa0ce..9443004ea434b 100644 --- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll +++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll @@ -6810,195 +6810,200 @@ define i32 @test_sdot_v48i8_double_nomla(<48 x i8> %a, <48 x i8> %b, <48 x i8> % ; CHECK-SD-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 ; CHECK-SD-NEXT: .cfi_offset w29, -16 -; CHECK-SD-NEXT: ldr b5, [sp, #208] +; CHECK-SD-NEXT: ldr b0, [sp, #208] ; CHECK-SD-NEXT: add x8, sp, #216 -; CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: add x9, sp, #272 +; CHECK-SD-NEXT: ldr b2, [sp, #80] ; CHECK-SD-NEXT: ldr b4, [sp, #976] -; CHECK-SD-NEXT: add x9, sp, #984 -; CHECK-SD-NEXT: add x12, sp, #328 -; CHECK-SD-NEXT: ld1 { v5.b }[1], [x8] -; CHECK-SD-NEXT: add x8, sp, #224 -; CHECK-SD-NEXT: movi v1.16b, #1 -; CHECK-SD-NEXT: mov v0.b[1], w1 -; CHECK-SD-NEXT: ld1 { v4.b }[1], [x9] -; CHECK-SD-NEXT: movi v3.2d, #0000000000000000 -; CHECK-SD-NEXT: add x11, sp, #992 ; CHECK-SD-NEXT: ldr b6, [sp, #720] -; CHECK-SD-NEXT: ldr b7, [sp, #80] -; CHECK-SD-NEXT: ld1 { v5.b }[2], [x8] +; CHECK-SD-NEXT: ld1 { v0.b }[1], [x8] +; CHECK-SD-NEXT: add x8, sp, #224 +; CHECK-SD-NEXT: fmov s16, w0 +; CHECK-SD-NEXT: ldr b17, [sp, #848] +; CHECK-SD-NEXT: add x10, sp, #24 +; CHECK-SD-NEXT: movi v19.2d, #0000000000000000 +; CHECK-SD-NEXT: ld1 { v0.b }[2], [x8] ; CHECK-SD-NEXT: add x8, sp, #232 -; CHECK-SD-NEXT: add x13, sp, #88 -; CHECK-SD-NEXT: ld1 { v4.b }[2], [x11] -; CHECK-SD-NEXT: ld1 { v7.b }[1], [x13] -; CHECK-SD-NEXT: add x13, sp, #856 -; CHECK-SD-NEXT: mov v0.b[2], w2 -; CHECK-SD-NEXT: add x14, sp, #1008 -; CHECK-SD-NEXT: add x15, sp, #872 -; CHECK-SD-NEXT: ld1 { v5.b }[3], [x8] +; CHECK-SD-NEXT: mov v16.b[1], w1 +; CHECK-SD-NEXT: ld1 { v0.b }[3], [x8] ; CHECK-SD-NEXT: add x8, sp, #240 -; CHECK-SD-NEXT: add x16, sp, #888 -; CHECK-SD-NEXT: add x10, sp, #16 -; CHECK-SD-NEXT: add x9, sp, #24 -; CHECK-SD-NEXT: add x11, sp, #40 -; CHECK-SD-NEXT: movi v2.2d, #0000000000000000 -; CHECK-SD-NEXT: ld1 { v5.b }[4], [x8] +; CHECK-SD-NEXT: mov v16.b[2], w2 +; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] ; CHECK-SD-NEXT: add x8, sp, #248 -; CHECK-SD-NEXT: mov v0.b[3], w3 -; CHECK-SD-NEXT: ld1 { v5.b }[5], [x8] +; CHECK-SD-NEXT: mov v16.b[3], w3 +; CHECK-SD-NEXT: ld1 { v0.b }[5], [x8] ; CHECK-SD-NEXT: add x8, sp, #256 -; CHECK-SD-NEXT: mov v0.b[4], w4 -; CHECK-SD-NEXT: ld1 { v5.b }[6], [x8] +; CHECK-SD-NEXT: ld1 { v0.b }[6], [x8] ; CHECK-SD-NEXT: add x8, sp, #264 -; CHECK-SD-NEXT: mov v0.b[5], w5 -; CHECK-SD-NEXT: ld1 { v5.b }[7], [x8] -; CHECK-SD-NEXT: add x8, sp, #272 -; CHECK-SD-NEXT: ld1 { v5.b }[8], [x8] +; CHECK-SD-NEXT: mov v16.b[4], w4 +; CHECK-SD-NEXT: ld1 { v0.b }[7], [x8] +; CHECK-SD-NEXT: ldr b1, [x9] ; CHECK-SD-NEXT: add x8, sp, #280 -; CHECK-SD-NEXT: mov v0.b[6], w6 -; CHECK-SD-NEXT: ld1 { v5.b }[9], [x8] +; CHECK-SD-NEXT: add x9, sp, #88 +; CHECK-SD-NEXT: mov v16.b[5], w5 +; CHECK-SD-NEXT: ld1 { v1.b }[1], [x8] ; CHECK-SD-NEXT: add x8, sp, #288 -; CHECK-SD-NEXT: mov v0.b[7], w7 -; CHECK-SD-NEXT: ld1 { v5.b }[10], [x8] +; CHECK-SD-NEXT: ld1 { v1.b }[2], [x8] ; CHECK-SD-NEXT: add x8, sp, #296 -; CHECK-SD-NEXT: ld1 { v0.b }[8], [x10] -; CHECK-SD-NEXT: add x10, sp, #128 -; CHECK-SD-NEXT: ld1 { v5.b }[11], [x8] +; CHECK-SD-NEXT: mov v16.b[6], w6 +; CHECK-SD-NEXT: ld1 { v1.b }[3], [x8] ; CHECK-SD-NEXT: add x8, sp, #304 -; CHECK-SD-NEXT: ld1 { v0.b }[9], [x9] -; CHECK-SD-NEXT: add x9, sp, #136 -; CHECK-SD-NEXT: ld1 { v5.b }[12], [x8] +; CHECK-SD-NEXT: mov v16.b[7], w7 +; CHECK-SD-NEXT: ld1 { v1.b }[4], [x8] ; CHECK-SD-NEXT: add x8, sp, #312 -; CHECK-SD-NEXT: ld1 { v5.b }[13], [x8] +; CHECK-SD-NEXT: ld1 { v1.b }[5], [x8] ; CHECK-SD-NEXT: add x8, sp, #320 -; CHECK-SD-NEXT: ld1 { v5.b }[14], [x8] -; CHECK-SD-NEXT: add x8, sp, #32 -; CHECK-SD-NEXT: ld1 { v0.b }[10], [x8] -; CHECK-SD-NEXT: add x8, sp, #144 -; 
CHECK-SD-NEXT: ld1 { v5.b }[15], [x12] -; CHECK-SD-NEXT: add x12, sp, #728 -; CHECK-SD-NEXT: ld1 { v6.b }[1], [x12] -; CHECK-SD-NEXT: add x12, sp, #1000 -; CHECK-SD-NEXT: ld1 { v0.b }[11], [x11] -; CHECK-SD-NEXT: ld1 { v4.b }[3], [x12] -; CHECK-SD-NEXT: add x12, sp, #736 -; CHECK-SD-NEXT: add x11, sp, #920 -; CHECK-SD-NEXT: sdot v3.4s, v5.16b, v1.16b -; CHECK-SD-NEXT: ldr b5, [sp, #848] -; CHECK-SD-NEXT: ld1 { v6.b }[2], [x12] -; CHECK-SD-NEXT: add x12, sp, #48 -; CHECK-SD-NEXT: ld1 { v5.b }[1], [x13] -; CHECK-SD-NEXT: add x13, sp, #744 -; CHECK-SD-NEXT: ld1 { v4.b }[4], [x14] -; CHECK-SD-NEXT: add x14, sp, #96 -; CHECK-SD-NEXT: ld1 { v0.b }[12], [x12] -; CHECK-SD-NEXT: ld1 { v6.b }[3], [x13] -; CHECK-SD-NEXT: add x13, sp, #864 -; CHECK-SD-NEXT: ld1 { v7.b }[2], [x14] -; CHECK-SD-NEXT: add x14, sp, #1016 -; CHECK-SD-NEXT: ld1 { v5.b }[2], [x13] -; CHECK-SD-NEXT: add x13, sp, #752 -; CHECK-SD-NEXT: ld1 { v4.b }[5], [x14] -; CHECK-SD-NEXT: add x14, sp, #104 -; CHECK-SD-NEXT: ld1 { v6.b }[4], [x13] -; CHECK-SD-NEXT: add x13, sp, #1024 -; CHECK-SD-NEXT: ld1 { v7.b }[3], [x14] -; CHECK-SD-NEXT: ld1 { v5.b }[3], [x15] -; CHECK-SD-NEXT: add x15, sp, #760 -; CHECK-SD-NEXT: add x14, sp, #112 -; CHECK-SD-NEXT: ld1 { v4.b }[6], [x13] -; CHECK-SD-NEXT: add x13, sp, #880 -; CHECK-SD-NEXT: ld1 { v6.b }[5], [x15] -; CHECK-SD-NEXT: add x15, sp, #1032 -; CHECK-SD-NEXT: ld1 { v7.b }[4], [x14] -; CHECK-SD-NEXT: ld1 { v5.b }[4], [x13] -; CHECK-SD-NEXT: add x14, sp, #768 -; CHECK-SD-NEXT: add x13, sp, #120 -; CHECK-SD-NEXT: ld1 { v4.b }[7], [x15] -; CHECK-SD-NEXT: add x15, sp, #1040 -; CHECK-SD-NEXT: ld1 { v6.b }[6], [x14] -; CHECK-SD-NEXT: ld1 { v7.b }[5], [x13] -; CHECK-SD-NEXT: add x13, sp, #776 -; CHECK-SD-NEXT: ld1 { v5.b }[5], [x16] -; CHECK-SD-NEXT: add x14, sp, #1048 -; CHECK-SD-NEXT: ld1 { v4.b }[8], [x15] -; CHECK-SD-NEXT: add x15, sp, #896 -; CHECK-SD-NEXT: ld1 { v6.b }[7], [x13] -; CHECK-SD-NEXT: ld1 { v7.b }[6], [x10] -; CHECK-SD-NEXT: add x10, sp, #784 -; CHECK-SD-NEXT: ld1 { v5.b }[6], [x15] -; CHECK-SD-NEXT: add x13, sp, #1056 -; CHECK-SD-NEXT: ld1 { v4.b }[9], [x14] -; CHECK-SD-NEXT: add x14, sp, #904 -; CHECK-SD-NEXT: ld1 { v6.b }[8], [x10] -; CHECK-SD-NEXT: ld1 { v7.b }[7], [x9] -; CHECK-SD-NEXT: add x9, sp, #792 -; CHECK-SD-NEXT: ld1 { v5.b }[7], [x14] -; CHECK-SD-NEXT: add x10, sp, #1064 -; CHECK-SD-NEXT: ld1 { v4.b }[10], [x13] -; CHECK-SD-NEXT: add x13, sp, #912 -; CHECK-SD-NEXT: ld1 { v6.b }[9], [x9] -; CHECK-SD-NEXT: ld1 { v7.b }[8], [x8] -; CHECK-SD-NEXT: add x9, sp, #800 -; CHECK-SD-NEXT: ld1 { v5.b }[8], [x13] +; CHECK-SD-NEXT: ld1 { v1.b }[6], [x8] +; CHECK-SD-NEXT: add x8, sp, #328 +; CHECK-SD-NEXT: ld1 { v1.b }[7], [x8] +; CHECK-SD-NEXT: ld1 { v2.b }[1], [x9] +; CHECK-SD-NEXT: add x8, sp, #96 +; CHECK-SD-NEXT: add x9, sp, #144 +; CHECK-SD-NEXT: ld1 { v2.b }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #104 +; CHECK-SD-NEXT: zip1 v0.2d, v0.2d, v1.2d +; CHECK-SD-NEXT: movi v1.16b, #1 +; CHECK-SD-NEXT: ld1 { v2.b }[3], [x8] +; CHECK-SD-NEXT: add x8, sp, #112 +; CHECK-SD-NEXT: ld1 { v2.b }[4], [x8] +; CHECK-SD-NEXT: add x8, sp, #120 +; CHECK-SD-NEXT: ld1 { v2.b }[5], [x8] +; CHECK-SD-NEXT: add x8, sp, #128 +; CHECK-SD-NEXT: ld1 { v2.b }[6], [x8] +; CHECK-SD-NEXT: add x8, sp, #136 +; CHECK-SD-NEXT: ld1 { v2.b }[7], [x8] +; CHECK-SD-NEXT: ldr b3, [x9] ; CHECK-SD-NEXT: add x8, sp, #152 -; CHECK-SD-NEXT: ld1 { v4.b }[11], [x10] -; CHECK-SD-NEXT: add x10, sp, #1072 -; CHECK-SD-NEXT: ld1 { v6.b }[10], [x9] -; CHECK-SD-NEXT: ld1 { v7.b }[9], [x8] -; CHECK-SD-NEXT: add x9, sp, #808 -; 
CHECK-SD-NEXT: ld1 { v5.b }[9], [x11] -; CHECK-SD-NEXT: add x8, sp, #56 -; CHECK-SD-NEXT: ld1 { v4.b }[12], [x10] -; CHECK-SD-NEXT: add x10, sp, #160 -; CHECK-SD-NEXT: ld1 { v0.b }[13], [x8] -; CHECK-SD-NEXT: ld1 { v6.b }[11], [x9] -; CHECK-SD-NEXT: add x9, sp, #928 -; CHECK-SD-NEXT: ld1 { v7.b }[10], [x10] -; CHECK-SD-NEXT: add x10, sp, #1080 -; CHECK-SD-NEXT: ld1 { v5.b }[10], [x9] +; CHECK-SD-NEXT: add x9, sp, #984 +; CHECK-SD-NEXT: ld1 { v3.b }[1], [x8] +; CHECK-SD-NEXT: add x8, sp, #160 +; CHECK-SD-NEXT: ld1 { v3.b }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #168 +; CHECK-SD-NEXT: ld1 { v3.b }[3], [x8] +; CHECK-SD-NEXT: add x8, sp, #176 +; CHECK-SD-NEXT: ld1 { v3.b }[4], [x8] +; CHECK-SD-NEXT: add x8, sp, #184 +; CHECK-SD-NEXT: ld1 { v3.b }[5], [x8] +; CHECK-SD-NEXT: add x8, sp, #192 +; CHECK-SD-NEXT: ld1 { v3.b }[6], [x8] +; CHECK-SD-NEXT: add x8, sp, #200 +; CHECK-SD-NEXT: ld1 { v3.b }[7], [x8] +; CHECK-SD-NEXT: ld1 { v4.b }[1], [x9] +; CHECK-SD-NEXT: add x8, sp, #992 +; CHECK-SD-NEXT: add x9, sp, #1040 +; CHECK-SD-NEXT: ld1 { v4.b }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #1000 +; CHECK-SD-NEXT: zip1 v2.2d, v2.2d, v3.2d +; CHECK-SD-NEXT: ld1 { v4.b }[3], [x8] +; CHECK-SD-NEXT: add x8, sp, #1008 +; CHECK-SD-NEXT: ld1 { v4.b }[4], [x8] +; CHECK-SD-NEXT: add x8, sp, #1016 +; CHECK-SD-NEXT: ld1 { v4.b }[5], [x8] +; CHECK-SD-NEXT: add x8, sp, #1024 +; CHECK-SD-NEXT: ld1 { v4.b }[6], [x8] +; CHECK-SD-NEXT: add x8, sp, #1032 +; CHECK-SD-NEXT: ld1 { v4.b }[7], [x8] +; CHECK-SD-NEXT: ldr b5, [x9] +; CHECK-SD-NEXT: add x8, sp, #1048 +; CHECK-SD-NEXT: add x9, sp, #728 +; CHECK-SD-NEXT: ld1 { v5.b }[1], [x8] +; CHECK-SD-NEXT: add x8, sp, #1056 +; CHECK-SD-NEXT: ld1 { v5.b }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #1064 +; CHECK-SD-NEXT: ld1 { v5.b }[3], [x8] +; CHECK-SD-NEXT: add x8, sp, #1072 +; CHECK-SD-NEXT: ld1 { v5.b }[4], [x8] +; CHECK-SD-NEXT: add x8, sp, #1080 +; CHECK-SD-NEXT: ld1 { v5.b }[5], [x8] +; CHECK-SD-NEXT: add x8, sp, #1088 +; CHECK-SD-NEXT: ld1 { v5.b }[6], [x8] +; CHECK-SD-NEXT: add x8, sp, #1096 +; CHECK-SD-NEXT: ld1 { v5.b }[7], [x8] +; CHECK-SD-NEXT: ld1 { v6.b }[1], [x9] +; CHECK-SD-NEXT: add x8, sp, #736 +; CHECK-SD-NEXT: add x9, sp, #784 +; CHECK-SD-NEXT: ld1 { v6.b }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #744 +; CHECK-SD-NEXT: zip1 v4.2d, v4.2d, v5.2d +; CHECK-SD-NEXT: movi v5.2d, #0000000000000000 +; CHECK-SD-NEXT: ld1 { v6.b }[3], [x8] +; CHECK-SD-NEXT: add x8, sp, #752 +; CHECK-SD-NEXT: sdot v19.4s, v4.16b, v1.16b +; CHECK-SD-NEXT: sdot v5.4s, v0.16b, v1.16b +; CHECK-SD-NEXT: ld1 { v6.b }[4], [x8] +; CHECK-SD-NEXT: add x8, sp, #760 +; CHECK-SD-NEXT: ld1 { v6.b }[5], [x8] +; CHECK-SD-NEXT: add x8, sp, #768 +; CHECK-SD-NEXT: ld1 { v6.b }[6], [x8] +; CHECK-SD-NEXT: add x8, sp, #776 +; CHECK-SD-NEXT: ld1 { v6.b }[7], [x8] +; CHECK-SD-NEXT: ldr b7, [x9] +; CHECK-SD-NEXT: add x8, sp, #792 +; CHECK-SD-NEXT: add x9, sp, #856 +; CHECK-SD-NEXT: ld1 { v7.b }[1], [x8] +; CHECK-SD-NEXT: add x8, sp, #800 +; CHECK-SD-NEXT: ld1 { v7.b }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #808 +; CHECK-SD-NEXT: ld1 { v7.b }[3], [x8] ; CHECK-SD-NEXT: add x8, sp, #816 -; CHECK-SD-NEXT: ld1 { v4.b }[13], [x10] -; CHECK-SD-NEXT: add x9, sp, #168 -; CHECK-SD-NEXT: add x10, sp, #176 -; CHECK-SD-NEXT: ld1 { v6.b }[12], [x8] -; CHECK-SD-NEXT: add x8, sp, #936 -; CHECK-SD-NEXT: ld1 { v7.b }[11], [x9] -; CHECK-SD-NEXT: add x9, sp, #1088 -; CHECK-SD-NEXT: ld1 { v5.b }[11], [x8] -; CHECK-SD-NEXT: add x8, sp, #64 -; CHECK-SD-NEXT: ld1 { v4.b }[14], [x9] -; CHECK-SD-NEXT: add x9, sp, #824 -; CHECK-SD-NEXT: ld1 
{ v0.b }[14], [x8] -; CHECK-SD-NEXT: ld1 { v6.b }[13], [x9] -; CHECK-SD-NEXT: add x9, sp, #944 -; CHECK-SD-NEXT: ld1 { v7.b }[12], [x10] -; CHECK-SD-NEXT: add x10, sp, #1096 -; CHECK-SD-NEXT: ld1 { v5.b }[12], [x9] +; CHECK-SD-NEXT: ld1 { v7.b }[4], [x8] +; CHECK-SD-NEXT: add x8, sp, #824 +; CHECK-SD-NEXT: ld1 { v7.b }[5], [x8] ; CHECK-SD-NEXT: add x8, sp, #832 -; CHECK-SD-NEXT: ld1 { v4.b }[15], [x10] -; CHECK-SD-NEXT: add x9, sp, #184 -; CHECK-SD-NEXT: add x10, sp, #72 -; CHECK-SD-NEXT: ld1 { v6.b }[14], [x8] -; CHECK-SD-NEXT: add x8, sp, #952 -; CHECK-SD-NEXT: ld1 { v7.b }[13], [x9] -; CHECK-SD-NEXT: ld1 { v5.b }[13], [x8] +; CHECK-SD-NEXT: ld1 { v7.b }[6], [x8] ; CHECK-SD-NEXT: add x8, sp, #840 -; CHECK-SD-NEXT: ld1 { v0.b }[15], [x10] -; CHECK-SD-NEXT: sdot v2.4s, v4.16b, v1.16b -; CHECK-SD-NEXT: add x9, sp, #192 -; CHECK-SD-NEXT: ld1 { v6.b }[15], [x8] +; CHECK-SD-NEXT: ld1 { v7.b }[7], [x8] +; CHECK-SD-NEXT: ld1 { v17.b }[1], [x9] +; CHECK-SD-NEXT: add x8, sp, #864 +; CHECK-SD-NEXT: add x9, sp, #16 +; CHECK-SD-NEXT: ld1 { v16.b }[8], [x9] +; CHECK-SD-NEXT: add x9, sp, #912 +; CHECK-SD-NEXT: ld1 { v17.b }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #872 +; CHECK-SD-NEXT: zip1 v0.2d, v6.2d, v7.2d +; CHECK-SD-NEXT: ld1 { v16.b }[9], [x10] +; CHECK-SD-NEXT: ld1 { v17.b }[3], [x8] +; CHECK-SD-NEXT: add x8, sp, #880 +; CHECK-SD-NEXT: sdot v19.4s, v0.16b, v1.16b +; CHECK-SD-NEXT: ld1 { v17.b }[4], [x8] +; CHECK-SD-NEXT: add x8, sp, #888 +; CHECK-SD-NEXT: ld1 { v17.b }[5], [x8] +; CHECK-SD-NEXT: add x8, sp, #896 +; CHECK-SD-NEXT: ld1 { v17.b }[6], [x8] +; CHECK-SD-NEXT: add x8, sp, #904 +; CHECK-SD-NEXT: ld1 { v17.b }[7], [x8] +; CHECK-SD-NEXT: ldr b18, [x9] +; CHECK-SD-NEXT: add x8, sp, #920 +; CHECK-SD-NEXT: ld1 { v18.b }[1], [x8] +; CHECK-SD-NEXT: add x8, sp, #32 +; CHECK-SD-NEXT: ld1 { v16.b }[10], [x8] +; CHECK-SD-NEXT: add x8, sp, #928 +; CHECK-SD-NEXT: ld1 { v18.b }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #40 +; CHECK-SD-NEXT: ld1 { v16.b }[11], [x8] +; CHECK-SD-NEXT: add x8, sp, #936 +; CHECK-SD-NEXT: ld1 { v18.b }[3], [x8] +; CHECK-SD-NEXT: add x8, sp, #48 +; CHECK-SD-NEXT: ld1 { v16.b }[12], [x8] +; CHECK-SD-NEXT: add x8, sp, #944 +; CHECK-SD-NEXT: ld1 { v18.b }[4], [x8] +; CHECK-SD-NEXT: add x8, sp, #56 +; CHECK-SD-NEXT: ld1 { v16.b }[13], [x8] +; CHECK-SD-NEXT: add x8, sp, #952 +; CHECK-SD-NEXT: ld1 { v18.b }[5], [x8] +; CHECK-SD-NEXT: add x8, sp, #64 +; CHECK-SD-NEXT: ld1 { v16.b }[14], [x8] ; CHECK-SD-NEXT: add x8, sp, #960 -; CHECK-SD-NEXT: ld1 { v7.b }[14], [x9] -; CHECK-SD-NEXT: ld1 { v5.b }[14], [x8] -; CHECK-SD-NEXT: sdot v3.4s, v0.16b, v1.16b -; CHECK-SD-NEXT: add x8, sp, #200 -; CHECK-SD-NEXT: add x9, sp, #968 -; CHECK-SD-NEXT: sdot v2.4s, v6.16b, v1.16b -; CHECK-SD-NEXT: ld1 { v7.b }[15], [x8] -; CHECK-SD-NEXT: ld1 { v5.b }[15], [x9] -; CHECK-SD-NEXT: sdot v3.4s, v7.16b, v1.16b -; CHECK-SD-NEXT: sdot v2.4s, v5.16b, v1.16b -; CHECK-SD-NEXT: add v0.4s, v3.4s, v2.4s +; CHECK-SD-NEXT: ld1 { v18.b }[6], [x8] +; CHECK-SD-NEXT: add x8, sp, #72 +; CHECK-SD-NEXT: ld1 { v16.b }[15], [x8] +; CHECK-SD-NEXT: add x8, sp, #968 +; CHECK-SD-NEXT: ld1 { v18.b }[7], [x8] +; CHECK-SD-NEXT: sdot v5.4s, v16.16b, v1.16b +; CHECK-SD-NEXT: zip1 v0.2d, v17.2d, v18.2d +; CHECK-SD-NEXT: sdot v5.4s, v2.16b, v1.16b +; CHECK-SD-NEXT: sdot v19.4s, v0.16b, v1.16b +; CHECK-SD-NEXT: add v0.4s, v5.4s, v19.4s ; CHECK-SD-NEXT: addv s0, v0.4s ; CHECK-SD-NEXT: fmov w0, s0 ; CHECK-SD-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/nontemporal.ll 
b/llvm/test/CodeGen/AArch64/nontemporal.ll index f8ba150a0405f..f7a87ae340a73 100644 --- a/llvm/test/CodeGen/AArch64/nontemporal.ll +++ b/llvm/test/CodeGen/AArch64/nontemporal.ll @@ -683,41 +683,43 @@ define void @test_stnp_v17f32(<17 x float> %v, ptr %ptr) { ; ; CHECK-BE-LABEL: test_stnp_v17f32: ; CHECK-BE: // %bb.0: // %entry -; CHECK-BE-NEXT: // kill: def $s4 killed $s4 def $q4 +; CHECK-BE-NEXT: // kill: def $s1 killed $s1 def $q1 ; CHECK-BE-NEXT: // kill: def $s0 killed $s0 def $q0 -; CHECK-BE-NEXT: ldr s16, [sp, #36] +; CHECK-BE-NEXT: // kill: def $s4 killed $s4 def $q4 ; CHECK-BE-NEXT: // kill: def $s5 killed $s5 def $q5 -; CHECK-BE-NEXT: // kill: def $s1 killed $s1 def $q1 -; CHECK-BE-NEXT: ldr s17, [sp, #4] -; CHECK-BE-NEXT: add x8, sp, #44 -; CHECK-BE-NEXT: mov v4.s[1], v5.s[0] +; CHECK-BE-NEXT: add x8, sp, #12 +; CHECK-BE-NEXT: add x9, sp, #20 +; CHECK-BE-NEXT: ldr s16, [sp, #36] ; CHECK-BE-NEXT: mov v0.s[1], v1.s[0] +; CHECK-BE-NEXT: ldr s1, [sp, #4] +; CHECK-BE-NEXT: mov v4.s[1], v5.s[0] +; CHECK-BE-NEXT: add x10, sp, #52 ; CHECK-BE-NEXT: // kill: def $s6 killed $s6 def $q6 ; CHECK-BE-NEXT: // kill: def $s2 killed $s2 def $q2 ; CHECK-BE-NEXT: // kill: def $s7 killed $s7 def $q7 ; CHECK-BE-NEXT: // kill: def $s3 killed $s3 def $q3 -; CHECK-BE-NEXT: ldr s1, [sp, #68] -; CHECK-BE-NEXT: ld1 { v16.s }[1], [x8] -; CHECK-BE-NEXT: add x8, sp, #12 -; CHECK-BE-NEXT: ld1 { v17.s }[1], [x8] -; CHECK-BE-NEXT: add x8, sp, #52 -; CHECK-BE-NEXT: str s1, [x0, #64] -; CHECK-BE-NEXT: ld1 { v16.s }[2], [x8] -; CHECK-BE-NEXT: add x8, sp, #20 +; CHECK-BE-NEXT: ld1 { v1.s }[1], [x8] +; CHECK-BE-NEXT: ldr s5, [x9] +; CHECK-BE-NEXT: add x8, sp, #28 +; CHECK-BE-NEXT: add x9, sp, #44 +; CHECK-BE-NEXT: ld1 { v5.s }[1], [x8] +; CHECK-BE-NEXT: ld1 { v16.s }[1], [x9] +; CHECK-BE-NEXT: ldr s17, [x10] +; CHECK-BE-NEXT: add x8, sp, #60 ; CHECK-BE-NEXT: mov v4.s[2], v6.s[0] ; CHECK-BE-NEXT: mov v0.s[2], v2.s[0] -; CHECK-BE-NEXT: ld1 { v17.s }[2], [x8] -; CHECK-BE-NEXT: add x8, sp, #60 -; CHECK-BE-NEXT: ld1 { v16.s }[3], [x8] -; CHECK-BE-NEXT: add x8, sp, #28 -; CHECK-BE-NEXT: ld1 { v17.s }[3], [x8] +; CHECK-BE-NEXT: ld1 { v17.s }[1], [x8] +; CHECK-BE-NEXT: ldr s2, [sp, #68] +; CHECK-BE-NEXT: add x8, x0, #32 +; CHECK-BE-NEXT: zip1 v1.2d, v1.2d, v5.2d +; CHECK-BE-NEXT: add x9, x0, #48 +; CHECK-BE-NEXT: str s2, [x0, #64] +; CHECK-BE-NEXT: zip1 v5.2d, v16.2d, v17.2d ; CHECK-BE-NEXT: mov v4.s[3], v7.s[0] -; CHECK-BE-NEXT: add x8, x0, #48 ; CHECK-BE-NEXT: mov v0.s[3], v3.s[0] -; CHECK-BE-NEXT: st1 { v16.4s }, [x8] -; CHECK-BE-NEXT: add x8, x0, #32 -; CHECK-BE-NEXT: st1 { v17.4s }, [x8] +; CHECK-BE-NEXT: st1 { v1.4s }, [x8] ; CHECK-BE-NEXT: add x8, x0, #16 +; CHECK-BE-NEXT: st1 { v5.4s }, [x9] ; CHECK-BE-NEXT: st1 { v4.4s }, [x8] ; CHECK-BE-NEXT: st1 { v0.4s }, [x0] ; CHECK-BE-NEXT: ret From d4bd7c911b71bf2af981f4fa3cc30a190b1cffbb Mon Sep 17 00:00:00 2001 From: Jonathan Cohen Date: Sun, 20 Jul 2025 11:19:38 +0300 Subject: [PATCH 2/3] [AArch64][MachineCombiner] Fix setting reg state for gather lane pattern --- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 38 ++++++++++++------- .../AArch64/aarch64-combine-gather-lanes.mir | 18 ++++----- 2 files changed, 34 insertions(+), 22 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 06bcaa1698aab..1d228ea517534 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -22,6 +22,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" 
#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/iterator_range.h" #include "llvm/CodeGen/CFIInstBuilder.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -7514,14 +7515,15 @@ generateGatherPattern(MachineInstr &Root, auto LoadLaneToRegister = [&](MachineInstr *OriginalInstr, Register SrcRegister, unsigned Lane, - Register OffsetRegister) { + Register OffsetRegister, + bool OffsetRegisterKillState) { auto NewRegister = MRI.createVirtualRegister(FPR128RegClass); MachineInstrBuilder LoadIndexIntoRegister = BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()), NewRegister) .addReg(SrcRegister) .addImm(Lane) - .addReg(OffsetRegister, getKillRegState(true)); + .addReg(OffsetRegister, getKillRegState(OffsetRegisterKillState)); InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size())); InsInstrs.push_back(LoadIndexIntoRegister); return NewRegister; @@ -7529,7 +7531,8 @@ generateGatherPattern(MachineInstr &Root, // Helper to create load instruction based on opcode auto CreateLoadInstruction = [&](unsigned NumLanes, Register DestReg, - Register OffsetReg) -> MachineInstrBuilder { + Register OffsetReg, + bool KillState) -> MachineInstrBuilder { unsigned Opcode; switch (NumLanes) { case 4: @@ -7555,25 +7558,30 @@ generateGatherPattern(MachineInstr &Root, auto LanesToLoadToReg0 = llvm::make_range(LoadToLaneInstrsAscending.begin() + 1, LoadToLaneInstrsAscending.begin() + NumLanes / 2); - auto PrevReg = SubregToReg->getOperand(0).getReg(); + Register PrevReg = SubregToReg->getOperand(0).getReg(); for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg0)) { + const MachineOperand &OffsetRegOperand = LoadInstr->getOperand(3); PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1, - LoadInstr->getOperand(3).getReg()); + OffsetRegOperand.getReg(), + OffsetRegOperand.isKill()); DelInstrs.push_back(LoadInstr); } - auto LastLoadReg0 = PrevReg; + Register LastLoadReg0 = PrevReg; // First load into register 1. Perform a LDRSui to zero out the upper lanes in // a single instruction. - auto Lane0Load = *LoadToLaneInstrsAscending.begin(); - auto OriginalSplitLoad = + MachineInstr *Lane0Load = *LoadToLaneInstrsAscending.begin(); + MachineInstr *OriginalSplitLoad = *std::next(LoadToLaneInstrsAscending.begin(), NumLanes / 2); - auto DestRegForMiddleIndex = MRI.createVirtualRegister( + Register DestRegForMiddleIndex = MRI.createVirtualRegister( MRI.getRegClass(Lane0Load->getOperand(0).getReg())); + const MachineOperand &OriginalSplitToLoadOffsetOperand = + OriginalSplitLoad->getOperand(3); MachineInstrBuilder MiddleIndexLoadInstr = CreateLoadInstruction(NumLanes, DestRegForMiddleIndex, - OriginalSplitLoad->getOperand(3).getReg()); + OriginalSplitToLoadOffsetOperand.getReg(), + OriginalSplitToLoadOffsetOperand.isKill()); InstrIdxForVirtReg.insert( std::make_pair(DestRegForMiddleIndex, InsInstrs.size())); @@ -7581,7 +7589,7 @@ generateGatherPattern(MachineInstr &Root, DelInstrs.push_back(OriginalSplitLoad); // Subreg To Reg instruction for register 1. 
- auto DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass); + Register DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass); unsigned SubregType; switch (NumLanes) { case 4: @@ -7614,14 +7622,18 @@ generateGatherPattern(MachineInstr &Root, LoadToLaneInstrsAscending.end()); PrevReg = SubRegToRegInstr->getOperand(0).getReg(); for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg1)) { + const MachineOperand &OffsetRegOperand = LoadInstr->getOperand(3); PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1, - LoadInstr->getOperand(3).getReg()); + OffsetRegOperand.getReg(), + OffsetRegOperand.isKill()); + + // Do not add the last reg to DelInstrs - it will be removed later. if (Index == NumLanes / 2 - 2) { break; } DelInstrs.push_back(LoadInstr); } - auto LastLoadReg1 = PrevReg; + Register LastLoadReg1 = PrevReg; // Create the final zip instruction to combine the results. MachineInstrBuilder ZipInstr = diff --git a/llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes.mir b/llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes.mir index 09eb18b0e3574..5cddf92fdbb4c 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes.mir +++ b/llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes.mir @@ -13,12 +13,12 @@ body: | ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4 - ; CHECK-NEXT: [[LD_i32:%[0-9]+]]:fpr32 = LDRSroX [[COPY]], killed [[COPY1]], 0, 1 - ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i32]], %subreg.ssub - ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i32 [[FIRST_REG]], 1, killed [[COPY2]] + ; CHECK-NEXT: [[LD_i32:%[0-9]+]]:fpr32 = LDRSroX [[COPY]], [[COPY1]], 0, 1 + ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, [[LD_i32]], %subreg.ssub + ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i32 [[FIRST_REG]], 1, [[COPY2]] ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr32 = LDRSui [[COPY3]], 0 ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.ssub - ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i32 [[SECOND_REG]], 1, killed [[COPY4]] + ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i32 [[SECOND_REG]], 1, [[COPY4]] ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_1]], [[LD1_1]] ; CHECK-NEXT: $q0 = COPY [[ZIP]] ; CHECK-NEXT: RET_ReallyLR implicit $q0 @@ -27,11 +27,11 @@ body: | %2:gpr64common = COPY $x2 %3:gpr64common = COPY $x3 %4:gpr64common = COPY $x4 - %5:fpr32 = LDRSroX %0, killed %1, 0, 1 - %6:fpr128 = SUBREG_TO_REG 0, killed %5, %subreg.ssub - %7:fpr128 = LD1i32 %6, 1, killed %2 - %8:fpr128 = LD1i32 %7, 2, killed %3 - %9:fpr128 = LD1i32 %8, 3, killed %4 + %5:fpr32 = LDRSroX %0, %1, 0, 1 + %6:fpr128 = SUBREG_TO_REG 0, %5, %subreg.ssub + %7:fpr128 = LD1i32 %6, 1, %2 + %8:fpr128 = LD1i32 %7, 2, %3 + %9:fpr128 = LD1i32 %8, 3, %4 $q0 = COPY %9 RET_ReallyLR implicit $q0 From 76289e1dc67d012a1c4bcf752e89c448276e74d4 Mon Sep 17 00:00:00 2001 From: Jonathan Cohen Date: Wed, 30 Jul 2025 09:36:28 +0300 Subject: [PATCH 3/3] [AArch64] [MachineCombiner] Verify load instructions in gather pattern cannot possibly overlap --- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 24 +++++++++- ...aarch64-combine-gather-lanes-with-call.mir | 45 +++++++++++++++++++ .../aarch64-combine-gather-with-alias.mir | 42 +++++++++++++++++ 3 files changed, 110 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes-with-call.mir create mode 
100644 llvm/test/CodeGen/AArch64/aarch64-combine-gather-with-alias.mir

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 1d228ea517534..bd451aa1fc79b 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -22,6 +22,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/CodeGen/CFIInstBuilder.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
@@ -86,6 +87,11 @@ static cl::opt
     BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26),
                       cl::desc("Restrict range of B instructions (DEBUG)"));
 
+#define DEBUG_TYPE "aarch64-machine-combine"
+STATISTIC(NumGathersMatched, "Number of `gather`-like patterns matched");
+STATISTIC(NumGathersDroppedAliasing, "Number of `gather`-like patterns dropped "
+                                     "due to potential pointer aliasing");
+
 AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
     : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP,
                           AArch64::CATCHRET),
@@ -7416,14 +7422,21 @@ static bool getGatherPattern(MachineInstr &Root,
   // 1. It has a single non-debug use (since we will be replacing the virtual
   // register)
   // 2. That the addressing mode only uses a single offset register.
+  // 3. The address operand does not have any users that are a COPY operation to
+  // a physical reg.
+  // This could indicate that it is copied as part of the ABI of a function
+  // call, which means that it may be modified in unexpected ways, see:
   auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
   auto Range = llvm::seq(1, NumLanes - 1);
-  SmallSet RemainingLanes(Range.begin(), Range.end());
+  SmallSet RemainingLanes(Range.begin(), Range.end());
+  SmallSet LoadInstrs = {};
   while (!RemainingLanes.empty() && CurrInstr &&
          CurrInstr->getOpcode() == LoadLaneOpCode &&
          MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg()) &&
          CurrInstr->getNumOperands() == 4) {
     RemainingLanes.erase(CurrInstr->getOperand(2).getImm());
+    LoadInstrs.insert(CurrInstr);
     CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg());
   }
 
@@ -7444,6 +7457,15 @@
   if (!MRI.hasOneNonDBGUse(Lane0LoadReg))
     return false;
 
+  LoadInstrs.insert(MRI.getUniqueVRegDef(Lane0LoadReg));
+
+  // Conservatively check that the loads are known to be pairwise disjoint.
+  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+  for (auto LoadA = LoadInstrs.begin(); LoadA != LoadInstrs.end(); ++LoadA)
+    for (auto LoadB = std::next(LoadA); LoadB != LoadInstrs.end(); ++LoadB)
+      if (!TII->areMemAccessesTriviallyDisjoint(**LoadA, **LoadB))
+        return false;
+
   switch (NumLanes) {
   case 4:
     Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i32);
diff --git a/llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes-with-call.mir b/llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes-with-call.mir
new file mode 100644
index 0000000000000..6b338d98afb53
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes-with-call.mir
@@ -0,0 +1,47 @@
+# RUN: llc -run-pass=machine-combiner -mcpu=neoverse-n2 -mtriple=aarch64-none-linux-gnu -verify-machineinstrs %s -o - | FileCheck %s
+
+
+--- |
+  @external_func = external global i32
+  define void @negative_pattern_offset_reg_copied_to_physical(i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4) {
+  entry:
+    ret void
+  }
+...
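+# Check that the gather pattern is not applied when the offset register is
+# copied to a physical register for a call between the lane loads.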
+---
+name: negative_pattern_offset_reg_copied_to_physical
+body: |
+  bb.0.entry:
+    liveins: $x0, $x1, $x2, $x3
+
+    ; CHECK-LABEL: name: negative_pattern_offset_reg_copied_to_physical
+    ; CHECK: [[BASE_REG:%[0-9]+]]:gpr64common = COPY $x0
+    ; CHECK-NEXT: [[PTR_1:%[0-9]+]]:gpr64common = COPY $x1
+    ; CHECK-NEXT: [[PTR_2:%[0-9]+]]:gpr64common = COPY $x2
+    ; CHECK-NEXT: [[PTR_3:%[0-9]+]]:gpr64common = COPY $x3
+    ; CHECK-NEXT: [[LD_i32:%[0-9]+]]:fpr32 = LDRSroX [[BASE_REG]], killed [[PTR_1]], 0, 1
+    ; CHECK-NEXT: [[LD_LANE_0:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i32]], %subreg.ssub
+    ; CHECK-NEXT: [[LD_LANE_1:%[0-9]+]]:fpr128 = LD1i32 [[LD_LANE_0]], 1, [[PTR_2]]
+    ; CHECK-NEXT: $x0 = COPY [[PTR_2]]
+    ; CHECK-NEXT: BL @external_func, csr_aarch64_aapcs, implicit-def $lr, implicit $x0, implicit-def $x0
+    ; CHECK-NEXT: [[LD_LANE_2:%[0-9]+]]:fpr128 = LD1i32 [[LD_LANE_1]], 2, killed [[PTR_2]]
+    ; CHECK-NEXT: [[LD_LANE_3:%[0-9]+]]:fpr128 = LD1i32 [[LD_LANE_2]], 3, killed [[PTR_3]]
+    ; CHECK-NEXT: [[RESULT:%[0-9]+]]:gpr64common = COPY $x0
+    ; CHECK-NEXT: $q0 = COPY [[LD_LANE_3]]
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
+    %0:gpr64common = COPY $x0
+    %1:gpr64common = COPY $x1
+    %2:gpr64common = COPY $x2
+    %3:gpr64common = COPY $x3
+    %5:fpr32 = LDRSroX %0, killed %1, 0, 1
+    %6:fpr128 = SUBREG_TO_REG 0, killed %5, %subreg.ssub
+    %7:fpr128 = LD1i32 %6, 1, %2
+    $x0 = COPY %2
+    BL @external_func, csr_aarch64_aapcs, implicit-def $lr, implicit $x0, implicit-def $x0
+    %8:fpr128 = LD1i32 %7, 2, killed %2
+    %9:fpr128 = LD1i32 %8, 3, killed %3
+    %10:gpr64common = COPY $x0
+    $q0 = COPY %9
+    RET_ReallyLR implicit $q0
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AArch64/aarch64-combine-gather-with-alias.mir b/llvm/test/CodeGen/AArch64/aarch64-combine-gather-with-alias.mir
new file mode 100644
index 0000000000000..f4d4678a207d7
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-combine-gather-with-alias.mir
@@ -0,0 +1,42 @@
+# RUN: llc -run-pass=machine-combiner -mcpu=neoverse-n2 -mtriple=aarch64-none-linux-gnu -verify-machineinstrs %s -o - | FileCheck %s
+
+--- |
+  define void @aliasing_store_between_vector_loads(i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3) {
+  entry:
+    ret void
+  }
+
+...
+---
+name: aliasing_store_between_vector_loads
+alignment: 4
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $x0, $x1, $x2, $x3
+
+    ; CHECK-LABEL: name: aliasing_store_between_vector_loads
+    ; CHECK: [[BASE_PTR:%[0-9]+]]:gpr64common = COPY $x0
+    ; CHECK-NEXT: [[OFFSET_PTR:%[0-9]+]]:gpr64common = COPY $x1
+    ; CHECK-NEXT: [[ALIAS_ADDR:%[0-9]+]]:gpr64common = COPY $x2
+    ; CHECK-NEXT: [[OTHER_ADDR:%[0-9]+]]:gpr64common = COPY $x3
+    ; CHECK: [[VEC1:%[0-9]+]]:fpr128 = LD1i32 %{{[0-9]+}}, 1, [[ALIAS_ADDR]]
+    ; CHECK-NEXT: [[CONST:%[0-9]+]]:gpr32 = MOVi32imm 99
+    ; CHECK-NEXT: STRWui [[CONST]], [[ALIAS_ADDR]], 0
+    ; CHECK-NEXT: [[VEC2:%[0-9]+]]:fpr128 = LD1i32 [[VEC1]], 2, killed [[ALIAS_ADDR]]
+    ; CHECK-NEXT: [[VEC3:%[0-9]+]]:fpr128 = LD1i32 [[VEC2]], 3, killed [[OTHER_ADDR]]
+    ; CHECK-NEXT: $q0 = COPY [[VEC3]]
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
+    %0:gpr64common = COPY $x0
+    %1:gpr64common = COPY $x1
+    %2:gpr64common = COPY $x2
+    %3:gpr64common = COPY $x3
+    %5:fpr32 = LDRSroX %0, killed %1, 0, 1
+    %6:fpr128 = SUBREG_TO_REG 0, killed %5, %subreg.ssub
+    %7:fpr128 = LD1i32 %6, 1, %2
+    %10:gpr32 = MOVi32imm 99
+    STRWui %10, %2, 0
+    %8:fpr128 = LD1i32 %7, 2, killed %2
+    %9:fpr128 = LD1i32 %8, 3, killed %3
+    $q0 = COPY %9
+    RET_ReallyLR implicit $q0