Skip to content

Commit 83e3f0c

Browse files
authored
Merge branch 'main' into QTCREATORBUG-27788-friend-def-in-outline
2 parents 211948b + 5294793 commit 83e3f0c

File tree

8 files changed

+222
-164
lines changed

8 files changed

+222
-164
lines changed

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -979,12 +979,10 @@ InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
979979
Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
980980
bool UseMaskForCond, bool UseMaskForGaps) const {
981981

982-
// The interleaved memory access pass will lower (de)interleave ops combined
983-
// with an adjacent appropriate memory to vlseg/vsseg intrinsics. vlseg/vsseg
984-
// only support masking per-iteration (i.e. condition), not per-segment (i.e.
985-
// gap).
986-
// TODO: Support masked interleaved access for fixed length vector.
987-
if ((isa<ScalableVectorType>(VecTy) || !UseMaskForCond) && !UseMaskForGaps &&
982+
// The interleaved memory access pass will lower interleaved memory ops (i.e.
983+
// a load and store followed by a specific shuffle) to vlseg/vsseg
984+
// intrinsics.
985+
if (!UseMaskForCond && !UseMaskForGaps &&
988986
Factor <= TLI->getMaxSupportedInterleaveFactor()) {
989987
auto *VTy = cast<VectorType>(VecTy);
990988
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VTy);

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -398,10 +398,6 @@ class RISCVTTIImpl final : public BasicTTIImplBase<RISCVTTIImpl> {
398398

399399
bool enableInterleavedAccessVectorization() const override { return true; }
400400

401-
bool enableMaskedInterleavedAccessVectorization() const override {
402-
return ST->hasVInstructions();
403-
}
404-
405401
unsigned getMinTripCountTailFoldingThreshold() const override;
406402

407403
enum RISCVRegisterClass { GPRRC, FPRRC, VRRC };

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1359,9 +1359,7 @@ class LoopVectorizationCostModel {
13591359
return;
13601360
// Override EVL styles if needed.
13611361
// FIXME: Investigate opportunity for fixed vector factor.
1362-
// FIXME: Support interleave accesses.
13631362
bool EVLIsLegal = UserIC <= 1 && IsScalableVF &&
1364-
!InterleaveInfo.hasGroups() &&
13651363
TTI.hasActiveVectorLength() && !EnableVPlanNativePath;
13661364
if (EVLIsLegal)
13671365
return;

llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll

Lines changed: 167 additions & 121 deletions
Large diffs are not rendered by default.

llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-interleave.ll

Lines changed: 22 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -21,33 +21,36 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
2121
; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP6]]
2222
; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]]
2323
; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
24-
; IF-EVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
2524
; IF-EVL-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
2625
; IF-EVL-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
27-
; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
28-
; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
26+
; IF-EVL-NEXT: [[TMP10:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
27+
; IF-EVL-NEXT: [[TMP12:%.*]] = mul <vscale x 4 x i64> [[TMP10]], splat (i64 1)
28+
; IF-EVL-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP12]]
2929
; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
3030
; IF-EVL: vector.body:
31+
; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
3132
; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
32-
; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EVL_BASED_IV]], i64 0
33+
; IF-EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
34+
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
35+
; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true)
36+
; IF-EVL-NEXT: [[TMP13:%.*]] = zext i32 [[TMP11]] to i64
37+
; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP13]]
38+
; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP9]], i64 0
3339
; IF-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
34-
; IF-EVL-NEXT: [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
35-
; IF-EVL-NEXT: [[TMP10:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP9]]
36-
; IF-EVL-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT2]], [[TMP10]]
37-
; IF-EVL-NEXT: [[TMP11:%.*]] = icmp ule <vscale x 4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
38-
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], i64 [[EVL_BASED_IV]], i32 0
39-
; IF-EVL-NEXT: [[INTERLEAVED_MASK:%.*]] = call <vscale x 8 x i1> @llvm.vector.interleave2.nxv8i1(<vscale x 4 x i1> [[TMP11]], <vscale x 4 x i1> [[TMP11]])
40-
; IF-EVL-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.masked.load.nxv8i32.p0(ptr [[TMP12]], i32 4, <vscale x 8 x i1> [[INTERLEAVED_MASK]], <vscale x 8 x i32> poison)
41-
; IF-EVL-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_MASKED_VEC]])
42-
; IF-EVL-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
43-
; IF-EVL-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
40+
; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i32], ptr [[B:%.*]], <vscale x 4 x i64> [[VEC_IND]], i32 0
41+
; IF-EVL-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP21]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
42+
; IF-EVL-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i32], ptr [[B]], <vscale x 4 x i64> [[VEC_IND]], i32 1
43+
; IF-EVL-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> align 4 [[TMP23]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
4444
; IF-EVL-NEXT: [[TMP26:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_MASKED_GATHER5]], [[WIDE_MASKED_GATHER3]]
4545
; IF-EVL-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[EVL_BASED_IV]]
4646
; IF-EVL-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 0
47-
; IF-EVL-NEXT: call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP26]], ptr [[TMP29]], i32 4, <vscale x 4 x i1> [[TMP11]])
48-
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP8]]
49-
; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N_VEC]]
50-
; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
47+
; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP26]], ptr align 4 [[TMP29]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP11]])
48+
; IF-EVL-NEXT: [[TMP15:%.*]] = zext i32 [[TMP11]] to i64
49+
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP15]], [[EVL_BASED_IV]]
50+
; IF-EVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
51+
; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT2]]
52+
; IF-EVL-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
53+
; IF-EVL-NEXT: br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
5154
; IF-EVL: middle.block:
5255
; IF-EVL-NEXT: br label [[FOR_COND_CLEANUP:%.*]]
5356
; IF-EVL: scalar.ph:
@@ -64,7 +67,7 @@ define void @interleave(ptr noalias %a, ptr noalias %b, i64 %N) {
6467
; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
6568
; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
6669
; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
67-
; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
70+
; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
6871
; IF-EVL: for.cond.cleanup:
6972
; IF-EVL-NEXT: ret void
7073
;

offload/plugins-nextgen/common/include/JIT.h

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,10 @@ struct JITEngine {
5555
process(const __tgt_device_image &Image,
5656
target::plugin::GenericDeviceTy &Device);
5757

58+
/// Remove \p Image from the JIT engine's cache.
59+
void erase(const __tgt_device_image &Image,
60+
target::plugin::GenericDeviceTy &Device);
61+
5862
private:
5963
/// Compile the bitcode image \p Image and generate the binary image that can
6064
/// be loaded to the target device of the triple \p Triple architecture \p
@@ -89,11 +93,13 @@ struct JITEngine {
8993
/// LLVM Context in which the modules will be constructed.
9094
LLVMContext Context;
9195

92-
/// Output images generated from LLVM backend.
93-
SmallVector<std::unique_ptr<MemoryBuffer>, 4> JITImages;
96+
/// A map of embedded IR images to the buffer used to store JITed code.
97+
DenseMap<const __tgt_device_image *, std::unique_ptr<MemoryBuffer>>
98+
JITImages;
9499

95100
/// A map of embedded IR images to JITed images.
96-
DenseMap<const __tgt_device_image *, __tgt_device_image *> TgtImageMap;
101+
DenseMap<const __tgt_device_image *, std::unique_ptr<__tgt_device_image>>
102+
TgtImageMap;
97103
};
98104

99105
/// Map from (march) "CPUs" (e.g., sm_80, or gfx90a), which we call compute

offload/plugins-nextgen/common/src/JIT.cpp

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -285,8 +285,8 @@ JITEngine::compile(const __tgt_device_image &Image,
285285

286286
// Check if we JITed this image for the given compute unit kind before.
287287
ComputeUnitInfo &CUI = ComputeUnitMap[ComputeUnitKind];
288-
if (__tgt_device_image *JITedImage = CUI.TgtImageMap.lookup(&Image))
289-
return JITedImage;
288+
if (CUI.TgtImageMap.contains(&Image))
289+
return CUI.TgtImageMap[&Image].get();
290290

291291
auto ObjMBOrErr = getOrCreateObjFile(Image, CUI.Context, ComputeUnitKind);
292292
if (!ObjMBOrErr)
@@ -296,17 +296,15 @@ JITEngine::compile(const __tgt_device_image &Image,
296296
if (!ImageMBOrErr)
297297
return ImageMBOrErr.takeError();
298298

299-
CUI.JITImages.push_back(std::move(*ImageMBOrErr));
300-
__tgt_device_image *&JITedImage = CUI.TgtImageMap[&Image];
301-
JITedImage = new __tgt_device_image();
299+
CUI.JITImages.insert({&Image, std::move(*ImageMBOrErr)});
300+
auto &ImageMB = CUI.JITImages[&Image];
301+
CUI.TgtImageMap.insert({&Image, std::make_unique<__tgt_device_image>()});
302+
auto &JITedImage = CUI.TgtImageMap[&Image];
302303
*JITedImage = Image;
303-
304-
auto &ImageMB = CUI.JITImages.back();
305-
306304
JITedImage->ImageStart = const_cast<char *>(ImageMB->getBufferStart());
307305
JITedImage->ImageEnd = const_cast<char *>(ImageMB->getBufferEnd());
308306

309-
return JITedImage;
307+
return JITedImage.get();
310308
}
311309

312310
Expected<const __tgt_device_image *>
@@ -324,3 +322,13 @@ JITEngine::process(const __tgt_device_image &Image,
324322

325323
return &Image;
326324
}
325+
326+
void JITEngine::erase(const __tgt_device_image &Image,
327+
target::plugin::GenericDeviceTy &Device) {
328+
std::lock_guard<std::mutex> Lock(ComputeUnitMapMutex);
329+
const std::string &ComputeUnitKind = Device.getComputeUnitKind();
330+
ComputeUnitInfo &CUI = ComputeUnitMap[ComputeUnitKind];
331+
332+
CUI.TgtImageMap.erase(&Image);
333+
CUI.JITImages.erase(&Image);
334+
}

offload/plugins-nextgen/common/src/PluginInterface.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -854,6 +854,9 @@ Error GenericDeviceTy::unloadBinary(DeviceImageTy *Image) {
854854
return Err;
855855
}
856856

857+
if (Image->getTgtImageBitcode())
858+
Plugin.getJIT().erase(*Image->getTgtImageBitcode(), Image->getDevice());
859+
857860
return unloadBinaryImpl(Image);
858861
}
859862

0 commit comments

Comments
 (0)