From eed56d228c0613f563c23f9be23d681ef3d87f2b Mon Sep 17 00:00:00 2001 From: Wenju He Date: Thu, 31 Jul 2025 05:07:23 +0200 Subject: [PATCH 1/3] [libclc] Move mem_fence and barrier to clc library The __clc_mem_fence and __clc_barrier functions take two parameters, Scope and MemorySemantics, which are defined in the SPIR-V spec. The design allows the clc functions to implement SPIR-V ControlBarrier and MemoryBarrier functions in the future. The default memory ordering in clc is set to SequentiallyConsistent, which is also the default and strongest ordering in OpenCL and C++. The default memory scope in clc is set to memory_scope_device for amdgcn and ptx-nvidiacl since the __opencl_c_atomic_scope_all_devices feature macro is not defined for these targets. llvm-diff shows no change to amdgcn--amdhsa.bc and nvptx64--nvidiacl.bc. --- .../clc/include/clc/mem_fence/clc_mem_fence.h | 18 +++++++ .../clc/mem_fence/clc_mem_scope_semantics.h | 36 +++++++++++++ .../include/clc/synchronization/clc_barrier.h | 18 +++++++ libclc/clc/lib/amdgcn/SOURCES | 2 + .../clc/lib/amdgcn/mem_fence/clc_mem_fence.cl | 37 ++++++++++++++ .../lib/amdgcn/synchronization/clc_barrier.cl | 16 ++++++ libclc/clc/lib/ptx-nvidiacl/SOURCES | 2 + .../ptx-nvidiacl/mem_fence/clc_mem_fence.cl | 15 ++++++ .../synchronization/clc_barrier.cl | 14 ++++++ .../synchronization/cl_mem_fence_flags.h | 27 ++++++++++ .../clc/opencl/synchronization/utils.h | 50 +++++++++++++++++++ libclc/opencl/lib/amdgcn/mem_fence/fence.cl | 29 ++--------- .../lib/amdgcn/synchronization/barrier.cl | 8 +-- .../lib/ptx-nvidiacl/mem_fence/fence.cl | 7 ++- .../ptx-nvidiacl/synchronization/barrier.cl | 6 ++- 15 files changed, 255 insertions(+), 30 deletions(-) create mode 100644 libclc/clc/include/clc/mem_fence/clc_mem_fence.h create mode 100644 libclc/clc/include/clc/mem_fence/clc_mem_scope_semantics.h create mode 100644 libclc/clc/include/clc/synchronization/clc_barrier.h create mode 100644 libclc/clc/lib/amdgcn/mem_fence/clc_mem_fence.cl create mode 
100644 libclc/clc/lib/amdgcn/synchronization/clc_barrier.cl create mode 100644 libclc/clc/lib/ptx-nvidiacl/mem_fence/clc_mem_fence.cl create mode 100644 libclc/clc/lib/ptx-nvidiacl/synchronization/clc_barrier.cl create mode 100644 libclc/opencl/include/clc/opencl/synchronization/utils.h diff --git a/libclc/clc/include/clc/mem_fence/clc_mem_fence.h b/libclc/clc/include/clc/mem_fence/clc_mem_fence.h new file mode 100644 index 0000000000000..f0bbd136955bd --- /dev/null +++ b/libclc/clc/include/clc/mem_fence/clc_mem_fence.h @@ -0,0 +1,18 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_MEM_FENCE_CLC_MEM_FENCE_H__ +#define __CLC_MEM_FENCE_CLC_MEM_FENCE_H__ + +#include +#include + +_CLC_OVERLOAD _CLC_DECL void __clc_mem_fence(Scope scope, + MemorySemantics semantics); + +#endif // __CLC_MEM_FENCE_CLC_MEM_FENCE_H__ diff --git a/libclc/clc/include/clc/mem_fence/clc_mem_scope_semantics.h b/libclc/clc/include/clc/mem_fence/clc_mem_scope_semantics.h new file mode 100644 index 0000000000000..7294026386b7a --- /dev/null +++ b/libclc/clc/include/clc/mem_fence/clc_mem_scope_semantics.h @@ -0,0 +1,36 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_MEM_FENCE_CLC_MEM_SCOPE_SEMANTICS_H__ +#define __CLC_MEM_FENCE_CLC_MEM_SCOPE_SEMANTICS_H__ + +// Scope values are defined in SPIR-V spec. 
+typedef enum Scope { + CrossDevice = 0, + Device = 1, + Workgroup = 2, + Subgroup = 3, + Invocation = 4, +} Scope; + +// MemorySemantics values are defined in SPIR-V spec. +typedef enum MemorySemantics { + None = 0x0, + Acquire = 0x2, + Release = 0x4, + AcquireRelease = 0x8, + SequentiallyConsistent = 0x10, + UniformMemory = 0x40, + SubgroupMemory = 0x80, + WorkgroupMemory = 0x100, + CrossWorkgroupMemory = 0x200, + AtomicCounterMemory = 0x400, + ImageMemory = 0x800, +} MemorySemantics; + +#endif // __CLC_MEM_FENCE_CLC_MEM_SCOPE_SEMANTICS_H__ diff --git a/libclc/clc/include/clc/synchronization/clc_barrier.h b/libclc/clc/include/clc/synchronization/clc_barrier.h new file mode 100644 index 0000000000000..d363652c6e14d --- /dev/null +++ b/libclc/clc/include/clc/synchronization/clc_barrier.h @@ -0,0 +1,18 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_SYNCHRONIZATION_CLC_BARRIER_H__ +#define __CLC_SYNCHRONIZATION_CLC_BARRIER_H__ + +#include +#include + +_CLC_OVERLOAD _CLC_DECL void __clc_barrier(Scope scope, + MemorySemantics semantics); + +#endif // __CLC_SYNCHRONIZATION_CLC_BARRIER_H__ diff --git a/libclc/clc/lib/amdgcn/SOURCES b/libclc/clc/lib/amdgcn/SOURCES index 7bec1740f7636..f2f58e3124aa8 100644 --- a/libclc/clc/lib/amdgcn/SOURCES +++ b/libclc/clc/lib/amdgcn/SOURCES @@ -1,6 +1,8 @@ math/clc_fmax.cl math/clc_fmin.cl math/clc_ldexp_override.cl +mem_fence/clc_mem_fence.cl +synchronization/clc_barrier.cl workitem/clc_get_global_offset.cl workitem/clc_get_global_size.cl workitem/clc_get_group_id.cl diff --git a/libclc/clc/lib/amdgcn/mem_fence/clc_mem_fence.cl b/libclc/clc/lib/amdgcn/mem_fence/clc_mem_fence.cl new file mode 100644 index 0000000000000..12ec6d8d18091 --- /dev/null +++ b/libclc/clc/lib/amdgcn/mem_fence/clc_mem_fence.cl @@ -0,0 +1,37 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +void __clc_amdgcn_s_waitcnt(unsigned flags); + +// s_waitcnt takes 16bit argument with a combined number of maximum allowed +// pending operations: +// [12:8] LGKM -- LDS, GDS, Konstant (SMRD), Messages +// [7] -- undefined +// [6:4] -- exports, GDS, and mem write +// [3:0] -- vector memory operations + +// Newer clang supports __builtin_amdgcn_s_waitcnt +#if __clang_major__ >= 5 +#define __waitcnt(x) __builtin_amdgcn_s_waitcnt(x) +#else +#define __waitcnt(x) __clc_amdgcn_s_waitcnt(x) +_CLC_DEF void __clc_amdgcn_s_waitcnt(unsigned) __asm("llvm.amdgcn.s.waitcnt"); +#endif + +_CLC_OVERLOAD _CLC_DEF void __clc_mem_fence(Scope scope, + MemorySemantics semantics) { + if (semantics & CrossWorkgroupMemory) { + // scalar loads are counted with LGKM but we don't know whether + // the compiler turned any loads to scalar + __waitcnt(0); + } else if (semantics & WorkgroupMemory) + __waitcnt(0xff); // LGKM is [12:8] +} +#undef __waitcnt diff --git a/libclc/clc/lib/amdgcn/synchronization/clc_barrier.cl b/libclc/clc/lib/amdgcn/synchronization/clc_barrier.cl new file mode 100644 index 0000000000000..0299a426e4d21 --- /dev/null +++ b/libclc/clc/lib/amdgcn/synchronization/clc_barrier.cl @@ -0,0 +1,16 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include + +_CLC_OVERLOAD _CLC_DEF void __clc_barrier(Scope scope, + MemorySemantics semantics) { + __clc_mem_fence(scope, semantics); + __builtin_amdgcn_s_barrier(); +} diff --git a/libclc/clc/lib/ptx-nvidiacl/SOURCES b/libclc/clc/lib/ptx-nvidiacl/SOURCES index 05368c5e4d4e3..a0fb861549ebc 100644 --- a/libclc/clc/lib/ptx-nvidiacl/SOURCES +++ b/libclc/clc/lib/ptx-nvidiacl/SOURCES @@ -1,3 +1,5 @@ +mem_fence/clc_mem_fence.cl +synchronization/clc_barrier.cl workitem/clc_get_global_id.cl workitem/clc_get_group_id.cl workitem/clc_get_local_id.cl diff --git a/libclc/clc/lib/ptx-nvidiacl/mem_fence/clc_mem_fence.cl b/libclc/clc/lib/ptx-nvidiacl/mem_fence/clc_mem_fence.cl new file mode 100644 index 0000000000000..4c0d342b7244f --- /dev/null +++ b/libclc/clc/lib/ptx-nvidiacl/mem_fence/clc_mem_fence.cl @@ -0,0 +1,15 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +_CLC_OVERLOAD _CLC_DEF void __clc_mem_fence(Scope scope, + MemorySemantics semantics) { + if (semantics & (CrossWorkgroupMemory | WorkgroupMemory)) + __nvvm_membar_cta(); +} diff --git a/libclc/clc/lib/ptx-nvidiacl/synchronization/clc_barrier.cl b/libclc/clc/lib/ptx-nvidiacl/synchronization/clc_barrier.cl new file mode 100644 index 0000000000000..920b17cb02f92 --- /dev/null +++ b/libclc/clc/lib/ptx-nvidiacl/synchronization/clc_barrier.cl @@ -0,0 +1,14 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +_CLC_OVERLOAD _CLC_DEF void __clc_barrier(Scope scope, + MemorySemantics semantics) { + __syncthreads(); +} diff --git a/libclc/opencl/include/clc/opencl/synchronization/cl_mem_fence_flags.h b/libclc/opencl/include/clc/opencl/synchronization/cl_mem_fence_flags.h index 6636515fca47d..18f9a4afb2d5f 100644 --- a/libclc/opencl/include/clc/opencl/synchronization/cl_mem_fence_flags.h +++ b/libclc/opencl/include/clc/opencl/synchronization/cl_mem_fence_flags.h @@ -11,7 +11,34 @@ typedef uint cl_mem_fence_flags; +// Copied from +// https://github.com/llvm/llvm-project/blob/08e40c12fa0c/clang/lib/Headers/opencl-c-base.h#L390 +typedef enum memory_scope { + memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM, + memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP, + memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE, +#if defined(__opencl_c_atomic_scope_all_devices) + memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES, +#if (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100) + memory_scope_all_devices = memory_scope_all_svm_devices, +#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= + // 202100) +#endif // defined(__opencl_c_atomic_scope_all_devices) +/** + * Subgroups have different requirements on forward progress, so just test + * all the relevant macros. 
+ * CL 3.0 sub-groups "they are not guaranteed to make independent forward + * progress" KHR subgroups "Subgroups within a workgroup are independent, make + * forward progress with respect to each other" + */ +#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups) || \ + defined(__opencl_c_subgroups) + memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP +#endif +} memory_scope; + #define CLK_LOCAL_MEM_FENCE 1 #define CLK_GLOBAL_MEM_FENCE 2 +#define CLK_IMAGE_MEM_FENCE 4 #endif // __CLC_OPENCL_SYNCHRONIZATION_CL_MEM_FENCE_FLAGS_H__ diff --git a/libclc/opencl/include/clc/opencl/synchronization/utils.h b/libclc/opencl/include/clc/opencl/synchronization/utils.h new file mode 100644 index 0000000000000..098d96d0a8a32 --- /dev/null +++ b/libclc/opencl/include/clc/opencl/synchronization/utils.h @@ -0,0 +1,50 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_SYNCHRONIZATION_UTILS_H__ +#define __CLC_OPENCL_SYNCHRONIZATION_UTILS_H__ + +#include +#include +#include + +_CLC_INLINE Scope getCLCScope(memory_scope memory_scope) { + switch (memory_scope) { + case memory_scope_work_item: + return Invocation; +#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups) || \ + defined(__opencl_c_subgroups) + case memory_scope_sub_group: + return Subgroup; +#endif + case memory_scope_work_group: + return Workgroup; + case memory_scope_device: + return Device; + default: + break; + } +#ifdef __opencl_c_atomic_scope_all_devices + return CrossDevice; +#else + return Device; +#endif +} + +_CLC_INLINE MemorySemantics getCLCMemorySemantics(cl_mem_fence_flags flag) { + MemorySemantics semantics = AcquireRelease; + if (flag & CLK_GLOBAL_MEM_FENCE) + semantics |= CrossWorkgroupMemory; + if (flag & CLK_LOCAL_MEM_FENCE) + semantics |= WorkgroupMemory; + if (flag & CLK_IMAGE_MEM_FENCE) + semantics |= ImageMemory; + return semantics; +} + +#endif // __CLC_OPENCL_SYNCHRONIZATION_UTILS_H__ diff --git a/libclc/opencl/lib/amdgcn/mem_fence/fence.cl b/libclc/opencl/lib/amdgcn/mem_fence/fence.cl index 88b953005aae6..10d879d835c06 100644 --- a/libclc/opencl/lib/amdgcn/mem_fence/fence.cl +++ b/libclc/opencl/lib/amdgcn/mem_fence/fence.cl @@ -6,34 +6,15 @@ // //===----------------------------------------------------------------------===// +#include #include - -void __clc_amdgcn_s_waitcnt(unsigned flags); - -// s_waitcnt takes 16bit argument with a combined number of maximum allowed -// pending operations: -// [12:8] LGKM -- LDS, GDS, Konstant (SMRD), Messages -// [7] -- undefined -// [6:4] -- exports, GDS, and mem write -// [3:0] -- vector memory operations - -// Newer clang supports __builtin_amdgcn_s_waitcnt -#if __clang_major__ >= 5 -#define __waitcnt(x) __builtin_amdgcn_s_waitcnt(x) 
-#else -#define __waitcnt(x) __clc_amdgcn_s_waitcnt(x) -_CLC_DEF void __clc_amdgcn_s_waitcnt(unsigned) __asm("llvm.amdgcn.s.waitcnt"); -#endif +#include _CLC_DEF _CLC_OVERLOAD void mem_fence(cl_mem_fence_flags flags) { - if (flags & CLK_GLOBAL_MEM_FENCE) { - // scalar loads are counted with LGKM but we don't know whether - // the compiler turned any loads to scalar - __waitcnt(0); - } else if (flags & CLK_LOCAL_MEM_FENCE) - __waitcnt(0xff); // LGKM is [12:8] + Scope scope = getCLCScope(memory_scope_device); + MemorySemantics semantics = getCLCMemorySemantics(flags); + __clc_mem_fence(scope, semantics); } -#undef __waitcnt // We don't have separate mechanism for read and write fences _CLC_DEF _CLC_OVERLOAD void read_mem_fence(cl_mem_fence_flags flags) { diff --git a/libclc/opencl/lib/amdgcn/synchronization/barrier.cl b/libclc/opencl/lib/amdgcn/synchronization/barrier.cl index 5203db72f484c..b8372d4800bf1 100644 --- a/libclc/opencl/lib/amdgcn/synchronization/barrier.cl +++ b/libclc/opencl/lib/amdgcn/synchronization/barrier.cl @@ -6,10 +6,12 @@ // //===----------------------------------------------------------------------===// -#include #include +#include +#include _CLC_DEF _CLC_OVERLOAD void barrier(cl_mem_fence_flags flags) { - mem_fence(flags); - __builtin_amdgcn_s_barrier(); + Scope scope = getCLCScope(memory_scope_device); + MemorySemantics semantics = getCLCMemorySemantics(flags); + __clc_barrier(scope, semantics); } diff --git a/libclc/opencl/lib/ptx-nvidiacl/mem_fence/fence.cl b/libclc/opencl/lib/ptx-nvidiacl/mem_fence/fence.cl index d24569ecda1bc..2d591c90d63c2 100644 --- a/libclc/opencl/lib/ptx-nvidiacl/mem_fence/fence.cl +++ b/libclc/opencl/lib/ptx-nvidiacl/mem_fence/fence.cl @@ -6,11 +6,14 @@ // //===----------------------------------------------------------------------===// +#include #include +#include _CLC_DEF _CLC_OVERLOAD void mem_fence(cl_mem_fence_flags flags) { - if (flags & (CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE)) - __nvvm_membar_cta(); + 
Scope scope = getCLCScope(memory_scope_device); + MemorySemantics semantics = getCLCMemorySemantics(flags); + __clc_mem_fence(scope, semantics); } // We do not have separate mechanism for read and write fences. diff --git a/libclc/opencl/lib/ptx-nvidiacl/synchronization/barrier.cl b/libclc/opencl/lib/ptx-nvidiacl/synchronization/barrier.cl index 7c57478795dda..b8372d4800bf1 100644 --- a/libclc/opencl/lib/ptx-nvidiacl/synchronization/barrier.cl +++ b/libclc/opencl/lib/ptx-nvidiacl/synchronization/barrier.cl @@ -7,7 +7,11 @@ //===----------------------------------------------------------------------===// #include +#include +#include _CLC_DEF _CLC_OVERLOAD void barrier(cl_mem_fence_flags flags) { - __syncthreads(); + Scope scope = getCLCScope(memory_scope_device); + MemorySemantics semantics = getCLCMemorySemantics(flags); + __clc_barrier(scope, semantics); } From 29ec1763d807d66c5fd3ec19e0ef311e520026e7 Mon Sep 17 00:00:00 2001 From: Wenju He Date: Thu, 31 Jul 2025 07:17:11 +0200 Subject: [PATCH 2/3] default to SequentiallyConsistent --- libclc/opencl/include/clc/opencl/synchronization/utils.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libclc/opencl/include/clc/opencl/synchronization/utils.h b/libclc/opencl/include/clc/opencl/synchronization/utils.h index 098d96d0a8a32..bbcfa20a556d5 100644 --- a/libclc/opencl/include/clc/opencl/synchronization/utils.h +++ b/libclc/opencl/include/clc/opencl/synchronization/utils.h @@ -37,7 +37,7 @@ _CLC_INLINE Scope getCLCScope(memory_scope memory_scope) { } _CLC_INLINE MemorySemantics getCLCMemorySemantics(cl_mem_fence_flags flag) { - MemorySemantics semantics = AcquireRelease; + MemorySemantics semantics = SequentiallyConsistent; if (flag & CLK_GLOBAL_MEM_FENCE) semantics |= CrossWorkgroupMemory; if (flag & CLK_LOCAL_MEM_FENCE) From 86ca62c88512b55cfd772e762e729f52907ee102 Mon Sep 17 00:00:00 2001 From: Wenju He Date: Tue, 5 Aug 2025 12:45:42 +0200 Subject: [PATCH 3/3] rename clc_barrier to 
clc_work_group_barrier, replace SPIR-V constants with clang macros --- .../clc/include/clc/mem_fence/clc_mem_fence.h | 5 ++- .../clc/mem_fence/clc_mem_scope_semantics.h | 36 ------------------- ...clc_barrier.h => clc_work_group_barrier.h} | 11 +++--- libclc/clc/lib/amdgcn/SOURCES | 2 +- .../clc/lib/amdgcn/mem_fence/clc_mem_fence.cl | 8 ++--- ...c_barrier.cl => clc_work_group_barrier.cl} | 8 ++--- libclc/clc/lib/ptx-nvidiacl/SOURCES | 2 +- .../ptx-nvidiacl/mem_fence/clc_mem_fence.cl | 6 ++-- ...c_barrier.cl => clc_work_group_barrier.cl} | 6 ++-- .../synchronization/cl_mem_fence_flags.h | 26 -------------- .../clc/opencl/synchronization/utils.h | 36 +++---------------- libclc/opencl/lib/amdgcn/mem_fence/fence.cl | 6 ++-- .../lib/amdgcn/synchronization/barrier.cl | 8 ++--- .../lib/ptx-nvidiacl/mem_fence/fence.cl | 6 ++-- .../ptx-nvidiacl/synchronization/barrier.cl | 8 ++--- 15 files changed, 42 insertions(+), 132 deletions(-) delete mode 100644 libclc/clc/include/clc/mem_fence/clc_mem_scope_semantics.h rename libclc/clc/include/clc/synchronization/{clc_barrier.h => clc_work_group_barrier.h} (55%) rename libclc/clc/lib/amdgcn/synchronization/{clc_barrier.cl => clc_work_group_barrier.cl} (64%) rename libclc/clc/lib/ptx-nvidiacl/synchronization/{clc_barrier.cl => clc_work_group_barrier.cl} (66%) diff --git a/libclc/clc/include/clc/mem_fence/clc_mem_fence.h b/libclc/clc/include/clc/mem_fence/clc_mem_fence.h index f0bbd136955bd..2321634c76842 100644 --- a/libclc/clc/include/clc/mem_fence/clc_mem_fence.h +++ b/libclc/clc/include/clc/mem_fence/clc_mem_fence.h @@ -10,9 +10,8 @@ #define __CLC_MEM_FENCE_CLC_MEM_FENCE_H__ #include -#include -_CLC_OVERLOAD _CLC_DECL void __clc_mem_fence(Scope scope, - MemorySemantics semantics); +_CLC_OVERLOAD _CLC_DECL void __clc_mem_fence(int memory_scope, + int memory_order); #endif // __CLC_MEM_FENCE_CLC_MEM_FENCE_H__ diff --git a/libclc/clc/include/clc/mem_fence/clc_mem_scope_semantics.h 
b/libclc/clc/include/clc/mem_fence/clc_mem_scope_semantics.h deleted file mode 100644 index 7294026386b7a..0000000000000 --- a/libclc/clc/include/clc/mem_fence/clc_mem_scope_semantics.h +++ /dev/null @@ -1,36 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef __CLC_MEM_FENCE_CLC_MEM_SCOPE_SEMANTICS_H__ -#define __CLC_MEM_FENCE_CLC_MEM_SCOPE_SEMANTICS_H__ - -// Scope values are defined in SPIR-V spec. -typedef enum Scope { - CrossDevice = 0, - Device = 1, - Workgroup = 2, - Subgroup = 3, - Invocation = 4, -} Scope; - -// MemorySemantics values are defined in SPIR-V spec. -typedef enum MemorySemantics { - None = 0x0, - Acquire = 0x2, - Release = 0x4, - AcquireRelease = 0x8, - SequentiallyConsistent = 0x10, - UniformMemory = 0x40, - SubgroupMemory = 0x80, - WorkgroupMemory = 0x100, - CrossWorkgroupMemory = 0x200, - AtomicCounterMemory = 0x400, - ImageMemory = 0x800, -} MemorySemantics; - -#endif // __CLC_MEM_FENCE_CLC_MEM_SCOPE_SEMANTICS_H__ diff --git a/libclc/clc/include/clc/synchronization/clc_barrier.h b/libclc/clc/include/clc/synchronization/clc_work_group_barrier.h similarity index 55% rename from libclc/clc/include/clc/synchronization/clc_barrier.h rename to libclc/clc/include/clc/synchronization/clc_work_group_barrier.h index d363652c6e14d..5f864e1057b8b 100644 --- a/libclc/clc/include/clc/synchronization/clc_barrier.h +++ b/libclc/clc/include/clc/synchronization/clc_work_group_barrier.h @@ -6,13 +6,12 @@ // //===----------------------------------------------------------------------===// -#ifndef __CLC_SYNCHRONIZATION_CLC_BARRIER_H__ -#define __CLC_SYNCHRONIZATION_CLC_BARRIER_H__ +#ifndef 
__CLC_SYNCHRONIZATION_CLC_WORK_GROUP_BARRIER_H__ +#define __CLC_SYNCHRONIZATION_CLC_WORK_GROUP_BARRIER_H__ #include -#include -_CLC_OVERLOAD _CLC_DECL void __clc_barrier(Scope scope, - MemorySemantics semantics); +_CLC_OVERLOAD _CLC_DECL void __clc_work_group_barrier(int memory_scope, + int memory_order); -#endif // __CLC_SYNCHRONIZATION_CLC_BARRIER_H__ +#endif // __CLC_SYNCHRONIZATION_CLC_WORK_GROUP_BARRIER_H__ diff --git a/libclc/clc/lib/amdgcn/SOURCES b/libclc/clc/lib/amdgcn/SOURCES index f2f58e3124aa8..b20d3db50c416 100644 --- a/libclc/clc/lib/amdgcn/SOURCES +++ b/libclc/clc/lib/amdgcn/SOURCES @@ -2,7 +2,7 @@ math/clc_fmax.cl math/clc_fmin.cl math/clc_ldexp_override.cl mem_fence/clc_mem_fence.cl -synchronization/clc_barrier.cl +synchronization/clc_work_group_barrier.cl workitem/clc_get_global_offset.cl workitem/clc_get_global_size.cl workitem/clc_get_group_id.cl diff --git a/libclc/clc/lib/amdgcn/mem_fence/clc_mem_fence.cl b/libclc/clc/lib/amdgcn/mem_fence/clc_mem_fence.cl index 12ec6d8d18091..9e6460313718e 100644 --- a/libclc/clc/lib/amdgcn/mem_fence/clc_mem_fence.cl +++ b/libclc/clc/lib/amdgcn/mem_fence/clc_mem_fence.cl @@ -25,13 +25,13 @@ void __clc_amdgcn_s_waitcnt(unsigned flags); _CLC_DEF void __clc_amdgcn_s_waitcnt(unsigned) __asm("llvm.amdgcn.s.waitcnt"); #endif -_CLC_OVERLOAD _CLC_DEF void __clc_mem_fence(Scope scope, - MemorySemantics semantics) { - if (semantics & CrossWorkgroupMemory) { +_CLC_OVERLOAD _CLC_DEF void __clc_mem_fence(int memory_scope, + int memory_order) { + if (memory_scope & __MEMORY_SCOPE_DEVICE) { // scalar loads are counted with LGKM but we don't know whether // the compiler turned any loads to scalar __waitcnt(0); - } else if (semantics & WorkgroupMemory) + } else if (memory_scope & __MEMORY_SCOPE_WRKGRP) __waitcnt(0xff); // LGKM is [12:8] } #undef __waitcnt diff --git a/libclc/clc/lib/amdgcn/synchronization/clc_barrier.cl b/libclc/clc/lib/amdgcn/synchronization/clc_work_group_barrier.cl similarity index 64% rename from 
libclc/clc/lib/amdgcn/synchronization/clc_barrier.cl rename to libclc/clc/lib/amdgcn/synchronization/clc_work_group_barrier.cl index 0299a426e4d21..ff3628fa7c339 100644 --- a/libclc/clc/lib/amdgcn/synchronization/clc_barrier.cl +++ b/libclc/clc/lib/amdgcn/synchronization/clc_work_group_barrier.cl @@ -7,10 +7,10 @@ //===----------------------------------------------------------------------===// #include -#include +#include -_CLC_OVERLOAD _CLC_DEF void __clc_barrier(Scope scope, - MemorySemantics semantics) { - __clc_mem_fence(scope, semantics); +_CLC_OVERLOAD _CLC_DEF void __clc_work_group_barrier(int memory_scope, + int memory_order) { + __clc_mem_fence(memory_scope, memory_order); __builtin_amdgcn_s_barrier(); } diff --git a/libclc/clc/lib/ptx-nvidiacl/SOURCES b/libclc/clc/lib/ptx-nvidiacl/SOURCES index a0fb861549ebc..b6f50654f89c5 100644 --- a/libclc/clc/lib/ptx-nvidiacl/SOURCES +++ b/libclc/clc/lib/ptx-nvidiacl/SOURCES @@ -1,5 +1,5 @@ mem_fence/clc_mem_fence.cl -synchronization/clc_barrier.cl +synchronization/clc_work_group_barrier.cl workitem/clc_get_global_id.cl workitem/clc_get_group_id.cl workitem/clc_get_local_id.cl diff --git a/libclc/clc/lib/ptx-nvidiacl/mem_fence/clc_mem_fence.cl b/libclc/clc/lib/ptx-nvidiacl/mem_fence/clc_mem_fence.cl index 4c0d342b7244f..b3e2375e755a2 100644 --- a/libclc/clc/lib/ptx-nvidiacl/mem_fence/clc_mem_fence.cl +++ b/libclc/clc/lib/ptx-nvidiacl/mem_fence/clc_mem_fence.cl @@ -8,8 +8,8 @@ #include -_CLC_OVERLOAD _CLC_DEF void __clc_mem_fence(Scope scope, - MemorySemantics semantics) { - if (semantics & (CrossWorkgroupMemory | WorkgroupMemory)) +_CLC_OVERLOAD _CLC_DEF void __clc_mem_fence(int memory_scope, + int memory_order) { + if (memory_scope & (__MEMORY_SCOPE_DEVICE | __MEMORY_SCOPE_WRKGRP)) __nvvm_membar_cta(); } diff --git a/libclc/clc/lib/ptx-nvidiacl/synchronization/clc_barrier.cl b/libclc/clc/lib/ptx-nvidiacl/synchronization/clc_work_group_barrier.cl similarity index 66% rename from 
libclc/clc/lib/ptx-nvidiacl/synchronization/clc_barrier.cl rename to libclc/clc/lib/ptx-nvidiacl/synchronization/clc_work_group_barrier.cl index 920b17cb02f92..6cb37a38f06ac 100644 --- a/libclc/clc/lib/ptx-nvidiacl/synchronization/clc_barrier.cl +++ b/libclc/clc/lib/ptx-nvidiacl/synchronization/clc_work_group_barrier.cl @@ -6,9 +6,9 @@ // //===----------------------------------------------------------------------===// -#include +#include -_CLC_OVERLOAD _CLC_DEF void __clc_barrier(Scope scope, - MemorySemantics semantics) { +_CLC_OVERLOAD _CLC_DEF void __clc_work_group_barrier(int memory_scope, + int memory_order) { __syncthreads(); } diff --git a/libclc/opencl/include/clc/opencl/synchronization/cl_mem_fence_flags.h b/libclc/opencl/include/clc/opencl/synchronization/cl_mem_fence_flags.h index 18f9a4afb2d5f..7b2f701c1ff99 100644 --- a/libclc/opencl/include/clc/opencl/synchronization/cl_mem_fence_flags.h +++ b/libclc/opencl/include/clc/opencl/synchronization/cl_mem_fence_flags.h @@ -11,32 +11,6 @@ typedef uint cl_mem_fence_flags; -// Copied from -// https://github.com/llvm/llvm-project/blob/08e40c12fa0c/clang/lib/Headers/opencl-c-base.h#L390 -typedef enum memory_scope { - memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM, - memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP, - memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE, -#if defined(__opencl_c_atomic_scope_all_devices) - memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES, -#if (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100) - memory_scope_all_devices = memory_scope_all_svm_devices, -#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= - // 202100) -#endif // defined(__opencl_c_atomic_scope_all_devices) -/** - * Subgroups have different requirements on forward progress, so just test - * all the relevant macros. 
- * CL 3.0 sub-groups "they are not guaranteed to make independent forward - * progress" KHR subgroups "Subgroups within a workgroup are independent, make - * forward progress with respect to each other" - */ -#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups) || \ - defined(__opencl_c_subgroups) - memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP -#endif -} memory_scope; - #define CLK_LOCAL_MEM_FENCE 1 #define CLK_GLOBAL_MEM_FENCE 2 #define CLK_IMAGE_MEM_FENCE 4 diff --git a/libclc/opencl/include/clc/opencl/synchronization/utils.h b/libclc/opencl/include/clc/opencl/synchronization/utils.h index bbcfa20a556d5..cf3baf28cb5f1 100644 --- a/libclc/opencl/include/clc/opencl/synchronization/utils.h +++ b/libclc/opencl/include/clc/opencl/synchronization/utils.h @@ -10,41 +10,15 @@ #define __CLC_OPENCL_SYNCHRONIZATION_UTILS_H__ #include -#include #include -_CLC_INLINE Scope getCLCScope(memory_scope memory_scope) { - switch (memory_scope) { - case memory_scope_work_item: - return Invocation; -#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups) || \ - defined(__opencl_c_subgroups) - case memory_scope_sub_group: - return Subgroup; -#endif - case memory_scope_work_group: - return Workgroup; - case memory_scope_device: - return Device; - default: - break; - } -#ifdef __opencl_c_atomic_scope_all_devices - return CrossDevice; -#else - return Device; -#endif -} - -_CLC_INLINE MemorySemantics getCLCMemorySemantics(cl_mem_fence_flags flag) { - MemorySemantics semantics = SequentiallyConsistent; +_CLC_INLINE int getCLCMemoryScope(cl_mem_fence_flags flag) { + int memory_scope = 0; if (flag & CLK_GLOBAL_MEM_FENCE) - semantics |= CrossWorkgroupMemory; + memory_scope |= __MEMORY_SCOPE_DEVICE; if (flag & CLK_LOCAL_MEM_FENCE) - semantics |= WorkgroupMemory; - if (flag & CLK_IMAGE_MEM_FENCE) - semantics |= ImageMemory; - return semantics; + memory_scope |= __MEMORY_SCOPE_WRKGRP; + return memory_scope; } #endif // __CLC_OPENCL_SYNCHRONIZATION_UTILS_H__ diff --git 
a/libclc/opencl/lib/amdgcn/mem_fence/fence.cl b/libclc/opencl/lib/amdgcn/mem_fence/fence.cl index 10d879d835c06..81216d6a26cf2 100644 --- a/libclc/opencl/lib/amdgcn/mem_fence/fence.cl +++ b/libclc/opencl/lib/amdgcn/mem_fence/fence.cl @@ -11,9 +11,9 @@ #include _CLC_DEF _CLC_OVERLOAD void mem_fence(cl_mem_fence_flags flags) { - Scope scope = getCLCScope(memory_scope_device); - MemorySemantics semantics = getCLCMemorySemantics(flags); - __clc_mem_fence(scope, semantics); + int memory_scope = getCLCMemoryScope(flags); + int memory_order = __ATOMIC_SEQ_CST; + __clc_mem_fence(memory_scope, memory_order); } // We don't have separate mechanism for read and write fences diff --git a/libclc/opencl/lib/amdgcn/synchronization/barrier.cl b/libclc/opencl/lib/amdgcn/synchronization/barrier.cl index b8372d4800bf1..c8322e602302c 100644 --- a/libclc/opencl/lib/amdgcn/synchronization/barrier.cl +++ b/libclc/opencl/lib/amdgcn/synchronization/barrier.cl @@ -8,10 +8,10 @@ #include #include -#include +#include _CLC_DEF _CLC_OVERLOAD void barrier(cl_mem_fence_flags flags) { - Scope scope = getCLCScope(memory_scope_device); - MemorySemantics semantics = getCLCMemorySemantics(flags); - __clc_barrier(scope, semantics); + int memory_scope = getCLCMemoryScope(flags); + int memory_order = __ATOMIC_SEQ_CST; + __clc_work_group_barrier(memory_scope, memory_order); } diff --git a/libclc/opencl/lib/ptx-nvidiacl/mem_fence/fence.cl b/libclc/opencl/lib/ptx-nvidiacl/mem_fence/fence.cl index 2d591c90d63c2..e22ed870a7e6b 100644 --- a/libclc/opencl/lib/ptx-nvidiacl/mem_fence/fence.cl +++ b/libclc/opencl/lib/ptx-nvidiacl/mem_fence/fence.cl @@ -11,9 +11,9 @@ #include _CLC_DEF _CLC_OVERLOAD void mem_fence(cl_mem_fence_flags flags) { - Scope scope = getCLCScope(memory_scope_device); - MemorySemantics semantics = getCLCMemorySemantics(flags); - __clc_mem_fence(scope, semantics); + int memory_scope = getCLCMemoryScope(flags); + int memory_order = __ATOMIC_SEQ_CST; + __clc_mem_fence(memory_scope, memory_order); 
} // We do not have separate mechanism for read and write fences. diff --git a/libclc/opencl/lib/ptx-nvidiacl/synchronization/barrier.cl b/libclc/opencl/lib/ptx-nvidiacl/synchronization/barrier.cl index b8372d4800bf1..c8322e602302c 100644 --- a/libclc/opencl/lib/ptx-nvidiacl/synchronization/barrier.cl +++ b/libclc/opencl/lib/ptx-nvidiacl/synchronization/barrier.cl @@ -8,10 +8,10 @@ #include #include -#include +#include _CLC_DEF _CLC_OVERLOAD void barrier(cl_mem_fence_flags flags) { - Scope scope = getCLCScope(memory_scope_device); - MemorySemantics semantics = getCLCMemorySemantics(flags); - __clc_barrier(scope, semantics); + int memory_scope = getCLCMemoryScope(flags); + int memory_order = __ATOMIC_SEQ_CST; + __clc_work_group_barrier(memory_scope, memory_order); }