Skip to content

[libclc] Move mem_fence and barrier to clc library #151446

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions libclc/clc/include/clc/mem_fence/clc_mem_fence.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef __CLC_MEM_FENCE_CLC_MEM_FENCE_H__
#define __CLC_MEM_FENCE_CLC_MEM_FENCE_H__

#include <clc/internal/clc.h>

// Target-independent declaration of the CLC memory fence.
//
//   memory_scope  Scope selector; the callers in this change build it from
//                 __MEMORY_SCOPE_DEVICE / __MEMORY_SCOPE_WRKGRP.
//   memory_order  Ordering constant; the callers in this change pass
//                 __ATOMIC_SEQ_CST.
//
// Each target supplies its own definition under clc/lib/<target>/mem_fence.
_CLC_OVERLOAD _CLC_DECL void __clc_mem_fence(int memory_scope,
int memory_order);

#endif // __CLC_MEM_FENCE_CLC_MEM_FENCE_H__
17 changes: 17 additions & 0 deletions libclc/clc/include/clc/synchronization/clc_work_group_barrier.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef __CLC_SYNCHRONIZATION_CLC_WORK_GROUP_BARRIER_H__
#define __CLC_SYNCHRONIZATION_CLC_WORK_GROUP_BARRIER_H__

#include <clc/internal/clc.h>

// Target-independent declaration of the CLC work-group barrier.
//
//   memory_scope  Scope selector; the callers in this change build it from
//                 __MEMORY_SCOPE_DEVICE / __MEMORY_SCOPE_WRKGRP.
//   memory_order  Ordering constant; the callers in this change pass
//                 __ATOMIC_SEQ_CST.
//
// Each target supplies its own definition under
// clc/lib/<target>/synchronization.
_CLC_OVERLOAD _CLC_DECL void __clc_work_group_barrier(int memory_scope,
int memory_order);

#endif // __CLC_SYNCHRONIZATION_CLC_WORK_GROUP_BARRIER_H__
2 changes: 2 additions & 0 deletions libclc/clc/lib/amdgcn/SOURCES
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
math/clc_fmax.cl
math/clc_fmin.cl
math/clc_ldexp_override.cl
mem_fence/clc_mem_fence.cl
synchronization/clc_work_group_barrier.cl
workitem/clc_get_global_offset.cl
workitem/clc_get_global_size.cl
workitem/clc_get_group_id.cl
Expand Down
37 changes: 37 additions & 0 deletions libclc/clc/lib/amdgcn/mem_fence/clc_mem_fence.cl
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include <clc/mem_fence/clc_mem_fence.h>

// Forward declaration of the fallback s_waitcnt wrapper. It is only referenced
// by the __waitcnt macro on the pre-clang-5 path below.
void __clc_amdgcn_s_waitcnt(unsigned flags);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should get rid of this; it can be properly implemented with __builtin_amdgcn_fence


// s_waitcnt takes a 16-bit argument that packs, per counter, the maximum
// number of operations still allowed to be pending:
// [12:8] LGKM -- LDS, GDS, Konstant (SMRD), Messages
// [7] -- undefined
// [6:4] -- exports, GDS, and mem write
// [3:0] -- vector memory operations

// Select how __waitcnt(x) is spelled:
//  - clang >= 5: use the compiler builtin directly.
//  - older clang: call __clc_amdgcn_s_waitcnt, which is bound by its asm name
//    to the llvm.amdgcn.s.waitcnt intrinsic.
#if __clang_major__ >= 5
#define __waitcnt(x) __builtin_amdgcn_s_waitcnt(x)
#else
#define __waitcnt(x) __clc_amdgcn_s_waitcnt(x)
_CLC_DEF void __clc_amdgcn_s_waitcnt(unsigned) __asm("llvm.amdgcn.s.waitcnt");
#endif

// AMDGCN memory fence, expressed as an s_waitcnt on the pending-operation
// counters.
//
//   memory_scope  Tested as a bit mask against __MEMORY_SCOPE_DEVICE and
//                 __MEMORY_SCOPE_WRKGRP. The device bit wins when both are
//                 set; nothing is emitted when neither is set.
//   memory_order  Not read by this implementation.
_CLC_OVERLOAD _CLC_DEF void __clc_mem_fence(int memory_scope,
                                            int memory_order) {
  if (memory_scope & __MEMORY_SCOPE_DEVICE) {
    // Scalar loads are counted with LGKM, and we cannot tell whether the
    // compiler turned any loads into scalar ones, so allow zero pending
    // operations on every counter.
    __waitcnt(0);
    return;
  }

  if (memory_scope & __MEMORY_SCOPE_WRKGRP) {
    // 0xff keeps bits [7:0] at all-ones and leaves the LGKM field [12:8] at
    // zero, so only LGKM operations are waited on.
    __waitcnt(0xff);
  }
}
#undef __waitcnt
16 changes: 16 additions & 0 deletions libclc/clc/lib/amdgcn/synchronization/clc_work_group_barrier.cl
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include <clc/mem_fence/clc_mem_fence.h>
#include <clc/synchronization/clc_work_group_barrier.h>

// AMDGCN work-group barrier: a memory fence followed by s_barrier.
//
//   memory_scope  Forwarded unchanged to __clc_mem_fence.
//   memory_order  Forwarded unchanged to __clc_mem_fence.
_CLC_OVERLOAD _CLC_DEF void __clc_work_group_barrier(int memory_scope,
int memory_order) {
// Issue the fence first, then the barrier instruction.
__clc_mem_fence(memory_scope, memory_order);
__builtin_amdgcn_s_barrier();
}
2 changes: 2 additions & 0 deletions libclc/clc/lib/ptx-nvidiacl/SOURCES
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
mem_fence/clc_mem_fence.cl
synchronization/clc_work_group_barrier.cl
workitem/clc_get_global_id.cl
workitem/clc_get_group_id.cl
workitem/clc_get_local_id.cl
Expand Down
15 changes: 15 additions & 0 deletions libclc/clc/lib/ptx-nvidiacl/mem_fence/clc_mem_fence.cl
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include <clc/mem_fence/clc_mem_fence.h>

// NVPTX memory fence.
//
//   memory_scope  Tested as a bit mask; a membar.cta is emitted when either
//                 the __MEMORY_SCOPE_DEVICE or the __MEMORY_SCOPE_WRKGRP bit
//                 is set, and nothing is emitted otherwise.
//   memory_order  Not read by this implementation.
_CLC_OVERLOAD _CLC_DEF void __clc_mem_fence(int memory_scope,
                                            int memory_order) {
  const int fenced_scopes = __MEMORY_SCOPE_DEVICE | __MEMORY_SCOPE_WRKGRP;

  if ((memory_scope & fenced_scopes) == 0)
    return;

  // Device and work-group scope both map to the same CTA-level membar here.
  __nvvm_membar_cta();
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include <clc/synchronization/clc_work_group_barrier.h>

// NVPTX work-group barrier: unconditionally calls __syncthreads().
//
//   memory_scope  Not read by this implementation.
//   memory_order  Not read by this implementation.
_CLC_OVERLOAD _CLC_DEF void __clc_work_group_barrier(int memory_scope,
int memory_order) {
__syncthreads();
}
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,6 @@ typedef uint cl_mem_fence_flags;

// Bit flags carried in a cl_mem_fence_flags value; each occupies a distinct
// bit so they can be combined with | and tested with &.
#define CLK_LOCAL_MEM_FENCE 1
#define CLK_GLOBAL_MEM_FENCE 2
#define CLK_IMAGE_MEM_FENCE 4

#endif // __CLC_OPENCL_SYNCHRONIZATION_CL_MEM_FENCE_FLAGS_H__
24 changes: 24 additions & 0 deletions libclc/opencl/include/clc/opencl/synchronization/utils.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef __CLC_OPENCL_SYNCHRONIZATION_UTILS_H__
#define __CLC_OPENCL_SYNCHRONIZATION_UTILS_H__

#include <clc/internal/clc.h>
#include <clc/opencl/synchronization/cl_mem_fence_flags.h>

// Translate OpenCL cl_mem_fence_flags into the scope mask consumed by the
// __clc_* fence and barrier helpers.
//
//   flag     Any combination of CLK_*_MEM_FENCE bits.
//   returns  __MEMORY_SCOPE_DEVICE if CLK_GLOBAL_MEM_FENCE is set, OR-ed with
//            __MEMORY_SCOPE_WRKGRP if CLK_LOCAL_MEM_FENCE is set; 0 when
//            neither bit is set. Other bits in flag are ignored.
//
// NOTE(review): the __MEMORY_SCOPE_* values are combined with | and later
// tested with &, i.e. used as bit flags -- confirm this matches how clang
// defines those macros.
_CLC_INLINE int getCLCMemoryScope(cl_mem_fence_flags flag) {
  const int device_part =
      (flag & CLK_GLOBAL_MEM_FENCE) ? __MEMORY_SCOPE_DEVICE : 0;
  const int work_group_part =
      (flag & CLK_LOCAL_MEM_FENCE) ? __MEMORY_SCOPE_WRKGRP : 0;

  return device_part | work_group_part;
}

#endif // __CLC_OPENCL_SYNCHRONIZATION_UTILS_H__
29 changes: 5 additions & 24 deletions libclc/opencl/lib/amdgcn/mem_fence/fence.cl
Original file line number Diff line number Diff line change
Expand Up @@ -6,34 +6,15 @@
//
//===----------------------------------------------------------------------===//

#include <clc/mem_fence/clc_mem_fence.h>
#include <clc/opencl/explicit_fence/explicit_memory_fence.h>

void __clc_amdgcn_s_waitcnt(unsigned flags);

// s_waitcnt takes 16bit argument with a combined number of maximum allowed
// pending operations:
// [12:8] LGKM -- LDS, GDS, Konstant (SMRD), Messages
// [7] -- undefined
// [6:4] -- exports, GDS, and mem write
// [3:0] -- vector memory operations

// Newer clang supports __builtin_amdgcn_s_waitcnt
#if __clang_major__ >= 5
#define __waitcnt(x) __builtin_amdgcn_s_waitcnt(x)
#else
#define __waitcnt(x) __clc_amdgcn_s_waitcnt(x)
_CLC_DEF void __clc_amdgcn_s_waitcnt(unsigned) __asm("llvm.amdgcn.s.waitcnt");
#endif
#include <clc/opencl/synchronization/utils.h>

// OpenCL mem_fence() entry point for AMDGCN.
//
// NOTE(review): this hunk is rendered without +/- markers. Going by the
// "5 additions & 24 deletions" summary for this file, the __waitcnt branch is
// presumably the removed implementation and the three statements after it are
// the replacement -- confirm against the raw diff.
_CLC_DEF _CLC_OVERLOAD void mem_fence(cl_mem_fence_flags flags) {
if (flags & CLK_GLOBAL_MEM_FENCE) {
// scalar loads are counted with LGKM but we don't know whether
// the compiler turned any loads to scalar
__waitcnt(0);
} else if (flags & CLK_LOCAL_MEM_FENCE)
__waitcnt(0xff); // LGKM is [12:8]
// Convert the OpenCL flags to a CLC scope mask and delegate to the CLC fence,
// always requesting __ATOMIC_SEQ_CST ordering.
int memory_scope = getCLCMemoryScope(flags);
int memory_order = __ATOMIC_SEQ_CST;
__clc_mem_fence(memory_scope, memory_order);
}
#undef __waitcnt

// We don't have separate mechanism for read and write fences
_CLC_DEF _CLC_OVERLOAD void read_mem_fence(cl_mem_fence_flags flags) {
Expand Down
8 changes: 5 additions & 3 deletions libclc/opencl/lib/amdgcn/synchronization/barrier.cl
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@
//
//===----------------------------------------------------------------------===//

#include <clc/opencl/explicit_fence/explicit_memory_fence.h>
#include <clc/opencl/synchronization/barrier.h>
#include <clc/opencl/synchronization/utils.h>
#include <clc/synchronization/clc_work_group_barrier.h>

// OpenCL barrier() entry point for AMDGCN.
//
// NOTE(review): this hunk is rendered without +/- markers. Going by the
// "5 additions & 3 deletions" summary for this file, the mem_fence() and
// __builtin_amdgcn_s_barrier() calls are presumably the removed lines and the
// three statements after them are the replacement -- confirm against the raw
// diff.
_CLC_DEF _CLC_OVERLOAD void barrier(cl_mem_fence_flags flags) {
mem_fence(flags);
__builtin_amdgcn_s_barrier();
// Convert the OpenCL flags to a CLC scope mask and delegate to the CLC
// work-group barrier, always requesting __ATOMIC_SEQ_CST ordering.
int memory_scope = getCLCMemoryScope(flags);
int memory_order = __ATOMIC_SEQ_CST;
__clc_work_group_barrier(memory_scope, memory_order);
}
7 changes: 5 additions & 2 deletions libclc/opencl/lib/ptx-nvidiacl/mem_fence/fence.cl
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,14 @@
//
//===----------------------------------------------------------------------===//

#include <clc/mem_fence/clc_mem_fence.h>
#include <clc/opencl/explicit_fence/explicit_memory_fence.h>
#include <clc/opencl/synchronization/utils.h>

// OpenCL mem_fence() entry point for NVPTX.
//
// NOTE(review): this hunk is rendered without +/- markers. Going by the
// "5 additions & 2 deletions" summary for this file, the direct
// __nvvm_membar_cta() branch is presumably the removed implementation and the
// three statements after it are the replacement -- confirm against the raw
// diff.
_CLC_DEF _CLC_OVERLOAD void mem_fence(cl_mem_fence_flags flags) {
if (flags & (CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE))
__nvvm_membar_cta();
// Convert the OpenCL flags to a CLC scope mask and delegate to the CLC fence,
// always requesting __ATOMIC_SEQ_CST ordering.
int memory_scope = getCLCMemoryScope(flags);
int memory_order = __ATOMIC_SEQ_CST;
__clc_mem_fence(memory_scope, memory_order);
}

// We do not have separate mechanism for read and write fences.
Expand Down
6 changes: 5 additions & 1 deletion libclc/opencl/lib/ptx-nvidiacl/synchronization/barrier.cl
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,11 @@
//===----------------------------------------------------------------------===//

#include <clc/opencl/synchronization/barrier.h>
#include <clc/opencl/synchronization/utils.h>
#include <clc/synchronization/clc_work_group_barrier.h>

// OpenCL barrier() entry point for NVPTX.
//
// NOTE(review): this hunk is rendered without +/- markers. Going by the
// "5 additions & 1 deletion" summary for this file, the direct __syncthreads()
// call is presumably the removed line and the three statements after it are
// the replacement -- confirm against the raw diff.
_CLC_DEF _CLC_OVERLOAD void barrier(cl_mem_fence_flags flags) {
__syncthreads();
// Convert the OpenCL flags to a CLC scope mask and delegate to the CLC
// work-group barrier, always requesting __ATOMIC_SEQ_CST ordering.
int memory_scope = getCLCMemoryScope(flags);
int memory_order = __ATOMIC_SEQ_CST;
__clc_work_group_barrier(memory_scope, memory_order);
}
Loading