Skip to content

Commit 7cea0ce

Browse files
[libomptarget] Revert all improvements to support
Summary: [libomptarget] Revert all improvements to support The change to unity build for nvcc has broken the build for some developers. This patch reverts to a known-working state. There has been some confusion over exactly how the build broke. I think we have reached a common understanding that the disappearing symbols are from the bitcode library built by clang. The static archive built by nvcc may show the same problem. Some of the confusion arose from building the deviceRTL twice and using one or the other library based on various environmental factors. I'm pretty sure the problem is clang expanding `__forceinline__` into both `__inline__` and `attribute(("always_inline"))`. The `__inline__` attribute resolves to linkonce_odr which is not safe for exporting symbols from translation units. "always_inline" is the desired semantic for small functions defined in one translation unit that are intended to be inlined at link time. "inline" is not. This therefore reintroduces the dependency hazard of supporti.h and some code duplication, and blocks progress separating deviceRTL into reusable components. See also D69857, D69859 for attempts at a fix instead of a revert. Reviewers: ABataev, jdoerfert, grokos, ikitayama, tianshilei1992 Reviewed By: ABataev Subscribers: mgorny, jfb, openmp-commits Tags: #openmp Differential Revision: https://reviews.llvm.org/D69885
1 parent ce2b5cb commit 7cea0ce

File tree

9 files changed

+58
-102
lines changed

9 files changed

+58
-102
lines changed

openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,6 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND)
5555
src/omptarget-nvptx.cu
5656
src/parallel.cu
5757
src/reduction.cu
58-
src/support.cu
5958
src/sync.cu
6059
src/task.cu
6160
)
@@ -89,7 +88,7 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND)
8988
set(BUILD_SHARED_LIBS OFF)
9089
set(CUDA_SEPARABLE_COMPILATION ON)
9190
list(APPEND CUDA_NVCC_FLAGS -I${devicertl_base_directory})
92-
cuda_add_library(omptarget-nvptx STATIC unity.cu
91+
cuda_add_library(omptarget-nvptx STATIC ${cuda_src_files} ${omp_data_objects}
9392
OPTIONS ${CUDA_ARCH} ${CUDA_DEBUG})
9493

9594
# Install device RTL under the lib destination folder.

openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,11 @@
1313
#include "target_impl.h"
1414
#include <stdio.h>
1515

16+
// Warp ID in the CUDA block
17+
INLINE static unsigned getWarpId() { return threadIdx.x / WARPSIZE; }
18+
// Lane ID in the CUDA warp.
19+
INLINE static unsigned getLaneId() { return threadIdx.x % WARPSIZE; }
20+
1621
// Return true if this is the first active thread in the warp.
1722
INLINE static bool IsWarpMasterActiveThread() {
1823
unsigned long long Mask = __kmpc_impl_activemask();
@@ -62,7 +67,7 @@ __kmpc_initialize_data_sharing_environment(__kmpc_data_sharing_slot *rootS,
6267
DSPRINT0(DSFLAG_INIT,
6368
"Entering __kmpc_initialize_data_sharing_environment\n");
6469

65-
unsigned WID = GetWarpId();
70+
unsigned WID = getWarpId();
6671
DSPRINT(DSFLAG_INIT, "Warp ID: %u\n", WID);
6772

6873
omptarget_nvptx_TeamDescr *teamDescr =
@@ -106,7 +111,7 @@ EXTERN void *__kmpc_data_sharing_environment_begin(
106111
DSPRINT(DSFLAG, "Default Data Size %016llx\n",
107112
(unsigned long long)SharingDefaultDataSize);
108113

109-
unsigned WID = GetWarpId();
114+
unsigned WID = getWarpId();
110115
__kmpc_impl_lanemask_t CurActiveThreads = __kmpc_impl_activemask();
111116

112117
__kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
@@ -226,7 +231,7 @@ EXTERN void __kmpc_data_sharing_environment_end(
226231

227232
DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_end\n");
228233

229-
unsigned WID = GetWarpId();
234+
unsigned WID = getWarpId();
230235

231236
if (IsEntryPoint) {
232237
if (IsWarpMasterActiveThread()) {
@@ -354,7 +359,7 @@ EXTERN void __kmpc_data_sharing_init_stack_spmd() {
354359
// This function initializes the stack pointer with the pointer to the
355360
// statically allocated shared memory slots. The size of a shared memory
356361
// slot is pre-determined to be 256 bytes.
357-
if (GetThreadIdInBlock() == 0)
362+
if (threadIdx.x == 0)
358363
data_sharing_init_stack_common();
359364

360365
__threadfence_block();
@@ -372,7 +377,7 @@ INLINE static void* data_sharing_push_stack_common(size_t PushSize) {
372377
PushSize = (PushSize + (Alignment - 1)) / Alignment * Alignment;
373378

374379
// Frame pointer must be visible to all workers in the same warp.
375-
const unsigned WID = GetWarpId();
380+
const unsigned WID = getWarpId();
376381
void *FrameP = 0;
377382
__kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask();
378383

@@ -462,7 +467,7 @@ EXTERN void *__kmpc_data_sharing_push_stack(size_t DataSize,
462467
// Compute the start address of the frame of each thread in the warp.
463468
uintptr_t FrameStartAddress =
464469
(uintptr_t) data_sharing_push_stack_common(PushSize);
465-
FrameStartAddress += (uintptr_t) (GetLaneId() * DataSize);
470+
FrameStartAddress += (uintptr_t) (getLaneId() * DataSize);
466471
return (void *)FrameStartAddress;
467472
}
468473

@@ -477,7 +482,7 @@ EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) {
477482
__threadfence_block();
478483

479484
if (GetThreadIdInBlock() % WARPSIZE == 0) {
480-
unsigned WID = GetWarpId();
485+
unsigned WID = getWarpId();
481486

482487
// Current slot
483488
__kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];

openmp/libomptarget/deviceRTLs/nvptx/src/debug.h

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -128,12 +128,12 @@
128128

129129
#if OMPTARGET_NVPTX_DEBUG || OMPTARGET_NVPTX_TEST || OMPTARGET_NVPTX_WARNING
130130
#include <stdio.h>
131-
#include "support.h"
131+
#include "target_impl.h"
132132

133133
template <typename... Arguments>
134134
NOINLINE static void log(const char *fmt, Arguments... parameters) {
135-
printf(fmt, (int)GetBlockIdInKernel(), (int)GetThreadIdInBlock(),
136-
(int)GetWarpId(), (int)GetLaneId(), parameters...);
135+
printf(fmt, (int)blockIdx.x, (int)threadIdx.x, (int)(threadIdx.x / WARPSIZE),
136+
(int)(threadIdx.x & 0x1F), parameters...);
137137
}
138138

139139
#endif
@@ -144,8 +144,9 @@ template <typename... Arguments>
144144
NOINLINE static void check(bool cond, const char *fmt,
145145
Arguments... parameters) {
146146
if (!cond)
147-
printf(fmt, (int)GetBlockIdInKernel(), (int)GetThreadIdInBlock(),
148-
(int)GetWarpId(), (int)GetLaneId(), parameters...);
147+
printf(fmt, (int)blockIdx.x, (int)threadIdx.x,
148+
(int)(threadIdx.x / WARPSIZE), (int)(threadIdx.x & 0x1F),
149+
parameters...);
149150
assert(cond);
150151
}
151152

openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -364,7 +364,7 @@ EXTERN void omp_set_lock(omp_lock_t *lock) {
364364
for (;;) {
365365
now = clock();
366366
clock_t cycles = now > start ? now - start : now + (0xffffffff - start);
367-
if (cycles >= __OMP_SPIN * GetBlockIdInKernel()) {
367+
if (cycles >= __OMP_SPIN * blockIdx.x) {
368368
break;
369369
}
370370
}

openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -385,5 +385,6 @@ INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int globalThreadId);
385385
////////////////////////////////////////////////////////////////////////////////
386386

387387
#include "omptarget-nvptxi.h"
388+
#include "supporti.h"
388389

389390
#endif

openmp/libomptarget/deviceRTLs/nvptx/src/support.h

Lines changed: 29 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,7 @@
1010
//
1111
//===----------------------------------------------------------------------===//
1212

13-
#ifndef OMPTARGET_SUPPORT_H
14-
#define OMPTARGET_SUPPORT_H
15-
16-
#include "interface.h"
1713
#include "target_impl.h"
18-
1914
////////////////////////////////////////////////////////////////////////////////
2015
// Execution Parameters
2116
////////////////////////////////////////////////////////////////////////////////
@@ -31,70 +26,58 @@ enum RuntimeMode {
3126
RuntimeMask = 0x02u,
3227
};
3328

34-
DEVICE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode);
35-
DEVICE bool isGenericMode();
36-
DEVICE bool isSPMDMode();
37-
DEVICE bool isRuntimeUninitialized();
38-
DEVICE bool isRuntimeInitialized();
39-
40-
////////////////////////////////////////////////////////////////////////////////
41-
// Execution Modes based on ___location parameter fields
42-
////////////////////////////////////////////////////////////////////////////////
43-
44-
DEVICE bool checkSPMDMode(kmp_Ident *loc);
45-
46-
DEVICE bool checkGenericMode(kmp_Ident *loc);
47-
48-
DEVICE bool checkRuntimeUninitialized(kmp_Ident *loc);
49-
50-
DEVICE bool checkRuntimeInitialized(kmp_Ident *loc);
29+
INLINE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode);
30+
INLINE bool isGenericMode();
31+
INLINE bool isSPMDMode();
32+
INLINE bool isRuntimeUninitialized();
33+
INLINE bool isRuntimeInitialized();
5134

5235
////////////////////////////////////////////////////////////////////////////////
5336
// get info from machine
5437
////////////////////////////////////////////////////////////////////////////////
5538

5639
// get low level ids of resources
57-
DEVICE int GetThreadIdInBlock();
58-
DEVICE int GetBlockIdInKernel();
59-
DEVICE int GetNumberOfBlocksInKernel();
60-
DEVICE int GetNumberOfThreadsInBlock();
61-
DEVICE unsigned GetWarpId();
62-
DEVICE unsigned GetLaneId();
40+
INLINE int GetThreadIdInBlock();
41+
INLINE int GetBlockIdInKernel();
42+
INLINE int GetNumberOfBlocksInKernel();
43+
INLINE int GetNumberOfThreadsInBlock();
44+
INLINE unsigned GetWarpId();
45+
INLINE unsigned GetLaneId();
6346

6447
// get global ids to locate tread/team info (constant regardless of OMP)
65-
DEVICE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode);
66-
DEVICE int GetMasterThreadID();
67-
DEVICE int GetNumberOfWorkersInTeam();
48+
INLINE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode);
49+
INLINE int GetMasterThreadID();
50+
INLINE int GetNumberOfWorkersInTeam();
6851

6952
// get OpenMP thread and team ids
70-
DEVICE int GetOmpThreadId(int threadId,
53+
INLINE int GetOmpThreadId(int threadId,
7154
bool isSPMDExecutionMode); // omp_thread_num
72-
DEVICE int GetOmpTeamId(); // omp_team_num
55+
INLINE int GetOmpTeamId(); // omp_team_num
7356

7457
// get OpenMP number of threads and team
75-
DEVICE int GetNumberOfOmpThreads(bool isSPMDExecutionMode); // omp_num_threads
76-
DEVICE int GetNumberOfOmpTeams(); // omp_num_teams
58+
INLINE int GetNumberOfOmpThreads(bool isSPMDExecutionMode); // omp_num_threads
59+
INLINE int GetNumberOfOmpTeams(); // omp_num_teams
7760

7861
// get OpenMP number of procs
79-
DEVICE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode);
80-
DEVICE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode);
62+
INLINE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode);
63+
INLINE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode);
8164

8265
// masters
83-
DEVICE int IsTeamMaster(int ompThreadId);
66+
INLINE int IsTeamMaster(int ompThreadId);
8467

8568
// Parallel level
86-
DEVICE void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask);
87-
DEVICE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask);
69+
INLINE void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask);
70+
INLINE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask);
8871

8972
////////////////////////////////////////////////////////////////////////////////
9073
// Memory
9174
////////////////////////////////////////////////////////////////////////////////
9275

9376
// safe alloc and free
94-
DEVICE void *SafeMalloc(size_t size, const char *msg); // check if success
95-
DEVICE void *SafeFree(void *ptr, const char *msg);
77+
INLINE void *SafeMalloc(size_t size, const char *msg); // check if success
78+
INLINE void *SafeFree(void *ptr, const char *msg);
9679
// pad to a alignment (power of 2 only)
97-
DEVICE unsigned long PadBytes(unsigned long size, unsigned long alignment);
80+
INLINE unsigned long PadBytes(unsigned long size, unsigned long alignment);
9881
#define ADD_BYTES(_addr, _bytes) \
9982
((void *)((char *)((void *)(_addr)) + (_bytes)))
10083
#define SUB_BYTES(_addr, _bytes) \
@@ -103,8 +86,6 @@ DEVICE unsigned long PadBytes(unsigned long size, unsigned long alignment);
10386
////////////////////////////////////////////////////////////////////////////////
10487
// Teams Reduction Scratchpad Helpers
10588
////////////////////////////////////////////////////////////////////////////////
106-
DEVICE unsigned int *GetTeamsReductionTimestamp();
107-
DEVICE char *GetTeamsReductionScratchpad();
108-
DEVICE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr);
109-
110-
#endif
89+
INLINE unsigned int *GetTeamsReductionTimestamp();
90+
INLINE char *GetTeamsReductionScratchpad();
91+
INLINE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr);

openmp/libomptarget/deviceRTLs/nvptx/src/support.cu renamed to openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
//===--------- support.cu - NVPTX OpenMP support functions ------- CUDA -*-===//
1+
//===--------- supporti.h - NVPTX OpenMP support functions ------- CUDA -*-===//
22
//
33
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
44
// See https://llvm.org/LICENSE.txt for license information.
@@ -10,14 +10,12 @@
1010
//
1111
//===----------------------------------------------------------------------===//
1212

13-
#include "support.h"
14-
#include "debug.h"
15-
#include "omptarget-nvptx.h"
16-
1713
////////////////////////////////////////////////////////////////////////////////
1814
// Execution Parameters
1915
////////////////////////////////////////////////////////////////////////////////
2016

17+
#include "target_impl.h"
18+
2119
INLINE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode) {
2220
execution_param = EMode;
2321
execution_param |= RMode;
@@ -106,9 +104,9 @@ INLINE int GetNumberOfBlocksInKernel() { return gridDim.x; }
106104

107105
INLINE int GetNumberOfThreadsInBlock() { return blockDim.x; }
108106

109-
INLINE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
107+
INLINE unsigned GetWarpId() { return threadIdx.x / WARPSIZE; }
110108

111-
INLINE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); }
109+
INLINE unsigned GetLaneId() { return threadIdx.x & (WARPSIZE - 1); }
112110

113111
////////////////////////////////////////////////////////////////////////////////
114112
//
@@ -124,9 +122,7 @@ INLINE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); }
124122
// If NumThreads is 1024, master id is 992.
125123
//
126124
// Called in Generic Execution Mode only.
127-
INLINE int GetMasterThreadID() {
128-
return (GetNumberOfThreadsInBlock() - 1) & ~(WARPSIZE - 1);
129-
}
125+
INLINE int GetMasterThreadID() { return (blockDim.x - 1) & ~(WARPSIZE - 1); }
130126

131127
// The last warp is reserved for the master; other warps are workers.
132128
// Called in Generic Execution Mode only.

openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,8 @@
1515
#include <cuda.h>
1616
#include "nvptx_interface.h"
1717

18-
#define DEVICE __device__
19-
#define INLINE __forceinline__ DEVICE
20-
#define NOINLINE __noinline__ DEVICE
18+
#define INLINE __forceinline__ __device__
19+
#define NOINLINE __noinline__ __device__
2120

2221
////////////////////////////////////////////////////////////////////////////////
2322
// Kernel options

openmp/libomptarget/deviceRTLs/nvptx/unity.cu

Lines changed: 0 additions & 26 deletions
This file was deleted.

0 commit comments

Comments
 (0)