23#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H
24#define HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H
29#if !defined(__HIPCC_RTC__)
30#include <hip/hip_runtime_api.h>
34#include <hip/hip_vector_types.h>
38extern "C" __device__
int printf(
const char *fmt, ...);
40template <
typename... All>
41static inline __device__
void printf(
const char* format, All... all) {}
44extern "C" __device__
unsigned long long __ockl_steadyctr_u64();
51__device__
static inline unsigned int __popc(
unsigned int input) {
52 return __builtin_popcount(input);
54__device__
static inline unsigned int __popcll(
unsigned long long int input) {
55 return __builtin_popcountll(input);
58__device__
static inline int __clz(
int input) {
59 return __ockl_clz_u32((uint)input);
62__device__
static inline int __clzll(
long long int input) {
63 return __ockl_clz_u64((uint64_t)input);
66__device__
static inline unsigned int __ffs(
unsigned int input) {
67 return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
70__device__
static inline unsigned int __ffsll(
unsigned long long int input) {
71 return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
74__device__
static inline unsigned int __ffs(
int input) {
75 return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
78__device__
static inline unsigned int __ffsll(
long long int input) {
79 return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
85__device__
static int32_t __fns64(uint64_t mask, uint32_t base, int32_t offset) {
86 uint64_t temp_mask = mask;
87 int32_t temp_offset = offset;
90 temp_mask &= (1 << base);
93 else if (offset < 0) {
94 temp_mask = __builtin_bitreverse64(mask);
96 temp_offset = -offset;
99 temp_mask = temp_mask & ((~0ULL) << base);
100 if (__builtin_popcountll(temp_mask) < temp_offset)
103 for (
int i = 0x20; i > 0; i >>= 1) {
104 uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1);
105 int32_t pcnt = __builtin_popcountll(temp_mask_lo);
106 if (pcnt < temp_offset) {
107 temp_mask = temp_mask >> i;
112 temp_mask = temp_mask_lo;
121__device__
static int32_t __fns32(uint64_t mask, uint32_t base, int32_t offset) {
122 uint64_t temp_mask = mask;
123 int32_t temp_offset = offset;
125 temp_mask &= (1 << base);
128 else if (offset < 0) {
129 temp_mask = __builtin_bitreverse64(mask);
131 temp_offset = -offset;
133 temp_mask = temp_mask & ((~0ULL) << base);
134 if (__builtin_popcountll(temp_mask) < temp_offset)
137 for (
int i = 0x20; i > 0; i >>= 1) {
138 uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1);
139 int32_t pcnt = __builtin_popcountll(temp_mask_lo);
140 if (pcnt < temp_offset) {
141 temp_mask = temp_mask >> i;
146 temp_mask = temp_mask_lo;
154__device__
static inline unsigned int __brev(
unsigned int input) {
155 return __builtin_bitreverse32(input);
158__device__
static inline unsigned long long int __brevll(
unsigned long long int input) {
159 return __builtin_bitreverse64(input);
162__device__
static inline unsigned int __lastbit_u32_u64(uint64_t input) {
163 return input == 0 ? -1 : __builtin_ctzl(input);
166__device__
static inline unsigned int __bitextract_u32(
unsigned int src0,
unsigned int src1,
unsigned int src2) {
167 uint32_t offset = src1 & 31;
168 uint32_t width = src2 & 31;
169 return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width);
172__device__
static inline uint64_t __bitextract_u64(uint64_t src0,
unsigned int src1,
unsigned int src2) {
173 uint64_t offset = src1 & 63;
174 uint64_t width = src2 & 63;
175 return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width);
178__device__
static inline unsigned int __bitinsert_u32(
unsigned int src0,
unsigned int src1,
unsigned int src2,
unsigned int src3) {
179 uint32_t offset = src2 & 31;
180 uint32_t width = src3 & 31;
181 uint32_t mask = (1 << width) - 1;
182 return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
185__device__
static inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1,
unsigned int src2,
unsigned int src3) {
186 uint64_t offset = src2 & 63;
187 uint64_t width = src3 & 63;
188 uint64_t mask = (1ULL << width) - 1;
189 return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
192__device__
inline unsigned int __funnelshift_l(
unsigned int lo,
unsigned int hi,
unsigned int shift)
194 uint32_t mask_shift = shift & 31;
195 return mask_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - mask_shift);
198__device__
inline unsigned int __funnelshift_lc(
unsigned int lo,
unsigned int hi,
unsigned int shift)
200 uint32_t min_shift = shift >= 32 ? 32 : shift;
201 return min_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - min_shift);
204__device__
inline unsigned int __funnelshift_r(
unsigned int lo,
unsigned int hi,
unsigned int shift)
206 return __builtin_amdgcn_alignbit(hi, lo, shift);
209__device__
inline unsigned int __funnelshift_rc(
unsigned int lo,
unsigned int hi,
unsigned int shift)
211 return shift >= 32 ? hi : __builtin_amdgcn_alignbit(hi, lo, shift);
214__device__
static unsigned int __byte_perm(
unsigned int x,
unsigned int y,
unsigned int s);
215__device__
static unsigned int __hadd(
int x,
int y);
216__device__
static int __mul24(
int x,
int y);
217__device__
static long long int __mul64hi(
long long int x,
long long int y);
218__device__
static int __mulhi(
int x,
int y);
219__device__
static int __rhadd(
int x,
int y);
220__device__
static unsigned int __sad(
int x,
int y,
unsigned int z);
221__device__
static unsigned int __uhadd(
unsigned int x,
unsigned int y);
222__device__
static int __umul24(
unsigned int x,
unsigned int y);
223__device__
static unsigned long long int __umul64hi(
unsigned long long int x,
unsigned long long int y);
224__device__
static unsigned int __umulhi(
unsigned int x,
unsigned int y);
225__device__
static unsigned int __urhadd(
unsigned int x,
unsigned int y);
226__device__
static unsigned int __usad(
unsigned int x,
unsigned int y,
unsigned int z);
233} __attribute__((aligned(4)));
240} __attribute__((aligned(8)));
243static inline unsigned int __byte_perm(
unsigned int x,
unsigned int y,
unsigned int s) {
250 result = cHoldVal.c[cHoldKey.c[0] & 0x07];
251 result += (cHoldVal.c[(cHoldKey.c[0] & 0x70) >> 4] << 8);
252 result += (cHoldVal.c[cHoldKey.c[1] & 0x07] << 16);
253 result += (cHoldVal.c[(cHoldKey.c[1] & 0x70) >> 4] << 24);
257__device__
static inline unsigned int __hadd(
int x,
int y) {
259 int sign = z & 0x8000000;
260 int value = z & 0x7FFFFFFF;
261 return ((value) >> 1 || sign);
264__device__
static inline int __mul24(
int x,
int y) {
265 return __ockl_mul24_i32(x, y);
268__device__
static inline long long __mul64hi(
long long int x,
long long int y) {
269 ulong x0 = (ulong)x & 0xffffffffUL;
271 ulong y0 = (ulong)y & 0xffffffffUL;
274 long t = x1*y0 + (z0 >> 32);
275 long z1 = t & 0xffffffffL;
278 return x1*y1 + z2 + (z1 >> 32);
281__device__
static inline int __mulhi(
int x,
int y) {
282 return __ockl_mul_hi_i32(x, y);
285__device__
static inline int __rhadd(
int x,
int y) {
287 int sign = z & 0x8000000;
288 int value = z & 0x7FFFFFFF;
289 return ((value) >> 1 || sign);
291__device__
static inline unsigned int __sad(
int x,
int y,
unsigned int z) {
292 return x > y ? x - y + z : y - x + z;
294__device__
static inline unsigned int __uhadd(
unsigned int x,
unsigned int y) {
297__device__
static inline int __umul24(
unsigned int x,
unsigned int y) {
298 return __ockl_mul24_u32(x, y);
302static inline unsigned long long __umul64hi(
unsigned long long int x,
unsigned long long int y) {
303 ulong x0 = x & 0xffffffffUL;
305 ulong y0 = y & 0xffffffffUL;
308 ulong t = x1*y0 + (z0 >> 32);
309 ulong z1 = t & 0xffffffffUL;
312 return x1*y1 + z2 + (z1 >> 32);
315__device__
static inline unsigned int __umulhi(
unsigned int x,
unsigned int y) {
316 return __ockl_mul_hi_u32(x, y);
318__device__
static inline unsigned int __urhadd(
unsigned int x,
unsigned int y) {
319 return (x + y + 1) >> 1;
321__device__
static inline unsigned int __usad(
unsigned int x,
unsigned int y,
unsigned int z) {
322 return __ockl_sadd_u32(x, y, z);
325__device__
static inline unsigned int __lane_id() {
326 return __builtin_amdgcn_mbcnt_hi(
327 -1, __builtin_amdgcn_mbcnt_lo(-1, 0));
331static inline unsigned int __mbcnt_lo(
unsigned int x,
unsigned int y) {
return __builtin_amdgcn_mbcnt_lo(x,y);};
334static inline unsigned int __mbcnt_hi(
unsigned int x,
unsigned int y) {
return __builtin_amdgcn_mbcnt_hi(x,y);};
340#if !defined(__HIPCC_RTC__)
341#include "amd_warp_functions.h"
344#define MASK1 0x00ff00ff
345#define MASK2 0xff00ff00
349 unsigned one1 = in1.w & MASK1;
350 unsigned one2 = in2.w & MASK1;
351 out.w = (one1 + one2) & MASK1;
352 one1 = in1.w & MASK2;
353 one2 = in2.w & MASK2;
354 out.w = out.w | ((one1 + one2) & MASK2);
360 unsigned one1 = in1.w & MASK1;
361 unsigned one2 = in2.w & MASK1;
362 out.w = (one1 - one2) & MASK1;
363 one1 = in1.w & MASK2;
364 one2 = in2.w & MASK2;
365 out.w = out.w | ((one1 - one2) & MASK2);
371 unsigned one1 = in1.w & MASK1;
372 unsigned one2 = in2.w & MASK1;
373 out.w = (one1 * one2) & MASK1;
374 one1 = in1.w & MASK2;
375 one2 = in2.w & MASK2;
376 out.w = out.w | ((one1 * one2) & MASK2);
380__device__
static inline float __double2float_rd(
double x) {
381 return __ocml_cvtrtn_f32_f64(x);
383__device__
static inline float __double2float_rn(
double x) {
return x; }
384__device__
static inline float __double2float_ru(
double x) {
385 return __ocml_cvtrtp_f32_f64(x);
387__device__
static inline float __double2float_rz(
double x) {
388 return __ocml_cvtrtz_f32_f64(x);
391__device__
static inline int __double2hiint(
double x) {
392 static_assert(
sizeof(double) == 2 *
sizeof(
int),
"");
395 __builtin_memcpy(tmp, &x,
sizeof(tmp));
399__device__
static inline int __double2loint(
double x) {
400 static_assert(
sizeof(double) == 2 *
sizeof(
int),
"");
403 __builtin_memcpy(tmp, &x,
sizeof(tmp));
408__device__
static inline int __double2int_rd(
double x) {
return (
int)__ocml_floor_f64(x); }
409__device__
static inline int __double2int_rn(
double x) {
return (
int)__ocml_rint_f64(x); }
410__device__
static inline int __double2int_ru(
double x) {
return (
int)__ocml_ceil_f64(x); }
411__device__
static inline int __double2int_rz(
double x) {
return (
int)x; }
413__device__
static inline long long int __double2ll_rd(
double x) {
414 return (
long long)__ocml_floor_f64(x);
416__device__
static inline long long int __double2ll_rn(
double x) {
417 return (
long long)__ocml_rint_f64(x);
419__device__
static inline long long int __double2ll_ru(
double x) {
420 return (
long long)__ocml_ceil_f64(x);
422__device__
static inline long long int __double2ll_rz(
double x) {
return (
long long)x; }
424__device__
static inline unsigned int __double2uint_rd(
double x) {
425 return (
unsigned int)__ocml_floor_f64(x);
427__device__
static inline unsigned int __double2uint_rn(
double x) {
428 return (
unsigned int)__ocml_rint_f64(x);
430__device__
static inline unsigned int __double2uint_ru(
double x) {
431 return (
unsigned int)__ocml_ceil_f64(x);
433__device__
static inline unsigned int __double2uint_rz(
double x) {
return (
unsigned int)x; }
435__device__
static inline unsigned long long int __double2ull_rd(
double x) {
436 return (
unsigned long long int)__ocml_floor_f64(x);
438__device__
static inline unsigned long long int __double2ull_rn(
double x) {
439 return (
unsigned long long int)__ocml_rint_f64(x);
441__device__
static inline unsigned long long int __double2ull_ru(
double x) {
442 return (
unsigned long long int)__ocml_ceil_f64(x);
444__device__
static inline unsigned long long int __double2ull_rz(
double x) {
445 return (
unsigned long long int)x;
447#if defined(__clang__)
448#pragma clang diagnostic push
449#pragma clang diagnostic ignored "-Wc++98-compat-pedantic"
451__device__
static inline long long int __double_as_longlong(
double x) {
452 static_assert(
sizeof(
long long) ==
sizeof(
double),
"");
455 __builtin_memcpy(&tmp, &x,
sizeof(tmp));
459#if defined(__clang__)
460#pragma clang diagnostic pop
477__device__
static inline int __float2int_rd(
float x) {
return (
int)__ocml_floor_f32(x); }
478__device__
static inline int __float2int_rn(
float x) {
return (
int)__ocml_rint_f32(x); }
479__device__
static inline int __float2int_ru(
float x) {
return (
int)__ocml_ceil_f32(x); }
480__device__
static inline int __float2int_rz(
float x) {
return (
int)__ocml_trunc_f32(x); }
482__device__
static inline long long int __float2ll_rd(
float x) {
483 return (
long long int)__ocml_floor_f32(x);
485__device__
static inline long long int __float2ll_rn(
float x) {
486 return (
long long int)__ocml_rint_f32(x);
488__device__
static inline long long int __float2ll_ru(
float x) {
489 return (
long long int)__ocml_ceil_f32(x);
491__device__
static inline long long int __float2ll_rz(
float x) {
return (
long long int)x; }
493__device__
static inline unsigned int __float2uint_rd(
float x) {
494 return (
unsigned int)__ocml_floor_f32(x);
496__device__
static inline unsigned int __float2uint_rn(
float x) {
497 return (
unsigned int)__ocml_rint_f32(x);
499__device__
static inline unsigned int __float2uint_ru(
float x) {
500 return (
unsigned int)__ocml_ceil_f32(x);
502__device__
static inline unsigned int __float2uint_rz(
float x) {
return (
unsigned int)x; }
504__device__
static inline unsigned long long int __float2ull_rd(
float x) {
505 return (
unsigned long long int)__ocml_floor_f32(x);
507__device__
static inline unsigned long long int __float2ull_rn(
float x) {
508 return (
unsigned long long int)__ocml_rint_f32(x);
510__device__
static inline unsigned long long int __float2ull_ru(
float x) {
511 return (
unsigned long long int)__ocml_ceil_f32(x);
513__device__
static inline unsigned long long int __float2ull_rz(
float x) {
514 return (
unsigned long long int)x;
517__device__
static inline int __float_as_int(
float x) {
518 static_assert(
sizeof(int) ==
sizeof(
float),
"");
521 __builtin_memcpy(&tmp, &x,
sizeof(tmp));
526__device__
static inline unsigned int __float_as_uint(
float x) {
527 static_assert(
sizeof(
unsigned int) ==
sizeof(
float),
"");
530 __builtin_memcpy(&tmp, &x,
sizeof(tmp));
535__device__
static inline double __hiloint2double(
int hi,
int lo) {
536 static_assert(
sizeof(double) ==
sizeof(uint64_t),
"");
538 uint64_t tmp0 = (
static_cast<uint64_t
>(hi) << 32ull) |
static_cast<uint32_t
>(lo);
540 __builtin_memcpy(&tmp1, &tmp0,
sizeof(tmp0));
545__device__
static inline double __int2double_rn(
int x) {
return (
double)x; }
547__device__
static inline float __int2float_rd(
int x) {
548 return __ocml_cvtrtn_f32_s32(x);
550__device__
static inline float __int2float_rn(
int x) {
return (
float)x; }
551__device__
static inline float __int2float_ru(
int x) {
552 return __ocml_cvtrtp_f32_s32(x);
554__device__
static inline float __int2float_rz(
int x) {
555 return __ocml_cvtrtz_f32_s32(x);
558__device__
static inline float __int_as_float(
int x) {
559 static_assert(
sizeof(float) ==
sizeof(
int),
"");
562 __builtin_memcpy(&tmp, &x,
sizeof(tmp));
567__device__
static inline double __ll2double_rd(
long long int x) {
568 return __ocml_cvtrtn_f64_s64(x);
570__device__
static inline double __ll2double_rn(
long long int x) {
return (
double)x; }
571__device__
static inline double __ll2double_ru(
long long int x) {
572 return __ocml_cvtrtp_f64_s64(x);
574__device__
static inline double __ll2double_rz(
long long int x) {
575 return __ocml_cvtrtz_f64_s64(x);
578__device__
static inline float __ll2float_rd(
long long int x) {
579 return __ocml_cvtrtn_f32_s64(x);
581__device__
static inline float __ll2float_rn(
long long int x) {
return (
float)x; }
582__device__
static inline float __ll2float_ru(
long long int x) {
583 return __ocml_cvtrtp_f32_s64(x);
585__device__
static inline float __ll2float_rz(
long long int x) {
586 return __ocml_cvtrtz_f32_s64(x);
589__device__
static inline double __longlong_as_double(
long long int x) {
590 static_assert(
sizeof(double) ==
sizeof(
long long),
"");
593 __builtin_memcpy(&tmp, &x,
sizeof(tmp));
598__device__
static inline double __uint2double_rn(
unsigned int x) {
return (
double)x; }
600__device__
static inline float __uint2float_rd(
unsigned int x) {
601 return __ocml_cvtrtn_f32_u32(x);
603__device__
static inline float __uint2float_rn(
unsigned int x) {
return (
float)x; }
604__device__
static inline float __uint2float_ru(
unsigned int x) {
605 return __ocml_cvtrtp_f32_u32(x);
607__device__
static inline float __uint2float_rz(
unsigned int x) {
608 return __ocml_cvtrtz_f32_u32(x);
611__device__
static inline float __uint_as_float(
unsigned int x) {
612 static_assert(
sizeof(float) ==
sizeof(
unsigned int),
"");
615 __builtin_memcpy(&tmp, &x,
sizeof(tmp));
620__device__
static inline double __ull2double_rd(
unsigned long long int x) {
621 return __ocml_cvtrtn_f64_u64(x);
623__device__
static inline double __ull2double_rn(
unsigned long long int x) {
return (
double)x; }
624__device__
static inline double __ull2double_ru(
unsigned long long int x) {
625 return __ocml_cvtrtp_f64_u64(x);
627__device__
static inline double __ull2double_rz(
unsigned long long int x) {
628 return __ocml_cvtrtz_f64_u64(x);
631__device__
static inline float __ull2float_rd(
unsigned long long int x) {
632 return __ocml_cvtrtn_f32_u64(x);
634__device__
static inline float __ull2float_rn(
unsigned long long int x) {
return (
float)x; }
635__device__
static inline float __ull2float_ru(
unsigned long long int x) {
636 return __ocml_cvtrtp_f32_u64(x);
638__device__
static inline float __ull2float_rz(
unsigned long long int x) {
639 return __ocml_cvtrtz_f32_u64(x);
642#if __HIP_CLANG_ONLY__
645__device__
long long int __clock64();
646__device__
long long int __clock();
647__device__
long long int clock64();
648__device__
long long int clock();
649__device__
long long int wall_clock64();
651__device__
void __named_sync();
653#ifdef __HIP_DEVICE_COMPILE__
659inline __attribute((always_inline))
660long long int __clock64() {
661#if __has_builtin(__builtin_amdgcn_s_memtime)
663 return (
long long int) __builtin_amdgcn_s_memtime();
666 return (
long long int) __builtin_readcyclecounter();
671inline __attribute((always_inline))
672long long int __clock() {
return __clock64(); }
677inline __attribute__((always_inline))
678long long int wall_clock64() {
679 return (
long long int) __ockl_steadyctr_u64();
683inline __attribute__((always_inline))
684long long int clock64() {
return __clock64(); }
687inline __attribute__((always_inline))
688long long int clock() {
return __clock(); }
693void __named_sync() { __builtin_amdgcn_s_barrier(); }
700int __all(
int predicate) {
701 return __ockl_wfall_i32(predicate);
706int __any(
int predicate) {
707 return __ockl_wfany_i32(predicate);
715unsigned long long int __ballot(
int predicate) {
716 return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
721unsigned long long int __ballot64(
int predicate) {
722 return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE);
728uint64_t __lanemask_gt()
730 uint32_t lane = __ockl_lane_u32();
733 uint64_t ballot = __ballot64(1);
734 uint64_t mask = (~((uint64_t)0)) << (lane + 1);
735 return mask & ballot;
740uint64_t __lanemask_lt()
742 uint32_t lane = __ockl_lane_u32();
743 int64_t ballot = __ballot64(1);
744 uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
745 return mask & ballot;
750uint64_t __lanemask_eq()
752 uint32_t lane = __ockl_lane_u32();
753 int64_t mask = ((uint64_t)1 << lane);
758__device__
inline void* __local_to_generic(
void* p) {
return p; }
760#ifdef __HIP_DEVICE_COMPILE__
763void* __get_dynamicgroupbaseptr()
766 return (
char*)__local_to_generic((
void*)__to_local(__builtin_amdgcn_groupstaticsize()));
770void* __get_dynamicgroupbaseptr();
775void *__amdgcn_get_dynamicgroupbaseptr() {
776 return __get_dynamicgroupbaseptr();
782static void __threadfence()
784 __builtin_amdgcn_fence(__ATOMIC_SEQ_CST,
"agent");
789static void __threadfence_block()
791 __builtin_amdgcn_fence(__ATOMIC_SEQ_CST,
"workgroup");
796static void __threadfence_system()
798 __builtin_amdgcn_fence(__ATOMIC_SEQ_CST,
"");
806 return __builtin_trap();
814#if defined(_WIN32) || defined(_WIN64)
815extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
816void _wassert(
const wchar_t *_msg,
const wchar_t *_file,
unsigned _line) {
821extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
822void __assert_fail(
const char *assertion,
825 const char *function)
827 const char fmt[] =
"%s:%u: %s: Device-side assertion `%s' failed.\n";
839#define __hip_get_string_length(LEN, STR) \
841 const char *tmp = STR; \
846 auto msg = __ockl_fprintf_stderr_begin();
848 __hip_get_string_length(len, fmt);
849 msg = __ockl_fprintf_append_string_n(msg, fmt, len, 0);
850 __hip_get_string_length(len, file);
851 msg = __ockl_fprintf_append_string_n(msg, file, len, 0);
852 msg = __ockl_fprintf_append_args(msg, 1, line, 0, 0, 0, 0, 0, 0, 0);
853 __hip_get_string_length(len, function);
854 msg = __ockl_fprintf_append_string_n(msg, function, len, 0);
855 __hip_get_string_length(len, assertion);
856 __ockl_fprintf_append_string_n(msg, assertion, len, 1);
858#undef __hip_get_string_length
863extern "C" __device__ __attribute__((noinline)) __attribute__((weak))
871__device__
inline static void __work_group_barrier(__cl_mem_fence_flags flags) {
873 __builtin_amdgcn_fence(__ATOMIC_RELEASE,
"workgroup");
874 __builtin_amdgcn_s_barrier();
875 __builtin_amdgcn_fence(__ATOMIC_ACQUIRE,
"workgroup");
877 __builtin_amdgcn_s_barrier();
883static void __barrier(
int n)
885 __work_group_barrier((__cl_mem_fence_flags)n);
890__attribute__((convergent))
893 __barrier(__CLK_LOCAL_MEM_FENCE);
898__attribute__((convergent))
899int __syncthreads_count(
int predicate)
901 return __ockl_wgred_add_i32(!!predicate);
906__attribute__((convergent))
907int __syncthreads_and(
int predicate)
909 return __ockl_wgred_and_i32(!!predicate);
914__attribute__((convergent))
915int __syncthreads_or(
int predicate)
917 return __ockl_wgred_or_i32(!!predicate);
949#if (defined (__GFX10__) || defined (__GFX11__))
955#if (defined(__GFX10__) || defined(__GFX11__))
956 #define HW_ID_WGP_ID_SIZE 4
957 #define HW_ID_WGP_ID_OFFSET 10
959 #define HW_ID_CU_ID_SIZE 4
960 #define HW_ID_CU_ID_OFFSET 8
963#if (defined(__gfx908__) || defined(__gfx90a__) || \
965 #define HW_ID_SE_ID_SIZE 3
967 #define HW_ID_SE_ID_SIZE 2
969#if (defined(__GFX10__) || defined(__GFX11__))
970 #define HW_ID_SE_ID_OFFSET 18
971 #define HW_ID_SA_ID_OFFSET 16
972 #define HW_ID_SA_ID_SIZE 1
974 #define HW_ID_SE_ID_OFFSET 13
977#if (defined(__gfx940__))
979 #define XCC_ID_XCC_ID_SIZE 4
980 #define XCC_ID_XCC_ID_OFFSET 0
983#if (!defined(__HIP_NO_IMAGE_SUPPORT) && \
984 (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)))
985 #define __HIP_NO_IMAGE_SUPPORT 1
995#define GETREG_IMMED(SZ,OFF,REG) (((SZ) << 11) | ((OFF) << 6) | (REG))
1005unsigned __smid(
void)
1007 unsigned se_id = __builtin_amdgcn_s_getreg(
1008 GETREG_IMMED(HW_ID_SE_ID_SIZE-1, HW_ID_SE_ID_OFFSET, HW_ID));
1009 #if (defined(__GFX10__) || defined(__GFX11__))
1010 unsigned wgp_id = __builtin_amdgcn_s_getreg(
1011 GETREG_IMMED(HW_ID_WGP_ID_SIZE - 1, HW_ID_WGP_ID_OFFSET, HW_ID));
1012 unsigned sa_id = __builtin_amdgcn_s_getreg(
1013 GETREG_IMMED(HW_ID_SA_ID_SIZE - 1, HW_ID_SA_ID_OFFSET, HW_ID));
1015 #if defined(__gfx940__)
1016 unsigned xcc_id = __builtin_amdgcn_s_getreg(
1017 GETREG_IMMED(XCC_ID_XCC_ID_SIZE - 1, XCC_ID_XCC_ID_OFFSET, XCC_ID));
1019 unsigned cu_id = __builtin_amdgcn_s_getreg(
1020 GETREG_IMMED(HW_ID_CU_ID_SIZE - 1, HW_ID_CU_ID_OFFSET, HW_ID));
1022 #if (defined(__GFX10__) || defined(__GFX11__))
1023 unsigned temp = se_id;
1024 temp = (temp << HW_ID_SA_ID_SIZE) | sa_id;
1025 temp = (temp << HW_ID_WGP_ID_SIZE) | wgp_id;
1028 #elif defined(__gfx940__)
1029 unsigned temp = xcc_id;
1030 temp = (temp << HW_ID_SE_ID_SIZE) | se_id;
1031 temp = (temp << HW_ID_CU_ID_SIZE) | cu_id;
1034 return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
1042#define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];
1043#define HIP_DYNAMIC_SHARED_ATTRIBUTE
1049static inline __device__
void* __hip_hc_memcpy(
void* dst,
const void* src,
size_t size) {
1050 auto dstPtr =
static_cast<unsigned char*
>(dst);
1051 auto srcPtr =
static_cast<const unsigned char*
>(src);
1053 while (size >= 4u) {
1054 dstPtr[0] = srcPtr[0];
1055 dstPtr[1] = srcPtr[1];
1056 dstPtr[2] = srcPtr[2];
1057 dstPtr[3] = srcPtr[3];
1065 dstPtr[2] = srcPtr[2];
1067 dstPtr[1] = srcPtr[1];
1069 dstPtr[0] = srcPtr[0];
1075static inline __device__
void* __hip_hc_memset(
void* dst,
unsigned char val,
size_t size) {
1076 auto dstPtr =
static_cast<unsigned char*
>(dst);
1078 while (size >= 4u) {
1098#ifndef __OPENMP_AMDGCN__
1099static inline __device__
void* memcpy(
void* dst,
const void* src,
size_t size) {
1100 return __hip_hc_memcpy(dst, src, size);
1103static inline __device__
void* memset(
void* ptr,
int val,
size_t size) {
1104 unsigned char val8 =
static_cast<unsigned char>(val);
1105 return __hip_hc_memset(ptr, val8, size);
Contains declarations for types and functions in device library. Uses int64_t and uint64_t instead of...
Definition amd_device_functions.h:228
Definition amd_device_functions.h:235
Definition amd_hip_vector_types.h:1623