Skip to content

Commit 1a9fe17

Browse files
authored
libclc: Update remquo (#187998)
This was failing in the float case without -cl-denorms-are-zero and failing for double. This now passes in all cases. This was originally ported from the ROCm device libs in 8db45e4. This is mostly a port of more recent changes, with a few modifications:
- Templatification, which almost but doesn't quite enable vectorization yet due to the outer branch and loop.
- Merging of the 3 types into one shared code path, instead of duplicating per type with 3 different functions implemented together. There are only some slight differences for the half case, which mostly evaluates as float.
- Splitting out of the is_odd tracking, instead of deriving it from the accumulated quotient. This costs an extra register, but saves several instructions. This also enables automatic elimination of all of the quo output handling when this code is reused for remainder. I'm guessing this would be unnecessary if SimplifyDemandedBits handled phis.
- Removal of the slow FMA path. I don't see how this would ever be faster with the number of instructions replacing it. This is really a problem for the compiler to solve anyway.
1 parent d6373b4 commit 1a9fe17

File tree

5 files changed

+316
-287
lines changed

5 files changed

+316
-287
lines changed

libclc/clc/include/clc/math/remquo_decl.inc

Lines changed: 22 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -6,19 +6,29 @@
66
//
77
//===----------------------------------------------------------------------===//
88

9-
_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x,
10-
__CLC_GENTYPE y,
11-
private __CLC_INTN *q);
9+
// Aggregate result type for the current __CLC_GENTYPE; the type name is formed
// by pasting __CLC_GENTYPE onto the `__clc_remquo_ret_` prefix.
// NOTE(review): `rem` / `quo` presumably hold the remainder and quotient
// produced by remquo -- confirm against the implementation.
typedef struct __CLC_XCONCAT(__clc_remquo_ret_, __CLC_GENTYPE) {
10+
__CLC_GENTYPE rem;
11+
__CLC_INTN quo;
12+
} __CLC_XCONCAT(__clc_remquo_ret_, __CLC_GENTYPE);
1213

13-
_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x,
14-
__CLC_GENTYPE y,
15-
global __CLC_INTN *q);
14+
#define __CLC_REMQUO_RET_GENTYPE __CLC_XCONCAT(__clc_remquo_ret_, __CLC_GENTYPE)
1615

17-
_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x,
18-
__CLC_GENTYPE y,
19-
local __CLC_INTN *q);
16+
_CLC_OVERLOAD _CLC_DECL __CLC_REMQUO_RET_GENTYPE
17+
__clc_remquo_stret(__CLC_GENTYPE x, __CLC_GENTYPE y);
18+
19+
_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_remquo(__CLC_GENTYPE x,
20+
__CLC_GENTYPE y,
21+
private __CLC_INTN *q);
22+
23+
_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_remquo(__CLC_GENTYPE x,
24+
__CLC_GENTYPE y,
25+
global __CLC_INTN *q);
26+
27+
_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_remquo(__CLC_GENTYPE x,
28+
__CLC_GENTYPE y,
29+
local __CLC_INTN *q);
2030
#if _CLC_GENERIC_AS_SUPPORTED
21-
_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x,
22-
__CLC_GENTYPE y,
23-
generic __CLC_INTN *q);
31+
_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_remquo(__CLC_GENTYPE x,
32+
__CLC_GENTYPE y,
33+
generic __CLC_INTN *q);
2434
#endif
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include "clc/utils.h"
10+
11+
#ifndef __CLC_IMPL_FUNCTION
12+
#define __CLC_IMPL_FUNCTION __CLC_FUNCTION
13+
#endif
14+
15+
#ifndef __CLC_RET_SCALAR_TYPE
16+
#define __CLC_RET_SCALAR_TYPE __CLC_SCALAR_GENTYPE
17+
#endif
18+
19+
#ifndef __CLC_ARG1_SCALAR_TYPE
20+
#define __CLC_ARG1_SCALAR_TYPE __CLC_SCALAR_GENTYPE
21+
#endif
22+
23+
#ifndef __CLC_ARG2_SCALAR_TYPE
24+
#define __CLC_ARG2_SCALAR_TYPE __CLC_SCALAR_GENTYPE
25+
#endif
26+
27+
#ifndef __CLC_OUT_ARG3_SCALAR_TYPE
28+
#define __CLC_OUT_ARG3_SCALAR_TYPE __CLC_SCALAR_GENTYPE
29+
#endif
30+
31+
#define __CLC_RET_TYPE __CLC_XCONCAT(__CLC_RET_SCALAR_TYPE, __CLC_VECSIZE)
32+
#define __CLC_ARG1_TYPE __CLC_XCONCAT(__CLC_ARG1_SCALAR_TYPE, __CLC_VECSIZE)
33+
#define __CLC_ARG2_TYPE __CLC_XCONCAT(__CLC_ARG2_SCALAR_TYPE, __CLC_VECSIZE)
34+
#define __CLC_OUT_ARG3_TYPE \
35+
__CLC_XCONCAT(__CLC_OUT_ARG3_SCALAR_TYPE, __CLC_VECSIZE)
36+
37+
#ifndef __CLC_OUT_ARG3_ADDRESS_SPACE
38+
#define __CLC_OUT_ARG3_ADDRESS_SPACE __private
39+
#endif
40+
41+
#if __CLC_VECSIZE_OR_1 >= 2
42+
43+
// Vector overload of __CLC_FUNCTION implemented by scalarization.
//
// Each element of x and y is passed to __CLC_IMPL_FUNCTION together with a
// pointer to the matching element of a temporary; the per-element return
// values and out-values are collected in unions that overlay a vector with an
// array of __CLC_VECSIZE_OR_1 scalars.
//
// x, y: vector inputs.
// z:    pointer (in __CLC_OUT_ARG3_ADDRESS_SPACE) that receives the vector of
//       per-element out-values.
// Returns the vector of per-element return values.
_CLC_OVERLOAD _CLC_DEF __CLC_RET_TYPE
44+
__CLC_FUNCTION(__CLC_ARG1_TYPE x, __CLC_ARG2_TYPE y,
45+
__CLC_OUT_ARG3_ADDRESS_SPACE __CLC_OUT_ARG3_TYPE *z) {
46+
union {
47+
__CLC_ARG1_TYPE vec;
48+
__CLC_ARG1_SCALAR_TYPE arr[__CLC_VECSIZE_OR_1];
49+
} u_x;
50+
51+
union {
52+
__CLC_ARG2_TYPE vec;
53+
__CLC_ARG2_SCALAR_TYPE arr[__CLC_VECSIZE_OR_1];
54+
} u_y;
55+
56+
union {
57+
__CLC_RET_TYPE vec;
58+
__CLC_RET_SCALAR_TYPE arr[__CLC_VECSIZE_OR_1];
59+
} u_result0;
60+
61+
union {
62+
__CLC_OUT_ARG3_TYPE vec;
63+
__CLC_OUT_ARG3_SCALAR_TYPE arr[__CLC_VECSIZE_OR_1];
64+
} u_result1;
65+
66+
// Load the vector arguments so their elements can be read through arr[].
u_x.vec = x;
67+
u_y.vec = y;
68+
// Invoke the scalar implementation once per element.
for (int i = 0; i < __CLC_VECSIZE_OR_1; ++i) {
69+
u_result0.arr[i] =
70+
__CLC_IMPL_FUNCTION(u_x.arr[i], u_y.arr[i], &u_result1.arr[i]);
71+
}
72+
73+
// Store the gathered out-values through z, then return the gathered results.
*z = u_result1.vec;
74+
return u_result0.vec;
75+
}
76+
77+
#endif // __CLC_VECSIZE_OR_1 >= 2
78+
79+
#undef __CLC_RET_TYPE
80+
#undef __CLC_ARG1_TYPE
81+
#undef __CLC_ARG2_TYPE
82+
#undef __CLC_OUT_ARG3_TYPE

libclc/clc/lib/generic/math/clc_remquo.cl

Lines changed: 41 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -6,32 +6,58 @@
66
//
77
//===----------------------------------------------------------------------===//
88

9+
#include "clc/math/clc_remquo.h"
10+
911
#include "clc/clc_convert.h"
12+
#include "clc/float/definitions.h"
1013
#include "clc/integer/clc_clz.h"
11-
#include "clc/internal/clc.h"
12-
#include "clc/math/clc_floor.h"
14+
#include "clc/math/clc_copysign.h"
15+
#include "clc/math/clc_fabs.h"
1316
#include "clc/math/clc_flush_if_daz.h"
1417
#include "clc/math/clc_fma.h"
18+
#include "clc/math/clc_frexp.h"
1519
#include "clc/math/clc_ldexp.h"
20+
#include "clc/math/clc_mad.h"
21+
#include "clc/math/clc_recip_fast.h"
22+
#include "clc/math/clc_rint.h"
1623
#include "clc/math/clc_subnormal_config.h"
1724
#include "clc/math/clc_trunc.h"
1825
#include "clc/math/math.h"
19-
#include "clc/shared/clc_max.h"
26+
#include "clc/relational/clc_isfinite.h"
27+
#include "clc/relational/clc_isnan.h"
28+
#include "clc/relational/clc_signbit.h"
29+
30+
#define __CLC_FUNCTION __clc_remquo_stret
31+
#define __CLC_BODY "clc_remquo_stret.inc"
32+
#include "clc/math/gentype.inc"
33+
#undef __CLC_FUNCTION
34+
35+
#define __CLC_FUNCTION __clc_remquo
36+
#define __CLC_BODY "clc_remquo.inc"
37+
#include "clc/math/gentype.inc"
2038

21-
#define __CLC_ADDRESS_SPACE private
22-
#include "clc_remquo.inc"
23-
#undef __CLC_ADDRESS_SPACE
39+
#define __CLC_OUT_ARG3_SCALAR_TYPE int
40+
#define __CLC_OUT_ARG3_ADDRESS_SPACE __private
41+
#define __CLC_BODY "clc/shared/binary_with_out_arg_scalarize.inc"
42+
#include "clc/math/gentype.inc"
43+
#undef __CLC_OUT_ARG3_ADDRESS_SPACE
2444

25-
#define __CLC_ADDRESS_SPACE global
26-
#include "clc_remquo.inc"
27-
#undef __CLC_ADDRESS_SPACE
45+
#define __CLC_OUT_ARG3_SCALAR_TYPE int
46+
#define __CLC_OUT_ARG3_ADDRESS_SPACE __local
47+
#define __CLC_BODY "clc/shared/binary_with_out_arg_scalarize.inc"
48+
#include "clc/math/gentype.inc"
49+
#undef __CLC_OUT_ARG3_ADDRESS_SPACE
2850

29-
#define __CLC_ADDRESS_SPACE local
30-
#include "clc_remquo.inc"
31-
#undef __CLC_ADDRESS_SPACE
51+
#define __CLC_OUT_ARG3_SCALAR_TYPE int
52+
#define __CLC_OUT_ARG3_ADDRESS_SPACE __global
53+
#define __CLC_BODY "clc/shared/binary_with_out_arg_scalarize.inc"
54+
#include "clc/math/gentype.inc"
55+
#undef __CLC_OUT_ARG3_ADDRESS_SPACE
3256

3357
#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
34-
#define __CLC_ADDRESS_SPACE generic
35-
#include "clc_remquo.inc"
36-
#undef __CLC_ADDRESS_SPACE
58+
#define __CLC_OUT_ARG3_SCALAR_TYPE int
59+
#define __CLC_OUT_ARG3_ADDRESS_SPACE
60+
#define __CLC_BODY "clc/shared/binary_with_out_arg_scalarize.inc"
61+
#include "clc/math/gentype.inc"
62+
#undef __CLC_OUT_ARG3_ADDRESS_SPACE
3763
#endif

0 commit comments

Comments
 (0)