Skip to content

Commit 4fdb108

Browse files
authored
[AMDGPU] Be more careful about using expandDivRem24 (#201186)
expandDivRem24 uses v_rcp_f32, which can have an error of one ulp. This can cause an incorrect calculation of Y/X when abs(Y) > 0x800000. Thus, do not use expandDivRem24 for unsigned values that need all 24 bits; signed 24-bit values are unaffected because abs(Y) is at most 0x800000. --------- Signed-off-by: John Lu <[email protected]>
1 parent 1c0b58e commit 4fdb108

10 files changed

Lines changed: 1438 additions & 649 deletions

File tree

llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1062,7 +1062,18 @@ Value *AMDGPUCodeGenPrepareImpl::expandDivRem24(IRBuilder<> &Builder,
10621062
Value *Den, bool IsDiv,
10631063
bool IsSigned) const {
10641064
unsigned DivBits = getDivNumBits(I, Num, Den, 24, IsSigned);
1065-
if (DivBits > 24)
1065+
1066+
// v_rcp_f32(float(X)) can have an error of 1 ulp.
1067+
// This can cause expandDivRem24Impl to sometimes calculate Y/X incorrectly
1068+
// when abs(Y)>0x800000.
1069+
// For example,
1070+
// (0xbf2758/0xbf2759) erroneously produces 1 instead of 0.
1071+
// (0xe3170d/0x000c32) erroneously produces 4767 instead of 4766.
1072+
//
1073+
// Note that for DivBits==24 && IsSigned, Y is in the range
1074+
// [-0x800000:0x7FFFFF]. abs(Y) is at most
1075+
// 0x800000 so it cannot hit this issue.
1076+
if (DivBits > (IsSigned ? 24 : 23))
10661077
return nullptr;
10671078
return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned);
10681079
}
@@ -1353,7 +1364,17 @@ Value *AMDGPUCodeGenPrepareImpl::shrinkDivRem64(IRBuilder<> &Builder,
13531364
return nullptr;
13541365

13551366
Value *Narrowed = nullptr;
1356-
if (NumDivBits <= 24) {
1367+
// v_rcp_f32(float(X)) can have an error of 1 ulp.
1368+
// This can cause expandDivRem24Impl to sometimes calculate Y/X incorrectly
1369+
// when abs(Y)>0x800000.
1370+
// For example,
1371+
// (0xbf2758/0xbf2759) erroneously produces 1 instead of 0.
1372+
// (0xe3170d/0x000c32) erroneously produces 4767 instead of 4766.
1373+
//
1374+
// Note that for NumDivBits==24 && IsSigned, Y is in the range
1375+
// [-0x800000:0x7FFFFF]. abs(Y) is at most
1376+
// 0x800000 so it cannot hit this issue.
1377+
if (NumDivBits <= (IsSigned ? 24 : 23)) {
13571378
Narrowed = expandDivRem24Impl(Builder, I, Num, Den, NumDivBits,
13581379
IsDiv, IsSigned);
13591380
} else if (NumDivBits <= 32) {

llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll

Lines changed: 55 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -431,17 +431,25 @@ define i32 @v_udiv_i32_24bit(i32 %num, i32 %den) {
431431
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
432432
; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0
433433
; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v1
434-
; CGP-NEXT: v_cvt_f32_u32_e32 v0, v0
435-
; CGP-NEXT: v_cvt_f32_u32_e32 v1, v1
436-
; CGP-NEXT: v_rcp_f32_e32 v2, v1
437-
; CGP-NEXT: v_mul_f32_e32 v2, v0, v2
438-
; CGP-NEXT: v_trunc_f32_e32 v2, v2
439-
; CGP-NEXT: v_fma_f32 v0, -v2, v1, v0
434+
; CGP-NEXT: v_cvt_f32_u32_e32 v2, v1
435+
; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
436+
; CGP-NEXT: v_rcp_f32_e32 v2, v2
437+
; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
440438
; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2
441-
; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v0|, v1
442-
; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
443-
; CGP-NEXT: v_add_i32_e32 v0, vcc, v2, v0
444-
; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0
439+
; CGP-NEXT: v_mul_lo_u32 v3, v3, v2
440+
; CGP-NEXT: v_mul_hi_u32 v3, v2, v3
441+
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
442+
; CGP-NEXT: v_mul_hi_u32 v2, v0, v2
443+
; CGP-NEXT: v_mul_lo_u32 v3, v2, v1
444+
; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2
445+
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
446+
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
447+
; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
448+
; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v0, v1
449+
; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
450+
; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v2
451+
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
452+
; CGP-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
445453
; CGP-NEXT: s_setpc_b64 s[30:31]
446454
%num.mask = and i32 %num, 16777215
447455
%den.mask = and i32 %den, 16777215
@@ -504,28 +512,44 @@ define <2 x i32> @v_udiv_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
504512
; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v1
505513
; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2
506514
; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v3
507-
; CGP-NEXT: v_cvt_f32_u32_e32 v0, v0
508-
; CGP-NEXT: v_cvt_f32_u32_e32 v2, v2
509-
; CGP-NEXT: v_cvt_f32_u32_e32 v1, v1
510-
; CGP-NEXT: v_cvt_f32_u32_e32 v3, v3
511-
; CGP-NEXT: v_rcp_f32_e32 v4, v2
512-
; CGP-NEXT: v_rcp_f32_e32 v5, v3
513-
; CGP-NEXT: v_mul_f32_e32 v4, v0, v4
514-
; CGP-NEXT: v_mul_f32_e32 v5, v1, v5
515-
; CGP-NEXT: v_trunc_f32_e32 v4, v4
516-
; CGP-NEXT: v_trunc_f32_e32 v5, v5
517-
; CGP-NEXT: v_fma_f32 v0, -v4, v2, v0
515+
; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2
516+
; CGP-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
517+
; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3
518+
; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
519+
; CGP-NEXT: v_rcp_f32_e32 v4, v4
520+
; CGP-NEXT: v_rcp_f32_e32 v6, v6
521+
; CGP-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
522+
; CGP-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
518523
; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4
519-
; CGP-NEXT: v_fma_f32 v1, -v5, v3, v1
520-
; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5
521-
; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v0|, v2
522-
; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
523-
; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, v3
524-
; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
525-
; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0
526-
; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1
527-
; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0
528-
; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v1
524+
; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6
525+
; CGP-NEXT: v_mul_lo_u32 v5, v5, v4
526+
; CGP-NEXT: v_mul_lo_u32 v7, v7, v6
527+
; CGP-NEXT: v_mul_hi_u32 v5, v4, v5
528+
; CGP-NEXT: v_mul_hi_u32 v7, v6, v7
529+
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
530+
; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7
531+
; CGP-NEXT: v_mul_hi_u32 v4, v0, v4
532+
; CGP-NEXT: v_mul_hi_u32 v5, v1, v5
533+
; CGP-NEXT: v_mul_lo_u32 v6, v4, v2
534+
; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v4
535+
; CGP-NEXT: v_mul_lo_u32 v8, v5, v3
536+
; CGP-NEXT: v_add_i32_e32 v9, vcc, 1, v5
537+
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
538+
; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v8
539+
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
540+
; CGP-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
541+
; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v0, v2
542+
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v3
543+
; CGP-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[4:5]
544+
; CGP-NEXT: v_sub_i32_e64 v7, s[6:7], v1, v3
545+
; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
546+
; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v4
547+
; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5]
548+
; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v5
549+
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
550+
; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc
551+
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
552+
; CGP-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc
529553
; CGP-NEXT: s_setpc_b64 s[30:31]
530554
%num.mask = and <2 x i32> %num, <i32 16777215, i32 16777215>
531555
%den.mask = and <2 x i32> %den, <i32 16777215, i32 16777215>

llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll

Lines changed: 61 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1919,17 +1919,25 @@ define i64 @v_udiv_i64_24bit(i64 %num, i64 %den) {
19191919
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19201920
; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0
19211921
; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v2
1922-
; CGP-NEXT: v_cvt_f32_u32_e32 v0, v0
1923-
; CGP-NEXT: v_cvt_f32_u32_e32 v1, v1
1924-
; CGP-NEXT: v_rcp_f32_e32 v2, v1
1925-
; CGP-NEXT: v_mul_f32_e32 v2, v0, v2
1926-
; CGP-NEXT: v_trunc_f32_e32 v2, v2
1927-
; CGP-NEXT: v_mad_f32 v0, -v2, v1, v0
1922+
; CGP-NEXT: v_cvt_f32_u32_e32 v2, v1
1923+
; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
1924+
; CGP-NEXT: v_rcp_f32_e32 v2, v2
1925+
; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
19281926
; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2
1929-
; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v0|, v1
1930-
; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
1931-
; CGP-NEXT: v_add_i32_e32 v0, vcc, v2, v0
1932-
; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0
1927+
; CGP-NEXT: v_mul_lo_u32 v3, v3, v2
1928+
; CGP-NEXT: v_mul_hi_u32 v3, v2, v3
1929+
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
1930+
; CGP-NEXT: v_mul_hi_u32 v2, v0, v2
1931+
; CGP-NEXT: v_mul_lo_u32 v3, v2, v1
1932+
; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2
1933+
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
1934+
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
1935+
; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
1936+
; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v0, v1
1937+
; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
1938+
; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v2
1939+
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
1940+
; CGP-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
19331941
; CGP-NEXT: v_mov_b32_e32 v1, 0
19341942
; CGP-NEXT: s_setpc_b64 s[30:31]
19351943
%num.mask = and i64 %num, 16777215
@@ -2173,31 +2181,51 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
21732181
; CGP: ; %bb.0:
21742182
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
21752183
; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0
2176-
; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v4
2177-
; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2
2184+
; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v2
2185+
; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v4
21782186
; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v6
2179-
; CGP-NEXT: v_cvt_f32_u32_e32 v0, v0
2180-
; CGP-NEXT: v_cvt_f32_u32_e32 v1, v1
2181-
; CGP-NEXT: v_cvt_f32_u32_e32 v2, v2
2182-
; CGP-NEXT: v_cvt_f32_u32_e32 v3, v3
2183-
; CGP-NEXT: v_rcp_f32_e32 v4, v1
2184-
; CGP-NEXT: v_rcp_f32_e32 v5, v3
2185-
; CGP-NEXT: v_mul_f32_e32 v4, v0, v4
2186-
; CGP-NEXT: v_mul_f32_e32 v5, v2, v5
2187-
; CGP-NEXT: v_trunc_f32_e32 v4, v4
2188-
; CGP-NEXT: v_trunc_f32_e32 v5, v5
2189-
; CGP-NEXT: v_mad_f32 v0, -v4, v1, v0
2187+
; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2
2188+
; CGP-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
2189+
; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3
2190+
; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
2191+
; CGP-NEXT: v_rcp_f32_e32 v4, v4
2192+
; CGP-NEXT: v_rcp_f32_e32 v6, v6
2193+
; CGP-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
2194+
; CGP-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
21902195
; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4
2191-
; CGP-NEXT: v_mad_f32 v2, -v5, v3, v2
2192-
; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5
2193-
; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v0|, v1
2194-
; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
2195-
; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, v3
2196-
; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
2197-
; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0
2198-
; CGP-NEXT: v_add_i32_e32 v1, vcc, v5, v1
2199-
; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0
2200-
; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v1
2196+
; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6
2197+
; CGP-NEXT: v_mul_lo_u32 v5, v5, v4
2198+
; CGP-NEXT: v_mul_lo_u32 v7, v7, v6
2199+
; CGP-NEXT: v_mul_hi_u32 v5, v4, v5
2200+
; CGP-NEXT: v_mul_hi_u32 v7, v6, v7
2201+
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
2202+
; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7
2203+
; CGP-NEXT: v_mul_hi_u32 v6, v0, v4
2204+
; CGP-NEXT: v_mul_lo_u32 v4, 0, v4
2205+
; CGP-NEXT: v_mul_hi_u32 v7, v1, v5
2206+
; CGP-NEXT: v_mul_lo_u32 v5, 0, v5
2207+
; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4
2208+
; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
2209+
; CGP-NEXT: v_mul_lo_u32 v6, v4, v2
2210+
; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v4
2211+
; CGP-NEXT: v_mul_lo_u32 v8, v5, v3
2212+
; CGP-NEXT: v_add_i32_e32 v9, vcc, 1, v5
2213+
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
2214+
; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v8
2215+
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
2216+
; CGP-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
2217+
; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v0, v2
2218+
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v3
2219+
; CGP-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[4:5]
2220+
; CGP-NEXT: v_sub_i32_e64 v7, s[6:7], v1, v3
2221+
; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
2222+
; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v4
2223+
; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5]
2224+
; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v5
2225+
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
2226+
; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc
2227+
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
2228+
; CGP-NEXT: v_cndmask_b32_e32 v2, v5, v7, vcc
22012229
; CGP-NEXT: v_mov_b32_e32 v1, 0
22022230
; CGP-NEXT: v_mov_b32_e32 v3, 0
22032231
; CGP-NEXT: s_setpc_b64 s[30:31]

llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll

Lines changed: 51 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -409,19 +409,23 @@ define i32 @v_urem_i32_24bit(i32 %num, i32 %den) {
409409
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
410410
; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0
411411
; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v1
412-
; CGP-NEXT: v_cvt_f32_u32_e32 v2, v0
413-
; CGP-NEXT: v_cvt_f32_u32_e32 v3, v1
414-
; CGP-NEXT: v_rcp_f32_e32 v4, v3
415-
; CGP-NEXT: v_mul_f32_e32 v4, v2, v4
416-
; CGP-NEXT: v_trunc_f32_e32 v4, v4
417-
; CGP-NEXT: v_fma_f32 v2, -v4, v3, v2
418-
; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4
419-
; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, v3
420-
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5]
421-
; CGP-NEXT: v_add_i32_e32 v2, vcc, v4, v2
422-
; CGP-NEXT: v_mul_lo_u32 v1, v2, v1
423-
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
424-
; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0
412+
; CGP-NEXT: v_cvt_f32_u32_e32 v2, v1
413+
; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
414+
; CGP-NEXT: v_rcp_f32_e32 v2, v2
415+
; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
416+
; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2
417+
; CGP-NEXT: v_mul_lo_u32 v3, v3, v2
418+
; CGP-NEXT: v_mul_hi_u32 v3, v2, v3
419+
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
420+
; CGP-NEXT: v_mul_hi_u32 v2, v0, v2
421+
; CGP-NEXT: v_mul_lo_u32 v2, v2, v1
422+
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
423+
; CGP-NEXT: v_sub_i32_e32 v2, vcc, v0, v1
424+
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
425+
; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
426+
; CGP-NEXT: v_sub_i32_e32 v2, vcc, v0, v1
427+
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
428+
; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
425429
; CGP-NEXT: s_setpc_b64 s[30:31]
426430
%num.mask = and i32 %num, 16777215
427431
%den.mask = and i32 %den, 16777215
@@ -480,32 +484,40 @@ define <2 x i32> @v_urem_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
480484
; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v1
481485
; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2
482486
; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v3
483-
; CGP-NEXT: v_cvt_f32_u32_e32 v4, v0
484-
; CGP-NEXT: v_cvt_f32_u32_e32 v5, v2
485-
; CGP-NEXT: v_cvt_f32_u32_e32 v6, v1
486-
; CGP-NEXT: v_cvt_f32_u32_e32 v7, v3
487-
; CGP-NEXT: v_rcp_f32_e32 v8, v5
488-
; CGP-NEXT: v_rcp_f32_e32 v9, v7
489-
; CGP-NEXT: v_mul_f32_e32 v8, v4, v8
490-
; CGP-NEXT: v_mul_f32_e32 v9, v6, v9
491-
; CGP-NEXT: v_trunc_f32_e32 v8, v8
492-
; CGP-NEXT: v_trunc_f32_e32 v9, v9
493-
; CGP-NEXT: v_fma_f32 v4, -v8, v5, v4
494-
; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8
495-
; CGP-NEXT: v_fma_f32 v6, -v9, v7, v6
496-
; CGP-NEXT: v_cvt_u32_f32_e32 v9, v9
497-
; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, v5
498-
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
499-
; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v6|, v7
500-
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5]
501-
; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4
502-
; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5
503-
; CGP-NEXT: v_mul_lo_u32 v2, v4, v2
504-
; CGP-NEXT: v_mul_lo_u32 v3, v5, v3
505-
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
506-
; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v3
507-
; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0
508-
; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v1
487+
; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2
488+
; CGP-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
489+
; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3
490+
; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
491+
; CGP-NEXT: v_rcp_f32_e32 v4, v4
492+
; CGP-NEXT: v_rcp_f32_e32 v6, v6
493+
; CGP-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
494+
; CGP-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
495+
; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4
496+
; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6
497+
; CGP-NEXT: v_mul_lo_u32 v5, v5, v4
498+
; CGP-NEXT: v_mul_lo_u32 v7, v7, v6
499+
; CGP-NEXT: v_mul_hi_u32 v5, v4, v5
500+
; CGP-NEXT: v_mul_hi_u32 v7, v6, v7
501+
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
502+
; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7
503+
; CGP-NEXT: v_mul_hi_u32 v4, v0, v4
504+
; CGP-NEXT: v_mul_hi_u32 v5, v1, v5
505+
; CGP-NEXT: v_mul_lo_u32 v4, v4, v2
506+
; CGP-NEXT: v_mul_lo_u32 v5, v5, v3
507+
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
508+
; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v5
509+
; CGP-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
510+
; CGP-NEXT: v_sub_i32_e32 v5, vcc, v1, v3
511+
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
512+
; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
513+
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
514+
; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
515+
; CGP-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
516+
; CGP-NEXT: v_sub_i32_e32 v5, vcc, v1, v3
517+
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
518+
; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
519+
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
520+
; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
509521
; CGP-NEXT: s_setpc_b64 s[30:31]
510522
%num.mask = and <2 x i32> %num, <i32 16777215, i32 16777215>
511523
%den.mask = and <2 x i32> %den, <i32 16777215, i32 16777215>

0 commit comments

Comments
 (0)