Skip to content

Commit c19fa5b

Browse files
authored
[WebAssembly] narrow instructions use signed saturation (#201798)
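Fixes #201780. Per the WebAssembly specification (https://www.w3.org/TR/wasm-core-2/#-hrefop-narrowmathrmnarrowmathsfu_m-n-i), the unsigned narrow instructions interpret each input lane as a signed integer and saturate it to the unsigned range of the narrower lane type; that is, the saturation is signed, the truncation is unsigned.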
1 parent f04b271 commit c19fa5b

3 files changed

Lines changed: 101 additions & 33 deletions

File tree

llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1531,12 +1531,26 @@ multiclass SignedSaturatingTruncate<ValueType input, ValueType output,
15311531
defm : SignedSaturatingTruncate<v8i16, v16i8, NARROW_S_I8x16, -128, 127, 0xFF>;
15321532
defm : SignedSaturatingTruncate<v4i32, v8i16, NARROW_S_I16x8, -32768, 32767, 0xFFFF>;
15331533

1534+
// NOTE: the saturation is actually signed, while the truncation is unsigned; see
1535+
// https://www.w3.org/TR/wasm-core-2/#-hrefop-narrowmathrmnarrowmathsfu_m-n-i
15341536
multiclass UnsignedSaturatingTruncate<ValueType input, ValueType output,
1535-
Instruction narrow, int maxval> {
1537+
Instruction narrow, int maxval> {
15361538
def : Pat<
15371539
(output (wasm_narrow_u
1538-
(umin (input V128:$a), (splat_vector (i32 maxval))),
1539-
(umin (input V128:$b), (splat_vector (i32 maxval)))
1540+
(smin (smax (input V128:$a), (splat_vector (i32 0))),
1541+
(splat_vector (i32 maxval))),
1542+
(smin (smax (input V128:$b), (splat_vector (i32 0))),
1543+
(splat_vector (i32 maxval)))
1544+
)),
1545+
(narrow V128:$a, V128:$b)
1546+
>;
1547+
1548+
def : Pat<
1549+
(output (wasm_narrow_u
1550+
(smax (smin (input V128:$a), (splat_vector (i32 maxval))),
1551+
(splat_vector (i32 0))),
1552+
(smax (smin (input V128:$b), (splat_vector (i32 maxval))),
1553+
(splat_vector (i32 0)))
15401554
)),
15411555
(narrow V128:$a, V128:$b)
15421556
>;

llvm/test/CodeGen/WebAssembly/fpclamptosat_vec.ll

Lines changed: 14 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -484,7 +484,7 @@ entry:
484484
define <8 x i16> @utest_f16i16(<8 x half> %x) {
485485
; CHECK-LABEL: utest_f16i16:
486486
; CHECK: .functype utest_f16i16 (i32, i32, i32, i32, i32, i32, i32, i32) -> (v128)
487-
; CHECK-NEXT: .local f32, f32, f32, f32, f32
487+
; CHECK-NEXT: .local f32, f32, f32, f32, f32, v128
488488
; CHECK-NEXT: # %bb.0: # %entry
489489
; CHECK-NEXT: local.get 5
490490
; CHECK-NEXT: call __extendhfsf2
@@ -516,6 +516,9 @@ define <8 x i16> @utest_f16i16(<8 x half> %x) {
516516
; CHECK-NEXT: call __extendhfsf2
517517
; CHECK-NEXT: i32.trunc_sat_f32_u
518518
; CHECK-NEXT: i32x4.replace_lane 3
519+
; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535
520+
; CHECK-NEXT: local.tee 13
521+
; CHECK-NEXT: i32x4.min_u
519522
; CHECK-NEXT: local.get 9
520523
; CHECK-NEXT: i32.trunc_sat_f32_u
521524
; CHECK-NEXT: i32x4.splat
@@ -528,6 +531,8 @@ define <8 x i16> @utest_f16i16(<8 x half> %x) {
528531
; CHECK-NEXT: local.get 11
529532
; CHECK-NEXT: i32.trunc_sat_f32_u
530533
; CHECK-NEXT: i32x4.replace_lane 3
534+
; CHECK-NEXT: local.get 13
535+
; CHECK-NEXT: i32x4.min_u
531536
; CHECK-NEXT: i16x8.narrow_i32x4_u
532537
; CHECK-NEXT: # fallthrough-return
533538
entry:
@@ -541,7 +546,7 @@ entry:
541546
define <8 x i16> @ustest_f16i16(<8 x half> %x) {
542547
; CHECK-LABEL: ustest_f16i16:
543548
; CHECK: .functype ustest_f16i16 (i32, i32, i32, i32, i32, i32, i32, i32) -> (v128)
544-
; CHECK-NEXT: .local f32, f32, f32, f32, f32, v128, v128
549+
; CHECK-NEXT: .local f32, f32, f32, f32, f32
545550
; CHECK-NEXT: # %bb.0: # %entry
546551
; CHECK-NEXT: local.get 5
547552
; CHECK-NEXT: call __extendhfsf2
@@ -573,12 +578,6 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
573578
; CHECK-NEXT: call __extendhfsf2
574579
; CHECK-NEXT: i32.trunc_sat_f32_s
575580
; CHECK-NEXT: i32x4.replace_lane 3
576-
; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535
577-
; CHECK-NEXT: local.tee 13
578-
; CHECK-NEXT: i32x4.min_s
579-
; CHECK-NEXT: v128.const 0, 0, 0, 0
580-
; CHECK-NEXT: local.tee 14
581-
; CHECK-NEXT: i32x4.max_s
582581
; CHECK-NEXT: local.get 9
583582
; CHECK-NEXT: i32.trunc_sat_f32_s
584583
; CHECK-NEXT: i32x4.splat
@@ -591,10 +590,6 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) {
591590
; CHECK-NEXT: local.get 11
592591
; CHECK-NEXT: i32.trunc_sat_f32_s
593592
; CHECK-NEXT: i32x4.replace_lane 3
594-
; CHECK-NEXT: local.get 13
595-
; CHECK-NEXT: i32x4.min_s
596-
; CHECK-NEXT: local.get 14
597-
; CHECK-NEXT: i32x4.max_s
598593
; CHECK-NEXT: i16x8.narrow_i32x4_u
599594
; CHECK-NEXT: # fallthrough-return
600595
entry:
@@ -1850,7 +1845,7 @@ entry:
18501845
define <8 x i16> @utest_f16i16_mm(<8 x half> %x) {
18511846
; CHECK-LABEL: utest_f16i16_mm:
18521847
; CHECK: .functype utest_f16i16_mm (i32, i32, i32, i32, i32, i32, i32, i32) -> (v128)
1853-
; CHECK-NEXT: .local f32, f32, f32, f32, f32
1848+
; CHECK-NEXT: .local f32, f32, f32, f32, f32, v128
18541849
; CHECK-NEXT: # %bb.0: # %entry
18551850
; CHECK-NEXT: local.get 5
18561851
; CHECK-NEXT: call __extendhfsf2
@@ -1882,6 +1877,9 @@ define <8 x i16> @utest_f16i16_mm(<8 x half> %x) {
18821877
; CHECK-NEXT: call __extendhfsf2
18831878
; CHECK-NEXT: i32.trunc_sat_f32_u
18841879
; CHECK-NEXT: i32x4.replace_lane 3
1880+
; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535
1881+
; CHECK-NEXT: local.tee 13
1882+
; CHECK-NEXT: i32x4.min_u
18851883
; CHECK-NEXT: local.get 9
18861884
; CHECK-NEXT: i32.trunc_sat_f32_u
18871885
; CHECK-NEXT: i32x4.splat
@@ -1894,6 +1892,8 @@ define <8 x i16> @utest_f16i16_mm(<8 x half> %x) {
18941892
; CHECK-NEXT: local.get 11
18951893
; CHECK-NEXT: i32.trunc_sat_f32_u
18961894
; CHECK-NEXT: i32x4.replace_lane 3
1895+
; CHECK-NEXT: local.get 13
1896+
; CHECK-NEXT: i32x4.min_u
18971897
; CHECK-NEXT: i16x8.narrow_i32x4_u
18981898
; CHECK-NEXT: # fallthrough-return
18991899
entry:
@@ -1906,7 +1906,7 @@ entry:
19061906
define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
19071907
; CHECK-LABEL: ustest_f16i16_mm:
19081908
; CHECK: .functype ustest_f16i16_mm (i32, i32, i32, i32, i32, i32, i32, i32) -> (v128)
1909-
; CHECK-NEXT: .local f32, f32, f32, f32, f32, v128, v128
1909+
; CHECK-NEXT: .local f32, f32, f32, f32, f32
19101910
; CHECK-NEXT: # %bb.0: # %entry
19111911
; CHECK-NEXT: local.get 5
19121912
; CHECK-NEXT: call __extendhfsf2
@@ -1938,12 +1938,6 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
19381938
; CHECK-NEXT: call __extendhfsf2
19391939
; CHECK-NEXT: i32.trunc_sat_f32_s
19401940
; CHECK-NEXT: i32x4.replace_lane 3
1941-
; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535
1942-
; CHECK-NEXT: local.tee 13
1943-
; CHECK-NEXT: i32x4.min_s
1944-
; CHECK-NEXT: v128.const 0, 0, 0, 0
1945-
; CHECK-NEXT: local.tee 14
1946-
; CHECK-NEXT: i32x4.max_s
19471941
; CHECK-NEXT: local.get 9
19481942
; CHECK-NEXT: i32.trunc_sat_f32_s
19491943
; CHECK-NEXT: i32x4.splat
@@ -1956,10 +1950,6 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) {
19561950
; CHECK-NEXT: local.get 11
19571951
; CHECK-NEXT: i32.trunc_sat_f32_s
19581952
; CHECK-NEXT: i32x4.replace_lane 3
1959-
; CHECK-NEXT: local.get 13
1960-
; CHECK-NEXT: i32x4.min_s
1961-
; CHECK-NEXT: local.get 14
1962-
; CHECK-NEXT: i32x4.max_s
19631953
; CHECK-NEXT: i16x8.narrow_i32x4_u
19641954
; CHECK-NEXT: # fallthrough-return
19651955
entry:

llvm/test/CodeGen/WebAssembly/saturating-truncation.ll

Lines changed: 70 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -56,12 +56,19 @@ bb2:
5656
ret <8 x i16> %3
5757
}
5858

59-
define <16 x i8> @i16_unsigned(<8 x i16> %a, <8 x i16> %b) {
60-
; CHECK-LABEL: i16_unsigned:
61-
; CHECK: .functype i16_unsigned (v128, v128) -> (v128)
59+
; NOTE: unsigned narrow uses *signed* saturation, so the manual unsigned saturation cannot be optimized out.
60+
define <16 x i8> @i16_unsigned_sat_unsigned_truncate(<8 x i16> %a, <8 x i16> %b) {
61+
; CHECK-LABEL: i16_unsigned_sat_unsigned_truncate:
62+
; CHECK: .functype i16_unsigned_sat_unsigned_truncate (v128, v128) -> (v128)
63+
; CHECK-NEXT: .local v128
6264
; CHECK-NEXT: # %bb.0: # %bb2
6365
; CHECK-NEXT: local.get 0
66+
; CHECK-NEXT: v128.const 255, 255, 255, 255, 255, 255, 255, 255
67+
; CHECK-NEXT: local.tee 2
68+
; CHECK-NEXT: i16x8.min_u
6469
; CHECK-NEXT: local.get 1
70+
; CHECK-NEXT: local.get 2
71+
; CHECK-NEXT: i16x8.min_u
6572
; CHECK-NEXT: i8x16.narrow_i16x8_u
6673
; CHECK-NEXT: # fallthrough-return
6774
bb2:
@@ -71,12 +78,19 @@ bb2:
7178
ret <16 x i8> %2
7279
}
7380

74-
define <8 x i16> @i32_unsigned(<4 x i32> %a, <4 x i32> %b) {
75-
; CHECK-LABEL: i32_unsigned:
76-
; CHECK: .functype i32_unsigned (v128, v128) -> (v128)
81+
; NOTE: unsigned narrow uses *signed* saturation, so the manual unsigned saturation cannot be optimized out.
82+
define <8 x i16> @i32_unsigned_sat_unsigned_truncate(<4 x i32> %a, <4 x i32> %b) {
83+
; CHECK-LABEL: i32_unsigned_sat_unsigned_truncate:
84+
; CHECK: .functype i32_unsigned_sat_unsigned_truncate (v128, v128) -> (v128)
85+
; CHECK-NEXT: .local v128
7786
; CHECK-NEXT: # %bb.0: # %bb2
7887
; CHECK-NEXT: local.get 0
88+
; CHECK-NEXT: v128.const 65535, 65535, 65535, 65535
89+
; CHECK-NEXT: local.tee 2
90+
; CHECK-NEXT: i32x4.min_u
7991
; CHECK-NEXT: local.get 1
92+
; CHECK-NEXT: local.get 2
93+
; CHECK-NEXT: i32x4.min_u
8094
; CHECK-NEXT: i16x8.narrow_i32x4_u
8195
; CHECK-NEXT: # fallthrough-return
8296
bb2:
@@ -85,3 +99,53 @@ bb2:
8599
%2 = trunc nsw <8 x i32> %1 to <8 x i16>
86100
ret <8 x i16> %2
87101
}
102+
103+
; NOTE: narrow_i16x8_u uses *signed* saturation, so the manual unsigned saturation cannot be optimized out.
104+
define <16 x i8> @narrow_with_manual_unsigned_sat(<8 x i16> %a) {
105+
; CHECK-LABEL: narrow_with_manual_unsigned_sat:
106+
; CHECK: .functype narrow_with_manual_unsigned_sat (v128) -> (v128)
107+
; CHECK-NEXT: # %bb.0: # %start
108+
; CHECK-NEXT: local.get 0
109+
; CHECK-NEXT: v128.const 255, 255, 255, 255, 255, 255, 255, 255
110+
; CHECK-NEXT: i16x8.min_u
111+
; CHECK-NEXT: local.tee 0
112+
; CHECK-NEXT: local.get 0
113+
; CHECK-NEXT: i8x16.narrow_i16x8_u
114+
; CHECK-NEXT: # fallthrough-return
115+
start:
116+
%0 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a, <8 x i16> splat (i16 255))
117+
%_21 = tail call <16 x i8> @llvm.wasm.narrow.unsigned.v16i8.v8i16(<8 x i16> %0, <8 x i16> %0)
118+
ret <16 x i8> %_21
119+
}
120+
121+
define <16 x i8> @i16_signed_sat_unsigned_truncate(<8 x i16> %a, <8 x i16> %b) {
122+
; CHECK-LABEL: i16_signed_sat_unsigned_truncate:
123+
; CHECK: .functype i16_signed_sat_unsigned_truncate (v128, v128) -> (v128)
124+
; CHECK-NEXT: # %bb.0: # %bb2
125+
; CHECK-NEXT: local.get 0
126+
; CHECK-NEXT: local.get 1
127+
; CHECK-NEXT: i8x16.narrow_i16x8_u
128+
; CHECK-NEXT: # fallthrough-return
129+
bb2:
130+
%0 = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
131+
%1 = tail call <16 x i16> @llvm.smax.v16i16(<16 x i16> %0, <16 x i16> zeroinitializer)
132+
%2 = tail call <16 x i16> @llvm.smin.v16i16(<16 x i16> %1, <16 x i16> splat (i16 255))
133+
%3 = trunc nuw <16 x i16> %2 to <16 x i8>
134+
ret <16 x i8> %3
135+
}
136+
137+
define <8 x i16> @i32_signed_sat_unsigned_truncate(<4 x i32> %a, <4 x i32> %b) {
138+
; CHECK-LABEL: i32_signed_sat_unsigned_truncate:
139+
; CHECK: .functype i32_signed_sat_unsigned_truncate (v128, v128) -> (v128)
140+
; CHECK-NEXT: # %bb.0: # %bb2
141+
; CHECK-NEXT: local.get 0
142+
; CHECK-NEXT: local.get 1
143+
; CHECK-NEXT: i16x8.narrow_i32x4_u
144+
; CHECK-NEXT: # fallthrough-return
145+
bb2:
146+
%0 = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
147+
%1 = tail call <8 x i32> @llvm.smin.v8i32(<8 x i32> %0, <8 x i32> splat (i32 65535))
148+
%2 = tail call <8 x i32> @llvm.smax.v8i32(<8 x i32> %1, <8 x i32> zeroinitializer)
149+
%3 = trunc nuw <8 x i32> %2 to <8 x i16>
150+
ret <8 x i16> %3
151+
}

0 commit comments

Comments
 (0)