[APFloat] Fix truncation of certain subnormal numbers

danilaml · danilaml · commit ed6c309d4bf6 · 2022-06-08T21:54:35.000+03:00
Certain subnormals would be incorrectly rounded away from zero when truncated to a narrower floating-point format. Fixes #55838. Differential Revision: https://reviews.llvm.org/D127140
diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp
@@ -2213,15 +2213,22 @@ IEEEFloat::opStatus IEEEFloat::convert(const fltSemantics &toSemantics,
   // when truncating from PowerPC double-double to double format), the
   // right shift could lose result mantissa bits.  Adjust exponent instead
   // of performing excessive shift.
+  // Also do a similar trick in case shifting a denormal would produce a zero
+  // significand, as this case isn't handled correctly by normalize.
   if (shift < 0 && isFiniteNonZero()) {
-    int exponentChange = significandMSB() + 1 - fromSemantics.precision;
+    int omsb = significandMSB() + 1;
+    int exponentChange = omsb - fromSemantics.precision;
     if (exponent + exponentChange < toSemantics.minExponent)
       exponentChange = toSemantics.minExponent - exponent;
     if (exponentChange < shift)
       exponentChange = shift;
     if (exponentChange < 0) {
       shift -= exponentChange;
       exponent += exponentChange;
+    } else if (omsb <= -shift) {
+      exponentChange = omsb + shift - 1; // leave at least one bit set
+      shift -= exponentChange;
+      exponent += exponentChange;
     }
   }
 
diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/cast.ll b/llvm/test/Transforms/InstSimplify/ConstProp/cast.ll
@@ -79,21 +79,17 @@ define float @trunc_denorm_lost_fraction0() {
   ret float %b
 }
 
-; FIXME: This should be 0.0.
-
 define float @trunc_denorm_lost_fraction1() {
 ; CHECK-LABEL: @trunc_denorm_lost_fraction1(
-; CHECK-NEXT:    ret float 0x36A0000000000000
+; CHECK-NEXT:    ret float 0.000000e+00
 ;
   %b = fptrunc double 0x0000000010000001 to float
   ret float %b
 }
 
-; FIXME: This should be 0.0.
-
 define float @trunc_denorm_lost_fraction2() {
 ; CHECK-LABEL: @trunc_denorm_lost_fraction2(
-; CHECK-NEXT:    ret float 0x36A0000000000000
+; CHECK-NEXT:    ret float 0.000000e+00
 ;
   %b = fptrunc double 0x000000001fffffff to float
   ret float %b
@@ -107,11 +103,9 @@ define float @trunc_denorm_lost_fraction3() {
   ret float %b
 }
 
-; FIXME: This should be -0.0.
-
 define float @trunc_denorm_lost_fraction4() {
 ; CHECK-LABEL: @trunc_denorm_lost_fraction4(
-; CHECK-NEXT:    ret float 0xB6A0000000000000
+; CHECK-NEXT:    ret float -0.000000e+00
 ;
   %b = fptrunc double 0x8000000010000001 to float
   ret float %b
diff --git a/llvm/unittests/ADT/APFloatTest.cpp b/llvm/unittests/ADT/APFloatTest.cpp
@@ -1859,6 +1859,48 @@ TEST(APFloatTest, convert) {
   EXPECT_EQ(0x7fc00000, test.bitcastToAPInt());
   EXPECT_TRUE(losesInfo);
   EXPECT_EQ(status, APFloat::opOK);
+
+  // Test that subnormals are handled correctly in double to float conversion
+  test = APFloat(APFloat::IEEEdouble(), "0x0.0000010000000p-1022");
+  test.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, &losesInfo);
+  EXPECT_EQ(0.0f, test.convertToFloat());
+  EXPECT_TRUE(losesInfo);
+
+  test = APFloat(APFloat::IEEEdouble(), "0x0.0000010000001p-1022");
+  test.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, &losesInfo);
+  EXPECT_EQ(0.0f, test.convertToFloat());
+  EXPECT_TRUE(losesInfo);
+
+  test = APFloat(APFloat::IEEEdouble(), "-0x0.0000010000001p-1022");
+  test.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, &losesInfo);
+  EXPECT_EQ(0.0f, test.convertToFloat());
+  EXPECT_TRUE(losesInfo);
+
+  test = APFloat(APFloat::IEEEdouble(), "0x0.0000020000000p-1022");
+  test.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, &losesInfo);
+  EXPECT_EQ(0.0f, test.convertToFloat());
+  EXPECT_TRUE(losesInfo);
+
+  test = APFloat(APFloat::IEEEdouble(), "0x0.0000020000001p-1022");
+  test.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, &losesInfo);
+  EXPECT_EQ(0.0f, test.convertToFloat());
+  EXPECT_TRUE(losesInfo);
+
+  // Test subnormal conversion to bfloat
+  test = APFloat(APFloat::IEEEsingle(), "0x0.01p-126");
+  test.convert(APFloat::BFloat(), APFloat::rmNearestTiesToEven, &losesInfo);
+  EXPECT_EQ(0.0f, test.convertToFloat());
+  EXPECT_TRUE(losesInfo);
+
+  test = APFloat(APFloat::IEEEsingle(), "0x0.02p-126");
+  test.convert(APFloat::BFloat(), APFloat::rmNearestTiesToEven, &losesInfo);
+  EXPECT_EQ(0x01, test.bitcastToAPInt());
+  EXPECT_FALSE(losesInfo);
+
+  test = APFloat(APFloat::IEEEsingle(), "0x0.01p-126");
+  test.convert(APFloat::BFloat(), APFloat::rmNearestTiesToAway, &losesInfo);
+  EXPECT_EQ(0x01, test.bitcastToAPInt());
+  EXPECT_TRUE(losesInfo);
 }
 
 TEST(APFloatTest, PPCDoubleDouble) {

Original file line number	Diff line number	Diff line change
`@@ -79,21 +79,17 @@ define float @trunc_denorm_lost_fraction0() {`
`79`	`79`	`ret float %b`
`80`	`80`	`}`
`81`	`81`
`82`		`-; FIXME: This should be 0.0.`
`83`		`-`
`84`	`82`	`define float @trunc_denorm_lost_fraction1() {`
`85`	`83`	`; CHECK-LABEL: @trunc_denorm_lost_fraction1(`
`86`		`-; CHECK-NEXT: ret float 0x36A0000000000000`
	`84`	`+; CHECK-NEXT: ret float 0.000000e+00`
`87`	`85`	`;`
`88`	`86`	`%b = fptrunc double 0x0000000010000001 to float`
`89`	`87`	`ret float %b`
`90`	`88`	`}`
`91`	`89`
`92`		`-; FIXME: This should be 0.0.`
`93`		`-`
`94`	`90`	`define float @trunc_denorm_lost_fraction2() {`
`95`	`91`	`; CHECK-LABEL: @trunc_denorm_lost_fraction2(`
`96`		`-; CHECK-NEXT: ret float 0x36A0000000000000`
	`92`	`+; CHECK-NEXT: ret float 0.000000e+00`
`97`	`93`	`;`
`98`	`94`	`%b = fptrunc double 0x000000001fffffff to float`
`99`	`95`	`ret float %b`
`@@ -107,11 +103,9 @@ define float @trunc_denorm_lost_fraction3() {`
`107`	`103`	`ret float %b`
`108`	`104`	`}`
`109`	`105`
`110`		`-; FIXME: This should be -0.0.`
`111`		`-`
`112`	`106`	`define float @trunc_denorm_lost_fraction4() {`
`113`	`107`	`; CHECK-LABEL: @trunc_denorm_lost_fraction4(`
`114`		`-; CHECK-NEXT: ret float 0xB6A0000000000000`
	`108`	`+; CHECK-NEXT: ret float -0.000000e+00`
`115`	`109`	`;`
`116`	`110`	`%b = fptrunc double 0x8000000010000001 to float`
`117`	`111`	`ret float %b`