Skip to content

Commit dddfdd4

Browse files
committed
Add FromNaN32ps(), converts NaN, preserves signal
FromNaN32ps() converts to binary16 NaN without changing the quiet bit so it preserves the signalling. It inlines so it is faster than Fromfloat32(). This was needed because Fromfloat32() is 100% compatible with AMD and Intel F16C instructions. Unfortunately, that means NaN input values are converted to NaN with quiet bit always set in order to be compatible. FromNaN32ps() offers an alternative.
1 parent ba4ba47 commit dddfdd4

File tree

3 files changed

+130
-24
lines changed

3 files changed

+130
-24
lines changed

README.md

Lines changed: 20 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ Current status:
3131
* core API is done and breaking API changes are unlikely.
3232
* 100% of unit tests pass:
3333
* short mode (`go test -short`) tests around 65765 conversions in 0.005s.
34-
* normal mode (`go test`) tests all possible 4+ billion conversions in about 75s.
34+
* normal mode (`go test`) tests all possible 4+ billion conversions in about 95s.
3535
* 100% code coverage with both short mode and normal mode.
3636
* tested on amd64 but it should work on all little-endian platforms supported by Go.
3737

@@ -49,7 +49,7 @@ Unit tests take a fraction of a second to check all 65536 expected values for fl
4949
## Float32 to Float16 Conversion
5050
Conversions from float32 to float16 use IEEE 754 default rounding ("Round-to-Nearest RoundTiesToEven"). All 4294967296 possible float32 to float16 conversions (in pure Go) are confirmed to be correct.
5151

52-
Unit tests in normal mode take about 60-90 seconds to check all 4+ billion expected values for float32 to float16 conversions as well as PrecisionFromfloat32() for each.
52+
Unit tests in normal mode take about 1-2 minutes to check all 4+ billion float32 input values and results for Fromfloat32(), FromNaN32ps(), and PrecisionFromfloat32().
5353

5454
Unit tests in short mode use a small subset (around 229 float32 inputs) and finish in under 0.01 second while still reaching 100% code coverage.
5555

@@ -66,7 +66,7 @@ pi32 := pi16.Float32()
6666
// Only convert if there's no data loss (useful for CBOR encoders)
6767
// PrecisionFromfloat32() is faster than the overhead of calling a function
6868
if float16.PrecisionFromfloat32(pi) == float16.PrecisionExact {
69-
pi16 := float16.Fromfloat32(pi)
69+
pi16 := float16.Fromfloat32(pi)
7070
}
7171
```
7272

@@ -79,23 +79,27 @@ package float16 // import "github.com/cbor-go/float16"
7979
type Float16 uint16
8080
8181
// Exported functions
82-
Fromfloat32(f32 float32) Float16 // Float16 number converted from f32 using IEEE 754 default rounding
83-
Frombits(b16 uint16) Float16 // Float16 number corresponding to b16 (IEEE 754 binary16 rep.)
84-
NaN() Float16 // Float16 of IEEE 754 binary16 not-a-number
85-
Inf(sign int) Float16 // Float16 of IEEE 754 binary16 infinity according to sign
82+
Fromfloat32(f32 float32) Float16 // Float16 number converted from f32 using IEEE 754 default rounding
83+
// with identical results to AMD and Intel F16C hardware. NaN inputs
84+
// are converted with quiet bit always set on, to be like F16C.
85+
FromNaN32ps(nan float32) (Float16, error) // Float16 NaN converted from 32-bit NaN without changing quiet bit.
86+
// The "ps" suffix means "preserve signalling".
87+
Frombits(b16 uint16) Float16 // Float16 number corresponding to b16 (IEEE 754 binary16 rep.)
88+
NaN() Float16 // Float16 of IEEE 754 binary16 not-a-number
89+
Inf(sign int) Float16 // Float16 of IEEE 754 binary16 infinity according to sign
8690
8791
PrecisionFromfloat32(f32 float32) Precision // quickly indicates exact, inexact, overflow, underflow
8892
// (inline and < 1 ns/op)
8993
// Exported methods
90-
(f Float16) Float32() float32 // float32 number converted from f16 using lossless conversion
91-
(f Float16) Bits() uint16 // the IEEE 754 binary16 representation of f
92-
(f Float16) IsNaN() bool // true if f is not-a-number (NaN)
93-
(f Float16) IsQuietNaN() bool // true if f is a quiet not-a-number (NaN)
94-
(f Float16) IsInf(sign int) bool // true if f is infinite based on sign (-1=NegInf, 0=any, 1=PosInf)
95-
(f Float16) IsFinite() bool // true if f is not infinite or NaN
96-
(f Float16) IsNormal() bool // true if f is not zero, infinite, subnormal, or NaN.
97-
(f Float16) Signbit() bool // true if f is negative or negative zero
98-
(f Float16) String() string // string representation of f to satisfy fmt.Stringer interface
94+
(f Float16) Float32() float32 // float32 number converted from f16 using lossless conversion
95+
(f Float16) Bits() uint16 // the IEEE 754 binary16 representation of f
96+
(f Float16) IsNaN() bool // true if f is not-a-number (NaN)
97+
(f Float16) IsQuietNaN() bool // true if f is a quiet not-a-number (NaN)
98+
(f Float16) IsInf(sign int) bool // true if f is infinite based on sign (-1=NegInf, 0=any, 1=PosInf)
99+
(f Float16) IsFinite() bool // true if f is not infinite or NaN
100+
(f Float16) IsNormal() bool // true if f is not zero, infinite, subnormal, or NaN.
101+
(f Float16) Signbit() bool // true if f is negative or negative zero
102+
(f Float16) String() string // string representation of f to satisfy fmt.Stringer interface
99103
```
100104
See [API](https://godoc.org/github.com/cbor-go/float16) at godoc.org for more info.
101105

float16.go

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
package float16
77

88
import (
9+
"errors"
910
"math"
1011
"strconv"
1112
)
@@ -238,3 +239,34 @@ func f32bitsToF16bits(u32 uint32) uint16 {
238239
}
239240
return uint16(halfSign | uHalfExp | halfCoef)
240241
}
242+
243+
// ErrInvalidNaNValue indicates a NaN was not received.
244+
var ErrInvalidNaNValue = errors.New("float16: invalid NaN value, expected IEEE 754 NaN")
245+
246+
// FromNaN32ps converts nan to an IEEE 754 binary16 NaN while preserving the
247+
// sign and the quiet/signaling bit. Only the upper 10 bits of the binary32
248+
// coefficient are kept; the lower 13 bits are dropped. Unlike Fromfloat32(),
// which can only return qNaN because it sets quiet bit = 1, this can return
// both sNaN and qNaN.
249+
// If the truncated result would be infinity (no coefficient bits left), the
250+
// lowest bit of payload is set to make the result a NaN.
// If nan is not a NaN, it returns sNaN 0x7c01 and ErrInvalidNaNValue.
251+
// This function was kept simple to be able to inline.
252+
func FromNaN32ps(nan float32) (Float16, error) {
253+
const SNAN = Float16(uint16(0x7c01)) // signalling NaN
254+
255+
u32 := math.Float32bits(nan)
256+
sign := u32 & 0x80000000
257+
exp := u32 & 0x7f800000
258+
coef := u32 & 0x007fffff
259+
260+
// not a NaN: exponent bits are not all ones, or the coefficient is zero
if (exp != 0x7f800000) || (coef == 0) {
261+
return SNAN, ErrInvalidNaNValue
262+
}
263+
264+
// sign moves to bit 15, exponent is all ones, top 10 coefficient bits are kept
u16 := uint16((sign >> 16) | uint32(0x7c00) | (coef >> 13))
265+
266+
if (u16 & 0x03ff) == 0 {
267+
// result became infinity, make it NaN by setting lowest bit in payload
268+
u16 = u16 | 0x0001
269+
}
270+
271+
return Float16(u16), nil
272+
}

float16_test.go

Lines changed: 78 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -251,7 +251,7 @@ var wantF32toF16bits = []struct {
251251
{in: math.Float32frombits(0x477ff000), out: 0x7c00}, // in f32=65520.000000, out f16=+Inf
252252
}
253253

254-
func Float32parts(f32 float32) (exp int32, coef uint32, dropped uint32) {
254+
func float32parts(f32 float32) (exp int32, coef uint32, dropped uint32) {
255255
const COEFMASK uint32 = 0x7fffff // 23 least significant bits
256256
const EXPSHIFT uint32 = 23
257257
const EXPBIAS uint32 = 127
@@ -264,19 +264,65 @@ func Float32parts(f32 float32) (exp int32, coef uint32, dropped uint32) {
264264
return exp, coef, dropped
265265
}
266266

267-
func IsNaN32(f32 float32) bool {
268-
exp, coef, _ := Float32parts(f32)
267+
// isNaN32 reports whether f32 is a NaN, i.e. float32parts returns an
// exponent of 128 and a nonzero coefficient.
func isNaN32(f32 float32) bool {
268+
exp, coef, _ := float32parts(f32)
269269
return (exp == 128) && (coef != 0)
270270
}
271271

272+
// isQuietNaN32 reports whether f32 is a NaN (exponent 128, nonzero
// coefficient) that also has coefficient bit 0x00400000, the quiet bit, set.
func isQuietNaN32(f32 float32) bool {
273+
exp, coef, _ := float32parts(f32)
274+
return (exp == 128) && (coef != 0) && ((coef & 0x00400000) != 0)
275+
}
276+
277+
// checkFromNaN32ps compares float16.FromNaN32ps(f32) against f16, which the
// callers in this file obtain from float16.Fromfloat32(f32). Non-NaN inputs
// are skipped. For a quiet NaN both results must be identical. For a
// signaling NaN they must differ only in the quiet bit (0x0200), plus the
// lowest bit (0x0001) when the low 9 bits of f16 are zero.
func checkFromNaN32ps(t *testing.T, f32 float32, f16 float16.Float16) {
278+
279+
if !isNaN32(f32) {
280+
return
281+
}
282+
283+
u32 := math.Float32bits(f32)
284+
nan16, err := float16.FromNaN32ps(f32)
285+
286+
if isQuietNaN32(f32) {
287+
// qNaN input: result should be the same as f16
288+
if err != nil {
289+
t.Errorf("FromNaN32ps: qnan = 0x%08x (%f) wanted err = nil, got err = %q", u32, f32, err)
290+
}
291+
if uint16(nan16) != uint16(f16) {
292+
t.Errorf("FromNaN32ps: qnan = 0x%08x (%f) wanted nan16 = %v, got nan16 = %v", u32, f32, f16, nan16)
293+
}
294+
} else {
295+
// sNaN input: result should differ only by the signaling/quiet bit unless payload is empty
296+
if err != nil {
297+
t.Errorf("FromNaN32ps: snan = 0x%08x (%f) wanted err = nil, got err = %q", u32, f32, err)
298+
}
299+
300+
coef := uint16(f16) & uint16(0x03ff)
301+
payload := uint16(f16) & uint16(0x01ff)
302+
diff := uint16(nan16 ^ f16)
303+
304+
if payload == 0 {
305+
// the lowest bit needed to be set to prevent turning sNaN into infinity, so 2 bits differ
306+
if diff != 0x0201 {
307+
t.Errorf("FromNaN32ps: snan = 0x%08x (%f) wanted diff == 0x0201, got 0x%04x", u32, f32, diff)
308+
}
309+
} else {
310+
// only the quiet bit differs (set in f16, left clear in nan16), so 1 bit differs
311+
if diff != 0x0200 {
312+
t.Errorf("FromNaN32ps: snan = 0x%08x (%f) wanted diff == 0x0200, got 0x%04x. f16=0x%04x n16=0x%04x coef=0x%04x", u32, f32, diff, uint16(f16), uint16(nan16), coef)
313+
}
314+
}
315+
}
316+
}
317+
272318
func checkPrecision(t *testing.T, f32 float32, f16 float16.Float16, i uint64) {
273319
u32 := math.Float32bits(f32)
274320
u16 := f16.Bits()
275321
f32bis := f16.Float32()
276322
u32bis := math.Float32bits(f32bis)
277323
pre := float16.PrecisionFromfloat32(f32)
278324
roundtripped := u32 == u32bis
279-
exp32, coef32, dropped32 := Float32parts(f32)
325+
exp32, coef32, dropped32 := float32parts(f32)
280326

281327
if roundtripped && (pre != float16.PrecisionExact) {
282328
// The "undead" 2046 binary32 inputs can round-trip back to original value despite underflowing binary16 result (input exponent < -14).
@@ -287,7 +333,7 @@ func checkPrecision(t *testing.T, f32 float32, f16 float16.Float16, i uint64) {
287333
} else if !roundtripped {
288334
if pre == float16.PrecisionExact {
289335
// this should only happen if both input and output are NaN
290-
if !(f16.IsNaN() && IsNaN32(f32)) {
336+
if !(f16.IsNaN() && isNaN32(f32)) {
291337
t.Errorf("i=%d, PrecisionFromfloat32 in f32bits=0x%08x (%f), out f16bits=0x%04x, back=0x%08x (%f), got PrecisionExact when roundtrip failed with non-special value", i, u32, f32, u16, u32bis, f32bis)
292338
}
293339
} else if pre == float16.PrecisionInexact {
@@ -337,6 +383,29 @@ func TestPrecisionFromfloat32(t *testing.T) {
337383
checkPrecision(t, f32, f16, uint64(0))
338384
}
339385

386+
// TestFromNaN32ps runs checkFromNaN32ps over every input in wantF32toF16bits
// (after confirming the Fromfloat32 result matches the table), then checks
// that a non-NaN input yields ErrInvalidNaNValue together with 0x7c01.
func TestFromNaN32ps(t *testing.T) {
387+
for i, v := range wantF32toF16bits {
388+
f16 := float16.Fromfloat32(v.in)
389+
u16 := uint16(f16)
390+
391+
if u16 != v.out {
392+
t.Errorf("i=%d, in f32bits=0x%08x, wanted=0x%04x, got=0x%04x.", i, math.Float32bits(v.in), v.out, u16)
393+
}
394+
395+
checkFromNaN32ps(t, v.in, f16)
396+
}
397+
398+
// checkFromNaN32ps skips non-NaN input, so exercise the error path directly here
399+
nan, err := float16.FromNaN32ps(float32(math.Pi))
400+
if err != float16.ErrInvalidNaNValue {
401+
t.Errorf("FromNaN32ps: in float32(math.Pi) wanted err float16.ErrInvalidNaNValue, got err = %q", err)
402+
}
403+
if uint16(nan) != 0x7c01 { // signalling NaN
404+
t.Errorf("FromNaN32ps: in float32(math.Pi) wanted nan = 0x7c01, got nan = 0x%04x", uint16(nan))
405+
}
406+
407+
}
408+
340409
// Test a small subset of possible conversions from float32 to Float16.
341410
// TestSomeFromFloat32 runs in under 1 second while TestAllFromFloat32 takes about 1-2 minutes.
342411
func TestSomeFromFloat32(t *testing.T) {
@@ -351,15 +420,15 @@ func TestSomeFromFloat32(t *testing.T) {
351420
}
352421
}
353422

354-
// Test all possible 4294967296 conversions from float32 to float16 and
355-
// also checks PrecisionFromfloat32().
423+
// Test all possible 4294967296 float32 input values and results for
424+
// Fromfloat32(), FromNaN32ps(), and PrecisionFromfloat32().
356425
func TestAllFromFloat32(t *testing.T) {
357426

358427
if testing.Short() {
359428
t.Skip("skipping TestAllFromFloat32 in short mode.")
360429
}
361430

362-
fmt.Printf("WARNING: TestAllFromFloat32 should take 60-90 secs to run on amd64, other platforms may take longer...\n")
431+
fmt.Printf("WARNING: TestAllFromFloat32 should take about 1-2 minutes to run on amd64, other platforms may take longer...\n")
363432

364433
//const wantBlake2b = "3f310bc5608a087462d361644fe66feeb4c68145f6f18eb6f1439cd7914888b6df9e30ae5350dce0635162cc6a2f23b31b3e4353ca132a3c552bdbd58baa54e6"
365434
const wantSHA512 = "08670429a475164d6c4a080969e35231c77ef7069b430b5f38af22e013796b7818bbe8f5942a6ddf26de0e1dfc67d02243f483d85729ebc3762fc2948a5ca1f8"
@@ -376,6 +445,7 @@ func TestAllFromFloat32(t *testing.T) {
376445
f16 := float16.Fromfloat32(inF32)
377446
results[j] = uint16(f16)
378447
checkPrecision(t, inF32, f16, i)
448+
checkFromNaN32ps(t, inF32, f16)
379449
}
380450

381451
// convert results to []byte

0 commit comments

Comments
 (0)