gotch/half/float16.go

304 lines
9.2 KiB
Go
Raw Permalink Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

// Copyright 2019 Montgomery Edwards⁴⁴⁸ and Faye Amacker
//
// Special thanks to Kathryn Long for her Rust implementation
// of float16 at github.com/starkat99/half-rs (MIT license)
// Package half defines support for half-precision floating-point numbers.
package half
import (
"math"
"strconv"
)
// Float16 represents IEEE 754 half-precision floating-point numbers (binary16).
type Float16 uint16
// Precision indicates whether the conversion to Float16 is
// exact, subnormal without dropped bits, inexact, underflow, or overflow.
type Precision int
const (
// PrecisionExact is for non-subnormals that don't drop bits during conversion.
// All of these can round-trip. Should always convert to float16.
PrecisionExact Precision = iota
// PrecisionUnknown is for subnormals that don't drop bits during conversion but
// not all of these can round-trip so precision is unknown without more effort.
// Only 2046 of these can round-trip and the rest cannot round-trip.
PrecisionUnknown
// PrecisionInexact is for dropped significand bits and cannot round-trip.
// Some of these are subnormals. Cannot round-trip float32->float16->float32.
PrecisionInexact
// PrecisionUnderflow is for Underflows. Cannot round-trip float32->float16->float32.
PrecisionUnderflow
// PrecisionOverflow is for Overflows. Cannot round-trip float32->float16->float32.
PrecisionOverflow
)
// PrecisionFromfloat32 returns Precision without performing
// the conversion. Conversions from both Infinity and NaN
// values will always report PrecisionExact even if NaN payload
// or NaN-Quiet-Bit is lost. This function is kept simple to
// allow inlining and run < 0.5 ns/op, to serve as a fast filter.
func PrecisionFromfloat32(f32 float32) Precision {
u32 := math.Float32bits(f32)
if u32 == 0 || u32 == 0x80000000 {
// +- zero will always be exact conversion
return PrecisionExact
}
const COEFMASK uint32 = 0x7fffff // 23 least significant bits
const EXPSHIFT uint32 = 23
const EXPBIAS uint32 = 127
const EXPMASK uint32 = uint32(0xff) << EXPSHIFT
const DROPMASK uint32 = COEFMASK >> 10
exp := int32(((u32 & EXPMASK) >> EXPSHIFT) - EXPBIAS)
coef := u32 & COEFMASK
if exp == 128 {
// +- infinity or NaN
// apps may want to do extra checks for NaN separately
return PrecisionExact
}
// https://en.wikipedia.org/wiki/Half-precision_floating-point_format says,
// "Decimals between 2^24 (minimum positive subnormal) and 2^14 (maximum subnormal): fixed interval 2^24"
if exp < -24 {
return PrecisionUnderflow
}
if exp > 15 {
return PrecisionOverflow
}
if (coef & DROPMASK) != uint32(0) {
// these include subnormals and non-subnormals that dropped bits
return PrecisionInexact
}
if exp < -14 {
// Subnormals. Caller may want to test these further.
// There are 2046 subnormals that can successfully round-trip f32->f16->f32
// and 20 of those 2046 have 32-bit input coef == 0.
// RFC 7049 and 7049bis Draft 12 don't precisely define "preserves value"
// so some protocols and libraries will choose to handle subnormals differently
// when deciding to encode them to CBOR float32 vs float16.
return PrecisionUnknown
}
return PrecisionExact
}
// Frombits returns the float16 number corresponding to the IEEE 754 binary16
// representation u16, with the sign bit of u16 and the result in the same bit
// position. Frombits(Bits(x)) == x.
func Frombits(u16 uint16) Float16 {
return Float16(u16)
}
// Fromfloat32 returns a Float16 value converted from f32. Conversion uses
// IEEE default rounding (nearest int, with ties to even).
func Fromfloat32(f32 float32) Float16 {
return Float16(f32bitsToF16bits(math.Float32bits(f32)))
}
// ErrInvalidNaNValue indicates a NaN was not received.
const ErrInvalidNaNValue = float16Error("float16: invalid NaN value, expected IEEE 754 NaN")
type float16Error string
func (e float16Error) Error() string { return string(e) }
// FromNaN32ps converts nan to IEEE binary16 NaN while preserving both
// signaling and payload. Unlike Fromfloat32(), which can only return
// qNaN because it sets quiet bit = 1, this can return both sNaN and qNaN.
// If the result is infinity (sNaN with empty payload), then the
// lowest bit of payload is set to make the result a NaN.
// Returns ErrInvalidNaNValue and 0x7c01 (sNaN) if nan isn't IEEE 754 NaN.
// This function was kept simple to be able to inline.
func FromNaN32ps(nan float32) (Float16, error) {
const SNAN = Float16(uint16(0x7c01)) // signaling NaN
u32 := math.Float32bits(nan)
sign := u32 & 0x80000000
exp := u32 & 0x7f800000
coef := u32 & 0x007fffff
if (exp != 0x7f800000) || (coef == 0) {
return SNAN, ErrInvalidNaNValue
}
u16 := uint16((sign >> 16) | uint32(0x7c00) | (coef >> 13))
if (u16 & 0x03ff) == 0 {
// result became infinity, make it NaN by setting lowest bit in payload
u16 |= 0x0001
}
return Float16(u16), nil
}
// NaN returns a Float16 of IEEE 754 binary16 not-a-number (NaN).
// Returned NaN value 0x7e01 has all exponent bits = 1 with the
// first and last bits = 1 in the significand. This is consistent
// with Go's 64-bit math.NaN(). Canonical CBOR in RFC 7049 uses 0x7e00.
func NaN() Float16 {
return Float16(0x7e01)
}
// Inf returns a Float16 with an infinity value with the specified sign.
// A sign >= returns positive infinity.
// A sign < 0 returns negative infinity.
func Inf(sign int) Float16 {
if sign >= 0 {
return Float16(0x7c00)
}
return Float16(0x8000 | 0x7c00)
}
// Float32 returns a float32 converted from f (Float16).
// This is a lossless conversion.
func (f Float16) Float32() float32 {
u32 := f16bitsToF32bits(uint16(f))
return math.Float32frombits(u32)
}
// Bits returns the IEEE 754 binary16 representation of f, with the sign bit
// of f and the result in the same bit position. Bits(Frombits(x)) == x.
func (f Float16) Bits() uint16 {
return uint16(f)
}
// IsNaN reports whether f is an IEEE 754 binary16 “not-a-number” value.
func (f Float16) IsNaN() bool {
return (f&0x7c00 == 0x7c00) && (f&0x03ff != 0)
}
// IsQuietNaN reports whether f is a quiet (non-signaling) IEEE 754 binary16
// “not-a-number” value.
func (f Float16) IsQuietNaN() bool {
return (f&0x7c00 == 0x7c00) && (f&0x03ff != 0) && (f&0x0200 != 0)
}
// IsInf reports whether f is an infinity (inf).
// A sign > 0 reports whether f is positive inf.
// A sign < 0 reports whether f is negative inf.
// A sign == 0 reports whether f is either inf.
func (f Float16) IsInf(sign int) bool {
return ((f == 0x7c00) && sign >= 0) ||
(f == 0xfc00 && sign <= 0)
}
// IsFinite returns true if f is neither infinite nor NaN.
func (f Float16) IsFinite() bool {
return (uint16(f) & uint16(0x7c00)) != uint16(0x7c00)
}
// IsNormal returns true if f is neither zero, infinite, subnormal, or NaN.
func (f Float16) IsNormal() bool {
exp := uint16(f) & uint16(0x7c00)
return (exp != uint16(0x7c00)) && (exp != 0)
}
// Signbit reports whether f is negative or negative zero.
func (f Float16) Signbit() bool {
return (uint16(f) & uint16(0x8000)) != 0
}
// String satisfies the fmt.Stringer interface.
func (f Float16) String() string {
return strconv.FormatFloat(float64(f.Float32()), 'f', -1, 32)
}
// f16bitsToF32bits returns uint32 (float32 bits) converted from specified uint16.
func f16bitsToF32bits(in uint16) uint32 {
// All 65536 conversions with this were confirmed to be correct
// by Montgomery Edwards⁴⁴⁸ (github.com/x448).
sign := uint32(in&0x8000) << 16 // sign for 32-bit
exp := uint32(in&0x7c00) >> 10 // exponenent for 16-bit
coef := uint32(in&0x03ff) << 13 // significand for 32-bit
if exp == 0x1f {
if coef == 0 {
// infinity
return sign | 0x7f800000 | coef
}
// NaN
return sign | 0x7fc00000 | coef
}
if exp == 0 {
if coef == 0 {
// zero
return sign
}
// normalize subnormal numbers
exp++
for coef&0x7f800000 == 0 {
coef <<= 1
exp--
}
coef &= 0x007fffff
}
return sign | ((exp + (0x7f - 0xf)) << 23) | coef
}
// f32bitsToF16bits returns uint16 (Float16 bits) converted from the specified float32.
// Conversion rounds to nearest integer with ties to even.
func f32bitsToF16bits(u32 uint32) uint16 {
// Translated from Rust to Go by Montgomery Edwards⁴⁴⁸ (github.com/x448).
// All 4294967296 conversions with this were confirmed to be correct by x448.
// Original Rust implementation is by Kathryn Long (github.com/starkat99) with MIT license.
sign := u32 & 0x80000000
exp := u32 & 0x7f800000
coef := u32 & 0x007fffff
if exp == 0x7f800000 {
// NaN or Infinity
nanBit := uint32(0)
if coef != 0 {
nanBit = uint32(0x0200)
}
return uint16((sign >> 16) | uint32(0x7c00) | nanBit | (coef >> 13))
}
halfSign := sign >> 16
unbiasedExp := int32(exp>>23) - 127
halfExp := unbiasedExp + 15
if halfExp >= 0x1f {
return uint16(halfSign | uint32(0x7c00))
}
if halfExp <= 0 {
if 14-halfExp > 24 {
return uint16(halfSign)
}
c := coef | uint32(0x00800000)
halfCoef := c >> uint32(14-halfExp)
roundBit := uint32(1) << uint32(13-halfExp)
if (c&roundBit) != 0 && (c&(3*roundBit-1)) != 0 {
halfCoef++
}
return uint16(halfSign | halfCoef)
}
uHalfExp := uint32(halfExp) << 10
halfCoef := coef >> 13
roundBit := uint32(0x00001000)
if (coef&roundBit) != 0 && (coef&(3*roundBit-1)) != 0 {
return uint16((halfSign | uHalfExp | halfCoef) + 1)
}
return uint16(halfSign | uHalfExp | halfCoef)
}