core/num/dec2flt/
float.rs

1//! Helper trait for generic float types.
2
3use core::f64;
4
5use crate::fmt::{Debug, LowerExp};
6use crate::num::FpCategory;
7use crate::ops::{self, Add, Div, Mul, Neg};
8
/// Lossy conversion from `Self` to `T`, behaving like an `as` cast between the two types.
pub trait CastInto<T>: Copy
where
    T: Copy,
{
    /// Converts `self` to `T`. As with `as`, the result may be truncated or wrapped.
    fn cast(self) -> T;
}
13
14/// Collection of traits that allow us to be generic over integer size.
15pub trait Integer:
16    Sized
17    + Clone
18    + Copy
19    + Debug
20    + ops::Shr<u32, Output = Self>
21    + ops::Shl<u32, Output = Self>
22    + ops::BitAnd<Output = Self>
23    + ops::BitOr<Output = Self>
24    + PartialEq
25    + CastInto<i16>
26{
27    const ZERO: Self;
28    const ONE: Self;
29}
30
/// Implements `Integer` and `CastInto<i16>` (as a plain `as` cast) for every listed
/// primitive integer type.
macro_rules! int {
    ($($int:ty),+) => {
        $(
            impl Integer for $int {
                const ZERO: Self = 0;
                const ONE: Self = 1;
            }

            impl CastInto<i16> for $int {
                fn cast(self) -> i16 {
                    // Deliberately lossy: wider values are truncated to 16 bits.
                    self as i16
                }
            }
        )+
    }
}
47
// Provide `Integer`/`CastInto<i16>` for the unsigned types used as `RawFloat::Int` below.
int!(u16, u32, u64);
49
50/// A helper trait to avoid duplicating basically all the conversion code for IEEE floats.
51///
52/// See the parent module's doc comment for why this is necessary.
53///
54/// Should **never ever** be implemented for other types or be used outside the `dec2flt` module.
55#[doc(hidden)]
56pub trait RawFloat:
57    Sized
58    + Div<Output = Self>
59    + Neg<Output = Self>
60    + Mul<Output = Self>
61    + Add<Output = Self>
62    + LowerExp
63    + PartialEq
64    + PartialOrd
65    + Default
66    + Clone
67    + Copy
68    + Debug
69{
70    /// The unsigned integer with the same size as the float
71    type Int: Integer + Into<u64>;
72
73    /* general constants */
74
75    const INFINITY: Self;
76    const NEG_INFINITY: Self;
77    const NAN: Self;
78    const NEG_NAN: Self;
79
80    /// Bit width of the float
81    const BITS: u32;
82
83    /// The number of bits in the significand, *including* the hidden bit.
84    const SIG_TOTAL_BITS: u32;
85
86    const EXP_MASK: Self::Int;
87    const SIG_MASK: Self::Int;
88
89    /// The number of bits in the significand, *excluding* the hidden bit.
90    const SIG_BITS: u32 = Self::SIG_TOTAL_BITS - 1;
91
92    /// Number of bits in the exponent.
93    const EXP_BITS: u32 = Self::BITS - Self::SIG_BITS - 1;
94
95    /// The saturated (maximum bitpattern) value of the exponent, i.e. the infinite
96    /// representation.
97    ///
98    /// This shifted fully right, use `EXP_MASK` for the shifted value.
99    const EXP_SAT: u32 = (1 << Self::EXP_BITS) - 1;
100
101    /// Signed version of `EXP_SAT` since we convert a lot.
102    const INFINITE_POWER: i32 = Self::EXP_SAT as i32;
103
104    /// The exponent bias value. This is also the maximum value of the exponent.
105    const EXP_BIAS: u32 = Self::EXP_SAT >> 1;
106
107    /// Minimum exponent value of normal values.
108    const EXP_MIN: i32 = -(Self::EXP_BIAS as i32 - 1);
109
110    /// Round-to-even only happens for negative values of q
111    /// when q ≥ −4 in the 64-bit case and when q ≥ −17 in
112    /// the 32-bitcase.
113    ///
114    /// When q ≥ 0,we have that 5^q ≤ 2m+1. In the 64-bit case,we
115    /// have 5^q ≤ 2m+1 ≤ 2^54 or q ≤ 23. In the 32-bit case,we have
116    /// 5^q ≤ 2m+1 ≤ 2^25 or q ≤ 10.
117    ///
118    /// When q < 0, we have w ≥ (2m+1)×5^−q. We must have that w < 2^64
119    /// so (2m+1)×5^−q < 2^64. We have that 2m+1 > 2^53 (64-bit case)
120    /// or 2m+1 > 2^24 (32-bit case). Hence,we must have 2^53×5^−q < 2^64
121    /// (64-bit) and 2^24×5^−q < 2^64 (32-bit). Hence we have 5^−q < 2^11
122    /// or q ≥ −4 (64-bit case) and 5^−q < 2^40 or q ≥ −17 (32-bitcase).
123    ///
124    /// Thus we have that we only need to round ties to even when
125    /// we have that q ∈ [−4,23](in the 64-bit case) or q∈[−17,10]
126    /// (in the 32-bit case). In both cases,the power of five(5^|q|)
127    /// fits in a 64-bit word.
128    const MIN_EXPONENT_ROUND_TO_EVEN: i32;
129    const MAX_EXPONENT_ROUND_TO_EVEN: i32;
130
131    /* limits related to Fast pathing */
132
133    /// Largest decimal exponent for a non-infinite value.
134    ///
135    /// This is the max exponent in binary converted to the max exponent in decimal. Allows fast
136    /// pathing anything larger than `10^LARGEST_POWER_OF_TEN`, which will round to infinity.
137    const LARGEST_POWER_OF_TEN: i32 = {
138        let largest_pow2 = Self::EXP_BIAS + 1;
139        pow2_to_pow10(largest_pow2 as i64) as i32
140    };
141
142    /// Smallest decimal exponent for a non-zero value. This allows for fast pathing anything
143    /// smaller than `10^SMALLEST_POWER_OF_TEN`, which will round to zero.
144    ///
145    /// The smallest power of ten is represented by `⌊log10(2^-n / (2^64 - 1))⌋`, where `n` is
146    /// the smallest power of two. The `2^64 - 1)` denomenator comes from the number of values
147    /// that are representable by the intermediate storage format. I don't actually know _why_
148    /// the storage format is relevant here.
149    ///
150    /// The values may be calculated using the formula. Unfortunately we cannot calculate them at
151    /// compile time since intermediates exceed the range of an `f64`.
152    const SMALLEST_POWER_OF_TEN: i32;
153
154    /// Maximum exponent for a fast path case, or `⌊(SIG_BITS+1)/log2(5)⌋`
155    // assuming FLT_EVAL_METHOD = 0
156    const MAX_EXPONENT_FAST_PATH: i64 = {
157        let log2_5 = f64::consts::LOG2_10 - 1.0;
158        (Self::SIG_TOTAL_BITS as f64 / log2_5) as i64
159    };
160
161    /// Minimum exponent for a fast path case, or `-⌊(SIG_BITS+1)/log2(5)⌋`
162    const MIN_EXPONENT_FAST_PATH: i64 = -Self::MAX_EXPONENT_FAST_PATH;
163
164    /// Maximum exponent that can be represented for a disguised-fast path case.
165    /// This is `MAX_EXPONENT_FAST_PATH + ⌊(SIG_BITS+1)/log2(10)⌋`
166    const MAX_EXPONENT_DISGUISED_FAST_PATH: i64 =
167        Self::MAX_EXPONENT_FAST_PATH + (Self::SIG_TOTAL_BITS as f64 / f64::consts::LOG2_10) as i64;
168
169    /// Maximum mantissa for the fast-path (`1 << 53` for f64).
170    const MAX_MANTISSA_FAST_PATH: u64 = 1 << Self::SIG_TOTAL_BITS;
171
172    /// Converts integer into float through an as cast.
173    /// This is only called in the fast-path algorithm, and therefore
174    /// will not lose precision, since the value will always have
175    /// only if the value is <= Self::MAX_MANTISSA_FAST_PATH.
176    fn from_u64(v: u64) -> Self;
177
178    /// Performs a raw transmutation from an integer.
179    fn from_u64_bits(v: u64) -> Self;
180
181    /// Gets a small power-of-ten for fast-path multiplication.
182    fn pow10_fast_path(exponent: usize) -> Self;
183
184    /// Returns the category that this number falls into.
185    fn classify(self) -> FpCategory;
186
187    /// Transmute to the integer representation
188    fn to_bits(self) -> Self::Int;
189
190    /// Returns the mantissa, exponent and sign as integers.
191    ///
192    /// This returns `(m, p, s)` such that `s * m * 2^p` represents the original float. For 0, the
193    /// exponent will be `-(EXP_BIAS + SIG_BITS)`, which is the minimum subnormal power. For
194    /// infinity or NaN, the exponent will be `EXP_SAT - EXP_BIAS - SIG_BITS`.
195    ///
196    /// If subnormal, the mantissa will be shifted one bit to the left. Otherwise, it is returned
197    /// with the explicit bit set but otherwise unshifted
198    ///
199    /// `s` is only ever +/-1.
200    fn integer_decode(self) -> (u64, i16, i8) {
201        let bits = self.to_bits();
202        let sign: i8 = if bits >> (Self::BITS - 1) == Self::Int::ZERO { 1 } else { -1 };
203        let mut exponent: i16 = ((bits & Self::EXP_MASK) >> Self::SIG_BITS).cast();
204        let mantissa = if exponent == 0 {
205            (bits & Self::SIG_MASK) << 1
206        } else {
207            (bits & Self::SIG_MASK) | (Self::Int::ONE << Self::SIG_BITS)
208        };
209        // Exponent bias + mantissa shift
210        exponent -= (Self::EXP_BIAS + Self::SIG_BITS) as i16;
211        (mantissa.into(), exponent, sign)
212    }
213}
214
/// Solve for `b` in `10^b = 2^a`, i.e. `b = a / log2(10)`.
///
/// The quotient is computed in `f64` and then converted with an `as` cast, so the
/// fractional part is discarded (truncation toward zero).
const fn pow2_to_pow10(a: i64) -> i64 {
    ((a as f64) / f64::consts::LOG2_10) as i64
}
220
/// `RawFloat` for the 16-bit float, whose bit pattern is held in a `u16`.
#[cfg(target_has_reliable_f16)]
impl RawFloat for f16 {
    type Int = u16;

    const BITS: u32 = 16;
    const SIG_TOTAL_BITS: u32 = Self::MANTISSA_DIGITS;
    const EXP_MASK: Self::Int = Self::EXP_MASK;
    const SIG_MASK: Self::Int = Self::MAN_MASK;

    const INFINITY: Self = Self::INFINITY;
    const NEG_INFINITY: Self = Self::NEG_INFINITY;
    const NAN: Self = Self::NAN;
    const NEG_NAN: Self = -Self::NAN;

    const MIN_EXPONENT_ROUND_TO_EVEN: i32 = -22;
    const MAX_EXPONENT_ROUND_TO_EVEN: i32 = 5;
    const SMALLEST_POWER_OF_TEN: i32 = -27;

    #[inline]
    fn from_u64(v: u64) -> Self {
        // Callers must stay within the fast-path mantissa range.
        debug_assert!(v <= Self::MAX_MANTISSA_FAST_PATH);
        v as Self
    }

    #[inline]
    fn from_u64_bits(v: u64) -> Self {
        // Only the low 16 bits form the bit pattern.
        let low_bits = (v & 0xFFFF) as u16;
        Self::from_bits(low_bits)
    }

    fn pow10_fast_path(exponent: usize) -> Self {
        // Padded with zeros to 8 entries so the masked index below is always in bounds.
        #[allow(clippy::use_self)]
        const TABLE: [f16; 8] = [1e0, 1e1, 1e2, 1e3, 1e4, 0.0, 0.0, 0.0];
        TABLE[exponent & 7]
    }

    fn classify(self) -> FpCategory {
        self.classify()
    }

    fn to_bits(self) -> Self::Int {
        self.to_bits()
    }
}
264
265impl RawFloat for f32 {
266    type Int = u32;
267
268    const INFINITY: Self = f32::INFINITY;
269    const NEG_INFINITY: Self = f32::NEG_INFINITY;
270    const NAN: Self = f32::NAN;
271    const NEG_NAN: Self = -f32::NAN;
272
273    const BITS: u32 = 32;
274    const SIG_TOTAL_BITS: u32 = Self::MANTISSA_DIGITS;
275    const EXP_MASK: Self::Int = Self::EXP_MASK;
276    const SIG_MASK: Self::Int = Self::MAN_MASK;
277
278    const MIN_EXPONENT_ROUND_TO_EVEN: i32 = -17;
279    const MAX_EXPONENT_ROUND_TO_EVEN: i32 = 10;
280    const SMALLEST_POWER_OF_TEN: i32 = -65;
281
282    #[inline]
283    fn from_u64(v: u64) -> Self {
284        debug_assert!(v <= Self::MAX_MANTISSA_FAST_PATH);
285        v as _
286    }
287
288    #[inline]
289    fn from_u64_bits(v: u64) -> Self {
290        f32::from_bits((v & 0xFFFFFFFF) as u32)
291    }
292
293    fn pow10_fast_path(exponent: usize) -> Self {
294        #[allow(clippy::use_self)]
295        const TABLE: [f32; 16] =
296            [1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, 0., 0., 0., 0., 0.];
297        TABLE[exponent & 15]
298    }
299
300    fn to_bits(self) -> Self::Int {
301        self.to_bits()
302    }
303
304    fn classify(self) -> FpCategory {
305        self.classify()
306    }
307}
308
309impl RawFloat for f64 {
310    type Int = u64;
311
312    const INFINITY: Self = Self::INFINITY;
313    const NEG_INFINITY: Self = Self::NEG_INFINITY;
314    const NAN: Self = Self::NAN;
315    const NEG_NAN: Self = -Self::NAN;
316
317    const BITS: u32 = 64;
318    const SIG_TOTAL_BITS: u32 = Self::MANTISSA_DIGITS;
319    const EXP_MASK: Self::Int = Self::EXP_MASK;
320    const SIG_MASK: Self::Int = Self::MAN_MASK;
321
322    const MIN_EXPONENT_ROUND_TO_EVEN: i32 = -4;
323    const MAX_EXPONENT_ROUND_TO_EVEN: i32 = 23;
324    const SMALLEST_POWER_OF_TEN: i32 = -342;
325
326    #[inline]
327    fn from_u64(v: u64) -> Self {
328        debug_assert!(v <= Self::MAX_MANTISSA_FAST_PATH);
329        v as _
330    }
331
332    #[inline]
333    fn from_u64_bits(v: u64) -> Self {
334        f64::from_bits(v)
335    }
336
337    fn pow10_fast_path(exponent: usize) -> Self {
338        const TABLE: [f64; 32] = [
339            1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, 1e11, 1e12, 1e13, 1e14, 1e15,
340            1e16, 1e17, 1e18, 1e19, 1e20, 1e21, 1e22, 0., 0., 0., 0., 0., 0., 0., 0., 0.,
341        ];
342        TABLE[exponent & 31]
343    }
344
345    fn to_bits(self) -> Self::Int {
346        self.to_bits()
347    }
348
349    fn classify(self) -> FpCategory {
350        self.classify()
351    }
352}