GDAL
cpl_float.h
1/******************************************************************************
2 *
3 * Project: CPL
4 * Purpose: Floating point conversion functions. Convert 16- and 24-bit
5 * floating point numbers into the 32-bit IEEE 754 compliant ones.
6 * Author: Andrey Kiselev, dron@remotesensing.org
7 *
8 ******************************************************************************
9 * Copyright (c) 2005, Andrey Kiselev <dron@remotesensing.org>
10 * Copyright (c) 2010, Even Rouault <even dot rouault at spatialys.com>
11 *
12 * This code is based on the code from OpenEXR project with the following
13 * copyright:
14 *
15 * Copyright (c) 2002, Industrial Light & Magic, a division of Lucas
16 * Digital Ltd. LLC
17 *
18 * All rights reserved.
19 *
20 * Redistribution and use in source and binary forms, with or without
21 * modification, are permitted provided that the following conditions are
22 * met:
23 * * Redistributions of source code must retain the above copyright
24 * notice, this list of conditions and the following disclaimer.
25 * * Redistributions in binary form must reproduce the above
26 * copyright notice, this list of conditions and the following disclaimer
27 * in the documentation and/or other materials provided with the
28 * distribution.
29 * * Neither the name of Industrial Light & Magic nor the names of
30 * its contributors may be used to endorse or promote products derived
31 * from this software without specific prior written permission.
32 *
33 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
34 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
35 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
36 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
37 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
38 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
39 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
40 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
41 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
42 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
43 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
44 *
45 ****************************************************************************/
46
47#ifndef CPL_FLOAT_H_INCLUDED
48#define CPL_FLOAT_H_INCLUDED
49
50#include "cpl_port.h"
51
52#ifdef __cplusplus
53#include <algorithm>
54#include <cmath>
55#include <cstdint>
56#include <cstring>
57#include <limits>
58#ifdef HAVE_STD_FLOAT16_T
59#include <stdfloat>
60#endif
61#endif
62
64GUInt32 CPL_DLL CPLHalfToFloat(GUInt16 iHalf);
65GUInt32 CPL_DLL CPLTripleToFloat(GUInt32 iTriple);
67
68#ifdef __cplusplus
69
70GUInt16 CPL_DLL CPLFloatToHalf(GUInt32 iFloat32, bool &bHasWarned);
71
72GUInt16 CPL_DLL CPLConvertFloatToHalf(float fFloat32);
73float CPL_DLL CPLConvertHalfToFloat(GUInt16 nHalf);
74
75#if defined(__F16C__)
76#include <immintrin.h>
77#ifndef GDALCopyXMMToInt16_defined
78#define GDALCopyXMMToInt16_defined
79
80static inline void GDALCopyXMMToInt16(const __m128i xmm, void *pDest)
81{
82 GInt16 i = static_cast<GInt16>(_mm_extract_epi16(xmm, 0));
83 memcpy(pDest, &i, 2);
84}
85#endif
86#endif
87
88namespace cpl
89{
90
91// We define our own version of `std::numeric_limits` so that we can
92// specialize it for `cpl::Float16` if necessary. Specializing
93// `std::numeric_limits` doesn't always work because some libraries
94// use `std::numeric_limits`, and one cannot specialize a type
95// template after it has been used.
96template <typename T> struct NumericLimits : std::numeric_limits<T>
97{
98};
99
100#ifndef HAVE_STD_FLOAT16_T
101
102// Define a type `cpl::Float16`. If the compiler supports it natively
103// (as `_Float16`), then this class is a simple wrapper. Otherwise we
104// store the values in a `GUInt16` as bit pattern.
105
107struct Float16
108{
109 struct make_from_bits_and_value
110 {
111 };
112
113#ifdef HAVE__FLOAT16
114
115 // How we represent a `Float16` internally
116 using repr = _Float16;
117
118 // How we compute on `Float16` values
119 using compute = _Float16;
120
121 // Create a Float16 in a constexpr manner. Since we can't convert
122 // bits in a constexpr function, we need to take both the bit
123 // pattern and a float value as input, and can then choose which
124 // of the two to use.
125 constexpr Float16(make_from_bits_and_value, CPL_UNUSED std::uint16_t bits,
126 float fValue)
127 : rValue(repr(fValue))
128 {
129 }
130
131 static constexpr repr computeToRepr(compute fValue)
132 {
133 return fValue;
134 }
135
136 static constexpr compute reprToCompute(repr rValue)
137 {
138 return rValue;
139 }
140
141 template <typename T> static constexpr repr toRepr(T fValue)
142 {
143 return static_cast<repr>(fValue);
144 }
145
146 template <typename T> static constexpr T fromRepr(repr rValue)
147 {
148 return static_cast<T>(rValue);
149 }
150
151#else // #ifndef HAVE__FLOAT16
152
153 // How we represent a `Float16` internally
154 using repr = std::uint16_t;
155
156 // How we compute on `Float16` values
157 using compute = float;
158
159 // Create a Float16 in a constexpr manner. Since we can't convert
160 // bits in a constexpr function, we need to take both the bit
161 // pattern and a float value as input, and can then choose which
162 // of the two to use.
163 constexpr Float16(make_from_bits_and_value, std::uint16_t bits,
164 CPL_UNUSED float fValue)
165 : rValue(bits)
166 {
167 }
168
169 static unsigned float2unsigned(float f)
170 {
171 unsigned u;
172 std::memcpy(&u, &f, 4);
173 return u;
174 }
175
176 static float unsigned2float(unsigned u)
177 {
178 float f;
179 std::memcpy(&f, &u, 4);
180 return f;
181 }
182
183 // Copied from cpl_float.cpp so that we can inline for performance
184 static std::uint16_t computeToRepr(float fFloat32)
185 {
186 std::uint32_t iFloat32 = float2unsigned(fFloat32);
187
188 std::uint32_t iSign = (iFloat32 >> 31) & 0x00000001;
189 std::uint32_t iExponent = (iFloat32 >> 23) & 0x000000ff;
190 std::uint32_t iMantissa = iFloat32 & 0x007fffff;
191
192 if (iExponent == 255)
193 {
194 if (iMantissa == 0)
195 {
196 // Positive or negative infinity.
197 return static_cast<std::int16_t>((iSign << 15) | 0x7C00);
198 }
199
200 // NaN -- preserve sign and significand bits.
201 if (iMantissa >> 13)
202 return static_cast<std::int16_t>((iSign << 15) | 0x7C00 |
203 (iMantissa >> 13));
204 return static_cast<std::int16_t>((iSign << 15) | 0x7E00);
205 }
206
207 if (iExponent <= 127 - 15)
208 {
209 // Zero, float32 denormalized number or float32 too small normalized
210 // number
211 if (13 + 1 + 127 - 15 - iExponent >= 32)
212 return static_cast<std::int16_t>(iSign << 15);
213
214 // Return a denormalized number
215 return static_cast<std::int16_t>(
216 (iSign << 15) |
217 ((iMantissa | 0x00800000) >> (13 + 1 + 127 - 15 - iExponent)));
218 }
219
220 if (iExponent - (127 - 15) >= 31)
221 {
222 return static_cast<std::int16_t>((iSign << 15) |
223 0x7C00); // Infinity
224 }
225
226 // Normalized number.
227 iExponent = iExponent - (127 - 15);
228 iMantissa = iMantissa >> 13;
229
230 // Assemble sign, exponent and mantissa.
231 // coverity[overflow_sink]
232 return static_cast<std::int16_t>((iSign << 15) | (iExponent << 10) |
233 iMantissa);
234 }
235
236 // Copied from cpl_float.cpp so that we can inline for performance
237 static float reprToCompute(std::uint16_t iHalf)
238 {
239 std::uint32_t iSign = (iHalf >> 15) & 0x00000001;
240 int iExponent = (iHalf >> 10) & 0x0000001f;
241 std::uint32_t iMantissa = iHalf & 0x000003ff;
242
243 if (iExponent == 31)
244 {
245 if (iMantissa == 0)
246 {
247 // Positive or negative infinity.
248 return unsigned2float((iSign << 31) | 0x7f800000);
249 }
250
251 // NaN -- preserve sign and significand bits.
252 return unsigned2float((iSign << 31) | 0x7f800000 |
253 (iMantissa << 13));
254 }
255
256 if (iExponent == 0)
257 {
258 if (iMantissa == 0)
259 {
260 // Plus or minus zero.
261 return unsigned2float(iSign << 31);
262 }
263
264 // Denormalized number -- renormalize it.
265 while (!(iMantissa & 0x00000400))
266 {
267 iMantissa <<= 1;
268 iExponent -= 1;
269 }
270
271 iExponent += 1;
272 iMantissa &= ~0x00000400U;
273 }
274
275 // Normalized number.
276 iExponent = iExponent + (127 - 15);
277 iMantissa = iMantissa << 13;
278
279 // Assemble sign, exponent and mantissa.
280 /* coverity[overflow_sink] */
281 return unsigned2float((iSign << 31) |
282 (static_cast<std::uint32_t>(iExponent) << 23) |
283 iMantissa);
284 }
285
286 template <typename T> static repr toRepr(T value)
287 {
288#ifdef __F16C__
289 float fValue = static_cast<float>(value);
290 __m128 xmm_float = _mm_load_ss(&fValue);
291 const __m128i xmm_hfloat =
292 _mm_cvtps_ph(xmm_float, _MM_FROUND_TO_NEAREST_INT);
293 repr hfValueOut;
294 GDALCopyXMMToInt16(xmm_hfloat, &hfValueOut);
295 return hfValueOut;
296#else
297 return computeToRepr(static_cast<compute>(value));
298#endif
299 }
300
301 template <typename T> static T fromRepr(repr rValue)
302 {
303#ifdef __F16C__
304 __m128i xmm;
305 memcpy(&xmm, &rValue, sizeof(repr));
306 float fValueOut;
307 _mm_store_ss(&fValueOut, _mm_cvtph_ps(xmm));
308 return static_cast<T>(fValueOut);
309#else
310 return static_cast<T>(reprToCompute(rValue));
311#endif
312 }
313
314#endif // #ifndef HAVE__FLOAT16
315
316 private:
317 repr rValue;
318
319 public:
320 compute get() const
321 {
322 return reprToCompute(rValue);
323 }
324
325 // cppcheck-suppress uninitMemberVar
326 Float16() = default;
327 Float16(const Float16 &) = default;
328 Float16(Float16 &&) = default;
329 Float16 &operator=(const Float16 &) = default;
330 Float16 &operator=(Float16 &&) = default;
331
332 // Constructors and conversion operators
333
334#ifdef HAVE__FLOAT16
335 // cppcheck-suppress noExplicitConstructor
336 constexpr Float16(_Float16 hfValue) : rValue(hfValue)
337 {
338 }
339
340 constexpr operator _Float16() const
341 {
342 return rValue;
343 }
344#endif
345
346 // cppcheck-suppress-macro noExplicitConstructor
347#define GDAL_DEFINE_CONVERSION(TYPE) \
348 \
349 Float16(TYPE fValue) : rValue(toRepr(fValue)) \
350 { \
351 } \
352 \
353 operator TYPE() const \
354 { \
355 return fromRepr<TYPE>(rValue); \
356 }
357
358 GDAL_DEFINE_CONVERSION(float)
359 GDAL_DEFINE_CONVERSION(double)
360 GDAL_DEFINE_CONVERSION(char)
361 GDAL_DEFINE_CONVERSION(signed char)
362 GDAL_DEFINE_CONVERSION(short)
363 GDAL_DEFINE_CONVERSION(int)
364 GDAL_DEFINE_CONVERSION(long)
365 GDAL_DEFINE_CONVERSION(long long)
366 GDAL_DEFINE_CONVERSION(unsigned char)
367 GDAL_DEFINE_CONVERSION(unsigned short)
368 GDAL_DEFINE_CONVERSION(unsigned int)
369 GDAL_DEFINE_CONVERSION(unsigned long)
370 GDAL_DEFINE_CONVERSION(unsigned long long)
371
372#undef GDAL_DEFINE_CONVERSION
373
374 // Arithmetic operators
375
376 friend Float16 operator+(Float16 x)
377 {
378 return +x.get();
379 }
380
381 friend Float16 operator-(Float16 x)
382 {
383 return -x.get();
384 }
385
386#define GDAL_DEFINE_ARITHOP(OP) \
387 \
388 friend Float16 operator OP(Float16 x, Float16 y) \
389 { \
390 return x.get() OP y.get(); \
391 } \
392 \
393 friend double operator OP(double x, Float16 y) \
394 { \
395 return x OP static_cast<double>(y.get()); \
396 } \
397 \
398 friend float operator OP(float x, Float16 y) \
399 { \
400 return x OP static_cast<float>(y.get()); \
401 } \
402 \
403 friend Float16 operator OP(int x, Float16 y) \
404 { \
405 return x OP static_cast<float>(y.get()); \
406 } \
407 \
408 friend double operator OP(Float16 x, double y) \
409 { \
410 return static_cast<double>(x.get()) OP y; \
411 } \
412 \
413 friend float operator OP(Float16 x, float y) \
414 { \
415 return static_cast<float>(x.get()) OP y; \
416 } \
417 \
418 friend Float16 operator OP(Float16 x, int y) \
419 { \
420 return static_cast<float>(x.get()) OP y; \
421 }
422
423 GDAL_DEFINE_ARITHOP(+)
424 GDAL_DEFINE_ARITHOP(-)
425 GDAL_DEFINE_ARITHOP(*)
426 GDAL_DEFINE_ARITHOP(/)
427
428#undef GDAL_DEFINE_ARITHOP
429
430 // Comparison operators
431
432#define GDAL_DEFINE_COMPARISON(OP) \
433 \
434 friend bool operator OP(Float16 x, Float16 y) \
435 { \
436 return x.get() OP y.get(); \
437 } \
438 \
439 friend bool operator OP(float x, Float16 y) \
440 { \
441 return x OP static_cast<float>(y.get()); \
442 } \
443 \
444 friend bool operator OP(double x, Float16 y) \
445 { \
446 return x OP static_cast<double>(y.get()); \
447 } \
448 \
449 friend bool operator OP(int x, Float16 y) \
450 { \
451 return x OP static_cast<float>(y.get()); \
452 } \
453 \
454 friend bool operator OP(Float16 x, float y) \
455 { \
456 return static_cast<float>(x.get()) OP y; \
457 } \
458 \
459 friend bool operator OP(Float16 x, double y) \
460 { \
461 return static_cast<double>(x.get()) OP y; \
462 } \
463 \
464 friend bool operator OP(Float16 x, int y) \
465 { \
466 return static_cast<float>(x.get()) OP y; \
467 }
468
469 GDAL_DEFINE_COMPARISON(==)
470 GDAL_DEFINE_COMPARISON(!=)
471 GDAL_DEFINE_COMPARISON(<)
472 GDAL_DEFINE_COMPARISON(>)
473 GDAL_DEFINE_COMPARISON(<=)
474 GDAL_DEFINE_COMPARISON(>=)
475
476#undef GDAL_DEFINE_COMPARISON
477
478 // Standard math functions
479
480 friend bool isfinite(Float16 x)
481 {
482 using std::isfinite;
483 return isfinite(float(x));
484 }
485
486 friend bool isinf(Float16 x)
487 {
488 using std::isinf;
489 return isinf(float(x));
490 }
491
492 friend bool isnan(Float16 x)
493 {
494 using std::isnan;
495 return isnan(float(x));
496 }
497
498 friend bool isnormal(Float16 x)
499 {
500 using std::isnormal;
501 return isnormal(float(x));
502 }
503
504 friend bool signbit(Float16 x)
505 {
506 using std::signbit;
507 return signbit(float(x));
508 }
509
510 friend Float16 abs(Float16 x)
511 {
512 using std::abs;
513 return Float16(abs(float(x)));
514 }
515
516 friend Float16 cbrt(Float16 x)
517 {
518 using std::cbrt;
519 return Float16(cbrt(float(x)));
520 }
521
522 friend Float16 ceil(Float16 x)
523 {
524 using std::ceil;
525 return Float16(ceil(float(x)));
526 }
527
528 friend Float16 copysign(Float16 x, Float16 y)
529 {
530 using std::copysign;
531 return Float16(copysign(float(x), float(y)));
532 }
533
534 friend Float16 fabs(Float16 x)
535 {
536 using std::fabs;
537 return Float16(fabs(float(x)));
538 }
539
540 friend Float16 floor(Float16 x)
541 {
542 using std::floor;
543 return Float16(floor(float(x)));
544 }
545
546 friend Float16 fmax(Float16 x, Float16 y)
547 {
548 using std::fmax;
549 return Float16(fmax(float(x), float(y)));
550 }
551
552 friend Float16 fmin(Float16 x, Float16 y)
553 {
554 using std::fmin;
555 return Float16(fmin(float(x), float(y)));
556 }
557
558 friend Float16 hypot(Float16 x, Float16 y)
559 {
560 using std::hypot;
561 return Float16(hypot(float(x), float(y)));
562 }
563
564 friend Float16 max(Float16 x, Float16 y)
565 {
566 using std::max;
567 return Float16(max(float(x), float(y)));
568 }
569
570 friend Float16 min(Float16 x, Float16 y)
571 {
572 using std::min;
573 return Float16(min(float(x), float(y)));
574 }
575
576 // Adapted from the LLVM Project, under the Apache License v2.0
577 friend Float16 nextafter(Float16 x, Float16 y)
578 {
579 if (isnan(x))
580 return x;
581 if (isnan(y))
582 return y;
583 if (x == y)
584 return y;
585
586 std::uint16_t bits;
587 if (x != Float16(0))
588 {
589 std::memcpy(&bits, &x.rValue, 2);
590 if ((x < y) == (x > Float16(0)))
591 ++bits;
592 else
593 --bits;
594 }
595 else
596 {
597 bits = (signbit(y) << 15) | 0x0001;
598 }
599
600 Float16 r;
601 std::memcpy(&r.rValue, &bits, 2);
602
603 return r;
604 }
605
606 friend Float16 pow(Float16 x, Float16 y)
607 {
608 using std::pow;
609 return Float16(pow(float(x), float(y)));
610 }
611
612 friend Float16 pow(Float16 x, int n)
613 {
614 using std::pow;
615 return Float16(pow(float(x), n));
616 }
617
618 friend Float16 round(Float16 x)
619 {
620 using std::round;
621 return Float16(round(float(x)));
622 }
623
624 friend Float16 sqrt(Float16 x)
625 {
626 using std::sqrt;
627 return Float16(sqrt(float(x)));
628 }
629};
630
631template <> struct NumericLimits<Float16>
632{
633 static constexpr bool is_specialized = true;
634 static constexpr bool is_signed = true;
635 static constexpr bool is_integer = false;
636 static constexpr bool is_exact = false;
637 static constexpr bool has_infinity = true;
638 static constexpr bool has_quiet_NaN = true;
639 static constexpr bool has_signaling_NaN = true;
640 static constexpr bool has_denorm = true;
641 static constexpr bool is_iec559 = true;
642
643 static constexpr int digits = 11;
644 static constexpr int digits10 = 3;
645 static constexpr int max_digits10 = 5;
646 static constexpr int radix = 2;
647
648 static constexpr Float16 epsilon()
649 {
650 return Float16(Float16::make_from_bits_and_value{}, 0x1400, 0.000977f);
651 }
652
653 static constexpr Float16 min()
654 {
655 return Float16(Float16::make_from_bits_and_value{}, 0x0001, 6.0e-8f);
656 }
657
658 static constexpr Float16 lowest()
659 {
660 return Float16(Float16::make_from_bits_and_value{}, 0xfbff, -65504.0f);
661 }
662
663 static constexpr Float16 max()
664 {
665 return Float16(Float16::make_from_bits_and_value{}, 0x7bff, +65504.0f);
666 }
667
668 static constexpr Float16 infinity()
669 {
670 return Float16(Float16::make_from_bits_and_value{}, 0x7c00,
671 std::numeric_limits<float>::infinity());
672 }
673
674 static constexpr Float16 quiet_NaN()
675 {
676 return Float16(Float16::make_from_bits_and_value{}, 0x7e00,
677 std::numeric_limits<float>::quiet_NaN());
678 }
679
680 static constexpr Float16 signaling_NaN()
681 {
682 return Float16(Float16::make_from_bits_and_value{}, 0xfe00,
683 std::numeric_limits<float>::signaling_NaN());
684 }
685};
686
688
689#endif // #ifndef HAVE_STD_FLOAT16_T
690
691} // namespace cpl
692
693#ifdef HAVE_STD_FLOAT16_T
694using GFloat16 = std::float16_t;
695#else
696using GFloat16 = cpl::Float16;
697#endif
698
699// Define some GDAL wrappers. Their C equivalents are defined in `cpl_port.h`.
700// (These wrappers are not necessary any more in C++, one can always
701// call `isnan` etc directly.)
702
703template <typename T> constexpr int CPLIsNan(T x)
704{
705 // We need to write `using std::isnan` instead of directly using
706 // `std::isnan` because `std::isnan` only supports the types
707 // `float` and `double`. The `isnan` for `cpl::Float16` is found in the
708 // `cpl` namespace via argument-dependent lookup
709 // <https://en.cppreference.com/w/cpp/language/adl>.
710 using std::isnan;
711 return isnan(x);
712}
713
714template <typename T> constexpr int CPLIsInf(T x)
715{
716 using std::isinf;
717 return isinf(x);
718}
719
720template <typename T> constexpr int CPLIsFinite(T x)
721{
722 using std::isfinite;
723 return isfinite(x);
724}
725
726#endif // #ifdef __cplusplus
727
728double CPL_DLL CPLGreatestCommonDivisor(double x, double y);
729
730#endif // CPL_FLOAT_H_INCLUDED
Core portability definitions for CPL.
short GInt16
Int16 type.
Definition cpl_port.h:171
#define CPL_C_END
Macro to end a block of C symbols.
Definition cpl_port.h:289
#define CPL_UNUSED
Qualifier for an argument that is unused.
Definition cpl_port.h:879
#define CPL_C_START
Macro to start a block of C symbols.
Definition cpl_port.h:285
unsigned int GUInt32
Unsigned int32 type.
Definition cpl_port.h:167
unsigned short GUInt16
Unsigned int16 type.
Definition cpl_port.h:173
Definition cpl_float.h:97