gdalsse_priv.h
/******************************************************************************
 *
 * Project:  GDAL
 * Purpose:  SSE2 helper
 * Author:   Even Rouault <even dot rouault at spatialys dot com>
 *
 ******************************************************************************
 * Copyright (c) 2014, Even Rouault <even dot rouault at spatialys dot com>
 *
 * SPDX-License-Identifier: MIT
 ****************************************************************************/

#ifndef GDALSSE_PRIV_H_INCLUDED
#define GDALSSE_PRIV_H_INCLUDED

#ifndef DOXYGEN_SKIP

#include "cpl_port.h"

/* We restrict to 64-bit processors because they are guaranteed to have SSE2 */
/* This could also be used on 32-bit, but we would need a runtime check */
#if (defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2)) && \
    !defined(USE_SSE2_EMULATION)

#include <string.h>

#ifdef USE_NEON_OPTIMIZATIONS
#include "include_sse2neon.h"
#else
/* Requires SSE2 */
#include <emmintrin.h>

#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
#include <smmintrin.h>
#endif
#endif

#include "gdal_priv_templates.hpp"

static inline __m128i GDALCopyInt16ToXMM(const void *ptr)
{
    unsigned short s;
    memcpy(&s, ptr, 2);
    return _mm_cvtsi32_si128(s);
}

static inline __m128i GDALCopyInt32ToXMM(const void *ptr)
{
    GInt32 i;
    memcpy(&i, ptr, 4);
    return _mm_cvtsi32_si128(i);
}

static inline __m128i GDALCopyInt64ToXMM(const void *ptr)
{
#if defined(__i386__) || defined(_M_IX86)
    return _mm_loadl_epi64(static_cast<const __m128i *>(ptr));
#else
    GInt64 i;
    memcpy(&i, ptr, 8);
    return _mm_cvtsi64_si128(i);
#endif
}

#ifndef GDALCopyXMMToInt16_defined
#define GDALCopyXMMToInt16_defined

static inline void GDALCopyXMMToInt16(const __m128i xmm, void *pDest)
{
    GInt16 i = static_cast<GInt16>(_mm_extract_epi16(xmm, 0));
    memcpy(pDest, &i, 2);
}
#endif
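
/* These helpers go through memcpy rather than dereferencing the pointer
 * directly, so that unaligned and strict-aliasing-unsafe accesses stay well
 * defined; optimizing compilers typically turn the fixed-size memcpy into a
 * single load or store. A minimal usage sketch:
 *
 *     unsigned char abyPixels[4] = {1, 2, 3, 4};
 *     const __m128i v = GDALCopyInt32ToXMM(abyPixels);  // 4 bytes -> low 32 bits
 */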

class XMMReg4Int;

class XMMReg4Double;

class XMMReg4Float
{
  public:
    __m128 xmm;

    XMMReg4Float()
#if !defined(_MSC_VER)
        : xmm(_mm_undefined_ps())
#endif
    {
    }

    XMMReg4Float(const XMMReg4Float &other) : xmm(other.xmm)
    {
    }

    static inline XMMReg4Float Zero()
    {
        XMMReg4Float reg;
        reg.Zeroize();
        return reg;
    }

    static inline XMMReg4Float Set1(float f)
    {
        XMMReg4Float reg;
        reg.xmm = _mm_set1_ps(f);
        return reg;
    }

    static inline XMMReg4Float Load4Val(const float *ptr)
    {
        XMMReg4Float reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    static inline XMMReg4Float Load4Val(const unsigned char *ptr)
    {
        XMMReg4Float reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    static inline XMMReg4Float Load4Val(const short *ptr)
    {
        XMMReg4Float reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    static inline XMMReg4Float Load4Val(const unsigned short *ptr)
    {
        XMMReg4Float reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    static inline XMMReg4Float Load4Val(const int *ptr)
    {
        XMMReg4Float reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    static inline XMMReg4Float Equals(const XMMReg4Float &expr1,
                                      const XMMReg4Float &expr2)
    {
        XMMReg4Float reg;
        reg.xmm = _mm_cmpeq_ps(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg4Float NotEquals(const XMMReg4Float &expr1,
                                         const XMMReg4Float &expr2)
    {
        XMMReg4Float reg;
        reg.xmm = _mm_cmpneq_ps(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg4Float Lesser(const XMMReg4Float &expr1,
                                      const XMMReg4Float &expr2)
    {
        XMMReg4Float reg;
        reg.xmm = _mm_cmplt_ps(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg4Float Greater(const XMMReg4Float &expr1,
                                       const XMMReg4Float &expr2)
    {
        XMMReg4Float reg;
        reg.xmm = _mm_cmpgt_ps(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg4Float And(const XMMReg4Float &expr1,
                                   const XMMReg4Float &expr2)
    {
        XMMReg4Float reg;
        reg.xmm = _mm_and_ps(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg4Float Ternary(const XMMReg4Float &cond,
                                       const XMMReg4Float &true_expr,
                                       const XMMReg4Float &false_expr)
    {
        XMMReg4Float reg;
        reg.xmm = _mm_or_ps(_mm_and_ps(cond.xmm, true_expr.xmm),
                            _mm_andnot_ps(cond.xmm, false_expr.xmm));
        return reg;
    }

    static inline XMMReg4Float Min(const XMMReg4Float &expr1,
                                   const XMMReg4Float &expr2)
    {
        XMMReg4Float reg;
        reg.xmm = _mm_min_ps(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg4Float Max(const XMMReg4Float &expr1,
                                   const XMMReg4Float &expr2)
    {
        XMMReg4Float reg;
        reg.xmm = _mm_max_ps(expr1.xmm, expr2.xmm);
        return reg;
    }

    inline void nsLoad4Val(const float *ptr)
    {
        xmm = _mm_loadu_ps(ptr);
    }

    static inline void Load16Val(const float *ptr, XMMReg4Float &r0,
                                 XMMReg4Float &r1, XMMReg4Float &r2,
                                 XMMReg4Float &r3)
    {
        r0.nsLoad4Val(ptr);
        r1.nsLoad4Val(ptr + 4);
        r2.nsLoad4Val(ptr + 8);
        r3.nsLoad4Val(ptr + 12);
    }

    inline void nsLoad4Val(const int *ptr)
    {
        const __m128i xmm_i =
            _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
        xmm = _mm_cvtepi32_ps(xmm_i);
    }

    static inline void Load16Val(const int *ptr, XMMReg4Float &r0,
                                 XMMReg4Float &r1, XMMReg4Float &r2,
                                 XMMReg4Float &r3)
    {
        r0.nsLoad4Val(ptr);
        r1.nsLoad4Val(ptr + 4);
        r2.nsLoad4Val(ptr + 8);
        r3.nsLoad4Val(ptr + 12);
    }

    static inline __m128i cvtepu8_epi32(__m128i x)
    {
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
        return _mm_cvtepu8_epi32(x);
#else
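        /* Zero-extend the 4 low bytes to int32: u8 -> u16, then u16 -> u32 */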
        return _mm_unpacklo_epi16(_mm_unpacklo_epi8(x, _mm_setzero_si128()),
                                  _mm_setzero_si128());
#endif
    }

    inline void nsLoad4Val(const unsigned char *ptr)
    {
        const __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
        xmm = _mm_cvtepi32_ps(cvtepu8_epi32(xmm_i));
    }

    static inline void Load8Val(const unsigned char *ptr, XMMReg4Float &r0,
                                XMMReg4Float &r1)
    {
        const __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        r0.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(xmm_i));
        r1.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(_mm_srli_si128(xmm_i, 4)));
    }

    static inline void Load16Val(const unsigned char *ptr, XMMReg4Float &r0,
                                 XMMReg4Float &r1, XMMReg4Float &r2,
                                 XMMReg4Float &r3)
    {
        const __m128i xmm_i =
            _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
        r0.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(xmm_i));
        r1.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(_mm_srli_si128(xmm_i, 4)));
        r2.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(_mm_srli_si128(xmm_i, 8)));
        r3.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(_mm_srli_si128(xmm_i, 12)));
    }

    static inline __m128i cvtepi16_epi32(__m128i x)
    {
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
        return _mm_cvtepi16_epi32(x);
#else
        /* 0|0|0|0|b|b|a|a --> 0|0|0|0|sign(b)|b|sign(a)|a */
        return _mm_srai_epi32(
            /* 0|0|0|0|0|0|b|a --> 0|0|0|0|b|b|a|a */
            _mm_unpacklo_epi16(x, x), 16);
#endif
    }

    inline void nsLoad4Val(const short *ptr)
    {
        const __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        xmm = _mm_cvtepi32_ps(cvtepi16_epi32(xmm_i));
    }

    static inline void Load8Val(const short *ptr, XMMReg4Float &r0,
                                XMMReg4Float &r1)
    {
        const __m128i xmm_i =
            _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
        r0.xmm = _mm_cvtepi32_ps(cvtepi16_epi32(xmm_i));
        r1.xmm = _mm_cvtepi32_ps(cvtepi16_epi32(_mm_srli_si128(xmm_i, 8)));
    }

    static inline void Load16Val(const short *ptr, XMMReg4Float &r0,
                                 XMMReg4Float &r1, XMMReg4Float &r2,
                                 XMMReg4Float &r3)
    {
        Load8Val(ptr, r0, r1);
        Load8Val(ptr + 8, r2, r3);
    }

    static inline __m128i cvtepu16_epi32(__m128i x)
    {
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
        return _mm_cvtepu16_epi32(x);
#else
        return _mm_unpacklo_epi16(
            x, _mm_setzero_si128()); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|0|b|0|a */
#endif
    }

    inline void nsLoad4Val(const unsigned short *ptr)
    {
        const __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        xmm = _mm_cvtepi32_ps(cvtepu16_epi32(xmm_i));
    }

    static inline void Load8Val(const unsigned short *ptr, XMMReg4Float &r0,
                                XMMReg4Float &r1)
    {
        const __m128i xmm_i =
            _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
        r0.xmm = _mm_cvtepi32_ps(cvtepu16_epi32(xmm_i));
        r1.xmm = _mm_cvtepi32_ps(cvtepu16_epi32(_mm_srli_si128(xmm_i, 8)));
    }

    static inline void Load16Val(const unsigned short *ptr, XMMReg4Float &r0,
                                 XMMReg4Float &r1, XMMReg4Float &r2,
                                 XMMReg4Float &r3)
    {
        Load8Val(ptr, r0, r1);
        Load8Val(ptr + 8, r2, r3);
    }

    inline void Zeroize()
    {
        xmm = _mm_setzero_ps();
    }

    inline XMMReg4Float &operator=(const XMMReg4Float &other)
    {
        xmm = other.xmm;
        return *this;
    }

    inline XMMReg4Float &operator+=(const XMMReg4Float &other)
    {
        xmm = _mm_add_ps(xmm, other.xmm);
        return *this;
    }

    inline XMMReg4Float &operator-=(const XMMReg4Float &other)
    {
        xmm = _mm_sub_ps(xmm, other.xmm);
        return *this;
    }

    inline XMMReg4Float &operator*=(const XMMReg4Float &other)
    {
        xmm = _mm_mul_ps(xmm, other.xmm);
        return *this;
    }

    inline XMMReg4Float operator+(const XMMReg4Float &other) const
    {
        XMMReg4Float ret;
        ret.xmm = _mm_add_ps(xmm, other.xmm);
        return ret;
    }

    inline XMMReg4Float operator-(const XMMReg4Float &other) const
    {
        XMMReg4Float ret;
        ret.xmm = _mm_sub_ps(xmm, other.xmm);
        return ret;
    }

    inline XMMReg4Float operator*(const XMMReg4Float &other) const
    {
        XMMReg4Float ret;
        ret.xmm = _mm_mul_ps(xmm, other.xmm);
        return ret;
    }

    inline XMMReg4Float operator/(const XMMReg4Float &other) const
    {
        XMMReg4Float ret;
        ret.xmm = _mm_div_ps(xmm, other.xmm);
        return ret;
    }

    inline XMMReg4Float inverse() const
    {
        XMMReg4Float ret;
        ret.xmm = _mm_div_ps(_mm_set1_ps(1.0f), xmm);
        return ret;
    }

    inline XMMReg4Int truncate_to_int() const;

    inline XMMReg4Double cast_to_double() const;

    inline void Store4Val(float *ptr) const
    {
        _mm_storeu_ps(ptr, xmm);
    }

    inline void Store4ValAligned(float *ptr) const
    {
        _mm_store_ps(ptr, xmm);
    }

    inline operator float() const
    {
        return _mm_cvtss_f32(xmm);
    }
};
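
/* Illustrative sketch (hypothetical helper, not in GDAL): clamp four packed
 * floats to [0, 255] with the wrappers above.
 */
static inline XMMReg4Float GDALExampleClamp4f(const float *pafIn)
{
    XMMReg4Float v = XMMReg4Float::Load4Val(pafIn);        // unaligned load
    v = XMMReg4Float::Max(v, XMMReg4Float::Zero());        // lower bound
    v = XMMReg4Float::Min(v, XMMReg4Float::Set1(255.0f));  // upper bound
    return v;
}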

class XMMReg4Int
{
  public:
    __m128i xmm;

    XMMReg4Int()
#if !defined(_MSC_VER)
        : xmm(_mm_undefined_si128())
#endif
    {
    }

    XMMReg4Int(const XMMReg4Int &other) : xmm(other.xmm)
    {
    }

    static inline XMMReg4Int Zero()
    {
        XMMReg4Int reg;
        reg.xmm = _mm_setzero_si128();
        return reg;
    }

    static inline XMMReg4Int Set1(int i)
    {
        XMMReg4Int reg;
        reg.xmm = _mm_set1_epi32(i);
        return reg;
    }

    static inline XMMReg4Int Load4Val(const int *ptr)
    {
        XMMReg4Int reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const int *ptr)
    {
        xmm = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
    }

    static inline XMMReg4Int Equals(const XMMReg4Int &expr1,
                                    const XMMReg4Int &expr2)
    {
        XMMReg4Int reg;
        reg.xmm = _mm_cmpeq_epi32(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg4Int Ternary(const XMMReg4Int &cond,
                                     const XMMReg4Int &true_expr,
                                     const XMMReg4Int &false_expr)
    {
        XMMReg4Int reg;
        reg.xmm = _mm_or_si128(_mm_and_si128(cond.xmm, true_expr.xmm),
                               _mm_andnot_si128(cond.xmm, false_expr.xmm));
        return reg;
    }

    inline XMMReg4Int &operator+=(const XMMReg4Int &other)
    {
        xmm = _mm_add_epi32(xmm, other.xmm);
        return *this;
    }

    inline XMMReg4Int &operator-=(const XMMReg4Int &other)
    {
        xmm = _mm_sub_epi32(xmm, other.xmm);
        return *this;
    }

    inline XMMReg4Int operator+(const XMMReg4Int &other) const
    {
        XMMReg4Int ret;
        ret.xmm = _mm_add_epi32(xmm, other.xmm);
        return ret;
    }

    inline XMMReg4Int operator-(const XMMReg4Int &other) const
    {
        XMMReg4Int ret;
        ret.xmm = _mm_sub_epi32(xmm, other.xmm);
        return ret;
    }

    XMMReg4Double cast_to_double() const;

    XMMReg4Float to_float() const
    {
        XMMReg4Float ret;
        ret.xmm = _mm_cvtepi32_ps(xmm);
        return ret;
    }
};

inline XMMReg4Int XMMReg4Float::truncate_to_int() const
{
    XMMReg4Int ret;
    ret.xmm = _mm_cvttps_epi32(xmm);
    return ret;
}
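
/* Illustrative sketch (hypothetical helper, not in GDAL): scale four packed
 * int32 values by a float factor and truncate the result back to int32.
 */
static inline XMMReg4Int GDALExampleScale4i(const int *panIn, float fScale)
{
    const XMMReg4Int v = XMMReg4Int::Load4Val(panIn);
    const XMMReg4Float scaled = v.to_float() * XMMReg4Float::Set1(fScale);
    return scaled.truncate_to_int();  // rounds toward zero
}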

class XMMReg8Byte
{
  public:
    __m128i xmm;

    XMMReg8Byte()
#if !defined(_MSC_VER)
        : xmm(_mm_undefined_si128())
#endif
    {
    }

    XMMReg8Byte(const XMMReg8Byte &other) : xmm(other.xmm)
    {
    }

    static inline XMMReg8Byte Zero()
    {
        XMMReg8Byte reg;
        reg.xmm = _mm_setzero_si128();
        return reg;
    }

    static inline XMMReg8Byte Set1(char i)
    {
        XMMReg8Byte reg;
        reg.xmm = _mm_set1_epi8(i);
        return reg;
    }

    static inline XMMReg8Byte Equals(const XMMReg8Byte &expr1,
                                     const XMMReg8Byte &expr2)
    {
        XMMReg8Byte reg;
        reg.xmm = _mm_cmpeq_epi8(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg8Byte Or(const XMMReg8Byte &expr1,
                                 const XMMReg8Byte &expr2)
    {
        XMMReg8Byte reg;
        reg.xmm = _mm_or_si128(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg8Byte Ternary(const XMMReg8Byte &cond,
                                      const XMMReg8Byte &true_expr,
                                      const XMMReg8Byte &false_expr)
    {
        XMMReg8Byte reg;
        reg.xmm = _mm_or_si128(_mm_and_si128(cond.xmm, true_expr.xmm),
                               _mm_andnot_si128(cond.xmm, false_expr.xmm));
        return reg;
    }

    inline XMMReg8Byte operator+(const XMMReg8Byte &other) const
    {
        XMMReg8Byte ret;
        ret.xmm = _mm_add_epi8(xmm, other.xmm);
        return ret;
    }

    inline XMMReg8Byte operator-(const XMMReg8Byte &other) const
    {
        XMMReg8Byte ret;
        ret.xmm = _mm_sub_epi8(xmm, other.xmm);
        return ret;
    }

    static inline XMMReg8Byte Pack(const XMMReg4Int &r0, const XMMReg4Int &r1)
    {
        XMMReg8Byte reg;
        reg.xmm = _mm_packs_epi32(r0.xmm, r1.xmm);
        reg.xmm = _mm_packus_epi16(reg.xmm, reg.xmm);
        return reg;
    }

    inline void Store8Val(unsigned char *ptr) const
    {
        GDALCopyXMMToInt64(xmm, reinterpret_cast<GInt64 *>(ptr));
    }
};
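
/* Illustrative sketch (hypothetical helper, not in GDAL): narrow eight int32
 * values to eight bytes; the two pack steps inside Pack() clamp each value
 * to [0, 255] with saturation.
 */
static inline void GDALExamplePack8ToByte(const int *panIn,
                                          unsigned char *pabyOut)
{
    const XMMReg4Int lo = XMMReg4Int::Load4Val(panIn);
    const XMMReg4Int hi = XMMReg4Int::Load4Val(panIn + 4);
    XMMReg8Byte::Pack(lo, hi).Store8Val(pabyOut);
}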

class XMMReg2Double
{
  public:
    __m128d xmm;

    XMMReg2Double()
#if !defined(_MSC_VER)
        : xmm(_mm_undefined_pd())
#endif
    {
    }

    XMMReg2Double(double val) : xmm(_mm_load_sd(&val))
    {
    }

    XMMReg2Double(const XMMReg2Double &other) : xmm(other.xmm)
    {
    }

    static inline XMMReg2Double Set1(double d)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_set1_pd(d);
        return reg;
    }

    static inline XMMReg2Double Zero()
    {
        XMMReg2Double reg;
        reg.Zeroize();
        return reg;
    }

    static inline XMMReg2Double Load1ValHighAndLow(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const float *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2ValAligned(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2ValAligned(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned char *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const short *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned short *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const int *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Equals(const XMMReg2Double &expr1,
                                       const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpeq_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double NotEquals(const XMMReg2Double &expr1,
                                          const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpneq_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double Greater(const XMMReg2Double &expr1,
                                        const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpgt_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double And(const XMMReg2Double &expr1,
                                    const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_and_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double Ternary(const XMMReg2Double &cond,
                                        const XMMReg2Double &true_expr,
                                        const XMMReg2Double &false_expr)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_or_pd(_mm_and_pd(cond.xmm, true_expr.xmm),
                            _mm_andnot_pd(cond.xmm, false_expr.xmm));
        return reg;
    }

    static inline XMMReg2Double Min(const XMMReg2Double &expr1,
                                    const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_min_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    inline void nsLoad1ValHighAndLow(const double *ptr)
    {
        xmm = _mm_load1_pd(ptr);
    }

    inline void nsLoad2Val(const double *ptr)
    {
        xmm = _mm_loadu_pd(ptr);
    }

    inline void nsLoad2ValAligned(const double *ptr)
    {
        xmm = _mm_load_pd(ptr);
    }

    inline void nsLoad2Val(const float *ptr)
    {
        xmm = _mm_cvtps_pd(_mm_castsi128_ps(GDALCopyInt64ToXMM(ptr)));
    }

    inline void nsLoad2Val(const int *ptr)
    {
        xmm = _mm_cvtepi32_pd(GDALCopyInt64ToXMM(ptr));
    }

    inline void nsLoad2Val(const unsigned char *ptr)
    {
        __m128i xmm_i = GDALCopyInt16ToXMM(ptr);
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    inline void nsLoad2Val(const short *ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
        xmm_i = _mm_cvtepi16_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi16(
            xmm_i, xmm_i); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|b|b|a|a */
        xmm_i = _mm_srai_epi32(
            xmm_i, 16); /* 0|0|0|0|b|b|a|a --> 0|0|0|0|sign(b)|b|sign(a)|a */
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    inline void nsLoad2Val(const unsigned short *ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
        xmm_i = _mm_cvtepu16_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi16(
            xmm_i,
            _mm_setzero_si128()); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|0|b|0|a */
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    static inline void Load4Val(const unsigned char *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
#endif
        low.xmm = _mm_cvtepi32_pd(xmm_i);
        high.xmm =
            _mm_cvtepi32_pd(_mm_shuffle_epi32(xmm_i, _MM_SHUFFLE(3, 2, 3, 2)));
    }

    static inline void Load4Val(const short *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const unsigned short *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const double *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const float *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        __m128 temp1 = _mm_loadu_ps(ptr);
        __m128 temp2 = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3, 2, 3, 2));
        low.xmm = _mm_cvtps_pd(temp1);
        high.xmm = _mm_cvtps_pd(temp2);
    }

    inline void Zeroize()
    {
        xmm = _mm_setzero_pd();
    }

    inline XMMReg2Double &operator=(const XMMReg2Double &other)
    {
        xmm = other.xmm;
        return *this;
    }

    inline XMMReg2Double &operator+=(const XMMReg2Double &other)
    {
        xmm = _mm_add_pd(xmm, other.xmm);
        return *this;
    }

    inline XMMReg2Double &operator*=(const XMMReg2Double &other)
    {
        xmm = _mm_mul_pd(xmm, other.xmm);
        return *this;
    }

    inline XMMReg2Double operator+(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_add_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator-(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_sub_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator*(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_mul_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator/(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_div_pd(xmm, other.xmm);
        return ret;
    }

    inline double GetHorizSum() const
    {
        __m128d xmm2;
        xmm2 = _mm_shuffle_pd(
            xmm, xmm,
            _MM_SHUFFLE2(0, 1)); /* transfer high word into low word of xmm2 */
        return _mm_cvtsd_f64(_mm_add_sd(xmm, xmm2));
    }

    inline void Store2Val(double *ptr) const
    {
        _mm_storeu_pd(ptr, xmm);
    }

    inline void Store2ValAligned(double *ptr) const
    {
        _mm_store_pd(ptr, xmm);
    }

    inline void Store2Val(float *ptr) const
    {
        __m128i xmm_i = _mm_castps_si128(_mm_cvtpd_ps(xmm));
        GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64 *>(ptr));
    }

    inline void Store2Val(unsigned char *ptr) const
    {
        __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(
            xmm,
            _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
        tmp = _mm_packs_epi32(tmp, tmp);
        tmp = _mm_packus_epi16(tmp, tmp);
        GDALCopyXMMToInt16(tmp, reinterpret_cast<GInt16 *>(ptr));
    }

    inline void Store2Val(unsigned short *ptr) const
    {
        __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(
            xmm,
            _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
        // X X X X 0 B 0 A --> X X X X A A B A
        tmp = _mm_shufflelo_epi16(tmp, 0 | (2 << 2));
        GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32 *>(ptr));
    }

    inline void StoreMask(unsigned char *ptr) const
    {
        _mm_storeu_si128(reinterpret_cast<__m128i *>(ptr),
                         _mm_castpd_si128(xmm));
    }

    inline operator double() const
    {
        return _mm_cvtsd_f64(xmm);
    }
};
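
/* Illustrative sketch (hypothetical helper, not in GDAL): sum an array of
 * doubles two lanes at a time, then reduce with GetHorizSum(). Assumes
 * nCount is even.
 */
static inline double GDALExampleSum(const double *padfIn, int nCount)
{
    XMMReg2Double acc = XMMReg2Double::Zero();
    for (int i = 0; i < nCount; i += 2)
        acc += XMMReg2Double::Load2Val(padfIn + i);  // unaligned 2-double load
    return acc.GetHorizSum();  // low lane + high lane
}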

#else

#ifndef NO_WARN_USE_SSE2_EMULATION
#warning "Software emulation of SSE2!"
#endif

class XMMReg2Double
{
  public:
    double low;
    double high;

    // cppcheck-suppress uninitMemberVar
    XMMReg2Double() = default;

    explicit XMMReg2Double(double val)
    {
        low = val;
        high = 0.0;
    }

    XMMReg2Double(const XMMReg2Double &other) : low(other.low), high(other.high)
    {
    }

    static inline XMMReg2Double Zero()
    {
        XMMReg2Double reg;
        reg.Zeroize();
        return reg;
    }

    static inline XMMReg2Double Set1(double d)
    {
        XMMReg2Double reg;
        reg.low = d;
        reg.high = d;
        return reg;
    }

    static inline XMMReg2Double Load1ValHighAndLow(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }

    static inline XMMReg2Double Equals(const XMMReg2Double &expr1,
                                       const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;

        if (expr1.low == expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high == expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }

    static inline XMMReg2Double NotEquals(const XMMReg2Double &expr1,
                                          const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;

        if (expr1.low != expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high != expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }

    static inline XMMReg2Double Greater(const XMMReg2Double &expr1,
                                        const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;

        if (expr1.low > expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high > expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }

    static inline XMMReg2Double And(const XMMReg2Double &expr1,
                                    const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        int low1[2], high1[2];
        int low2[2], high2[2];
        memcpy(low1, &expr1.low, sizeof(double));
        memcpy(high1, &expr1.high, sizeof(double));
        memcpy(low2, &expr2.low, sizeof(double));
        memcpy(high2, &expr2.high, sizeof(double));
        low1[0] &= low2[0];
        low1[1] &= low2[1];
        high1[0] &= high2[0];
        high1[1] &= high2[1];
        memcpy(&reg.low, low1, sizeof(double));
        memcpy(&reg.high, high1, sizeof(double));
        return reg;
    }

    static inline XMMReg2Double Ternary(const XMMReg2Double &cond,
                                        const XMMReg2Double &true_expr,
                                        const XMMReg2Double &false_expr)
    {
        XMMReg2Double reg;
        if (cond.low != 0)
            reg.low = true_expr.low;
        else
            reg.low = false_expr.low;
        if (cond.high != 0)
            reg.high = true_expr.high;
        else
            reg.high = false_expr.high;
        return reg;
    }

    static inline XMMReg2Double Min(const XMMReg2Double &expr1,
                                    const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.low = (expr1.low < expr2.low) ? expr1.low : expr2.low;
        reg.high = (expr1.high < expr2.high) ? expr1.high : expr2.high;
        return reg;
    }

    static inline XMMReg2Double Load2Val(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2ValAligned(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2ValAligned(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const float *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned char *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const short *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned short *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const int *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    inline void nsLoad1ValHighAndLow(const double *ptr)
    {
        low = ptr[0];
        high = ptr[0];
    }

    inline void nsLoad2Val(const double *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2ValAligned(const double *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const float *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const unsigned char *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const short *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const unsigned short *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const int *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    static inline void Load4Val(const unsigned char *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.low = ptr[0];
        low.high = ptr[1];
        high.low = ptr[2];
        high.high = ptr[3];
    }

    static inline void Load4Val(const short *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const unsigned short *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const double *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const float *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    inline void Zeroize()
    {
        low = 0.0;
        high = 0.0;
    }

    inline XMMReg2Double &operator=(const XMMReg2Double &other)
    {
        low = other.low;
        high = other.high;
        return *this;
    }

    inline XMMReg2Double &operator+=(const XMMReg2Double &other)
    {
        low += other.low;
        high += other.high;
        return *this;
    }

    inline XMMReg2Double &operator*=(const XMMReg2Double &other)
    {
        low *= other.low;
        high *= other.high;
        return *this;
    }

    inline XMMReg2Double operator+(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.low = low + other.low;
        ret.high = high + other.high;
        return ret;
    }

    inline XMMReg2Double operator-(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.low = low - other.low;
        ret.high = high - other.high;
        return ret;
    }

    inline XMMReg2Double operator*(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.low = low * other.low;
        ret.high = high * other.high;
        return ret;
    }

    inline XMMReg2Double operator/(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.low = low / other.low;
        ret.high = high / other.high;
        return ret;
    }

    inline double GetHorizSum() const
    {
        return low + high;
    }

    inline void Store2Val(double *ptr) const
    {
        ptr[0] = low;
        ptr[1] = high;
    }

    inline void Store2ValAligned(double *ptr) const
    {
        ptr[0] = low;
        ptr[1] = high;
    }

    inline void Store2Val(float *ptr) const
    {
        ptr[0] = static_cast<float>(low);
        ptr[1] = static_cast<float>(high);
    }

    void Store2Val(unsigned char *ptr) const
    {
        ptr[0] = static_cast<unsigned char>(low + 0.5);
        ptr[1] = static_cast<unsigned char>(high + 0.5);
    }

    void Store2Val(unsigned short *ptr) const
    {
        ptr[0] = static_cast<GUInt16>(low + 0.5);
        ptr[1] = static_cast<GUInt16>(high + 0.5);
    }

    inline void StoreMask(unsigned char *ptr) const
    {
        memcpy(ptr, &low, 8);
        memcpy(ptr + 8, &high, 8);
    }

    inline operator double() const
    {
        return low;
    }
};

#endif /* defined(__x86_64) || defined(_M_X64) */

#if defined(__AVX__) && !defined(USE_SSE2_EMULATION)

#include <immintrin.h>

class XMMReg4Double
{
  public:
    __m256d ymm;

    XMMReg4Double() : ymm(_mm256_setzero_pd())
    {
    }

    XMMReg4Double(const XMMReg4Double &other) : ymm(other.ymm)
    {
    }

    static inline XMMReg4Double Zero()
    {
        XMMReg4Double reg;
        reg.Zeroize();
        return reg;
    }

    static inline XMMReg4Double Set1(double d)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_set1_pd(d);
        return reg;
    }

    inline void Zeroize()
    {
        ymm = _mm256_setzero_pd();
    }

    static inline XMMReg4Double Load1ValHighAndLow(const double *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }

    inline void nsLoad1ValHighAndLow(const double *ptr)
    {
        ymm = _mm256_set1_pd(*ptr);
    }

    static inline XMMReg4Double Load4Val(const unsigned char *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const unsigned char *ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
        ymm = _mm256_cvtepi32_pd(xmm_i);
    }

    static inline void Load8Val(const unsigned char *ptr, XMMReg4Double &low,
                                XMMReg4Double &high)
    {
        const __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        const __m128i xmm_i_low = _mm_cvtepu8_epi32(xmm_i);
        low.ymm = _mm256_cvtepi32_pd(xmm_i_low);
        const __m128i xmm_i_high = _mm_cvtepu8_epi32(_mm_srli_si128(xmm_i, 4));
        high.ymm = _mm256_cvtepi32_pd(xmm_i_high);
    }

    static inline XMMReg4Double Load4Val(const short *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const short *ptr)
    {
        __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        xmm_i = _mm_cvtepi16_epi32(xmm_i);
        ymm = _mm256_cvtepi32_pd(xmm_i);
    }

    static inline void Load8Val(const short *ptr, XMMReg4Double &low,
                                XMMReg4Double &high)
    {
        low.nsLoad4Val(ptr);
        high.nsLoad4Val(ptr + 4);
    }

    static inline XMMReg4Double Load4Val(const unsigned short *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const unsigned short *ptr)
    {
        __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        xmm_i = _mm_cvtepu16_epi32(xmm_i);
        ymm = _mm256_cvtepi32_pd(
            xmm_i);  // ok to use signed conversion since we are in the ushort
                     // range, so cannot be interpreted as negative int32
    }

    static inline void Load8Val(const unsigned short *ptr, XMMReg4Double &low,
                                XMMReg4Double &high)
    {
        low.nsLoad4Val(ptr);
        high.nsLoad4Val(ptr + 4);
    }

    static inline XMMReg4Double Load4Val(const double *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const double *ptr)
    {
        ymm = _mm256_loadu_pd(ptr);
    }

    static inline void Load8Val(const double *ptr, XMMReg4Double &low,
                                XMMReg4Double &high)
    {
        low.nsLoad4Val(ptr);
        high.nsLoad4Val(ptr + 4);
    }

    static inline XMMReg4Double Load4ValAligned(const double *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4ValAligned(ptr);
        return reg;
    }

    inline void nsLoad4ValAligned(const double *ptr)
    {
        ymm = _mm256_load_pd(ptr);
    }

    static inline XMMReg4Double Load4Val(const float *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const float *ptr)
    {
        ymm = _mm256_cvtps_pd(_mm_loadu_ps(ptr));
    }

    static inline void Load8Val(const float *ptr, XMMReg4Double &low,
                                XMMReg4Double &high)
    {
        low.nsLoad4Val(ptr);
        high.nsLoad4Val(ptr + 4);
    }

    static inline XMMReg4Double Load4Val(const int *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const int *ptr)
    {
        ymm = _mm256_cvtepi32_pd(
            _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr)));
    }

    static inline void Load8Val(const int *ptr, XMMReg4Double &low,
                                XMMReg4Double &high)
    {
        low.nsLoad4Val(ptr);
        high.nsLoad4Val(ptr + 4);
    }

    static inline XMMReg4Double Equals(const XMMReg4Double &expr1,
                                       const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_EQ_OQ);
        return reg;
    }

    static inline XMMReg4Double NotEquals(const XMMReg4Double &expr1,
                                          const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_NEQ_OQ);
        return reg;
    }

    static inline XMMReg4Double Greater(const XMMReg4Double &expr1,
                                        const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_GT_OQ);
        return reg;
    }

    static inline XMMReg4Double And(const XMMReg4Double &expr1,
                                    const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_and_pd(expr1.ymm, expr2.ymm);
        return reg;
    }

    static inline XMMReg4Double Ternary(const XMMReg4Double &cond,
                                        const XMMReg4Double &true_expr,
                                        const XMMReg4Double &false_expr)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_or_pd(_mm256_and_pd(cond.ymm, true_expr.ymm),
                               _mm256_andnot_pd(cond.ymm, false_expr.ymm));
        return reg;
    }

    static inline XMMReg4Double Min(const XMMReg4Double &expr1,
                                    const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_min_pd(expr1.ymm, expr2.ymm);
        return reg;
    }

    inline XMMReg4Double &operator=(const XMMReg4Double &other)
    {
        ymm = other.ymm;
        return *this;
    }

    inline XMMReg4Double &operator+=(const XMMReg4Double &other)
    {
        ymm = _mm256_add_pd(ymm, other.ymm);
        return *this;
    }

    inline XMMReg4Double &operator*=(const XMMReg4Double &other)
    {
        ymm = _mm256_mul_pd(ymm, other.ymm);
        return *this;
    }

    inline XMMReg4Double operator+(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_add_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator-(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_sub_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator*(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_mul_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator/(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_div_pd(ymm, other.ymm);
        return ret;
    }

    void AddToLow(const XMMReg2Double &other)
    {
        __m256d ymm2 = _mm256_setzero_pd();
        ymm2 = _mm256_insertf128_pd(ymm2, other.xmm, 0);
        ymm = _mm256_add_pd(ymm, ymm2);
    }

    inline double GetHorizSum() const
    {
        __m256d ymm_tmp1, ymm_tmp2;
        ymm_tmp2 = _mm256_hadd_pd(ymm, ymm);
        ymm_tmp1 = _mm256_permute2f128_pd(ymm_tmp2, ymm_tmp2, 1);
        ymm_tmp1 = _mm256_add_pd(ymm_tmp1, ymm_tmp2);
        return _mm_cvtsd_f64(_mm256_castpd256_pd128(ymm_tmp1));
    }

    inline XMMReg4Double approx_inv_sqrt(const XMMReg4Double &one,
                                         const XMMReg4Double &half) const
    {
        __m256d reg = ymm;
        __m256d reg_half = _mm256_mul_pd(reg, half.ymm);
        // Compute rough approximation of 1 / sqrt(b) with _mm_rsqrt_ps
        reg = _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(reg)));
        // And perform one step of Newton-Raphson approximation to improve it
        // approx_inv_sqrt_x = approx_inv_sqrt_x*(1.5 -
        // 0.5*x*approx_inv_sqrt_x*approx_inv_sqrt_x);
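        // (This is one Newton step on f(y) = 1/y^2 - x, i.e.
        // y' = y*(1.5 - 0.5*x*y*y); it roughly doubles the ~12 correct bits
        // of the rsqrt estimate.)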
        const __m256d one_and_a_half = _mm256_add_pd(one.ymm, half.ymm);
        reg = _mm256_mul_pd(
            reg,
            _mm256_sub_pd(one_and_a_half,
                          _mm256_mul_pd(reg_half, _mm256_mul_pd(reg, reg))));
        XMMReg4Double ret;
        ret.ymm = reg;
        return ret;
    }

    inline XMMReg4Float cast_to_float() const
    {
        XMMReg4Float ret;
        ret.xmm = _mm256_cvtpd_ps(ymm);
        return ret;
    }

    inline void Store4Val(unsigned char *ptr) const
    {
        __m128i xmm_i =
            _mm256_cvttpd_epi32(_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
        // xmm_i = _mm_packs_epi32(xmm_i, xmm_i);   // Pack int32 to int16
        // xmm_i = _mm_packus_epi16(xmm_i, xmm_i);  // Pack int16 to uint8
        xmm_i =
            _mm_shuffle_epi8(xmm_i, _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) |
                                                      (12 << 24)));  // SSSE3
        GDALCopyXMMToInt32(xmm_i, reinterpret_cast<GInt32 *>(ptr));
    }

    inline void Store4Val(unsigned short *ptr) const
    {
        __m128i xmm_i =
            _mm256_cvttpd_epi32(_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
        xmm_i = _mm_packus_epi32(xmm_i, xmm_i);  // Pack uint32 to uint16
        GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64 *>(ptr));
    }

    inline void Store4Val(float *ptr) const
    {
        _mm_storeu_ps(ptr, _mm256_cvtpd_ps(ymm));
    }

    inline void Store4Val(double *ptr) const
    {
        _mm256_storeu_pd(ptr, ymm);
    }

    inline void StoreMask(unsigned char *ptr) const
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr),
                            _mm256_castpd_si256(ymm));
    }
};
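
/* Illustrative sketch (hypothetical helper, not in GDAL): approximate
 * 1 / sqrt(x) for four doubles. The one/half constants are parameters of
 * approx_inv_sqrt() so that callers can hoist them out of loops.
 */
static inline XMMReg4Double GDALExampleInvSqrt(const XMMReg4Double &x)
{
    const XMMReg4Double one = XMMReg4Double::Set1(1.0);
    const XMMReg4Double half = XMMReg4Double::Set1(0.5);
    return x.approx_inv_sqrt(one, half);  // rsqrt estimate + one Newton step
}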

inline XMMReg4Double XMMReg4Float::cast_to_double() const
{
    XMMReg4Double ret;
    ret.ymm = _mm256_cvtps_pd(xmm);
    return ret;
}

inline XMMReg4Double XMMReg4Int::cast_to_double() const
{
    XMMReg4Double ret;
    ret.ymm = _mm256_cvtepi32_pd(xmm);
    return ret;
}

#else

class XMMReg4Double
{
  public:
    XMMReg2Double low, high;

    XMMReg4Double() : low(XMMReg2Double()), high(XMMReg2Double())
    {
    }

    XMMReg4Double(const XMMReg4Double &other) : low(other.low), high(other.high)
    {
    }

    static inline XMMReg4Double Zero()
    {
        XMMReg4Double reg;
        reg.low.Zeroize();
        reg.high.Zeroize();
        return reg;
    }

    static inline XMMReg4Double Set1(double d)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Set1(d);
        reg.high = XMMReg2Double::Set1(d);
        return reg;
    }

    static inline XMMReg4Double Load1ValHighAndLow(const double *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad1ValHighAndLow(ptr);
        reg.high = reg.low;
        return reg;
    }

    static inline XMMReg4Double Load4Val(const unsigned char *ptr)
    {
        XMMReg4Double reg;
        XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
        return reg;
    }

    static inline void Load8Val(const unsigned char *ptr, XMMReg4Double &low,
                                XMMReg4Double &high)
    {
        low = Load4Val(ptr);
        high = Load4Val(ptr + 4);
    }

    static inline XMMReg4Double Load4Val(const short *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr + 2);
        return reg;
    }

    static inline void Load8Val(const short *ptr, XMMReg4Double &low,
                                XMMReg4Double &high)
    {
        low = Load4Val(ptr);
        high = Load4Val(ptr + 4);
    }

    static inline XMMReg4Double Load4Val(const unsigned short *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr + 2);
        return reg;
    }

    static inline void Load8Val(const unsigned short *ptr, XMMReg4Double &low,
                                XMMReg4Double &high)
    {
        low = Load4Val(ptr);
        high = Load4Val(ptr + 4);
    }

    static inline XMMReg4Double Load4Val(const int *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr + 2);
        return reg;
    }

    static inline void Load8Val(const int *ptr, XMMReg4Double &low,
                                XMMReg4Double &high)
    {
        low = Load4Val(ptr);
        high = Load4Val(ptr + 4);
    }

    static inline XMMReg4Double Load4Val(const double *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr + 2);
        return reg;
    }

    static inline void Load8Val(const double *ptr, XMMReg4Double &low,
                                XMMReg4Double &high)
    {
        low = Load4Val(ptr);
        high = Load4Val(ptr + 4);
    }

    static inline XMMReg4Double Load4ValAligned(const double *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2ValAligned(ptr);
        reg.high.nsLoad2ValAligned(ptr + 2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const float *ptr)
    {
        XMMReg4Double reg;
        XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
        return reg;
    }

    static inline void Load8Val(const float *ptr, XMMReg4Double &low,
                                XMMReg4Double &high)
    {
        low = Load4Val(ptr);
        high = Load4Val(ptr + 4);
    }

    static inline XMMReg4Double Equals(const XMMReg4Double &expr1,
                                       const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Equals(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Equals(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double NotEquals(const XMMReg4Double &expr1,
                                          const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::NotEquals(expr1.low, expr2.low);
        reg.high = XMMReg2Double::NotEquals(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double Greater(const XMMReg4Double &expr1,
                                        const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Greater(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Greater(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double And(const XMMReg4Double &expr1,
                                    const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::And(expr1.low, expr2.low);
        reg.high = XMMReg2Double::And(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double Ternary(const XMMReg4Double &cond,
                                        const XMMReg4Double &true_expr,
                                        const XMMReg4Double &false_expr)
    {
        XMMReg4Double reg;
        reg.low =
            XMMReg2Double::Ternary(cond.low, true_expr.low, false_expr.low);
        reg.high =
            XMMReg2Double::Ternary(cond.high, true_expr.high, false_expr.high);
        return reg;
    }

    static inline XMMReg4Double Min(const XMMReg4Double &expr1,
                                    const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Min(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Min(expr1.high, expr2.high);
        return reg;
    }

    inline XMMReg4Double &operator=(const XMMReg4Double &other)
    {
        low = other.low;
        high = other.high;
        return *this;
    }

    inline XMMReg4Double &operator+=(const XMMReg4Double &other)
    {
        low += other.low;
        high += other.high;
        return *this;
    }

    inline XMMReg4Double &operator*=(const XMMReg4Double &other)
    {
        low *= other.low;
        high *= other.high;
        return *this;
    }

    inline XMMReg4Double operator+(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.low = low + other.low;
        ret.high = high + other.high;
        return ret;
    }

    inline XMMReg4Double operator-(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.low = low - other.low;
        ret.high = high - other.high;
        return ret;
    }

    inline XMMReg4Double operator*(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.low = low * other.low;
        ret.high = high * other.high;
        return ret;
    }

    inline XMMReg4Double operator/(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.low = low / other.low;
        ret.high = high / other.high;
        return ret;
    }

    void AddToLow(const XMMReg2Double &other)
    {
        low += other;
    }

    inline double GetHorizSum() const
    {
        return (low + high).GetHorizSum();
    }

#if !defined(USE_SSE2_EMULATION)
    inline XMMReg4Double approx_inv_sqrt(const XMMReg4Double &one,
                                         const XMMReg4Double &half) const
    {
        __m128d reg0 = low.xmm;
        __m128d reg1 = high.xmm;
        __m128d reg0_half = _mm_mul_pd(reg0, half.low.xmm);
        __m128d reg1_half = _mm_mul_pd(reg1, half.low.xmm);
        // Compute rough approximation of 1 / sqrt(b) with _mm_rsqrt_ps
        reg0 = _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(reg0)));
        reg1 = _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(reg1)));
        // And perform one step of Newton-Raphson approximation to improve it
        // approx_inv_sqrt_x = approx_inv_sqrt_x*(1.5 -
        // 0.5*x*approx_inv_sqrt_x*approx_inv_sqrt_x);
        const __m128d one_and_a_half = _mm_add_pd(one.low.xmm, half.low.xmm);
        reg0 = _mm_mul_pd(
            reg0, _mm_sub_pd(one_and_a_half,
                             _mm_mul_pd(reg0_half, _mm_mul_pd(reg0, reg0))));
        reg1 = _mm_mul_pd(
            reg1, _mm_sub_pd(one_and_a_half,
                             _mm_mul_pd(reg1_half, _mm_mul_pd(reg1, reg1))));
        XMMReg4Double ret;
        ret.low.xmm = reg0;
        ret.high.xmm = reg1;
        return ret;
    }

    inline XMMReg4Float cast_to_float() const
    {
        XMMReg4Float ret;
        ret.xmm = _mm_castsi128_ps(
            _mm_unpacklo_epi64(_mm_castps_si128(_mm_cvtpd_ps(low.xmm)),
                               _mm_castps_si128(_mm_cvtpd_ps(high.xmm))));
        return ret;
    }
#endif

    inline void Store4Val(unsigned char *ptr) const
    {
#ifdef USE_SSE2_EMULATION
        low.Store2Val(ptr);
        high.Store2Val(ptr + 2);
#else
        __m128i tmpLow = _mm_cvttpd_epi32(_mm_add_pd(
            low.xmm,
            _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
        __m128i tmpHigh = _mm_cvttpd_epi32(_mm_add_pd(
            high.xmm,
            _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
        auto tmp = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmpLow),
                                                   _mm_castsi128_ps(tmpHigh),
                                                   _MM_SHUFFLE(1, 0, 1, 0)));
        tmp = _mm_packs_epi32(tmp, tmp);
        tmp = _mm_packus_epi16(tmp, tmp);
        GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32 *>(ptr));
#endif
    }

    inline void Store4Val(unsigned short *ptr) const
    {
#if 1
        low.Store2Val(ptr);
        high.Store2Val(ptr + 2);
#else
        __m128i xmm0 = _mm_cvtpd_epi32(low.xmm);
        __m128i xmm1 = _mm_cvtpd_epi32(high.xmm);
        xmm0 = _mm_or_si128(xmm0, _mm_slli_si128(xmm1, 8));
#if __SSE4_1__
        xmm0 = _mm_packus_epi32(xmm0, xmm0);  // Pack uint32 to uint16
#else
        xmm0 = _mm_add_epi32(xmm0, _mm_set1_epi32(-32768));
        xmm0 = _mm_packs_epi32(xmm0, xmm0);
        xmm0 = _mm_sub_epi16(xmm0, _mm_set1_epi16(-32768));
#endif
        GDALCopyXMMToInt64(xmm0, (GInt64 *)ptr);
#endif
    }

    inline void Store4Val(float *ptr) const
    {
        low.Store2Val(ptr);
        high.Store2Val(ptr + 2);
    }

    inline void Store4Val(double *ptr) const
    {
        low.Store2Val(ptr);
        high.Store2Val(ptr + 2);
    }

    inline void StoreMask(unsigned char *ptr) const
    {
        low.StoreMask(ptr);
        high.StoreMask(ptr + 16);
    }
};
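
/* Illustrative sketch (hypothetical helper, not in GDAL): dot product of two
 * double arrays, four lanes at a time. Assumes nCount is a multiple of 4.
 */
static inline double GDALExampleDotProduct(const double *padfX,
                                           const double *padfY, int nCount)
{
    XMMReg4Double acc = XMMReg4Double::Zero();
    for (int i = 0; i < nCount; i += 4)
        acc += XMMReg4Double::Load4Val(padfX + i) *
               XMMReg4Double::Load4Val(padfY + i);
    return acc.GetHorizSum();  // reduce the 4 partial sums
}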

#if !defined(USE_SSE2_EMULATION)
inline XMMReg4Double XMMReg4Float::cast_to_double() const
{
    XMMReg4Double ret;
    ret.low.xmm = _mm_cvtps_pd(xmm);
    ret.high.xmm = _mm_cvtps_pd(
        _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(xmm), 8)));
    return ret;
}

inline XMMReg4Double XMMReg4Int::cast_to_double() const
{
    XMMReg4Double ret;
    ret.low.xmm = _mm_cvtepi32_pd(xmm);
    ret.high.xmm = _mm_cvtepi32_pd(_mm_srli_si128(xmm, 8));
    return ret;
}
#endif

#endif

#endif /* #ifndef DOXYGEN_SKIP */

#endif /* GDALSSE_PRIV_H_INCLUDED */