#ifndef GDALSSE_PRIV_H_INCLUDED
#define GDALSSE_PRIV_H_INCLUDED

#include "cpl_port.h"

#include <string.h>

/* We restrict to 64bit processors because they are guaranteed to have SSE2, */
/* or to builds that explicitly claim SSE2 support. */
#if (defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2)) && \
    !defined(USE_SSE2_EMULATION)

#ifdef USE_NEON_OPTIMIZATIONS
#include "include_sse2neon.h"
#else
/* Requires SSE2 */
#include <emmintrin.h>

#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
#include <smmintrin.h>
#endif
#endif

#include "gdal_priv_templates.hpp"

static inline __m128i GDALCopyInt16ToXMM(const void *ptr)
{
    unsigned short s;
    memcpy(&s, ptr, 2);
    return _mm_cvtsi32_si128(s);
}

static inline __m128i GDALCopyInt32ToXMM(const void *ptr)
{
    GInt32 i;
    memcpy(&i, ptr, 4);
    return _mm_cvtsi32_si128(i);
}

static inline __m128i GDALCopyInt64ToXMM(const void *ptr)
{
#if defined(__i386__) || defined(_M_IX86)
    return _mm_loadl_epi64(static_cast<const __m128i *>(ptr));
#else
    GInt64 i;
    memcpy(&i, ptr, 8);
    return _mm_cvtsi64_si128(i);
#endif
}

#ifndef GDALCopyXMMToInt16_defined
#define GDALCopyXMMToInt16_defined

static inline void GDALCopyXMMToInt16(const __m128i xmm, void *pDest)
{
    GInt16 i = static_cast<GInt16>(_mm_extract_epi16(xmm, 0));
    memcpy(pDest, &i, 2);
}
#endif
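
/* Note: the copy helpers above go through memcpy() rather than dereferencing
 * a casted pointer, so that unaligned source buffers are handled safely and
 * without violating strict aliasing; compilers optimize the fixed-size
 * memcpy() down to a single scalar load feeding the XMM register. */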

class XMMReg4Int;

class XMMReg4Double;

class XMMReg4Float
{
  public:
    __m128 xmm;

    XMMReg4Float()
#if !defined(_MSC_VER)
        : xmm(_mm_undefined_ps())
#endif
    {
    }

    XMMReg4Float(const XMMReg4Float &other) : xmm(other.xmm)
    {
    }

    static inline XMMReg4Float Zero()
    {
        XMMReg4Float reg;
        reg.Zeroize();
        return reg;
    }

    static inline XMMReg4Float Set1(float f)
    {
        XMMReg4Float reg;
        reg.xmm = _mm_set1_ps(f);
        return reg;
    }

    static inline XMMReg4Float Load4Val(const float *ptr)
    {
        XMMReg4Float reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    static inline XMMReg4Float Load4Val(const unsigned char *ptr)
    {
        XMMReg4Float reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    static inline XMMReg4Float Load4Val(const short *ptr)
    {
        XMMReg4Float reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    static inline XMMReg4Float Load4Val(const unsigned short *ptr)
    {
        XMMReg4Float reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    static inline XMMReg4Float Load4Val(const int *ptr)
    {
        XMMReg4Float reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    static inline XMMReg4Float Equals(const XMMReg4Float &expr1,
                                      const XMMReg4Float &expr2)
    {
        XMMReg4Float reg;
        reg.xmm = _mm_cmpeq_ps(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg4Float NotEquals(const XMMReg4Float &expr1,
                                         const XMMReg4Float &expr2)
    {
        XMMReg4Float reg;
        reg.xmm = _mm_cmpneq_ps(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg4Float Lesser(const XMMReg4Float &expr1,
                                      const XMMReg4Float &expr2)
    {
        XMMReg4Float reg;
        reg.xmm = _mm_cmplt_ps(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg4Float Greater(const XMMReg4Float &expr1,
                                       const XMMReg4Float &expr2)
    {
        XMMReg4Float reg;
        reg.xmm = _mm_cmpgt_ps(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg4Float And(const XMMReg4Float &expr1,
                                   const XMMReg4Float &expr2)
    {
        XMMReg4Float reg;
        reg.xmm = _mm_and_ps(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg4Float Ternary(const XMMReg4Float &cond,
                                       const XMMReg4Float &true_expr,
                                       const XMMReg4Float &false_expr)
    {
        XMMReg4Float reg;
        reg.xmm = _mm_or_ps(_mm_and_ps(cond.xmm, true_expr.xmm),
                            _mm_andnot_ps(cond.xmm, false_expr.xmm));
        return reg;
    }
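
    /* Ternary() implements a branchless select: cond is expected to be a
     * full-width mask (all bits set or all bits clear per lane, as produced
     * by Equals()/Greater()/...), so (cond & true) | (~cond & false) picks
     * one operand per lane without a branch. */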

    static inline XMMReg4Float Min(const XMMReg4Float &expr1,
                                   const XMMReg4Float &expr2)
    {
        XMMReg4Float reg;
        reg.xmm = _mm_min_ps(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg4Float Max(const XMMReg4Float &expr1,
                                   const XMMReg4Float &expr2)
    {
        XMMReg4Float reg;
        reg.xmm = _mm_max_ps(expr1.xmm, expr2.xmm);
        return reg;
    }

    inline void nsLoad4Val(const float *ptr)
    {
        xmm = _mm_loadu_ps(ptr);
    }

    static inline void Load16Val(const float *ptr, XMMReg4Float &r0,
                                 XMMReg4Float &r1, XMMReg4Float &r2,
                                 XMMReg4Float &r3)
    {
        r0.nsLoad4Val(ptr);
        r1.nsLoad4Val(ptr + 4);
        r2.nsLoad4Val(ptr + 8);
        r3.nsLoad4Val(ptr + 12);
    }

    inline void nsLoad4Val(const int *ptr)
    {
        const __m128i xmm_i =
            _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
        xmm = _mm_cvtepi32_ps(xmm_i);
    }

    static inline void Load16Val(const int *ptr, XMMReg4Float &r0,
                                 XMMReg4Float &r1, XMMReg4Float &r2,
                                 XMMReg4Float &r3)
    {
        r0.nsLoad4Val(ptr);
        r1.nsLoad4Val(ptr + 4);
        r2.nsLoad4Val(ptr + 8);
        r3.nsLoad4Val(ptr + 12);
    }

    static inline __m128i cvtepu8_epi32(__m128i x)
    {
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
        return _mm_cvtepu8_epi32(x);
#else
        return _mm_unpacklo_epi16(_mm_unpacklo_epi8(x, _mm_setzero_si128()),
                                  _mm_setzero_si128());
#endif
    }
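
    /* Without SSE4.1, zero-extension of the 4 low bytes is done with two
     * interleaves against zero: bytes -> uint16 lanes, then uint16 ->
     * uint32 lanes, which is equivalent to _mm_cvtepu8_epi32(). */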

    inline void nsLoad4Val(const unsigned char *ptr)
    {
        const __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
        xmm = _mm_cvtepi32_ps(cvtepu8_epi32(xmm_i));
    }

    static inline void Load8Val(const unsigned char *ptr, XMMReg4Float &r0,
                                XMMReg4Float &r1)
    {
        const __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        r0.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(xmm_i));
        r1.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(_mm_srli_si128(xmm_i, 4)));
    }

    static inline void Load16Val(const unsigned char *ptr, XMMReg4Float &r0,
                                 XMMReg4Float &r1, XMMReg4Float &r2,
                                 XMMReg4Float &r3)
    {
        const __m128i xmm_i =
            _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
        r0.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(xmm_i));
        r1.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(_mm_srli_si128(xmm_i, 4)));
        r2.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(_mm_srli_si128(xmm_i, 8)));
        r3.xmm = _mm_cvtepi32_ps(cvtepu8_epi32(_mm_srli_si128(xmm_i, 12)));
    }

    static inline __m128i cvtepi16_epi32(__m128i x)
    {
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
        return _mm_cvtepi16_epi32(x);
#else
        return _mm_srai_epi32(_mm_unpacklo_epi16(x, x), 16);
#endif
    }
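
    /* The unpack-with-self duplicates each int16 into both halves of an
     * int32 lane; the arithmetic shift right by 16 then rebuilds the
     * sign-extended value, matching _mm_cvtepi16_epi32() on pre-SSE4.1
     * hardware. */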

    inline void nsLoad4Val(const short *ptr)
    {
        const __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        xmm = _mm_cvtepi32_ps(cvtepi16_epi32(xmm_i));
    }

    static inline void Load8Val(const short *ptr, XMMReg4Float &r0,
                                XMMReg4Float &r1)
    {
        const __m128i xmm_i =
            _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
        r0.xmm = _mm_cvtepi32_ps(cvtepi16_epi32(xmm_i));
        r1.xmm = _mm_cvtepi32_ps(cvtepi16_epi32(_mm_srli_si128(xmm_i, 8)));
    }

    static inline void Load16Val(const short *ptr, XMMReg4Float &r0,
                                 XMMReg4Float &r1, XMMReg4Float &r2,
                                 XMMReg4Float &r3)
    {
        Load8Val(ptr, r0, r1);
        Load8Val(ptr + 8, r2, r3);
    }

    static inline __m128i cvtepu16_epi32(__m128i x)
    {
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
        return _mm_cvtepu16_epi32(x);
#else
        return _mm_unpacklo_epi16(x, _mm_setzero_si128());
#endif
    }

    inline void nsLoad4Val(const unsigned short *ptr)
    {
        const __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        xmm = _mm_cvtepi32_ps(cvtepu16_epi32(xmm_i));
    }

    static inline void Load8Val(const unsigned short *ptr, XMMReg4Float &r0,
                                XMMReg4Float &r1)
    {
        const __m128i xmm_i =
            _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
        r0.xmm = _mm_cvtepi32_ps(cvtepu16_epi32(xmm_i));
        r1.xmm = _mm_cvtepi32_ps(cvtepu16_epi32(_mm_srli_si128(xmm_i, 8)));
    }

    static inline void Load16Val(const unsigned short *ptr, XMMReg4Float &r0,
                                 XMMReg4Float &r1, XMMReg4Float &r2,
                                 XMMReg4Float &r3)
    {
        Load8Val(ptr, r0, r1);
        Load8Val(ptr + 8, r2, r3);
    }

    inline void Zeroize()
    {
        xmm = _mm_setzero_ps();
    }

    inline XMMReg4Float &operator=(const XMMReg4Float &other)
    {
        xmm = other.xmm;
        return *this;
    }

    inline XMMReg4Float &operator+=(const XMMReg4Float &other)
    {
        xmm = _mm_add_ps(xmm, other.xmm);
        return *this;
    }

    inline XMMReg4Float &operator-=(const XMMReg4Float &other)
    {
        xmm = _mm_sub_ps(xmm, other.xmm);
        return *this;
    }

    inline XMMReg4Float &operator*=(const XMMReg4Float &other)
    {
        xmm = _mm_mul_ps(xmm, other.xmm);
        return *this;
    }

    inline XMMReg4Float operator+(const XMMReg4Float &other) const
    {
        XMMReg4Float ret;
        ret.xmm = _mm_add_ps(xmm, other.xmm);
        return ret;
    }

    inline XMMReg4Float operator-(const XMMReg4Float &other) const
    {
        XMMReg4Float ret;
        ret.xmm = _mm_sub_ps(xmm, other.xmm);
        return ret;
    }

    inline XMMReg4Float operator*(const XMMReg4Float &other) const
    {
        XMMReg4Float ret;
        ret.xmm = _mm_mul_ps(xmm, other.xmm);
        return ret;
    }

    inline XMMReg4Float operator/(const XMMReg4Float &other) const
    {
        XMMReg4Float ret;
        ret.xmm = _mm_div_ps(xmm, other.xmm);
        return ret;
    }

    inline XMMReg4Float inverse() const
    {
        XMMReg4Float ret;
        ret.xmm = _mm_div_ps(_mm_set1_ps(1.0f), xmm);
        return ret;
    }

    inline XMMReg4Int truncate_to_int() const;

    inline XMMReg4Double cast_to_double() const;

    inline void Store4Val(float *ptr) const
    {
        _mm_storeu_ps(ptr, xmm);
    }

    inline void Store4ValAligned(float *ptr) const
    {
        _mm_store_ps(ptr, xmm);
    }

    inline operator float() const
    {
        return _mm_cvtss_f32(xmm);
    }
};
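
/* Typical usage pattern (illustrative sketch only; the buffer names are
 * hypothetical): widen 4 input bytes to float, scale them, store 4 floats:
 *
 *   XMMReg4Float v = XMMReg4Float::Load4Val(pabySrc);  // 4 x uint8 -> float
 *   v *= XMMReg4Float::Set1(2.0f);
 *   v.Store4Val(pafDst);                               // unaligned store
 */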

class XMMReg4Int
{
  public:
    __m128i xmm;

    XMMReg4Int()
#if !defined(_MSC_VER)
        : xmm(_mm_undefined_si128())
#endif
    {
    }

    XMMReg4Int(const XMMReg4Int &other) : xmm(other.xmm)
    {
    }

    static inline XMMReg4Int Zero()
    {
        XMMReg4Int reg;
        reg.xmm = _mm_setzero_si128();
        return reg;
    }

    static inline XMMReg4Int Set1(int i)
    {
        XMMReg4Int reg;
        reg.xmm = _mm_set1_epi32(i);
        return reg;
    }

    static inline XMMReg4Int Load4Val(const int *ptr)
    {
        XMMReg4Int reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const int *ptr)
    {
        xmm = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
    }

    static inline XMMReg4Int Equals(const XMMReg4Int &expr1,
                                    const XMMReg4Int &expr2)
    {
        XMMReg4Int reg;
        reg.xmm = _mm_cmpeq_epi32(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg4Int Ternary(const XMMReg4Int &cond,
                                     const XMMReg4Int &true_expr,
                                     const XMMReg4Int &false_expr)
    {
        XMMReg4Int reg;
        reg.xmm = _mm_or_si128(_mm_and_si128(cond.xmm, true_expr.xmm),
                               _mm_andnot_si128(cond.xmm, false_expr.xmm));
        return reg;
    }

    inline XMMReg4Int &operator+=(const XMMReg4Int &other)
    {
        xmm = _mm_add_epi32(xmm, other.xmm);
        return *this;
    }

    inline XMMReg4Int &operator-=(const XMMReg4Int &other)
    {
        xmm = _mm_sub_epi32(xmm, other.xmm);
        return *this;
    }

    inline XMMReg4Int operator+(const XMMReg4Int &other) const
    {
        XMMReg4Int ret;
        ret.xmm = _mm_add_epi32(xmm, other.xmm);
        return ret;
    }

    inline XMMReg4Int operator-(const XMMReg4Int &other) const
    {
        XMMReg4Int ret;
        ret.xmm = _mm_sub_epi32(xmm, other.xmm);
        return ret;
    }

    XMMReg4Double cast_to_double() const;

    XMMReg4Float to_float() const
    {
        XMMReg4Float ret;
        ret.xmm = _mm_cvtepi32_ps(xmm);
        return ret;
    }
};

inline XMMReg4Int XMMReg4Float::truncate_to_int() const
{
    XMMReg4Int ret;
    ret.xmm = _mm_cvttps_epi32(xmm);
    return ret;
}

class XMMReg8Byte
{
  public:
    __m128i xmm;

    XMMReg8Byte()
#if !defined(_MSC_VER)
        : xmm(_mm_undefined_si128())
#endif
    {
    }

    XMMReg8Byte(const XMMReg8Byte &other) : xmm(other.xmm)
    {
    }

    static inline XMMReg8Byte Zero()
    {
        XMMReg8Byte reg;
        reg.xmm = _mm_setzero_si128();
        return reg;
    }

    static inline XMMReg8Byte Set1(char i)
    {
        XMMReg8Byte reg;
        reg.xmm = _mm_set1_epi8(i);
        return reg;
    }

    static inline XMMReg8Byte Equals(const XMMReg8Byte &expr1,
                                     const XMMReg8Byte &expr2)
    {
        XMMReg8Byte reg;
        reg.xmm = _mm_cmpeq_epi8(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg8Byte Or(const XMMReg8Byte &expr1,
                                 const XMMReg8Byte &expr2)
    {
        XMMReg8Byte reg;
        reg.xmm = _mm_or_si128(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg8Byte Ternary(const XMMReg8Byte &cond,
                                      const XMMReg8Byte &true_expr,
                                      const XMMReg8Byte &false_expr)
    {
        XMMReg8Byte reg;
        reg.xmm = _mm_or_si128(_mm_and_si128(cond.xmm, true_expr.xmm),
                               _mm_andnot_si128(cond.xmm, false_expr.xmm));
        return reg;
    }

    inline XMMReg8Byte operator+(const XMMReg8Byte &other) const
    {
        XMMReg8Byte ret;
        ret.xmm = _mm_add_epi8(xmm, other.xmm);
        return ret;
    }

    inline XMMReg8Byte operator-(const XMMReg8Byte &other) const
    {
        XMMReg8Byte ret;
        ret.xmm = _mm_sub_epi8(xmm, other.xmm);
        return ret;
    }

    static inline XMMReg8Byte Pack(const XMMReg4Int &r0, const XMMReg4Int &r1)
    {
        XMMReg8Byte reg;
        reg.xmm = _mm_packs_epi32(r0.xmm, r1.xmm);
        reg.xmm = _mm_packus_epi16(reg.xmm, reg.xmm);
        return reg;
    }
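
    /* Pack() narrows 8 int32 values to 8 bytes in two saturating steps:
     * _mm_packs_epi32() clamps to the int16 range, then _mm_packus_epi16()
     * clamps to [0, 255], so out-of-range inputs saturate rather than wrap. */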

    inline void Store8Val(unsigned char *ptr) const
    {
        GDALCopyXMMToInt64(xmm, reinterpret_cast<GInt64 *>(ptr));
    }
};

class XMMReg2Double
{
  public:
    __m128d xmm;

    XMMReg2Double()
#if !defined(_MSC_VER)
        : xmm(_mm_undefined_pd())
#endif
    {
    }

    XMMReg2Double(double val) : xmm(_mm_load_sd(&val))
    {
    }

    XMMReg2Double(const XMMReg2Double &other) : xmm(other.xmm)
    {
    }

    static inline XMMReg2Double Set1(double d)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_set1_pd(d);
        return reg;
    }

    static inline XMMReg2Double Zero()
    {
        XMMReg2Double reg;
        reg.Zeroize();
        return reg;
    }

    static inline XMMReg2Double Load1ValHighAndLow(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const float *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2ValAligned(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2ValAligned(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned char *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const short *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned short *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const int *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Equals(const XMMReg2Double &expr1,
                                       const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpeq_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double NotEquals(const XMMReg2Double &expr1,
                                          const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpneq_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double Greater(const XMMReg2Double &expr1,
                                        const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_cmpgt_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double And(const XMMReg2Double &expr1,
                                    const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_and_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    static inline XMMReg2Double Ternary(const XMMReg2Double &cond,
                                        const XMMReg2Double &true_expr,
                                        const XMMReg2Double &false_expr)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_or_pd(_mm_and_pd(cond.xmm, true_expr.xmm),
                            _mm_andnot_pd(cond.xmm, false_expr.xmm));
        return reg;
    }

    static inline XMMReg2Double Min(const XMMReg2Double &expr1,
                                    const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.xmm = _mm_min_pd(expr1.xmm, expr2.xmm);
        return reg;
    }

    inline void nsLoad1ValHighAndLow(const double *ptr)
    {
        xmm = _mm_load1_pd(ptr);
    }

    inline void nsLoad2Val(const double *ptr)
    {
        xmm = _mm_loadu_pd(ptr);
    }

    inline void nsLoad2ValAligned(const double *ptr)
    {
        xmm = _mm_load_pd(ptr);
    }

    inline void nsLoad2Val(const float *ptr)
    {
        xmm = _mm_cvtps_pd(_mm_castsi128_ps(GDALCopyInt64ToXMM(ptr)));
    }

    inline void nsLoad2Val(const int *ptr)
    {
        xmm = _mm_cvtepi32_pd(GDALCopyInt64ToXMM(ptr));
    }

    inline void nsLoad2Val(const unsigned char *ptr)
    {
        __m128i xmm_i = GDALCopyInt16ToXMM(ptr);
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    inline void nsLoad2Val(const short *ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
        xmm_i = _mm_cvtepi16_epi32(xmm_i);
#else
        /* Interleave the int16 with themselves, then shift right by 16 bits
         * to sign-extend. */
        xmm_i = _mm_unpacklo_epi16(xmm_i, xmm_i);
        xmm_i = _mm_srai_epi32(xmm_i, 16);
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    inline void nsLoad2Val(const unsigned short *ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
        xmm_i = _mm_cvtepu16_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
#endif
        xmm = _mm_cvtepi32_pd(xmm_i);
    }

    static inline void Load4Val(const unsigned char *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
#else
        xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
        xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
#endif
        low.xmm = _mm_cvtepi32_pd(xmm_i);
        high.xmm =
            _mm_cvtepi32_pd(_mm_shuffle_epi32(xmm_i, _MM_SHUFFLE(3, 2, 3, 2)));
    }

    static inline void Load4Val(const short *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const unsigned short *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const double *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const float *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        __m128 temp1 = _mm_loadu_ps(ptr);
        __m128 temp2 = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3, 2, 3, 2));
        low.xmm = _mm_cvtps_pd(temp1);
        high.xmm = _mm_cvtps_pd(temp2);
    }

    inline void Zeroize()
    {
        xmm = _mm_setzero_pd();
    }

    inline XMMReg2Double &operator=(const XMMReg2Double &other)
    {
        xmm = other.xmm;
        return *this;
    }

    inline XMMReg2Double &operator+=(const XMMReg2Double &other)
    {
        xmm = _mm_add_pd(xmm, other.xmm);
        return *this;
    }

    inline XMMReg2Double &operator*=(const XMMReg2Double &other)
    {
        xmm = _mm_mul_pd(xmm, other.xmm);
        return *this;
    }

    inline XMMReg2Double operator+(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_add_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator-(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_sub_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator*(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_mul_pd(xmm, other.xmm);
        return ret;
    }

    inline XMMReg2Double operator/(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.xmm = _mm_div_pd(xmm, other.xmm);
        return ret;
    }

    inline double GetHorizSum() const
    {
        __m128d xmm2;
        xmm2 = _mm_shuffle_pd(xmm, xmm,
                              1); /* transfer high word into low position */
        return _mm_cvtsd_f64(_mm_add_sd(xmm, xmm2));
    }
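
    /* GetHorizSum() swaps the two lanes with _mm_shuffle_pd() and adds the
     * swapped copy to the original, so lane 0 ends up holding low + high;
     * _mm_cvtsd_f64() then extracts that scalar. */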

    inline void Store2Val(double *ptr) const
    {
        _mm_storeu_pd(ptr, xmm);
    }

    inline void Store2ValAligned(double *ptr) const
    {
        _mm_store_pd(ptr, xmm);
    }

    inline void Store2Val(float *ptr) const
    {
        __m128i xmm_i = _mm_castps_si128(_mm_cvtpd_ps(xmm));
        GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64 *>(ptr));
    }

    inline void Store2Val(unsigned char *ptr) const
    {
        __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(
            xmm,
            _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
        tmp = _mm_packs_epi32(tmp, tmp);
        tmp = _mm_packus_epi16(tmp, tmp);
        GDALCopyXMMToInt16(tmp, reinterpret_cast<GInt16 *>(ptr));
    }
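
    /* Rounding to nearest is done by adding 0.5 before the truncating
     * conversion, which is correct for the non-negative values an unsigned
     * output implies; the two saturating packs then clamp to [0, 255]. */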

    inline void Store2Val(unsigned short *ptr) const
    {
        __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(
            xmm,
            _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
        tmp = _mm_shufflelo_epi16(tmp, 0 | (2 << 2));
        GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32 *>(ptr));
    }
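
    /* _mm_shufflelo_epi16(tmp, 0 | (2 << 2)) picks 16-bit words 0 and 2,
     * i.e. the low halves of the two int32 results, and packs them into the
     * first 4 bytes so a single 32-bit copy stores both uint16 values. */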

    inline void StoreMask(unsigned char *ptr) const
    {
        _mm_storeu_si128(reinterpret_cast<__m128i *>(ptr),
                         _mm_castpd_si128(xmm));
    }

    inline operator double() const
    {
        return _mm_cvtsd_f64(xmm);
    }
};

#else

#ifndef NO_WARN_USE_SSE2_EMULATION
#warning "Software emulation of SSE2 !"
#endif

class XMMReg2Double
{
  public:
    double low;
    double high;

    XMMReg2Double() = default;

    explicit XMMReg2Double(double val)
    {
        low = val;
        high = 0.0;
    }

    XMMReg2Double(const XMMReg2Double &other) : low(other.low), high(other.high)
    {
    }

    static inline XMMReg2Double Zero()
    {
        XMMReg2Double reg;
        reg.Zeroize();
        return reg;
    }

    static inline XMMReg2Double Set1(double d)
    {
        XMMReg2Double reg;
        reg.low = d;
        reg.high = d;
        return reg;
    }

    static inline XMMReg2Double Load1ValHighAndLow(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }

    static inline XMMReg2Double Equals(const XMMReg2Double &expr1,
                                       const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;

        if (expr1.low == expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high == expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }

    static inline XMMReg2Double NotEquals(const XMMReg2Double &expr1,
                                          const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;

        if (expr1.low != expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high != expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }

    static inline XMMReg2Double Greater(const XMMReg2Double &expr1,
                                        const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;

        if (expr1.low > expr2.low)
            memset(&(reg.low), 0xFF, sizeof(double));
        else
            reg.low = 0;

        if (expr1.high > expr2.high)
            memset(&(reg.high), 0xFF, sizeof(double));
        else
            reg.high = 0;

        return reg;
    }

    static inline XMMReg2Double And(const XMMReg2Double &expr1,
                                    const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        int low1[2], high1[2];
        int low2[2], high2[2];
        memcpy(low1, &expr1.low, sizeof(double));
        memcpy(high1, &expr1.high, sizeof(double));
        memcpy(low2, &expr2.low, sizeof(double));
        memcpy(high2, &expr2.high, sizeof(double));
        low1[0] &= low2[0];
        low1[1] &= low2[1];
        high1[0] &= high2[0];
        high1[1] &= high2[1];
        memcpy(&reg.low, low1, sizeof(double));
        memcpy(&reg.high, high1, sizeof(double));
        return reg;
    }
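
    /* The emulated And() reinterprets each double as two 32-bit ints via
     * memcpy(), ANDs them, and copies the bits back: bitwise logic on the
     * raw representation, exactly like the hardware _mm_and_pd(). */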

    static inline XMMReg2Double Ternary(const XMMReg2Double &cond,
                                        const XMMReg2Double &true_expr,
                                        const XMMReg2Double &false_expr)
    {
        XMMReg2Double reg;
        if (cond.low != 0)
            reg.low = true_expr.low;
        else
            reg.low = false_expr.low;
        if (cond.high != 0)
            reg.high = true_expr.high;
        else
            reg.high = false_expr.high;
        return reg;
    }

    static inline XMMReg2Double Min(const XMMReg2Double &expr1,
                                    const XMMReg2Double &expr2)
    {
        XMMReg2Double reg;
        reg.low = (expr1.low < expr2.low) ? expr1.low : expr2.low;
        reg.high = (expr1.high < expr2.high) ? expr1.high : expr2.high;
        return reg;
    }

    static inline XMMReg2Double Load2Val(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2ValAligned(const double *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2ValAligned(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const float *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned char *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const short *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const unsigned short *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    static inline XMMReg2Double Load2Val(const int *ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    inline void nsLoad1ValHighAndLow(const double *ptr)
    {
        low = ptr[0];
        high = ptr[0];
    }

    inline void nsLoad2Val(const double *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2ValAligned(const double *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const float *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const unsigned char *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const short *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const unsigned short *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const int *ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    static inline void Load4Val(const unsigned char *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const short *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const unsigned short *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const double *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    static inline void Load4Val(const float *ptr, XMMReg2Double &low,
                                XMMReg2Double &high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr + 2);
    }

    inline void Zeroize()
    {
        low = 0.0;
        high = 0.0;
    }

    inline XMMReg2Double &operator=(const XMMReg2Double &other)
    {
        low = other.low;
        high = other.high;
        return *this;
    }

    inline XMMReg2Double &operator+=(const XMMReg2Double &other)
    {
        low += other.low;
        high += other.high;
        return *this;
    }

    inline XMMReg2Double &operator*=(const XMMReg2Double &other)
    {
        low *= other.low;
        high *= other.high;
        return *this;
    }

    inline XMMReg2Double operator+(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.low = low + other.low;
        ret.high = high + other.high;
        return ret;
    }

    inline XMMReg2Double operator-(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.low = low - other.low;
        ret.high = high - other.high;
        return ret;
    }

    inline XMMReg2Double operator*(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.low = low * other.low;
        ret.high = high * other.high;
        return ret;
    }

    inline XMMReg2Double operator/(const XMMReg2Double &other) const
    {
        XMMReg2Double ret;
        ret.low = low / other.low;
        ret.high = high / other.high;
        return ret;
    }

    inline double GetHorizSum() const
    {
        return low + high;
    }

    inline void Store2Val(double *ptr) const
    {
        ptr[0] = low;
        ptr[1] = high;
    }

    inline void Store2ValAligned(double *ptr) const
    {
        ptr[0] = low;
        ptr[1] = high;
    }

    inline void Store2Val(float *ptr) const
    {
        ptr[0] = static_cast<float>(low);
        ptr[1] = static_cast<float>(high);
    }

    void Store2Val(unsigned char *ptr) const
    {
        ptr[0] = static_cast<unsigned char>(low + 0.5);
        ptr[1] = static_cast<unsigned char>(high + 0.5);
    }

    void Store2Val(unsigned short *ptr) const
    {
        ptr[0] = static_cast<GUInt16>(low + 0.5);
        ptr[1] = static_cast<GUInt16>(high + 0.5);
    }

    inline void StoreMask(unsigned char *ptr) const
    {
        memcpy(ptr, &low, 8);
        memcpy(ptr + 8, &high, 8);
    }

    inline operator double() const
    {
        return low;
    }
};

#endif /* defined(__x86_64) || defined(_M_X64) */

#if defined(__AVX__) && !defined(USE_SSE2_EMULATION)

#include <immintrin.h>

class XMMReg4Double
{
  public:
    __m256d ymm;

    XMMReg4Double() : ymm(_mm256_setzero_pd())
    {
    }

    XMMReg4Double(const XMMReg4Double &other) : ymm(other.ymm)
    {
    }

    static inline XMMReg4Double Zero()
    {
        XMMReg4Double reg;
        reg.Zeroize();
        return reg;
    }

    static inline XMMReg4Double Set1(double d)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_set1_pd(d);
        return reg;
    }

    inline void Zeroize()
    {
        ymm = _mm256_setzero_pd();
    }

    static inline XMMReg4Double Load1ValHighAndLow(const double *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }

    inline void nsLoad1ValHighAndLow(const double *ptr)
    {
        ymm = _mm256_set1_pd(*ptr);
    }

    static inline XMMReg4Double Load4Val(const unsigned char *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const unsigned char *ptr)
    {
        __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
        xmm_i = _mm_cvtepu8_epi32(xmm_i);
        ymm = _mm256_cvtepi32_pd(xmm_i);
    }

    static inline void Load8Val(const unsigned char *ptr, XMMReg4Double &low,
                                XMMReg4Double &high)
    {
        const __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        const __m128i xmm_i_low = _mm_cvtepu8_epi32(xmm_i);
        low.ymm = _mm256_cvtepi32_pd(xmm_i_low);
        const __m128i xmm_i_high = _mm_cvtepu8_epi32(_mm_srli_si128(xmm_i, 4));
        high.ymm = _mm256_cvtepi32_pd(xmm_i_high);
    }

    static inline XMMReg4Double Load4Val(const short *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const short *ptr)
    {
        __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        xmm_i = _mm_cvtepi16_epi32(xmm_i);
        ymm = _mm256_cvtepi32_pd(xmm_i);
    }

    static inline void Load8Val(const short *ptr, XMMReg4Double &low,
                                XMMReg4Double &high)
    {
        low.nsLoad4Val(ptr);
        high.nsLoad4Val(ptr + 4);
    }

    static inline XMMReg4Double Load4Val(const unsigned short *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const unsigned short *ptr)
    {
        __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
        xmm_i = _mm_cvtepu16_epi32(xmm_i);
        ymm = _mm256_cvtepi32_pd(xmm_i);
    }

    static inline void Load8Val(const unsigned short *ptr, XMMReg4Double &low,
                                XMMReg4Double &high)
    {
        low.nsLoad4Val(ptr);
        high.nsLoad4Val(ptr + 4);
    }

    static inline XMMReg4Double Load4Val(const double *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const double *ptr)
    {
        ymm = _mm256_loadu_pd(ptr);
    }

    static inline void Load8Val(const double *ptr, XMMReg4Double &low,
                                XMMReg4Double &high)
    {
        low.nsLoad4Val(ptr);
        high.nsLoad4Val(ptr + 4);
    }

    static inline XMMReg4Double Load4ValAligned(const double *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4ValAligned(ptr);
        return reg;
    }

    inline void nsLoad4ValAligned(const double *ptr)
    {
        ymm = _mm256_load_pd(ptr);
    }

    static inline XMMReg4Double Load4Val(const float *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const float *ptr)
    {
        ymm = _mm256_cvtps_pd(_mm_loadu_ps(ptr));
    }

    static inline void Load8Val(const float *ptr, XMMReg4Double &low,
                                XMMReg4Double &high)
    {
        low.nsLoad4Val(ptr);
        high.nsLoad4Val(ptr + 4);
    }

    static inline XMMReg4Double Load4Val(const int *ptr)
    {
        XMMReg4Double reg;
        reg.nsLoad4Val(ptr);
        return reg;
    }

    inline void nsLoad4Val(const int *ptr)
    {
        ymm = _mm256_cvtepi32_pd(
            _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr)));
    }

    static inline void Load8Val(const int *ptr, XMMReg4Double &low,
                                XMMReg4Double &high)
    {
        low.nsLoad4Val(ptr);
        high.nsLoad4Val(ptr + 4);
    }

    static inline XMMReg4Double Equals(const XMMReg4Double &expr1,
                                       const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_EQ_OQ);
        return reg;
    }

    static inline XMMReg4Double NotEquals(const XMMReg4Double &expr1,
                                          const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_NEQ_OQ);
        return reg;
    }

    static inline XMMReg4Double Greater(const XMMReg4Double &expr1,
                                        const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_GT_OQ);
        return reg;
    }

    static inline XMMReg4Double And(const XMMReg4Double &expr1,
                                    const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_and_pd(expr1.ymm, expr2.ymm);
        return reg;
    }

    static inline XMMReg4Double Ternary(const XMMReg4Double &cond,
                                        const XMMReg4Double &true_expr,
                                        const XMMReg4Double &false_expr)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_or_pd(_mm256_and_pd(cond.ymm, true_expr.ymm),
                               _mm256_andnot_pd(cond.ymm, false_expr.ymm));
        return reg;
    }

    static inline XMMReg4Double Min(const XMMReg4Double &expr1,
                                    const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.ymm = _mm256_min_pd(expr1.ymm, expr2.ymm);
        return reg;
    }

    inline XMMReg4Double &operator=(const XMMReg4Double &other)
    {
        ymm = other.ymm;
        return *this;
    }

    inline XMMReg4Double &operator+=(const XMMReg4Double &other)
    {
        ymm = _mm256_add_pd(ymm, other.ymm);
        return *this;
    }

    inline XMMReg4Double &operator*=(const XMMReg4Double &other)
    {
        ymm = _mm256_mul_pd(ymm, other.ymm);
        return *this;
    }

    inline XMMReg4Double operator+(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_add_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator-(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_sub_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator*(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_mul_pd(ymm, other.ymm);
        return ret;
    }

    inline XMMReg4Double operator/(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.ymm = _mm256_div_pd(ymm, other.ymm);
        return ret;
    }

    void AddToLow(const XMMReg2Double &other)
    {
        __m256d ymm2 = _mm256_setzero_pd();
        ymm2 = _mm256_insertf128_pd(ymm2, other.xmm, 0);
        ymm = _mm256_add_pd(ymm, ymm2);
    }

    inline double GetHorizSum() const
    {
        __m256d ymm_tmp1, ymm_tmp2;
        ymm_tmp2 = _mm256_hadd_pd(ymm, ymm);
        ymm_tmp1 = _mm256_permute2f128_pd(ymm_tmp2, ymm_tmp2, 1);
        ymm_tmp1 = _mm256_add_pd(ymm_tmp1, ymm_tmp2);
        return _mm_cvtsd_f64(_mm256_castpd256_pd128(ymm_tmp1));
    }

    inline XMMReg4Double approx_inv_sqrt(const XMMReg4Double &one,
                                         const XMMReg4Double &half) const
    {
        __m256d reg = ymm;
        __m256d reg_half = _mm256_mul_pd(reg, half.ymm);
        // Compute rough approximation of 1 / sqrt(b) with _mm_rsqrt_ps
        reg = _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(reg)));
        // And perform one step of Newton-Raphson approximation to improve it
        const __m256d one_and_a_half = _mm256_add_pd(one.ymm, half.ymm);
        reg = _mm256_mul_pd(
            reg,
            _mm256_sub_pd(one_and_a_half,
                          _mm256_mul_pd(reg_half, _mm256_mul_pd(reg, reg))));
        XMMReg4Double ret;
        ret.ymm = reg;
        return ret;
    }
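
    /* approx_inv_sqrt() seeds with the low-precision _mm_rsqrt_ps() estimate
     * (computed in float, then widened back to double) and refines it with
     * one Newton-Raphson step: x1 = x0 * (1.5 - 0.5 * a * x0 * x0), where a
     * is the input value; the one and half arguments just supply those
     * constants without reloading them. */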

    inline XMMReg4Float cast_to_float() const
    {
        XMMReg4Float ret;
        ret.xmm = _mm256_cvtpd_ps(ymm);
        return ret;
    }

    inline void Store4Val(unsigned char *ptr) const
    {
        __m128i xmm_i =
            _mm256_cvttpd_epi32(_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
        // Pick the low byte of each of the 4 int32, using SSSE3 _mm_shuffle_epi8
        xmm_i =
            _mm_shuffle_epi8(xmm_i, _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) |
                                                      (12 << 24)));
        GDALCopyXMMToInt32(xmm_i, reinterpret_cast<GInt32 *>(ptr));
    }

    inline void Store4Val(unsigned short *ptr) const
    {
        __m128i xmm_i =
            _mm256_cvttpd_epi32(_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
        xmm_i = _mm_packus_epi32(xmm_i, xmm_i);  // Pack int32 to uint16
        GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64 *>(ptr));
    }

    inline void Store4Val(float *ptr) const
    {
        _mm_storeu_ps(ptr, _mm256_cvtpd_ps(ymm));
    }

    inline void Store4Val(double *ptr) const
    {
        _mm256_storeu_pd(ptr, ymm);
    }

    inline void StoreMask(unsigned char *ptr) const
    {
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr),
                            _mm256_castpd_si256(ymm));
    }
};

inline XMMReg4Double XMMReg4Float::cast_to_double() const
{
    XMMReg4Double ret;
    ret.ymm = _mm256_cvtps_pd(xmm);
    return ret;
}

inline XMMReg4Double XMMReg4Int::cast_to_double() const
{
    XMMReg4Double ret;
    ret.ymm = _mm256_cvtepi32_pd(xmm);
    return ret;
}

#else

class XMMReg4Double
{
  public:
    XMMReg2Double low, high;

    XMMReg4Double() : low(XMMReg2Double()), high(XMMReg2Double())
    {
    }

    XMMReg4Double(const XMMReg4Double &other) : low(other.low), high(other.high)
    {
    }

    static inline XMMReg4Double Zero()
    {
        XMMReg4Double reg;
        reg.low.Zeroize();
        reg.high.Zeroize();
        return reg;
    }

    static inline XMMReg4Double Set1(double d)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Set1(d);
        reg.high = XMMReg2Double::Set1(d);
        return reg;
    }

    static inline XMMReg4Double Load1ValHighAndLow(const double *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad1ValHighAndLow(ptr);
        reg.high = reg.low;
        return reg;
    }

    static inline XMMReg4Double Load4Val(const unsigned char *ptr)
    {
        XMMReg4Double reg;
        XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
        return reg;
    }

    static inline void Load8Val(const unsigned char *ptr, XMMReg4Double &low,
                                XMMReg4Double &high)
    {
        low = Load4Val(ptr);
        high = Load4Val(ptr + 4);
    }

    static inline XMMReg4Double Load4Val(const short *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr + 2);
        return reg;
    }

    static inline void Load8Val(const short *ptr, XMMReg4Double &low,
                                XMMReg4Double &high)
    {
        low = Load4Val(ptr);
        high = Load4Val(ptr + 4);
    }

    static inline XMMReg4Double Load4Val(const unsigned short *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr + 2);
        return reg;
    }

    static inline void Load8Val(const unsigned short *ptr, XMMReg4Double &low,
                                XMMReg4Double &high)
    {
        low = Load4Val(ptr);
        high = Load4Val(ptr + 4);
    }

    static inline XMMReg4Double Load4Val(const int *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr + 2);
        return reg;
    }

    static inline void Load8Val(const int *ptr, XMMReg4Double &low,
                                XMMReg4Double &high)
    {
        low = Load4Val(ptr);
        high = Load4Val(ptr + 4);
    }

    static inline XMMReg4Double Load4Val(const double *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2Val(ptr);
        reg.high.nsLoad2Val(ptr + 2);
        return reg;
    }

    static inline void Load8Val(const double *ptr, XMMReg4Double &low,
                                XMMReg4Double &high)
    {
        low = Load4Val(ptr);
        high = Load4Val(ptr + 4);
    }

    static inline XMMReg4Double Load4ValAligned(const double *ptr)
    {
        XMMReg4Double reg;
        reg.low.nsLoad2ValAligned(ptr);
        reg.high.nsLoad2ValAligned(ptr + 2);
        return reg;
    }

    static inline XMMReg4Double Load4Val(const float *ptr)
    {
        XMMReg4Double reg;
        XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
        return reg;
    }

    static inline void Load8Val(const float *ptr, XMMReg4Double &low,
                                XMMReg4Double &high)
    {
        low = Load4Val(ptr);
        high = Load4Val(ptr + 4);
    }

    static inline XMMReg4Double Equals(const XMMReg4Double &expr1,
                                       const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Equals(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Equals(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double NotEquals(const XMMReg4Double &expr1,
                                          const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::NotEquals(expr1.low, expr2.low);
        reg.high = XMMReg2Double::NotEquals(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double Greater(const XMMReg4Double &expr1,
                                        const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Greater(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Greater(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double And(const XMMReg4Double &expr1,
                                    const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::And(expr1.low, expr2.low);
        reg.high = XMMReg2Double::And(expr1.high, expr2.high);
        return reg;
    }

    static inline XMMReg4Double Ternary(const XMMReg4Double &cond,
                                        const XMMReg4Double &true_expr,
                                        const XMMReg4Double &false_expr)
    {
        XMMReg4Double reg;
        reg.low =
            XMMReg2Double::Ternary(cond.low, true_expr.low, false_expr.low);
        reg.high =
            XMMReg2Double::Ternary(cond.high, true_expr.high, false_expr.high);
        return reg;
    }

    static inline XMMReg4Double Min(const XMMReg4Double &expr1,
                                    const XMMReg4Double &expr2)
    {
        XMMReg4Double reg;
        reg.low = XMMReg2Double::Min(expr1.low, expr2.low);
        reg.high = XMMReg2Double::Min(expr1.high, expr2.high);
        return reg;
    }

    inline XMMReg4Double &operator=(const XMMReg4Double &other)
    {
        low = other.low;
        high = other.high;
        return *this;
    }

    inline XMMReg4Double &operator+=(const XMMReg4Double &other)
    {
        low += other.low;
        high += other.high;
        return *this;
    }

    inline XMMReg4Double &operator*=(const XMMReg4Double &other)
    {
        low *= other.low;
        high *= other.high;
        return *this;
    }

    inline XMMReg4Double operator+(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.low = low + other.low;
        ret.high = high + other.high;
        return ret;
    }

    inline XMMReg4Double operator-(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.low = low - other.low;
        ret.high = high - other.high;
        return ret;
    }

    inline XMMReg4Double operator*(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.low = low * other.low;
        ret.high = high * other.high;
        return ret;
    }

    inline XMMReg4Double operator/(const XMMReg4Double &other) const
    {
        XMMReg4Double ret;
        ret.low = low / other.low;
        ret.high = high / other.high;
        return ret;
    }

    void AddToLow(const XMMReg2Double &other)
    {
        low += other;
    }

    inline double GetHorizSum() const
    {
        return (low + high).GetHorizSum();
    }

#if !defined(USE_SSE2_EMULATION)
    inline XMMReg4Double approx_inv_sqrt(const XMMReg4Double &one,
                                         const XMMReg4Double &half) const
    {
        __m128d reg0 = low.xmm;
        __m128d reg1 = high.xmm;
        __m128d reg0_half = _mm_mul_pd(reg0, half.low.xmm);
        __m128d reg1_half = _mm_mul_pd(reg1, half.low.xmm);
        // Compute rough approximation of 1 / sqrt(b) with _mm_rsqrt_ps
        reg0 = _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(reg0)));
        reg1 = _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(reg1)));
        // And perform one step of Newton-Raphson approximation to improve it
        const __m128d one_and_a_half = _mm_add_pd(one.low.xmm, half.low.xmm);
        reg0 = _mm_mul_pd(
            reg0, _mm_sub_pd(one_and_a_half,
                             _mm_mul_pd(reg0_half, _mm_mul_pd(reg0, reg0))));
        reg1 = _mm_mul_pd(
            reg1, _mm_sub_pd(one_and_a_half,
                             _mm_mul_pd(reg1_half, _mm_mul_pd(reg1, reg1))));
        XMMReg4Double ret;
        ret.low.xmm = reg0;
        ret.high.xmm = reg1;
        return ret;
    }

    inline XMMReg4Float cast_to_float() const
    {
        XMMReg4Float ret;
        ret.xmm = _mm_castsi128_ps(
            _mm_unpacklo_epi64(_mm_castps_si128(_mm_cvtpd_ps(low.xmm)),
                               _mm_castps_si128(_mm_cvtpd_ps(high.xmm))));
        return ret;
    }
#endif

    inline void Store4Val(unsigned char *ptr) const
    {
#ifdef USE_SSE2_EMULATION
        low.Store2Val(ptr);
        high.Store2Val(ptr + 2);
#else
        __m128i tmpLow = _mm_cvttpd_epi32(_mm_add_pd(
            low.xmm,
            _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
        __m128i tmpHigh = _mm_cvttpd_epi32(_mm_add_pd(
            high.xmm,
            _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
        auto tmp = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmpLow),
                                                   _mm_castsi128_ps(tmpHigh),
                                                   _MM_SHUFFLE(1, 0, 1, 0)));
        tmp = _mm_packs_epi32(tmp, tmp);
        tmp = _mm_packus_epi16(tmp, tmp);
        GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32 *>(ptr));
#endif
    }

    inline void Store4Val(unsigned short *ptr) const
    {
#ifdef USE_SSE2_EMULATION
        low.Store2Val(ptr);
        high.Store2Val(ptr + 2);
#else
        __m128i xmm0 = _mm_cvtpd_epi32(low.xmm);
        __m128i xmm1 = _mm_cvtpd_epi32(high.xmm);
        xmm0 = _mm_or_si128(xmm0, _mm_slli_si128(xmm1, 8));
#if defined(__SSE4_1__) || defined(__AVX__) || defined(USE_NEON_OPTIMIZATIONS)
        xmm0 = _mm_packus_epi32(xmm0, xmm0);  // Pack uint32 to uint16
#else
        xmm0 = _mm_add_epi32(xmm0, _mm_set1_epi32(-32768));
        xmm0 = _mm_packs_epi32(xmm0, xmm0);
        xmm0 = _mm_sub_epi16(xmm0, _mm_set1_epi16(-32768));
#endif
        GDALCopyXMMToInt64(xmm0, (GInt64 *)ptr);
#endif
    }
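
    /* On plain SSE2 there is no unsigned 32->16 pack, so the biased trick
     * above emulates _mm_packus_epi32(): shift the values into signed range
     * by adding -32768, use the signed saturating pack, then undo the bias
     * in the 16-bit domain. */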

    inline void Store4Val(float *ptr) const
    {
        low.Store2Val(ptr);
        high.Store2Val(ptr + 2);
    }

    inline void Store4Val(double *ptr) const
    {
        low.Store2Val(ptr);
        high.Store2Val(ptr + 2);
    }

    inline void StoreMask(unsigned char *ptr) const
    {
        low.StoreMask(ptr);
        high.StoreMask(ptr + 16);
    }
};

#if !defined(USE_SSE2_EMULATION)
inline XMMReg4Double XMMReg4Float::cast_to_double() const
{
    XMMReg4Double ret;
    ret.low.xmm = _mm_cvtps_pd(xmm);
    ret.high.xmm = _mm_cvtps_pd(
        _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(xmm), 8)));
    return ret;
}

inline XMMReg4Double XMMReg4Int::cast_to_double() const
{
    XMMReg4Double ret;
    ret.low.xmm = _mm_cvtepi32_pd(xmm);
    ret.high.xmm = _mm_cvtepi32_pd(_mm_srli_si128(xmm, 8));
    return ret;
}
#endif

#endif /* defined(__AVX__) && !defined(USE_SSE2_EMULATION) */

#endif /* GDALSSE_PRIV_H_INCLUDED */