GDAL
gdalsse_priv.h
1 /******************************************************************************
2  * $Id$
3  *
4  * Project: GDAL
5  * Purpose: SSE2 helper
6  * Author: Even Rouault <even dot rouault at spatialys dot com>
7  *
8  ******************************************************************************
9  * Copyright (c) 2014, Even Rouault <even dot rouault at spatialys dot com>
10  *
11  * Permission is hereby granted, free of charge, to any person obtaining a
12  * copy of this software and associated documentation files (the "Software"),
13  * to deal in the Software without restriction, including without limitation
14  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
15  * and/or sell copies of the Software, and to permit persons to whom the
16  * Software is furnished to do so, subject to the following conditions:
17  *
18  * The above copyright notice and this permission notice shall be included
19  * in all copies or substantial portions of the Software.
20  *
21  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
22  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
24  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
27  * DEALINGS IN THE SOFTWARE.
28  ****************************************************************************/
29 
30 #ifndef GDALSSE_PRIV_H_INCLUDED
31 #define GDALSSE_PRIV_H_INCLUDED
32 
33 #ifndef DOXYGEN_SKIP
34 
35 #include "cpl_port.h"
36 
37 /* We restrict to 64bit processors because they are guaranteed to have SSE2 */
38 /* Could possibly be used too on 32bit, but we would need to check at runtime */
39 #if (defined(__x86_64) || defined(_M_X64)) && !defined(USE_SSE2_EMULATION)
40 
41 /* Requires SSE2 */
42 #include <emmintrin.h>
43 #include <string.h>
44 
45 #ifdef __SSE4_1__
46 #include <smmintrin.h>
47 #endif
48 
49 #include "gdal_priv_templates.hpp"
50 
/**
 * Load a 16-bit value from a possibly unaligned address into an XMM register.
 *
 * The bytes are always fetched with memcpy() instead of dereferencing a cast
 * pointer, so the read is well defined whatever the alignment or declared
 * type of the underlying storage.
 *
 * @param ptr address of at least 2 readable bytes.
 * @return register produced by _mm_cvtsi32_si128() from the value
 *         zero-extended to int.
 */
static inline __m128i GDALCopyInt16ToXMM(const void* ptr)
{
    unsigned short s;
    memcpy(&s, ptr, sizeof(s));
    return _mm_cvtsi32_si128(s);
}
61 
/**
 * Load a 32-bit value from a possibly unaligned address into an XMM register.
 *
 * The bytes are always fetched with memcpy() instead of dereferencing a cast
 * pointer, so the read is well defined whatever the alignment or declared
 * type of the underlying storage.
 *
 * @param ptr address of at least 4 readable bytes.
 * @return register produced by _mm_cvtsi32_si128() from the loaded value.
 */
static inline __m128i GDALCopyInt32ToXMM(const void* ptr)
{
    /* int is the parameter type of _mm_cvtsi32_si128(). */
    int i;
    memcpy(&i, ptr, sizeof(i));
    return _mm_cvtsi32_si128(i);
}
72 
/**
 * Load a 64-bit value from a possibly unaligned address into an XMM register.
 *
 * The bytes are always fetched with memcpy() instead of dereferencing a cast
 * pointer, so the read is well defined whatever the alignment or declared
 * type of the underlying storage.
 *
 * @param ptr address of at least 8 readable bytes.
 * @return register produced by _mm_cvtsi64_si128() from the loaded value.
 */
static inline __m128i GDALCopyInt64ToXMM(const void* ptr)
{
    long long i;
    memcpy(&i, ptr, sizeof(i));
    return _mm_cvtsi64_si128(i);
}
83 
/**
 * Store the lowest 16-bit word of an XMM register to a possibly unaligned
 * address.
 *
 * The value is always written with memcpy() instead of through a cast
 * pointer, so the store is well defined whatever the alignment or declared
 * type of the destination.
 *
 * @param xmm   source register; only 16-bit word 0 is used.
 * @param pDest address of at least 2 writable bytes.
 */
static inline void GDALCopyXMMToInt16(const __m128i xmm, void* pDest)
{
    const short i = static_cast<short>(_mm_extract_epi16(xmm, 0));
    memcpy(pDest, &i, sizeof(i));
}

93 
94 class XMMReg2Double
95 {
96  public:
97  __m128d xmm;
98 
99 #if defined(__GNUC__)
100 #pragma GCC diagnostic push
101 #pragma GCC diagnostic ignored "-Weffc++"
102 #endif
103  /* coverity[uninit_member] */
104  XMMReg2Double() = default;
105 #if defined(__GNUC__)
106 #pragma GCC diagnostic pop
107 #endif
108 
109  XMMReg2Double(double val): xmm(_mm_load_sd (&val)) {}
110  XMMReg2Double(const XMMReg2Double& other) : xmm(other.xmm) {}
111 
112  static inline XMMReg2Double Zero()
113  {
114  XMMReg2Double reg;
115  reg.Zeroize();
116  return reg;
117  }
118 
119  static inline XMMReg2Double Load1ValHighAndLow(const double* ptr)
120  {
121  XMMReg2Double reg;
122  reg.nsLoad1ValHighAndLow(ptr);
123  return reg;
124  }
125 
126  static inline XMMReg2Double Load2Val(const double* ptr)
127  {
128  XMMReg2Double reg;
129  reg.nsLoad2Val(ptr);
130  return reg;
131  }
132 
133  static inline XMMReg2Double Load2Val(const float* ptr)
134  {
135  XMMReg2Double reg;
136  reg.nsLoad2Val(ptr);
137  return reg;
138  }
139 
140  static inline XMMReg2Double Load2ValAligned(const double* ptr)
141  {
142  XMMReg2Double reg;
143  reg.nsLoad2ValAligned(ptr);
144  return reg;
145  }
146 
147  static inline XMMReg2Double Load2Val(const unsigned char* ptr)
148  {
149  XMMReg2Double reg;
150  reg.nsLoad2Val(ptr);
151  return reg;
152  }
153 
154  static inline XMMReg2Double Load2Val(const short* ptr)
155  {
156  XMMReg2Double reg;
157  reg.nsLoad2Val(ptr);
158  return reg;
159  }
160 
161  static inline XMMReg2Double Load2Val(const unsigned short* ptr)
162  {
163  XMMReg2Double reg;
164  reg.nsLoad2Val(ptr);
165  return reg;
166  }
167 
168  static inline XMMReg2Double Equals(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
169  {
170  XMMReg2Double reg;
171  reg.xmm = _mm_cmpeq_pd(expr1.xmm, expr2.xmm);
172  return reg;
173  }
174 
175  static inline XMMReg2Double NotEquals(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
176  {
177  XMMReg2Double reg;
178  reg.xmm = _mm_cmpneq_pd(expr1.xmm, expr2.xmm);
179  return reg;
180  }
181 
182  static inline XMMReg2Double Greater(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
183  {
184  XMMReg2Double reg;
185  reg.xmm = _mm_cmpgt_pd(expr1.xmm, expr2.xmm);
186  return reg;
187  }
188 
189  static inline XMMReg2Double And(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
190  {
191  XMMReg2Double reg;
192  reg.xmm = _mm_and_pd(expr1.xmm, expr2.xmm);
193  return reg;
194  }
195 
196  static inline XMMReg2Double Ternary(const XMMReg2Double& cond, const XMMReg2Double& true_expr, const XMMReg2Double& false_expr)
197  {
198  XMMReg2Double reg;
199  reg.xmm = _mm_or_pd(_mm_and_pd (cond.xmm, true_expr.xmm), _mm_andnot_pd(cond.xmm, false_expr.xmm));
200  return reg;
201  }
202 
203  static inline XMMReg2Double Min(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
204  {
205  XMMReg2Double reg;
206  reg.xmm = _mm_min_pd(expr1.xmm, expr2.xmm);
207  return reg;
208  }
209 
210  inline void nsLoad1ValHighAndLow(const double* ptr)
211  {
212  xmm = _mm_load1_pd(ptr);
213  }
214 
215  inline void nsLoad2Val(const double* ptr)
216  {
217  xmm = _mm_loadu_pd(ptr);
218  }
219 
220  inline void nsLoad2ValAligned(const double* ptr)
221  {
222  xmm = _mm_load_pd(ptr);
223  }
224 
225  inline void nsLoad2Val(const float* ptr)
226  {
227  xmm = _mm_cvtps_pd(_mm_castsi128_ps(GDALCopyInt64ToXMM(ptr)));
228  }
229 
230  inline void nsLoad2Val(const unsigned char* ptr)
231  {
232  __m128i xmm_i = GDALCopyInt16ToXMM(ptr);
233 #ifdef __SSE4_1__
234  xmm_i = _mm_cvtepu8_epi32(xmm_i);
235 #else
236  xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
237  xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
238 #endif
239  xmm = _mm_cvtepi32_pd(xmm_i);
240  }
241 
242  inline void nsLoad2Val(const short* ptr)
243  {
244  __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
245 #ifdef __SSE4_1__
246  xmm_i = _mm_cvtepi16_epi32(xmm_i);
247 #else
248  xmm_i = _mm_unpacklo_epi16(xmm_i,xmm_i); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|b|b|a|a */
249  xmm_i = _mm_srai_epi32(xmm_i, 16); /* 0|0|0|0|b|b|a|a --> 0|0|0|0|sign(b)|b|sign(a)|a */
250 #endif
251  xmm = _mm_cvtepi32_pd(xmm_i);
252  }
253 
254  inline void nsLoad2Val(const unsigned short* ptr)
255  {
256  __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
257 #ifdef __SSE4_1__
258  xmm_i = _mm_cvtepu16_epi32(xmm_i);
259 #else
260  xmm_i = _mm_unpacklo_epi16(xmm_i,_mm_setzero_si128()); /* 0|0|0|0|0|0|b|a --> 0|0|0|0|0|b|0|a */
261 #endif
262  xmm = _mm_cvtepi32_pd(xmm_i);
263  }
264 
265  static inline void Load4Val(const unsigned char* ptr, XMMReg2Double& low, XMMReg2Double& high)
266  {
267  __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
268 #ifdef __SSE4_1__
269  xmm_i = _mm_cvtepu8_epi32(xmm_i);
270 #else
271  xmm_i = _mm_unpacklo_epi8(xmm_i, _mm_setzero_si128());
272  xmm_i = _mm_unpacklo_epi16(xmm_i, _mm_setzero_si128());
273 #endif
274  low.xmm = _mm_cvtepi32_pd(xmm_i);
275  high.xmm = _mm_cvtepi32_pd(_mm_shuffle_epi32(xmm_i,_MM_SHUFFLE(3,2,3,2)));
276  }
277 
278  static inline void Load4Val(const short* ptr, XMMReg2Double& low, XMMReg2Double& high)
279  {
280  low.nsLoad2Val(ptr);
281  high.nsLoad2Val(ptr+2);
282  }
283 
284  static inline void Load4Val(const unsigned short* ptr, XMMReg2Double& low, XMMReg2Double& high)
285  {
286  low.nsLoad2Val(ptr);
287  high.nsLoad2Val(ptr+2);
288  }
289 
290  static inline void Load4Val(const double* ptr, XMMReg2Double& low, XMMReg2Double& high)
291  {
292  low.nsLoad2Val(ptr);
293  high.nsLoad2Val(ptr+2);
294  }
295 
296  static inline void Load4Val(const float* ptr, XMMReg2Double& low, XMMReg2Double& high)
297  {
298  __m128 temp1 = _mm_loadu_ps(ptr);
299  __m128 temp2 = _mm_shuffle_ps(temp1, temp1, _MM_SHUFFLE(3,2,3,2));
300  low.xmm = _mm_cvtps_pd(temp1);
301  high.xmm = _mm_cvtps_pd(temp2);
302  }
303 
304  inline void Zeroize()
305  {
306  xmm = _mm_setzero_pd();
307  }
308 
309  inline XMMReg2Double& operator= (const XMMReg2Double& other)
310  {
311  xmm = other.xmm;
312  return *this;
313  }
314 
315  inline XMMReg2Double& operator+= (const XMMReg2Double& other)
316  {
317  xmm = _mm_add_pd(xmm, other.xmm);
318  return *this;
319  }
320 
321  inline XMMReg2Double& operator*= (const XMMReg2Double& other)
322  {
323  xmm = _mm_mul_pd(xmm, other.xmm);
324  return *this;
325  }
326 
327  inline XMMReg2Double operator+ (const XMMReg2Double& other) const
328  {
329  XMMReg2Double ret;
330  ret.xmm = _mm_add_pd(xmm, other.xmm);
331  return ret;
332  }
333 
334  inline XMMReg2Double operator- (const XMMReg2Double& other) const
335  {
336  XMMReg2Double ret;
337  ret.xmm = _mm_sub_pd(xmm, other.xmm);
338  return ret;
339  }
340 
341  inline XMMReg2Double operator* (const XMMReg2Double& other) const
342  {
343  XMMReg2Double ret;
344  ret.xmm = _mm_mul_pd(xmm, other.xmm);
345  return ret;
346  }
347 
348  inline XMMReg2Double operator/ (const XMMReg2Double& other) const
349  {
350  XMMReg2Double ret;
351  ret.xmm = _mm_div_pd(xmm, other.xmm);
352  return ret;
353  }
354 
355  inline double GetHorizSum() const
356  {
357  __m128d xmm2;
358  xmm2 = _mm_shuffle_pd(xmm,xmm,_MM_SHUFFLE2(0,1)); /* transfer high word into low word of xmm2 */
359  return _mm_cvtsd_f64(_mm_add_sd(xmm, xmm2));
360  }
361 
362  inline void Store2Val(double* ptr) const
363  {
364  _mm_storeu_pd(ptr, xmm);
365  }
366 
367  inline void Store2ValAligned(double* ptr) const
368  {
369  _mm_store_pd(ptr, xmm);
370  }
371 
372  inline void Store2Val(float* ptr) const
373  {
374  __m128i xmm_i = _mm_castps_si128( _mm_cvtpd_ps(xmm) );
375  GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64*>(ptr));
376  }
377 
378  inline void Store2Val(unsigned char* ptr) const
379  {
380  __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(xmm, _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
381  // X X X X 0 B 0 A --> 0 X X X X 0 B 0 (srli)
382  // or X X X X 0 B 0 A
383  tmp = _mm_or_si128(tmp, _mm_srli_si128(tmp, 2));
384  tmp = _mm_packus_epi16(tmp, tmp);
385  GDALCopyXMMToInt16(tmp, reinterpret_cast<GInt16*>(ptr));
386  }
387 
388  inline void Store2Val(unsigned short* ptr) const
389  {
390  __m128i tmp = _mm_cvttpd_epi32(_mm_add_pd(xmm, _mm_set1_pd(0.5))); /* Convert the 2 double values to 2 integers */
391  // X X X X 0 B 0 A --> 0 X X X X 0 B 0 (srli)
392  // or X X X X 0 B 0 A
393  tmp = _mm_or_si128(tmp, _mm_srli_si128(tmp, 2));
394  GDALCopyXMMToInt32(tmp, reinterpret_cast<GInt32*>(ptr));
395 
396  //ptr[0] = (GUInt16)_mm_extract_epi16(tmp, 0);
397  //ptr[1] = (GUInt16)_mm_extract_epi16(tmp, 2);
398  }
399 
400  inline void StoreMask(unsigned char* ptr) const
401  {
402  _mm_storeu_si128( reinterpret_cast<__m128i*>(ptr), _mm_castpd_si128(xmm) );
403  }
404 
405  inline operator double () const
406  {
407  return _mm_cvtsd_f64(xmm);
408  }
409 };
410 
411 #else
412 
413 #warning "Software emulation of SSE2 !"
414 
/**
 * Scalar emulation of the two-lane double register, used when the SSE2
 * implementation of XMMReg2Double is not compiled in.
 *
 * The two lanes are plain doubles. Comparison results are encoded the same
 * way a SIMD mask would be: every bit of a lane set for "true", every bit
 * cleared for "false".
 */
class XMMReg2Double
{
  public:
    double low;
    double high;

    /* Leaves both lanes uninitialised. */
    XMMReg2Double() {}
    /* val goes to the low lane, the high lane is zero. */
    XMMReg2Double(double val) : low(val), high(0.0) {}
    XMMReg2Double(const XMMReg2Double& other) : low(other.low), high(other.high) {}

    /* Register with both lanes set to zero. */
    static inline XMMReg2Double Zero()
    {
        XMMReg2Double reg;
        reg.Zeroize();
        return reg;
    }

    /* Broadcast *ptr into both lanes. */
    static inline XMMReg2Double Load1ValHighAndLow(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad1ValHighAndLow(ptr);
        return reg;
    }

    /* Per-lane mask: all bits set where expr1 == expr2, zero elsewhere. */
    static inline XMMReg2Double Equals(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        SetMaskLane(reg.low, expr1.low == expr2.low);
        SetMaskLane(reg.high, expr1.high == expr2.high);
        return reg;
    }

    /* Per-lane mask: all bits set where expr1 != expr2, zero elsewhere. */
    static inline XMMReg2Double NotEquals(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        SetMaskLane(reg.low, expr1.low != expr2.low);
        SetMaskLane(reg.high, expr1.high != expr2.high);
        return reg;
    }

    /* Per-lane mask: all bits set where expr1 > expr2, zero elsewhere. */
    static inline XMMReg2Double Greater(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        SetMaskLane(reg.low, expr1.low > expr2.low);
        SetMaskLane(reg.high, expr1.high > expr2.high);
        return reg;
    }

    /* Bitwise AND of the raw lane representations. */
    static inline XMMReg2Double And(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        /* Each double is processed as two ints via memcpy. */
        int low1[2], high1[2];
        int low2[2], high2[2];
        memcpy(low1, &expr1.low, sizeof(double));
        memcpy(high1, &expr1.high, sizeof(double));
        memcpy(low2, &expr2.low, sizeof(double));
        memcpy(high2, &expr2.high, sizeof(double));
        low1[0] &= low2[0];
        low1[1] &= low2[1];
        high1[0] &= high2[0];
        high1[1] &= high2[1];
        memcpy(&reg.low, low1, sizeof(double));
        memcpy(&reg.high, high1, sizeof(double));
        return reg;
    }

    /* Per-lane select. A cond lane is tested as a double, so the all-bits-set
     * mask (a NaN, which compares unequal to zero) picks true_expr and a
     * zero lane picks false_expr. */
    static inline XMMReg2Double Ternary(const XMMReg2Double& cond, const XMMReg2Double& true_expr, const XMMReg2Double& false_expr)
    {
        XMMReg2Double reg;
        if( cond.low )
            reg.low = true_expr.low;
        else
            reg.low = false_expr.low;
        if( cond.high )
            reg.high = true_expr.high;
        else
            reg.high = false_expr.high;
        return reg;
    }

    /* Per-lane minimum; expr2's lane is returned when the `<` test fails. */
    static inline XMMReg2Double Min(const XMMReg2Double& expr1, const XMMReg2Double& expr2)
    {
        XMMReg2Double reg;
        reg.low = (expr1.low < expr2.low) ? expr1.low : expr2.low;
        reg.high = (expr1.high < expr2.high) ? expr1.high : expr2.high;
        return reg;
    }

    /* Load ptr[0] into the low lane and ptr[1] into the high lane. */
    static inline XMMReg2Double Load2Val(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    /* Same as Load2Val(const double*) in this scalar implementation. */
    static inline XMMReg2Double Load2ValAligned(const double* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2ValAligned(ptr);
        return reg;
    }

    /* Load two floats, converted to double. */
    static inline XMMReg2Double Load2Val(const float* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    /* Load two unsigned bytes, converted to double. */
    static inline XMMReg2Double Load2Val(const unsigned char* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    /* Load two signed 16-bit values, converted to double. */
    static inline XMMReg2Double Load2Val(const short* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    /* Load two unsigned 16-bit values, converted to double. */
    static inline XMMReg2Double Load2Val(const unsigned short* ptr)
    {
        XMMReg2Double reg;
        reg.nsLoad2Val(ptr);
        return reg;
    }

    /* Broadcast *ptr into both lanes of this register. */
    inline void nsLoad1ValHighAndLow(const double* ptr)
    {
        low = ptr[0];
        high = ptr[0];
    }

    inline void nsLoad2Val(const double* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2ValAligned(const double* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const float* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const unsigned char* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const short* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    inline void nsLoad2Val(const unsigned short* ptr)
    {
        low = ptr[0];
        high = ptr[1];
    }

    /* ptr[0..1] go to `low`, ptr[2..3] go to `high`. */
    static inline void Load4Val(const unsigned char* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.low = ptr[0];
        low.high = ptr[1];
        high.low = ptr[2];
        high.high = ptr[3];
    }

    /* ptr[0..1] go to `low`, ptr[2..3] go to `high`. */
    static inline void Load4Val(const short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    /* ptr[0..1] go to `low`, ptr[2..3] go to `high`. */
    static inline void Load4Val(const unsigned short* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    /* ptr[0..1] go to `low`, ptr[2..3] go to `high`. */
    static inline void Load4Val(const double* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    /* ptr[0..1] go to `low`, ptr[2..3] go to `high`. */
    static inline void Load4Val(const float* ptr, XMMReg2Double& low, XMMReg2Double& high)
    {
        low.nsLoad2Val(ptr);
        high.nsLoad2Val(ptr+2);
    }

    /* Set both lanes to zero. */
    inline void Zeroize()
    {
        low = 0.0;
        high = 0.0;
    }

    inline XMMReg2Double& operator= (const XMMReg2Double& other)
    {
        low = other.low;
        high = other.high;
        return *this;
    }

    inline XMMReg2Double& operator+= (const XMMReg2Double& other)
    {
        low += other.low;
        high += other.high;
        return *this;
    }

    inline XMMReg2Double& operator*= (const XMMReg2Double& other)
    {
        low *= other.low;
        high *= other.high;
        return *this;
    }

    inline XMMReg2Double operator+ (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.low = low + other.low;
        ret.high = high + other.high;
        return ret;
    }

    inline XMMReg2Double operator- (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.low = low - other.low;
        ret.high = high - other.high;
        return ret;
    }

    inline XMMReg2Double operator* (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.low = low * other.low;
        ret.high = high * other.high;
        return ret;
    }

    inline XMMReg2Double operator/ (const XMMReg2Double& other) const
    {
        XMMReg2Double ret;
        ret.low = low / other.low;
        ret.high = high / other.high;
        return ret;
    }

    /* Sum of the two lanes. */
    inline double GetHorizSum() const
    {
        return low + high;
    }

    inline void Store2Val(double* ptr) const
    {
        ptr[0] = low;
        ptr[1] = high;
    }

    inline void Store2ValAligned(double* ptr) const
    {
        ptr[0] = low;
        ptr[1] = high;
    }

    /* Narrow both lanes to float. */
    inline void Store2Val(float* ptr) const
    {
        ptr[0] = static_cast<float>(low);
        ptr[1] = static_cast<float>(high);
    }

    /* Add 0.5, truncate, and store as bytes. Lanes outside [0, 255] (or NaN)
     * are clamped first, because converting an out-of-range double to an
     * integer type is undefined behaviour. */
    void Store2Val(unsigned char* ptr) const
    {
        ptr[0] = static_cast<unsigned char>(BiasAndClamp(low, 255.0));
        ptr[1] = static_cast<unsigned char>(BiasAndClamp(high, 255.0));
    }

    /* Add 0.5, truncate, and store as 16-bit values. Lanes outside
     * [0, 65535] (or NaN) are clamped first, because converting an
     * out-of-range double to an integer type is undefined behaviour. */
    void Store2Val(unsigned short* ptr) const
    {
        ptr[0] = static_cast<unsigned short>(BiasAndClamp(low, 65535.0));
        ptr[1] = static_cast<unsigned short>(BiasAndClamp(high, 65535.0));
    }

    /* Copy the raw 8 bytes of each lane to ptr[0..7] and ptr[8..15]. */
    inline void StoreMask(unsigned char* ptr) const
    {
        memcpy(ptr, &low, 8);
        memcpy(ptr + 8, &high, 8);
    }

    /* Value of the low lane. */
    inline operator double () const
    {
        return low;
    }

  private:
    /* Write the mask encoding of bSet into lane: all bits set, or zero. */
    static inline void SetMaskLane(double& lane, bool bSet)
    {
        if( bSet )
            memset(&lane, 0xFF, sizeof(double));
        else
            lane = 0;
    }

    /* Return val + 0.5 limited to [0, dfMax]. NaN yields 0 because the
     * `> 0` test is false for NaN. */
    static inline double BiasAndClamp(double val, double dfMax)
    {
        const double biased = val + 0.5;
        if( !(biased > 0.0) )
            return 0.0;
        return (biased < dfMax) ? biased : dfMax;
    }
};
751 
752 #endif /* defined(__x86_64) || defined(_M_X64) */
753 
754 #ifdef __AVX__
755 
756 #include <immintrin.h>
757 
758 class XMMReg4Double
759 {
760  public:
761  __m256d ymm;
762 
763  XMMReg4Double() {}
764  XMMReg4Double(const XMMReg4Double& other) : ymm(other.ymm) {}
765 
766  static inline XMMReg4Double Zero()
767  {
768  XMMReg4Double reg;
769  reg.Zeroize();
770  return reg;
771  }
772 
773  inline void Zeroize()
774  {
775  ymm = _mm256_setzero_pd();
776  }
777 
778  static inline XMMReg4Double Load1ValHighAndLow(const double* ptr)
779  {
780  XMMReg4Double reg;
781  reg.nsLoad1ValHighAndLow(ptr);
782  return reg;
783  }
784 
785  inline void nsLoad1ValHighAndLow(const double* ptr)
786  {
787  ymm = _mm256_set1_pd(*ptr);
788  }
789 
790  static inline XMMReg4Double Load4Val(const unsigned char* ptr)
791  {
792  XMMReg4Double reg;
793  reg.nsLoad4Val(ptr);
794  return reg;
795  }
796 
797  inline void nsLoad4Val(const unsigned char* ptr)
798  {
799  __m128i xmm_i = GDALCopyInt32ToXMM(ptr);
800  xmm_i = _mm_cvtepu8_epi32(xmm_i);
801  ymm = _mm256_cvtepi32_pd(xmm_i);
802  }
803 
804  static inline XMMReg4Double Load4Val(const short* ptr)
805  {
806  XMMReg4Double reg;
807  reg.nsLoad4Val(ptr);
808  return reg;
809  }
810 
811  inline void nsLoad4Val(const short* ptr)
812  {
813  __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
814  xmm_i = _mm_cvtepi16_epi32(xmm_i);
815  ymm = _mm256_cvtepi32_pd(xmm_i);
816  }
817 
818  static inline XMMReg4Double Load4Val(const unsigned short* ptr)
819  {
820  XMMReg4Double reg;
821  reg.nsLoad4Val(ptr);
822  return reg;
823  }
824 
825  inline void nsLoad4Val(const unsigned short* ptr)
826  {
827  __m128i xmm_i = GDALCopyInt64ToXMM(ptr);
828  xmm_i = _mm_cvtepu16_epi32(xmm_i);
829  ymm = _mm256_cvtepi32_pd(xmm_i); // ok to use signed conversion since we are in the ushort range, so cannot be interpreted as negative int32
830  }
831 
832  static inline XMMReg4Double Load4Val(const double* ptr)
833  {
834  XMMReg4Double reg;
835  reg.nsLoad4Val(ptr);
836  return reg;
837  }
838 
839  inline void nsLoad4Val(const double* ptr)
840  {
841  ymm = _mm256_loadu_pd(ptr);
842  }
843 
844  static inline XMMReg4Double Load4ValAligned(const double* ptr)
845  {
846  XMMReg4Double reg;
847  reg.nsLoad4ValAligned(ptr);
848  return reg;
849  }
850 
851  inline void nsLoad4ValAligned(const double* ptr)
852  {
853  ymm = _mm256_load_pd(ptr);
854  }
855 
856  static inline XMMReg4Double Load4Val(const float* ptr)
857  {
858  XMMReg4Double reg;
859  reg.nsLoad4Val(ptr);
860  return reg;
861  }
862 
863  inline void nsLoad4Val(const float* ptr)
864  {
865  ymm = _mm256_cvtps_pd( _mm_loadu_ps(ptr) );
866  }
867 
868  static inline XMMReg4Double Equals(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
869  {
870  XMMReg4Double reg;
871  reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_EQ_OQ);
872  return reg;
873  }
874 
875  static inline XMMReg4Double NotEquals(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
876  {
877  XMMReg4Double reg;
878  reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_NEQ_OQ);
879  return reg;
880  }
881 
882  static inline XMMReg4Double Greater(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
883  {
884  XMMReg4Double reg;
885  reg.ymm = _mm256_cmp_pd(expr1.ymm, expr2.ymm, _CMP_GT_OQ);
886  return reg;
887  }
888 
889  static inline XMMReg4Double And(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
890  {
891  XMMReg4Double reg;
892  reg.ymm = _mm256_and_pd(expr1.ymm, expr2.ymm);
893  return reg;
894  }
895 
896  static inline XMMReg4Double Ternary(const XMMReg4Double& cond, const XMMReg4Double& true_expr, const XMMReg4Double& false_expr)
897  {
898  XMMReg4Double reg;
899  reg.ymm = _mm256_or_pd(_mm256_and_pd (cond.ymm, true_expr.ymm), _mm256_andnot_pd(cond.ymm, false_expr.ymm));
900  return reg;
901  }
902 
903  static inline XMMReg4Double Min(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
904  {
905  XMMReg4Double reg;
906  reg.ymm = _mm256_min_pd(expr1.ymm, expr2.ymm);
907  return reg;
908  }
909 
910  inline XMMReg4Double& operator= (const XMMReg4Double& other)
911  {
912  ymm = other.ymm;
913  return *this;
914  }
915 
916  inline XMMReg4Double& operator+= (const XMMReg4Double& other)
917  {
918  ymm = _mm256_add_pd(ymm, other.ymm);
919  return *this;
920  }
921 
922  inline XMMReg4Double& operator*= (const XMMReg4Double& other)
923  {
924  ymm = _mm256_mul_pd(ymm, other.ymm);
925  return *this;
926  }
927 
928  inline XMMReg4Double operator+ (const XMMReg4Double& other) const
929  {
930  XMMReg4Double ret;
931  ret.ymm = _mm256_add_pd(ymm, other.ymm);
932  return ret;
933  }
934 
935  inline XMMReg4Double operator- (const XMMReg4Double& other) const
936  {
937  XMMReg4Double ret;
938  ret.ymm = _mm256_sub_pd(ymm, other.ymm);
939  return ret;
940  }
941 
942  inline XMMReg4Double operator* (const XMMReg4Double& other) const
943  {
944  XMMReg4Double ret;
945  ret.ymm = _mm256_mul_pd(ymm, other.ymm);
946  return ret;
947  }
948 
949  inline XMMReg4Double operator/ (const XMMReg4Double& other) const
950  {
951  XMMReg4Double ret;
952  ret.ymm = _mm256_div_pd(ymm, other.ymm);
953  return ret;
954  }
955 
956  void AddToLow( const XMMReg2Double& other )
957  {
958  __m256d ymm2 = _mm256_setzero_pd();
959  ymm2 = _mm256_insertf128_pd( ymm2, other.xmm, 0);
960  ymm = _mm256_add_pd(ymm, ymm2);
961  }
962 
963  inline double GetHorizSum() const
964  {
965  __m256d ymm_tmp1, ymm_tmp2;
966  ymm_tmp2 = _mm256_hadd_pd(ymm, ymm);
967  ymm_tmp1 = _mm256_permute2f128_pd(ymm_tmp2, ymm_tmp2, 1);
968  ymm_tmp1 = _mm256_add_pd(ymm_tmp1, ymm_tmp2);
969  return _mm_cvtsd_f64(_mm256_castpd256_pd128(ymm_tmp1));
970  }
971 
972  inline void Store4Val(unsigned char* ptr) const
973  {
974  __m128i xmm_i = _mm256_cvttpd_epi32 (_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
975  //xmm_i = _mm_packs_epi32(xmm_i, xmm_i); // Pack int32 to int16
976  //xmm_i = _mm_packus_epi16(xmm_i, xmm_i); // Pack int16 to uint8
977  xmm_i = _mm_shuffle_epi8(xmm_i, _mm_cvtsi32_si128(0 | (4 << 8) | (8 << 16) | (12 << 24))); // SSSE3
978  GDALCopyXMMToInt32(xmm_i, reinterpret_cast<GInt32*>(ptr));
979  }
980 
981  inline void Store4Val(unsigned short* ptr) const
982  {
983  __m128i xmm_i = _mm256_cvttpd_epi32 (_mm256_add_pd(ymm, _mm256_set1_pd(0.5)));
984  xmm_i = _mm_packus_epi32(xmm_i, xmm_i); // Pack uint32 to uint16
985  GDALCopyXMMToInt64(xmm_i, reinterpret_cast<GInt64*>(ptr));
986  }
987 
988  inline void Store4Val(float* ptr) const
989  {
990  _mm_storeu_ps(ptr, _mm256_cvtpd_ps (ymm));
991  }
992 
993  inline void Store4Val(double* ptr) const
994  {
995  _mm256_storeu_pd(ptr, ymm);
996  }
997 
998  inline void StoreMask(unsigned char* ptr) const
999  {
1000  _mm256_storeu_si256( reinterpret_cast<__m256i*>(ptr), _mm256_castpd_si256(ymm) );
1001  }
1002 };
1003 
1004 #else
1005 
1006 class XMMReg4Double
1007 {
1008  public:
1009  XMMReg2Double low, high;
1010 
1011 #if defined(__GNUC__)
1012 #pragma GCC diagnostic push
1013 #pragma GCC diagnostic ignored "-Weffc++"
1014 #endif
1015  /* coverity[uninit_member] */
1016  XMMReg4Double() = default;
1017 #if defined(__GNUC__)
1018 #pragma GCC diagnostic pop
1019 #endif
1020 
1021  XMMReg4Double(const XMMReg4Double& other) : low(other.low), high(other.high) {}
1022 
1023  static inline XMMReg4Double Zero()
1024  {
1025  XMMReg4Double reg;
1026  reg.low.Zeroize();
1027  reg.high.Zeroize();
1028  return reg;
1029  }
1030 
1031  static inline XMMReg4Double Load1ValHighAndLow(const double* ptr)
1032  {
1033  XMMReg4Double reg;
1034  reg.low.nsLoad1ValHighAndLow(ptr);
1035  reg.high = reg.low;
1036  return reg;
1037  }
1038 
1039  static inline XMMReg4Double Load4Val(const unsigned char* ptr)
1040  {
1041  XMMReg4Double reg;
1042  XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
1043  return reg;
1044  }
1045 
1046  static inline XMMReg4Double Load4Val(const short* ptr)
1047  {
1048  XMMReg4Double reg;
1049  reg.low.nsLoad2Val(ptr);
1050  reg.high.nsLoad2Val(ptr+2);
1051  return reg;
1052  }
1053 
1054  static inline XMMReg4Double Load4Val(const unsigned short* ptr)
1055  {
1056  XMMReg4Double reg;
1057  reg.low.nsLoad2Val(ptr);
1058  reg.high.nsLoad2Val(ptr+2);
1059  return reg;
1060  }
1061 
1062  static inline XMMReg4Double Load4Val(const double* ptr)
1063  {
1064  XMMReg4Double reg;
1065  reg.low.nsLoad2Val(ptr);
1066  reg.high.nsLoad2Val(ptr+2);
1067  return reg;
1068  }
1069 
1070  static inline XMMReg4Double Load4ValAligned(const double* ptr)
1071  {
1072  XMMReg4Double reg;
1073  reg.low.nsLoad2ValAligned(ptr);
1074  reg.high.nsLoad2ValAligned(ptr+2);
1075  return reg;
1076  }
1077 
1078  static inline XMMReg4Double Load4Val(const float* ptr)
1079  {
1080  XMMReg4Double reg;
1081  XMMReg2Double::Load4Val(ptr, reg.low, reg.high);
1082  return reg;
1083  }
1084 
1085  static inline XMMReg4Double Equals(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
1086  {
1087  XMMReg4Double reg;
1088  reg.low = XMMReg2Double::Equals(expr1.low, expr2.low);
1089  reg.high = XMMReg2Double::Equals(expr1.high, expr2.high);
1090  return reg;
1091  }
1092 
1093  static inline XMMReg4Double NotEquals(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
1094  {
1095  XMMReg4Double reg;
1096  reg.low = XMMReg2Double::NotEquals(expr1.low, expr2.low);
1097  reg.high = XMMReg2Double::NotEquals(expr1.high, expr2.high);
1098  return reg;
1099  }
1100 
1101  static inline XMMReg4Double Greater(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
1102  {
1103  XMMReg4Double reg;
1104  reg.low = XMMReg2Double::Greater(expr1.low, expr2.low);
1105  reg.high = XMMReg2Double::Greater(expr1.high, expr2.high);
1106  return reg;
1107  }
1108 
1109  static inline XMMReg4Double And(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
1110  {
1111  XMMReg4Double reg;
1112  reg.low = XMMReg2Double::And(expr1.low, expr2.low);
1113  reg.high = XMMReg2Double::And(expr1.high, expr2.high);
1114  return reg;
1115  }
1116 
1117  static inline XMMReg4Double Ternary(const XMMReg4Double& cond, const XMMReg4Double& true_expr, const XMMReg4Double& false_expr)
1118  {
1119  XMMReg4Double reg;
1120  reg.low = XMMReg2Double::Ternary(cond.low, true_expr.low, false_expr.low);
1121  reg.high = XMMReg2Double::Ternary(cond.high, true_expr.high, false_expr.high);
1122  return reg;
1123  }
1124 
1125  static inline XMMReg4Double Min(const XMMReg4Double& expr1, const XMMReg4Double& expr2)
1126  {
1127  XMMReg4Double reg;
1128  reg.low = XMMReg2Double::Min(expr1.low, expr2.low);
1129  reg.high = XMMReg2Double::Min(expr1.high, expr2.high);
1130  return reg;
1131  }
1132 
1133  inline XMMReg4Double& operator= (const XMMReg4Double& other)
1134  {
1135  low = other.low;
1136  high = other.high;
1137  return *this;
1138  }
1139 
1140  inline XMMReg4Double& operator+= (const XMMReg4Double& other)
1141  {
1142  low += other.low;
1143  high += other.high;
1144  return *this;
1145  }
1146 
1147  inline XMMReg4Double& operator*= (const XMMReg4Double& other)
1148  {
1149  low *= other.low;
1150  high *= other.high;
1151  return *this;
1152  }
1153 
1154  inline XMMReg4Double operator+ (const XMMReg4Double& other) const
1155  {
1156  XMMReg4Double ret;
1157  ret.low = low + other.low;
1158  ret.high = high + other.high;
1159  return ret;
1160  }
1161 
1162  inline XMMReg4Double operator- (const XMMReg4Double& other) const
1163  {
1164  XMMReg4Double ret;
1165  ret.low = low - other.low;
1166  ret.high = high - other.high;
1167  return ret;
1168  }
1169 
1170  inline XMMReg4Double operator* (const XMMReg4Double& other) const
1171  {
1172  XMMReg4Double ret;
1173  ret.low = low * other.low;
1174  ret.high = high * other.high;
1175  return ret;
1176  }
1177 
1178  inline XMMReg4Double operator/ (const XMMReg4Double& other) const
1179  {
1180  XMMReg4Double ret;
1181  ret.low = low / other.low;
1182  ret.high = high / other.high;
1183  return ret;
1184  }
1185 
1186  void AddToLow( const XMMReg2Double& other )
1187  {
1188  low += other;
1189  }
1190 
1191  inline double GetHorizSum() const
1192  {
1193  return (low + high).GetHorizSum();
1194  }
1195 
1196  inline void Store4Val(unsigned char* ptr) const
1197  {
1198  low.Store2Val(ptr);
1199  high.Store2Val(ptr+2);
1200  }
1201 
1202  inline void Store4Val(unsigned short* ptr) const
1203  {
1204 #if 1
1205  low.Store2Val(ptr);
1206  high.Store2Val(ptr+2);
1207 #else
1208  __m128i xmm0 = _mm_cvtpd_epi32 (low.xmm);
1209  __m128i xmm1 = _mm_cvtpd_epi32 (high.xmm);
1210  xmm0 = _mm_or_si128(xmm0, _mm_slli_si128(xmm1, 8));
1211 #if __SSE4_1__
1212  xmm0 = _mm_packus_epi32(xmm0, xmm0); // Pack uint32 to uint16
1213 #else
1214  xmm0 = _mm_add_epi32( xmm0, _mm_set1_epi32(-32768) );
1215  xmm0 = _mm_packs_epi32( xmm0, xmm0 );
1216  xmm0 = _mm_sub_epi16( xmm0, _mm_set1_epi16(-32768) );
1217 #endif
1218  GDALCopyXMMToInt64(xmm0, (GInt64*)ptr);
1219 #endif
1220  }
1221 
1222  inline void Store4Val(float* ptr) const
1223  {
1224  low.Store2Val(ptr);
1225  high.Store2Val(ptr+2);
1226  }
1227 
1228  inline void Store4Val(double* ptr) const
1229  {
1230  low.Store2Val(ptr);
1231  high.Store2Val(ptr+2);
1232  }
1233 
1234  inline void StoreMask(unsigned char* ptr) const
1235  {
1236  low.StoreMask(ptr);
1237  high.StoreMask(ptr+16);
1238  }
1239 
1240 };
1241 
1242 #endif
1243 
1244 #endif /* #ifndef DOXYGEN_SKIP */
1245 
1246 #endif /* GDALSSE_PRIV_H_INCLUDED */
Core portability definitions for CPL.
int GInt32
Int32 type.
Definition: cpl_port.h:203
short GInt16
Int16 type.
Definition: cpl_port.h:209
unsigned short GUInt16
Unsigned int16 type.
Definition: cpl_port.h:211
GIntBig GInt64
Signed 64 bit integer type.
Definition: cpl_port.h:265

Generated for GDAL by doxygen 1.8.8.