sse_utils_cxx.h

Go to the documentation of this file.
00001 /*
00002  *  This file is part of libcxxsupport.
00003  *
00004  *  libcxxsupport is free software; you can redistribute it and/or modify
00005  *  it under the terms of the GNU General Public License as published by
00006  *  the Free Software Foundation; either version 2 of the License, or
00007  *  (at your option) any later version.
00008  *
00009  *  libcxxsupport is distributed in the hope that it will be useful,
00010  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  *  GNU General Public License for more details.
00013  *
00014  *  You should have received a copy of the GNU General Public License
00015  *  along with libcxxsupport; if not, write to the Free Software
00016  *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
00017  */
00018 
00019 /*
00020  *  libcxxsupport is being developed at the Max-Planck-Institut fuer Astrophysik
00021  *  and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt
00022  *  (DLR).
00023  */
00024 
00025 /*! \file sse_utils_cxx.h
00026  *  SSE/SSE2/SSE3-related functionality for C++
00027  *
00028  *  Copyright (C) 2011, 2012 Max-Planck-Society
00029  *  \author Martin Reinecke
00030  */
00031 
00032 #ifndef PLANCK_SSE_UTILS_CXX_H
00033 #define PLANCK_SSE_UTILS_CXX_H
00034 
/*! Short SIMD vector of \a sz elements of type \a T.
    This header only declares the generic template; the usable classes are
    the explicit specializations provided for particular combinations of
    \a T and \a sz. */
template<typename T, int sz>
class svec;
00036 
00037 #if (defined(__SSE2__))
00038 
00039 #include <xmmintrin.h>
00040 #include <emmintrin.h>
00041 
00042 template<> class svec<int, 4>
00043   {
00044   public:
00045     typedef int Ts;
00046     typedef __m128i Tv;
00047     typedef union { Tv v; Ts d[4]; } Tu;
00048     Tv v;
00049 
00050     svec () {}
00051     svec (const svec &b) : v(b.v) {}
00052     svec (const Tv &b) : v(b) {}
00053     svec (const Ts &val) : v(_mm_set1_epi32(val)) {}
00054     svec (const Ts &val1, const Ts &val2, const Ts &val3, const Ts &val4)
00055       : v(_mm_set_epi32(val4,val3,val2,val1)) {}
00056 
00057     const svec &operator= (const Ts &val)
00058       { v=_mm_set1_epi32(val); return *this; }
00059     const svec &operator= (const svec &b)
00060       { v=b.v; return *this; }
00061 
00062     Ts operator[] (int p) const
00063       { Tu u; u.v=v; return u.d[p]; }
00064     void set (int p, Ts val)
00065       { Tu u; u.v=v; u.d[p]=val; v=u.v; }
00066 
00067     const svec &operator+= (const svec &b)
00068       { v=_mm_add_epi32(v,b.v); return *this; }
00069     const svec &operator-= (const svec &b)
00070       { v=_mm_sub_epi32(v,b.v); return *this; }
00071     svec operator+ (const svec &b) const
00072       { return svec(_mm_add_epi32(v,b.v)); }
00073     svec operator- (const svec &b) const
00074       { return svec(_mm_sub_epi32(v,b.v)); }
00075 
00076     const svec &operator&= (const svec &b)
00077       { v=_mm_and_si128(v,b.v); return *this; }
00078     const svec &operator|= (const svec &b)
00079       { v=_mm_or_si128(v,b.v); return *this; }
00080     const svec &operator^= (const svec &b)
00081       { v=_mm_xor_si128(v,b.v); return *this; }
00082     svec operator& (const svec &b) const
00083       { return svec(_mm_and_si128(v,b.v)); }
00084     svec operator| (const svec &b) const
00085       { return svec(_mm_or_si128(v,b.v)); }
00086     svec operator^ (const svec &b) const
00087       { return svec(_mm_xor_si128(v,b.v)); }
00088     svec andnot (const svec &b) const
00089       { return svec(_mm_andnot_si128(v,b.v)); }
00090 
00091     const svec &operator<<= (int b)
00092       { v=_mm_slli_epi32(v,b); return *this; }
00093     svec operator<< (int b) const
00094       { return svec(_mm_slli_epi32(v,b)); }
00095     const svec &operator>>= (int b)
00096       { v=_mm_srai_epi32(v,b); return *this; }
00097     svec operator>> (int b) const
00098       { return svec(_mm_srai_epi32(v,b)); }
00099 
00100     svec eq (const svec &b) const
00101       { return svec(_mm_cmpeq_epi32(v,b.v)); }
00102     svec gt (const svec &b) const
00103       { return svec(_mm_cmpgt_epi32(v,b.v)); }
00104     svec lt (const svec &b) const
00105       { return svec(_mm_cmplt_epi32(v,b.v)); }
00106   };
00107 
00108 typedef svec<int,4> V4si;
00109 
00110 #if 0
00111 template<> class svec<long long , 2>
00112   {
00113   public:
00114     typedef long long Ts;
00115     typedef __m128i Tv;
00116     typedef union { Tv v; Ts d[2]; } Tu;
00117     Tv v;
00118 
00119     svec () {}
00120     svec (const svec &b) : v(b.v) {}
00121     svec (const Tv &b) : v(b) {}
00122     svec (const Ts &val) : v(_mm_set1_epi64x(val)) {}
00123     svec (const Ts &val1, const Ts &val2)
00124       : v(_mm_set_epi64x(val2,val1)) {}
00125 
00126     const svec &operator= (const Ts &val)
00127       { v=_mm_set1_epi64x(val); return *this; }
00128     const svec &operator= (const svec &b)
00129       { v=b.v; return *this; }
00130 
00131     int operator[] (int p) const
00132       { Tu u; u.v=v; return u.d[p]; }
00133     void set (int p, int val)
00134       { Tu u; u.v=v; u.d[p]=val; v=u.v; }
00135 
00136     const svec &operator+= (const svec &b)
00137       { v=_mm_add_epi64(v,b.v); return *this; }
00138     const svec &operator-= (const svec &b)
00139       { v=_mm_sub_epi64(v,b.v); return *this; }
00140     svec operator+ (const svec &b) const
00141       { return svec(_mm_add_epi64(v,b.v)); }
00142     svec operator- (const svec &b) const
00143       { return svec(_mm_sub_epi64(v,b.v)); }
00144 
00145     const svec &operator&= (const svec &b)
00146       { v=_mm_and_si128(v,b.v); return *this; }
00147     const svec &operator|= (const svec &b)
00148       { v=_mm_or_si128(v,b.v); return *this; }
00149     const svec &operator^= (const svec &b)
00150       { v=_mm_xor_si128(v,b.v); return *this; }
00151     svec operator& (const svec &b) const
00152       { return svec(_mm_and_si128(v,b.v)); }
00153     svec operator| (const svec &b) const
00154       { return svec(_mm_or_si128(v,b.v)); }
00155     svec operator^ (const svec &b) const
00156       { return svec(_mm_xor_si128(v,b.v)); }
00157     svec andnot (const svec &b) const
00158       { return svec(_mm_andnot_si128(v,b.v)); }
00159 
00160     const svec &operator<<= (int b)
00161       { v=_mm_slli_epi64(v,b); return *this; }
00162     svec operator<< (int b) const
00163       { return svec(_mm_slli_epi64(v,b)); }
00164   };
00165 
00166 typedef svec<long long,2> V2di;
00167 #endif
00168 
00169 template<> class svec<float, 4>
00170   {
00171   public:
00172     typedef float Ts;
00173     typedef __m128 Tv;
00174     typedef union { Tv v; Ts d[4]; } Tu;
00175     Tv v;
00176 
00177     svec () {}
00178     svec (const svec &b) : v(b.v) {}
00179     svec (const Tv &b) : v(b) {}
00180     svec (const Ts &val) : v(_mm_set1_ps(val)) {}
00181     svec (Ts val1, Ts val2, Ts val3, Ts val4)
00182       : v(_mm_set_ps(val4,val3,val2,val1)) {}
00183     explicit svec (const svec<int,4> &b) : v(_mm_cvtepi32_ps(b.v)) {}
00184 
00185     operator svec<int,4>() const
00186       { return svec<int,4> (_mm_cvtps_epi32(v)); }
00187     const svec &operator= (const Ts &val)
00188       { v=_mm_set1_ps(val); return *this; }
00189     const svec &operator= (const svec &b)
00190       { v=b.v; return *this; }
00191 
00192     Ts operator[] (int p) const
00193       { Tu u; u.v=v; return u.d[p]; }
00194     void set (int p, Ts val)
00195       { Tu u; u.v=v; u.d[p]=val; v=u.v; }
00196 
00197     const svec &operator+= (const svec &b)
00198       { v=_mm_add_ps(v,b.v); return *this; }
00199     const svec &operator-= (const svec &b)
00200       { v=_mm_sub_ps(v,b.v); return *this; }
00201     const svec &operator*= (const svec &b)
00202       { v=_mm_mul_ps(v,b.v); return *this; }
00203     const svec &operator/= (const svec &b)
00204       { v=_mm_div_ps(v,b.v); return *this; }
00205 
00206     svec operator+ (const svec &b) const
00207       { return svec(_mm_add_ps(v,b.v)); }
00208     svec operator- (const svec &b) const
00209       { return svec(_mm_sub_ps(v,b.v)); }
00210     svec operator* (const svec &b) const
00211       { return svec(_mm_mul_ps(v,b.v)); }
00212     svec operator/ (const svec &b) const
00213       { return svec(_mm_div_ps(v,b.v)); }
00214 
00215     const svec &operator&= (const svec &b)
00216       { v=_mm_and_ps(v,b.v); return *this; }
00217     const svec &operator|= (const svec &b)
00218       { v=_mm_or_ps(v,b.v); return *this; }
00219     const svec &operator^= (const svec &b)
00220       { v=_mm_xor_ps(v,b.v); return *this; }
00221     svec operator& (const svec &b) const
00222       { return svec(_mm_and_ps(v,b.v)); }
00223     svec andnot (const svec &b) const
00224       { return svec(_mm_andnot_ps(v,b.v)); }
00225     svec operator| (const svec &b) const
00226       { return svec(_mm_or_ps(v,b.v)); }
00227     svec operator^ (const svec &b) const
00228       { return svec(_mm_xor_ps(v,b.v)); }
00229 
00230     svec operator- () const
00231       { return svec(_mm_xor_ps(_mm_set1_ps(-0.),v)); }
00232 
00233     svec eq (const svec &b) const
00234       { return svec(_mm_cmpeq_ps(v,b.v)); }
00235     svec neq (const svec &b) const
00236       { return svec(_mm_cmpneq_ps(v,b.v)); }
00237     svec lt (const svec &b) const
00238       { return svec(_mm_cmplt_ps(v,b.v)); }
00239     svec le (const svec &b) const
00240       { return svec(_mm_cmple_ps(v,b.v)); }
00241     svec gt (const svec &b) const
00242       { return svec(_mm_cmpgt_ps(v,b.v)); }
00243     svec ge (const svec &b) const
00244       { return svec(_mm_cmpge_ps(v,b.v)); }
00245 
00246     void writeTo (Ts *val) const
00247       { _mm_storeu_ps (val, v); }
00248     void writeTo (Ts &a, Ts &b, Ts &c, Ts &d) const
00249       { Tu u; u.v=v; a=u.d[0]; b=u.d[1]; c=u.d[2]; d=u.d[3]; }
00250     void readFrom (const Ts *val)
00251       { v=_mm_loadu_ps(val); }
00252     void readFrom (Ts a, Ts b, Ts c, Ts d)
00253       { v=_mm_set_ps(d,c,b,a); }
00254   };
00255 
00256 typedef svec<float,4> V4sf;
00257 
00258 inline V4sf sqrt(const V4sf &v)
00259   { return V4sf(_mm_sqrt_ps(v.v)); }
00260 inline V4sf abs(const V4sf &v)
00261   { return V4sf(_mm_andnot_ps(_mm_set1_ps(-0.),v.v)); }
00262 inline V4sf blend(const V4sf &mask, const V4sf &a, const V4sf &b)
00263   { return (mask&a)|(mask.andnot(b)); }
00264 inline bool any (const V4sf &a)
00265   { return _mm_movemask_ps(a.v)!=0; }
00266 inline bool all (const V4sf &a)
00267   { return _mm_movemask_ps(a.v)==15; }
00268 inline bool none (const V4sf &a)
00269   { return _mm_movemask_ps(a.v)==0; }
00270 inline V4sf min (const V4sf &a, const V4sf &b)
00271   { return _mm_min_ps(a.v,b.v); }
00272 inline V4sf max (const V4sf &a, const V4sf &b)
00273   { return _mm_max_ps(a.v,b.v); }
00274 
00275 template<> class svec<double, 2>
00276   {
00277   public:
00278     typedef double Ts;
00279     typedef __m128d Tv;
00280     typedef union { Tv v; Ts d[2]; } Tu;
00281     Tv v;
00282 
00283     svec () {}
00284     svec (const svec &b) : v(b.v) {}
00285     svec (const Tv &b) : v(b) {}
00286     svec (const Ts &val) : v(_mm_set1_pd(val)) {}
00287     svec (const Ts &val1, const Ts &val2)
00288       : v(_mm_set_pd(val2,val1)) {}
00289     explicit svec (const svec<int,4> &b) : v(_mm_cvtepi32_pd(b.v)) {}
00290 
00291     operator svec<int,4>() const
00292       { return svec<int,4> (_mm_cvtpd_epi32(v)); }
00293     const svec &operator= (const Ts &val)
00294       { v=_mm_set1_pd(val); return *this; }
00295     const svec &operator= (const svec &b)
00296       { v=b.v; return *this; }
00297 
00298     Ts operator[] (int p) const
00299       { Tu u; u.v=v; return u.d[p]; }
00300     void set (int p, Ts val)
00301       { Tu u; u.v=v; u.d[p]=val; v=u.v; }
00302 
00303     const svec &operator+= (const svec &b)
00304       { v=_mm_add_pd(v,b.v); return *this; }
00305     const svec &operator-= (const svec &b)
00306       { v=_mm_sub_pd(v,b.v); return *this; }
00307     const svec &operator*= (const svec &b)
00308       { v=_mm_mul_pd(v,b.v); return *this; }
00309     const svec &operator/= (const svec &b)
00310       { v=_mm_div_pd(v,b.v); return *this; }
00311 
00312     svec operator+ (const svec &b) const
00313       { return svec(_mm_add_pd(v,b.v)); }
00314     svec operator- (const svec &b) const
00315       { return svec(_mm_sub_pd(v,b.v)); }
00316     svec operator* (const svec &b) const
00317       { return svec(_mm_mul_pd(v,b.v)); }
00318     svec operator/ (const svec &b) const
00319       { return svec(_mm_div_pd(v,b.v)); }
00320 
00321     const svec &operator&= (const svec &b)
00322       { v=_mm_and_pd(v,b.v); return *this; }
00323     const svec &operator|= (const svec &b)
00324       { v=_mm_or_pd(v,b.v); return *this; }
00325     const svec &operator^= (const svec &b)
00326       { v=_mm_xor_pd(v,b.v); return *this; }
00327     svec operator& (const svec &b) const
00328       { return svec(_mm_and_pd(v,b.v)); }
00329     svec operator| (const svec &b) const
00330       { return svec(_mm_or_pd(v,b.v)); }
00331     svec operator^ (const svec &b) const
00332       { return svec(_mm_xor_pd(v,b.v)); }
00333 
00334     svec operator- () const
00335       { return svec(_mm_xor_pd(_mm_set1_pd(-0.),v)); }
00336 
00337     svec eq (const svec &b) const
00338       { return svec(_mm_cmpeq_pd(v,b.v)); }
00339     svec neq (const svec &b) const
00340       { return svec(_mm_cmpneq_pd(v,b.v)); }
00341     svec lt (const svec &b) const
00342       { return svec(_mm_cmplt_pd(v,b.v)); }
00343     svec le (const svec &b) const
00344       { return svec(_mm_cmple_pd(v,b.v)); }
00345     svec gt (const svec &b) const
00346       { return svec(_mm_cmpgt_pd(v,b.v)); }
00347     svec ge (const svec &b) const
00348       { return svec(_mm_cmpge_pd(v,b.v)); }
00349 
00350     void writeTo (Ts *val) const
00351       { _mm_storeu_pd (val, v); }
00352     void writeTo (Ts &a, Ts &b) const
00353       { _mm_store_sd(&a,v); _mm_storeh_pd(&b,v); }
00354     void readFrom (const Ts *val)
00355       { v=_mm_loadu_pd(val); }
00356     void readFrom (const Ts &a, const Ts &b)
00357       { v=_mm_set_pd(b,a); }
00358   };
00359 
00360 typedef svec<double,2> V2df;
00361 
00362 inline V2df sqrt(const V2df &v)
00363   { return V2df(_mm_sqrt_pd(v.v)); }
00364 inline V2df abs(const V2df &v)
00365   { return V2df(_mm_andnot_pd(_mm_set1_pd(-0.),v.v)); }
00366 inline V2df blend(const V2df &mask, const V2df &a, const V2df &b)
00367   { return V2df(_mm_or_pd(_mm_and_pd(a.v,mask.v),_mm_andnot_pd(mask.v,b.v))); }
00368 inline bool any (const V2df &a)
00369   { return _mm_movemask_pd(a.v)!=0; }
00370 inline bool all (const V2df &a)
00371   { return _mm_movemask_pd(a.v)==3; }
00372 inline bool none (const V2df &a)
00373   { return _mm_movemask_pd(a.v)==0; }
00374 
00375 template<typename T> inline T vcast(const V4si &a);
00376 template<typename T> inline T vcast(const V4sf &a);
00377 template<typename T> inline T vcast(const V2df &a);
00378 
00379 template<> inline V4si vcast (const V4sf &a)
00380   { return V4si (_mm_castps_si128(a.v)); }
00381 template<> inline V4sf vcast (const V4si &a)
00382   { return V4sf (_mm_castsi128_ps(a.v)); }
00383 template<> inline V2df vcast (const V4si &a)
00384   { return V2df (_mm_castsi128_pd(a.v)); }
00385 
00386 #endif
00387 
00388 #endif

Generated on Thu Oct 8 14:48:51 2015 for LevelS C++ support library