// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_PACKET_MATH_SSE_H
#define EIGEN_PACKET_MATH_SSE_H

namespace Eigen {

namespace internal {

#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif

#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
#endif

#ifdef __FMA__
#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1
#endif
#endif

#if ((defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW) && (__GXX_ABI_VERSION < 1004)) || EIGEN_OS_QNX
// With GCC's default ABI version, __m128 and __m256 are the same type, so we cannot
// have overloads for both types without a linking error.
// One solution is to increase the ABI version using -fabi-version=4 (or greater).
// Otherwise, we work around this inconvenience by wrapping 128-bit types in the following helper
// structure:
template<typename T>
struct eigen_packet_wrapper
{
  EIGEN_ALWAYS_INLINE operator T&() { return m_val; }
  EIGEN_ALWAYS_INLINE operator const T&() const { return m_val; }
  EIGEN_ALWAYS_INLINE eigen_packet_wrapper() {}
  EIGEN_ALWAYS_INLINE eigen_packet_wrapper(const T &v) : m_val(v) {}
  EIGEN_ALWAYS_INLINE eigen_packet_wrapper& operator=(const T &v) {
    m_val = v;
    return *this;
  }

  T m_val;
};
typedef eigen_packet_wrapper<__m128>  Packet4f;
typedef eigen_packet_wrapper<__m128i> Packet4i;
typedef eigen_packet_wrapper<__m128d> Packet2d;
#else
typedef __m128  Packet4f;
typedef __m128i Packet4i;
typedef __m128d Packet2d;
#endif

template<> struct is_arithmetic<__m128>  { enum { value = true }; };
template<> struct is_arithmetic<__m128i> { enum { value = true }; };
template<> struct is_arithmetic<__m128d> { enum { value = true }; };

#define vec4f_swizzle1(v,p,q,r,s) \
  (_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), ((s)<<6|(r)<<4|(q)<<2|(p)))))

#define vec4i_swizzle1(v,p,q,r,s) \
  (_mm_shuffle_epi32( v, ((s)<<6|(r)<<4|(q)<<2|(p))))

#define vec2d_swizzle1(v,p,q) \
  (_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), (((q)*2+1)<<6|((q)*2)<<4|((p)*2+1)<<2|((p)*2)))))

#define vec4f_swizzle2(a,b,p,q,r,s) \
  (_mm_shuffle_ps( (a), (b), ((s)<<6|(r)<<4|(q)<<2|(p))))

#define vec4i_swizzle2(a,b,p,q,r,s) \
  (_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), ((s)<<6|(r)<<4|(q)<<2|(p))))))
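
// The swizzle immediates pack one 2-bit source-lane index per destination lane,
// lowest lane first: (s)<<6|(r)<<4|(q)<<2|(p). For example, vec4f_swizzle1(v,0,0,0,0)
// broadcasts lane 0, while vec4f_swizzle1(v,3,2,1,0) reverses the packet.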

#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
  const Packet4f p4f_##NAME = pset1<Packet4f>(X)

#define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \
  const Packet2d p2d_##NAME = pset1<Packet2d>(X)

#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
  const Packet4f p4f_##NAME = _mm_castsi128_ps(pset1<Packet4i>(X))

#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
  const Packet4i p4i_##NAME = pset1<Packet4i>(X)

// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
// to leverage AVX instructions.
#ifndef EIGEN_VECTORIZE_AVX
template<> struct packet_traits<float> : default_packet_traits
{
  typedef Packet4f type;
  typedef Packet4f half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size=4,
    HasHalfPacket = 0,

    HasDiv = 1,
    HasSin = EIGEN_FAST_MATH,
    HasCos = EIGEN_FAST_MATH,
    HasLog = 1,
    HasExp = 1,
    HasSqrt = 1,
    HasRsqrt = 1,
    HasTanh = EIGEN_FAST_MATH,
    HasBlend = 1

#ifdef EIGEN_VECTORIZE_SSE4_1
    ,
    HasRound = 1,
    HasFloor = 1,
    HasCeil = 1
#endif
  };
};
template<> struct packet_traits<double> : default_packet_traits
{
  typedef Packet2d type;
  typedef Packet2d half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size=2,
    HasHalfPacket = 0,

    HasDiv = 1,
    HasExp = 1,
    HasSqrt = 1,
    HasRsqrt = 1,
    HasBlend = 1

#ifdef EIGEN_VECTORIZE_SSE4_1
    ,
    HasRound = 1,
    HasFloor = 1,
    HasCeil = 1
#endif
  };
};
#endif
template<> struct packet_traits<int> : default_packet_traits
{
  typedef Packet4i type;
  typedef Packet4i half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size=4,

    HasBlend = 1
  };
};

template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };

#ifndef EIGEN_VECTORIZE_AVX
template<> struct scalar_div_cost<float,true>  { enum { value = 7 }; };
template<> struct scalar_div_cost<double,true> { enum { value = 8 }; };
#endif

#if EIGEN_COMP_MSVC==1500
// Workaround for an MSVC 9 internal compiler error.
// TODO: it has been detected with win64 builds (amd64), so let's check whether it also happens in 32-bit+SSE mode.
// TODO: check whether a better fix exists, like adding a pset0() function (it crashed on pset1(0)).
template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return _mm_set_ps(from,from,from,from); }
template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set_pd(from,from); }
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set_epi32(from,from,from,from); }
#else
template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return _mm_set_ps1(from); }
template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); }
template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { return _mm_set1_epi32(from); }
#endif

// GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction.
// However, using intrinsics for pset1 makes gcc generate crappy code in some cases (see bug 203).
// Using inline assembly is not an option either, because then gcc fails to reorder the instructions properly.
// Therefore, we introduced the pload1 functions to be used in product kernels for which bug 203 does not apply.
// Also note that with AVX, we want it to generate a vbroadcastss.
#if EIGEN_COMP_GNUC_STRICT && (!defined __AVX__)
template<> EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float *from) {
  return vec4f_swizzle1(_mm_load_ss(from),0,0,0,0);
}
#endif

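// plset returns the packet [a, a+1, a+2, ...]. Note that the _mm_set_* intrinsics
// take their arguments from the highest lane down, hence _mm_set_ps(3,2,1,0).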
template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); }
template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); }
template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return _mm_add_epi32(pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); }

template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_add_epi32(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_sub_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_sub_epi32(a,b); }

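// Floating-point negation just flips the IEEE-754 sign bit by XORing with a packet
// of sign masks; two's-complement integer negation cannot be done with a single XOR,
// so the integer path computes 0 - a instead.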
template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
{
  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000));
  return _mm_xor_ps(a,mask);
}
template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a)
{
  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x80000000,0x0,0x80000000));
  return _mm_xor_pd(a,mask);
}
template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a)
{
  return psub(Packet4i(_mm_setr_epi32(0,0,0,0)), a);
}

template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }

template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_mul_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_mul_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b)
{
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_mullo_epi32(a,b);
#else
  // this version is slightly faster than 4 scalar products
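  // _mm_mul_epu32 multiplies only the even-indexed 32-bit lanes (0 and 2) into
  // 64-bit products. We therefore multiply lanes (0,2), then lanes (1,3) after a
  // swizzle, pick the low 32 bits of each product with vec4i_swizzle2, and finally
  // re-interleave the four results into lane order (0,1,2,3).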
  return vec4i_swizzle1(
            vec4i_swizzle2(
              _mm_mul_epu32(a,b),
              _mm_mul_epu32(vec4i_swizzle1(a,1,0,3,2),
                            vec4i_swizzle1(b,1,0,3,2)),
              0,2,0,2),
            0,2,1,3);
#endif
}

template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_div_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); }

// For some weird reason, this has to be overloaded for packets of integers.
template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
#ifdef __FMA__
template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); }
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); }
#endif

template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_min_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_min_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b)
{
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_min_epi32(a,b);
#else
  // after some bench, this version *is* faster than a scalar implementation
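  // _mm_cmplt_epi32 sets a lane to all-ones where a<b; the and/andnot/or sequence
  // then selects a in those lanes and b elsewhere (a branch-free lane select).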
  Packet4i mask = _mm_cmplt_epi32(a,b);
  return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
#endif
}

template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_max_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_max_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b)
{
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_max_epi32(a,b);
#else
  // after some bench, this version *is* faster than a scalar implementation
  Packet4i mask = _mm_cmpgt_epi32(a,b);
  return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
#endif
}

#ifdef EIGEN_VECTORIZE_SSE4_1
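// The rounding immediate 0 is _MM_FROUND_TO_NEAREST_INT, i.e. round to nearest,
// ties to even.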
template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return _mm_round_ps(a, 0); }
template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return _mm_round_pd(a, 0); }

template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { return _mm_ceil_ps(a); }
template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return _mm_ceil_pd(a); }

template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return _mm_floor_ps(a); }
template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return _mm_floor_pd(a); }
#endif

template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(a,b); }
template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(a,b); }
template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(a,b); }

template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }

#if EIGEN_COMP_MSVC
template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
  EIGEN_DEBUG_UNALIGNED_LOAD
#if (EIGEN_COMP_MSVC==1600)
  // NOTE Some versions of MSVC10 generate bad code when using _mm_loadu_ps
  // (i.e., they do not generate an unaligned load!!).
  __m128 res = _mm_loadl_pi(_mm_set1_ps(0.0f), (const __m64*)(from));
  res = _mm_loadh_pi(res, (const __m64*)(from+2));
  return res;
#else
  return _mm_loadu_ps(from);
#endif
}
#else
// NOTE: with the code below, MSVC's compiler crashes!

template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
{
  EIGEN_DEBUG_UNALIGNED_LOAD
  return _mm_loadu_ps(from);
}
#endif

template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
{
  EIGEN_DEBUG_UNALIGNED_LOAD
  return _mm_loadu_pd(from);
}
template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
{
  EIGEN_DEBUG_UNALIGNED_LOAD
  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
}

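// ploaddup loads size/2 scalars and duplicates each of them; e.g. for Packet4f it
// turns {a0, a1} in memory into the packet [a0, a0, a1, a1].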
template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
{
  return vec4f_swizzle1(_mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(from))), 0, 0, 1, 1);
}
template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
{ return pset1<Packet2d>(from[0]); }
template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from)
{
  Packet4i tmp;
  tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from));
  return vec4i_swizzle1(tmp, 0, 0, 1, 1);
}

template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); }
template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }

template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ps(to, from); }
template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); }

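// SSE offers no gather/scatter instructions, so strided accesses are emulated with
// scalar loads assembled into a packet, and per-lane extractions for the stores.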
template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
{
  return _mm_set_ps(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
}
template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
{
  return _mm_set_pd(from[1*stride], from[0*stride]);
}
template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
{
  return _mm_set_epi32(from[3*stride], from[2*stride], from[1*stride], from[0*stride]);
}

template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
{
  to[stride*0] = _mm_cvtss_f32(from);
  to[stride*1] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 1));
  to[stride*2] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 2));
  to[stride*3] = _mm_cvtss_f32(_mm_shuffle_ps(from, from, 3));
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
{
  to[stride*0] = _mm_cvtsd_f64(from);
  to[stride*1] = _mm_cvtsd_f64(_mm_shuffle_pd(from, from, 1));
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
{
  to[stride*0] = _mm_cvtsi128_si32(from);
  to[stride*1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
  to[stride*2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
  to[stride*3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
}

// some compilers might be tempted to perform multiple moves instead of using a vector path.
template<> EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a)
{
  Packet4f pa = _mm_set_ss(a);
  pstore(to, Packet4f(vec4f_swizzle1(pa,0,0,0,0)));
}
// some compilers might be tempted to perform multiple moves instead of using a vector path.
template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double& a)
{
  Packet2d pa = _mm_set_sd(a);
  pstore(to, Packet2d(vec2d_swizzle1(pa,0,0)));
}

#if EIGEN_COMP_PGI
typedef const void * SsePrefetchPtrType;
#else
typedef const char * SsePrefetchPtrType;
#endif

#ifndef EIGEN_VECTORIZE_AVX
template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); }
#endif

#if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64
// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010.
// Direct access to the struct members fixed bug #62.
template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { return a.m128_f32[0]; }
template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return a.m128d_f64[0]; }
template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
#elif EIGEN_COMP_MSVC_STRICT
// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010.
template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float x = _mm_cvtss_f32(a); return x; }
template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double x = _mm_cvtsd_f64(a); return x; }
template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
#else
template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { return _mm_cvtss_f32(a); }
template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return _mm_cvtsd_f64(a); }
template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { return _mm_cvtsi128_si32(a); }
#endif

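// preverse reverses the lane order. The immediate 0x1B encodes the source indices
// (3,2,1,0), read from the lowest destination lane upward.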
template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
{ return _mm_shuffle_ps(a,a,0x1B); }
template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
{ return _mm_shuffle_pd(a,a,0x1); }
template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
{ return _mm_shuffle_epi32(a,0x1B); }

template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a)
{
  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
  return _mm_and_ps(a,mask);
}
template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a)
{
  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
  return _mm_and_pd(a,mask);
}
template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a)
{
#ifdef EIGEN_VECTORIZE_SSSE3
  return _mm_abs_epi32(a);
#else
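  // Branch-free abs: aux = a >> 31 (arithmetic shift) is all-ones for negative lanes
  // and zero otherwise, so (a ^ aux) - aux negates exactly the negative lanes.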
  Packet4i aux = _mm_srai_epi32(a,31);
  return _mm_sub_epi32(_mm_xor_si128(a,aux),aux);
#endif
}

// with AVX, the default implementations based on pload1 are faster
#ifndef __AVX__
template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet4f>(const float *a,
                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
{
  a3 = pload<Packet4f>(a);
  a0 = vec4f_swizzle1(a3, 0,0,0,0);
  a1 = vec4f_swizzle1(a3, 1,1,1,1);
  a2 = vec4f_swizzle1(a3, 2,2,2,2);
  a3 = vec4f_swizzle1(a3, 3,3,3,3);
}
template<> EIGEN_STRONG_INLINE void
pbroadcast4<Packet2d>(const double *a,
                      Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
{
#ifdef EIGEN_VECTORIZE_SSE3
  a0 = _mm_loaddup_pd(a+0);
  a1 = _mm_loaddup_pd(a+1);
  a2 = _mm_loaddup_pd(a+2);
  a3 = _mm_loaddup_pd(a+3);
#else
  a1 = pload<Packet2d>(a);
  a0 = vec2d_swizzle1(a1, 0,0);
  a1 = vec2d_swizzle1(a1, 1,1);
  a3 = pload<Packet2d>(a+2);
  a2 = vec2d_swizzle1(a3, 0,0);
  a3 = vec2d_swizzle1(a3, 1,1);
#endif
}
#endif

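// Broadcasts each lane of vecs[0] into vecs[0..3]; vecs[0] is overwritten last so
// that the preceding shuffles still read its original value.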
EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs)
{
  vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55));
  vecs[2] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xAA));
  vecs[3] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xFF));
  vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00));
}

#ifdef EIGEN_VECTORIZE_SSE3
template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
{
  return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3]));
}

template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
{
  return _mm_hadd_pd(vecs[0], vecs[1]);
}

#else
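// Without SSE3, the four inputs are transposed with unpack/movelh/movehl moves and
// the resulting columns summed, yielding [sum(vecs[0]), ..., sum(vecs[3])].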
template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
{
  Packet4f tmp0, tmp1, tmp2;
  tmp0 = _mm_unpacklo_ps(vecs[0], vecs[1]);
  tmp1 = _mm_unpackhi_ps(vecs[0], vecs[1]);
  tmp2 = _mm_unpackhi_ps(vecs[2], vecs[3]);
  tmp0 = _mm_add_ps(tmp0, tmp1);
  tmp1 = _mm_unpacklo_ps(vecs[2], vecs[3]);
  tmp1 = _mm_add_ps(tmp1, tmp2);
  tmp2 = _mm_movehl_ps(tmp1, tmp0);
  tmp0 = _mm_movelh_ps(tmp0, tmp1);
  return _mm_add_ps(tmp0, tmp2);
}

template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
{
  return _mm_add_pd(_mm_unpacklo_pd(vecs[0], vecs[1]), _mm_unpackhi_pd(vecs[0], vecs[1]));
}
#endif // SSE3

template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
{
  // Disable SSE3 _mm_hadd_ps, which is extremely slow on all existing Intel architectures
  // (from Nehalem to Haswell).
  // #ifdef EIGEN_VECTORIZE_SSE3
  //   Packet4f tmp = _mm_add_ps(a, vec4f_swizzle1(a,2,3,2,3));
  //   return pfirst<Packet4f>(_mm_hadd_ps(tmp, tmp));
  // #else
  Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a));
  return pfirst<Packet4f>(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
  // #endif
}

template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
{
  // Disable SSE3 _mm_hadd_pd, which is extremely slow on all existing Intel architectures
  // (from Nehalem to Haswell).
  // #ifdef EIGEN_VECTORIZE_SSE3
  //   return pfirst<Packet2d>(_mm_hadd_pd(a, a));
  // #else
  return pfirst<Packet2d>(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
  // #endif
}

#ifdef EIGEN_VECTORIZE_SSSE3
template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
{
  return _mm_hadd_epi32(_mm_hadd_epi32(vecs[0], vecs[1]),_mm_hadd_epi32(vecs[2], vecs[3]));
}
template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
{
  Packet4i tmp0 = _mm_hadd_epi32(a,a);
  return pfirst<Packet4i>(_mm_hadd_epi32(tmp0,tmp0));
}
#else
template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
{
  Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a));
  return pfirst(tmp) + pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1));
}

template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
{
  Packet4i tmp0, tmp1, tmp2;
  tmp0 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
  tmp1 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
  tmp2 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
  tmp0 = _mm_add_epi32(tmp0, tmp1);
  tmp1 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
  tmp1 = _mm_add_epi32(tmp1, tmp2);
  tmp2 = _mm_unpacklo_epi64(tmp0, tmp1);
  tmp0 = _mm_unpackhi_epi64(tmp0, tmp1);
  return _mm_add_epi32(tmp0, tmp2);
}
#endif
// Other reduction functions:

// mul
template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
{
  Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a,a));
  return pfirst<Packet4f>(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
}
template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
{
  return pfirst<Packet2d>(_mm_mul_sd(a, _mm_unpackhi_pd(a,a)));
}
template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
{
  // after some experiments, it seems this is the fastest way to implement it
  // for GCC (e.g., reusing pmul is very slow!)
  // TODO try to call _mm_mul_epu32 directly
  EIGEN_ALIGN16 int aux[4];
  pstore(aux, a);
  return (aux[0] * aux[1]) * (aux[2] * aux[3]);
}

// min
template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
{
  Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a,a));
  return pfirst<Packet4f>(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
}
template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
{
  return pfirst<Packet2d>(_mm_min_sd(a, _mm_unpackhi_pd(a,a)));
}
template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
{
#ifdef EIGEN_VECTORIZE_SSE4_1
  Packet4i tmp = _mm_min_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
  return pfirst<Packet4i>(_mm_min_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
#else
  // after some experiments, it seems this is the fastest way to implement it
  // for GCC (e.g., it does not like using std::min after the pstore!!)
  EIGEN_ALIGN16 int aux[4];
  pstore(aux, a);
  int aux0 = aux[0]<aux[1] ? aux[0] : aux[1];
  int aux2 = aux[2]<aux[3] ? aux[2] : aux[3];
  return aux0<aux2 ? aux0 : aux2;
#endif // EIGEN_VECTORIZE_SSE4_1
}

// max
template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
{
  Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a,a));
  return pfirst<Packet4f>(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
}
template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
{
  return pfirst<Packet2d>(_mm_max_sd(a, _mm_unpackhi_pd(a,a)));
}
template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
{
#ifdef EIGEN_VECTORIZE_SSE4_1
  Packet4i tmp = _mm_max_epi32(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0,0,3,2)));
  return pfirst<Packet4i>(_mm_max_epi32(tmp,_mm_shuffle_epi32(tmp, 1)));
#else
  // after some experiments, it seems this is the fastest way to implement it
  // for GCC (e.g., it does not like using std::max after the pstore!!)
  EIGEN_ALIGN16 int aux[4];
  pstore(aux, a);
  int aux0 = aux[0]>aux[1] ? aux[0] : aux[1];
  int aux2 = aux[2]>aux[3] ? aux[2] : aux[3];
  return aux0>aux2 ? aux0 : aux2;
#endif // EIGEN_VECTORIZE_SSE4_1
}

#if EIGEN_COMP_GNUC
// template <> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
// {
//   Packet4f res = b;
//   asm("mulps %[a], %[b] \n\taddps %[c], %[b]" : [b] "+x" (res) : [a] "x" (a), [c] "x" (c));
//   return res;
// }
// EIGEN_STRONG_INLINE Packet4i _mm_alignr_epi8(const Packet4i& a, const Packet4i& b, const int i)
// {
//   Packet4i res = a;
//   asm("palignr %[i], %[a], %[b] " : [b] "+x" (res) : [a] "x" (a), [i] "i" (i));
//   return res;
// }
#endif

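// palign_impl<Offset>::run(first, second) shifts the concatenation (second:first)
// right by Offset elements, i.e. first becomes [first[Offset..], second[0..Offset-1]].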
#ifdef EIGEN_VECTORIZE_SSSE3
// SSSE3 versions
template<int Offset>
struct palign_impl<Offset,Packet4f>
{
  static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
  {
    if (Offset!=0)
      first = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(second), _mm_castps_si128(first), Offset*4));
  }
};

template<int Offset>
struct palign_impl<Offset,Packet4i>
{
  static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
  {
    if (Offset!=0)
      first = _mm_alignr_epi8(second,first, Offset*4);
  }
};

template<int Offset>
struct palign_impl<Offset,Packet2d>
{
  static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
  {
    if (Offset==1)
      first = _mm_castsi128_pd(_mm_alignr_epi8(_mm_castpd_si128(second), _mm_castpd_si128(first), 8));
  }
};
#else
// SSE2 versions
template<int Offset>
struct palign_impl<Offset,Packet4f>
{
  static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
  {
    if (Offset==1)
    {
      first = _mm_move_ss(first,second);
      first = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(first),0x39));
    }
    else if (Offset==2)
    {
      first = _mm_movehl_ps(first,first);
      first = _mm_movelh_ps(first,second);
    }
    else if (Offset==3)
    {
      first = _mm_move_ss(first,second);
      first = _mm_shuffle_ps(first,second,0x93);
    }
  }
};

template<int Offset>
struct palign_impl<Offset,Packet4i>
{
  static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
  {
    if (Offset==1)
    {
      first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
      first = _mm_shuffle_epi32(first,0x39);
    }
    else if (Offset==2)
    {
      first = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(first)));
      first = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
    }
    else if (Offset==3)
    {
      first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
      first = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second),0x93));
    }
  }
};

template<int Offset>
struct palign_impl<Offset,Packet2d>
{
  static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
  {
    if (Offset==1)
    {
      first = _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(first),_mm_castpd_ps(first)));
      first = _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(first),_mm_castpd_ps(second)));
    }
  }
};
#endif

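// In-place transposition of a block of packets, one row per packet: the 4x4 float
// case uses the _MM_TRANSPOSE4_PS macro, the others mirror it with unpack moves.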
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4f,4>& kernel) {
  _MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]);
}

EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet2d,2>& kernel) {
  __m128d tmp = _mm_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
  kernel.packet[0] = _mm_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
  kernel.packet[1] = tmp;
}

EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet4i,4>& kernel) {
  __m128i T0 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
  __m128i T1 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
  __m128i T2 = _mm_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);
  __m128i T3 = _mm_unpackhi_epi32(kernel.packet[2], kernel.packet[3]);

  kernel.packet[0] = _mm_unpacklo_epi64(T0, T1);
  kernel.packet[1] = _mm_unpackhi_epi64(T0, T1);
  kernel.packet[2] = _mm_unpacklo_epi64(T2, T3);
  kernel.packet[3] = _mm_unpackhi_epi64(T2, T3);
}

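// pblend keeps thenPacket in lanes whose selector is non-zero. The blendv intrinsics
// pick from their second operand where the mask is set, so the mask is built as
// "selector == 0" and thenPacket/elsePacket are passed in that order.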
template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
  __m128i false_mask = _mm_cmpeq_epi32(select, zero);
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_blendv_epi8(thenPacket, elsePacket, false_mask);
#else
  return _mm_or_si128(_mm_andnot_si128(false_mask, thenPacket), _mm_and_si128(false_mask, elsePacket));
#endif
}
template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
  const __m128 zero = _mm_setzero_ps();
  const __m128 select = _mm_set_ps(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
  __m128 false_mask = _mm_cmpeq_ps(select, zero);
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_blendv_ps(thenPacket, elsePacket, false_mask);
#else
  return _mm_or_ps(_mm_andnot_ps(false_mask, thenPacket), _mm_and_ps(false_mask, elsePacket));
#endif
}
template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
  const __m128d zero = _mm_setzero_pd();
  const __m128d select = _mm_set_pd(ifPacket.select[1], ifPacket.select[0]);
  __m128d false_mask = _mm_cmpeq_pd(select, zero);
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_blendv_pd(thenPacket, elsePacket, false_mask);
#else
  return _mm_or_pd(_mm_andnot_pd(false_mask, thenPacket), _mm_and_pd(false_mask, elsePacket));
#endif
}

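// pinsertfirst/pinsertlast replace the first/last lane of a with b. In the SSE4.1
// blend intrinsics, each immediate bit selects the corresponding lane from the
// second operand.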
template<> EIGEN_STRONG_INLINE Packet4f pinsertfirst(const Packet4f& a, float b)
{
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_blend_ps(a,pset1<Packet4f>(b),1);
#else
  return _mm_move_ss(a, _mm_load_ss(&b));
#endif
}

template<> EIGEN_STRONG_INLINE Packet2d pinsertfirst(const Packet2d& a, double b)
{
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_blend_pd(a,pset1<Packet2d>(b),1);
#else
  return _mm_move_sd(a, _mm_load_sd(&b));
#endif
}

template<> EIGEN_STRONG_INLINE Packet4f pinsertlast(const Packet4f& a, float b)
{
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_blend_ps(a,pset1<Packet4f>(b),(1<<3));
#else
  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x0,0x0,0x0,0xFFFFFFFF));
  return _mm_or_ps(_mm_andnot_ps(mask, a), _mm_and_ps(mask, pset1<Packet4f>(b)));
#endif
}

template<> EIGEN_STRONG_INLINE Packet2d pinsertlast(const Packet2d& a, double b)
{
#ifdef EIGEN_VECTORIZE_SSE4_1
  return _mm_blend_pd(a,pset1<Packet2d>(b),(1<<1));
#else
  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x0,0xFFFFFFFF,0xFFFFFFFF));
  return _mm_or_pd(_mm_andnot_pd(mask, a), _mm_and_pd(mask, pset1<Packet2d>(b)));
#endif
}

// Scalar path for pmadd with FMA to ensure consistency with the vectorized path.
#ifdef __FMA__
template<> EIGEN_STRONG_INLINE float pmadd(const float& a, const float& b, const float& c) {
  return ::fmaf(a,b,c);
}
template<> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, const double& c) {
  return ::fma(a,b,c);
}
#endif

} // end namespace internal

} // end namespace Eigen

#if EIGEN_COMP_PGI
// PGI++ does not define the following intrinsics in C++ mode.
static inline __m128  _mm_castpd_ps   (__m128d x) { return reinterpret_cast<__m128&>(x);  }
static inline __m128i _mm_castpd_si128(__m128d x) { return reinterpret_cast<__m128i&>(x); }
static inline __m128d _mm_castps_pd   (__m128  x) { return reinterpret_cast<__m128d&>(x); }
static inline __m128i _mm_castps_si128(__m128  x) { return reinterpret_cast<__m128i&>(x); }
static inline __m128  _mm_castsi128_ps(__m128i x) { return reinterpret_cast<__m128&>(x);  }
static inline __m128d _mm_castsi128_pd(__m128i x) { return reinterpret_cast<__m128d&>(x); }
#endif

#endif // EIGEN_PACKET_MATH_SSE_H