/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_GENERIC_MEMORY_HPP
#define XSIMD_GENERIC_MEMORY_HPP

#include <algorithm>
#include <cassert>
#include <complex>
#include <stdexcept>

#include "../../types/xsimd_batch_constant.hpp"
#include "./xsimd_generic_details.hpp"

namespace xsimd
{
    template <class batch_type, typename batch_type::value_type... Values>
    struct batch_constant;

    namespace kernel
    {

        using namespace types;

        // extract_pair
        template <class A, class T>
        inline batch<T, A> extract_pair(batch<T, A> const& self, batch<T, A> const& other, std::size_t i, requires_arch<generic>) noexcept
        {
            constexpr std::size_t size = batch<T, A>::size;
            assert(i < size && "index in bounds");

            alignas(A::alignment()) T self_buffer[size];
            self.store_aligned(self_buffer);

            alignas(A::alignment()) T other_buffer[size];
            other.store_aligned(other_buffer);

            alignas(A::alignment()) T concat_buffer[size];

            for (std::size_t j = 0; j < (size - i); ++j)
            {
                concat_buffer[j] = other_buffer[i + j];
                if (j < i)
                {
                    concat_buffer[size - 1 - j] = self_buffer[i - 1 - j];
                }
            }
            return batch<T, A>::load_aligned(concat_buffer);
        }
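
        // Illustrative note (not part of the kernel API, lane values are hypothetical):
        // for four-lane batches, extract_pair({a0, a1, a2, a3}, {b0, b1, b2, b3}, 1)
        // yields {b1, b2, b3, a0}, i.e. the top (size - i) lanes of `other` followed by
        // the low i lanes of `self`, assembled through the aligned scratch buffers.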

        // gather
        namespace detail
        {
            template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N == 0, int>::type = 0>
            inline batch<T, A> gather(U const* src, batch<V, A> const& index,
                                      ::xsimd::index<N> I) noexcept
            {
                return insert(batch<T, A> {}, static_cast<T>(src[index.get(I)]), I);
            }

            template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N != 0, int>::type = 0>
            inline batch<T, A>
            gather(U const* src, batch<V, A> const& index, ::xsimd::index<N> I) noexcept
            {
                static_assert(N <= batch<V, A>::size, "Incorrect value in recursion!");

                const auto test = gather<N - 1, T, A>(src, index, {});
                return insert(test, static_cast<T>(src[index.get(I)]), I);
            }
        } // namespace detail
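
        // Informal sketch of the recursion above (hypothetical 4-lane batch):
        // detail::gather<3>(src, index, {}) unrolls into a chain of inserts,
        //   insert(insert(insert(insert(batch{}, src[index[0]], 0),
        //                 src[index[1]], 1), src[index[2]], 2), src[index[3]], 3)
        // so lane k of the result is src[index[k]], converted to T.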

        template <typename T, typename A, typename V>
        inline batch<T, A>
        gather(batch<T, A> const&, T const* src, batch<V, A> const& index,
               kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Index and destination sizes must match");

            return detail::gather<batch<V, A>::size - 1, T, A>(src, index, {});
        }

        // Gather with runtime indexes and mismatched strides.
        template <typename T, typename A, typename U, typename V>
        inline detail::sizes_mismatch_t<T, U, batch<T, A>>
        gather(batch<T, A> const&, U const* src, batch<V, A> const& index,
               kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Index and destination sizes must match");

            return detail::gather<batch<V, A>::size - 1, T, A>(src, index, {});
        }

        // Gather with runtime indexes and matching strides.
        template <typename T, typename A, typename U, typename V>
        inline detail::stride_match_t<T, U, batch<T, A>>
        gather(batch<T, A> const&, U const* src, batch<V, A> const& index,
               kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Index and destination sizes must match");

            return batch_cast<T>(kernel::gather(batch<U, A> {}, src, index, A {}));
        }
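
        // Usage sketch (hypothetical caller-side code, not part of this header):
        //   xsimd::batch<int32_t> idx = { 3, 0, 2, 1 };
        //   auto v = xsimd::batch<float>::gather(data, idx); // v[k] == data[idx[k]]
        // The three overloads above only differ in how the source element type U
        // relates to the destination type T (same type, different width, or same
        // width but different type), as selected by the detail traits.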

        // insert
        template <class A, class T, size_t I>
        inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept
        {
            struct index_mask
            {
                static constexpr bool get(size_t index, size_t /*size*/)
                {
                    return index != I;
                }
            };
            batch<T, A> tmp(val);
            return select(make_batch_bool_constant<batch<T, A>, index_mask>(), self, tmp);
        }
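
        // Note: the generic insert builds a compile-time mask that is true for every
        // lane except I, then blends `self` with a broadcast of `val`; e.g.
        // insert({a0, a1, a2, a3}, v, index<2>{}) gives {a0, a1, v, a3}
        // (illustrative lane values).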

        // get
        template <class A, size_t I, class T>
        inline T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<generic>) noexcept
        {
            alignas(A::alignment()) T buffer[batch<T, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[I];
        }

        template <class A, size_t I, class T>
        inline T get(batch_bool<T, A> const& self, ::xsimd::index<I>, requires_arch<generic>) noexcept
        {
            alignas(A::alignment()) T buffer[batch_bool<T, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[I];
        }

        template <class A, size_t I, class T>
        inline auto get(batch<std::complex<T>, A> const& self, ::xsimd::index<I>, requires_arch<generic>) noexcept -> typename batch<std::complex<T>, A>::value_type
        {
            using T2 = typename batch<std::complex<T>, A>::value_type;
            alignas(A::alignment()) T2 buffer[batch<std::complex<T>, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[I];
        }

        template <class A, class T>
        inline T get(batch<T, A> const& self, std::size_t i, requires_arch<generic>) noexcept
        {
            alignas(A::alignment()) T buffer[batch<T, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[i];
        }

        template <class A, class T>
        inline T get(batch_bool<T, A> const& self, std::size_t i, requires_arch<generic>) noexcept
        {
            alignas(A::alignment()) bool buffer[batch_bool<T, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[i];
        }

        template <class A, class T>
        inline auto get(batch<std::complex<T>, A> const& self, std::size_t i, requires_arch<generic>) noexcept -> typename batch<std::complex<T>, A>::value_type
        {
            using T2 = typename batch<std::complex<T>, A>::value_type;
            alignas(A::alignment()) T2 buffer[batch<std::complex<T>, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[i];
        }
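
        // These generic accessors spill the whole batch to an aligned stack buffer and
        // read one element back; architecture-specific kernels may override them with
        // direct lane-extraction instructions where available.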

        // load_aligned
        namespace detail
        {
            template <class A, class T_in, class T_out>
            inline batch<T_out, A> load_aligned(T_in const* mem, convert<T_out>, requires_arch<generic>, with_fast_conversion) noexcept
            {
                using batch_type_in = batch<T_in, A>;
                using batch_type_out = batch<T_out, A>;
                return fast_cast(batch_type_in::load_aligned(mem), batch_type_out(), A {});
            }
            template <class A, class T_in, class T_out>
            inline batch<T_out, A> load_aligned(T_in const* mem, convert<T_out>, requires_arch<generic>, with_slow_conversion) noexcept
            {
                static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct load for this type combination");
                using batch_type_out = batch<T_out, A>;
                alignas(A::alignment()) T_out buffer[batch_type_out::size];
                std::copy(mem, mem + batch_type_out::size, std::begin(buffer));
                return batch_type_out::load_aligned(buffer);
            }
        }
        template <class A, class T_in, class T_out>
        inline batch<T_out, A> load_aligned(T_in const* mem, convert<T_out> cvt, requires_arch<generic>) noexcept
        {
            return detail::load_aligned<A>(mem, cvt, A {}, detail::conversion_type<A, T_in, T_out> {});
        }
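
        // Dispatch note: conversion_type<A, T_in, T_out> tags the type pair either as
        // with_fast_conversion (a vectorized fast_cast exists, e.g. int32 -> float on
        // most architectures) or with_slow_conversion (elements are converted one by
        // one through std::copy into an aligned T_out buffer before the final load).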

        // load_unaligned
        namespace detail
        {
            template <class A, class T_in, class T_out>
            inline batch<T_out, A> load_unaligned(T_in const* mem, convert<T_out>, requires_arch<generic>, with_fast_conversion) noexcept
            {
                using batch_type_in = batch<T_in, A>;
                using batch_type_out = batch<T_out, A>;
                return fast_cast(batch_type_in::load_unaligned(mem), batch_type_out(), A {});
            }

            template <class A, class T_in, class T_out>
            inline batch<T_out, A> load_unaligned(T_in const* mem, convert<T_out> cvt, requires_arch<generic>, with_slow_conversion) noexcept
            {
                static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct load for this type combination");
                return load_aligned<A>(mem, cvt, generic {}, with_slow_conversion {});
            }
        }
        template <class A, class T_in, class T_out>
        inline batch<T_out, A> load_unaligned(T_in const* mem, convert<T_out> cvt, requires_arch<generic>) noexcept
        {
            return detail::load_unaligned<A>(mem, cvt, generic {}, detail::conversion_type<A, T_in, T_out> {});
        }
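
        // The slow unaligned path can safely reuse the aligned implementation: that
        // version only reads `mem` element by element through std::copy, so no
        // alignment requirement is placed on the source pointer.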

        namespace detail
        {
            // Scatter with runtime indexes.
            template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N == 0, int>::type = 0>
            inline void scatter(batch<T, A> const& src, U* dst,
                                batch<V, A> const& index,
                                ::xsimd::index<N> I) noexcept
            {
                dst[index.get(I)] = static_cast<U>(src.get(I));
            }

            template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N != 0, int>::type = 0>
            inline void
            scatter(batch<T, A> const& src, U* dst, batch<V, A> const& index,
                    ::xsimd::index<N> I) noexcept
            {
                static_assert(N <= batch<V, A>::size, "Incorrect value in recursion!");

                kernel::detail::scatter<N - 1, T, A, U, V>(
                    src, dst, index, {});
                dst[index.get(I)] = static_cast<U>(src.get(I));
            }
        } // namespace detail

        template <typename A, typename T, typename V>
        inline void
        scatter(batch<T, A> const& src, T* dst,
                batch<V, A> const& index,
                kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Source and index sizes must match");
            kernel::detail::scatter<batch<V, A>::size - 1, T, A, T, V>(
                src, dst, index, {});
        }

        template <typename A, typename T, typename U, typename V>
        inline detail::sizes_mismatch_t<T, U, void>
        scatter(batch<T, A> const& src, U* dst,
                batch<V, A> const& index,
                kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Source and index sizes must match");
            kernel::detail::scatter<batch<V, A>::size - 1, T, A, U, V>(
                src, dst, index, {});
        }

        template <typename A, typename T, typename U, typename V>
        inline detail::stride_match_t<T, U, void>
        scatter(batch<T, A> const& src, U* dst,
                batch<V, A> const& index,
                kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Source and index sizes must match");
            const auto tmp = batch_cast<U>(src);
            kernel::scatter<A>(tmp, dst, index, A {});
        }
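
        // Usage sketch (hypothetical caller-side code): with
        //   xsimd::batch<int32_t> idx = { 3, 0, 2, 1 };
        //   v.scatter(data, idx);
        // lane k of `v` is written to data[idx[k]]. As with gather, the overloads only
        // differ in whether the destination element type matches T, shares its width,
        // or requires the element-wise fallback. Duplicate indexes leave the target
        // element holding one of the candidate lane values.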

        // store
        template <class T, class A>
        inline void store(batch_bool<T, A> const& self, bool* mem, requires_arch<generic>) noexcept
        {
            using batch_type = batch<T, A>;
            constexpr auto size = batch_bool<T, A>::size;
            alignas(A::alignment()) T buffer[size];
            kernel::store_aligned<A>(&buffer[0], batch_type(self), A {});
            for (std::size_t i = 0; i < size; ++i)
                mem[i] = bool(buffer[i]);
        }
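
        // The boolean store goes through a batch<T, A> of zero / non-zero lane values
        // because batch_bool has no byte-addressable storage of its own; each lane is
        // then narrowed to a plain bool in the destination array.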

        // store_aligned
        template <class A, class T_in, class T_out>
        inline void store_aligned(T_out* mem, batch<T_in, A> const& self, requires_arch<generic>) noexcept
        {
            static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct store for this type combination");
            alignas(A::alignment()) T_in buffer[batch<T_in, A>::size];
            store_aligned(&buffer[0], self);
            std::copy(std::begin(buffer), std::end(buffer), mem);
        }

        // store_unaligned
        template <class A, class T_in, class T_out>
        inline void store_unaligned(T_out* mem, batch<T_in, A> const& self, requires_arch<generic>) noexcept
        {
            static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct store for this type combination");
            return store_aligned<A>(mem, self, generic {});
        }

        // swizzle
        template <class A, class T, class ITy, ITy... Vs>
        inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& self, batch_constant<batch<ITy, A>, Vs...> mask, requires_arch<generic>) noexcept
        {
            return { swizzle(self.real(), mask), swizzle(self.imag(), mask) };
        }
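
        // A complex batch is held as separate real and imaginary registers, so a lane
        // permutation can simply be applied to each half with the same compile-time
        // mask; e.g. a mask of <1, 0, 3, 2> swaps adjacent complex lanes.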

        namespace detail
        {
            template <class A, class T>
            inline batch<std::complex<T>, A> load_complex(batch<T, A> const& /*hi*/, batch<T, A> const& /*lo*/, requires_arch<generic>) noexcept
            {
                static_assert(std::is_same<T, void>::value, "load_complex not implemented for the required architecture");
            }

            template <class A, class T>
            inline batch<T, A> complex_high(batch<std::complex<T>, A> const& /*src*/, requires_arch<generic>) noexcept
            {
                static_assert(std::is_same<T, void>::value, "complex_high not implemented for the required architecture");
            }

            template <class A, class T>
            inline batch<T, A> complex_low(batch<std::complex<T>, A> const& /*src*/, requires_arch<generic>) noexcept
            {
                static_assert(std::is_same<T, void>::value, "complex_low not implemented for the required architecture");
            }
        }
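
        // These three hooks are deliberately unimplemented here: each concrete
        // architecture is expected to provide its own deinterleave (load_complex) and
        // interleave (complex_high / complex_low) kernels, and the static_assert only
        // fires if a target forgets to do so.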

        // load_complex_aligned
        template <class A, class T_out, class T_in>
        inline batch<std::complex<T_out>, A> load_complex_aligned(std::complex<T_in> const* mem, convert<std::complex<T_out>>, requires_arch<generic>) noexcept
        {
            using real_batch = batch<T_out, A>;
            T_in const* buffer = reinterpret_cast<T_in const*>(mem);
            real_batch hi = real_batch::load_aligned(buffer),
                       lo = real_batch::load_aligned(buffer + real_batch::size);
            return detail::load_complex(hi, lo, A {});
        }

        // load_complex_unaligned
        template <class A, class T_out, class T_in>
        inline batch<std::complex<T_out>, A> load_complex_unaligned(std::complex<T_in> const* mem, convert<std::complex<T_out>>, requires_arch<generic>) noexcept
        {
            using real_batch = batch<T_out, A>;
            T_in const* buffer = reinterpret_cast<T_in const*>(mem);
            real_batch hi = real_batch::load_unaligned(buffer),
                       lo = real_batch::load_unaligned(buffer + real_batch::size);
            return detail::load_complex(hi, lo, A {});
        }
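
        // Memory layout sketch: an array of std::complex<T> is a flat sequence
        // re0, im0, re1, im1, ... The two real_batch loads therefore pick up the first
        // and second halves of that interleaved run, and detail::load_complex
        // deinterleaves them into the separate real and imaginary registers of the
        // complex batch.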

        // store_complex_aligned
        template <class A, class T_out, class T_in>
        inline void store_complex_aligned(std::complex<T_out>* dst, batch<std::complex<T_in>, A> const& src, requires_arch<generic>) noexcept
        {
            using real_batch = batch<T_in, A>;
            real_batch hi = detail::complex_high(src, A {});
            real_batch lo = detail::complex_low(src, A {});
            T_out* buffer = reinterpret_cast<T_out*>(dst);
            lo.store_aligned(buffer);
            hi.store_aligned(buffer + real_batch::size);
        }

        // store_complex_unaligned
        template <class A, class T_out, class T_in>
        inline void store_complex_unaligned(std::complex<T_out>* dst, batch<std::complex<T_in>, A> const& src, requires_arch<generic>) noexcept
        {
            using real_batch = batch<T_in, A>;
            real_batch hi = detail::complex_high(src, A {});
            real_batch lo = detail::complex_low(src, A {});
            T_out* buffer = reinterpret_cast<T_out*>(dst);
            lo.store_unaligned(buffer);
            hi.store_unaligned(buffer + real_batch::size);
        }
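
        // Mirror of the loads above: complex_low / complex_high ask the architecture
        // to re-interleave the real and imaginary registers into (re, im) pairs, which
        // are then written back as two consecutive real stores covering the whole
        // destination range.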

    }

}

#endif