| 1 | /*************************************************************************** |
| 2 | * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * |
| 3 | * Martin Renou * |
| 4 | * Copyright (c) QuantStack * |
| 5 | * Copyright (c) Serge Guelton * |
| 6 | * * |
| 7 | * Distributed under the terms of the BSD 3-Clause License. * |
| 8 | * * |
| 9 | * The full license is in the file LICENSE, distributed with this software. * |
| 10 | ****************************************************************************/ |
| 11 | |
| 12 | #ifndef XSIMD_CPUID_HPP |
| 13 | #define XSIMD_CPUID_HPP |
| 14 | |
| 15 | #include <algorithm> |
| 16 | #include <cstring> |
| 17 | |
| 18 | #if defined(__linux__) && (defined(__ARM_NEON) || defined(_M_ARM)) |
| 19 | #include <asm/hwcap.h> |
| 20 | #include <sys/auxv.h> |
| 21 | #endif |
| 22 | |
| 23 | #if defined(_MSC_VER) |
| 24 | // Contains the definition of __cpuidex |
| 25 | #include <intrin.h> |
| 26 | #endif |
| 27 | |
| 28 | #include "../types/xsimd_all_registers.hpp" |
| 29 | |
| 30 | namespace xsimd |
| 31 | { |
| 32 | namespace detail |
| 33 | { |
| 34 | struct supported_arch |
| 35 | { |
| 36 | unsigned sse2 : 1; |
| 37 | unsigned sse3 : 1; |
| 38 | unsigned ssse3 : 1; |
| 39 | unsigned sse4_1 : 1; |
| 40 | unsigned sse4_2 : 1; |
| 41 | unsigned sse4a : 1; |
| 42 | unsigned fma3_sse : 1; |
| 43 | unsigned fma4 : 1; |
| 44 | unsigned xop : 1; |
| 45 | unsigned avx : 1; |
| 46 | unsigned fma3_avx : 1; |
| 47 | unsigned avx2 : 1; |
| 48 | unsigned fma3_avx2 : 1; |
| 49 | unsigned avx512f : 1; |
| 50 | unsigned avx512cd : 1; |
| 51 | unsigned avx512dq : 1; |
| 52 | unsigned avx512bw : 1; |
| 53 | unsigned neon : 1; |
| 54 | unsigned neon64 : 1; |
| 55 | |
| 56 | // version number of the best arch available |
| 57 | unsigned best; |
| 58 | |
| 59 | supported_arch() noexcept |
| 60 | { |
| 61 | memset(s: this, c: 0, n: sizeof(supported_arch)); |
| 62 | |
| 63 | #if defined(__aarch64__) || defined(_M_ARM64) |
| 64 | neon = 1; |
| 65 | neon64 = 1; |
| 66 | best = neon64::version(); |
| 67 | #elif defined(__ARM_NEON) || defined(_M_ARM) |
| 68 | #if defined(__linux__) |
| 69 | neon = bool(getauxval(AT_HWCAP) & HWCAP_NEON); |
| 70 | #else |
| 71 | // that's very conservative :-/ |
| 72 | neon = 0; |
| 73 | #endif |
| 74 | neon64 = 0; |
| 75 | best = neon::version() * neon; |
| 76 | |
| 77 | #elif defined(__x86_64__) || defined(__i386__) || defined(_M_AMD64) || defined(_M_IX86) |
| 78 | auto get_cpuid = [](int reg[4], int func_id) noexcept |
| 79 | { |
| 80 | |
| 81 | #if defined(_MSC_VER) |
| 82 | __cpuidex(reg, func_id, 0); |
| 83 | |
| 84 | #elif defined(__INTEL_COMPILER) |
| 85 | __cpuid(reg, func_id); |
| 86 | |
| 87 | #elif defined(__GNUC__) || defined(__clang__) |
| 88 | |
| 89 | #if defined(__i386__) && defined(__PIC__) |
| 90 | // %ebx may be the PIC register |
| 91 | __asm__("xchg{l}\t{%%}ebx, %1\n\t" |
| 92 | "cpuid\n\t" |
| 93 | "xchg{l}\t{%%}ebx, %1\n\t" |
| 94 | : "=a" (reg[0]), "=r" (reg[1]), "=c" (reg[2]), |
| 95 | "=d" (reg[3]) |
| 96 | : "a" (func_id), "c" (0)); |
| 97 | |
| 98 | #else |
| 99 | __asm__("cpuid\n\t" |
| 100 | : "=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), |
| 101 | "=d" (reg[3]) |
| 102 | : "a" (func_id), "c" (0)); |
| 103 | #endif |
| 104 | |
| 105 | #else |
| 106 | #error "Unsupported configuration" |
| 107 | #endif |
| 108 | }; |
| 109 | |
| 110 | int regs[4]; |
| 111 | |
| 112 | get_cpuid(regs, 0x1); |
| 113 | |
| 114 | sse2 = regs[3] >> 26 & 1; |
| 115 | best = std::max(best, sse2::version() * sse2); |
| 116 | |
| 117 | sse3 = regs[2] >> 0 & 1; |
| 118 | best = std::max(best, sse3::version() * sse3); |
| 119 | |
| 120 | ssse3 = regs[2] >> 9 & 1; |
| 121 | best = std::max(best, ssse3::version() * ssse3); |
| 122 | |
| 123 | sse4_1 = regs[2] >> 19 & 1; |
| 124 | best = std::max(best, sse4_1::version() * sse4_1); |
| 125 | |
| 126 | sse4_2 = regs[2] >> 20 & 1; |
| 127 | best = std::max(best, sse4_2::version() * sse4_2); |
| 128 | |
| 129 | fma3_sse = regs[2] >> 12 & 1; |
| 130 | if (sse4_2) |
| 131 | best = std::max(best, fma3<xsimd::sse4_2>::version() * fma3_sse); |
| 132 | |
| 133 | get_cpuid(regs, 0x80000001); |
| 134 | fma4 = regs[2] >> 16 & 1; |
| 135 | best = std::max(best, fma4::version() * fma4); |
| 136 | |
| 137 | // sse4a = regs[2] >> 6 & 1; |
| 138 | // best = std::max(best, XSIMD_X86_AMD_SSE4A_VERSION * sse4a); |
| 139 | |
| 140 | // xop = regs[2] >> 11 & 1; |
| 141 | // best = std::max(best, XSIMD_X86_AMD_XOP_VERSION * xop); |
| 142 | |
| 143 | avx = regs[2] >> 28 & 1; |
| 144 | best = std::max(best, avx::version() * avx); |
| 145 | |
| 146 | fma3_avx = avx && fma3_sse; |
| 147 | best = std::max(best, fma3<xsimd::avx>::version() * fma3_avx); |
| 148 | |
| 149 | get_cpuid(regs, 0x7); |
| 150 | avx2 = regs[1] >> 5 & 1; |
| 151 | best = std::max(best, avx2::version() * avx2); |
| 152 | |
| 153 | fma3_avx2 = avx2 && fma3_sse; |
| 154 | best = std::max(best, fma3<xsimd::avx2>::version() * fma3_avx2); |
| 155 | |
| 156 | avx512f = regs[1] >> 16 & 1; |
| 157 | best = std::max(best, avx512f::version() * avx512f); |
| 158 | |
| 159 | avx512cd = regs[1] >> 28 & 1; |
| 160 | best = std::max(best, avx512cd::version() * avx512cd * avx512f); |
| 161 | |
| 162 | avx512dq = regs[1] >> 17 & 1; |
| 163 | best = std::max(best, avx512dq::version() * avx512dq * avx512cd * avx512f); |
| 164 | |
| 165 | avx512bw = regs[1] >> 30 & 1; |
| 166 | best = std::max(best, avx512bw::version() * avx512bw * avx512dq * avx512cd * avx512f); |
| 167 | |
| 168 | #endif |
| 169 | } |
| 170 | }; |
| 171 | } |
| 172 | |
| 173 | inline detail::supported_arch available_architectures() noexcept |
| 174 | { |
| 175 | static detail::supported_arch supported; |
| 176 | return supported; |
| 177 | } |
| 178 | } |
| 179 | |
| 180 | #endif |
| 181 | |