diff --git a/include/xsimd/config/xsimd_config.hpp b/include/xsimd/config/xsimd_config.hpp index 58ce53462..aead21c45 100644 --- a/include/xsimd/config/xsimd_config.hpp +++ b/include/xsimd/config/xsimd_config.hpp @@ -33,6 +33,17 @@ * @defgroup xsimd_config_macro Instruction Set Detection */ +/** + * @ingroup xsimd_config_macro + * + * Set to 1 if the target is the x86 architecture family. + */ +#if defined(__x86_64__) || defined(__i386__) || defined(_M_AMD64) || defined(_M_IX86) +#define XSIMD_TARGET_X86 1 +#else +#define XSIMD_TARGET_X86 0 +#endif + /** * @ingroup xsimd_config_macro * diff --git a/include/xsimd/config/xsimd_cpuid.hpp b/include/xsimd/config/xsimd_cpuid.hpp index 9805bf087..461eaded5 100644 --- a/include/xsimd/config/xsimd_cpuid.hpp +++ b/include/xsimd/config/xsimd_cpuid.hpp @@ -12,8 +12,9 @@ #ifndef XSIMD_CPUID_HPP #define XSIMD_CPUID_HPP -#include -#include +#include "../types/xsimd_all_registers.hpp" +#include "../xsimd_cpu_features_x86.hpp" +#include "xsimd_inline.hpp" #if defined(__linux__) && (defined(__ARM_NEON) || defined(_M_ARM) || defined(__riscv_vector)) #include @@ -25,13 +26,6 @@ #endif -#if defined(_MSC_VER) -// Contains the definition of __cpuidex -#include -#endif - -#include "../types/xsimd_all_registers.hpp" - namespace xsimd { namespace detail @@ -40,7 +34,7 @@ namespace xsimd { #define ARCH_FIELD_EX(arch, field_name) \ - unsigned field_name; \ + unsigned field_name = 0; \ XSIMD_INLINE bool has(::xsimd::arch) const { return this->field_name; } #define ARCH_FIELD_EX_REUSE(arch, field_name) \ @@ -90,8 +84,6 @@ namespace xsimd XSIMD_INLINE supported_arch() noexcept { - memset(this, 0, sizeof(supported_arch)); - #if XSIMD_WITH_WASM wasm = 1; #endif @@ -126,138 +118,54 @@ namespace xsimd #endif rvv = bool(getauxval(AT_HWCAP) & HWCAP_V); #endif - -#elif defined(__x86_64__) || defined(__i386__) || defined(_M_AMD64) || defined(_M_IX86) - - auto get_xcr0_low = []() noexcept - { - uint32_t xcr0; - -#if defined(_MSC_VER) && _MSC_VER >= 1400 - - xcr0 = (uint32_t)_xgetbv(0); - -#elif defined(__GNUC__) - - __asm__( - "xorl %%ecx, %%ecx\n" - "xgetbv\n" - : "=a"(xcr0) - : -#if defined(__i386__) - : "ecx", "edx" -#else - : "rcx", "rdx" -#endif - ); - -#else /* _MSC_VER < 1400 */ -#error "_MSC_VER < 1400 is not supported" -#endif /* _MSC_VER && _MSC_VER >= 1400 */ - return xcr0; - }; - - auto get_cpuid = [](int reg[4], int level, int count = 0) noexcept - { - -#if defined(_MSC_VER) - __cpuidex(reg, level, count); - -#elif defined(__INTEL_COMPILER) - __cpuid(reg, level); - -#elif defined(__GNUC__) || defined(__clang__) - -#if defined(__i386__) && defined(__PIC__) - // %ebx may be the PIC register - __asm__("xchg{l}\t{%%}ebx, %1\n\t" - "cpuid\n\t" - "xchg{l}\t{%%}ebx, %1\n\t" - : "=a"(reg[0]), "=r"(reg[1]), "=c"(reg[2]), "=d"(reg[3]) - : "0"(level), "2"(count)); - -#else - __asm__("cpuid\n\t" - : "=a"(reg[0]), "=b"(reg[1]), "=c"(reg[2]), "=d"(reg[3]) - : "0"(level), "2"(count)); -#endif - -#else -#error "Unsupported configuration" #endif - }; - - int regs1[4]; - - get_cpuid(regs1, 0x1); - - // OS can explicitly disable the usage of SSE/AVX extensions - // by setting an appropriate flag in CR0 register - // - // https://docs.kernel.org/admin-guide/hw-vuln/gather_data_sampling.html + const auto cpuid = xsimd::x86_cpu_id::read(); + auto xcr0 = xsimd::x86_xcr0::make_false(); - unsigned sse_state_os_enabled = 1; + bool sse_state_os_enabled = true; // AVX and AVX512 strictly require OSXSAVE to be enabled by the OS. // If OSXSAVE is disabled (e.g., via bcdedit /set xsavedisable 1), // AVX state won't be preserved across context switches, so AVX cannot be used. - unsigned avx_state_os_enabled = 0; - unsigned avx512_state_os_enabled = 0; + bool avx_state_os_enabled = false; + bool avx512_state_os_enabled = false; - // OSXSAVE: A value of 1 indicates that the OS has set CR4.OSXSAVE[bit - // 18] to enable XSETBV/XGETBV instructions to access XCR0 and - // to support processor extended state management using - // XSAVE/XRSTOR. - bool osxsave = regs1[2] >> 27 & 1; - if (osxsave) + if (cpuid.osxsave()) { + xcr0 = xsimd::x86_xcr0::read(); - uint32_t xcr0 = get_xcr0_low(); - - sse_state_os_enabled = xcr0 >> 1 & 1; - avx_state_os_enabled = xcr0 >> 2 & sse_state_os_enabled; - avx512_state_os_enabled = xcr0 >> 6 & avx_state_os_enabled; + sse_state_os_enabled = xcr0.sse_state_os_enabled(); + avx_state_os_enabled = xcr0.avx_state_os_enabled(); + avx512_state_os_enabled = xcr0.avx512_state_os_enabled(); } - sse2 = regs1[3] >> 26 & sse_state_os_enabled; - sse3 = regs1[2] >> 0 & sse_state_os_enabled; - ssse3 = regs1[2] >> 9 & sse_state_os_enabled; - sse4_1 = regs1[2] >> 19 & sse_state_os_enabled; - sse4_2 = regs1[2] >> 20 & sse_state_os_enabled; - fma3_sse42 = regs1[2] >> 12 & sse_state_os_enabled; - - avx = regs1[2] >> 28 & avx_state_os_enabled; - fma3_avx = avx && fma3_sse42; - - int regs8[4]; - get_cpuid(regs8, 0x80000001); - fma4 = regs8[2] >> 16 & avx_state_os_enabled; - - // sse4a = regs[2] >> 6 & 1; + sse2 = cpuid.sse2() && sse_state_os_enabled; + sse3 = cpuid.sse3() && sse_state_os_enabled; + ssse3 = cpuid.ssse3() && sse_state_os_enabled; + sse4_1 = cpuid.sse4_1() && sse_state_os_enabled; + sse4_2 = cpuid.sse4_2() && sse_state_os_enabled; + fma3_sse42 = cpuid.fma3() && sse_state_os_enabled; - // xop = regs[2] >> 11 & 1; - - int regs7[4]; - get_cpuid(regs7, 0x7); - avx2 = regs7[1] >> 5 & avx_state_os_enabled; - - int regs7a[4]; - get_cpuid(regs7a, 0x7, 0x1); - avxvnni = regs7a[0] >> 4 & avx_state_os_enabled; + // sse4a not implemented in cpu_id yet + // xop not implemented in cpu_id yet + avx = cpuid.avx() && avx_state_os_enabled; + fma3_avx = avx && fma3_sse42; + fma4 = cpuid.fma4() && avx_state_os_enabled; + avx2 = cpuid.avx2() && avx_state_os_enabled; + avxvnni = cpuid.avxvnni() && avx_state_os_enabled; fma3_avx2 = avx2 && fma3_sse42; - avx512f = regs7[1] >> 16 & avx512_state_os_enabled; - avx512cd = regs7[1] >> 28 & avx512_state_os_enabled; - avx512dq = regs7[1] >> 17 & avx512_state_os_enabled; - avx512bw = regs7[1] >> 30 & avx512_state_os_enabled; - avx512er = regs7[1] >> 27 & avx512_state_os_enabled; - avx512pf = regs7[1] >> 26 & avx512_state_os_enabled; - avx512ifma = regs7[1] >> 21 & avx512_state_os_enabled; - avx512vbmi = regs7[2] >> 1 & avx512_state_os_enabled; - avx512vbmi2 = regs7[2] >> 6 & avx512_state_os_enabled; - avx512vnni_bw = regs7[2] >> 11 & avx512_state_os_enabled; + avx512f = cpuid.avx512f() && avx512_state_os_enabled; + avx512cd = cpuid.avx512cd() && avx512_state_os_enabled; + avx512dq = cpuid.avx512dq() && avx512_state_os_enabled; + avx512bw = cpuid.avx512bw() && avx512_state_os_enabled; + avx512er = cpuid.avx512er() && avx512_state_os_enabled; + avx512pf = cpuid.avx512pf() && avx512_state_os_enabled; + avx512ifma = cpuid.avx512ifma() && avx512_state_os_enabled; + avx512vbmi = cpuid.avx512vbmi() && avx512_state_os_enabled; + avx512vbmi2 = cpuid.avx512vbmi2() && avx512_state_os_enabled; + avx512vnni_bw = cpuid.avx512vnni_bw() && avx512_state_os_enabled; avx512vnni_vbmi2 = avx512vbmi2 && avx512vnni_bw; -#endif } }; } // namespace detail diff --git a/include/xsimd/types/xsimd_common_arch.hpp b/include/xsimd/types/xsimd_common_arch.hpp index 28491aeda..a33c868ea 100644 --- a/include/xsimd/types/xsimd_common_arch.hpp +++ b/include/xsimd/types/xsimd_common_arch.hpp @@ -12,6 +12,8 @@ #ifndef XSIMD_COMMON_ARCH_HPP #define XSIMD_COMMON_ARCH_HPP +#include + #include "../config/xsimd_config.hpp" /** diff --git a/include/xsimd/types/xsimd_register.hpp b/include/xsimd/types/xsimd_register.hpp index 018418af6..b14962e5b 100644 --- a/include/xsimd/types/xsimd_register.hpp +++ b/include/xsimd/types/xsimd_register.hpp @@ -14,6 +14,8 @@ #include +#include "../config/xsimd_inline.hpp" + namespace xsimd { namespace types diff --git a/include/xsimd/utils/bits.hpp b/include/xsimd/utils/bits.hpp new file mode 100644 index 000000000..a8a862219 --- /dev/null +++ b/include/xsimd/utils/bits.hpp @@ -0,0 +1,41 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ***************************************************************************/ + +#ifndef XSIMD_CPUID_UTILS_HPP +#define XSIMD_CPUID_UTILS_HPP + +namespace xsimd +{ + namespace utils + { + template + constexpr I make_bit_mask(I bit) + { + return static_cast(I { 1 } << bit); + } + + template + constexpr I make_bit_mask(I bit, Args... bits) + { + // TODO(C++17): Use fold expression + return make_bit_mask(bit) | make_bit_mask(static_cast(bits)...); + } + + template + constexpr bool bit_is_set(I value) + { + constexpr I mask = make_bit_mask(static_cast(Bits)...); + return (value & mask) == mask; + } + } +} + +#endif diff --git a/include/xsimd/xsimd_cpu_features_x86.hpp b/include/xsimd/xsimd_cpu_features_x86.hpp new file mode 100644 index 000000000..edc885951 --- /dev/null +++ b/include/xsimd/xsimd_cpu_features_x86.hpp @@ -0,0 +1,309 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_CPU_FEATURES_X86_HPP +#define XSIMD_CPU_FEATURES_X86_HPP + +#include +#include + +#include "./config/xsimd_config.hpp" +#include "./utils/bits.hpp" + +#if XSIMD_TARGET_X86 && defined(_MSC_VER) +#include // Contains the definition of __cpuidex +#endif + +namespace xsimd +{ + namespace detail + { + using cpuid_reg_t = std::array; + inline cpuid_reg_t get_cpuid(int level, int count = 0) noexcept; + + using xcr0_reg_t = std::uint32_t; + inline xcr0_reg_t get_xcr0_low() noexcept; + } + + /* + * Extended Control Register 0 (XCR0). + * + * Operating systems can explicitly disable the usage of instruction set (such + * as SSE or AVX extensions) by setting an appropriate flag in XCR0 register. + * This utility parses such bit values. + * + * @see https://docs.kernel.org/admin-guide/hw-vuln/gather_data_sampling.html + */ + class x86_xcr0 + { + public: + using reg_t = detail::xcr0_reg_t; + + /** Parse a XCR0 value into individual components. */ + constexpr explicit x86_xcr0(reg_t low) noexcept + : m_low(low) + { + } + + /** Create an object that has all features set to false. */ + static constexpr x86_xcr0 make_false() + { + return x86_xcr0(0); + } + + /** Read the XCR0 register from the CPU if on the correct architecture. */ + inline static x86_xcr0 read() + { + return x86_xcr0(detail::get_xcr0_low()); + } + + constexpr bool sse_state_os_enabled() const noexcept + { + return utils::bit_is_set<1>(m_low); + } + + constexpr bool avx_state_os_enabled() const noexcept + { + // Check both SSE and AVX bits even though AVX must imply SSE + return utils::bit_is_set<1, 2>(m_low); + } + + constexpr bool avx512_state_os_enabled() const noexcept + { + // Check all SSE, AVX, and AVX52 bits even though AVX512 must + // imply AVX and SSE + return utils::bit_is_set<1, 2, 6>(m_low); + } + + private: + reg_t m_low = {}; + }; + + class x86_cpu_id + { + public: + struct cpu_id_regs + { + using reg_t = detail::cpuid_reg_t; + + reg_t reg1 = {}; + reg_t reg7 = {}; + reg_t reg7a = {}; + reg_t reg8 = {}; + }; + + /** Parse CpuInfo register values into individual components. */ + constexpr explicit x86_cpu_id(const cpu_id_regs& regs) noexcept + : m_regs(regs) + { + } + + /** + * Read the CpuId registers from the CPU if on the correct architecture. + * + * This is only safe to call if bit 18 of CR4.OSXSAVE has been set. + * + * @see cpu_id::osxsave + */ + inline static x86_cpu_id read() + { + cpu_id_regs regs = {}; + // TODO(C++20): Use designated initializer + regs.reg1 = detail::get_cpuid(0x1); + regs.reg7 = detail::get_cpuid(0x7); + regs.reg7a = detail::get_cpuid(0x7, 0x1); + regs.reg8 = detail::get_cpuid(0x80000001); + return x86_cpu_id(regs); + } + + constexpr bool sse2() const noexcept + { + return utils::bit_is_set<26>(m_regs.reg1[3]); + } + + constexpr bool sse3() const noexcept + { + return utils::bit_is_set<0>(m_regs.reg1[2]); + } + + constexpr bool ssse3() const noexcept + { + return utils::bit_is_set<9>(m_regs.reg1[2]); + } + + constexpr bool sse4_1() const noexcept + { + return utils::bit_is_set<19>(m_regs.reg1[2]); + } + + constexpr bool sse4_2() const noexcept + { + return utils::bit_is_set<20>(m_regs.reg1[2]); + } + + constexpr bool fma3() const noexcept + { + return utils::bit_is_set<12>(m_regs.reg1[2]); + } + + /** + * Indicates whether the OS has enabled extended state management. + * + * When true, the OS has set bit 18 (OSXSAVE) in the CR4 control register, + * enabling the XGETBV/XSETBV instructions to access XCR0 and support + * processor extended state management using XSAVE/XRSTOR. + * + * This value is read from CPUID leaf 0x1, ECX bit 27, which reflects + * the state of CR4.OSXSAVE. + */ + constexpr bool osxsave() const noexcept + { + return utils::bit_is_set<27>(m_regs.reg1[2]); + } + + constexpr bool avx() const noexcept + { + return utils::bit_is_set<28>(m_regs.reg1[2]); + } + + constexpr bool avx2() const noexcept + { + return utils::bit_is_set<5>(m_regs.reg7[1]); + } + + constexpr bool avx512f() const noexcept + { + return utils::bit_is_set<16>(m_regs.reg7[1]); + } + + constexpr bool avx512dq() const noexcept + { + return utils::bit_is_set<17>(m_regs.reg7[1]); + } + + constexpr bool avx512ifma() const noexcept + { + return utils::bit_is_set<21>(m_regs.reg7[1]); + } + + constexpr bool avx512pf() const noexcept + { + return utils::bit_is_set<26>(m_regs.reg7[1]); + } + + constexpr bool avx512er() const noexcept + { + return utils::bit_is_set<27>(m_regs.reg7[1]); + } + + constexpr bool avx512cd() const noexcept + { + return utils::bit_is_set<28>(m_regs.reg7[1]); + } + + constexpr bool avx512bw() const noexcept + { + return utils::bit_is_set<30>(m_regs.reg7[1]); + } + + constexpr bool avx512vbmi() const noexcept + { + return utils::bit_is_set<1>(m_regs.reg7[2]); + } + + constexpr bool avx512vbmi2() const noexcept + { + return utils::bit_is_set<6>(m_regs.reg7[2]); + } + + constexpr bool avx512vnni_bw() const noexcept + { + return utils::bit_is_set<11>(m_regs.reg7[2]); + } + + constexpr bool avxvnni() const noexcept + { + return utils::bit_is_set<4>(m_regs.reg7a[0]); + } + + constexpr bool fma4() const noexcept + { + return utils::bit_is_set<16>(m_regs.reg8[2]); + } + + private: + cpu_id_regs m_regs = {}; + }; + + namespace detail + { + inline cpuid_reg_t get_cpuid(int level, int count) noexcept + { + cpuid_reg_t reg = {}; +#if !XSIMD_TARGET_X86 + (void)level; + (void)count; + return reg; // All bits to zero + +#elif defined(_MSC_VER) + __cpuidex(reg, level, count); + +#elif defined(__INTEL_COMPILER) + __cpuid(reg, level); + +#elif defined(__GNUC__) || defined(__clang__) + +#if defined(__i386__) && defined(__PIC__) + // %ebx may be the PIC register + __asm__("xchg{l}\t{%%}ebx, %1\n\t" + "cpuid\n\t" + "xchg{l}\t{%%}ebx, %1\n\t" + : "=a"(reg[0]), "=r"(reg[1]), "=c"(reg[2]), "=d"(reg[3]) + : "0"(level), "2"(count)); + +#else + __asm__("cpuid\n\t" + : "=a"(reg[0]), "=b"(reg[1]), "=c"(reg[2]), "=d"(reg[3]) + : "0"(level), "2"(count)); +#endif +#endif + } + + inline xcr0_reg_t get_xcr0_low() noexcept + { +#if !XSIMD_TARGET_X86 + return {}; // All bits to zero + +#elif defined(_MSC_VER) && _MSC_VER >= 1400 + return static_cast(_xgetbv(0)); + +#elif defined(__GNUC__) + xcr0_reg_t xcr0 = {}; + __asm__( + "xorl %%ecx, %%ecx\n" + "xgetbv\n" + : "=a"(xcr0) + : +#if defined(__i386__) + : "ecx", "edx" +#else + : "rcx", "rdx" +#endif + ); + return xcr0; + +#else /* _MSC_VER < 1400 */ +#error "_MSC_VER < 1400 is not supported" +#endif /* _MSC_VER && _MSC_VER >= 1400 */ + }; + } +} +#endif diff --git a/test/check_inline_specifier.sh b/test/check_inline_specifier.sh index 1ccdda130..2337b3707 100755 --- a/test/check_inline_specifier.sh +++ b/test/check_inline_specifier.sh @@ -3,7 +3,7 @@ # Usage: $0 top_srcdir # # This script walks all headers in $top_srcdir/include and makes sure that all -# functions declared tehre are marked as inline or constexpr (which implies +# functions declared there are marked as inline or constexpr (which implies # inline). This makes sure the xsimd headers does not define symbol with global # linkage, and somehow convey our itnent to have all functions in xsimd being # inlined by the compiler.