From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 48) id BAD593858D1E; Mon, 6 May 2024 10:35:27 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org BAD593858D1E DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org; s=default; t=1714991727; bh=7UdcYL99bz1Ukv58+DxG3njkgPRui0nJCO3/Pr2MPyo=; h=From:To:Subject:Date:In-Reply-To:References:From; b=DBF93OoXWCaCv41P0GM1jY7+7oERKZQBPofknreIfqT9KiXJhmO50moJe2P+WbHVe z2WC+dnbjnyp6bf20ZgNCGy7b/atBWW+3u1/AJPKiZkkXIvGV2qLpBZsqamDsF05HH kcVG5FZ8hESXIvZtoJVAXIumoSiMn+xopkpwnVa0= From: "mkretz at gcc dot gnu.org" To: gcc-bugs@gcc.gnu.org Subject: [Bug libstdc++/114958] use __builtin_shufflevector for std::experimental::simd split and concat (at least the common cases) to enable better optimizations Date: Mon, 06 May 2024 10:35:27 +0000 X-Bugzilla-Reason: CC X-Bugzilla-Type: changed X-Bugzilla-Watch-Reason: None X-Bugzilla-Product: gcc X-Bugzilla-Component: libstdc++ X-Bugzilla-Version: 15.0 X-Bugzilla-Keywords: X-Bugzilla-Severity: normal X-Bugzilla-Who: mkretz at gcc dot gnu.org X-Bugzilla-Status: ASSIGNED X-Bugzilla-Resolution: X-Bugzilla-Priority: P3 X-Bugzilla-Assigned-To: mkretz at gcc dot gnu.org X-Bugzilla-Target-Milestone: --- X-Bugzilla-Flags: X-Bugzilla-Changed-Fields: bug_status everconfirmed cf_reconfirmed_on Message-ID: In-Reply-To: References: Content-Type: text/plain; charset="UTF-8" Content-Transfer-Encoding: quoted-printable X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/ Auto-Submitted: auto-generated MIME-Version: 1.0 List-Id: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=3D114958 Matthias Kretz (Vir) changed: What |Removed |Added ---------------------------------------------------------------------------- Status|UNCONFIRMED |ASSIGNED Ever confirmed|0 |1 Last reconfirmed| |2024-05-06 --- Comment #1 from Matthias Kretz (Vir) --- Untested patch: diff --git a/libstdc++-v3/include/experimental/bits/simd.h b/libstdc++-v3/include/experimental/bits/simd.h index 6ef9c955cfa..7cfaa84ba9e 100644 --- a/libstdc++-v3/include/experimental/bits/simd.h +++ b/libstdc++-v3/include/experimental/bits/simd.h @@ -1651,7 +1651,17 @@ __as_vector(_V __x) if constexpr (__is_vector_type_v<_V>) return __x; else if constexpr (is_simd<_V>::value || is_simd_mask<_V>::value) - return __data(__x)._M_data; + { + if constexpr (_V::size() > 1) + return __data(__x)._M_data; + else + { + static_assert(is_simd<_V>::value); + using _Tp =3D typename _V::value_type; + using _RV [[__gnu__::__vector_size__(sizeof(_Tp))]] =3D _Tp; + return _RV{__data(__x)}; + } + } else if constexpr (__is_vectorizable_v<_V>) return __vector_type_t<_V, 2>{__x}; else @@ -2061,6 +2071,22 @@ __not(_Tp __a) noexcept return ~__a; } +// }}} +// __vec_shuffle{{{ +template + _GLIBCXX_SIMD_INTRINSIC constexpr auto + __vec_shuffle(_T0 __x, _T1 __y, index_sequence<_Is...>, _Fun __idx_perm) + { return __builtin_shufflevector(__x, __y, [=3D] { + constexpr int __j =3D __idx_perm(_Is); + static_assert(__j < sizeof...(_Is)); + return __j; + }()...); } + +template + _GLIBCXX_SIMD_INTRINSIC constexpr auto + __vec_shuffle(_T0 __x, index_sequence<_Is...>, _Fun __idx_perm) + { return __builtin_shufflevector(__x, _T0(), __idx_perm(_Is)...); } + // }}} // __concat{{{ template , @@ -3947,7 +3973,7 @@ clamp(const simd<_Tp, _Ap>& __v, const simd<_Tp, _Ap>& __lo, const simd<_Tp, _Ap // __extract_part {{{ template _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr - _SimdWrapper<_Tp, _Np / _Total * _Combine> + conditional_t<_Np =3D=3D _Total and _Combine =3D=3D 1, _Tp, _SimdWrapper= <_Tp, _Np / _Total * _Combine>> __extract_part(const _SimdWrapper<_Tp, _Np> __x); template @@ -4231,47 +4257,28 @@ static_assert( __split_wrapper(_SL::template _S_pop_front<1>(), __data(__x).second)); } - else if constexpr ((!is_same_v> && ...) - && (!__is_fixed_size_abi_v< - simd_abi::deduce_t<_Tp, _Sizes>> && ...)) + else if constexpr ((!__is_fixed_size_abi_v> && ...)) { - if constexpr (((_Sizes * 2 =3D=3D _Np) && ...)) - return {{__private_init, __extract_part<0, 2>(__data(__x))}, - {__private_init, __extract_part<1, 2>(__data(__x))}}; - else if constexpr (is_same_v<_SizeList<_Sizes...>, - _SizeList<_Np / 3, _Np / 3, _Np / 3>>) - return {{__private_init, __extract_part<0, 3>(__data(__x))}, - {__private_init, __extract_part<1, 3>(__data(__x))}, - {__private_init, __extract_part<2, 3>(__data(__x))}}; - else if constexpr (is_same_v<_SizeList<_Sizes...>, - _SizeList<2 * _Np / 3, _Np / 3>>) - return {{__private_init, __extract_part<0, 3, 2>(__data(__x))}, - {__private_init, __extract_part<2, 3>(__data(__x))}}; - else if constexpr (is_same_v<_SizeList<_Sizes...>, - _SizeList<_Np / 3, 2 * _Np / 3>>) - return {{__private_init, __extract_part<0, 3>(__data(__x))}, - {__private_init, __extract_part<1, 3, 2>(__data(__x))}}; - else if constexpr (is_same_v<_SizeList<_Sizes...>, - _SizeList<_Np / 2, _Np / 4, _Np / 4>>) - return {{__private_init, __extract_part<0, 2>(__data(__x))}, - {__private_init, __extract_part<2, 4>(__data(__x))}, - {__private_init, __extract_part<3, 4>(__data(__x))}}; - else if constexpr (is_same_v<_SizeList<_Sizes...>, - _SizeList<_Np / 4, _Np / 4, _Np / 2>>) - return {{__private_init, __extract_part<0, 4>(__data(__x))}, - {__private_init, __extract_part<1, 4>(__data(__x))}, - {__private_init, __extract_part<1, 2>(__data(__x))}}; - else if constexpr (is_same_v<_SizeList<_Sizes...>, - _SizeList<_Np / 4, _Np / 2, _Np / 4>>) - return {{__private_init, __extract_part<0, 4>(__data(__x))}, - {__private_init, __extract_center(__data(__x))}, - {__private_init, __extract_part<3, 4>(__data(__x))}}; - else if constexpr (((_Sizes * 4 =3D=3D _Np) && ...)) - return {{__private_init, __extract_part<0, 4>(__data(__x))}, - {__private_init, __extract_part<1, 4>(__data(__x))}, - {__private_init, __extract_part<2, 4>(__data(__x))}, - {__private_init, __extract_part<3, 4>(__data(__x))}}; + constexpr size_t _N1 =3D _SL::template _S_at<1>(); + if constexpr (sizeof...(_Sizes) =3D=3D 2) + return {{__private_init, __extract_part<0, _Np, _N0>(__data(__x))= }, + {__private_init, __extract_part<_N0, _Np, _N1>(__data(__x))}}; + else if constexpr (sizeof...(_Sizes) =3D=3D 3) + { + constexpr size_t _N2 =3D _SL::template _S_at<2>(); + return {{__private_init, __extract_part<0, _Np, _N0>(__data(__x= ))}, + {__private_init, __extract_part<_N0, _Np, _N1>(__data(__x))}, + {__private_init, __extract_part<_N0 + _N1, _Np, _N2>(__data(__x))}}; + } + else if constexpr (sizeof...(_Sizes) =3D=3D 4) + { + constexpr size_t _N2 =3D _SL::template _S_at<2>(); + constexpr size_t _N3 =3D _SL::template _S_at<3>(); + return {{__private_init, __extract_part<0, _Np, _N0>(__data(__x= ))}, + {__private_init, __extract_part<_N0, _Np, _N1>(__data(__x))}, + {__private_init, __extract_part<_N0 + _N1, _Np, _N2>(__data(__x))}, + {__private_init, __extract_part<_N0 + _N1 + _N2, _Np, _N3>(__data(__x))}}; + } // else fall through } #ifdef _GLIBCXX_SIMD_USE_ALIASING_LOADS @@ -4334,14 +4341,37 @@ static_assert( simd<_Tp, simd_abi::deduce_t<_Tp, (simd_size_v<_Tp, _As> + ...)>> concat(const simd<_Tp, _As>&... __xs) { - using _Rp =3D __deduced_simd<_Tp, (simd_size_v<_Tp, _As> + ...)>; + constexpr int _Np =3D (simd_size_v<_Tp, _As> + ...); + using _Abi =3D simd_abi::deduce_t<_Tp, _Np>; + using _Rp =3D simd<_Tp, _Abi>; + using _RW =3D typename _SimdTraits<_Tp, _Abi>::_SimdMember; if constexpr (sizeof...(__xs) =3D=3D 1) return simd_cast<_Rp>(__xs...); else if ((... && __xs._M_is_constprop())) - return simd<_Tp, - simd_abi::deduce_t<_Tp, (simd_size_v<_Tp, _As> + ...)>>( - [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA + return _Rp([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { return __subscript_in_pack<__i>(__xs...); }); + else if constexpr (__is_simd_wrapper_v<_RW> and sizeof...(__xs) =3D=3D= 2) + { + return {__private_init, + __vec_shuffle(__as_vector(__xs)..., std::make_index_sequence<_RW::_S_full_size>(), + [](int __i) { + constexpr int __sizes[2] =3D {int(simd_size_v<_Tp, _As>)...}; + constexpr int __padding0 + =3D sizeof(__vector_type_t<_Tp, __sizes[0= ]>) / sizeof(_Tp) + - __sizes[0]; + return __i >=3D _Np ? -1 : __i < __sizes[0]= ? __i : __i + __padding0; + })}; + } + else if constexpr (__is_simd_wrapper_v<_RW> and sizeof...(__xs) =3D=3D= 3) + return [](const auto& __x0, const auto& __x1, const auto& __x2) + _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return concat(concat(__x0, __x1), __x2); + }(__xs...); + else if constexpr (__is_simd_wrapper_v<_RW> and sizeof...(__xs) > 3) + return [](const auto& __x0, const auto& __x1, const auto&... __rest) + _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return concat(concat(__x0, __x1), concat(__rest...)); + }(__xs...); else { _Rp __r{}; diff --git a/libstdc++-v3/include/experimental/bits/simd_builtin.h b/libstdc++-v3/include/experimental/bits/simd_builtin.h index 4ceeb423894..1064caf8de2 100644 --- a/libstdc++-v3/include/experimental/bits/simd_builtin.h +++ b/libstdc++-v3/include/experimental/bits/simd_builtin.h @@ -206,10 +206,12 @@ __shift_elements_right(_Tp __v) // __extract_part(_SimdWrapper<_Tp, _Np>) {{{ template _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr - _SimdWrapper<_Tp, _Np / _Total * _Combine> + conditional_t<_Np =3D=3D _Total and _Combine =3D=3D 1, _Tp, _SimdWrapper= <_Tp, _Np / _Total * _Combine>> __extract_part(const _SimdWrapper<_Tp, _Np> __x) { - if constexpr (_Index % 2 =3D=3D 0 && _Total % 2 =3D=3D 0 && _Combine %= 2 =3D=3D 0) + if constexpr (_Np =3D=3D _Total and _Combine =3D=3D 1) + return __x[_Index]; + else if constexpr (_Index % 2 =3D=3D 0 && _Total % 2 =3D=3D 0 && _Comb= ine % 2 =3D=3D 0) return __extract_part<_Index / 2, _Total / 2, _Combine / 2>(__x); else { @@ -235,39 +237,11 @@ __extract_part(const _SimdWrapper<_Tp, _Np> __x) return __x; else if constexpr (_Index =3D=3D 0) return __intrin_bitcast<_R>(__as_vector(__x)); -#if _GLIBCXX_SIMD_X86INTRIN // {{{ - else if constexpr (sizeof(__x) =3D=3D 32 - && __return_size * sizeof(_Tp) <=3D 16) - { - constexpr size_t __bytes_to_skip =3D __values_to_skip * sizeof(= _Tp); - if constexpr (__bytes_to_skip =3D=3D 16) - return __vector_bitcast<_Tp, __return_size>( - __hi128(__as_vector(__x))); - else - return __vector_bitcast<_Tp, __return_size>( - _mm_alignr_epi8(__hi128(__vector_bitcast<_LLong>(__x)), - __lo128(__vector_bitcast<_LLong>(__x)), - __bytes_to_skip)); - } -#endif // _GLIBCXX_SIMD_X86INTRIN }}} - else if constexpr (_Index > 0 - && (__values_to_skip % __return_size !=3D 0 - || sizeof(_R) >=3D 8) - && (__values_to_skip + __return_size) * sizeof(_= Tp) - <=3D 64 - && sizeof(__x) >=3D 16) - return __intrin_bitcast<_R>( - __shift_elements_right<__values_to_skip * sizeof(_Tp)>( - __as_vector(__x))); else - { - _R __r =3D {}; - __builtin_memcpy(&__r, - reinterpret_cast(&__x) - + sizeof(_Tp) * __values_to_skip, - __return_size * sizeof(_Tp)); - return __r; - } + return __vec_shuffle(__as_vector(__x), make_index_sequence<__bit_ceil(__return_size)>(), + [](size_t __i) { + return __i + __values_to_skip; + }); } } diff --git a/libstdc++-v3/testsuite/experimental/simd/pr114958.cc b/libstdc++-v3/testsuite/experimental/simd/pr114958.cc new file mode 100644 index 00000000000..94c9e0a2d18 --- /dev/null +++ b/libstdc++-v3/testsuite/experimental/simd/pr114958.cc @@ -0,0 +1,20 @@ +// { dg-options "-std=3Dc++17" } +// { dg-do compile { target x86_64-*-* } } +// { dg-require-effective-target c++17 } +// { dg-additional-options "-march=3Dx86-64-v3" { target x86_64-*-* } } +// { dg-require-cmath "" } +// { dg-final { scan-assembler-times "vperm(q|pd)\[\\t \]+\\\$144" 1 } } + +#include + +namespace stdx =3D std::experimental; + +using T =3D std::uint64_t; +using V =3D stdx::simd>; +using V1 =3D stdx::simd; + +V perm(V data) +{ + auto [carry, _] =3D stdx::split<3, 1>(data); + return concat(V1(), carry); +}=