[Bug libstdc++/114958] New: use __builtin_shufflevector for std::experimental::simd split and concat (at least the common cases) to enable better optimizations

public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed

* [Bug libstdc++/114958] New: use __builtin_shufflevector for std::experimental::simd split and concat (at least the common cases) to enable better optimizations
@ 2024-05-06 10:29 mkretz at gcc dot gnu.org
  2024-05-06 10:35 ` [Bug libstdc++/114958] " mkretz at gcc dot gnu.org
                   ` (6 more replies)
  0 siblings, 7 replies; 8+ messages in thread
From: mkretz at gcc dot gnu.org @ 2024-05-06 10:29 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114958

            Bug ID: 114958
           Summary: use __builtin_shufflevector for
                    std::experimental::simd split and concat (at least the
                    common cases) to enable better optimizations
           Product: gcc
           Version: 15.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: libstdc++
          Assignee: mkretz at gcc dot gnu.org
          Reporter: mkretz at gcc dot gnu.org
  Target Milestone: ---

PR114908 presents a missed optimization that would not occur if simd's split
and concat made better use of __builtin_shufflevector.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [Bug libstdc++/114958] use __builtin_shufflevector for std::experimental::simd split and concat (at least the common cases) to enable better optimizations
  2024-05-06 10:29 [Bug libstdc++/114958] New: use __builtin_shufflevector for std::experimental::simd split and concat (at least the common cases) to enable better optimizations mkretz at gcc dot gnu.org
@ 2024-05-06 10:35 ` mkretz at gcc dot gnu.org
  2024-05-06 10:45 ` jakub at gcc dot gnu.org
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 8+ messages in thread
From: mkretz at gcc dot gnu.org @ 2024-05-06 10:35 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114958

Matthias Kretz (Vir) <mkretz at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|UNCONFIRMED                 |ASSIGNED
     Ever confirmed|0                           |1
   Last reconfirmed|                            |2024-05-06

--- Comment #1 from Matthias Kretz (Vir) <mkretz at gcc dot gnu.org> ---
Untested patch:

diff --git a/libstdc++-v3/include/experimental/bits/simd.h
b/libstdc++-v3/include/experimental/bits/simd.h
index 6ef9c955cfa..7cfaa84ba9e 100644
--- a/libstdc++-v3/include/experimental/bits/simd.h
+++ b/libstdc++-v3/include/experimental/bits/simd.h
@@ -1651,7 +1651,17 @@ __as_vector(_V __x)
     if constexpr (__is_vector_type_v<_V>)
       return __x;
     else if constexpr (is_simd<_V>::value || is_simd_mask<_V>::value)
-      return __data(__x)._M_data;
+      {
+       if constexpr (_V::size() > 1)
+         return __data(__x)._M_data;
+       else
+         {
+           static_assert(is_simd<_V>::value);
+           using _Tp = typename _V::value_type;
+           using _RV [[__gnu__::__vector_size__(sizeof(_Tp))]] = _Tp;
+           return _RV{__data(__x)};
+         }
+      }
     else if constexpr (__is_vectorizable_v<_V>)
       return __vector_type_t<_V, 2>{__x};
     else
@@ -2061,6 +2071,22 @@ __not(_Tp __a) noexcept
       return ~__a;
   }

+// }}}
+// __vec_shuffle{{{
+template <typename _T0, typename _T1, typename _Fun, size_t... _Is>
+  _GLIBCXX_SIMD_INTRINSIC constexpr auto
+  __vec_shuffle(_T0 __x, _T1 __y, index_sequence<_Is...>, _Fun __idx_perm)
+  { return __builtin_shufflevector(__x, __y, [=] {
+            constexpr int __j = __idx_perm(_Is);
+            static_assert(__j < sizeof...(_Is));
+            return __j;
+          }()...); }
+
+template <typename _T0, typename _Fun, size_t... _Is>
+  _GLIBCXX_SIMD_INTRINSIC constexpr auto
+  __vec_shuffle(_T0 __x, index_sequence<_Is...>, _Fun __idx_perm)
+  { return __builtin_shufflevector(__x, _T0(), __idx_perm(_Is)...); }
+
 // }}}
 // __concat{{{
 template <typename _Tp, typename _TVT = _VectorTraits<_Tp>,
@@ -3947,7 +3973,7 @@ clamp(const simd<_Tp, _Ap>& __v, const simd<_Tp, _Ap>&
__lo, const simd<_Tp, _Ap
 // __extract_part {{{
 template <int _Index, int _Total, int _Combine = 1, typename _Tp, size_t _Np>
   _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr
-  _SimdWrapper<_Tp, _Np / _Total * _Combine>
+  conditional_t<_Np == _Total and _Combine == 1, _Tp, _SimdWrapper<_Tp, _Np /
_Total * _Combine>>
   __extract_part(const _SimdWrapper<_Tp, _Np> __x);

 template <int _Index, int _Parts, int _Combine = 1, typename _Tp, typename
_A0, typename... _As>
@@ -4231,47 +4257,28 @@ static_assert(
                         __split_wrapper(_SL::template _S_pop_front<1>(),
                                         __data(__x).second));
       }
-    else if constexpr ((!is_same_v<simd_abi::scalar,
-                                  simd_abi::deduce_t<_Tp, _Sizes>> && ...)
-                      && (!__is_fixed_size_abi_v<
-                            simd_abi::deduce_t<_Tp, _Sizes>> && ...))
+    else if constexpr ((!__is_fixed_size_abi_v<simd_abi::deduce_t<_Tp,
_Sizes>> && ...))
       {
-       if constexpr (((_Sizes * 2 == _Np) && ...))
-         return {{__private_init, __extract_part<0, 2>(__data(__x))},
-                 {__private_init, __extract_part<1, 2>(__data(__x))}};
-       else if constexpr (is_same_v<_SizeList<_Sizes...>,
-                                    _SizeList<_Np / 3, _Np / 3, _Np / 3>>)
-         return {{__private_init, __extract_part<0, 3>(__data(__x))},
-                 {__private_init, __extract_part<1, 3>(__data(__x))},
-                 {__private_init, __extract_part<2, 3>(__data(__x))}};
-       else if constexpr (is_same_v<_SizeList<_Sizes...>,
-                                    _SizeList<2 * _Np / 3, _Np / 3>>)
-         return {{__private_init, __extract_part<0, 3, 2>(__data(__x))},
-                 {__private_init, __extract_part<2, 3>(__data(__x))}};
-       else if constexpr (is_same_v<_SizeList<_Sizes...>,
-                                    _SizeList<_Np / 3, 2 * _Np / 3>>)
-         return {{__private_init, __extract_part<0, 3>(__data(__x))},
-                 {__private_init, __extract_part<1, 3, 2>(__data(__x))}};
-       else if constexpr (is_same_v<_SizeList<_Sizes...>,
-                                    _SizeList<_Np / 2, _Np / 4, _Np / 4>>)
-         return {{__private_init, __extract_part<0, 2>(__data(__x))},
-                 {__private_init, __extract_part<2, 4>(__data(__x))},
-                 {__private_init, __extract_part<3, 4>(__data(__x))}};
-       else if constexpr (is_same_v<_SizeList<_Sizes...>,
-                                    _SizeList<_Np / 4, _Np / 4, _Np / 2>>)
-         return {{__private_init, __extract_part<0, 4>(__data(__x))},
-                 {__private_init, __extract_part<1, 4>(__data(__x))},
-                 {__private_init, __extract_part<1, 2>(__data(__x))}};
-       else if constexpr (is_same_v<_SizeList<_Sizes...>,
-                                    _SizeList<_Np / 4, _Np / 2, _Np / 4>>)
-         return {{__private_init, __extract_part<0, 4>(__data(__x))},
-                 {__private_init, __extract_center(__data(__x))},
-                 {__private_init, __extract_part<3, 4>(__data(__x))}};
-       else if constexpr (((_Sizes * 4 == _Np) && ...))
-         return {{__private_init, __extract_part<0, 4>(__data(__x))},
-                 {__private_init, __extract_part<1, 4>(__data(__x))},
-                 {__private_init, __extract_part<2, 4>(__data(__x))},
-                 {__private_init, __extract_part<3, 4>(__data(__x))}};
+       constexpr size_t _N1 = _SL::template _S_at<1>();
+       if constexpr (sizeof...(_Sizes) == 2)
+         return {{__private_init, __extract_part<0, _Np, _N0>(__data(__x))},
+                 {__private_init, __extract_part<_N0, _Np,
_N1>(__data(__x))}};
+       else if constexpr (sizeof...(_Sizes) == 3)
+         {
+           constexpr size_t _N2 = _SL::template _S_at<2>();
+           return {{__private_init, __extract_part<0, _Np, _N0>(__data(__x))},
+                   {__private_init, __extract_part<_N0, _Np,
_N1>(__data(__x))},
+                   {__private_init, __extract_part<_N0 + _N1, _Np,
_N2>(__data(__x))}};
+         }
+       else if constexpr (sizeof...(_Sizes) == 4)
+         {
+           constexpr size_t _N2 = _SL::template _S_at<2>();
+           constexpr size_t _N3 = _SL::template _S_at<3>();
+           return {{__private_init, __extract_part<0, _Np, _N0>(__data(__x))},
+                   {__private_init, __extract_part<_N0, _Np,
_N1>(__data(__x))},
+                   {__private_init, __extract_part<_N0 + _N1, _Np,
_N2>(__data(__x))},
+                   {__private_init, __extract_part<_N0 + _N1 + _N2, _Np,
_N3>(__data(__x))}};
+         }
        // else fall through
       }
 #ifdef _GLIBCXX_SIMD_USE_ALIASING_LOADS
@@ -4334,14 +4341,37 @@ static_assert(
   simd<_Tp, simd_abi::deduce_t<_Tp, (simd_size_v<_Tp, _As> + ...)>>
   concat(const simd<_Tp, _As>&... __xs)
   {
-    using _Rp = __deduced_simd<_Tp, (simd_size_v<_Tp, _As> + ...)>;
+    constexpr int _Np = (simd_size_v<_Tp, _As> + ...);
+    using _Abi = simd_abi::deduce_t<_Tp, _Np>;
+    using _Rp = simd<_Tp, _Abi>;
+    using _RW = typename _SimdTraits<_Tp, _Abi>::_SimdMember;
     if constexpr (sizeof...(__xs) == 1)
       return simd_cast<_Rp>(__xs...);
     else if ((... && __xs._M_is_constprop()))
-      return simd<_Tp,
-                 simd_abi::deduce_t<_Tp, (simd_size_v<_Tp, _As> + ...)>>(
-              [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
+      return _Rp([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
               { return __subscript_in_pack<__i>(__xs...); });
+    else if constexpr (__is_simd_wrapper_v<_RW> and sizeof...(__xs) == 2)
+      {
+       return {__private_init,
+               __vec_shuffle(__as_vector(__xs)...,
std::make_index_sequence<_RW::_S_full_size>(),
+                             [](int __i) {
+                               constexpr int __sizes[2] =
{int(simd_size_v<_Tp, _As>)...};
+                               constexpr int __padding0
+                                 = sizeof(__vector_type_t<_Tp, __sizes[0]>) /
sizeof(_Tp)
+                                     - __sizes[0];
+                               return __i >= _Np ? -1 : __i < __sizes[0] ? __i
: __i + __padding0;
+                             })};
+      }
+    else if constexpr (__is_simd_wrapper_v<_RW> and sizeof...(__xs) == 3)
+      return [](const auto& __x0, const auto& __x1, const auto& __x2)
+                _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+              return concat(concat(__x0, __x1), __x2);
+            }(__xs...);
+    else if constexpr (__is_simd_wrapper_v<_RW> and sizeof...(__xs) > 3)
+      return [](const auto& __x0, const auto& __x1, const auto&... __rest)
+                _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+              return concat(concat(__x0, __x1), concat(__rest...));
+            }(__xs...);
     else
       {
        _Rp __r{};
diff --git a/libstdc++-v3/include/experimental/bits/simd_builtin.h
b/libstdc++-v3/include/experimental/bits/simd_builtin.h
index 4ceeb423894..1064caf8de2 100644
--- a/libstdc++-v3/include/experimental/bits/simd_builtin.h
+++ b/libstdc++-v3/include/experimental/bits/simd_builtin.h
@@ -206,10 +206,12 @@ __shift_elements_right(_Tp __v)
 // __extract_part(_SimdWrapper<_Tp, _Np>) {{{
 template <int _Index, int _Total, int _Combine, typename _Tp, size_t _Np>
   _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr
-  _SimdWrapper<_Tp, _Np / _Total * _Combine>
+  conditional_t<_Np == _Total and _Combine == 1, _Tp, _SimdWrapper<_Tp, _Np /
_Total * _Combine>>
   __extract_part(const _SimdWrapper<_Tp, _Np> __x)
   {
-    if constexpr (_Index % 2 == 0 && _Total % 2 == 0 && _Combine % 2 == 0)
+    if constexpr (_Np == _Total and _Combine == 1)
+      return __x[_Index];
+    else if constexpr (_Index % 2 == 0 && _Total % 2 == 0 && _Combine % 2 ==
0)
       return __extract_part<_Index / 2, _Total / 2, _Combine / 2>(__x);
     else
       {
@@ -235,39 +237,11 @@ __extract_part(const _SimdWrapper<_Tp, _Np> __x)
          return __x;
        else if constexpr (_Index == 0)
          return __intrin_bitcast<_R>(__as_vector(__x));
-#if _GLIBCXX_SIMD_X86INTRIN // {{{
-       else if constexpr (sizeof(__x) == 32
-                          && __return_size * sizeof(_Tp) <= 16)
-         {
-           constexpr size_t __bytes_to_skip = __values_to_skip * sizeof(_Tp);
-           if constexpr (__bytes_to_skip == 16)
-             return __vector_bitcast<_Tp, __return_size>(
-               __hi128(__as_vector(__x)));
-           else
-             return __vector_bitcast<_Tp, __return_size>(
-               _mm_alignr_epi8(__hi128(__vector_bitcast<_LLong>(__x)),
-                               __lo128(__vector_bitcast<_LLong>(__x)),
-                               __bytes_to_skip));
-         }
-#endif // _GLIBCXX_SIMD_X86INTRIN }}}
-       else if constexpr (_Index > 0
-                          && (__values_to_skip % __return_size != 0
-                              || sizeof(_R) >= 8)
-                          && (__values_to_skip + __return_size) * sizeof(_Tp)
-                               <= 64
-                          && sizeof(__x) >= 16)
-         return __intrin_bitcast<_R>(
-           __shift_elements_right<__values_to_skip * sizeof(_Tp)>(
-             __as_vector(__x)));
        else
-         {
-           _R __r = {};
-           __builtin_memcpy(&__r,
-                            reinterpret_cast<const char*>(&__x)
-                              + sizeof(_Tp) * __values_to_skip,
-                            __return_size * sizeof(_Tp));
-           return __r;
-         }
+         return __vec_shuffle(__as_vector(__x),
make_index_sequence<__bit_ceil(__return_size)>(),
+                              [](size_t __i) {
+                                return __i + __values_to_skip;
+                              });
       }
   }

diff --git a/libstdc++-v3/testsuite/experimental/simd/pr114958.cc
b/libstdc++-v3/testsuite/experimental/simd/pr114958.cc
new file mode 100644
index 00000000000..94c9e0a2d18
--- /dev/null
+++ b/libstdc++-v3/testsuite/experimental/simd/pr114958.cc
@@ -0,0 +1,20 @@
+// { dg-options "-std=c++17" }
+// { dg-do compile { target x86_64-*-* } }
+// { dg-require-effective-target c++17 }
+// { dg-additional-options "-march=x86-64-v3" { target x86_64-*-* } }
+// { dg-require-cmath "" }
+// { dg-final { scan-assembler-times "vperm(q|pd)\[\\t \]+\\\$144" 1 } }
+
+#include <experimental/simd>
+
+namespace stdx = std::experimental;
+
+using T = std::uint64_t;
+using V = stdx::simd<T, stdx::simd_abi::_VecBuiltin<32>>;
+using V1 = stdx::simd<T, stdx::simd_abi::scalar>;
+
+V perm(V data)
+{
+  auto [carry, _] = stdx::split<3, 1>(data);
+  return concat(V1(), carry);
+}

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [Bug libstdc++/114958] use __builtin_shufflevector for std::experimental::simd split and concat (at least the common cases) to enable better optimizations
  2024-05-06 10:29 [Bug libstdc++/114958] New: use __builtin_shufflevector for std::experimental::simd split and concat (at least the common cases) to enable better optimizations mkretz at gcc dot gnu.org
  2024-05-06 10:35 ` [Bug libstdc++/114958] " mkretz at gcc dot gnu.org
@ 2024-05-06 10:45 ` jakub at gcc dot gnu.org
  2024-05-06 11:08 ` mkretz at gcc dot gnu.org
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 8+ messages in thread
From: jakub at gcc dot gnu.org @ 2024-05-06 10:45 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114958

Jakub Jelinek <jakub at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |jakub at gcc dot gnu.org

--- Comment #2 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
I'm not sure the header should use __builtin_shufflevector unconditionally; I
think it would be better to test for it using __has_builtin first.
GCC has had it since GCC 12, I think, and Clang for much longer; I'm not sure
about other compilers (ICC).

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [Bug libstdc++/114958] use __builtin_shufflevector for std::experimental::simd split and concat (at least the common cases) to enable better optimizations
  2024-05-06 10:29 [Bug libstdc++/114958] New: use __builtin_shufflevector for std::experimental::simd split and concat (at least the common cases) to enable better optimizations mkretz at gcc dot gnu.org
  2024-05-06 10:35 ` [Bug libstdc++/114958] " mkretz at gcc dot gnu.org
  2024-05-06 10:45 ` jakub at gcc dot gnu.org
@ 2024-05-06 11:08 ` mkretz at gcc dot gnu.org
  2024-05-06 11:24 ` jakub at gcc dot gnu.org
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 8+ messages in thread
From: mkretz at gcc dot gnu.org @ 2024-05-06 11:08 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114958

--- Comment #3 from Matthias Kretz (Vir) <mkretz at gcc dot gnu.org> ---
Hmm, it actually fails on Clang because Clang requires the vector width of both
arguments to be equal.

But yes, I guess I should make __vec_shuffle friendlier to non-GCC compilers.
I'm not sure about backporting, maybe GCC 14.2 can get it?

I'd be happy to get feedback on my dg-* usage, though. It's the first time I've
tried to require a certain optimized outcome. I guess it would be better to scan
for the shuffle pattern rather than the selected instruction? I'm a noob on
that topic.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [Bug libstdc++/114958] use __builtin_shufflevector for std::experimental::simd split and concat (at least the common cases) to enable better optimizations
  2024-05-06 10:29 [Bug libstdc++/114958] New: use __builtin_shufflevector for std::experimental::simd split and concat (at least the common cases) to enable better optimizations mkretz at gcc dot gnu.org
                   ` (2 preceding siblings ...)
  2024-05-06 11:08 ` mkretz at gcc dot gnu.org
@ 2024-05-06 11:24 ` jakub at gcc dot gnu.org
  2024-05-13 11:41 ` cvs-commit at gcc dot gnu.org
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 8+ messages in thread
From: jakub at gcc dot gnu.org @ 2024-05-06 11:24 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114958

--- Comment #4 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
Strange; __builtin_shufflevector was added to GCC for Clang
compatibility.
__builtin_shuffle is the original native GCC builtin (which Clang doesn't
implement).

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [Bug libstdc++/114958] use __builtin_shufflevector for std::experimental::simd split and concat (at least the common cases) to enable better optimizations
  2024-05-06 10:29 [Bug libstdc++/114958] New: use __builtin_shufflevector for std::experimental::simd split and concat (at least the common cases) to enable better optimizations mkretz at gcc dot gnu.org
                   ` (3 preceding siblings ...)
  2024-05-06 11:24 ` jakub at gcc dot gnu.org
@ 2024-05-13 11:41 ` cvs-commit at gcc dot gnu.org
  2024-05-15 10:04 ` mkretz at gcc dot gnu.org
  2024-05-29  7:03 ` mkretz at gcc dot gnu.org
  6 siblings, 0 replies; 8+ messages in thread
From: cvs-commit at gcc dot gnu.org @ 2024-05-13 11:41 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114958

--- Comment #5 from GCC Commits <cvs-commit at gcc dot gnu.org> ---
The master branch has been updated by Matthias Kretz <mkretz@gcc.gnu.org>:

https://gcc.gnu.org/g:fb1649f8b4ad5043dd0e65e4e3a643a0ced018a9

commit r15-429-gfb1649f8b4ad5043dd0e65e4e3a643a0ced018a9
Author: Matthias Kretz <m.kretz@gsi.de>
Date:   Mon May 6 12:13:55 2024 +0200

    libstdc++: Use __builtin_shufflevector for simd split and concat

    Signed-off-by: Matthias Kretz <m.kretz@gsi.de>

    libstdc++-v3/ChangeLog:

            PR libstdc++/114958
            * include/experimental/bits/simd.h (__as_vector): Return scalar
            simd as one-element vector. Return vector from single-vector
            fixed_size simd.
            (__vec_shuffle): New.
            (__extract_part): Adjust return type signature.
            (split): Use __extract_part for any split into non-fixed_size
            simds.
            (concat): If the return type stores a single vector, use
            __vec_shuffle (which calls __builtin_shufflevector) to produce
            the return value.
            * include/experimental/bits/simd_builtin.h
            (__shift_elements_right): Removed.
            (__extract_part): Return single elements directly. Use
            __vec_shuffle (which calls __builtin_shufflevector) for all
            non-trivial cases.
            * include/experimental/bits/simd_fixed_size.h (__extract_part):
            Return single elements directly.
            * testsuite/experimental/simd/pr114958.cc: New test.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [Bug libstdc++/114958] use __builtin_shufflevector for std::experimental::simd split and concat (at least the common cases) to enable better optimizations
  2024-05-06 10:29 [Bug libstdc++/114958] New: use __builtin_shufflevector for std::experimental::simd split and concat (at least the common cases) to enable better optimizations mkretz at gcc dot gnu.org
                   ` (4 preceding siblings ...)
  2024-05-13 11:41 ` cvs-commit at gcc dot gnu.org
@ 2024-05-15 10:04 ` mkretz at gcc dot gnu.org
  2024-05-29  7:03 ` mkretz at gcc dot gnu.org
  6 siblings, 0 replies; 8+ messages in thread
From: mkretz at gcc dot gnu.org @ 2024-05-15 10:04 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114958

--- Comment #6 from Matthias Kretz (Vir) <mkretz at gcc dot gnu.org> ---
The last commit introduced a regression on i686 where __builtin_shufflevector
was producing MMX vectors (which can mess up the FPU). Untested patch which
resolves the issue:

libstdc++-v3/ChangeLog:

        PR libstdc++/114958
        * include/experimental/bits/simd.h (__as_vector): Don't use
        vector_size(8) on __i386__.
        (__vec_shuffle): Never return MMX vectors, widen to 16 bytes
        instead.
        (concat): Fix padding calculation to pick up widening logic from
        __as_vector.

diff --git a/libstdc++-v3/include/experimental/bits/simd.h
b/libstdc++-v3/include/experimental/bits/simd.h
index 6a6fd4f109d..63951df488c 100644
--- a/libstdc++-v3/include/experimental/bits/simd.h
+++ b/libstdc++-v3/include/experimental/bits/simd.h
@@ -1665,7 +1665,12 @@ __as_vector(_V __x)
          {
            static_assert(is_simd<_V>::value);
            using _Tp = typename _V::value_type;
+#ifdef __i386__
+           constexpr auto __bytes = sizeof(_Tp) == 8 ? 16 : sizeof(_Tp);
+           using _RV [[__gnu__::__vector_size__(__bytes)]] = _Tp;
+#else
            using _RV [[__gnu__::__vector_size__(sizeof(_Tp))]] = _Tp;
+#endif
            return _RV{__data(__x)};
          }
       }
@@ -2081,11 +2086,14 @@ __not(_Tp __a) noexcept
 // }}}
 // __vec_shuffle{{{
 template <typename _T0, typename _T1, typename _Fun, size_t... _Is>
-  _GLIBCXX_SIMD_INTRINSIC constexpr auto
+  _GLIBCXX_SIMD_INTRINSIC constexpr
+  __vector_type_t<typename _VectorTraits<_T0>::value_type, sizeof...(_Is)>
   __vec_shuffle(_T0 __x, _T1 __y, index_sequence<_Is...> __seq, _Fun
__idx_perm)
   {
     constexpr int _N0 = sizeof(__x) / sizeof(__x[0]);
     constexpr int _N1 = sizeof(__y) / sizeof(__y[0]);
+    using _Tp = typename _VectorTraits<_T0>::value_type;
+    using _RV [[maybe_unused]] = __vector_type_t<_Tp, sizeof...(_Is)>;
 #if __has_builtin(__builtin_shufflevector)
 #ifdef __clang__
     // Clang requires _T0 == _T1
@@ -2105,14 +2113,23 @@ __not(_Tp __a) noexcept
             });
     else
 #endif
-      return __builtin_shufflevector(__x, __y, [=] {
-              constexpr int __j = __idx_perm(_Is);
-              static_assert(__j < _N0 + _N1);
-              return __j;
-            }()...);
+      {
+       const auto __r = __builtin_shufflevector(__x, __y, [=] {
+                          constexpr int __j = __idx_perm(_Is);
+                          static_assert(__j < _N0 + _N1);
+                          return __j;
+                        }()...);
+#ifdef __i386__
+       if constexpr (sizeof(__r) == sizeof(_RV))
+         return __r;
+       else
+         return _RV {__r[_Is]...};
+#else
+       return __r;
+#endif
+      }
 #else
-    using _Tp = __remove_cvref_t<decltype(__x[0])>;
-    return __vector_type_t<_Tp, sizeof...(_Is)> {
+    return _RV {
       [=]() -> _Tp {
        constexpr int __j = __idx_perm(_Is);
        static_assert(__j < _N0 + _N1);
@@ -4393,9 +4410,9 @@ for (unsigned __j = 0; __j < __i; ++__j)
                __vec_shuffle(__as_vector(__xs)...,
std::make_index_sequence<_RW::_S_full_size>(),
                              [](int __i) {
                                constexpr int __sizes[2] =
{int(simd_size_v<_Tp, _As>)...};
-                               constexpr int __padding0
-                                 = sizeof(__vector_type_t<_Tp, __sizes[0]>) /
sizeof(_Tp)
-                                     - __sizes[0];
+                               constexpr int __vsizes[2]
+                                 = {int(sizeof(__as_vector(__xs)) /
sizeof(_Tp))...};
+                               constexpr int __padding0 = __vsizes[0] -
__sizes[0];
                                return __i >= _Np ? -1 : __i < __sizes[0] ? __i
: __i + __padding0;
                              })};
       }

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [Bug libstdc++/114958] use __builtin_shufflevector for std::experimental::simd split and concat (at least the common cases) to enable better optimizations
  2024-05-06 10:29 [Bug libstdc++/114958] New: use __builtin_shufflevector for std::experimental::simd split and concat (at least the common cases) to enable better optimizations mkretz at gcc dot gnu.org
                   ` (5 preceding siblings ...)
  2024-05-15 10:04 ` mkretz at gcc dot gnu.org
@ 2024-05-29  7:03 ` mkretz at gcc dot gnu.org
  6 siblings, 0 replies; 8+ messages in thread
From: mkretz at gcc dot gnu.org @ 2024-05-29  7:03 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114958

--- Comment #7 from Matthias Kretz (Vir) <mkretz at gcc dot gnu.org> ---
The regression is now tracked as PR115247.

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2024-05-29  7:03 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-05-06 10:29 [Bug libstdc++/114958] New: use __builtin_shufflevector for std::experimental::simd split and concat (at least the common cases) to enable better optimizations mkretz at gcc dot gnu.org
2024-05-06 10:35 ` [Bug libstdc++/114958] " mkretz at gcc dot gnu.org
2024-05-06 10:45 ` jakub at gcc dot gnu.org
2024-05-06 11:08 ` mkretz at gcc dot gnu.org
2024-05-06 11:24 ` jakub at gcc dot gnu.org
2024-05-13 11:41 ` cvs-commit at gcc dot gnu.org
2024-05-15 10:04 ` mkretz at gcc dot gnu.org
2024-05-29  7:03 ` mkretz at gcc dot gnu.org

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).