From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 48) id B784D3882123; Fri, 14 Jun 2024 13:45:05 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org B784D3882123 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org; s=default; t=1718372705; bh=f8oZiugspOIpifMaJr6xPDrtRoXekFMbzvm7DddD02c=; h=From:To:Subject:Date:In-Reply-To:References:From; b=pZoURPbe6hlTd4OFXoCTHozSSgP0TaD6k3ba3lOZ9asfwQiuGiBIxOx1A85km3cYL blRq2/qkIPWi6dsynydBGpD95rLdfaqTkD1sE/+F0bJfJXDnRqGWrscxnQwL0VfSH0 hbFSzdgdluA6nlbrf0tfe0olXpUmEsDnkVM7gz88= From: "mkretz at gcc dot gnu.org" To: gcc-bugs@gcc.gnu.org Subject: [Bug libstdc++/115454] std::experimental::find_last_set is buggy on x86-64-v4 Date: Fri, 14 Jun 2024 13:45:04 +0000 X-Bugzilla-Reason: CC X-Bugzilla-Type: changed X-Bugzilla-Watch-Reason: None X-Bugzilla-Product: gcc X-Bugzilla-Component: libstdc++ X-Bugzilla-Version: 14.0 X-Bugzilla-Keywords: wrong-code X-Bugzilla-Severity: normal X-Bugzilla-Who: mkretz at gcc dot gnu.org X-Bugzilla-Status: ASSIGNED X-Bugzilla-Resolution: X-Bugzilla-Priority: P3 X-Bugzilla-Assigned-To: mkretz at gcc dot gnu.org X-Bugzilla-Target-Milestone: --- X-Bugzilla-Flags: X-Bugzilla-Changed-Fields: Message-ID: In-Reply-To: References: Content-Type: text/plain; charset="UTF-8" Content-Transfer-Encoding: quoted-printable X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/ Auto-Submitted: auto-generated MIME-Version: 1.0 List-Id: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=3D115454 --- Comment #2 from Matthias Kretz (Vir) --- Actually, find_last_set needs to clear padding bits as well. This fixes the issue. Regression tests are running now. commit b85ff52c1fb059f75bcedf103a15fce9db8bc92b Author: Matthias Kretz Date: Fri Jun 14 15:11:25 2024 +0200 libstdc++: Fix find_last_set(simd_mask) to ignore padding bits With the change to the AVX512 find_last_set implementation, the change to AVX512 operator!=3D is unnecessary. However, the latter was not producing optimal code and unnecessarily set the padding bits. In theory, the compiler could determine that with the new !=3D implementation, the bit operation for clearing the padding bits is a no-op and can be elided. Signed-off-by: Matthias Kretz libstdc++-v3/ChangeLog: PR libstdc++/115454 * include/experimental/bits/simd_x86.h (_S_not_equal_to): Use neq comparison instead of bitwise negation after eq. (_S_find_last_set): Clear unused high bits before computing bit_width. * testsuite/experimental/simd/pr115454_find_last_set.cc: New test. diff --git a/libstdc++-v3/include/experimental/bits/simd_x86.h b/libstdc++-v3/include/experimental/bits/simd_x86.h index 517c4b4a5be..8a23aa2082b 100644 --- a/libstdc++-v3/include/experimental/bits/simd_x86.h +++ b/libstdc++-v3/include/experimental/bits/simd_x86.h @@ -2339,29 +2339,29 @@ _S_not_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) __assert_unreachable<_Tp>(); } else if constexpr (sizeof(__xi) =3D=3D 64 && sizeof(_Tp) =3D=3D= 8) - return ~_mm512_mask_cmpeq_epi64_mask(__k1, __xi, __yi); + return _mm512_mask_cmpneq_epi64_mask(__k1, __xi, __yi); else if constexpr (sizeof(__xi) =3D=3D 64 && sizeof(_Tp) =3D=3D= 4) - return ~_mm512_mask_cmpeq_epi32_mask(__k1, __xi, __yi); + return _mm512_mask_cmpneq_epi32_mask(__k1, __xi, __yi); else if constexpr (sizeof(__xi) =3D=3D 64 && sizeof(_Tp) =3D=3D= 2) - return ~_mm512_mask_cmpeq_epi16_mask(__k1, __xi, __yi); + return _mm512_mask_cmpneq_epi16_mask(__k1, __xi, __yi); else if constexpr (sizeof(__xi) =3D=3D 64 && sizeof(_Tp) =3D=3D= 1) - return ~_mm512_mask_cmpeq_epi8_mask(__k1, __xi, __yi); + return _mm512_mask_cmpneq_epi8_mask(__k1, __xi, __yi); else if constexpr (sizeof(__xi) =3D=3D 32 && sizeof(_Tp) =3D=3D= 8) - return ~_mm256_mask_cmpeq_epi64_mask(__k1, __xi, __yi); + return _mm256_mask_cmpneq_epi64_mask(__k1, __xi, __yi); else if constexpr (sizeof(__xi) =3D=3D 32 && sizeof(_Tp) =3D=3D= 4) - return ~_mm256_mask_cmpeq_epi32_mask(__k1, __xi, __yi); + return _mm256_mask_cmpneq_epi32_mask(__k1, __xi, __yi); else if constexpr (sizeof(__xi) =3D=3D 32 && sizeof(_Tp) =3D=3D= 2) - return ~_mm256_mask_cmpeq_epi16_mask(__k1, __xi, __yi); + return _mm256_mask_cmpneq_epi16_mask(__k1, __xi, __yi); else if constexpr (sizeof(__xi) =3D=3D 32 && sizeof(_Tp) =3D=3D= 1) - return ~_mm256_mask_cmpeq_epi8_mask(__k1, __xi, __yi); + return _mm256_mask_cmpneq_epi8_mask(__k1, __xi, __yi); else if constexpr (sizeof(__xi) =3D=3D 16 && sizeof(_Tp) =3D=3D= 8) - return ~_mm_mask_cmpeq_epi64_mask(__k1, __xi, __yi); + return _mm_mask_cmpneq_epi64_mask(__k1, __xi, __yi); else if constexpr (sizeof(__xi) =3D=3D 16 && sizeof(_Tp) =3D=3D= 4) - return ~_mm_mask_cmpeq_epi32_mask(__k1, __xi, __yi); + return _mm_mask_cmpneq_epi32_mask(__k1, __xi, __yi); else if constexpr (sizeof(__xi) =3D=3D 16 && sizeof(_Tp) =3D=3D= 2) - return ~_mm_mask_cmpeq_epi16_mask(__k1, __xi, __yi); + return _mm_mask_cmpneq_epi16_mask(__k1, __xi, __yi); else if constexpr (sizeof(__xi) =3D=3D 16 && sizeof(_Tp) =3D=3D= 1) - return ~_mm_mask_cmpeq_epi8_mask(__k1, __xi, __yi); + return _mm_mask_cmpneq_epi8_mask(__k1, __xi, __yi); else __assert_unreachable<_Tp>(); } // }}} @@ -5292,7 +5292,7 @@ _S_find_first_set(simd_mask<_Tp, _Abi> __k) _S_find_last_set(simd_mask<_Tp, _Abi> __k) { if constexpr (__is_avx512_abi<_Abi>()) - return std::__bit_width(__k._M_data._M_data) - 1; + return std::__bit_width(_Abi::_S_masked(__k._M_data)._M_data) - 1; else return _Base::_S_find_last_set(__k); } diff --git a/libstdc++-v3/testsuite/experimental/simd/pr115454_find_last_se= t.cc b/libstdc++-v3/testsuite/experimental/simd/pr115454_find_last_set.cc new file mode 100644 index 00000000000..b47f19d3067 --- /dev/null +++ b/libstdc++-v3/testsuite/experimental/simd/pr115454_find_last_set.cc @@ -0,0 +1,49 @@ +// { dg-options "-std=3Dgnu++17" } +// { dg-do run { target *-*-* } } +// { dg-require-effective-target c++17 } +// { dg-additional-options "-march=3Dx86-64-v4" { target avx512f } } +// { dg-require-cmath "" } + +#include + +namespace stdx =3D std::experimental; + +using T =3D std::uint64_t; + +template +using V =3D stdx::simd>; + +[[gnu::noinline, gnu::noipa]] +int reduce(V x) +{ + static_assert(stdx::find_last_set(V([](unsigned i) { return i; }) = !=3D V(0)) =3D=3D 3); + return stdx::find_last_set(x !=3D -1); +} + +[[gnu::noinline, gnu::noipa]] +int reduce2() +{ + using M8 =3D typename V::mask_type; + using M4 =3D typename V::mask_type; + if constexpr (sizeof(M8) =3D=3D sizeof(M4)) + { + M4 k; + __builtin_memcpy(&__data(k), &__data(M8(true)), sizeof(M4)); + return stdx::find_last_set(k); + } + return 3; +} + + +int main() +{ + const V x {}; + + const int r =3D reduce(x); + if (r !=3D 3) + __builtin_abort(); + + const int r2 =3D reduce2(); + if (r2 !=3D 3) + __builtin_abort(); +}=