From: Matthias Kretz
To: Srinivas Yadav Singanaboina
Subject: Re: [PATCH v2] libstdc++: add ARM SVE support to std::experimental::simd
Date: Fri, 8 Mar 2024 10:57:22 +0100
Message-ID: <5282839.4XsnlVU6TS@minbar>
Organization: GSI Helmholtzzentrum für Schwerionenforschung
In-Reply-To: <20240209142810.97817-1-vasu.srinivasvasu.14@gmail.com>
References: <20240209142810.97817-1-vasu.srinivasvasu.14@gmail.com>

Hi,

I applied and did extended testing on x86_64 (no regressions) and aarch64 using qemu, testing SVE 256, 512, and 1024. Looks good!

While going through the applied patch I noticed a few style issues that I simply turned into a patch (attached). A few comments inline. Sorry for not seeing these before.
On Friday, 9 February 2024 15:28:10 CET Srinivas Yadav Singanaboina wrote:
> diff --git a/libstdc++-v3/include/experimental/bits/simd.h
> b/libstdc++-v3/include/experimental/bits/simd.h index
> 90523ea57dc..d274cd740fe 100644
> --- a/libstdc++-v3/include/experimental/bits/simd.h
> +++ b/libstdc++-v3/include/experimental/bits/simd.h
> @@ -39,12 +39,16 @@
> #include
> #include
> #include
> +#include
>
> #if _GLIBCXX_SIMD_X86INTRIN
> #include
> #elif _GLIBCXX_SIMD_HAVE_NEON
> #include
> #endif
> +#if _GLIBCXX_SIMD_HAVE_SVE
> +#include
> +#endif
>
> /** @ingroup ts_simd
> * @{
> @@ -83,6 +87,12 @@ using __m512d [[__gnu__::__vector_size__(64)]] = double;
> using __m512i [[__gnu__::__vector_size__(64)]] = long long;
> #endif
>
> +#if _GLIBCXX_SIMD_HAVE_SVE
> +constexpr inline int __sve_vectorized_size_bytes = __ARM_FEATURE_SVE_BITS / 8;
> +#else
> +constexpr inline int __sve_vectorized_size_bytes = 0;
> +#endif
> +
> namespace simd_abi {
> // simd_abi forward declarations {{{
> // implementation details:
> @@ -108,6 +118,9 @@ template
> template
> struct _VecBltnBtmsk;
>
> +template
> + struct _SveAbi;
> +
> template
> using _VecN = _VecBuiltin;
>
> @@ -123,6 +136,9 @@ template
> template
> using _Neon = _VecBuiltin<_UsedBytes>;
>
> +template
> + using _Sve = _SveAbi<_UsedBytes, __sve_vectorized_size_bytes>;
> +
> // implementation-defined:
> using __sse = _Sse<>;
> using __avx = _Avx<>;
> @@ -130,6 +146,7 @@ using __avx512 = _Avx512<>;
> using __neon = _Neon<>;
> using __neon128 = _Neon<16>;
> using __neon64 = _Neon<8>;
> +using __sve = _Sve<>;
>
> // standard:
> template
> @@ -250,6 +267,8 @@ constexpr inline bool __support_neon_float = false;
> #endif
>
> +constexpr inline bool __have_sve = _GLIBCXX_SIMD_HAVE_SVE;
> +
> #ifdef _ARCH_PWR10
> constexpr inline bool __have_power10vec = true;
> #else
> @@ -356,12 +375,13 @@ namespace __detail
>
>     | (__have_avx512vnni << 27)
>     | (__have_avx512vpopcntdq << 28)
>     | (__have_avx512vp2intersect << 29);
>
> -   else if constexpr (__have_neon)
> +   else if constexpr (__have_neon || __have_sve)
>     return __have_neon
>
>     | (__have_neon_a32 << 1)
>     | (__have_neon_a64 << 2)
>
> -   | (__support_neon_float << 3);
> +   | (__support_neon_float << 3)
> +   | (__have_sve << 4);

This is not enough. This should list all feature flags that might have a (significant enough) influence on code-gen in inline functions (that are not always_inline). AFAIU at least __ARM_FEATURE_SVE2 is necessary. But I assume __ARM_FEATURE_SVE2_BITPERM, __ARM_FEATURE_SVE_BITS, __ARM_FEATURE_SVE_MATMUL_INT8, and __ARM_FEATURE_SVE_VECTOR_OPERATORS are also relevant. Maybe more?

> [...] bits/simd.h:
> // fall back to fixed_size only if scalar and native ABIs don't match
> template
> struct __deduce_fixed_size_fallback {};
>
> +template
> + struct __no_sve_deduce_fixed_size_fallback {};
> +
> template
> struct __deduce_fixed_size_fallback<_Tp, _Np,
> enable_if_t::template _S_is_valid_v<_Tp>>>
> { using type = simd_abi::fixed_size<_Np>; };
>
> +template
> + struct __no_sve_deduce_fixed_size_fallback<_Tp, _Np,
> + enable_if_t::template _S_is_valid_v<_Tp>>>
> + { using type = simd_abi::fixed_size<_Np>; };
> +
> template
> struct __deduce_impl : public __deduce_fixed_size_fallback<_Tp, _Np> {};
>
> +template
> + struct __no_sve_deduce_impl : public
> __no_sve_deduce_fixed_size_fallback<_Tp, _Np> {};

I believe you don't need __no_sve_deduce_fixed_size_fallback.
Simply derive __no_sve_deduce_impl from __deduce_fixed_size_fallback. No?

> diff --git a/libstdc++-v3/include/experimental/bits/simd_converter.h
> b/libstdc++-v3/include/experimental/bits/simd_converter.h index
> 3160e251632..b233d2c70a5 100644
> --- a/libstdc++-v3/include/experimental/bits/simd_converter.h
> +++ b/libstdc++-v3/include/experimental/bits/simd_converter.h
> @@ -28,6 +28,18 @@
> #if __cplusplus >= 201703L
>
> _GLIBCXX_SIMD_BEGIN_NAMESPACE
> +
> +template
> +_Ret __converter_fallback(_Arg __a)
> + {
> +   _Ret __ret{};
> +   __execute_n_times<_Np>(
> +     [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
> +       __ret._M_set(__i, static_cast<_To>(__a[__i]));
> +     });
> +   return __ret;
> + }
> +
> // _SimdConverter scalar -> scalar {{{
> template
> struct _SimdConverter<_From, simd_abi::scalar, _To, simd_abi::scalar,
> @@ -56,14 +68,16 @@ template
> };
>
> // }}}
> -// _SimdConverter "native 1" -> "native 2" {{{
> +// _SimdConverter "native non-sve 1" -> "native non-sve 2" {{{
> template
> struct _SimdConverter<
> _From, _AFrom, _To, _ATo,
> enable_if_t __is_fixed_size_abi<_AFrom>, __is_fixed_size_abi<_ATo>,
> is_same<_AFrom, simd_abi::scalar>, is_same<_ATo, simd_abi::scalar>,
> - conjunction, is_same<_AFrom, _ATo>>>>>
> + conjunction, is_same<_AFrom, _ATo>>>
> + && !(__is_sve_abi<_AFrom>() || __is_sve_abi<_ATo>())
> + >>
> {
> using _Arg = typename _AFrom::template __traits<_From>::_SimdMember;
> using _Ret = typename _ATo::template __traits<_To>::_SimdMember;
> @@ -75,6 +89,26 @@ template typename _ATo> { return __vector_convert<_V>(__a, __more...); }
> };
>
> +// }}}
> +// _SimdConverter "native 1" -> "native 2" {{{
> +template
> + struct _SimdConverter<
> +   _From, _AFrom, _To, _ATo,
> +   enable_if_t
> +   __is_fixed_size_abi<_AFrom>, __is_fixed_size_abi<_ATo>,
> +   is_same<_AFrom, simd_abi::scalar>, is_same<_ATo, simd_abi::scalar>,
> +   conjunction, is_same<_AFrom, _ATo>>>
> +   && (__is_sve_abi<_AFrom>() || __is_sve_abi<_ATo>())
> +   >>
> + {
> +   using _Arg = typename _AFrom::template __traits<_From>::_SimdMember;
> +   using _Ret = typename _ATo::template __traits<_To>::_SimdMember;
> +
> +   _GLIBCXX_SIMD_INTRINSIC constexpr _Ret
> +   operator()(_Arg __x) const noexcept
> +   { return __converter_fallback<_Arg, _Ret, _To, simd_size_v<_To, _ATo>>(__x); }
> + };
> +

I'd prefer if you could solve this with a constexpr-if in operator() instead of making the enable_if condition even longer. Feel free to static_assert(sizeof...(_More) == 0) in the SVE branch. (Why is it unnecessary, though?)

> // }}}
> // _SimdConverter scalar -> fixed_size<1> {{{1
> template
> @@ -111,6 +145,10 @@ template
> if constexpr (is_same_v<_From, _To>)
>   return __x;
>
> + // fallback to sequential when sve is available
> + else if constexpr (__have_sve)
> +   return __converter_fallback<_Arg, _Ret, _To, _Np>(__x);
> +

At least the next three cases should all work, no? Or is the point that this fallback leads to better code-gen with SVE?
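To make the suggestion concrete, here is a minimal sketch of what I mean, reconstructed from the quoted context (the enable_if condition and member typedefs are my reconstruction of the existing "native 1" -> "native 2" specialization, not copied from the patch):

  // One converter specialization; the SVE case is dispatched inside operator()
  // via constexpr-if instead of a second, longer enable_if condition.
  template <typename _From, typename _AFrom, typename _To, typename _ATo>
    struct _SimdConverter<
      _From, _AFrom, _To, _ATo,
      enable_if_t<!disjunction_v<
        __is_fixed_size_abi<_AFrom>, __is_fixed_size_abi<_ATo>,
        is_same<_AFrom, simd_abi::scalar>, is_same<_ATo, simd_abi::scalar>,
        conjunction<is_same<_From, _To>, is_same<_AFrom, _ATo>>>>>
    {
      using _Arg = typename _AFrom::template __traits<_From>::_SimdMember;
      using _Ret = typename _ATo::template __traits<_To>::_SimdMember;

      template <typename... _More>
        _GLIBCXX_SIMD_INTRINSIC constexpr _Ret
        operator()(_Arg __a, _More... __more) const noexcept
        {
          if constexpr (__is_sve_abi<_AFrom>() || __is_sve_abi<_ATo>())
            {
              // element-wise fallback for SVE operands
              static_assert(sizeof...(_More) == 0);
              return __converter_fallback<_Arg, _Ret, _To,
                                          simd_size_v<_To, _ATo>>(__a);
            }
          else
            return __vector_convert<__vector_type_t<_To, simd_size_v<_To, _ATo>>>(
                     __a, __more...);
        }
    };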
> diff --git a/libstdc++-v3/include/experimental/bits/simd_detail.h > b/libstdc++-v3/include/experimental/bits/simd_detail.h index > 1fb77866bb2..52fdf7149bf 100644 > --- a/libstdc++-v3/include/experimental/bits/simd_detail.h > +++ b/libstdc++-v3/include/experimental/bits/simd_detail.h > @@ -61,6 +61,11 @@ > #else > #define _GLIBCXX_SIMD_HAVE_NEON_A64 0 > #endif > +#if (__ARM_FEATURE_SVE_BITS > 0 && __ARM_FEATURE_SVE_VECTOR_OPERATORS=3D= =3D1) > +#define _GLIBCXX_SIMD_HAVE_SVE 1 > +#else > +#define _GLIBCXX_SIMD_HAVE_SVE 0 > +#endif > //}}} > // x86{{{ > #ifdef __MMX__ > @@ -267,7 +272,7 @@ > #define _GLIBCXX_SIMD_IS_UNLIKELY(__x) __builtin_expect(__x, 0) > #define _GLIBCXX_SIMD_IS_LIKELY(__x) __builtin_expect(__x, 1) >=20 > -#if __STRICT_ANSI__ || defined __clang__ > +#if _GLIBCXX_SIMD_HAVE_SVE || __STRICT_ANSI__ || defined __clang__ > #define _GLIBCXX_SIMD_CONSTEXPR > #define _GLIBCXX_SIMD_USE_CONSTEXPR_API const This is something I'd like to see resolved. (But not necessary for this pat= ch,=20 IMHO.) Even if some parts of the SVE interface can't be used in constant=20 expressions, it must be possible to work around those with `if=20 (__builtin_is_constant_evaluated())` branches. For C++26 we will have to do= =20 this, because the std::simd interface is fully constexpr. > diff --git a/libstdc++-v3/include/experimental/bits/simd_sve.h > b/libstdc++-v3/include/experimental/bits/simd_sve.h new file mode 100644 > index 00000000000..123242a3a62 > --- /dev/null > +++ b/libstdc++-v3/include/experimental/bits/simd_sve.h [...] > +template > + struct __sve_vector_type > + {}; > + > +template > + using __sve_vector_type_t =3D typename __sve_vector_type<_Tp, _Np>::ty= pe; > + > +template > + struct __sve_vector_type > + { > + typedef svint8_t __sve_vlst_type > __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS))); + > + inline static __sve_vlst_type > + __sve_broadcast(int8_t __dup) > + { return svdup_s8(__dup); } > + > + inline static __sve_bool_type > + __sve_active_mask() > + { return svwhilelt_b8(size_t(0), _Np); }; > + > + using type =3D __sve_vlst_type; > + }; > + > +template > + struct __sve_vector_type > + { > + typedef svuint8_t __sve_vlst_type > __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS))); + > + inline static __sve_vlst_type > + __sve_broadcast(uint8_t __dup) > + { return svdup_u8(__dup); } > + > + inline static __sve_bool_type > + __sve_active_mask() > + { return svwhilelt_b8(size_t(0), _Np); }; > + > + using type =3D __sve_vlst_type; > + }; > + > +template > + struct __sve_vector_type > + { > + typedef svint16_t __sve_vlst_type > __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS))); + > + inline static __sve_vlst_type > + __sve_broadcast(int16_t __dup) > + { return svdup_s16(__dup); } > + > + inline static __sve_bool_type > + __sve_active_mask() > + { return svwhilelt_b16(size_t(0), _Np); }; > + > + using type =3D __sve_vlst_type; > + }; > + > +template > + struct __sve_vector_type > + { > + typedef svuint16_t __sve_vlst_type > __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS))); + > + inline static __sve_vlst_type > + __sve_broadcast(uint16_t __dup) > + { return svdup_u16(__dup); } > + > + inline static __sve_bool_type > + __sve_active_mask() > + { return svwhilelt_b16(size_t(0), _Np); }; > + > + using type =3D __sve_vlst_type; > + }; > + > +template > + struct __sve_vector_type > + { > + typedef svint32_t __sve_vlst_type > __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS))); + > + inline static __sve_vlst_type > + 
__sve_broadcast(int32_t __dup) > + { return svdup_s32(__dup); } > + > + inline static __sve_bool_type > + __sve_active_mask() > + { return svwhilelt_b32(size_t(0), _Np); }; > + > + using type =3D __sve_vlst_type; > + }; > + > +template > + struct __sve_vector_type > + { > + typedef svuint32_t __sve_vlst_type > __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS))); + > + inline static __sve_vlst_type > + __sve_broadcast(uint32_t __dup) > + { return svdup_u32(__dup); } > + > + inline static __sve_bool_type > + __sve_active_mask() > + { return svwhilelt_b32(size_t(0), _Np); }; > + > + using type =3D __sve_vlst_type; > + }; > + > +template > + struct __sve_vector_type > + { > + typedef svint64_t __sve_vlst_type > __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS))); + > + inline static __sve_vlst_type > + __sve_broadcast(int64_t __dup) > + { return svdup_s64(__dup); } > + > + inline static __sve_bool_type > + __sve_active_mask() > + { return svwhilelt_b64(size_t(0), _Np); }; > + > + using type =3D __sve_vlst_type; > + }; > + > +template > + struct __sve_vector_type > + { > + typedef svuint64_t __sve_vlst_type > __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS))); + > + inline static __sve_vlst_type > + __sve_broadcast(uint64_t __dup) > + { return svdup_u64(__dup); } > + > + inline static __sve_bool_type > + __sve_active_mask() > + { return svwhilelt_b64(size_t(0), _Np); }; > + > + using type =3D __sve_vlst_type; > + }; > + > +template > + struct __sve_vector_type > + { > + typedef svfloat32_t __sve_vlst_type > __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS))); + > + inline static __sve_vlst_type > + __sve_broadcast(float __dup) > + { return svdup_f32(__dup); } > + > + inline static __sve_bool_type > + __sve_active_mask() > + { return svwhilelt_b32(size_t(0), _Np); }; > + > + using type =3D __sve_vlst_type; > + }; > + > +template > + struct __sve_vector_type > + { > + typedef svfloat64_t __sve_vlst_type > __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS))); + > + inline static __sve_vlst_type > + __sve_broadcast(double __dup) > + { return svdup_f64(__dup); } > + > + inline static __sve_bool_type > + __sve_active_mask() > + { return svwhilelt_b64(size_t(0), _Np); }; > + > + using type =3D __sve_vlst_type; > + }; > + > +template > + struct __sve_vector_type > + : __sve_vector_type<__get_sve_value_type_t, _Np> > + {}; > + > +template > + struct __sve_vector_type > + : __sve_vector_type<__get_sve_value_type_t, _Np> > + {}; > + > +template > + struct __sve_vector_type > + : __sve_vector_type<__get_sve_value_type_t, _Np> > + {}; > + > +template > + struct __sve_vector_type > + : __sve_vector_type<__get_sve_value_type_t, _Np> > + {}; > + > +template > + struct __sve_vector_type > + : __sve_vector_type<__get_sve_value_type_t, _Np> > + {}; > + > +template > + struct __sve_vector_type > + : __sve_vector_type<__get_sve_value_type_t, _N= p> > + {}; Please replace the last 6 partial specializations with a generic=20 implementation of the primary template: template struct __sve_vector_type : __sve_vector_type<__get_sve_value_type_t, _Np> {}; This avoids issues on platforms that define (u)int64_t as (unsigned) long l= ong=20 and is simpler in any case. [...] 
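Spelled out with the template parameters that the list archive stripped (my reconstruction of the snippet above):

  // Primary template: map long, long long, char, wchar_t, char16_t, char32_t,
  // etc. to the fixed-width type of the same size and signedness and reuse
  // that type's specialization.
  template <typename _Tp, size_t _Np>
    struct __sve_vector_type
    : __sve_vector_type<__get_sve_value_type_t<_Tp>, _Np>
    {};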
> + template > + _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp, _N= p> > + _S_load(const _Up* __p, _SveMaskWrapper __k) > + { > + using _STp =3D __get_sve_value_type_t<_Tp>; > + using _SUp =3D __get_sve_value_type_t<_Up>; > + using _V =3D __sve_vector_type_t<_Tp, _Np>; > + const _SUp* __up =3D reinterpret_cast(__p); > + > + if constexpr (std::is_same_v<_Tp, _Up>) > + return _V(svld1(__k._M_data, __up)); > + if constexpr (std::is_integral_v<_Tp> && std::is_integral_v<_Up> > + && (sizeof(_Tp) > sizeof(_Up))) > + { > + if constexpr (std::is_same_v<_SUp, int8_t>) > + { > + if constexpr (std::is_same_v<_STp, int16_t>) > + return _V(svld1sb_s16(__k._M_data, __up)); > + if constexpr (std::is_same_v<_STp, uint16_t>) > + return _V(svld1sb_u16(__k._M_data, __up)); > + if constexpr (std::is_same_v<_STp, int32_t>) > + return _V(svld1sb_s32(__k._M_data, __up)); > + if constexpr (std::is_same_v<_STp, uint32_t>) > + return _V(svld1sb_u32(__k._M_data, __up)); > + if constexpr (std::is_same_v<_STp, int64_t>) > + return _V(svld1sb_s64(__k._M_data, __up)); > + if constexpr (std::is_same_v<_STp, uint64_t>) > + return _V(svld1sb_u64(__k._M_data, __up)); > + } > + if constexpr (std::is_same_v<_SUp, uint8_t>) > + { > + if constexpr (std::is_same_v<_STp, int16_t>) > + return _V(svld1ub_s16(__k._M_data, __up)); > + if constexpr (std::is_same_v<_STp, uint16_t>) > + return _V(svld1ub_u16(__k._M_data, __up)); > + if constexpr (std::is_same_v<_STp, int32_t>) > + return _V(svld1ub_s32(__k._M_data, __up)); > + if constexpr (std::is_same_v<_STp, uint32_t>) > + return _V(svld1ub_u32(__k._M_data, __up)); > + if constexpr (std::is_same_v<_STp, int64_t>) > + return _V(svld1ub_s64(__k._M_data, __up)); > + if constexpr (std::is_same_v<_STp, uint64_t>) > + return _V(svld1ub_u64(__k._M_data, __up)); > + } > + if constexpr (std::is_same_v<_SUp, int16_t>) > + { > + if constexpr (std::is_same_v<_STp, int32_t>) > + return _V(svld1sh_s32(__k._M_data, __up)); > + if constexpr (std::is_same_v<_STp, uint32_t>) > + return _V(svld1sh_u32(__k._M_data, __up)); > + if constexpr (std::is_same_v<_STp, int64_t>) > + return _V(svld1sh_s64(__k._M_data, __up)); > + if constexpr (std::is_same_v<_STp, uint64_t>) > + return _V(svld1sh_u64(__k._M_data, __up)); > + } > + if constexpr (std::is_same_v<_SUp, uint16_t>) > + { > + if constexpr (std::is_same_v<_STp, int32_t>) > + return _V(svld1uh_s32(__k._M_data, __up)); > + if constexpr (std::is_same_v<_STp, uint32_t>) > + return _V(svld1uh_u32(__k._M_data, __up)); > + if constexpr (std::is_same_v<_STp, int64_t>) > + return _V(svld1uh_s64(__k._M_data, __up)); > + if constexpr (std::is_same_v<_STp, uint64_t>) > + return _V(svld1uh_u64(__k._M_data, __up)); > + } > + if constexpr (std::is_same_v<_SUp, int32_t>) > + { > + if constexpr (std::is_same_v<_STp, int64_t>) > + return _V(svld1sw_s64(__k._M_data, __up)); > + if constexpr (std::is_same_v<_STp, uint64_t>) > + return _V(svld1sw_u64(__k._M_data, __up)); > + } > + if constexpr (std::is_same_v<_SUp, uint32_t>) > + { > + if constexpr (std::is_same_v<_STp, int64_t>) > + return _V(svld1uw_s64(__k._M_data, __up)); > + if constexpr (std::is_same_v<_STp, uint64_t>) > + return _V(svld1uw_u64(__k._M_data, __up)); > + } > + } > + return __generate_from_n_evaluations<_Np, __sve_vector_type_t<_Tp, > _Np>>( > + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { > + return __k[__i] ? static_cast<_Tp>(__p[__i]) : _Tp{}; > + }); =46ine for now, because this unlikely to be used much anyway. 
But I'd prefe= r to=20 see masked vector load(s) + vector conversion(s) at some point. > + } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static constexpr void > + _S_store(_Up* __p, _SveSimdWrapper<_Tp, _Np> __x, > _SveMaskWrapper __k) + { > + using _SUp =3D __get_sve_value_type_t<_Up>; > + using _STp =3D __get_sve_value_type_t<_Tp>; > + > + _SUp* __up =3D reinterpret_cast<_SUp*>(__p); > + > + if constexpr (std::is_same_v<_Tp, _Up>) > + return svst1(__k._M_data, __up, __x); > + if constexpr (std::is_integral_v<_Tp> && std::is_integral_v<_Up> > + && (sizeof(_Tp) > sizeof(_Up))) > + { > + if constexpr (std::is_same_v<_SUp, int8_t> && std::is_signed_v<_STp>) > + return svst1b(__k._M_data, __up, __x); > + if constexpr (std::is_same_v<_SUp, uint8_t> && > std::is_unsigned_v<_STp>) + return svst1b(__k._M_data, __up, __x); > + if constexpr (std::is_same_v<_SUp, int16_t> && std::is_signed_v<_STp= >) > + return svst1h(__k._M_data, __up, __x); > + if constexpr (std::is_same_v<_SUp, uint16_t> && > std::is_unsigned_v<_STp>) + return svst1h(__k._M_data, __up, __x); > + if constexpr (std::is_same_v<_SUp, int32_t> && std::is_signed_v<_STp= >) > + return svst1w(__k._M_data, __up, __x); > + if constexpr (std::is_same_v<_SUp, uint32_t> && > std::is_unsigned_v<_STp>) + return svst1w(__k._M_data, __up, __x); > + } > + > + __execute_n_times<_Np>([&](auto __i) > _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + if (__k[__i]) > + __p[__i] =3D static_cast<_Up>(__x[__i]); > + }); Same as for converting masked loads... > + } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp, _N= p> > + _S_blend(_SveMaskWrapper __k, _SveSimdWrapper<_Tp, > _Np> __at0, + _SveSimdWrapper<_Tp, _Np> __at1) > + { return svsel(__k._M_data, __at1._M_data, __at0._M_data); } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static constexpr void > + _S_store_bool_array(_BitMask<_Np, _Sanitized> __x, bool* __mem) > + { > + __execute_n_times<_Np>([&](auto __i) > _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + __mem[__i] =3D __x[__i]; > + }); > + } > +}; > + > +template > + struct _SimdImplSve > + { > + template > + using _MaskMember =3D typename _Abi::template _MaskMember<_Tp>; > + > + template > + using _SimdMember =3D typename _Abi::template > __traits<_Tp>::_SimdMember; + > + using _CommonImpl =3D typename _Abi::_CommonImpl; > + using _SuperImpl =3D typename _Abi::_SimdImpl; > + using _MaskImpl =3D typename _Abi::_MaskImpl; > + > + template > + static constexpr size_t _S_full_size =3D _Abi::template > _S_full_size<_Tp>; + > + template > + static constexpr size_t _S_size =3D _Abi::template _S_size<_Tp>; > + > + template > + using _TypeTag =3D _Tp*; > + > + using abi_type =3D _Abi; > + > + template > + _GLIBCXX_SIMD_INTRINSIC static constexpr auto > + _S_broadcast(_Tp __x) noexcept > + { > + return __sve_vector_type<_Tp, __sve_vectorized_size_bytes /=20 sizeof(_Tp)> > + ::__sve_broadcast(__x); > + } > + > + template > + inline static constexpr _SimdMember<_Tp> > + _S_generator(_Fp&& __gen, _TypeTag<_Tp>) > + { > + constexpr size_t _Np =3D _S_size<_Tp>; > + _SveSimdWrapper<_Tp, _Np> __ret; > + __execute_n_times<_S_size<_Tp>>( > + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {=20 __ret._M_set(__i, > __gen(__i)); }); + return __ret; > + } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdMember<_Tp> > + _S_load(const _Up* __mem, _TypeTag<_Tp>) noexcept > + { > + constexpr size_t _Np =3D _S_size<_Tp>; > + _SimdMember<_Tp> __ret =3D _CommonImpl::template _S_load<_Tp, _Up,=20 _Np>( > + __mem, _SveMaskWrapper{ > + 
__sve_vector_type<_Tp,=20 _Np>::__sve_active_mask()}); > + return __ret; > + } > + > + template > + static constexpr inline _SveSimdWrapper<_Tp, _Np> > + _S_masked_load(_SveSimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> > __k, const _Up* __mem) + noexcept > + { > + __sve_vector_type_t<_Tp, _Np> __v > + =3D _CommonImpl::template _S_load<_Tp, _Up, _Np>(__mem, __k); > + __sve_vector_type_t<_Tp, _Np> __ret =3D svsel(__k._M_data, __v, > __merge._M_data); + return __ret; > + } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static constexpr void > + _S_store(_SimdMember<_Tp> __v, _Up* __mem, _TypeTag<_Tp>) noexcept > + { > + constexpr size_t _Np =3D _S_size<_Tp>; > + _CommonImpl::template _S_store<_Tp, _Up, _Np>( > + __mem, __v, __sve_vector_type<_Tp, _Np>::__sve_active_mask()); > + } > + > + template > + static constexpr inline void > + _S_masked_store(const _SveSimdWrapper<_Tp, _Np> __v, _Up* __mem, > + const _SveMaskWrapper __k) noexcept > + { _CommonImpl::template _S_store<_Tp, _Up, _Np>(__mem, __v, __k); } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> > + _S_negate(_SveSimdWrapper<_Tp, _Np> __x) noexcept > + { > + return svcmpeq(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), > __x._M_data, + __sve_vector_type<_Tp, > _Np>::__sve_broadcast(_Tp{})); > + } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp > + _S_reduce(simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op) > + { > + auto __x_data =3D __x._M_data; > + constexpr size_t _Np =3D simd_size_v<_Tp, _Abi>; > + using __sve_vec_t =3D __sve_vector_type_t<_Tp, _Np>; > + std::size_t __i =3D __x.size(); > + for (; (__i % 2) !=3D 1; __i /=3D 2) > + { > + __x_data =3D __binary_op(simd<_Tp, _Abi>( > + __private_init, _SveSimdWrapper<_Tp,=20 _Np>( > + =20 __sve_vec_t(svuzp1(__x_data, __x_data)))), > + simd<_Tp, _Abi>( > + __private_init, _SveSimdWrapper<_Tp,=20 _Np>( > + =20 __sve_vec_t(svuzp2(__x_data, __x_data)))) > + )._M_data; > + } > + _Tp __res =3D __x_data[0]; > + for (size_t __ri =3D 1; __ri !=3D __i; __ri++) > + __res =3D __binary_op(__x_data[__ri], __res); > + return __res; > + } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp > + _S_reduce(simd<_Tp, _Abi> __x, plus<>) > + { > + return svaddv(__sve_vector_type<_Tp, > _S_size<_Tp>>::__sve_active_mask(), __x._M_data); + } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp > + _S_reduce(simd<_Tp, _Abi> __x, bit_and<>) > + { > + return svandv(__sve_vector_type<_Tp, > _S_size<_Tp>>::__sve_active_mask(), __x._M_data); + } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp > + _S_reduce(simd<_Tp, _Abi> __x, bit_or<>) > + { > + return svorv(__sve_vector_type<_Tp, _S_size<_Tp>>::__sve_active_mask= (), > __x._M_data); + } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp > + _S_reduce(simd<_Tp, _Abi> __x, bit_xor<>) > + { > + return sveorv(__sve_vector_type<_Tp, > _S_size<_Tp>>::__sve_active_mask(), __x._M_data); + } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp > + _S_reduce(simd<_Tp, _Abi> __x, __detail::_Maximum()) > + { > + return svmaxv(__sve_vector_type<_Tp, > _S_size<_Tp>>::__sve_active_mask(), __x._M_data); + } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp > + _S_reduce(simd<_Tp, _Abi> __x, __detail::_Minimum()) > + { > + return svminv(__sve_vector_type<_Tp, > _S_size<_Tp>>::__sve_active_mask(), __x._M_data); + } > + > + template > + _GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr > + __sve_vector_type_t<_Tp, _Np> > + 
_S_min(_SveSimdWrapper<_Tp, _Np> __a, _SveSimdWrapper<_Tp, _Np> __= b) > + { > + return svmin_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), > __a._M_data, __b._M_data); + } > + > + template > + _GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr > + __sve_vector_type_t<_Tp, _Np> > + _S_max(_SveSimdWrapper<_Tp, _Np> __a, _SveSimdWrapper<_Tp, _Np> __= b) > + { > + return svmax_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), > __a._M_data, __b._M_data); + } > + > + template > + _GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr > + pair<_SveSimdWrapper<_Tp, _Np>, _SveSimdWrapper<_Tp, _Np>> > + _S_minmax(_SveSimdWrapper<_Tp, _Np> __a, _SveSimdWrapper<_Tp, _Np> > __b) + { > + return { > + svmin_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(),=20 __a._M_data, > __b._M_data), + svmax_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(= ), > __a._M_data, __b._M_data) + }; > + } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp, > _Np> + _S_complement(_SveSimdWrapper<_Tp, _Np> __x) noexcept > + { > + if constexpr (is_floating_point_v<_Tp>) > + { > + using _Ip =3D __get_sve_value_type_t<__int_for_sizeof_t<_Tp>>; > + return __sve_reinterpret_cast<_Tp>( > + svnot_z(__sve_vector_type<_Tp,=20 _Np>::__sve_active_mask(), > + __sve_reinterpret_cast<_Ip>(__x))); > + } > + else > + return svnot_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), > __x._M_data); + } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static constexpr _SveSimdWrapper<_Tp, _Np> > + _S_unary_minus(_SveSimdWrapper<_Tp, _Np> __x) noexcept > + { > + return svmul_x(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), > __x._M_data, + static_cast<_Tp>(-1)); > + } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp, > _Np> + _S_plus(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, > _Np> __y) + { return __x._M_data + __y._M_data; } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp, > _Np> + _S_minus(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, > _Np> __y) + { return __x._M_data - __y._M_data; } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp, > _Np> + _S_multiplies(_SveSimdWrapper<_Tp, _Np> __x, > _SveSimdWrapper<_Tp, _Np> __y) + { return __x._M_data * __y._M_data;= } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp, > _Np> + _S_divides(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, > _Np> __y) + { > + __sve_vector_type_t<_Tp, _Np> __y_padded =3D > svsel(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), + = =20 > __y._M_data, __sve_vector_type<_Tp, _Np>::__sve_broadcast(1)); + = =20 > return __x._M_data / __y_padded; > + } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp, > _Np> + _S_modulus(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, > _Np> __y) + { > + __sve_vector_type_t<_Tp, _Np> __y_padded =3D > svsel(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), + = =20 > __y._M_data, __sve_vector_type<_Tp, _Np>::__sve_broadcast(1)); + = =20 > return __x._M_data % __y_padded; > + } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp, > _Np> + _S_bit_and(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, > _Np> __y) + { > + if constexpr (is_floating_point_v<_Tp>) > + { > + using _Ip =3D __get_sve_value_type_t<__int_for_sizeof_t<_Tp>>; > + return __sve_reinterpret_cast<_Tp>( > + svand_x(__sve_vector_type<_Tp,=20 _Np>::__sve_active_mask(), > 
+ __sve_reinterpret_cast<_Ip>(__x), > __sve_reinterpret_cast<_Ip>(__y))); + } > + else > + return svand_x(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), > + __x._M_data, __y._M_data); > + } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp, > _Np> + _S_bit_or(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, > _Np> __y) + { > + if constexpr (is_floating_point_v<_Tp>) > + { > + using _Ip =3D __get_sve_value_type_t<__int_for_sizeof_t<_Tp>>; > + return __sve_reinterpret_cast<_Tp>( > + svorr_x(__sve_vector_type<_Tp,=20 _Np>::__sve_active_mask(), > + __sve_reinterpret_cast<_Ip>(__x), > __sve_reinterpret_cast<_Ip>(__y))); + } > + else > + return svorr_x(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), > + __x._M_data, __y._M_data); > + } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp, > _Np> + _S_bit_xor(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, > _Np> __y) + { > + if constexpr (is_floating_point_v<_Tp>) > + { > + using _Ip =3D __get_sve_value_type_t<__int_for_sizeof_t<_Tp>>; > + return __sve_reinterpret_cast<_Tp>( > + sveor_x(__sve_vector_type<_Tp,=20 _Np>::__sve_active_mask(), > + __sve_reinterpret_cast<_Ip>(__x), > __sve_reinterpret_cast<_Ip>(__y))); + } > + else > + return sveor_x(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), > + __x._M_data, __y._M_data); > + } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static __sve_vector_type_t<_Tp, _Np> > + _S_bit_shift_left(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_= Tp, > _Np> __y) + { return __x._M_data << __y._M_data; } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static __sve_vector_type_t<_Tp, _Np> > + _S_bit_shift_right(_SveSimdWrapper<_Tp, _Np> __x, > _SveSimdWrapper<_Tp, _Np> __y) + { return __x._M_data >> __y._M_data; > } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp, > _Np> + _S_bit_shift_left(_SveSimdWrapper<_Tp, _Np> __x, int __y) > + { return __x._M_data << __y; } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp, > _Np> + _S_bit_shift_right(_SveSimdWrapper<_Tp, _Np> __x, int __y) > + { return __x._M_data >> __y; } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static constexpr void > + _S_increment(_SveSimdWrapper<_Tp, _Np>& __x) > + { __x =3D __x._M_data + 1; } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static constexpr void > + _S_decrement(_SveSimdWrapper<_Tp, _Np>& __x) > + { __x =3D __x._M_data - 1; } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> > + _S_equal_to(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _N= p> > __y) + { > + return svcmpeq(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), > __x._M_data, __y._M_data); + } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> > + _S_not_equal_to(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, > _Np> __y) + { > + return svcmpne(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), > __x._M_data, __y._M_data); + } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> > + _S_less(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> _= _y) > + { > + return svcmplt(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), > __x._M_data, __y._M_data); + } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> > + _S_less_equal(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, > _Np> __y) + { > + return svcmple(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), > __x._M_data, __y._M_data); + } > + > + // 
simd.math > +#define _GLIBCXX_SIMD_MATH_FALLBACK(__name) = =20 > \ + template typename... _More> \ + static > _SveSimdWrapper<_Tp, _Np> _S_##__name(const _SveSimdWrapper<_Tp, _Np>& __= x, > \ + const=20 _More&... __more) \ > + { = =20 > \ + _SveSimdWrapper<_Tp, _Np> __r; = =20 > \ > + __execute_n_times<_Np>([&](auto __i)=20 _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { > \ + __r._M_set(__i, __name(__x[__i], __more[__i]...));= =20 > \ + }); = =20 > \ + return __r; = =20 > = \ > + } > + > +#define _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(_RetTp, __name) = =20 > \ + template = =20 > \ + static auto > _S_##__name(const _Tp& __x, const _More&... __more) = =20 > \ + { = =20 > \ + return=20 __fixed_size_storage_t<_RetTp, > _Tp::_S_size>::_S_generate( \ + [&] (auto __meta) > _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \ + =09 =20 > return __meta._S_generator( = =20 > \ + [&](auto __i)=20 _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { =20 > \ + return __name(__x[__meta._S_offset +=20 __i], =20 > \ + =20 __more[__meta._S_offset + __i]...); =20 > \ + }, static_cast<_RetTp*>(nullptr)); = =20 > \ + }); = =20 > \ + } > + > + _GLIBCXX_SIMD_MATH_FALLBACK(acos) > + _GLIBCXX_SIMD_MATH_FALLBACK(asin) > + _GLIBCXX_SIMD_MATH_FALLBACK(atan) > + _GLIBCXX_SIMD_MATH_FALLBACK(atan2) > + _GLIBCXX_SIMD_MATH_FALLBACK(cos) > + _GLIBCXX_SIMD_MATH_FALLBACK(sin) > + _GLIBCXX_SIMD_MATH_FALLBACK(tan) > + _GLIBCXX_SIMD_MATH_FALLBACK(acosh) > + _GLIBCXX_SIMD_MATH_FALLBACK(asinh) > + _GLIBCXX_SIMD_MATH_FALLBACK(atanh) > + _GLIBCXX_SIMD_MATH_FALLBACK(cosh) > + _GLIBCXX_SIMD_MATH_FALLBACK(sinh) > + _GLIBCXX_SIMD_MATH_FALLBACK(tanh) > + _GLIBCXX_SIMD_MATH_FALLBACK(exp) > + _GLIBCXX_SIMD_MATH_FALLBACK(exp2) > + _GLIBCXX_SIMD_MATH_FALLBACK(expm1) > + _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(int, ilogb) > + _GLIBCXX_SIMD_MATH_FALLBACK(log) > + _GLIBCXX_SIMD_MATH_FALLBACK(log10) > + _GLIBCXX_SIMD_MATH_FALLBACK(log1p) > + _GLIBCXX_SIMD_MATH_FALLBACK(log2) > + _GLIBCXX_SIMD_MATH_FALLBACK(logb) > + > + // modf implemented in simd_math.h > + _GLIBCXX_SIMD_MATH_FALLBACK(scalbn) > + _GLIBCXX_SIMD_MATH_FALLBACK(scalbln) > + _GLIBCXX_SIMD_MATH_FALLBACK(cbrt) > + _GLIBCXX_SIMD_MATH_FALLBACK(pow) > + _GLIBCXX_SIMD_MATH_FALLBACK(erf) > + _GLIBCXX_SIMD_MATH_FALLBACK(erfc) > + _GLIBCXX_SIMD_MATH_FALLBACK(lgamma) > + _GLIBCXX_SIMD_MATH_FALLBACK(tgamma) > + > + _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long, lrint) > + _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long long, llrint) > + > + _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long, lround) > + _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long long, llround) > + > + _GLIBCXX_SIMD_MATH_FALLBACK(fmod) > + _GLIBCXX_SIMD_MATH_FALLBACK(remainder) > + > + template > + static _SveSimdWrapper<_Tp, _Np> > + _S_remquo(const _SveSimdWrapper<_Tp, _Np> __x, const > _SveSimdWrapper<_Tp, _Np> __y, + __fixed_size_storage_t*=20 __z) > + { > + _SveSimdWrapper<_Tp, _Np> __r{}; > + __execute_n_times<_Np>([&](auto __i)=20 _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { > + int __tmp; > + __r._M_set(__i, remquo(__x[__i], __y[__i], &__tmp)); > + __z->_M_set(__i, __tmp); > + }); > + return __r; > + } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static __fixed_size_storage_t > + _S_fpclassify(_SveSimdWrapper<_Tp, _Np> __x) > + { > + __fixed_size_storage_t __r{}; > + __execute_n_times<_Np>([&](auto __i)=20 _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { > + __r._M_set(__i, std::fpclassify(__x[__i])); > + }); > + return __r; > + } > + > + // copysign in simd_math.h > + _GLIBCXX_SIMD_MATH_FALLBACK(nextafter) > + _GLIBCXX_SIMD_MATH_FALLBACK(fdim) > + > +#undef _GLIBCXX_SIMD_MATH_FALLBACK > +#undef 
_GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET > + > + template > + static constexpr _MaskMember<_Tp> > + __fp_cmp(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> > __y, _Op __op) + { > + using _Ip =3D __get_sve_value_type_t<__int_for_sizeof_t<_Tp>>; > + using _VI =3D __sve_vector_type_t<_Ip, _Np>; > + using _WI =3D _SveSimdWrapper<_Ip, _Np>; > + const _WI __fmv =3D __sve_vector_type<_Ip, > _Np>::__sve_broadcast(__finite_max_v<_Ip>); + const _WI __zerov =3D > __sve_vector_type<_Ip, _Np>::__sve_broadcast(0); + const _WI __xn =3D > _VI(__sve_reinterpret_cast<_Ip>(__x)); > + const _WI __yn =3D _VI(__sve_reinterpret_cast<_Ip>(__y)); > + > + const _WI __xp > + =3D svsel(_S_less(__xn, __zerov), _S_unary_minus(_WI(_S_bit_and(__xn, > __fmv))), __xn); + const _WI __yp > + =3D svsel(_S_less(__yn, __zerov), _S_unary_minus(_WI(_S_bit_and(__yn, > __fmv))), __yn); + return svbic_z(__sve_vector_type<_Ip, > _Np>::__sve_active_mask(), __op(__xp, __yp)._M_data, + =20 > _SuperImpl::_S_isunordered(__x, __y)._M_data); > + } > + > + template > + static constexpr _MaskMember<_Tp> > + _S_isgreater(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _= Np> > __y) noexcept + { return __fp_cmp(__x, __y, [](auto __xp, auto __yp)= { > return _S_less(__yp, __xp); }); } + > + template > + static constexpr _MaskMember<_Tp> > + _S_isgreaterequal(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_= Tp, > _Np> __y) noexcept + { return __fp_cmp(__x, __y, [](auto __xp, auto > __yp) { return _S_less_equal(__yp, __xp); }); } + > + template > + static constexpr _MaskMember<_Tp> > + _S_isless(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> > __y) noexcept + { return __fp_cmp(__x, __y, [](auto __xp, auto __yp)= { > return _S_less(__xp, __yp); }); } + > + template > + static constexpr _MaskMember<_Tp> > + _S_islessequal(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, > _Np> __y) noexcept + { return __fp_cmp(__x, __y, [](auto __xp, auto > __yp) { return _S_less_equal(__xp, __yp); }); } + > + template > + static constexpr _MaskMember<_Tp> > + _S_islessgreater(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_T= p, > _Np> __y) noexcept + { > + return svbic_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), > + _SuperImpl::_S_not_equal_to(__x, __y)._M_data, > + _SuperImpl::_S_isunordered(__x, __y)._M_data); > + } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np> > + _S_abs(_SveSimdWrapper<_Tp, _Np> __x) noexcept > + { return svabs_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), > __x._M_data); } + > + template > + _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np> > + _S_fabs(_SveSimdWrapper<_Tp, _Np> __x) noexcept > + { return svabs_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), > __x._M_data); } + > + template > + _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np> > + _S_sqrt(_SveSimdWrapper<_Tp, _Np> __x) noexcept > + { return svsqrt_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), > __x._M_data); } + > + template > + _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np> > + _S_ldexp(_SveSimdWrapper<_Tp, _Np> __x, __fixed_size_storage_t _Np> __y) noexcept + { > + auto __sve_register =3D __y.first; > + if constexpr (std::is_same_v<_Tp, float>) > + return svscale_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), > __x._M_data, + __sve_register._M_data); > + else > + { > + __sve_vector_type_t __sve_d_register =3D > svunpklo(__sve_register); + return svscale_z(__sve_vector_type<_Tp, > _Np>::__sve_active_mask(), __x._M_data, + =20 __sve_d_register); > + } > + } > + > + 
template > + _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np> > + _S_fma(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __= y, > + _SveSimdWrapper<_Tp, _Np> __z) > + { > + return svmad_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), > __x._M_data, __y._M_data, + __z._M_data); > + } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np> > + _S_fmax(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> _= _y) > + { > + return svmaxnm_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), > __x._M_data, __y._M_data); + } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np> > + _S_fmin(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> _= _y) > + { > + return svminnm_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), > __x._M_data, __y._M_data); + } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> > + _S_isfinite([[maybe_unused]] _SveSimdWrapper<_Tp, _Np> __x) > + { > +#if __FINITE_MATH_ONLY__ > + return __sve_vector_type_t<_Tp, _Np>::__sve_all_true_mask(); > +#else > + // if all exponent bits are set, __x is either inf or NaN > + > + using _Ip =3D __get_sve_value_type_t<__int_for_sizeof_t<_Tp>>; > + const __sve_vector_type_t<_Ip, _Np> __absn =3D > __sve_reinterpret_cast<_Ip>(_S_abs(__x)); + const=20 __sve_vector_type_t<_Ip, > _Np> __maxn > + =3D __sve_reinterpret_cast<_Ip>( > + __sve_vector_type<_Tp,=20 _Np>::__sve_broadcast(__finite_max_v<_Tp>)); > + > + return _S_less_equal(_SveSimdWrapper<_Ip, _Np>{__absn}, > _SveSimdWrapper<_Ip, _Np>{__maxn}); +#endif > + } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> > + _S_isinf([[maybe_unused]] _SveSimdWrapper<_Tp, _Np> __x) > + { > +#if __FINITE_MATH_ONLY__ > + return {}; // false > +#else > + return _S_equal_to<_Tp, _Np>(_S_abs(__x), > _S_broadcast(__infinity_v<_Tp>)); +#endif > + } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> > + _S_isnan([[maybe_unused]] _SveSimdWrapper<_Tp, _Np> __x) > + { > +#if __FINITE_MATH_ONLY__ > + return {}; // false > +#else > + return svcmpuo(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), > __x._M_data, __x._M_data); +#endif > + } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> > + _S_isnormal(_SveSimdWrapper<_Tp, _Np> __x) > + { > + using _Ip =3D __get_sve_value_type_t<__int_for_sizeof_t<_Tp>>; > + using _V =3D __sve_vector_type_t<_Ip, _Np>; > + using _VW =3D _SveSimdWrapper<_Ip, _Np>; > + > + const _V __absn =3D __sve_reinterpret_cast<_Ip>(_S_abs(__x)); > + const _V __minn =3D __sve_reinterpret_cast<_Ip>( > + __sve_vector_type<_Tp,=20 _Np>::__sve_broadcast(__norm_min_v<_Tp>)); > +#if __FINITE_MATH_ONLY__ > + return _S_greater_equal(_VW{__absn}, _VW{__minn}); > +#else > + const _V __maxn =3D __sve_reinterpret_cast<_Ip>( > + __sve_vector_type<_Tp,=20 _Np>::__sve_broadcast(__finite_max_v<_Tp>)); > + return _MaskImpl::_S_bit_and(_S_less_equal(_VW{__minn}, _VW{__absn}), > + _S_less_equal(_VW{__absn},=20 _VW{__maxn})); > +#endif > + } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> > + _S_signbit(_SveSimdWrapper<_Tp, _Np> __x) > + { > + using _Ip =3D __get_sve_value_type_t<__int_for_sizeof_t<_Tp>>; > + using _V =3D __sve_vector_type_t<_Ip, _Np>; > + using _VW =3D _SveSimdWrapper<_Ip, _Np>; > + > + const _V __xn =3D __sve_reinterpret_cast<_Ip>(__x); > + const _V __zeron =3D __sve_vector_type<_Ip, _Np>::__sve_broadcast(0); > + return _S_less(_VW{__xn}, _VW{__zeron}); > + } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> 
> + _S_isunordered(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, > _Np> __y) + { > + return svcmpuo(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), > __x._M_data, __y._M_data); + } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np> > + _S_nearbyint(_SveSimdWrapper<_Tp, _Np> __x) noexcept > + { return svrinti_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(= ), > __x._M_data); } + > + template > + _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np> > + _S_rint(_SveSimdWrapper<_Tp, _Np> __x) noexcept > + { return _SuperImpl::_S_nearbyint(__x); } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np> > + _S_trunc(_SveSimdWrapper<_Tp, _Np> __x) noexcept > + { return svrintz_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(= ), > __x._M_data); } + > + template > + _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np> > + _S_round(_SveSimdWrapper<_Tp, _Np> __x) noexcept > + { return svrinta_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(= ), > __x._M_data); } + > + template > + _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np> > + _S_floor(_SveSimdWrapper<_Tp, _Np> __x) noexcept > + { return svrintm_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(= ), > __x._M_data); } + > + template > + _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np> > + _S_ceil(_SveSimdWrapper<_Tp, _Np> __x) noexcept > + { return svrintp_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(= ), > __x._M_data); } + > + template > + _GLIBCXX_SIMD_INTRINSIC static constexpr void > + _S_masked_assign(_SveMaskWrapper<_Bits, _Np> __k, > _SveSimdWrapper<_Tp, _Np>& __lhs, + =20 > __type_identity_t<_SveSimdWrapper<_Tp, _Np>> __rhs) > + { __lhs =3D _CommonImpl::_S_blend(__k, __lhs, __rhs); } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static constexpr void > + _S_masked_assign(_SveMaskWrapper<_Bits, _Np> __k, > _SveSimdWrapper<_Tp, _Np>& __lhs, + =20 __type_identity_t<_Tp> __rhs) > + { __lhs =3D _CommonImpl::_S_blend(__k, __lhs, __data(simd<_Tp, > _Abi>(__rhs))); } + > + template > + _GLIBCXX_SIMD_INTRINSIC static constexpr void > + _S_masked_cassign(const _SveMaskWrapper<_Bits, _Np> __k, > _SveSimdWrapper<_Tp, _Np>& __lhs, + const > __type_identity_t<_SveSimdWrapper<_Tp, _Np>> __rhs, _Op __op) + { > + __lhs =3D _CommonImpl::_S_blend(__k, __lhs, > + _SveSimdWrapper<_Tp,=20 _Np>(__op(_SuperImpl{}, __lhs, __rhs))); > + } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static constexpr void > + _S_masked_cassign(const _SveMaskWrapper<_Bits, _Np> __k, > _SveSimdWrapper<_Tp, _Np>& __lhs, + const=20 __type_identity_t<_Tp> __rhs, > _Op __op) > + { _S_masked_cassign(__k, __lhs, _S_broadcast(__rhs), __op); } > + > + template > + _GLIBCXX_SIMD_INTRINSIC static constexpr void > + _S_set(_SveSimdWrapper<_Tp, _Np>& __v, int __i, _Up&& __x) noexcept > + { __v._M_set(__i, static_cast<_Up&&>(__x)); } > + > + template