public inbox for libstdc++-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r11-7083] libstdc++: Improve "find_first/last_set" for NEON
@ 2021-02-03 15:51 Jonathan Wakely
  0 siblings, 0 replies; only message in thread
From: Jonathan Wakely @ 2021-02-03 15:51 UTC (permalink / raw)
  To: gcc-cvs, libstdc++-cvs

https://gcc.gnu.org/g:598876574184e745defee4b36dc2408068b7a22e

commit r11-7083-g598876574184e745defee4b36dc2408068b7a22e
Author: yaozhongxiao <yaozhongxiao@linux.alibaba.com>
Date:   Wed Feb 3 15:49:30 2021 +0000

    libstdc++: Improve "find_first/last_set" for NEON
    
    The find_first_set and find_last_set methods are not optimal for NEON.
    They can be improved by synthesizing the reduction with a horizontal
    add (vaddv), which reduces the generated assembly code. In the
    following example, vaddvq_s16 generates 2 instructions, whereas the
    repeated vpadd calls generate 4 instructions:
    
     # vaddvq_s16
        vaddvq_s16(__asint);
        //  addv    h0, v1.8h
        //  smov    w1, v0.h[0]
     # vpaddq_s16
        vpaddq_s16(vpaddq_s16(vpaddq_s16(__asint, __zero), __zero), __zero)[0]
        // addp v1.8h,v1.8h,v2.8h
        // addp v1.8h,v1.8h,v2.8h
        // addp v1.8h,v1.8h,v2.8h
        // smov    w1, v1.h[0]
     #
    
    libstdc++-v3/ChangeLog:
    
            * include/experimental/bits/simd_neon.h: Replace repeated vpadd
            calls with a single vaddv for aarch64.

Diff:
---
 libstdc++-v3/include/experimental/bits/simd_neon.h | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/libstdc++-v3/include/experimental/bits/simd_neon.h b/libstdc++-v3/include/experimental/bits/simd_neon.h
index 8bb21169c8d..7f472e88649 100644
--- a/libstdc++-v3/include/experimental/bits/simd_neon.h
+++ b/libstdc++-v3/include/experimental/bits/simd_neon.h
@@ -311,8 +311,7 @@ struct _MaskImplNeonMixin
 		  });
 	      __asint &= __bitsel;
 #ifdef __aarch64__
-	      return vpaddq_s16(vpaddq_s16(vpaddq_s16(__asint, __zero), __zero),
-				__zero)[0];
+	      return vaddvq_s16(__asint);
 #else
 	      return vpadd_s16(
 		vpadd_s16(vpadd_s16(__lo64(__asint), __hi64(__asint)), __zero),
@@ -328,7 +327,7 @@ struct _MaskImplNeonMixin
 		  });
 	      __asint &= __bitsel;
 #ifdef __aarch64__
-	      return vpaddq_s32(vpaddq_s32(__asint, __zero), __zero)[0];
+	      return vaddvq_s32(__asint);
 #else
 	      return vpadd_s32(vpadd_s32(__lo64(__asint), __hi64(__asint)),
 			       __zero)[0];
@@ -351,8 +350,12 @@ struct _MaskImplNeonMixin
 		    return static_cast<_I>(__i < _Np ? 1 << __i : 0);
 		  });
 	      __asint &= __bitsel;
+#ifdef __aarch64__
+	      return vaddv_s8(__asint);
+#else
 	      return vpadd_s8(vpadd_s8(vpadd_s8(__asint, __zero), __zero),
 			      __zero)[0];
+#endif
 	    }
 	  else if constexpr (sizeof(_Tp) == 2)
 	    {
@@ -362,12 +365,20 @@ struct _MaskImplNeonMixin
 		    return static_cast<_I>(__i < _Np ? 1 << __i : 0);
 		  });
 	      __asint &= __bitsel;
+#ifdef __aarch64__
+	      return vaddv_s16(__asint);
+#else
 	      return vpadd_s16(vpadd_s16(__asint, __zero), __zero)[0];
+#endif
 	    }
 	  else if constexpr (sizeof(_Tp) == 4)
 	    {
 	      __asint &= __make_vector<_I>(0x1, 0x2);
+#ifdef __aarch64__
+	      return vaddv_s32(__asint);
+#else
 	      return vpadd_s32(__asint, __zero)[0];
+#endif
 	    }
 	  else
 	    __assert_unreachable<_Tp>();


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2021-02-03 15:51 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-02-03 15:51 [gcc r11-7083] libstdc++: Improve "find_first/last_set" for NEON Jonathan Wakely

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).