public inbox for gcc-cvs@sourceware.org
* [gcc r14-1235] aarch64: Implement vector FP absolute compare intrinsics with builtins
@ 2023-05-25  8:49 Kyrylo Tkachov
From: Kyrylo Tkachov @ 2023-05-25  8:49 UTC
  To: gcc-cvs

https://gcc.gnu.org/g:0d1e0d7433c2c625d67c58fca435b0ffeab8c8ba

commit r14-1235-g0d1e0d7433c2c625d67c58fca435b0ffeab8c8ba
Author: Kyrylo Tkachov <kyrylo.tkachov@arm.com>
Date:   Thu May 25 09:48:33 2023 +0100

    aarch64: Implement vector FP absolute compare intrinsics with builtins
    
    While optimising some vector math library code with intrinsics, we stumbled upon the issue shown in the testcase.
    The compiler should generate a single FACGT instruction, but instead we generate:
    foo(__Float32x4_t, __Float32x4_t, __Float32x4_t):
            fabs    v0.4s, v0.4s
            adrp    x0, .LC0
            ldr     q31, [x0, #:lo12:.LC0]
            fcmgt   v0.4s, v0.4s, v31.4s
            ret
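
    A reproducer distilled from the library code (it is also the new testcase
    added by this patch) is:

      #include <arm_neon.h>

      uint32x4_t
      foo (float32x4_t x, float32x4_t a, float32x4_t b)
      {
        return vcagtq_f32 (x, (float32x4_t){ 100.0, 100.0, 100.0, 100.0});
      }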
    
    This is because the vcagtq_f32 intrinsic is open-coded in arm_neon.h as
    return vabsq_f32 (__a) > vabsq_f32 (__b)
    relying on the optimisers to merge it back together. But since one arm of the comparison
    is a vector constant, the combine pass folds the abs into the constant and ends up trying to match:
    (set (reg:V4SI 101)
        (neg:V4SI (gt:V4SI (reg:V4SF 100)
                (const_vector:V4SF [
                        (const_double:SF 1.0e+2 [0x0.c8p+7]) repeated x4
                    ]))))
    and
    (set (reg:V4SI 101)
        (neg:V4SI (gt:V4SI (abs:V4SF (reg:V4SF 104))
                (reg:V4SF 103))))
    
    instead of what we want:
    (insn 13 9 14 2 (set (reg/i:V4SI 32 v0)
            (neg:V4SI (gt:V4SI (abs:V4SF (reg:V4SF 98))
                    (abs:V4SF (reg:V4SF 96)))))
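
    Combine cannot recover the merged form here because the fac insn pattern requires
    an abs around both comparison operands, and one of them has already been folded away.
    For reference, the pattern in aarch64-simd.md looks roughly like the following
    (abridged from memory, so the details may differ):

      (define_insn "aarch64_fac<optab><mode>"
        [(set (match_operand:<V_INT_EQUIV> 0 "register_operand" "=w")
              (neg:<V_INT_EQUIV>
                (FAC_COMPARISONS:<V_INT_EQUIV>
                  (abs:VHSDF_HSDF (match_operand:VHSDF_HSDF 1 "register_operand" "w"))
                  (abs:VHSDF_HSDF (match_operand:VHSDF_HSDF 2 "register_operand" "w")))))]
        "TARGET_SIMD"
        "fac<optab>\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>"
      )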
    
    I don't really see a good way around that with our current implementation of these intrinsics.
    Therefore this patch reimplements these intrinsics with aarch64 builtins that generate the RTL for these
    instructions directly. As it turns out, we already had these builtins defined in
    aarch64-simd-builtins.def and have been using them for the fp16 case.
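
    For reference, the existing entries in aarch64-simd-builtins.def look roughly like this
    (quoted from memory, so the details may differ):

      BUILTIN_VHSDF_HSDF (BINOP_USS, facge, 0, FP)
      BUILTIN_VHSDF_HSDF (BINOP_USS, facgt, 0, FP)
      BUILTIN_VHSDF_HSDF (BINOP_USS, facle, 0, FP)
      BUILTIN_VHSDF_HSDF (BINOP_USS, faclt, 0, FP)
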
    I realise that this approach goes against the general principle of expressing intrinsics through higher-level constructs,
    so I'm willing to listen to counter-arguments.
    That said, the FACGT/FACGE instructions are as fast as the non-ABS comparison instructions on all microarchitectures that I know of,
    so it should always be a win to have them in the merged form rather than splitting the fabs step out separately or trying to hoist it.
    And the testcase does come from real library code that we're trying to optimise.
    With this patch, for the testcase we generate:
    foo:
            adrp    x0, .LC0
            ldr     q31, [x0, #:lo12:.LC0]
            facgt   v0.4s, v0.4s, v31.4s
            ret
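
    The semantics are unchanged: each intrinsic still performs a lane-wise absolute-value
    compare that sets each result lane to all ones or all zeros, i.e. for the 4s case
    something like the following illustrative scalar sketch (a, b, res are placeholder
    names, not code from the patch):

      for (int i = 0; i < 4; i++)
        res[i] = __builtin_fabsf (a[i]) > __builtin_fabsf (b[i]) ? 0xffffffff : 0;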
    
    gcc/ChangeLog:
    
            * config/aarch64/arm_neon.h (vcage_f64): Reimplement with builtins.
            (vcage_f32): Likewise.
            (vcages_f32): Likewise.
            (vcageq_f32): Likewise.
            (vcaged_f64): Likewise.
            (vcageq_f64): Likewise.
            (vcagts_f32): Likewise.
            (vcagt_f32): Likewise.
            (vcagt_f64): Likewise.
            (vcagtq_f32): Likewise.
            (vcagtd_f64): Likewise.
            (vcagtq_f64): Likewise.
            (vcale_f32): Likewise.
            (vcale_f64): Likewise.
            (vcaled_f64): Likewise.
            (vcales_f32): Likewise.
            (vcaleq_f32): Likewise.
            (vcaleq_f64): Likewise.
            (vcalt_f32): Likewise.
            (vcalt_f64): Likewise.
            (vcaltd_f64): Likewise.
            (vcaltq_f32): Likewise.
            (vcaltq_f64): Likewise.
            (vcalts_f32): Likewise.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.target/aarch64/simd/facgt_constpool_1.c: New test.

Diff:
---
 gcc/config/aarch64/arm_neon.h                      | 48 +++++++++++-----------
 .../gcc.target/aarch64/simd/facgt_constpool_1.c    | 16 ++++++++
 2 files changed, 40 insertions(+), 24 deletions(-)

diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index eeec9f162e2..afe205cb83c 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -7550,42 +7550,42 @@ __extension__ extern __inline uint64x1_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcage_f64 (float64x1_t __a, float64x1_t __b)
 {
-  return vabs_f64 (__a) >= vabs_f64 (__b);
+  return vcreate_u64 (__builtin_aarch64_facgedf_uss (__a[0], __b[0]));
 }
 
 __extension__ extern __inline uint32_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcages_f32 (float32_t __a, float32_t __b)
 {
-  return __builtin_fabsf (__a) >= __builtin_fabsf (__b) ? -1 : 0;
+  return __builtin_aarch64_facgesf_uss  (__a, __b);
 }
 
 __extension__ extern __inline uint32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcage_f32 (float32x2_t __a, float32x2_t __b)
 {
-  return vabs_f32 (__a) >= vabs_f32 (__b);
+  return __builtin_aarch64_facgev2sf_uss (__a, __b);
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcageq_f32 (float32x4_t __a, float32x4_t __b)
 {
-  return vabsq_f32 (__a) >= vabsq_f32 (__b);
+  return __builtin_aarch64_facgev4sf_uss (__a, __b);
 }
 
 __extension__ extern __inline uint64_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcaged_f64 (float64_t __a, float64_t __b)
 {
-  return __builtin_fabs (__a) >= __builtin_fabs (__b) ? -1 : 0;
+  return __builtin_aarch64_facgedf_uss (__a, __b);
 }
 
 __extension__ extern __inline uint64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcageq_f64 (float64x2_t __a, float64x2_t __b)
 {
-  return vabsq_f64 (__a) >= vabsq_f64 (__b);
+  return __builtin_aarch64_facgev2df_uss (__a, __b);
 }
 
 /* vcagt  */
@@ -7594,42 +7594,42 @@ __extension__ extern __inline uint32_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcagts_f32 (float32_t __a, float32_t __b)
 {
-  return __builtin_fabsf (__a) > __builtin_fabsf (__b) ? -1 : 0;
+  return __builtin_aarch64_facgtsf_uss (__a, __b);
 }
 
 __extension__ extern __inline uint32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcagt_f32 (float32x2_t __a, float32x2_t __b)
 {
-  return vabs_f32 (__a) > vabs_f32 (__b);
+  return __builtin_aarch64_facgtv2sf_uss (__a, __b);
 }
 
 __extension__ extern __inline uint64x1_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcagt_f64 (float64x1_t __a, float64x1_t __b)
 {
-  return vabs_f64 (__a) > vabs_f64 (__b);
+  return vcreate_u64 (__builtin_aarch64_facgtdf_uss (__a[0], __b[0]));
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcagtq_f32 (float32x4_t __a, float32x4_t __b)
 {
-  return vabsq_f32 (__a) > vabsq_f32 (__b);
+  return __builtin_aarch64_facgtv4sf_uss (__a, __b);
 }
 
 __extension__ extern __inline uint64_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcagtd_f64 (float64_t __a, float64_t __b)
 {
-  return __builtin_fabs (__a) > __builtin_fabs (__b) ? -1 : 0;
+  return __builtin_aarch64_facgtdf_uss (__a, __b);
 }
 
 __extension__ extern __inline uint64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcagtq_f64 (float64x2_t __a, float64x2_t __b)
 {
-  return vabsq_f64 (__a) > vabsq_f64 (__b);
+  return __builtin_aarch64_facgtv2df_uss (__a, __b);
 }
 
 /* vcale  */
@@ -7638,42 +7638,42 @@ __extension__ extern __inline uint32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcale_f32 (float32x2_t __a, float32x2_t __b)
 {
-  return vabs_f32 (__a) <= vabs_f32 (__b);
+  return __builtin_aarch64_faclev2sf_uss (__a, __b);
 }
 
 __extension__ extern __inline uint64x1_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcale_f64 (float64x1_t __a, float64x1_t __b)
 {
-  return vabs_f64 (__a) <= vabs_f64 (__b);
+  return vcreate_u64 (__builtin_aarch64_facledf_uss (__a[0], __b[0]));
 }
 
 __extension__ extern __inline uint64_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcaled_f64 (float64_t __a, float64_t __b)
 {
-  return __builtin_fabs (__a) <= __builtin_fabs (__b) ? -1 : 0;
+  return __builtin_aarch64_facledf_uss (__a, __b);
 }
 
 __extension__ extern __inline uint32_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcales_f32 (float32_t __a, float32_t __b)
 {
-  return __builtin_fabsf (__a) <= __builtin_fabsf (__b) ? -1 : 0;
+  return __builtin_aarch64_faclesf_uss (__a, __b);
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcaleq_f32 (float32x4_t __a, float32x4_t __b)
 {
-  return vabsq_f32 (__a) <= vabsq_f32 (__b);
+  return __builtin_aarch64_faclev4sf_uss (__a, __b);
 }
 
 __extension__ extern __inline uint64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcaleq_f64 (float64x2_t __a, float64x2_t __b)
 {
-  return vabsq_f64 (__a) <= vabsq_f64 (__b);
+  return __builtin_aarch64_faclev2df_uss (__a, __b);
 }
 
 /* vcalt  */
@@ -7682,42 +7682,42 @@ __extension__ extern __inline uint32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcalt_f32 (float32x2_t __a, float32x2_t __b)
 {
-  return vabs_f32 (__a) < vabs_f32 (__b);
+  return __builtin_aarch64_facltv2sf_uss (__a, __b);
 }
 
 __extension__ extern __inline uint64x1_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcalt_f64 (float64x1_t __a, float64x1_t __b)
 {
-  return vabs_f64 (__a) < vabs_f64 (__b);
+  return vcreate_u64 (__builtin_aarch64_facltdf_uss (__a[0], __b[0]));
 }
 
 __extension__ extern __inline uint64_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcaltd_f64 (float64_t __a, float64_t __b)
 {
-  return __builtin_fabs (__a) < __builtin_fabs (__b) ? -1 : 0;
+  return __builtin_aarch64_facltdf_uss (__a, __b);
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcaltq_f32 (float32x4_t __a, float32x4_t __b)
 {
-  return vabsq_f32 (__a) < vabsq_f32 (__b);
+  return __builtin_aarch64_facltv4sf_uss (__a, __b);
 }
 
 __extension__ extern __inline uint64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcaltq_f64 (float64x2_t __a, float64x2_t __b)
 {
-  return vabsq_f64 (__a) < vabsq_f64 (__b);
+  return __builtin_aarch64_facltv2df_uss (__a, __b);
 }
 
 __extension__ extern __inline uint32_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcalts_f32 (float32_t __a, float32_t __b)
 {
-  return __builtin_fabsf (__a) < __builtin_fabsf (__b) ? -1 : 0;
+  return __builtin_aarch64_facltsf_uss (__a, __b);
 }
 
 /* vceq - vector.  */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/facgt_constpool_1.c b/gcc/testsuite/gcc.target/aarch64/simd/facgt_constpool_1.c
new file mode 100644
index 00000000000..4ebfd1f4c75
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/facgt_constpool_1.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O" } */
+
+#include <arm_neon.h>
+
+uint32x4_t g (uint32x4_t, uint32x4_t);
+
+uint32x4_t
+foo (float32x4_t x, float32x4_t a, float32x4_t b)
+{
+  return vcagtq_f32 (x, (float32x4_t){ 100.0, 100.0, 100.0, 100.0});
+}
+
+/* { dg-final { scan-assembler-times {facgt\tv[0-9]+\.4s, v[0-9]+\.4s, v[0-9]+\.4s} 1 } } */
+/* { dg-final { scan-assembler-not {\tfcmgt\t} } } */
+
