public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r14-1235] aarch64: Implement vector FP absolute compare intrinsics with builtins
@ 2023-05-25 8:49 Kyrylo Tkachov
0 siblings, 0 replies; only message in thread
From: Kyrylo Tkachov @ 2023-05-25 8:49 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:0d1e0d7433c2c625d67c58fca435b0ffeab8c8ba
commit r14-1235-g0d1e0d7433c2c625d67c58fca435b0ffeab8c8ba
Author: Kyrylo Tkachov <kyrylo.tkachov@arm.com>
Date: Thu May 25 09:48:33 2023 +0100
aarch64: Implement vector FP absolute compare intrinsics with builtins
While optimising some vector math library code with intrinsics we stumbled upon the issue in the testcase.
The compiler should be generating a FACGT instruction but instead we generate:
foo(__Float32x4_t, __Float32x4_t, __Float32x4_t):
fabs v0.4s, v0.4s
adrp x0, .LC0
ldr q31, [x0, #:lo12:.LC0]
fcmgt v0.4s, v0.4s, v31.4s
ret
This is because the vcagtq_f32 intrinsic is open-coded in arm_neon.h as
return vabsq_f32 (__a) > vabsq_f32 (__b)
thus relying on the optimisers to merge it back together. But since one of the arms of the comparison
is a vector constant the combine pass optimises the abs into it and tries matching:
(set (reg:V4SI 101)
(neg:V4SI (gt:V4SI (reg:V4SF 100)
(const_vector:V4SF [
(const_double:SF 1.0e+2 [0x0.c8p+7]) repeated x4
]))))
and
(set (reg:V4SI 101)
(neg:V4SI (gt:V4SI (abs:V4SF (reg:V4SF 104))
(reg:V4SF 103))))
instead of what we want:
(insn 13 9 14 2 (set (reg/i:V4SI 32 v0)
(neg:V4SI (gt:V4SI (abs:V4SF (reg:V4SF 98))
(abs:V4SF (reg:V4SF 96)))))
I don't really see a good way around that with our current implementation of these intrinsics.
Therefore this patch reimplements these intrinsics with aarch64 builtins that generate the RTL for these
instructions directly. Apparently we already had them defined in aarch64-simd-builtins.def and have been
using them for the fp16 case already.
I realise that this approach is against the general principle of expressing intrinsics in the higher-level constructs,
so I'm willing to listen to counter-arguments.
That said, the FACGT/FACGE instructions are as fast as the non-ABS comparison instructions on all microarchitectures that I know of
so it should always be a win to have them in the merged form rather than split the fabs step separately or try to hoist it.
And the testcase does come from real library code that we're trying to optimise.
With this patch for the testcase we generate:
foo:
adrp x0, .LC0
ldr q31, [x0, #:lo12:.LC0]
facgt v0.4s, v0.4s, v31.4s
ret
gcc/ChangeLog:
* config/aarch64/arm_neon.h (vcage_f64): Reimplement with builtins.
(vcage_f32): Likewise.
(vcages_f32): Likewise.
(vcageq_f32): Likewise.
(vcaged_f64): Likewise.
(vcageq_f64): Likewise.
(vcagts_f32): Likewise.
(vcagt_f32): Likewise.
(vcagt_f64): Likewise.
(vcagtq_f32): Likewise.
(vcagtd_f64): Likewise.
(vcagtq_f64): Likewise.
(vcale_f32): Likewise.
(vcale_f64): Likewise.
(vcaled_f64): Likewise.
(vcales_f32): Likewise.
(vcaleq_f32): Likewise.
(vcaleq_f64): Likewise.
(vcalt_f32): Likewise.
(vcalt_f64): Likewise.
(vcaltd_f64): Likewise.
(vcaltq_f32): Likewise.
(vcaltq_f64): Likewise.
(vcalts_f32): Likewise.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/simd/facgt_constpool_1.c: New test.
Diff:
---
gcc/config/aarch64/arm_neon.h | 48 +++++++++++-----------
.../gcc.target/aarch64/simd/facgt_constpool_1.c | 16 ++++++++
2 files changed, 40 insertions(+), 24 deletions(-)
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index eeec9f162e2..afe205cb83c 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -7550,42 +7550,42 @@ __extension__ extern __inline uint64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcage_f64 (float64x1_t __a, float64x1_t __b)
{
- return vabs_f64 (__a) >= vabs_f64 (__b);
+ return vcreate_u64 (__builtin_aarch64_facgedf_uss (__a[0], __b[0]));
}
__extension__ extern __inline uint32_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcages_f32 (float32_t __a, float32_t __b)
{
- return __builtin_fabsf (__a) >= __builtin_fabsf (__b) ? -1 : 0;
+ return __builtin_aarch64_facgesf_uss (__a, __b);
}
__extension__ extern __inline uint32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcage_f32 (float32x2_t __a, float32x2_t __b)
{
- return vabs_f32 (__a) >= vabs_f32 (__b);
+ return __builtin_aarch64_facgev2sf_uss (__a, __b);
}
__extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcageq_f32 (float32x4_t __a, float32x4_t __b)
{
- return vabsq_f32 (__a) >= vabsq_f32 (__b);
+ return __builtin_aarch64_facgev4sf_uss (__a, __b);
}
__extension__ extern __inline uint64_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcaged_f64 (float64_t __a, float64_t __b)
{
- return __builtin_fabs (__a) >= __builtin_fabs (__b) ? -1 : 0;
+ return __builtin_aarch64_facgedf_uss (__a, __b);
}
__extension__ extern __inline uint64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcageq_f64 (float64x2_t __a, float64x2_t __b)
{
- return vabsq_f64 (__a) >= vabsq_f64 (__b);
+ return __builtin_aarch64_facgev2df_uss (__a, __b);
}
/* vcagt */
@@ -7594,42 +7594,42 @@ __extension__ extern __inline uint32_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcagts_f32 (float32_t __a, float32_t __b)
{
- return __builtin_fabsf (__a) > __builtin_fabsf (__b) ? -1 : 0;
+ return __builtin_aarch64_facgtsf_uss (__a, __b);
}
__extension__ extern __inline uint32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcagt_f32 (float32x2_t __a, float32x2_t __b)
{
- return vabs_f32 (__a) > vabs_f32 (__b);
+ return __builtin_aarch64_facgtv2sf_uss (__a, __b);
}
__extension__ extern __inline uint64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcagt_f64 (float64x1_t __a, float64x1_t __b)
{
- return vabs_f64 (__a) > vabs_f64 (__b);
+ return vcreate_u64 (__builtin_aarch64_facgtdf_uss (__a[0], __b[0]));
}
__extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcagtq_f32 (float32x4_t __a, float32x4_t __b)
{
- return vabsq_f32 (__a) > vabsq_f32 (__b);
+ return __builtin_aarch64_facgtv4sf_uss (__a, __b);
}
__extension__ extern __inline uint64_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcagtd_f64 (float64_t __a, float64_t __b)
{
- return __builtin_fabs (__a) > __builtin_fabs (__b) ? -1 : 0;
+ return __builtin_aarch64_facgtdf_uss (__a, __b);
}
__extension__ extern __inline uint64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcagtq_f64 (float64x2_t __a, float64x2_t __b)
{
- return vabsq_f64 (__a) > vabsq_f64 (__b);
+ return __builtin_aarch64_facgtv2df_uss (__a, __b);
}
/* vcale */
@@ -7638,42 +7638,42 @@ __extension__ extern __inline uint32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcale_f32 (float32x2_t __a, float32x2_t __b)
{
- return vabs_f32 (__a) <= vabs_f32 (__b);
+ return __builtin_aarch64_faclev2sf_uss (__a, __b);
}
__extension__ extern __inline uint64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcale_f64 (float64x1_t __a, float64x1_t __b)
{
- return vabs_f64 (__a) <= vabs_f64 (__b);
+ return vcreate_u64 (__builtin_aarch64_facledf_uss (__a[0], __b[0]));
}
__extension__ extern __inline uint64_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcaled_f64 (float64_t __a, float64_t __b)
{
- return __builtin_fabs (__a) <= __builtin_fabs (__b) ? -1 : 0;
+ return __builtin_aarch64_facledf_uss (__a, __b);
}
__extension__ extern __inline uint32_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcales_f32 (float32_t __a, float32_t __b)
{
- return __builtin_fabsf (__a) <= __builtin_fabsf (__b) ? -1 : 0;
+ return __builtin_aarch64_faclesf_uss (__a, __b);
}
__extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcaleq_f32 (float32x4_t __a, float32x4_t __b)
{
- return vabsq_f32 (__a) <= vabsq_f32 (__b);
+ return __builtin_aarch64_faclev4sf_uss (__a, __b);
}
__extension__ extern __inline uint64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcaleq_f64 (float64x2_t __a, float64x2_t __b)
{
- return vabsq_f64 (__a) <= vabsq_f64 (__b);
+ return __builtin_aarch64_faclev2df_uss (__a, __b);
}
/* vcalt */
@@ -7682,42 +7682,42 @@ __extension__ extern __inline uint32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcalt_f32 (float32x2_t __a, float32x2_t __b)
{
- return vabs_f32 (__a) < vabs_f32 (__b);
+ return __builtin_aarch64_facltv2sf_uss (__a, __b);
}
__extension__ extern __inline uint64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcalt_f64 (float64x1_t __a, float64x1_t __b)
{
- return vabs_f64 (__a) < vabs_f64 (__b);
+ return vcreate_u64 (__builtin_aarch64_facltdf_uss (__a[0], __b[0]));
}
__extension__ extern __inline uint64_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcaltd_f64 (float64_t __a, float64_t __b)
{
- return __builtin_fabs (__a) < __builtin_fabs (__b) ? -1 : 0;
+ return __builtin_aarch64_facltdf_uss (__a, __b);
}
__extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcaltq_f32 (float32x4_t __a, float32x4_t __b)
{
- return vabsq_f32 (__a) < vabsq_f32 (__b);
+ return __builtin_aarch64_facltv4sf_uss (__a, __b);
}
__extension__ extern __inline uint64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcaltq_f64 (float64x2_t __a, float64x2_t __b)
{
- return vabsq_f64 (__a) < vabsq_f64 (__b);
+ return __builtin_aarch64_facltv2df_uss (__a, __b);
}
__extension__ extern __inline uint32_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcalts_f32 (float32_t __a, float32_t __b)
{
- return __builtin_fabsf (__a) < __builtin_fabsf (__b) ? -1 : 0;
+ return __builtin_aarch64_facltsf_uss (__a, __b);
}
/* vceq - vector. */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/facgt_constpool_1.c b/gcc/testsuite/gcc.target/aarch64/simd/facgt_constpool_1.c
new file mode 100644
index 00000000000..4ebfd1f4c75
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/facgt_constpool_1.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O" } */
+
+#include <arm_neon.h>
+
+uint32x4_t g (uint32x4_t, uint32x4_t);
+
+uint32x4_t
+foo (float32x4_t x, float32x4_t a, float32x4_t b)
+{
+ return vcagtq_f32 (x, (float32x4_t){ 100.0, 100.0, 100.0, 100.0});
+}
+
+/* { dg-final { scan-assembler-times {facgt\tv[0-9]+\.4s, v[0-9]+\.4s, v[0-9]+\.4s} 1 } } */
+/* { dg-final { scan-assembler-not {\tfcmgt\t} } } */
+
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2023-05-25 8:49 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-05-25 8:49 [gcc r14-1235] aarch64: Implement vector FP absolute compare intrinsics with builtins Kyrylo Tkachov
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).