public inbox for gcc-patches@gcc.gnu.org
* [PATCH] aarch64: Reimplement vget_low* intrinsics
@ 2021-02-05  8:13 Kyrylo Tkachov
  2021-02-05  9:24 ` Richard Biener
From: Kyrylo Tkachov @ 2021-02-05  8:13 UTC (permalink / raw)
  To: gcc-patches


Hi all,

We can do better on the vget_low* intrinsics.
Currently they reinterpret their argument into a V2DI vector, extract the low "lane",
and reinterpret that back into the shorter vector.
This is functionally correct and generates a sequence of subregs and a vec_select that, by itself,
gets optimised away eventually.
However, it is a problem when we want to use the result in other SIMD operations:
the subreg-vec_select-subreg combo blocks many combine patterns.

This patch reimplements them to emit a proper low vec_select from the start.
It generates much cleaner RTL and allows for more aggressive combinations, particularly
with the patterns that Jonathan has been pushing lately.
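
As a concrete (hypothetical) example of the kind of user code that benefits, consider a
widening multiply of the low halves. With the old expansion, the subregs around the
vec_select stopped combine from merging the extracts into the widening-multiply pattern;
the single-instruction output in the comment is my expectation, not something this patch
adds a test for:

#include <arm_neon.h>

/* Both low-half extracts should now fold into the widening-multiply
   pattern, ideally leaving a single instruction:
     smull v0.4s, v0.4h, v1.4h  */
int32x4_t
mul_low (int16x8_t a, int16x8_t b)
{
  return vmull_s16 (vget_low_s16 (a), vget_low_s16 (b));
}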

Bootstrapped and tested on aarch64-none-linux-gnu and aarch64_be-none-elf.
Pushing to trunk.
Thanks,
Kyrill

gcc/ChangeLog:

	* config/aarch64/aarch64-simd-builtins.def (get_low): Define builtin.
	* config/aarch64/aarch64-simd.md (aarch64_get_low<mode>): Define.
	* config/aarch64/arm_neon.h (__GET_LOW): Delete.
	(vget_low_f16): Reimplement using new builtin.
	(vget_low_f32): Likewise.
	(vget_low_f64): Likewise.
	(vget_low_p8): Likewise.
	(vget_low_p16): Likewise.
	(vget_low_p64): Likewise.
	(vget_low_s8): Likewise.
	(vget_low_s16): Likewise.
	(vget_low_s32): Likewise.
	(vget_low_s64): Likewise.
	(vget_low_u8): Likewise.
	(vget_low_u16): Likewise.
	(vget_low_u32): Likewise.
	(vget_low_u64): Likewise.

[-- Attachment #2: get_low.patch --]

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 29a7bbc24a7370fc077ab6c66f3de551f6926b7e..66420cf4f4b84b210c2ba7a9919d49d012cfc59f 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -55,6 +55,9 @@
   BUILTIN_VS (UNOP, ctz, 2, NONE)
   BUILTIN_VB (UNOP, popcount, 2, NONE)
 
+  /* Implemented by aarch64_get_low<mode>.  */
+  BUILTIN_VQMOV (UNOP, get_low, 0, AUTO_FP)
+
   /* Implemented by aarch64_<sur>q<r>shl<mode>.  */
   BUILTIN_VSDQ_I (BINOP, sqshl, 0, NONE)
   BUILTIN_VSDQ_I (BINOP_UUS, uqshl, 0, NONE)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 60eeddcc946daac87869e11cf138d837c7f0ea6f..e730ff5f28e9c942ff083d905bb5ac2e9955e7e3 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -297,6 +297,17 @@ (define_expand "aarch64_get_half<mode>"
   "TARGET_SIMD"
 )
 
+(define_expand "aarch64_get_low<mode>"
+  [(match_operand:<VHALF> 0 "register_operand")
+   (match_operand:VQMOV 1 "register_operand")]
+  "TARGET_SIMD"
+  {
+    rtx lo = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, false);
+    emit_insn (gen_aarch64_get_half<mode> (operands[0], operands[1], lo));
+    DONE;
+  }
+)
+
 (define_insn_and_split "aarch64_simd_mov_from_<mode>low"
   [(set (match_operand:<VHALF> 0 "register_operand" "=w,?r")
         (vec_select:<VHALF>
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 2d776ef7ef4ed7fad166dd00c4b4eb8bcaf75fc8..67c7f2493893c22e571b6e9107f01fda72168399 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -6302,111 +6302,104 @@ vsetq_lane_u64 (uint64_t __elem, uint64x2_t __vec, const int __index)
   return __aarch64_vset_lane_any (__elem, __vec, __index);
 }
 
-#define __GET_LOW(__TYPE) \
-  uint64x2_t tmp = vreinterpretq_u64_##__TYPE (__a);  \
-  uint64x1_t lo = vcreate_u64 (vgetq_lane_u64 (tmp, 0));  \
-  return vreinterpret_##__TYPE##_u64 (lo);
-
 __extension__ extern __inline float16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_low_f16 (float16x8_t __a)
 {
-  __GET_LOW (f16);
+  return __builtin_aarch64_get_lowv8hf (__a);
 }
 
 __extension__ extern __inline float32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_low_f32 (float32x4_t __a)
 {
-  __GET_LOW (f32);
+  return __builtin_aarch64_get_lowv4sf (__a);
 }
 
 __extension__ extern __inline float64x1_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_low_f64 (float64x2_t __a)
 {
-  return (float64x1_t) {vgetq_lane_f64 (__a, 0)};
+  return (float64x1_t) {__builtin_aarch64_get_lowv2df (__a)};
 }
 
 __extension__ extern __inline poly8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_low_p8 (poly8x16_t __a)
 {
-  __GET_LOW (p8);
+  return (poly8x8_t) __builtin_aarch64_get_lowv16qi ((int8x16_t) __a);
 }
 
 __extension__ extern __inline poly16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_low_p16 (poly16x8_t __a)
 {
-  __GET_LOW (p16);
+  return (poly16x4_t) __builtin_aarch64_get_lowv8hi ((int16x8_t) __a);
 }
 
 __extension__ extern __inline poly64x1_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_low_p64 (poly64x2_t __a)
 {
-  __GET_LOW (p64);
+  return (poly64x1_t) __builtin_aarch64_get_lowv2di ((int64x2_t) __a);
 }
 
 __extension__ extern __inline int8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_low_s8 (int8x16_t __a)
 {
-  __GET_LOW (s8);
+  return __builtin_aarch64_get_lowv16qi (__a);
 }
 
 __extension__ extern __inline int16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_low_s16 (int16x8_t __a)
 {
-  __GET_LOW (s16);
+  return __builtin_aarch64_get_lowv8hi (__a);
 }
 
 __extension__ extern __inline int32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_low_s32 (int32x4_t __a)
 {
-  __GET_LOW (s32);
+  return __builtin_aarch64_get_lowv4si (__a);
 }
 
 __extension__ extern __inline int64x1_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_low_s64 (int64x2_t __a)
 {
-  __GET_LOW (s64);
+  return (int64x1_t) {__builtin_aarch64_get_lowv2di (__a)};
 }
 
 __extension__ extern __inline uint8x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_low_u8 (uint8x16_t __a)
 {
-  __GET_LOW (u8);
+  return (uint8x8_t) __builtin_aarch64_get_lowv16qi ((int8x16_t) __a);
 }
 
 __extension__ extern __inline uint16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_low_u16 (uint16x8_t __a)
 {
-  __GET_LOW (u16);
+  return (uint16x4_t) __builtin_aarch64_get_lowv8hi ((int16x8_t) __a);
 }
 
 __extension__ extern __inline uint32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_low_u32 (uint32x4_t __a)
 {
-  __GET_LOW (u32);
+  return (uint32x2_t) __builtin_aarch64_get_lowv4si ((int32x4_t) __a);
 }
 
 __extension__ extern __inline uint64x1_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_low_u64 (uint64x2_t __a)
 {
-  return vcreate_u64 (vgetq_lane_u64 (__a, 0));
+  return (uint64x1_t) {__builtin_aarch64_get_lowv2di ((int64x2_t) __a)};
 }
 
-#undef __GET_LOW
-
 #define __GET_HIGH(__TYPE)					\
   uint64x2_t tmp = vreinterpretq_u64_##__TYPE (__a);		\
   uint64x1_t hi = vcreate_u64 (vgetq_lane_u64 (tmp, 1));	\


* Re: [PATCH] aarch64: Reimplement vget_low* intrinsics
  2021-02-05  8:13 [PATCH] aarch64: Reimplement vget_low* intrinsics Kyrylo Tkachov
@ 2021-02-05  9:24 ` Richard Biener
  2021-02-05  9:46   ` Kyrylo Tkachov
From: Richard Biener @ 2021-02-05  9:24 UTC (permalink / raw)
  To: Kyrylo Tkachov; +Cc: gcc-patches

On Fri, Feb 5, 2021 at 9:59 AM Kyrylo Tkachov via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Hi all,
>
> We can do better on the vget_low* intrinsics.
> Currently they reinterpret their argument into a V2DI vector, extract the low "lane",
> and reinterpret that back into the shorter vector.
> This is functionally correct and generates a sequence of subregs and a vec_select that, by itself,
> gets optimised away eventually.
> However, it is a problem when we want to use the result in other SIMD operations:
> the subreg-vec_select-subreg combo blocks many combine patterns.
>
> This patch reimplements them to emit a proper low vec_select from the start.
> It generates much cleaner RTL and allows for more aggressive combinations, particularly
> with the patterns that Jonathan has been pushing lately.
>
> Bootstrapped and tested on aarch64-none-linux-gnu and aarch64_be-none-elf.
> Pushing to trunk.

Just to remind you folks that we're in stage4, which means fixes to regressions
(or wrong-code) only.  aarch64 is a primary target and you should provide a good
example of following the rules we set up for GCC development.

I'd expect _at least_ a short sentence on why you think this change is
absolutely required for GCC 11.

The change also comes with zero testcases and zero bug references.

Sorry for this particular change taking the fire; I just picked a random one
out of the non-regression change-storm I'm seeing for arm/aarch64 recently.

Thanks for your consideration,
Richard.

> Thanks,
> Kyrill
>
> gcc/ChangeLog:
>
>         * config/aarch64/aarch64-simd-builtins.def (get_low): Define builtin.
>         * config/aarch64/aarch64-simd.md (aarch64_get_low<mode>): Define.
>         * config/aarch64/arm_neon.h (__GET_LOW): Delete.
>         (vget_low_f16): Reimplement using new builtin.
>         (vget_low_f32): Likewise.
>         (vget_low_f64): Likewise.
>         (vget_low_p8): Likewise.
>         (vget_low_p16): Likewise.
>         (vget_low_p64): Likewise.
>         (vget_low_s8): Likewise.
>         (vget_low_s16): Likewise.
>         (vget_low_s32): Likewise.
>         (vget_low_s64): Likewise.
>         (vget_low_u8): Likewise.
>         (vget_low_u16): Likewise.
>         (vget_low_u32): Likewise.
>         (vget_low_u64): Likewise.


* RE: [PATCH] aarch64: Reimplement vget_low* intrinsics
  2021-02-05  9:24 ` Richard Biener
@ 2021-02-05  9:46   ` Kyrylo Tkachov
From: Kyrylo Tkachov @ 2021-02-05  9:46 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches

Hi Richard,

> -----Original Message-----
> From: Richard Biener <richard.guenther@gmail.com>
> Sent: 05 February 2021 09:25
> To: Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
> Cc: gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH] aarch64: Reimplement vget_low* intrinsics
> 
> On Fri, Feb 5, 2021 at 9:59 AM Kyrylo Tkachov via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> >
> > Hi all,
> >
> > We can do better on the vget_low* intrinsics.
> > Currently they reinterpret their argument into a V2DI vector, extract
> > the low "lane", and reinterpret that back into the shorter vector.
> > This is functionally correct and generates a sequence of subregs and a
> > vec_select that, by itself, gets optimised away eventually.
> > However, it is a problem when we want to use the result in other SIMD
> > operations: the subreg-vec_select-subreg combo blocks many combine patterns.
> >
> > This patch reimplements them to emit a proper low vec_select from the start.
> > It generates much cleaner RTL and allows for more aggressive combinations,
> > particularly with the patterns that Jonathan has been pushing lately.
> >
> > Bootstrapped and tested on aarch64-none-linux-gnu and aarch64_be-none-elf.
> > Pushing to trunk.
> 
> Just to remind you folks that we're in stage4, which means fixes to
> regressions (or wrong-code) only.  aarch64 is a primary target and you
> should provide a good example of following the rules we set up for GCC
> development.

Apologies for the stream of such patches this late in development.
Indeed it is quite late in the cycle, and I'll be more careful for the rest of stage4.

> 
> I'd expect _at least_ a short sentence on why you think this change is
> absolutely required for GCC 11.

It was mostly reports from users of really bad code generation with intrinsics, similar to PR94442.
The root cause for most of these is the implementation of the intrinsics with inline assembly, which can be fixed in a mostly-mechanical way, hence the similar-looking patches.
I appreciate, though, that this needs to be weighed against the stability requirements at this stage...
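
For context, the old-style arm_neon.h idiom being replaced across these patches looked
roughly like this; a generic sketch with a made-up intrinsic name, not a quote of any
specific one:

__extension__ extern __inline int16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vfoo_s16 (int16x8_t __a, int16x8_t __b)
{
  int16x8_t __result;
  /* An asm block like this is opaque to combine and the scheduler,
     which is what leads to the poor code generation being reported.  */
  __asm__ ("add %0.8h, %1.8h, %2.8h"
           : "=w" (__result)
           : "w" (__a), "w" (__b)
           : /* No clobbers */);
  return __result;
}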
> 
> The change also comes with zero testcases and zero bug references.

Indeed, I could have elaborated more. The recent changes have been targeted at the intrinsics in arm_neon.h.
We have quite a detailed testsuite at gcc.target/aarch64/advsimd-intrinsics that exercises them, which is why I felt confident pushing changes in that area at this stage.
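
For illustration, a testcase backing this particular change could look something like the
following sketch; the expected SADDL output in the scan-assembler directive is my
assumption about the improved code generation, and no such test is part of the patch:

/* { dg-do compile } */
/* { dg-options "-O2" } */

#include <arm_neon.h>

/* The two low-half extracts should combine into the widening add.  */
int32x4_t
foo (int16x8_t a, int16x8_t b)
{
  return vaddl_s16 (vget_low_s16 (a), vget_low_s16 (b));
}

/* { dg-final { scan-assembler {\tsaddl\tv[0-9]+\.4s, v[0-9]+\.4h, v[0-9]+\.4h} } } */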

> 
> Sorry for this particular change taking the fire; I just picked a random one
> out of the non-regression change-storm I'm seeing for arm/aarch64 recently.
> 

Thanks for keeping me honest.
Kyrill

> Thanks for your consideration,
> Richard.
> 
> > Thanks,
> > Kyrill
> >
> > gcc/ChangeLog:
> >
> >         * config/aarch64/aarch64-simd-builtins.def (get_low): Define builtin.
> >         * config/aarch64/aarch64-simd.md (aarch64_get_low<mode>): Define.
> >         * config/aarch64/arm_neon.h (__GET_LOW): Delete.
> >         (vget_low_f16): Reimplement using new builtin.
> >         (vget_low_f32): Likewise.
> >         (vget_low_f64): Likewise.
> >         (vget_low_p8): Likewise.
> >         (vget_low_p16): Likewise.
> >         (vget_low_p64): Likewise.
> >         (vget_low_s8): Likewise.
> >         (vget_low_s16): Likewise.
> >         (vget_low_s32): Likewise.
> >         (vget_low_s64): Likewise.
> >         (vget_low_u8): Likewise.
> >         (vget_low_u16): Likewise.
> >         (vget_low_u32): Likewise.
> >         (vget_low_u64): Likewise.
