[PATCH][committed] aarch64: Improve representation of ADDLV instructions

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

* [PATCH][committed] aarch64: Improve representation of ADDLV instructions
@ 2023-06-06  8:54 Kyrylo Tkachov
  0 siblings, 0 replies; only message in thread
From: Kyrylo Tkachov @ 2023-06-06  8:54 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 2107 bytes --]

Hi all,

We've received requests to optimise the attached intrinsics testcase.
We currently generate:
foo_1:
        uaddlp  v0.4s, v0.8h
        uaddlv  d31, v0.4s
        fmov    x0, d31
        ret
foo_2:
        uaddlp  v0.4s, v0.8h
        addv    s31, v0.4s
        fmov    w0, s31
        ret
foo_3:
        saddlp  v0.4s, v0.8h
        addv    s31, v0.4s
        fmov    w0, s31
        ret

The widening pair-wise addition addlp instructions can be omitted if we're just doing an ADDV afterwards.
Making this optimisation would be quite simple if we had a standard RTL PLUS vector reduction code.
As we don't, we can use UNSPEC_ADDV as a stand in.
This patch expresses the SADDLV and UADDLV instructions as an UNSPEC_ADDV over a widened input, thus removing
the need for separate UNSPEC_SADDLV and UNSPEC_UADDLV codes.
To optimise the testcases involved we add two splitters that match a vector addition where all participating elements
are taken and widened from the same vector and then fed into an UNSPEC_ADDV. In that case we can just remove the
vector PLUS and just emit the simple RTL for SADDLV/UADDLV.

Bootstrapped and tested on aarch64-none-linux-gnu.
Pushing to trunk.

Thanks,
Kyrill

gcc/ChangeLog:

	* config/aarch64/aarch64-protos.h (aarch64_parallel_select_half_p):
	Define prototype.
	(aarch64_pars_overlap_p): Likewise.
	* config/aarch64/aarch64-simd.md (aarch64_<su>addlv<mode>):
	Express in terms of UNSPEC_ADDV.
	(*aarch64_<su>addlv<VDQV_L:mode>_ze<GPI:mode>): Likewise.
	(*aarch64_<su>addlv<mode>_reduction): Define.
	(*aarch64_uaddlv<mode>_reduction_2): Likewise.
	* config/aarch64/aarch64.cc	(aarch64_parallel_select_half_p): Define.
	(aarch64_pars_overlap_p): Likewise.
	* config/aarch64/iterators.md (UNSPEC_SADDLV, UNSPEC_UADDLV): Delete.
	(VQUADW): New mode attribute.
	(VWIDE2X_S): Likewise.
	(USADDLV): Delete.
	(su): Delete handling of UNSPEC_SADDLV, UNSPEC_UADDLV.
	* config/aarch64/predicates.md (vect_par_cnst_select_half): Define.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/simd/addlv_1.c: New test.

[-- Attachment #2: addlv2.patch --]
[-- Type: application/octet-stream, Size: 10062 bytes --]

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index a0642df26dbc83737c5c765d123e39d08357234f..a20a20ce15f77c58a25b8badd73b53e50bb5be8d 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -819,6 +819,8 @@ bool aarch64_regno_ok_for_index_p (int, bool);
 bool aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *fail);
 bool aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
 					    bool high);
+bool aarch64_parallel_select_half_p (machine_mode, rtx);
+bool aarch64_pars_overlap_p (rtx, rtx);
 bool aarch64_simd_scalar_immediate_valid_for_move (rtx, scalar_int_mode);
 bool aarch64_simd_shift_imm_p (rtx, machine_mode, bool);
 bool aarch64_sve_ptrue_svpattern_p (rtx, struct simd_immediate_info *);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 9307a573cefdd16b0b54af10c72b948877ab74d7..5cfb5aebcc73b392461e4309dc462887476e0232 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3665,15 +3665,73 @@ (define_expand "reduc_plus_scal_v4sf"
   DONE;
 })
 
+;; SADDLV and UADDLV can be expressed as an ADDV instruction that first
+;; sign or zero-extends its elements.
 (define_insn "aarch64_<su>addlv<mode>"
  [(set (match_operand:<VWIDE_S> 0 "register_operand" "=w")
-       (unspec:<VWIDE_S> [(match_operand:VDQV_L 1 "register_operand" "w")]
-		    USADDLV))]
+       (unspec:<VWIDE_S>
+	 [(ANY_EXTEND:<V2XWIDE>
+	    (match_operand:VDQV_L 1 "register_operand" "w"))]
+	 UNSPEC_ADDV))]
  "TARGET_SIMD"
  "<su>addl<vp>\\t%<Vwstype>0<Vwsuf>, %1.<Vtype>"
   [(set_attr "type" "neon_reduc_add<q>")]
 )
 
+;; An ADDV over a vector PLUS of elements extracted and widened all from the
+;; same vector is the same as an [SU]ADDLV above, so long as all the elements
+;; of that vector are used.  We can greatly simplify the RTL expression using
+;; this splitter.
+(define_insn_and_split "*aarch64_<su>addlv<mode>_reduction"
+ [(set (match_operand:<VWIDE_S> 0 "register_operand")
+       (unspec:<VWIDE_S>
+	 [(plus:<VDBLW>
+	    (vec_select:<VDBLW>
+	      (ANY_EXTEND:<V2XWIDE>
+		(match_operand:VDQV_L 1 "register_operand"))
+	      (match_operand:<V2XWIDE> 2 "vect_par_cnst_select_half"))
+	    (vec_select:<VDBLW> (ANY_EXTEND:<V2XWIDE> (match_dup 1))
+	      (match_operand:<V2XWIDE> 3 "vect_par_cnst_select_half")))]
+	 UNSPEC_ADDV))]
+ "TARGET_SIMD && !aarch64_pars_overlap_p (operands[2], operands[3])"
+ "#"
+ "&& 1"
+  [(set (match_dup 0)
+       (unspec:<VWIDE_S>
+	 [(ANY_EXTEND:<V2XWIDE>
+	    (match_dup 1))]
+	 UNSPEC_ADDV))]
+  {}
+)
+
+;; Similar to the above but for two-step zero-widening reductions.
+;; We can push the outer zero_extend outside the ADDV unspec and make
+;; use of the implicit high-part zeroing semantics of UADDLV to do it all
+;; in a single instruction.
+(define_insn_and_split "*aarch64_uaddlv<mode>_reduction_2"
+ [(set (match_operand:<VWIDE2X_S> 0 "register_operand" "=w")
+       (unspec:<VWIDE2X_S>
+	 [(zero_extend:<VQUADW>
+	    (plus:<VDBLW>
+	      (vec_select:<VDBLW>
+		(zero_extend:<V2XWIDE>
+		  (match_operand:VDQQH 1 "register_operand" "w"))
+		(match_operand:<V2XWIDE> 2 "vect_par_cnst_select_half"))
+	      (vec_select:<VDBLW> (zero_extend:<V2XWIDE> (match_dup 1))
+		(match_operand:<V2XWIDE> 3 "vect_par_cnst_select_half"))))]
+	 UNSPEC_ADDV))]
+ "TARGET_SIMD && !aarch64_pars_overlap_p (operands[2], operands[3])"
+ "#"
+ "&& 1"
+  [(set (match_dup 0)
+	(zero_extend:<VWIDE2X_S>
+	  (unspec:<VWIDE_S>
+	    [(zero_extend:<V2XWIDE>
+	       (match_dup 1))]
+	    UNSPEC_ADDV)))]
+  {}
+)
+
 ;; Zero-extending version of the above.  As these intrinsics produce a scalar
 ;; value that may be used by further intrinsics we want to avoid moving the
 ;; result into GP regs to do a zero-extension that ADDLV/ADDLP gives for free.
@@ -3681,9 +3739,10 @@ (define_insn "aarch64_<su>addlv<mode>"
 (define_insn "*aarch64_<su>addlv<VDQV_L:mode>_ze<GPI:mode>"
  [(set (match_operand:GPI 0 "register_operand" "=w")
        (zero_extend:GPI
-	(unspec:<VWIDE_S>
-	  [(match_operand:VDQV_L 1 "register_operand" "w")]
-	    USADDLV)))]
+	 (unspec:<VWIDE_S>
+	   [(ANY_EXTEND:<VDQV_L:V2XWIDE>
+	      (match_operand:VDQV_L 1 "register_operand" "w"))]
+	 UNSPEC_ADDV)))]
  "TARGET_SIMD
   && (GET_MODE_SIZE (<GPI:MODE>mode) > GET_MODE_SIZE (<VWIDE_S>mode))"
  "<su>addl<VDQV_L:vp>\\t%<VDQV_L:Vwstype>0<VDQV_L:Vwsuf>, %1.<VDQV_L:Vtype>"
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 44935e80565face84be4d3879c9c25f0dc3817ba..1f1f27e197dfeac67e75815350b6c825c1d9b55d 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -27717,6 +27717,50 @@ aarch64_adjust_reg_alloc_order ()
       reg_alloc_order[i] = i;
 }
 
+/* Return true if the PARALLEL PAR can be used in a VEC_SELECT expression
+   of vector mode MODE to select half the elements of that vector.
+   Allow any combination of indices except duplicates (or out of range of
+   the mode units).  */
+
+bool
+aarch64_parallel_select_half_p (machine_mode mode, rtx par)
+{
+  int nunits = XVECLEN (par, 0);
+  if (!known_eq (GET_MODE_NUNITS (mode), nunits * 2))
+    return false;
+  int mode_nunits = nunits * 2;
+  /* Put all the elements of PAR into a hash_set and use its
+     uniqueness guarantees to check that we don't try to insert the same
+     element twice.  */
+  hash_set<rtx> parset;
+  for (int i = 0; i < nunits; ++i)
+    {
+      rtx elt = XVECEXP (par, 0, i);
+      if (!CONST_INT_P (elt)
+	  || !IN_RANGE (INTVAL (elt), 0, mode_nunits - 1)
+	  || parset.add (elt))
+	return false;
+    }
+  return true;
+}
+
+/* Return true if PAR1 and PAR2, two PARALLEL rtxes of CONST_INT values,
+   contain any common elements.  */
+
+bool
+aarch64_pars_overlap_p (rtx par1, rtx par2)
+{
+  int len1 = XVECLEN (par1, 0);
+  int len2 = XVECLEN (par2, 0);
+  hash_set<rtx> parset;
+  for (int i = 0; i < len1; ++i)
+    parset.add (XVECEXP (par1, 0, i));
+  for (int i = 0; i < len2; ++i)
+    if (parset.contains (XVECEXP (par2, 0, i)))
+      return true;
+  return false;
+}
+
 /* Target-specific selftests.  */
 
 #if CHECKING_P
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index d9c7354730ac5870c0042f1e30fb1140a117d110..9e1e17bc1b9c9e2fa488f3962841be8075673db6 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -622,8 +622,6 @@ (define_c_enum "unspec"
     UNSPEC_FMINV	; Used in aarch64-simd.md.
     UNSPEC_FADDV	; Used in aarch64-simd.md.
     UNSPEC_ADDV		; Used in aarch64-simd.md.
-    UNSPEC_SADDLV	; Used in aarch64-simd.md.
-    UNSPEC_UADDLV	; Used in aarch64-simd.md.
     UNSPEC_SMAXV	; Used in aarch64-simd.md.
     UNSPEC_SMINV	; Used in aarch64-simd.md.
     UNSPEC_UMAXV	; Used in aarch64-simd.md.
@@ -1482,6 +1480,9 @@ (define_mode_attr VDBLW [(V8QI "V4HI") (V16QI "V8HI")
                   (V4HI "V2SI") (V8HI "V4SI")
                   (V2SI "DI")   (V4SI "V2DI")])
 
+(define_mode_attr VQUADW [(V8QI "V4SI") (V16QI "V8SI")
+                  (V4HI "V2DI") (V8HI "V4DI")])
+
 ;; Narrowed modes for VDN.
 (define_mode_attr VNARROWD [(V4HI "V8QI") (V2SI "V4HI")
 			    (DI   "V2SI")])
@@ -1563,6 +1564,9 @@ (define_mode_attr VWIDE_S [(V8QI "HI") (V4HI "SI")
 			  (V2SI "DI") (V16QI "HI")
 			  (V8HI "SI") (V4SI "DI")])
 
+(define_mode_attr VWIDE2X_S [(V8QI "SI") (V4HI "DI")
+			  (V16QI "SI") (V8HI "DI")])
+
 ;; Widened mode with half the element register suffixes for VD_BHSI/VQW/VQ_HSF.
 (define_mode_attr Vwhalf [(V8QI "4h") (V4HI "2s")
 			  (V2SI "1d") (V16QI "8h")
@@ -2589,8 +2593,6 @@ (define_int_iterator FMAXMINNMV [UNSPEC_FMAXNMV UNSPEC_FMINNMV])
 
 (define_int_iterator SVE_INT_ADDV [UNSPEC_SADDV UNSPEC_UADDV])
 
-(define_int_iterator USADDLV [UNSPEC_SADDLV UNSPEC_UADDLV])
-
 (define_int_iterator LOGICALF [UNSPEC_ANDF UNSPEC_IORF UNSPEC_XORF])
 
 (define_int_iterator HADDSUB [UNSPEC_SHADD UNSPEC_UHADD
@@ -3332,8 +3334,6 @@ (define_int_attr last_op [(UNSPEC_CLASTA "after_last")
 ;; "s" for signed operations and "u" for unsigned ones.
 (define_int_attr su [(UNSPEC_SADDV "s")
 		     (UNSPEC_UADDV "u")
-		     (UNSPEC_SADDLV "s")
-		     (UNSPEC_UADDLV "u")
 		     (UNSPEC_UNPACKSHI "s")
 		     (UNSPEC_UNPACKUHI "u")
 		     (UNSPEC_UNPACKSLO "s")
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 714efbbd86b29c7d541e7eac71c4dc24e4ddde4a..b476f2a54039aa1bad7acad6ea2fe81e291f8da3 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -475,6 +475,15 @@ (define_special_predicate "vect_par_cnst_even_or_odd_half"
 	 && aarch64_stepped_int_parallel_p (op, 2);
 })
 
+;; PARALLEL for a vec_select that selects half the elements in a vector of
+;; MODE.  Allows any combination of elements, as long as there's no
+;; duplicate entries.
+(define_special_predicate "vect_par_cnst_select_half"
+  (match_code "parallel")
+{
+  return aarch64_parallel_select_half_p (mode, op);
+})
+
 (define_predicate "descending_int_parallel"
   (match_code "parallel")
 {
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/addlv_1.c b/gcc/testsuite/gcc.target/aarch64/simd/addlv_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..21fbdb348a40e0af640fa04e60b8bc16a3e53dd3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/addlv_1.c
@@ -0,0 +1,43 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+#include <arm_neon.h>
+
+/*
+** foo_1:
+**	uaddlv	s([0-9]+), v0.8h
+**	fmov	x0, d\1
+**	ret
+*/
+
+uint64_t
+foo_1 (uint16x8_t b)
+{
+  return vaddlvq_u32 (vpadalq_u16 (vdupq_n_u32 (0), b));
+}
+
+/*
+** foo_2:
+**	uaddlv	s([0-9]+), v0.8h
+**	fmov	w0, s\1
+**	ret
+*/
+
+uint32_t
+foo_2 (uint16x8_t b)
+{
+  return vaddvq_u32 (vpadalq_u16 (vdupq_n_u32 (0), b));
+}
+
+/*
+** foo_3:
+**	saddlv	s([0-9]+), v0.8h
+**	fmov	w0, s\1
+**	ret
+*/
+
+int32_t
+foo_3 (int16x8_t b)
+{
+  return vaddvq_s32 (vpadalq_s16 (vdupq_n_s32 (0), b));
+}

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2023-06-06  8:54 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-06-06  8:54 [PATCH][committed] aarch64: Improve representation of ADDLV instructions Kyrylo Tkachov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).