[gcc r14-1400] aarch64: Reimplement v(r)hadd and vhsub intrinsics with RTL codes

public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed

* [gcc r14-1400] aarch64: Reimplement v(r)hadd and vhsub intrinsics with RTL codes
@ 2023-05-30  9:38 Kyrylo Tkachov
  0 siblings, 0 replies; only message in thread
From: Kyrylo Tkachov @ 2023-05-30  9:38 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:a4dae58abe1a3961aece740b0fada995750c277c

commit r14-1400-ga4dae58abe1a3961aece740b0fada995750c277c
Author: Kyrylo Tkachov <kyrylo.tkachov@arm.com>
Date:   Tue May 30 10:36:46 2023 +0100

    aarch64: Reimplement v(r)hadd and vhsub intrinsics with RTL codes
    
    This patch reimplements the MD patterns for the UHADD,SHADD,UHSUB,SHSUB,URHADD,SRHADD instructions using
    standard RTL operations rather than unspecs. The correct RTL representation involves widening
    the inputs before adding them and halving, followed by a truncation back to the original mode.
    An unfortunate wart in the patch is that we end up having very similar expanders for the intrinsics
    through the aarch64_<su>h<ADDSUB:optab><mode> and aarch64_<su>rhadd<mode> names and the standard names
    for the vector averaging optabs <su>avg<mode>3_floor and <su>avg<mode>3_ceil.
    I'd like to reuse <su>avg<mode>3_ceil for the intrinsics builtin as well but our scheme
    in aarch64-simd-builtins.def and aarch64-builtins.cc makes it awkward by only allowing mappings
    of entries in aarch64-simd-builtins.def to:
       0 - CODE_FOR_aarch64_<name><mode>
       1-9 - CODE_FOR_<name><mode><1-9>
       10 - CODE_FOR_<name><mode>
    
    whereas here we want a string after the <mode> i.e. CODE_FOR_uavg<mode>3_ceil.
    This patch adds a bit of remapping logic in aarch64-builtins.cc before the construction of the
    builtin info that remaps the CODE_FOR_* definitions in aarch64-simd-builtins.def to the
    optab-derived ones. CODE_FOR_aarch64_srhaddv4si gets remapped to CODE_FOR_avgv4si3_ceil, for example.
    It's a bit specific to this case, but this solution requires the least invasive changes while avoiding
    having duplicate expanders just for the sake of a different pattern name.
    
    Bootstrapped and tested on aarch64-none-linux-gnu and aarch64_be-none-elf.
    
    gcc/ChangeLog:
    
            * config/aarch64/aarch64-builtins.cc (VAR1): Move to after inclusion of
            aarch64-builtin-iterators.h.  Add definition to remap shadd, uhadd,
            srhadd, urhadd builtin codes to the standard optab ones.
            * config/aarch64/aarch64-simd.md (<u>avg<mode>3_floor): Rename to...
            (<su_optab>avg<mode>3_floor): ... This.  Expand to RTL codes rather than
            unspec.
            (<u>avg<mode>3_ceil): Rename to...
            (<su_optab>avg<mode>3_ceil): ... This.  Expand to RTL codes rather than
            unspec.
            (aarch64_<su>hsub<mode>): New define_expand.
            (aarch64_<sur>h<addsub><mode><vczle><vczbe>): Split into...
            (*aarch64_<su>h<ADDSUB:optab><mode><vczle><vczbe>_insn): ... This...
            (*aarch64_<su>rhadd<mode><vczle><vczbe>_insn): ... And this.

Diff:
---
 gcc/config/aarch64/aarch64-builtins.cc | 27 ++++++++++-
 gcc/config/aarch64/aarch64-simd.md     | 84 ++++++++++++++++++++++++++++------
 2 files changed, 95 insertions(+), 16 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc
index cb6aae3f1fa..e0bb2128e02 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -502,8 +502,11 @@ aarch64_types_storestruct_lane_p_qualifiers[SIMD_MAX_BUILTIN_ARGS]
 #define CF4(N, X) CODE_FOR_##N##X##4
 #define CF10(N, X) CODE_FOR_##N##X
 
-#define VAR1(T, N, MAP, FLAG, A) \
-  {#N #A, UP (A), CF##MAP (N, A), 0, TYPES_##T, FLAG_##FLAG},
+/* Define cascading VAR<N> macros that are used from
+   aarch64-builtin-iterators.h to iterate over modes.  These definitions
+   will end up generating a number of VAR1 expansions and code later on in the
+   file should redefine VAR1 to whatever it needs to process on a per-mode
+   basis.  */
 #define VAR2(T, N, MAP, FLAG, A, B) \
   VAR1 (T, N, MAP, FLAG, A) \
   VAR1 (T, N, MAP, FLAG, B)
@@ -552,6 +555,26 @@ aarch64_types_storestruct_lane_p_qualifiers[SIMD_MAX_BUILTIN_ARGS]
 
 #include "aarch64-builtin-iterators.h"
 
+/* The builtins below should be expanded through the standard optabs
+   CODE_FOR_[u]avg<mode>3_[floor,ceil].  However the mapping scheme in
+   aarch64-simd-builtins.def does not easily allow us to have a pre-mode
+   ("uavg") and post-mode string ("_ceil") in the CODE_FOR_* construction.
+   So the builtins use a name that is natural for AArch64 instructions
+   e.g. "aarch64_srhadd<mode>" and we re-map these to the optab-related
+   CODE_FOR_ here.  */
+#undef VAR1
+#define VAR1(F,T1,T2,I,M) \
+constexpr insn_code CODE_FOR_aarch64_##F##M = CODE_FOR_##T1##M##3##T2;
+
+BUILTIN_VDQ_BHSI (srhadd, avg, _ceil, 0)
+BUILTIN_VDQ_BHSI (urhadd, uavg, _ceil, 0)
+BUILTIN_VDQ_BHSI (shadd, avg, _floor, 0)
+BUILTIN_VDQ_BHSI (uhadd, uavg, _floor, 0)
+
+#undef VAR1
+#define VAR1(T, N, MAP, FLAG, A) \
+  {#N #A, UP (A), CF##MAP (N, A), 0, TYPES_##T, FLAG_##FLAG},
+
 static aarch64_simd_builtin_datum aarch64_simd_builtin_data[] = {
 #include "aarch64-simd-builtins.def"
 };
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 52901642f93..c4171ed214d 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4944,30 +4944,86 @@
 
 ;; <su><r>h<addsub>.
 
-(define_expand "<u>avg<mode>3_floor"
+(define_expand "<su_optab>avg<mode>3_floor"
   [(set (match_operand:VDQ_BHSI 0 "register_operand")
-	(unspec:VDQ_BHSI [(match_operand:VDQ_BHSI 1 "register_operand")
-			  (match_operand:VDQ_BHSI 2 "register_operand")]
-			 HADD))]
+	(truncate:VDQ_BHSI
+	  (ashiftrt:<V2XWIDE>
+	    (plus:<V2XWIDE>
+	      (ANY_EXTEND:<V2XWIDE>
+		(match_operand:VDQ_BHSI 1 "register_operand"))
+	      (ANY_EXTEND:<V2XWIDE>
+		(match_operand:VDQ_BHSI 2 "register_operand")))
+	    (match_dup 3))))]
   "TARGET_SIMD"
+  {
+    operands[3] = CONST1_RTX (<V2XWIDE>mode);
+  }
 )
 
-(define_expand "<u>avg<mode>3_ceil"
+(define_expand "<su_optab>avg<mode>3_ceil"
   [(set (match_operand:VDQ_BHSI 0 "register_operand")
-	(unspec:VDQ_BHSI [(match_operand:VDQ_BHSI 1 "register_operand")
-			  (match_operand:VDQ_BHSI 2 "register_operand")]
-			 RHADD))]
+	(truncate:VDQ_BHSI
+	  (ashiftrt:<V2XWIDE>
+	    (plus:<V2XWIDE>
+	      (plus:<V2XWIDE>
+		(ANY_EXTEND:<V2XWIDE>
+		  (match_operand:VDQ_BHSI 1 "register_operand"))
+		(ANY_EXTEND:<V2XWIDE>
+		  (match_operand:VDQ_BHSI 2 "register_operand")))
+	       (match_dup 3))
+	    (match_dup 3))))]
   "TARGET_SIMD"
+  {
+    operands[3] = CONST1_RTX (<V2XWIDE>mode);
+  }
 )
 
-(define_insn "aarch64_<sur>h<addsub><mode><vczle><vczbe>"
+(define_expand "aarch64_<su>hsub<mode>"
+  [(set (match_operand:VDQ_BHSI 0 "register_operand")
+	(truncate:VDQ_BHSI
+	  (ashiftrt:<V2XWIDE>
+	    (minus:<V2XWIDE>
+	      (ANY_EXTEND:<V2XWIDE>
+		(match_operand:VDQ_BHSI 1 "register_operand"))
+	      (ANY_EXTEND:<V2XWIDE>
+		(match_operand:VDQ_BHSI 2 "register_operand")))
+	    (match_dup 3))))]
+  "TARGET_SIMD"
+  {
+    operands[3] = CONST1_RTX (<V2XWIDE>mode);
+  }
+)
+
+(define_insn "*aarch64_<su>h<ADDSUB:optab><mode><vczle><vczbe>_insn"
   [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w")
-        (unspec:VDQ_BHSI [(match_operand:VDQ_BHSI 1 "register_operand" "w")
-		      (match_operand:VDQ_BHSI 2 "register_operand" "w")]
-		     HADDSUB))]
+	(truncate:VDQ_BHSI
+	  (ashiftrt:<V2XWIDE>
+	    (ADDSUB:<V2XWIDE>
+	      (ANY_EXTEND:<V2XWIDE>
+		(match_operand:VDQ_BHSI 1 "register_operand" "w"))
+	      (ANY_EXTEND:<V2XWIDE>
+		(match_operand:VDQ_BHSI 2 "register_operand" "w")))
+	    (match_operand:<V2XWIDE> 3 "aarch64_simd_imm_one"))))]
   "TARGET_SIMD"
-  "<sur>h<addsub>\\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
-  [(set_attr "type" "neon_<addsub>_halve<q>")]
+  "<su>h<ADDSUB:optab>\\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
+  [(set_attr "type" "neon_<ADDSUB:optab>_halve<q>")]
+)
+
+(define_insn "*aarch64_<su>rhadd<mode><vczle><vczbe>_insn"
+  [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w")
+	(truncate:VDQ_BHSI
+	  (ashiftrt:<V2XWIDE>
+	    (plus:<V2XWIDE>
+	      (plus:<V2XWIDE>
+		(ANY_EXTEND:<V2XWIDE>
+		  (match_operand:VDQ_BHSI 1 "register_operand" "w"))
+		(ANY_EXTEND:<V2XWIDE>
+		  (match_operand:VDQ_BHSI 2 "register_operand" "w")))
+	       (match_operand:<V2XWIDE> 3 "aarch64_simd_imm_one"))
+	    (match_dup 3))))]
+  "TARGET_SIMD"
+  "<su>rhadd\\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
+  [(set_attr "type" "neon_add_halve<q>")]
 )
 
 ;; <r><addsub>hn<q>.

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2023-05-30  9:38 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-05-30  9:38 [gcc r14-1400] aarch64: Reimplement v(r)hadd and vhsub intrinsics with RTL codes Kyrylo Tkachov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).