public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r11-9625] AArch32: correct dot-product RTL patterns.
@ 2022-02-25 12:13 Tamar Christina
  0 siblings, 0 replies; only message in thread
From: Tamar Christina @ 2022-02-25 12:13 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:b7be60cb5810b0bc1d9d9d28949e6fef364c3f84

commit r11-9625-gb7be60cb5810b0bc1d9d9d28949e6fef364c3f84
Author: Tamar Christina <tamar.christina@arm.com>
Date:   Fri Feb 25 12:01:56 2022 +0000

    AArch32: correct dot-product RTL patterns.
    
    The previous fix for this problem was wrong due to a subtle difference between
    where NEON expects the RMW values and where intrinsics expects them.
    
    The insn pattern is modeled after the intrinsics and so needs an expand for
    the vectorizer optab to switch the RTL.
    
    However operand[3] is not expected to be written to so the current pattern is
    bogus.
    
    Instead we use the expand to shuffle around the RTL.
    
    The vectorizer expects operands[3] and operands[0] to be
    the same but the aarch64 intrinsics expanders expect operands[0] and
    operands[1] to be the same.
    
    This also fixes some issues with big-endian, each dot product performs 4 8-byte
    multiplications.  However compared to AArch64 we don't enter lanes in GCC
    lane indexed in AArch32 aside from loads/stores.  This means no lane remappings
    are done in arm-builtins.c and so none should be done at the instruction side.
    
    There are some other instructions that need inspections as I think there are
    more incorrect ones.
    
    Third there was a bug in the ACLE specication for dot product which has now been
    fixed[1].  This means some intrinsics were missing and are added by this patch.
    
    [1] https://github.com/ARM-software/acle/releases/tag/r2021Q3
    
    gcc/ChangeLog:
    
            * config/arm/arm_neon.h (vdot_laneq_u32, vdotq_laneq_u32,
            vdot_laneq_s32, vdotq_laneq_s32): New.
            * config/arm/arm_neon_builtins.def (sdot_laneq, udot_laneq): New.
            * config/arm/neon.md (neon_<sup>dot<vsi2qi>): New.
            (<sup>dot_prod<vsi2qi>): Re-order rtl.
            (neon_<sup>dot_lane<vsi2qi>): Fix rtl order and endiannes.
            (neon_<sup>dot_laneq<vsi2qi>): New.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.target/arm/simd/vdot-compile.c: Add new cases.
            * gcc.target/arm/simd/vdot-exec.c: Likewise.

Diff:
---
 gcc/config/arm/arm_neon.h                        |  29 ++++++
 gcc/config/arm/arm_neon_builtins.def             |   2 +
 gcc/config/arm/neon.md                           | 125 +++++++++++++----------
 gcc/testsuite/gcc.target/arm/simd/vdot-compile.c |  26 ++++-
 gcc/testsuite/gcc.target/arm/simd/vdot-exec.c    |  24 ++++-
 5 files changed, 145 insertions(+), 61 deletions(-)

diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 5280b962d90..781a1552818 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -18145,6 +18145,35 @@ vdotq_lane_s32 (int32x4_t __r, int8x16_t __a, int8x8_t __b, const int __index)
   return __builtin_neon_sdot_lanev16qi (__r, __a, __b, __index);
 }
 
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdot_laneq_u32 (uint32x2_t __r, uint8x8_t __a, uint8x16_t __b, const int __index)
+{
+  return __builtin_neon_udot_laneqv8qi_uuuus (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdotq_laneq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b,
+		const int __index)
+{
+  return __builtin_neon_udot_laneqv16qi_uuuus (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdot_laneq_s32 (int32x2_t __r, int8x8_t __a, int8x16_t __b, const int __index)
+{
+  return __builtin_neon_sdot_laneqv8qi (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdotq_laneq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b, const int __index)
+{
+  return __builtin_neon_sdot_laneqv16qi (__r, __a, __b, __index);
+}
+
 #pragma GCC pop_options
 #endif
 
diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def
index e2ce6602ac4..f00d7f39a51 100644
--- a/gcc/config/arm/arm_neon_builtins.def
+++ b/gcc/config/arm/arm_neon_builtins.def
@@ -346,6 +346,8 @@ VAR2 (TERNOP, sdot, v8qi, v16qi)
 VAR2 (UTERNOP, udot, v8qi, v16qi)
 VAR2 (MAC_LANE, sdot_lane, v8qi, v16qi)
 VAR2 (UMAC_LANE, udot_lane, v8qi, v16qi)
+VAR2 (MAC_LANE, sdot_laneq, v8qi, v16qi)
+VAR2 (UMAC_LANE, udot_laneq, v8qi, v16qi)
 
 VAR2 (USTERNOP, usdot, v8qi, v16qi)
 VAR2 (USMAC_LANE_QUADTUP, usdot_lane, v8qi, v16qi)
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 43dd8626368..c0dcec80bc6 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -2972,20 +2972,49 @@
 })
 
 
-;; These instructions map to the __builtins for the Dot Product operations.
-(define_insn "neon_<sup>dot<vsi2qi>"
+;; These map to the auto-vectorizer Dot Product optab.
+;; The auto-vectorizer expects a dot product builtin that also does an
+;; accumulation into the provided register.
+;; Given the following pattern
+;;
+;; for (i=0; i<len; i++) {
+;;     c = a[i] * b[i];
+;;     r += c;
+;; }
+;; return result;
+;;
+;; This can be auto-vectorized to
+;; r  = a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3];
+;;
+;; given enough iterations.  However the vectorizer can keep unrolling the loop
+;; r += a[4]*b[4] + a[5]*b[5] + a[6]*b[6] + a[7]*b[7];
+;; r += a[8]*b[8] + a[9]*b[9] + a[10]*b[10] + a[11]*b[11];
+;; ...
+;;
+;; and so the vectorizer provides r, in which the result has to be accumulated.
+(define_insn "<sup>dot_prod<vsi2qi>"
   [(set (match_operand:VCVTI 0 "register_operand" "=w")
-	(plus:VCVTI (match_operand:VCVTI 1 "register_operand" "0")
-		    (unspec:VCVTI [(match_operand:<VSI2QI> 2
-							"register_operand" "w")
-				   (match_operand:<VSI2QI> 3
-							"register_operand" "w")]
-		DOTPROD)))]
+	(plus:VCVTI
+	  (unspec:VCVTI [(match_operand:<VSI2QI> 1 "register_operand" "w")
+			 (match_operand:<VSI2QI> 2 "register_operand" "w")]
+			 DOTPROD)
+	  (match_operand:VCVTI 3 "register_operand" "0")))]
   "TARGET_DOTPROD"
-  "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %<V_reg>3"
+  "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
   [(set_attr "type" "neon_dot<q>")]
 )
 
+;; These instructions map to the __builtins for the Dot Product operations
+(define_expand "neon_<sup>dot<vsi2qi>"
+  [(set (match_operand:VCVTI 0 "register_operand" "=w")
+	(plus:VCVTI
+	  (unspec:VCVTI [(match_operand:<VSI2QI> 2 "register_operand")
+			 (match_operand:<VSI2QI> 3 "register_operand")]
+			 DOTPROD)
+	  (match_operand:VCVTI 1 "register_operand")))]
+  "TARGET_DOTPROD"
+)
+
 ;; These instructions map to the __builtins for the Dot Product operations.
 (define_insn "neon_usdot<vsi2qi>"
   [(set (match_operand:VCVTI 0 "register_operand" "=w")
@@ -3004,17 +3033,40 @@
 ;; indexed operations.
 (define_insn "neon_<sup>dot_lane<vsi2qi>"
   [(set (match_operand:VCVTI 0 "register_operand" "=w")
-	(plus:VCVTI (match_operand:VCVTI 1 "register_operand" "0")
-		    (unspec:VCVTI [(match_operand:<VSI2QI> 2
-							"register_operand" "w")
-				   (match_operand:V8QI 3 "register_operand" "t")
-				   (match_operand:SI 4 "immediate_operand" "i")]
-		DOTPROD)))]
+	(plus:VCVTI
+	  (unspec:VCVTI [(match_operand:<VSI2QI> 2 "register_operand" "w")
+			 (match_operand:V8QI 3 "register_operand" "t")
+			 (match_operand:SI 4 "immediate_operand" "i")]
+			 DOTPROD)
+	  (match_operand:VCVTI 1 "register_operand" "0")))]
+  "TARGET_DOTPROD"
+  "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %P3[%c4]";
+  [(set_attr "type" "neon_dot<q>")]
+)
+
+;; These instructions map to the __builtins for the Dot Product
+;; indexed operations.
+(define_insn "neon_<sup>dot_laneq<vsi2qi>"
+  [(set (match_operand:VCVTI 0 "register_operand" "=w")
+	(plus:VCVTI
+	  (unspec:VCVTI [(match_operand:<VSI2QI> 2 "register_operand" "w")
+			 (match_operand:V16QI 3 "register_operand" "t")
+			 (match_operand:SI 4 "immediate_operand" "i")]
+			 DOTPROD)
+	  (match_operand:VCVTI 1 "register_operand" "0")))]
   "TARGET_DOTPROD"
   {
-    operands[4]
-      = GEN_INT (NEON_ENDIAN_LANE_N (V8QImode, INTVAL (operands[4])));
-    return "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %P3[%c4]";
+    int lane = INTVAL (operands[4]);
+    if (lane > GET_MODE_NUNITS (V2SImode) - 1)
+      {
+	operands[4] = GEN_INT (lane - GET_MODE_NUNITS (V2SImode));
+	return "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %f3[%c4]";
+      }
+    else
+      {
+	operands[4] = GEN_INT (lane);
+	return "v<sup>dot.<opsuffix>\\t%<V_reg>0, %<V_reg>2, %e3[%c4]";
+      }
   }
   [(set_attr "type" "neon_dot<q>")]
 )
@@ -3062,43 +3114,6 @@
   [(set_attr "type" "neon_dot<q>")]
 )
 
-;; These expands map to the Dot Product optab the vectorizer checks for.
-;; The auto-vectorizer expects a dot product builtin that also does an
-;; accumulation into the provided register.
-;; Given the following pattern
-;;
-;; for (i=0; i<len; i++) {
-;;     c = a[i] * b[i];
-;;     r += c;
-;; }
-;; return result;
-;;
-;; This can be auto-vectorized to
-;; r  = a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3];
-;;
-;; given enough iterations.  However the vectorizer can keep unrolling the loop
-;; r += a[4]*b[4] + a[5]*b[5] + a[6]*b[6] + a[7]*b[7];
-;; r += a[8]*b[8] + a[9]*b[9] + a[10]*b[10] + a[11]*b[11];
-;; ...
-;;
-;; and so the vectorizer provides r, in which the result has to be accumulated.
-(define_expand "<sup>dot_prod<vsi2qi>"
-  [(set (match_operand:VCVTI 0 "register_operand")
-	(plus:VCVTI (unspec:VCVTI [(match_operand:<VSI2QI> 1
-							"register_operand")
-				   (match_operand:<VSI2QI> 2
-							"register_operand")]
-		     DOTPROD)
-		    (match_operand:VCVTI 3 "register_operand")))]
-  "TARGET_DOTPROD"
-{
-  emit_insn (
-    gen_neon_<sup>dot<vsi2qi> (operands[3], operands[3], operands[1],
-				 operands[2]));
-  emit_insn (gen_rtx_SET (operands[0], operands[3]));
-  DONE;
-})
-
 (define_expand "neon_copysignf<mode>"
   [(match_operand:VCVTF 0 "register_operand")
    (match_operand:VCVTF 1 "register_operand")
diff --git a/gcc/testsuite/gcc.target/arm/simd/vdot-compile.c b/gcc/testsuite/gcc.target/arm/simd/vdot-compile.c
index b3bd3bf00e3..d3541e829a4 100644
--- a/gcc/testsuite/gcc.target/arm/simd/vdot-compile.c
+++ b/gcc/testsuite/gcc.target/arm/simd/vdot-compile.c
@@ -49,8 +49,28 @@ int32x4_t sfooq_lane (int32x4_t r, int8x16_t x, int8x8_t y)
   return vdotq_lane_s32 (r, x, y, 0);
 }
 
-/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\td[0-9]+, d[0-9]+, d[0-9]+} 4 } } */
+int32x2_t sfoo_laneq1 (int32x2_t r, int8x8_t x, int8x16_t y)
+{
+  return vdot_laneq_s32 (r, x, y, 0);
+}
+
+int32x4_t sfooq_lane1 (int32x4_t r, int8x16_t x, int8x16_t y)
+{
+  return vdotq_laneq_s32 (r, x, y, 0);
+}
+
+int32x2_t sfoo_laneq2 (int32x2_t r, int8x8_t x, int8x16_t y)
+{
+  return vdot_laneq_s32 (r, x, y, 2);
+}
+
+int32x4_t sfooq_lane2 (int32x4_t r, int8x16_t x, int8x16_t y)
+{
+  return vdotq_laneq_s32 (r, x, y, 2);
+}
+
+/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\td[0-9]+, d[0-9]+, d[0-9]+} 6 } } */
 /* { dg-final { scan-assembler-times {v[us]dot\.[us]8\tq[0-9]+, q[0-9]+, q[0-9]+} 2 } } */
-/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\td[0-9]+, d[0-9]+, d[0-9]+\[#?[0-9]\]} 2 } } */
-/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\tq[0-9]+, q[0-9]+, d[0-9]+\[#?[0-9]\]} 2 } } */
+/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\td[0-9]+, d[0-9]+, d[0-9]+\[#?[0-9]\]} 4 } } */
+/* { dg-final { scan-assembler-times {v[us]dot\.[us]8\tq[0-9]+, q[0-9]+, d[0-9]+\[#?[0-9]\]} 4 } } */
 
diff --git a/gcc/testsuite/gcc.target/arm/simd/vdot-exec.c b/gcc/testsuite/gcc.target/arm/simd/vdot-exec.c
index 054f4703394..89a196ea17c 100644
--- a/gcc/testsuite/gcc.target/arm/simd/vdot-exec.c
+++ b/gcc/testsuite/gcc.target/arm/simd/vdot-exec.c
@@ -10,7 +10,7 @@ extern void abort();
 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
 # define ORDER(x, y) y
 #else
-# define ORDER(x, y) x - y
+# define ORDER(x, y) (x - y)
 #endif
 
 #define P(n1,n2) n1,n1,n1,n1,n2,n2,n2,n2
@@ -33,7 +33,20 @@ extern void abort();
 	t3 f##_##rx1 = {0};			    \
 	f##_##rx1 =  f (f##_##rx1, f##_##x, f##_##y, ORDER (1, 1));  \
 	if (f##_##rx1[0] != n3 || f##_##rx1[1] != n4)   \
-	  abort (); \
+	  abort ();
+
+#define P2(n1,n2) n1,n1,n1,n1,n2,n2,n2,n2,n1,n1,n1,n1,n2,n2,n2,n2
+#define TEST_LANEQ(t1, t2, t3, f, r1, r2, n1, n2, n3, n4) \
+	ARR(f, x, t1, r1);		    \
+	ARR(f, y, t2, r2);		    \
+	t3 f##_##rx = {0};		    \
+	f##_##rx = f (f##_##rx, f##_##x, f##_##y, ORDER (3, 2));  \
+	if (f##_##rx[0] != n1 || f##_##rx[1] != n2)   \
+	  abort ();				    \
+	t3 f##_##rx1 = {0};			    \
+	f##_##rx1 =  f (f##_##rx1, f##_##x, f##_##y, ORDER (3, 3));  \
+	if (f##_##rx1[0] != n3 || f##_##rx1[1] != n4)   \
+	  abort ();
 
 int
 main()
@@ -45,11 +58,16 @@ main()
   TEST (int8x16_t, int8x16_t, int32x4_t, vdotq_s32, P(1,2), P(-2,-3), -8, -24);
 
   TEST_LANE (uint8x8_t, uint8x8_t, uint32x2_t, vdot_lane_u32, P(1,2), P(2,3), 8, 16, 12, 24);
-
   TEST_LANE (int8x8_t, int8x8_t, int32x2_t, vdot_lane_s32, P(1,2), P(-2,-3), -8, -16, -12, -24);
 
   TEST_LANE (uint8x16_t, uint8x8_t, uint32x4_t, vdotq_lane_u32, P(1,2), P(2,3), 8, 16, 12, 24);
   TEST_LANE (int8x16_t, int8x8_t, int32x4_t, vdotq_lane_s32, P(1,2), P(-2,-3), -8, -16, -12, -24);
 
+  TEST_LANEQ (uint8x8_t, uint8x16_t, uint32x2_t, vdot_laneq_u32, P(1,2), P2(2,3), 8, 16, 12, 24);
+  TEST_LANEQ (int8x8_t, int8x16_t, int32x2_t, vdot_laneq_s32, P(1,2), P2(-2,-3), -8, -16, -12, -24);
+
+  TEST_LANEQ (uint8x16_t, uint8x16_t, uint32x4_t, vdotq_laneq_u32, P2(1,2), P2(2,3), 8, 16, 12, 24);
+  TEST_LANEQ (int8x16_t, int8x16_t, int32x4_t, vdotq_laneq_s32, P2(1,2), P2(-2,-3), -8, -16, -12, -24);
+
   return 0;
 }


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2022-02-25 12:13 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-02-25 12:13 [gcc r11-9625] AArch32: correct dot-product RTL patterns Tamar Christina

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).