[PATCH]AArch64 Fix left fold sum reduction RTL patterns [PR104049]

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

* [PATCH]AArch64 Fix left fold sum reduction RTL patterns [PR104049]
@ 2022-04-05 18:08 Tamar Christina
  0 siblings, 0 replies; only message in thread
From: Tamar Christina @ 2022-04-05 18:08 UTC (permalink / raw)
  To: nd, GCC Patches
  Cc: Richard Earnshaw, Marcus Shawcroft, Kyrylo Tkachov, Richard Sandiford

[-- Attachment #1: Type: text/plain, Size: 7301 bytes --]

Hi All,

As the discussion in the PR pointed out the RTL we have for the REDUC_PLUS
patterns are wrong.  The UNSPECs are modelled as returning a vector and then
in an expand pattern we emit a vec_select of the 0th element to get the scalar.

This is incorrect as the instruction itself already only returns a single scalar
and by declaring it returns a vector it allows combine to push in a subreg into
the pattern, which causes reload to make duplicate moves.

This patch corrects this by removing the weird indirection and making the RTL
pattern model the correct semantics of the instruction immediately.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

    PR target/104049
	* config/aarch64/aarch64-simd.md
	(aarch64_reduc_plus_internal<mode>): Fix RTL and rename to...
     (reduc_plus_scal_<mode>): ... This.
	(reduc_plus_scal_v4sf): Moved.
	(aarch64_reduc_plus_internalv2si): Fix RTL and rename to...
	(reduc_plus_scal_v2si): ... This.

gcc/testsuite/ChangeLog:

    PR target/104049
	* gcc.target/aarch64/vadd_reduc-1.c: New test.
	* gcc.target/aarch64/vadd_reduc-2.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 18733428f3fb91d937346aa360f6d1fe13ca1eae..ad69a1c4eafe30a98e3d2273fe498694eeddc05a 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3385,20 +3385,6 @@ (define_insn "<fmaxmin><mode>3"
 
 ;; 'across lanes' add.
 
-(define_expand "reduc_plus_scal_<mode>"
-  [(match_operand:<VEL> 0 "register_operand")
-   (unspec:VDQ_I [(match_operand:VDQ_I 1 "register_operand")]
-	       UNSPEC_ADDV)]
-  "TARGET_SIMD"
-  {
-    rtx elt = aarch64_endian_lane_rtx (<MODE>mode, 0);
-    rtx scratch = gen_reg_rtx (<MODE>mode);
-    emit_insn (gen_aarch64_reduc_plus_internal<mode> (scratch, operands[1]));
-    emit_insn (gen_aarch64_get_lane<mode> (operands[0], scratch, elt));
-    DONE;
-  }
-)
-
 (define_insn "aarch64_faddp<mode>"
  [(set (match_operand:VHSDF 0 "register_operand" "=w")
        (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w")
@@ -3409,15 +3395,47 @@ (define_insn "aarch64_faddp<mode>"
   [(set_attr "type" "neon_fp_reduc_add_<stype><q>")]
 )
 
-(define_insn "aarch64_reduc_plus_internal<mode>"
- [(set (match_operand:VDQV 0 "register_operand" "=w")
-       (unspec:VDQV [(match_operand:VDQV 1 "register_operand" "w")]
+(define_insn "reduc_plus_scal_<mode>"
+ [(set (match_operand:<VEL> 0 "register_operand" "=w")
+       (unspec:<VEL> [(match_operand:VDQV 1 "register_operand" "w")]
 		    UNSPEC_ADDV))]
  "TARGET_SIMD"
  "add<VDQV:vp>\\t%<Vetype>0, %1.<Vtype>"
   [(set_attr "type" "neon_reduc_add<q>")]
 )
 
+(define_insn "reduc_plus_scal_v2si"
+ [(set (match_operand:SI 0 "register_operand" "=w")
+       (unspec:SI [(match_operand:V2SI 1 "register_operand" "w")]
+		    UNSPEC_ADDV))]
+ "TARGET_SIMD"
+ "addp\\t%0.2s, %1.2s, %1.2s"
+  [(set_attr "type" "neon_reduc_add")]
+)
+
+(define_insn "reduc_plus_scal_<mode>"
+ [(set (match_operand:<VEL> 0 "register_operand" "=w")
+       (unspec:<VEL> [(match_operand:V2F 1 "register_operand" "w")]
+		   UNSPEC_FADDV))]
+ "TARGET_SIMD"
+ "faddp\\t%<Vetype>0, %1.<Vtype>"
+  [(set_attr "type" "neon_fp_reduc_add_<Vetype><q>")]
+)
+
+(define_expand "reduc_plus_scal_v4sf"
+ [(set (match_operand:SF 0 "register_operand")
+       (unspec:SF [(match_operand:V4SF 1 "register_operand")]
+		    UNSPEC_FADDV))]
+ "TARGET_SIMD"
+{
+  rtx elt = aarch64_endian_lane_rtx (V4SFmode, 0);
+  rtx scratch = gen_reg_rtx (V4SFmode);
+  emit_insn (gen_aarch64_faddpv4sf (scratch, operands[1], operands[1]));
+  emit_insn (gen_aarch64_faddpv4sf (scratch, scratch, scratch));
+  emit_insn (gen_aarch64_get_lanev4sf (operands[0], scratch, elt));
+  DONE;
+})
+
 (define_insn "aarch64_<su>addlv<mode>"
  [(set (match_operand:<VWIDE_S> 0 "register_operand" "=w")
        (unspec:<VWIDE_S> [(match_operand:VDQV_L 1 "register_operand" "w")]
@@ -3447,38 +3465,6 @@ (define_insn "aarch64_zero_extend<GPI:mode>_reduc_plus_<VDQV_E:mode>"
   [(set_attr "type" "neon_reduc_add<VDQV_E:q>")]
 )
 
-(define_insn "aarch64_reduc_plus_internalv2si"
- [(set (match_operand:V2SI 0 "register_operand" "=w")
-       (unspec:V2SI [(match_operand:V2SI 1 "register_operand" "w")]
-		    UNSPEC_ADDV))]
- "TARGET_SIMD"
- "addp\\t%0.2s, %1.2s, %1.2s"
-  [(set_attr "type" "neon_reduc_add")]
-)
-
-(define_insn "reduc_plus_scal_<mode>"
- [(set (match_operand:<VEL> 0 "register_operand" "=w")
-       (unspec:<VEL> [(match_operand:V2F 1 "register_operand" "w")]
-		   UNSPEC_FADDV))]
- "TARGET_SIMD"
- "faddp\\t%<Vetype>0, %1.<Vtype>"
-  [(set_attr "type" "neon_fp_reduc_add_<Vetype><q>")]
-)
-
-(define_expand "reduc_plus_scal_v4sf"
- [(set (match_operand:SF 0 "register_operand")
-       (unspec:V4SF [(match_operand:V4SF 1 "register_operand")]
-		    UNSPEC_FADDV))]
- "TARGET_SIMD"
-{
-  rtx elt = aarch64_endian_lane_rtx (V4SFmode, 0);
-  rtx scratch = gen_reg_rtx (V4SFmode);
-  emit_insn (gen_aarch64_faddpv4sf (scratch, operands[1], operands[1]));
-  emit_insn (gen_aarch64_faddpv4sf (scratch, scratch, scratch));
-  emit_insn (gen_aarch64_get_lanev4sf (operands[0], scratch, elt));
-  DONE;
-})
-
 (define_insn "clrsb<mode>2"
   [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w")
         (clrsb:VDQ_BHSI (match_operand:VDQ_BHSI 1 "register_operand" "w")))]
diff --git a/gcc/testsuite/gcc.target/aarch64/vadd_reduc-1.c b/gcc/testsuite/gcc.target/aarch64/vadd_reduc-1.c
new file mode 100644
index 0000000000000000000000000000000000000000..271a1c3e8c319f7bdb8669e4b3199b6f88ef7f5d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vadd_reduc-1.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_neon.h>
+
+typedef int v4si __attribute__ ((vector_size (16)));
+
+/*
+**bar:
+**	...
+**	addv	s0, v0.4s
+**	fmov	w0, s0
+**	lsr	w1, w0, 16
+**	add	w0, w1, w0, uxth
+**	ret
+*/
+int bar (v4si x)
+{
+  unsigned int sum = vaddvq_s32 (x);
+  return (((uint16_t)(sum & 0xffff)) + ((uint32_t)sum >> 16));
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/vadd_reduc-2.c b/gcc/testsuite/gcc.target/aarch64/vadd_reduc-2.c
new file mode 100644
index 0000000000000000000000000000000000000000..5355cbcca34f40a686da837acf75780eb4286974
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vadd_reduc-2.c
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <stdint.h>
+
+/*
+**test:
+**	...
+**	addv	s0, v0.4s
+**	fmov	w0, s0
+**	and	w1, w0, 65535
+**	add	w0, w1, w0, lsr 16
+**	lsr	w0, w0, 1
+**	ret
+*/
+int test (uint8_t *p, uint32_t t[1][1], int n) {
+
+  int sum = 0;
+  uint32_t a0;
+  for (int i = 0; i < 4; i++, p++)
+    t[i][0] = p[0];
+
+  for (int i = 0; i < 4; i++) {
+    {
+      int t0 = t[0][i] + t[0][i];
+      a0 = t0;
+    };
+    sum += a0;
+  }
+  return (((uint16_t)sum) + ((uint32_t)sum >> 16)) >> 1;
+}


-- 

[-- Attachment #2: rb15496.patch --]
[-- Type: text/plain, Size: 6075 bytes --]

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 18733428f3fb91d937346aa360f6d1fe13ca1eae..ad69a1c4eafe30a98e3d2273fe498694eeddc05a 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3385,20 +3385,6 @@ (define_insn "<fmaxmin><mode>3"
 
 ;; 'across lanes' add.
 
-(define_expand "reduc_plus_scal_<mode>"
-  [(match_operand:<VEL> 0 "register_operand")
-   (unspec:VDQ_I [(match_operand:VDQ_I 1 "register_operand")]
-	       UNSPEC_ADDV)]
-  "TARGET_SIMD"
-  {
-    rtx elt = aarch64_endian_lane_rtx (<MODE>mode, 0);
-    rtx scratch = gen_reg_rtx (<MODE>mode);
-    emit_insn (gen_aarch64_reduc_plus_internal<mode> (scratch, operands[1]));
-    emit_insn (gen_aarch64_get_lane<mode> (operands[0], scratch, elt));
-    DONE;
-  }
-)
-
 (define_insn "aarch64_faddp<mode>"
  [(set (match_operand:VHSDF 0 "register_operand" "=w")
        (unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w")
@@ -3409,15 +3395,47 @@ (define_insn "aarch64_faddp<mode>"
   [(set_attr "type" "neon_fp_reduc_add_<stype><q>")]
 )
 
-(define_insn "aarch64_reduc_plus_internal<mode>"
- [(set (match_operand:VDQV 0 "register_operand" "=w")
-       (unspec:VDQV [(match_operand:VDQV 1 "register_operand" "w")]
+(define_insn "reduc_plus_scal_<mode>"
+ [(set (match_operand:<VEL> 0 "register_operand" "=w")
+       (unspec:<VEL> [(match_operand:VDQV 1 "register_operand" "w")]
 		    UNSPEC_ADDV))]
  "TARGET_SIMD"
  "add<VDQV:vp>\\t%<Vetype>0, %1.<Vtype>"
   [(set_attr "type" "neon_reduc_add<q>")]
 )
 
+(define_insn "reduc_plus_scal_v2si"
+ [(set (match_operand:SI 0 "register_operand" "=w")
+       (unspec:SI [(match_operand:V2SI 1 "register_operand" "w")]
+		    UNSPEC_ADDV))]
+ "TARGET_SIMD"
+ "addp\\t%0.2s, %1.2s, %1.2s"
+  [(set_attr "type" "neon_reduc_add")]
+)
+
+(define_insn "reduc_plus_scal_<mode>"
+ [(set (match_operand:<VEL> 0 "register_operand" "=w")
+       (unspec:<VEL> [(match_operand:V2F 1 "register_operand" "w")]
+		   UNSPEC_FADDV))]
+ "TARGET_SIMD"
+ "faddp\\t%<Vetype>0, %1.<Vtype>"
+  [(set_attr "type" "neon_fp_reduc_add_<Vetype><q>")]
+)
+
+(define_expand "reduc_plus_scal_v4sf"
+ [(set (match_operand:SF 0 "register_operand")
+       (unspec:SF [(match_operand:V4SF 1 "register_operand")]
+		    UNSPEC_FADDV))]
+ "TARGET_SIMD"
+{
+  rtx elt = aarch64_endian_lane_rtx (V4SFmode, 0);
+  rtx scratch = gen_reg_rtx (V4SFmode);
+  emit_insn (gen_aarch64_faddpv4sf (scratch, operands[1], operands[1]));
+  emit_insn (gen_aarch64_faddpv4sf (scratch, scratch, scratch));
+  emit_insn (gen_aarch64_get_lanev4sf (operands[0], scratch, elt));
+  DONE;
+})
+
 (define_insn "aarch64_<su>addlv<mode>"
  [(set (match_operand:<VWIDE_S> 0 "register_operand" "=w")
        (unspec:<VWIDE_S> [(match_operand:VDQV_L 1 "register_operand" "w")]
@@ -3447,38 +3465,6 @@ (define_insn "aarch64_zero_extend<GPI:mode>_reduc_plus_<VDQV_E:mode>"
   [(set_attr "type" "neon_reduc_add<VDQV_E:q>")]
 )
 
-(define_insn "aarch64_reduc_plus_internalv2si"
- [(set (match_operand:V2SI 0 "register_operand" "=w")
-       (unspec:V2SI [(match_operand:V2SI 1 "register_operand" "w")]
-		    UNSPEC_ADDV))]
- "TARGET_SIMD"
- "addp\\t%0.2s, %1.2s, %1.2s"
-  [(set_attr "type" "neon_reduc_add")]
-)
-
-(define_insn "reduc_plus_scal_<mode>"
- [(set (match_operand:<VEL> 0 "register_operand" "=w")
-       (unspec:<VEL> [(match_operand:V2F 1 "register_operand" "w")]
-		   UNSPEC_FADDV))]
- "TARGET_SIMD"
- "faddp\\t%<Vetype>0, %1.<Vtype>"
-  [(set_attr "type" "neon_fp_reduc_add_<Vetype><q>")]
-)
-
-(define_expand "reduc_plus_scal_v4sf"
- [(set (match_operand:SF 0 "register_operand")
-       (unspec:V4SF [(match_operand:V4SF 1 "register_operand")]
-		    UNSPEC_FADDV))]
- "TARGET_SIMD"
-{
-  rtx elt = aarch64_endian_lane_rtx (V4SFmode, 0);
-  rtx scratch = gen_reg_rtx (V4SFmode);
-  emit_insn (gen_aarch64_faddpv4sf (scratch, operands[1], operands[1]));
-  emit_insn (gen_aarch64_faddpv4sf (scratch, scratch, scratch));
-  emit_insn (gen_aarch64_get_lanev4sf (operands[0], scratch, elt));
-  DONE;
-})
-
 (define_insn "clrsb<mode>2"
   [(set (match_operand:VDQ_BHSI 0 "register_operand" "=w")
         (clrsb:VDQ_BHSI (match_operand:VDQ_BHSI 1 "register_operand" "w")))]
diff --git a/gcc/testsuite/gcc.target/aarch64/vadd_reduc-1.c b/gcc/testsuite/gcc.target/aarch64/vadd_reduc-1.c
new file mode 100644
index 0000000000000000000000000000000000000000..271a1c3e8c319f7bdb8669e4b3199b6f88ef7f5d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vadd_reduc-1.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_neon.h>
+
+typedef int v4si __attribute__ ((vector_size (16)));
+
+/*
+**bar:
+**	...
+**	addv	s0, v0.4s
+**	fmov	w0, s0
+**	lsr	w1, w0, 16
+**	add	w0, w1, w0, uxth
+**	ret
+*/
+int bar (v4si x)
+{
+  unsigned int sum = vaddvq_s32 (x);
+  return (((uint16_t)(sum & 0xffff)) + ((uint32_t)sum >> 16));
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/vadd_reduc-2.c b/gcc/testsuite/gcc.target/aarch64/vadd_reduc-2.c
new file mode 100644
index 0000000000000000000000000000000000000000..5355cbcca34f40a686da837acf75780eb4286974
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vadd_reduc-2.c
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <stdint.h>
+
+/*
+**test:
+**	...
+**	addv	s0, v0.4s
+**	fmov	w0, s0
+**	and	w1, w0, 65535
+**	add	w0, w1, w0, lsr 16
+**	lsr	w0, w0, 1
+**	ret
+*/
+int test (uint8_t *p, uint32_t t[1][1], int n) {
+
+  int sum = 0;
+  uint32_t a0;
+  for (int i = 0; i < 4; i++, p++)
+    t[i][0] = p[0];
+
+  for (int i = 0; i < 4; i++) {
+    {
+      int t0 = t[0][i] + t[0][i];
+      a0 = t0;
+    };
+    sum += a0;
+  }
+  return (((uint16_t)sum) + ((uint32_t)sum >> 16)) >> 1;
+}


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2022-04-05 18:09 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-04-05 18:08 [PATCH]AArch64 Fix left fold sum reduction RTL patterns [PR104049] Tamar Christina

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).