public inbox for gcc-patches@gcc.gnu.org
* [PATCH 4/5]AArch64 sve: optimize add reduction patterns
@ 2021-08-31 13:35 Tamar Christina
  2021-10-14  9:24 ` Richard Sandiford
  0 siblings, 1 reply; 2+ messages in thread
From: Tamar Christina @ 2021-08-31 13:35 UTC (permalink / raw)
  To: gcc-patches
  Cc: nd, Richard.Earnshaw, Marcus.Shawcroft, Kyrylo.Tkachov,
	richard.sandiford

Hi All,

The following loop does a conditional reduction using an add:

#include <stdint.h>

int32_t f (int32_t *restrict array, int len, int min)
{
  int32_t iSum = 0;

  for (int i=0; i<len; i++) {
    if (array[i] >= min)
       iSum += array[i];
  }
  return iSum;
}

for this we currently generate:

        mov     z1.b, #0
        mov     z2.s, w2
        mov     z3.d, z1.d
        ptrue   p2.b, all
        ld1w    z0.s, p0/z, [x0, x3, lsl 2]
        cmpge   p1.s, p2/z, z0.s, z2.s
        add     x3, x3, x4
        sel     z0.s, p1, z0.s, z3.s
        add     z1.s, p0/m, z1.s, z0.s
        whilelo p0.s, w3, w1

where the SEL is unnecessary, since it only selects between a value and 0.  This
can be optimized by doing the conditional add under p1 instead of p0.  After this
patch we generate:

        mov     z2.s, w2
        mov     z0.b, #0
        ptrue   p1.b, all
        ld1w    z1.s, p0/z, [x0, x3, lsl 2]
        cmpge   p0.s, p0/z, z1.s, z2.s
        add     x3, x3, x4
        add     z0.s, p0/m, z0.s, z1.s
        whilelo p0.s, w3, w1

and so we drop the SEL and the extra copy of the zero vector.
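
At the gimple level this relies on the identity

   mask2 ? d + (mask1 ? b : 0) : d  ==  (mask1 && mask2) ? d + b : d

where .COND_ADD (mask, a, b, else) computes mask ? a + b : else.  A minimal
scalar C sketch of that identity (the function names below are illustrative
only, not part of the patch or of GCC internals):

#include <assert.h>
#include <stdint.h>

/* Scalar model of the rewrite; names are illustrative only.  */
static int32_t before (int mask1, int mask2, int32_t b, int32_t d)
{
  int32_t a = mask1 ? b : 0;            /* a = mask1 ? b : 0  */
  return mask2 ? d + a : d;             /* c = mask2 ? d + a : d  */
}

static int32_t after (int mask1, int mask2, int32_t b, int32_t d)
{
  return (mask1 && mask2) ? d + b : d;  /* c = mask1 && mask2 ? d + b : d  */
}

int main (void)
{
  for (int m1 = 0; m1 < 2; m1++)
    for (int m2 = 0; m2 < 2; m2++)
      assert (before (m1, m2, 5, 7) == after (m1, m2, 5, 7));
  return 0;
}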

Bootstrapped and regression tested on aarch64-none-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* match.pd: New rule.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/sve/pred-cond-reduc.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/match.pd b/gcc/match.pd
index 19cbad7592787a568d4a7cfd62746d5844c0be5f..ec98a302ac773647413f776fba15930ad247c747 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -6978,6 +6978,18 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
         && element_precision (type) == element_precision (op_type))
     (view_convert (cond_op @2 @3 @4 @5 (view_convert:op_type @1)))))))
 
+/* Detect simplification for a conditional reduction where
+
+   a = mask1 ? b : 0
+   c = mask2 ? d + a : d
+
+   is turned into
+
+   c = mask1 && mask2 ? d + b : d.  */
+(simplify
+  (IFN_COND_ADD @0 @1 (vec_cond @2 @3 integer_zerop) @1)
+   (IFN_COND_ADD (bit_and @0 @2) @1 @3 @1))
+
 /* For pointers @0 and @2 and nonnegative constant offset @1, look for
    expressions like:
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred-cond-reduc.c b/gcc/testsuite/gcc.target/aarch64/sve/pred-cond-reduc.c
new file mode 100644
index 0000000000000000000000000000000000000000..bd53025d3f17224004244dadc88e0c68ded23f12
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pred-cond-reduc.c
@@ -0,0 +1,18 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O3 --save-temps" } */
+
+#include <stdint.h>
+
+int32_t f (int32_t *restrict array, int len, int min)
+{
+  int32_t iSum = 0;
+
+  for (int i=0; i<len; i++) {
+    if (array[i] >= min)
+       iSum += array[i];
+  }
+  return iSum;
+}
+
+
+/* { dg-final { scan-assembler-not {\tsel\tz[0-9]+\.s, p1, z[0-9]+\.s, z[0-9]+\.s} } } */


-- 


* Re: [PATCH 4/5]AArch64 sve: optimize add reduction patterns
  2021-08-31 13:35 [PATCH 4/5]AArch64 sve: optimize add reduction patterns Tamar Christina
@ 2021-10-14  9:24 ` Richard Sandiford
  0 siblings, 0 replies; 2+ messages in thread
From: Richard Sandiford @ 2021-10-14  9:24 UTC (permalink / raw)
  To: Tamar Christina
  Cc: gcc-patches, nd, Richard.Earnshaw, Marcus.Shawcroft, Kyrylo.Tkachov

Tamar Christina <tamar.christina@arm.com> writes:
> Hi All,
>
> The following loop does a conditional reduction using an add:
>
> #include <stdint.h>
>
> int32_t f (int32_t *restrict array, int len, int min)
> {
>   int32_t iSum = 0;
>
>   for (int i=0; i<len; i++) {
>     if (array[i] >= min)
>        iSum += array[i];
>   }
>   return iSum;
> }
>
> for this we currently generate:
>
>         mov     z1.b, #0
>         mov     z2.s, w2
>         mov     z3.d, z1.d
>         ptrue   p2.b, all
>         ld1w    z0.s, p0/z, [x0, x3, lsl 2]
>         cmpge   p1.s, p2/z, z0.s, z2.s
>         add     x3, x3, x4
>         sel     z0.s, p1, z0.s, z3.s
>         add     z1.s, p0/m, z1.s, z0.s
>         whilelo p0.s, w3, w1
>
> where the SEL is unnecessary, since it only selects between a value and 0.  This
> can be optimized by doing the conditional add under p1 instead of p0.  After this
> patch we generate:
>
>         mov     z2.s, w2
>         mov     z0.b, #0
>         ptrue   p1.b, all
>         ld1w    z1.s, p0/z, [x0, x3, lsl 2]
>         cmpge   p0.s, p0/z, z1.s, z2.s
>         add     x3, x3, x4
>         add     z0.s, p0/m, z0.s, z1.s
>         whilelo p0.s, w3, w1
>
> and so we drop the SEL and the extra copy of the zero vector.
>
> Bootstrapped and regression tested on aarch64-none-linux-gnu with no issues.
>
> Ok for master?

OK, thanks.

Richard

> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	* match.pd: New rule.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/aarch64/sve/pred-cond-reduc.c: New test.
>
> --- inline copy of patch -- 
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 19cbad7592787a568d4a7cfd62746d5844c0be5f..ec98a302ac773647413f776fba15930ad247c747 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -6978,6 +6978,18 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>          && element_precision (type) == element_precision (op_type))
>      (view_convert (cond_op @2 @3 @4 @5 (view_convert:op_type @1)))))))
>  
> +/* Detect simplification for a conditional reduction where
> +
> +   a = mask1 ? b : 0
> +   c = mask2 ? d + a : d
> +
> +   is turned into
> +
> +   c = mask1 && mask2 ? d + b : d.  */
> +(simplify
> +  (IFN_COND_ADD @0 @1 (vec_cond @2 @3 integer_zerop) @1)
> +   (IFN_COND_ADD (bit_and @0 @2) @1 @3 @1))
> +
>  /* For pointers @0 and @2 and nonnegative constant offset @1, look for
>     expressions like:
>  
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred-cond-reduc.c b/gcc/testsuite/gcc.target/aarch64/sve/pred-cond-reduc.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..bd53025d3f17224004244dadc88e0c68ded23f12
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pred-cond-reduc.c
> @@ -0,0 +1,18 @@
> +/* { dg-do assemble { target aarch64_asm_sve_ok } } */
> +/* { dg-options "-O3 --save-temps" } */
> +
> +#include <stdint.h>
> +
> +int32_t f (int32_t *restrict array, int len, int min)
> +{
> +  int32_t iSum = 0;
> +
> +  for (int i=0; i<len; i++) {
> +    if (array[i] >= min)
> +       iSum += array[i];
> +  }
> +  return iSum;
> +}
> +
> +
> +/* { dg-final { scan-assembler-not {\tsel\tz[0-9]+\.s, p1, z[0-9]+\.s, z[0-9]+\.s} } } */
