public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r12-4349] sve: combine inverted masks into NOTs
@ 2021-10-12 10:36 Tamar Christina
  0 siblings, 0 replies; only message in thread
From: Tamar Christina @ 2021-10-12 10:36 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:e36206c9940d224637083f2e91bd4c70f4b7dd20

commit r12-4349-ge36206c9940d224637083f2e91bd4c70f4b7dd20
Author: Tamar Christina <tamar.christina@arm.com>
Date:   Tue Oct 12 11:34:06 2021 +0100

    sve: combine inverted masks into NOTs
    
    The following example
    
    void f10(double * restrict z, double * restrict w, double * restrict x,
             double * restrict y, int n)
    {
        for (int i = 0; i < n; i++) {
            z[i] = (w[i] > 0) ? x[i] + w[i] : y[i] - w[i];
        }
    }
    
    generates currently:
    
            ld1d    z1.d, p1/z, [x1, x5, lsl 3]
            fcmgt   p2.d, p1/z, z1.d, #0.0
            fcmgt   p0.d, p3/z, z1.d, #0.0
            ld1d    z2.d, p2/z, [x2, x5, lsl 3]
            bic     p0.b, p3/z, p1.b, p0.b
            ld1d    z0.d, p0/z, [x3, x5, lsl 3]
    
    where a BIC is generated between p1 and p0.  A NOT would be better here,
    since it would not require the use of p3 and would open the pattern up to
    being CSEd.
    
    After this patch using a 2 -> 2 split we generate:
    
            ld1d    z1.d, p0/z, [x1, x5, lsl 3]
            fcmgt   p2.d, p0/z, z1.d, #0.0
            not     p1.b, p0/z, p2.b
    
    The additional scratch is needed so that we can CSE the two operations.  If
    both statements wrote to the same register, CSE would not be able to combine
    the values when other statements in between use that register.
    
    A second pattern is needed to capture the NOR case, since combine matches
    the longest sequence first.  Without this pattern we would de-optimize NOR
    and emit two NOTs instead.  I did not find a better way to do this.
    
    gcc/ChangeLog:
    
            * config/aarch64/aarch64-sve.md (*fcm<cmp_op><mode>_bic_combine,
            *fcm<cmp_op><mode>_nor_combine, *fcmuo<mode>_bic_combine,
            *fcmuo<mode>_nor_combine): New.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.target/aarch64/sve/pred-not-gen-1.c: New test.
            * gcc.target/aarch64/sve/pred-not-gen-2.c: New test.
            * gcc.target/aarch64/sve/pred-not-gen-3.c: New test.
            * gcc.target/aarch64/sve/pred-not-gen-4.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64-sve.md                  | 154 +++++++++++++++++++++
 .../gcc.target/aarch64/sve/pred-not-gen-1.c        |  23 +++
 .../gcc.target/aarch64/sve/pred-not-gen-2.c        |  23 +++
 .../gcc.target/aarch64/sve/pred-not-gen-3.c        |  21 +++
 .../gcc.target/aarch64/sve/pred-not-gen-4.c        |  14 ++
 5 files changed, 235 insertions(+)

diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 359fe0e4570..8fe4c721313 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -8126,6 +8126,160 @@
 	  UNSPEC_COND_FCMUO))]
 )
 
+;; Similar to *fcm<cmp_op><mode>_and_combine, but for BIC rather than AND.
+;; In this case, we still need a separate NOT/BIC operation, but predicating
+;; the comparison on the BIC operand removes the need for a PTRUE.
+(define_insn_and_split "*fcm<cmp_op><mode>_bic_combine"
+  [(set (match_operand:<VPRED> 0 "register_operand" "=Upa")
+	(and:<VPRED>
+	  (and:<VPRED>
+	    (not:<VPRED>
+	      (unspec:<VPRED>
+	        [(match_operand:<VPRED> 1)
+	         (const_int SVE_KNOWN_PTRUE)
+	         (match_operand:SVE_FULL_F 2 "register_operand" "w")
+	         (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero" "wDz")]
+	        SVE_COND_FP_CMP_I0))
+	    (match_operand:<VPRED> 4 "register_operand" "Upa"))
+	  (match_dup:<VPRED> 1)))
+   (clobber (match_scratch:<VPRED> 5 "=&Upl"))]
+  "TARGET_SVE"
+  "#"
+  "&& 1"
+  [(set (match_dup 5)
+	(unspec:<VPRED>
+	  [(match_dup 4)
+	   (const_int SVE_MAYBE_NOT_PTRUE)
+	   (match_dup 2)
+	   (match_dup 3)]
+	  SVE_COND_FP_CMP_I0))
+   (set (match_dup 0)
+	(and:<VPRED>
+	  (not:<VPRED>
+	    (match_dup 5))
+	  (match_dup 4)))]
+{
+  if (can_create_pseudo_p ())
+    operands[5] = gen_reg_rtx (<VPRED>mode);
+}
+)
+
+;; Make sure that we expand to a nor when the operand 4 of
+;; *fcm<cmp_op><mode>_bic_combine is a not.
+(define_insn_and_split "*fcm<cmp_op><mode>_nor_combine"
+  [(set (match_operand:<VPRED> 0 "register_operand" "=Upa")
+	(and:<VPRED>
+	  (and:<VPRED>
+	    (not:<VPRED>
+	      (unspec:<VPRED>
+	        [(match_operand:<VPRED> 1)
+	         (const_int SVE_KNOWN_PTRUE)
+	         (match_operand:SVE_FULL_F 2 "register_operand" "w")
+	         (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero" "wDz")]
+	        SVE_COND_FP_CMP_I0))
+	    (not:<VPRED>
+	      (match_operand:<VPRED> 4 "register_operand" "Upa")))
+	  (match_dup:<VPRED> 1)))
+   (clobber (match_scratch:<VPRED> 5 "=&Upl"))]
+  "TARGET_SVE"
+  "#"
+  "&& 1"
+  [(set (match_dup 5)
+	(unspec:<VPRED>
+	  [(match_dup 1)
+	   (const_int SVE_KNOWN_PTRUE)
+	   (match_dup 2)
+	   (match_dup 3)]
+	  SVE_COND_FP_CMP_I0))
+   (set (match_dup 0)
+	(and:<VPRED>
+	  (and:<VPRED>
+	    (not:<VPRED>
+	      (match_dup 5))
+	    (not:<VPRED>
+	      (match_dup 4)))
+	  (match_dup 1)))]
+{
+  if (can_create_pseudo_p ())
+    operands[5] = gen_reg_rtx (<VPRED>mode);
+}
+)
+
+(define_insn_and_split "*fcmuo<mode>_bic_combine"
+  [(set (match_operand:<VPRED> 0 "register_operand" "=Upa")
+	(and:<VPRED>
+	  (and:<VPRED>
+	    (not:<VPRED>
+	      (unspec:<VPRED>
+	        [(match_operand:<VPRED> 1)
+	         (const_int SVE_KNOWN_PTRUE)
+	         (match_operand:SVE_FULL_F 2 "register_operand" "w")
+	         (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero" "wDz")]
+	        UNSPEC_COND_FCMUO))
+	    (match_operand:<VPRED> 4 "register_operand" "Upa"))
+	  (match_dup:<VPRED> 1)))
+   (clobber (match_scratch:<VPRED> 5 "=&Upl"))]
+  "TARGET_SVE"
+  "#"
+  "&& 1"
+  [(set (match_dup 5)
+	(unspec:<VPRED>
+	  [(match_dup 4)
+	   (const_int SVE_MAYBE_NOT_PTRUE)
+	   (match_dup 2)
+	   (match_dup 3)]
+	  UNSPEC_COND_FCMUO))
+   (set (match_dup 0)
+	(and:<VPRED>
+	  (not:<VPRED>
+	    (match_dup 5))
+	  (match_dup 4)))]
+{
+  if (can_create_pseudo_p ())
+    operands[5] = gen_reg_rtx (<VPRED>mode);
+}
+)
+
+;; Same for unordered comparisons.
+(define_insn_and_split "*fcmuo<mode>_nor_combine"
+  [(set (match_operand:<VPRED> 0 "register_operand" "=Upa")
+	(and:<VPRED>
+	  (and:<VPRED>
+	    (not:<VPRED>
+	      (unspec:<VPRED>
+	        [(match_operand:<VPRED> 1)
+	         (const_int SVE_KNOWN_PTRUE)
+	         (match_operand:SVE_FULL_F 2 "register_operand" "w")
+	         (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero" "wDz")]
+	        UNSPEC_COND_FCMUO))
+	    (not:<VPRED>
+	      (match_operand:<VPRED> 4 "register_operand" "Upa")))
+	  (match_dup:<VPRED> 1)))
+   (clobber (match_scratch:<VPRED> 5 "=&Upl"))]
+  "TARGET_SVE"
+  "#"
+  "&& 1"
+  [(set (match_dup 5)
+	(unspec:<VPRED>
+	  [(match_dup 1)
+	   (const_int SVE_KNOWN_PTRUE)
+	   (match_dup 2)
+	   (match_dup 3)]
+	  UNSPEC_COND_FCMUO))
+   (set (match_dup 0)
+	(and:<VPRED>
+	  (and:<VPRED>
+	    (not:<VPRED>
+	      (match_dup 5))
+	    (not:<VPRED>
+	      (match_dup 4)))
+	  (match_dup 1)))]
+{
+  if (can_create_pseudo_p ())
+    operands[5] = gen_reg_rtx (<VPRED>mode);
+}
+)
+
 ;; -------------------------------------------------------------------------
 ;; ---- [FP] Absolute comparisons
 ;; -------------------------------------------------------------------------
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-1.c b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-1.c
new file mode 100644
index 00000000000..2c06564186c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-1.c
@@ -0,0 +1,23 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O3 --save-temps" } */
+
+/*
+** f10:
+** ...
+** 	ld1d	z1.d, p0/z, \[x1, x5, lsl 3\]
+** 	fcmgt	p2.d, p0/z, z1.d, #0.0
+** 	ld1d	z2.d, p2/z, \[x2, x5, lsl 3\]
+** 	not	p1.b, p0/z, p2.b
+** 	ld1d	z0.d, p1/z, \[x3, x5, lsl 3\]
+** ...
+*/
+
+void f10(double * restrict z, double * restrict w, double * restrict x, double * restrict y, int n)
+{
+    for (int i = 0; i < n; i++) {
+        z[i] = (w[i] > 0) ? x[i] + w[i] : y[i] - w[i];
+    }
+}
+
+/* { dg-final { scan-assembler-not {\tbic\t} } } */
+/* { dg-final { scan-assembler-times {\tnot\tp[0-9]+\.b, p[0-9]+/z, p[0-9]+\.b\n} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-2.c b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-2.c
new file mode 100644
index 00000000000..0c3b78d4c67
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-2.c
@@ -0,0 +1,23 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O3 --save-temps" } */
+
+/*
+** f11:
+** ...
+** 	ld1d	z0.d, p0/z, \[x1, x2, lsl 3\]
+** 	fcmgt	p2.d, p3/z, z0.d, #0.0
+** 	fcmgt	p1.d, p0/z, z0.d, #0.0
+** 	not	p1.b, p0/z, p1.b
+** 	ld1d	z1.d, p1/z, \[x3, x2, lsl 3\]
+** ...
+*/
+
+void f11(double * restrict z, double * restrict w, double * restrict x, double * restrict y, int n)
+{
+    for (int i = 0; i < n; i++) {
+        z[i] = (w[i] > 0) ? w[i] : y[i];
+    }
+}
+
+/* { dg-final { scan-assembler-not {\tbic\t} } } */
+/* { dg-final { scan-assembler-times {\tnot\tp[0-9]+\.b, p[0-9]+/z, p[0-9]+\.b\n} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-3.c b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-3.c
new file mode 100644
index 00000000000..248f8ab5719
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-3.c
@@ -0,0 +1,21 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O3 --save-temps" } */
+
+/*
+** f12:
+** ...
+** 	ld1w	z1.s, p0/z, \[x1, x2, lsl 2\]
+** 	cmple	p1.s, p0/z, z1.s, #0
+** 	ld1w	z0.s, p1/z, \[x3, x2, lsl 2\]
+** ...
+*/
+
+void f12(int * restrict z, int * restrict w, int * restrict x, int * restrict y, int n)
+{
+    for (int i = 0; i < n; i++) {
+        z[i] = (w[i] > 0) ? w[i] : y[i];
+    }
+}
+
+/* { dg-final { scan-assembler-not {\tbic\t} } } */
+/* { dg-final { scan-assembler-not {\tnot\tp[0-9]+\.b, p[0-9]+/z, p[0-9]+\.b\n} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-4.c b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-4.c
new file mode 100644
index 00000000000..96200309880
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-4.c
@@ -0,0 +1,14 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O3 --save-temps" } */
+
+#include <math.h>
+
+void f13(double * restrict z, double * restrict w, double * restrict x, double * restrict y, int n)
+{
+    for (int i = 0; i < n; i++) {
+        z[i] = (isunordered(w[i], 0)) ? x[i] + w[i] : y[i] - w[i];
+    }
+}
+
+/* { dg-final { scan-assembler-not {\tbic\t} } } */
+/* { dg-final { scan-assembler-times {\tnot\tp[0-9]+\.b, p[0-9]+/z, p[0-9]+\.b\n} 1 } } */


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2021-10-12 10:36 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-10-12 10:36 [gcc r12-4349] sve: combine inverted masks into NOTs Tamar Christina

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).