[PATCH 1/5] AArch64 SVE: combine inverted masks into NOTs
From: Tamar Christina @ 2021-08-31 13:30 UTC
  To: gcc-patches
  Cc: nd, Richard.Earnshaw, Marcus.Shawcroft, Kyrylo.Tkachov,
	richard.sandiford

Hi All,

The following example

void f10(double * restrict z, double * restrict w, double * restrict x,
	 double * restrict y, int n)
{
    for (int i = 0; i < n; i++) {
        z[i] = (w[i] > 0) ? x[i] + w[i] : y[i] - w[i];
    }
}

currently generates:

        ld1d    z1.d, p1/z, [x1, x5, lsl 3]
        fcmgt   p2.d, p1/z, z1.d, #0.0
        fcmgt   p0.d, p3/z, z1.d, #0.0
        ld1d    z2.d, p2/z, [x2, x5, lsl 3]
        bic     p0.b, p3/z, p1.b, p0.b
        ld1d    z0.d, p0/z, [x3, x5, lsl 3]

Here a BIC is generated between p1 and p0, where a NOT would be better:
the NOT does not require p3, and it opens the pattern up to being CSEd.
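
For reference, here is a rough per-lane model of the two predicate
instructions (a hand-written C sketch, not part of the patch):

#include <stdbool.h>

/* BIC pd.b, pg/z, pa.b, pb.b  ->  pd = pg & pa & ~pb.  */
static bool
pred_bic (bool pg, bool pa, bool pb)
{
  return pg && pa && !pb;
}

/* NOT pd.b, pg/z, pa.b  ->  pd = pg & ~pa.  */
static bool
pred_not (bool pg, bool pa)
{
  return pg && !pa;
}

Since p3 above is all-true, pred_bic (p3, p1, p0) == pred_not (p1, p0):
the AND with the loop mask p1 folds into the NOT's governing predicate,
freeing p3.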

After this patch, using a 2 -> 2 split, we generate:

        ld1d    z1.d, p0/z, [x1, x5, lsl 3]
        fcmgt   p2.d, p0/z, z1.d, #0.0
        not     p1.b, p0/z, p2.b

The additional scratch register is needed so that the two operations can
be CSEd.  If both statements wrote to the same register, CSE would be
unable to merge the values when other statements that use the register
appear in between.
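
To illustrate with a hand-written C analogy (not compiler output): with
a fresh scratch per expansion, the two identical comparisons stay in
distinct pseudos and remain available, so CSE can merge them:

int
with_scratch (int mask, int w)
{
  int cmp1 = (w > 0) ? mask : 0;   /* first expansion's scratch */
  int res1 = mask & ~cmp1;
  int cmp2 = (w > 0) ? mask : 0;   /* identical: CSE folds it into cmp1 */
  int res2 = mask & ~cmp2;
  return res1 + res2;
}

If each expansion instead wrote the comparison into its destination
register, the final AND would clobber the comparison value, so a later
identical comparison would have nothing left to reuse.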

Note: This patch series works incrementally, in small steps, towards
      generating the most efficient code for this and other loops.

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64-sve.md (*mask_inv_combine): New.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/sve/pred-not-gen.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 359fe0e457096cf4042a774789a5c241420703d3..2c23c6b12bafb038d82920e7141a418e078a2c65 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -8126,6 +8126,42 @@ (define_insn_and_split "*fcmuo<mode>_and_combine"
 	  UNSPEC_COND_FCMUO))]
 )
 
+;; Make sure that inversions of masked comparisons are always on the mask
+;; instead of on the operation.
+(define_insn_and_split "*mask_inv_combine"
+  [(set (match_operand:<VPRED> 0 "register_operand" "=Upa")
+	(and:<VPRED>
+	  (and:<VPRED>
+	    (not:<VPRED>
+	      (unspec:<VPRED>
+	        [(match_operand:<VPRED> 1)
+	         (const_int SVE_KNOWN_PTRUE)
+	         (match_operand:SVE_FULL_F 2 "register_operand" "w")
+	         (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero" "wDz")]
+	        SVE_COND_FP_CMP_I0))
+	    (match_operand:<VPRED> 4 "register_operand" "Upa"))
+	  (match_dup:<VPRED> 1)))
+   (clobber (match_scratch:<VPRED> 5 "=&Upa"))]
+  "TARGET_SVE"
+  "#"
+  "&& 1"
+  [(set (match_dup 5)
+	(unspec:<VPRED>
+	  [(match_dup 4)
+	   (const_int SVE_MAYBE_NOT_PTRUE)
+	   (match_dup 2)
+	   (match_dup 3)]
+	  SVE_COND_FP_CMP_I0))
+   (set (match_dup 0)
+	(and:<VPRED>
+	  (not:<VPRED>
+	    (match_dup 5))
+	  (match_dup 4)))]
+{
+  operands[5] = gen_reg_rtx (<VPRED>mode);
+}
+)
+
 ;; -------------------------------------------------------------------------
 ;; ---- [FP] Absolute comparisons
 ;; -------------------------------------------------------------------------
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen.c b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen.c
new file mode 100644
index 0000000000000000000000000000000000000000..a5bb616ef505a63075cf33203de8cf8e8c38b95d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen.c
@@ -0,0 +1,56 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O3 --save-temps -fno-schedule-insns -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+/*
+** f10:
+** ...
+** 	ld1d	z1.d, p0/z, \[x1, x5, lsl 3\]
+** 	fcmgt	p2.d, p0/z, z1.d, #0.0
+** 	ld1d	z2.d, p2/z, \[x2, x5, lsl 3\]
+** 	not	p1.b, p0/z, p2.b
+** 	ld1d	z0.d, p1/z, \[x3, x5, lsl 3\]
+** ...
+*/
+
+void f10(double * restrict z, double * restrict w, double * restrict x, double * restrict y, int n)
+{
+    for (int i = 0; i < n; i++) {
+        z[i] = (w[i] > 0) ? x[i] + w[i] : y[i] - w[i];
+    }
+}
+
+/*
+** f11:
+** ...
+** 	ld1d	z0.d, p0/z, \[x1, x2, lsl 3\]
+** 	fcmgt	p2.d, p3/z, z0.d, #0.0
+** 	fcmgt	p1.d, p0/z, z0.d, #0.0
+** 	not	p1.b, p0/z, p1.b
+** 	ld1d	z1.d, p1/z, \[x3, x2, lsl 3\]
+** ...
+*/
+
+void f11(double * restrict z, double * restrict w, double * restrict x, double * restrict y, int n)
+{
+    for (int i = 0; i < n; i++) {
+        z[i] = (w[i] > 0) ? w[i] : y[i];
+    }
+}
+
+
+/*
+** f12:
+** ...
+** 	ld1w	z1.s, p0/z, \[x1, x2, lsl 2\]
+** 	cmple	p1.s, p0/z, z1.s, #0
+** 	ld1w	z0.s, p1/z, \[x3, x2, lsl 2\]
+** ...
+*/
+
+void f12(int * restrict z, int * restrict w, int * restrict x, int * restrict y, int n)
+{
+    for (int i = 0; i < n; i++) {
+        z[i] = (w[i] > 0) ? w[i] : y[i];
+    }
+}


-- 
