* [gcc(refs/users/tnfchris/heads/gcc-14-early-break)] aarch64: Add optimization for vector cbranch combining SVE and Adv. SIMD
@ 2023-11-15 14:56 Tamar Christina
From: Tamar Christina @ 2023-11-15 14:56 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:ea25526de29d14406910ad348e854c83917555b9

commit ea25526de29d14406910ad348e854c83917555b9
Author: Tamar Christina <tamar.christina@arm.com>
Date:   Tue Jun 20 16:11:56 2023 +0100

    aarch64: Add optimization for vector cbranch combining SVE and Adv. SIMD
    
    Reviewed at https://reviewboard.gnu.aws.arm.com/r/17511/

Diff:
---
 gcc/config/aarch64/aarch64-simd.md                 |  69 ++++++++----
 gcc/config/aarch64/aarch64-sve.md                  | 101 +++++++++++++++++-
 gcc/genemit.cc                                     |   1 +
 .../aarch64/sve/vect-early-break-cbranch_1.c       | 117 +++++++++++++++++++++
 .../aarch64/sve/vect-early-break-cbranch_2.c       | 114 ++++++++++++++++++++
 5 files changed, 380 insertions(+), 22 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 31adb649877..60f3758d312 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3883,31 +3883,58 @@
   "TARGET_SIMD"
 {
   auto code = GET_CODE (operands[0]);
-  rtx tmp = operands[1];
-
-  /* If comparing against a non-zero vector we have to do a comparison first
-     so we can have a != 0 comparison with the result.  */
-  if (operands[2] != CONST0_RTX (<MODE>mode))
-    emit_insn (gen_vec_cmp<mode><mode> (tmp, operands[0], operands[1],
-					operands[2]));
-
-  /* For 64-bit vectors we need no reductions.  */
-  if (known_eq (128, GET_MODE_BITSIZE (<MODE>mode)))
+  /* If SVE is available, let's borrow some instructions.  We will optimize
+     these further later in combine.  */
+  if (TARGET_SVE)
     {
-      /* Always reduce using a V4SI.  */
-      rtx reduc = gen_lowpart (V4SImode, tmp);
-      rtx res = gen_reg_rtx (V4SImode);
-      emit_insn (gen_aarch64_umaxpv4si (res, reduc, reduc));
-      emit_move_insn (tmp, gen_lowpart (<MODE>mode, res));
+      machine_mode full_mode = aarch64_full_sve_mode (<VEL>mode).require ();
+      rtx in1 = lowpart_subreg (full_mode, operands[1], <MODE>mode);
+      rtx in2 = lowpart_subreg (full_mode, operands[2], <MODE>mode);
+
+      machine_mode pred_mode = aarch64_sve_pred_mode (full_mode);
+      rtx_vector_builder builder (VNx16BImode, 16, 2);
+      for (unsigned int i = 0; i < 16; ++i)
+	builder.quick_push (CONST1_RTX (BImode));
+      for (unsigned int i = 0; i < 16; ++i)
+	builder.quick_push (CONST0_RTX (BImode));
+      rtx ptrue = force_reg (VNx16BImode, builder.build ());
+      rtx cast_ptrue = gen_lowpart (pred_mode, ptrue);
+      rtx ptrue_flag = gen_int_mode (SVE_KNOWN_PTRUE, SImode);
+
+      rtx tmp = gen_reg_rtx (pred_mode);
+      aarch64_expand_sve_vec_cmp_int (tmp, reverse_condition (code), in1, in2);
+      emit_insn (gen_aarch64_ptest (pred_mode, ptrue, cast_ptrue, ptrue_flag, tmp));
+      operands[1] = gen_rtx_REG (CC_NZCmode, CC_REGNUM);
+      operands[2] = const0_rtx;
     }
+  else
+    {
+      rtx tmp = operands[1];
 
-  rtx val = gen_reg_rtx (DImode);
-  emit_move_insn (val, gen_lowpart (DImode, tmp));
+      /* If comparing against a non-zero vector we have to do a comparison first
+	 so we can have a != 0 comparison with the result.  */
+      if (operands[2] != CONST0_RTX (<MODE>mode))
+	emit_insn (gen_vec_cmp<mode><mode> (tmp, operands[0], operands[1],
+					    operands[2]));
 
-  rtx cc_reg = aarch64_gen_compare_reg (code, val, const0_rtx);
-  rtx cmp_rtx = gen_rtx_fmt_ee (code, DImode, cc_reg, const0_rtx);
-  emit_jump_insn (gen_condjump (cmp_rtx, cc_reg, operands[3]));
-  DONE;
+      /* For 64-bit vectors we need no reductions.  */
+      if (known_eq (128, GET_MODE_BITSIZE (<MODE>mode)))
+	{
+	  /* Always reduce using a V4SI.  */
+	  rtx reduc = gen_lowpart (V4SImode, tmp);
+	  rtx res = gen_reg_rtx (V4SImode);
+	  emit_insn (gen_aarch64_umaxpv4si (res, reduc, reduc));
+	  emit_move_insn (tmp, gen_lowpart (<MODE>mode, res));
+	}
+
+      rtx val = gen_reg_rtx (DImode);
+      emit_move_insn (val, gen_lowpart (DImode, tmp));
+
+      rtx cc_reg = aarch64_gen_compare_reg (code, val, const0_rtx);
+      rtx cmp_rtx = gen_rtx_fmt_ee (code, DImode, cc_reg, const0_rtx);
+      emit_jump_insn (gen_condjump (cmp_rtx, cc_reg, operands[3]));
+      DONE;
+    }
 })
 
 ;; Advanced SIMD lacks a vector != comparison, but this is a quite common
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 5a652d8536a..d9cc5c7e562 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -8123,6 +8123,105 @@
   "cmp<cmp_op>\t%0.<Vetype>, %1/z, %2.<Vetype>, %3.d"
 )
 
+;; Predicated integer comparisons over Advanced SIMD arguments in which only
+;; the flags result is interesting.
+(define_insn "*aarch64_pred_cmp<UCOMPARISONS:cmp_op><mode><EQL:code>_neon_ptest"
+  [(set (reg:CC_NZC CC_REGNUM)
+	(unspec:CC_NZC
+	  [(match_operand:VNx16BI 1 "register_operand" "Upl")
+	   (match_operand 4)
+	   (match_operand:SI 5 "aarch64_sve_ptrue_flag")
+	   (unspec:VNx4BI
+	     [(match_operand:VNx4BI 6 "register_operand" "Upl")
+	      (match_operand:SI 7 "aarch64_sve_ptrue_flag")
+	      (EQL:VNx4BI
+		(subreg:SVE_FULL_BHSI
+		 (neg:<V128>
+		  (UCOMPARISONS:<V128>
+		   (match_operand:<V128> 2 "register_operand" "w")
+		   (match_operand:<V128> 3 "aarch64_simd_reg_or_zero" "w"))) 0)
+		(match_operand:SVE_FULL_BHSI 8 "aarch64_simd_imm_zero" "Dz"))]
+	     UNSPEC_PRED_Z)]
+	  UNSPEC_PTEST))
+   (clobber (match_scratch:VNx4BI 0 "=Upa"))]
+  "TARGET_SVE
+   && aarch64_sve_same_pred_for_ptest_p (&operands[4], &operands[6])"
+{
+  operands[2] = lowpart_subreg (<MODE>mode, operands[2], <V128>mode);
+  operands[3] = lowpart_subreg (<MODE>mode, operands[3], <V128>mode);
+  if (EQ == <EQL:CODE>)
+    std::swap (operands[2], operands[3]);
+
+  return "cmp<UCOMPARISONS:cmp_op>\t%0.<Vetype>, %1/z, %2.<Vetype>, %3.<Vetype>";
+}
+)
+
+;; Predicated integer comparisons over Advanced SIMD arguments in which only
+;; the flags result is interesting.
+(define_insn "*aarch64_pred_cmpeq<mode><EQL:code>_neon_ptest"
+  [(set (reg:CC_NZC CC_REGNUM)
+	(unspec:CC_NZC
+	  [(match_operand:VNx16BI 1 "register_operand" "Upl")
+	   (match_operand 4)
+	   (match_operand:SI 5 "aarch64_sve_ptrue_flag")
+	   (unspec:VNx4BI
+	     [(match_operand:VNx4BI 6 "register_operand" "Upl")
+	      (match_operand:SI 7 "aarch64_sve_ptrue_flag")
+	      (EQL:VNx4BI
+		(subreg:SVE_FULL_BHSI
+		 (neg:<V128>
+		  (eq:<V128>
+		   (match_operand:<V128> 2 "register_operand" "w")
+		   (match_operand:<V128> 3 "aarch64_simd_reg_or_zero" "w"))) 0)
+		(match_operand:SVE_FULL_BHSI 8 "aarch64_simd_imm_zero" "Dz"))]
+	     UNSPEC_PRED_Z)]
+	  UNSPEC_PTEST))
+   (clobber (match_scratch:VNx4BI 0 "=Upa"))]
+  "TARGET_SVE
+   && aarch64_sve_same_pred_for_ptest_p (&operands[4], &operands[6])"
+{
+  operands[2] = lowpart_subreg (<MODE>mode, operands[2], <V128>mode);
+  operands[3] = lowpart_subreg (<MODE>mode, operands[3], <V128>mode);
+  if (EQ == <EQL:CODE>)
+    std::swap (operands[2], operands[3]);
+
+  return "cmpeq\t%0.<Vetype>, %1/z, %2.<Vetype>, %3.<Vetype>";
+}
+)
+
+;; Same as the above, but a version for == and !=
+(define_insn "*aarch64_pred_cmpne<mode><EQL:code>_neon_ptest"
+  [(set (reg:CC_NZC CC_REGNUM)
+	(unspec:CC_NZC
+	  [(match_operand:VNx16BI 1 "register_operand" "Upl")
+	   (match_operand 4)
+	   (match_operand:SI 5 "aarch64_sve_ptrue_flag")
+	   (unspec:VNx4BI
+	     [(match_operand:VNx4BI 6 "register_operand" "Upl")
+	      (match_operand:SI 7 "aarch64_sve_ptrue_flag")
+	      (EQL:VNx4BI
+		(subreg:SVE_FULL_BHSI
+		 (plus:<V128>
+		  (eq:<V128>
+		   (match_operand:<V128> 2 "register_operand" "w")
+		   (match_operand:<V128> 3 "aarch64_simd_reg_or_zero" "w"))
+		  (match_operand:<V128> 9 "aarch64_simd_imm_minus_one" "i")) 0)
+		(match_operand:SVE_FULL_BHSI 8 "aarch64_simd_imm_zero" "Dz"))]
+	     UNSPEC_PRED_Z)]
+	  UNSPEC_PTEST))
+   (clobber (match_scratch:VNx4BI 0 "=Upa"))]
+  "TARGET_SVE
+   && aarch64_sve_same_pred_for_ptest_p (&operands[4], &operands[6])"
+{
+  operands[2] = lowpart_subreg (<MODE>mode, operands[2], <V128>mode);
+  operands[3] = lowpart_subreg (<MODE>mode, operands[3], <V128>mode);
+  if (EQ == <EQL:CODE>)
+    std::swap (operands[2], operands[3]);
+
+  return "cmpne\t%0.<Vetype>, %1/z, %2.<Vetype>, %3.<Vetype>";
+}
+)
+
 ;; -------------------------------------------------------------------------
 ;; ---- [INT] While tests
 ;; -------------------------------------------------------------------------
@@ -8602,7 +8701,7 @@
 )
 
 ;; See "Description of UNSPEC_PTEST" above for details.
-(define_insn "aarch64_ptest<mode>"
+(define_insn "@aarch64_ptest<mode>"
   [(set (reg:CC_NZC CC_REGNUM)
 	(unspec:CC_NZC [(match_operand:VNx16BI 0 "register_operand" "Upa")
 			(match_operand 1)
diff --git a/gcc/genemit.cc b/gcc/genemit.cc
index 471fd46a10b..33122bbe6b4 100644
--- a/gcc/genemit.cc
+++ b/gcc/genemit.cc
@@ -895,6 +895,7 @@ from the machine description file `md'.  */\n\n");
   fprintf (file, "#include \"tm-constrs.h\"\n");
   fprintf (file, "#include \"ggc.h\"\n");
   fprintf (file, "#include \"target.h\"\n\n");
+  fprintf (file, "#include \"rtx-vector-builder.h\"\n\n");
 }
 
 auto_vec<const char *, 10> output_files;
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vect-early-break-cbranch_1.c b/gcc/testsuite/gcc.target/aarch64/sve/vect-early-break-cbranch_1.c
new file mode 100644
index 00000000000..c281cfccbe1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/vect-early-break-cbranch_1.c
@@ -0,0 +1,117 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#define N 640
+int a[N] = {0};
+int b[N] = {0};
+
+
+/*
+** f1:
+**	...
+**	cmpgt	p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, #0
+**	ptest	p[0-9]+, p[0-9]+.b
+**	b.any	\.L[0-9]+
+**	...
+*/
+void f1 ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      b[i] += a[i];
+      if (a[i] > 0)
+	break;
+    }
+}
+
+/*
+** f2:
+**	...
+**	cmpge	p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, #0
+**	ptest	p[0-9]+, p[0-9]+.b
+**	b.any	\.L[0-9]+
+**	...
+*/
+void f2 ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      b[i] += a[i];
+      if (a[i] >= 0)
+	break;
+    }
+}
+
+/*
+** f3:
+**	...
+**	cmpeq	p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, #0
+**	ptest	p[0-9]+, p[0-9]+.b
+**	b.any	\.L[0-9]+
+**	...
+*/
+void f3 ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      b[i] += a[i];
+      if (a[i] == 0)
+	break;
+    }
+}
+
+/*
+** f4:
+**	...
+**	cmpne	p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, #0
+**	ptest	p[0-9]+, p[0-9]+.b
+**	b.any	\.L[0-9]+
+**	...
+*/
+void f4 ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      b[i] += a[i];
+      if (a[i] != 0)
+	break;
+    }
+}
+
+/*
+** f5:
+**	...
+**	cmplt	p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, #0
+**	ptest	p[0-9]+, p[0-9]+.b
+**	b.any	\.L[0-9]+
+**	...
+*/
+void f5 ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      b[i] += a[i];
+      if (a[i] < 0)
+	break;
+    }
+}
+
+/*
+** f6:
+**	...
+**	cmple	p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, #0
+**	ptest	p[0-9]+, p[0-9]+.b
+**	b.any	\.L[0-9]+
+**	...
+*/
+void f6 ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      b[i] += a[i];
+      if (a[i] <= 0)
+	break;
+    }
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vect-early-break-cbranch_2.c b/gcc/testsuite/gcc.target/aarch64/sve/vect-early-break-cbranch_2.c
new file mode 100644
index 00000000000..f1ca3eafc5a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/vect-early-break-cbranch_2.c
@@ -0,0 +1,114 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 --param=aarch64-autovec-preference=1" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#define N 640
+int a[N] = {0};
+int b[N] = {0};
+
+
+/*
+** f1:
+**	...
+**	cmgt	v[0-9]+.4s, v[0-9]+.4s, #0
+**	cmpne	p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, #0
+**	b.any	\.L[0-9]+
+**	...
+*/
+void f1 ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      b[i] += a[i];
+      if (a[i] > 0)
+	break;
+    }
+}
+
+/*
+** f2:
+**	...
+**	cmge	v[0-9]+.4s, v[0-9]+.4s, #0
+**	cmpne	p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, #0
+**	b.any	\.L[0-9]+
+**	...
+*/
+void f2 ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      b[i] += a[i];
+      if (a[i] >= 0)
+	break;
+    }
+}
+
+/*
+** f3:
+**	...
+**	cmpeq	p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, z[0-9]+.s
+**	b.any	\.L[0-9]+
+**	...
+*/
+void f3 ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      b[i] += a[i];
+      if (a[i] == 0)
+	break;
+    }
+}
+
+/*
+** f4:
+**	...
+**	cmpne	p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, z[0-9]+.s
+**	b.any	\.L[0-9]+
+**	...
+*/
+void f4 ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      b[i] += a[i];
+      if (a[i] != 0)
+	break;
+    }
+}
+
+/*
+** f5:
+**	...
+**	cmlt	v[0-9]+.4s, v[0-9]+.4s, #0
+**	cmpne	p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, #0
+**	b.any	\.L[0-9]+
+**	...
+*/
+void f5 ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      b[i] += a[i];
+      if (a[i] < 0)
+	break;
+    }
+}
+
+/*
+** f6:
+**	...
+**	cmle	v[0-9]+.4s, v[0-9]+.4s, #0
+**	cmpne	p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, #0
+**	b.any	\.L[0-9]+
+**	...
+*/
+void f6 ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      b[i] += a[i];
+      if (a[i] <= 0)
+	break;
+    }
+}
