From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <tnfchris@sourceware.org>
Received: by sourceware.org (Postfix, from userid 1984)
	id A91803857B9B; Wed, 28 Jun 2023 13:33:23 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org A91803857B9B
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org;
	s=default; t=1687959203;
	bh=949vlIgEE88wUpoKp9VP+BuwMAHnpK7IOeUZAKEpZLo=;
	h=From:To:Subject:Date:From;
	b=BmI0uE9oL2OCdUE3eLQ0FPTMk+h3gtOFy5+hy2aPS3WYzeXbm4BAyRULobdSdXSGr
	 P5kNAua18U9W7lF48cpbivAhqehE7VRJWUi1wWftZF1Yn8csjjCZeHMd7Dz3U824iW
	 JuUWQ0L9NNT6IT0HudCMmGw50r4lt5Fv5eQL0YSU=
Content-Type: text/plain; charset="us-ascii"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
From: Tamar Christina <tnfchris@gcc.gnu.org>
To: gcc-cvs@gcc.gnu.org
Subject: [gcc(refs/users/tnfchris/heads/gcc-14-early-break)] Add optimization
 for vector cbranch combining SVE and Advanced SIMD
X-Act-Checkin: gcc
X-Git-Author: Tamar Christina <tamar.christina@arm.com>
X-Git-Refname: refs/users/tnfchris/heads/gcc-14-early-break
X-Git-Oldrev: bdc0399b07d5cd12afdd400cbb194a14f0ce7575
X-Git-Newrev: d9a94d04881d711cdd4b5a0c590d7ce849b5b444
Message-Id: <20230628133323.A91803857B9B@sourceware.org>
Date: Wed, 28 Jun 2023 13:33:23 +0000 (GMT)
List-Id: <gcc-cvs.sourceware.org>

https://gcc.gnu.org/g:d9a94d04881d711cdd4b5a0c590d7ce849b5b444

commit d9a94d04881d711cdd4b5a0c590d7ce849b5b444
Author: Tamar Christina <tamar.christina@arm.com>
Date:   Wed Jun 28 14:22:32 2023 +0100

    Add optimization for vector cbranch combining SVE and Advanced SIMD
    
    Advanced SIMD lacks flag setting vector comparisons which SVE adds.  Since machines
    with SVE also support Advanced SIMD we can use the SVE comparisons to perform the
    operation in cases where SVE codegen is allowed, but the vectorizer has decided
    to generate Advanced SIMD because of loop costing.
    
    e.g. for
    
    void f1 (int x)
    {
      for (int i = 0; i < N; i++)
        {
          b[i] += a[i];
          if (a[i] != x)
            break;
        }
    }
    
    We currently generate:
    
            cmeq    v31.4s, v31.4s, v28.4s
            uminp   v31.4s, v31.4s, v31.4s
            fmov    x5, d31
            cbz     x5, .L2
    
    and after this patch:
    
            ptrue   p7.b, vl16
            ...
            cmpne   p15.s, p7/z, z31.s, z28.s
            b.any   .L2
    
    Because we need to lift the predicate creation to outside of the loop we need to
    expand the predicate early,  however in the cbranch expansion we don't see the
    outer compare which we need to consume.
    
    For this reason the expansion is two fold, when expanding the cbranch we emit an
    SVE predicated comparison and later on during combine we match the SVE and NEON
    comparison while also consuming the ptest.
    
    Unfortunately *aarch64_pred_cmpne<mode><EQL:code>_neon_ptest is needed because
    for some reason combine destroys the NOT and transforms it into a plus and -1.
    
    For the straight SVE ones, we seem to fail to eliminate the ptest in these cases
    but that's a separate optimization
    
    Test show that I'm missing a few, but before I write the patterns for them, are
    these OK?
    
    gcc/ChangeLog:
    
            * config/aarch64/aarch64-simd.md (cbranch<mode>4): Update with SVE.
            * config/aarch64/aarch64-sve.md
            (*aarch64_pred_cmp<UCOMPARISONS:cmp_op><mode><EQL:code>_neon_ptest,
            *aarch64_pred_cmpeq<mode><EQL:code>_neon_ptest,
            *aarch64_pred_cmpne<mode><EQL:code>_neon_ptest): New.
            (aarch64_ptest<mode>): Rename to...
            (@aarch64_ptest<mode>): ... This.
            * genemit.cc: Include rtx-vector-builder.h.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.target/aarch64/sve/vect-early-break-cbranch_1.c: New test.
            * gcc.target/aarch64/sve/vect-early-break-cbranch_2.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64-simd.md                 |  68 ++++++++----
 gcc/config/aarch64/aarch64-sve.md                  | 101 +++++++++++++++++-
 gcc/genemit.cc                                     |   1 +
 .../aarch64/sve/vect-early-break-cbranch_1.c       | 117 +++++++++++++++++++++
 .../aarch64/sve/vect-early-break-cbranch_2.c       | 114 ++++++++++++++++++++
 5 files changed, 380 insertions(+), 21 deletions(-)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index b1a2c617d7d..75cb5d6f7f9 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3843,31 +3843,59 @@
   "TARGET_SIMD"
 {
   auto code = GET_CODE (operands[0]);
-  rtx tmp = operands[1];
 
-  /* If comparing against a non-zero vector we have to do a comparison first
-     so we can have a != 0 comparison with the result.  */
-  if (operands[2] != CONST0_RTX (<MODE>mode))
-    emit_insn (gen_vec_cmp<mode><mode> (tmp, operands[0], operands[1],
-					operands[2]));
-
-  /* For 64-bit vectors we need no reductions.  */
-  if (known_eq (128, GET_MODE_BITSIZE (<MODE>mode)))
+  /* If SVE is available, lets borrow some instructions.  We will optimize
+     these further later in combine.  */
+  if (TARGET_SVE)
     {
-      /* Always reduce using a V4SI.  */
-      rtx reduc = gen_lowpart (V4SImode, tmp);
-      rtx res = gen_reg_rtx (V4SImode);
-      emit_insn (gen_aarch64_umaxpv4si (res, reduc, reduc));
-      emit_move_insn (tmp, gen_lowpart (<MODE>mode, res));
+      machine_mode full_mode = aarch64_full_sve_mode (<VEL>mode).require ();
+      rtx in1 = lowpart_subreg (full_mode, operands[1], <MODE>mode);
+      rtx in2 = lowpart_subreg (full_mode, operands[2], <MODE>mode);
+
+      machine_mode pred_mode = aarch64_sve_pred_mode (full_mode);
+      rtx_vector_builder builder (VNx16BImode, 16, 2);
+      for (unsigned int i = 0; i < 16; ++i)
+	builder.quick_push (CONST1_RTX (BImode));
+      for (unsigned int i = 0; i < 16; ++i)
+	builder.quick_push (CONST0_RTX (BImode));
+      rtx ptrue = force_reg (VNx16BImode, builder.build ());
+      rtx cast_ptrue = gen_lowpart (pred_mode, ptrue);
+      rtx ptrue_flag = gen_int_mode (SVE_KNOWN_PTRUE, SImode);
+
+      rtx tmp = gen_reg_rtx (pred_mode);
+      aarch64_expand_sve_vec_cmp_int (tmp, code, in1, in2);
+      emit_insn (gen_aarch64_ptest (pred_mode, ptrue, cast_ptrue, ptrue_flag, tmp));
+      operands[1] = gen_rtx_REG (CC_NZCmode, CC_REGNUM);
+      operands[2] = const0_rtx;
     }
+  else
+    {
+      rtx tmp = operands[1];
 
-  rtx val = gen_reg_rtx (DImode);
-  emit_move_insn (val, gen_lowpart (DImode, tmp));
+      /* If comparing against a non-zero vector we have to do a comparison first
+	 so we can have a != 0 comparison with the result.  */
+      if (operands[2] != CONST0_RTX (<MODE>mode))
+	emit_insn (gen_vec_cmp<mode><mode> (tmp, operands[0], operands[1],
+					    operands[2]));
 
-  rtx cc_reg = aarch64_gen_compare_reg (code, val, const0_rtx);
-  rtx cmp_rtx = gen_rtx_fmt_ee (code, DImode, cc_reg, const0_rtx);
-  emit_jump_insn (gen_condjump (cmp_rtx, cc_reg, operands[3]));
-  DONE;
+      /* For 64-bit vectors we need no reductions.  */
+      if (known_eq (128, GET_MODE_BITSIZE (<MODE>mode)))
+	{
+	  /* Always reduce using a V4SI.  */
+	  rtx reduc = gen_lowpart (V4SImode, tmp);
+	  rtx res = gen_reg_rtx (V4SImode);
+	  emit_insn (gen_aarch64_umaxpv4si (res, reduc, reduc));
+	  emit_move_insn (tmp, gen_lowpart (<MODE>mode, res));
+	}
+
+      rtx val = gen_reg_rtx (DImode);
+      emit_move_insn (val, gen_lowpart (DImode, tmp));
+
+      rtx cc_reg = aarch64_gen_compare_reg (code, val, const0_rtx);
+      rtx cmp_rtx = gen_rtx_fmt_ee (code, DImode, cc_reg, const0_rtx);
+      emit_jump_insn (gen_condjump (cmp_rtx, cc_reg, operands[3]));
+      DONE;
+    }
 })
 
 ;; Avdanced SIMD lacks a vector != comparison, but this is a quite common
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index da5534c3e32..0e10e497e07 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -8059,6 +8059,105 @@
   "cmp<cmp_op>\t%0.<Vetype>, %1/z, %2.<Vetype>, %3.d"
 )
 
+;; Predicated integer comparisons over Advanced SIMD arguments in which only
+;; the flags result is interesting.
+(define_insn "*aarch64_pred_cmp<UCOMPARISONS:cmp_op><mode><EQL:code>_neon_ptest"
+  [(set (reg:CC_NZC CC_REGNUM)
+	(unspec:CC_NZC
+	  [(match_operand:VNx16BI 1 "register_operand" "Upl")
+	   (match_operand 4)
+	   (match_operand:SI 5 "aarch64_sve_ptrue_flag")
+	   (unspec:VNx4BI
+	     [(match_operand:VNx4BI 6 "register_operand" "Upl")
+	      (match_operand:SI 7 "aarch64_sve_ptrue_flag")
+	      (EQL:VNx4BI
+		(subreg:SVE_FULL_BHSI
+		 (neg:<V128>
+		  (UCOMPARISONS:<V128>
+		   (match_operand:<V128> 2 "register_operand" "w")
+		   (match_operand:<V128> 3 "aarch64_simd_reg_or_zero" "w"))) 0)
+		(match_operand:SVE_FULL_BHSI 8 "aarch64_simd_imm_zero" "Dz"))]
+	     UNSPEC_PRED_Z)]
+	  UNSPEC_PTEST))
+   (clobber (match_scratch:VNx4BI 0 "=Upa"))]
+  "TARGET_SVE
+   && aarch64_sve_same_pred_for_ptest_p (&operands[4], &operands[6])"
+{
+  operands[2] = lowpart_subreg (<MODE>mode, operands[2], <V128>mode);
+  operands[3] = lowpart_subreg (<MODE>mode, operands[3], <V128>mode);
+  if (EQ == <EQL:CODE>)
+    std::swap (operands[2], operands[3]);
+
+  return "cmp<UCOMPARISONS:cmp_op>\t%0.<Vetype>, %1/z, %2.<Vetype>, %3.<Vetype>";
+}
+)
+
+;; Predicated integer comparisons over Advanced SIMD arguments in which only
+;; the flags result is interesting.
+(define_insn "*aarch64_pred_cmpeq<mode><EQL:code>_neon_ptest"
+  [(set (reg:CC_NZC CC_REGNUM)
+	(unspec:CC_NZC
+	  [(match_operand:VNx16BI 1 "register_operand" "Upl")
+	   (match_operand 4)
+	   (match_operand:SI 5 "aarch64_sve_ptrue_flag")
+	   (unspec:VNx4BI
+	     [(match_operand:VNx4BI 6 "register_operand" "Upl")
+	      (match_operand:SI 7 "aarch64_sve_ptrue_flag")
+	      (EQL:VNx4BI
+		(subreg:SVE_FULL_BHSI
+		 (neg:<V128>
+		  (eq:<V128>
+		   (match_operand:<V128> 2 "register_operand" "w")
+		   (match_operand:<V128> 3 "aarch64_simd_reg_or_zero" "w"))) 0)
+		(match_operand:SVE_FULL_BHSI 8 "aarch64_simd_imm_zero" "Dz"))]
+	     UNSPEC_PRED_Z)]
+	  UNSPEC_PTEST))
+   (clobber (match_scratch:VNx4BI 0 "=Upa"))]
+  "TARGET_SVE
+   && aarch64_sve_same_pred_for_ptest_p (&operands[4], &operands[6])"
+{
+  operands[2] = lowpart_subreg (<MODE>mode, operands[2], <V128>mode);
+  operands[3] = lowpart_subreg (<MODE>mode, operands[3], <V128>mode);
+  if (EQ == <EQL:CODE>)
+    std::swap (operands[2], operands[3]);
+
+  return "cmpeq\t%0.<Vetype>, %1/z, %2.<Vetype>, %3.<Vetype>";
+}
+)
+
+;; Same as the above but version for == and !=
+(define_insn "*aarch64_pred_cmpne<mode><EQL:code>_neon_ptest"
+  [(set (reg:CC_NZC CC_REGNUM)
+	(unspec:CC_NZC
+	  [(match_operand:VNx16BI 1 "register_operand" "Upl")
+	   (match_operand 4)
+	   (match_operand:SI 5 "aarch64_sve_ptrue_flag")
+	   (unspec:VNx4BI
+	     [(match_operand:VNx4BI 6 "register_operand" "Upl")
+	      (match_operand:SI 7 "aarch64_sve_ptrue_flag")
+	      (EQL:VNx4BI
+		(subreg:SVE_FULL_BHSI
+		 (plus:<V128>
+		  (eq:<V128>
+		   (match_operand:<V128> 2 "register_operand" "w")
+		   (match_operand:<V128> 3 "aarch64_simd_reg_or_zero" "w"))
+		  (match_operand:<V128> 9 "aarch64_simd_imm_minus_one" "i")) 0)
+		(match_operand:SVE_FULL_BHSI 8 "aarch64_simd_imm_zero" "Dz"))]
+	     UNSPEC_PRED_Z)]
+	  UNSPEC_PTEST))
+   (clobber (match_scratch:VNx4BI 0 "=Upa"))]
+  "TARGET_SVE
+   && aarch64_sve_same_pred_for_ptest_p (&operands[4], &operands[6])"
+{
+  operands[2] = lowpart_subreg (<MODE>mode, operands[2], <V128>mode);
+  operands[3] = lowpart_subreg (<MODE>mode, operands[3], <V128>mode);
+  if (EQ == <EQL:CODE>)
+    std::swap (operands[2], operands[3]);
+
+  return "cmpne\t%0.<Vetype>, %1/z, %2.<Vetype>, %3.<Vetype>";
+}
+)
+
 ;; -------------------------------------------------------------------------
 ;; ---- [INT] While tests
 ;; -------------------------------------------------------------------------
@@ -8537,7 +8636,7 @@
 )
 
 ;; See "Description of UNSPEC_PTEST" above for details.
-(define_insn "aarch64_ptest<mode>"
+(define_insn "@aarch64_ptest<mode>"
   [(set (reg:CC_NZC CC_REGNUM)
 	(unspec:CC_NZC [(match_operand:VNx16BI 0 "register_operand" "Upa")
 			(match_operand 1)
diff --git a/gcc/genemit.cc b/gcc/genemit.cc
index 1ce0564076d..73309ca00ec 100644
--- a/gcc/genemit.cc
+++ b/gcc/genemit.cc
@@ -906,6 +906,7 @@ from the machine description file `md'.  */\n\n");
   printf ("#include \"tm-constrs.h\"\n");
   printf ("#include \"ggc.h\"\n");
   printf ("#include \"target.h\"\n\n");
+  printf ("#include \"rtx-vector-builder.h\"\n\n");
 
   /* Read the machine description.  */
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vect-early-break-cbranch_1.c b/gcc/testsuite/gcc.target/aarch64/sve/vect-early-break-cbranch_1.c
new file mode 100644
index 00000000000..c281cfccbe1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/vect-early-break-cbranch_1.c
@@ -0,0 +1,117 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#define N 640
+int a[N] = {0};
+int b[N] = {0};
+
+
+/*
+** f1:
+**	...
+**	cmpgt	p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, #0
+**	ptest	p[0-9]+, p[0-9]+.b
+**	b.any	\.L[0-9]+
+**	...
+*/
+void f1 ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      b[i] += a[i];
+      if (a[i] > 0)
+	break;
+    }
+}
+
+/*
+** f2:
+**	...
+**	cmpge	p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, #0
+**	ptest	p[0-9]+, p[0-9]+.b
+**	b.any	\.L[0-9]+
+**	...
+*/
+void f2 ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      b[i] += a[i];
+      if (a[i] >= 0)
+	break;
+    }
+}
+
+/*
+** f3:
+**	...
+**	cmpeq	p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, #0
+**	ptest	p[0-9]+, p[0-9]+.b
+**	b.any	\.L[0-9]+
+**	...
+*/
+void f3 ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      b[i] += a[i];
+      if (a[i] == 0)
+	break;
+    }
+}
+
+/*
+** f4:
+**	...
+**	cmpne	p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, #0
+**	ptest	p[0-9]+, p[0-9]+.b
+**	b.any	\.L[0-9]+
+**	...
+*/
+void f4 ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      b[i] += a[i];
+      if (a[i] != 0)
+	break;
+    }
+}
+
+/*
+** f5:
+**	...
+**	cmplt	p[0-9]+.s, p7/z, z[0-9]+.s, #0
+**	ptest	p[0-9]+, p[0-9]+.b
+**	b.any	.L[0-9]+
+**	...
+*/
+void f5 ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      b[i] += a[i];
+      if (a[i] < 0)
+	break;
+    }
+}
+
+/*
+** f6:
+**	...
+**	cmple	p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, #0
+**	ptest	p[0-9]+, p[0-9]+.b
+**	b.any	\.L[0-9]+
+**	...
+*/
+void f6 ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      b[i] += a[i];
+      if (a[i] <= 0)
+	break;
+    }
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vect-early-break-cbranch_2.c b/gcc/testsuite/gcc.target/aarch64/sve/vect-early-break-cbranch_2.c
new file mode 100644
index 00000000000..f1ca3eafc5a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/vect-early-break-cbranch_2.c
@@ -0,0 +1,114 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 --param=aarch64-autovec-preference=1" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#define N 640
+int a[N] = {0};
+int b[N] = {0};
+
+
+/*
+** f1:
+**	...
+**	cmgt	v[0-9]+.4s, v[0-9]+.4s, #0
+**	cmpne	p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, #0
+**	b.any	\.L[0-9]+
+**	...
+*/
+void f1 ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      b[i] += a[i];
+      if (a[i] > 0)
+	break;
+    }
+}
+
+/*
+** f2:
+**	...
+**	cmge	v[0-9]+.4s, v[0-9]+.4s, #0
+**	cmpne	p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, #0
+**	b.any	\.L[0-9]+
+**	...
+*/
+void f2 ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      b[i] += a[i];
+      if (a[i] >= 0)
+	break;
+    }
+}
+
+/*
+** f3:
+**	...
+**	cmpeq	p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, z[0-9]+.s
+**	b.any	\.L[0-9]+
+**	...
+*/
+void f3 ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      b[i] += a[i];
+      if (a[i] == 0)
+	break;
+    }
+}
+
+/*
+** f4:
+**	...
+**	cmpne	p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, z[0-9]+.s
+**	b.any	\.L[0-9]+
+**	...
+*/
+void f4 ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      b[i] += a[i];
+      if (a[i] != 0)
+	break;
+    }
+}
+
+/*
+** f5:
+**	...
+**	cmlt	v[0-9]+.4s, v[0-9]+.4s, #0
+**	cmpne	p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, #0
+**	b.any	\.L[0-9]+
+**	...
+*/
+void f5 ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      b[i] += a[i];
+      if (a[i] < 0)
+	break;
+    }
+}
+
+/*
+** f6:
+**	...
+**	cmle	v[0-9]+.4s, v[0-9]+.4s, #0
+**	cmpne	p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, #0
+**	b.any	\.L[0-9]+
+**	...
+*/
+void f6 ()
+{
+  for (int i = 0; i < N; i++)
+    {
+      b[i] += a[i];
+      if (a[i] <= 0)
+	break;
+    }
+}