* [gcc(refs/users/tnfchris/heads/gcc-14-early-break)] aarch64: Add optimization for vector cbranch combining SVE and Adv. SIMD
From: Tamar Christina @ 2023-11-15 14:56 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:ea25526de29d14406910ad348e854c83917555b9
commit ea25526de29d14406910ad348e854c83917555b9
Author: Tamar Christina <tamar.christina@arm.com>
Date: Tue Jun 20 16:11:56 2023 +0100
aarch64: Add optimization for vector cbranch combining SVE and Adv. SIMD
Reviewed at https://reviewboard.gnu.aws.arm.com/r/17511/
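For illustration, a minimal sketch of the kind of early-break loop this
change targets (names and constants mirror the new tests below):

    #define N 640
    int a[N], b[N];

    void
    f (void)
    {
      for (int i = 0; i < N; i++)
        {
          b[i] += a[i];
          if (a[i] > 0)   /* Vectorized early exit -> vector cbranch.  */
            break;
        }
    }

Without SVE the exit mask is reduced to a scalar before branching
(roughly a cmgt/umaxp/fmov/cbnz sequence); with TARGET_SVE the expander
instead emits a predicated compare and branches on the flags, which
vect-early-break-cbranch_1.c below checks as cmpgt + ptest + b.any.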
Diff:
---
gcc/config/aarch64/aarch64-simd.md | 69 ++++++++----
gcc/config/aarch64/aarch64-sve.md | 101 +++++++++++++++++-
gcc/genemit.cc | 1 +
.../aarch64/sve/vect-early-break-cbranch_1.c | 117 +++++++++++++++++++++
.../aarch64/sve/vect-early-break-cbranch_2.c | 114 ++++++++++++++++++++
5 files changed, 380 insertions(+), 22 deletions(-)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 31adb649877..60f3758d312 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3883,31 +3883,58 @@
"TARGET_SIMD"
{
auto code = GET_CODE (operands[0]);
- rtx tmp = operands[1];
-
- /* If comparing against a non-zero vector we have to do a comparison first
- so we can have a != 0 comparison with the result. */
- if (operands[2] != CONST0_RTX (<MODE>mode))
- emit_insn (gen_vec_cmp<mode><mode> (tmp, operands[0], operands[1],
- operands[2]));
-
- /* For 64-bit vectors we need no reductions. */
- if (known_eq (128, GET_MODE_BITSIZE (<MODE>mode)))
+ /* If SVE is available, let's borrow some instructions. We will optimize
+ these further later in combine. */
+ if (TARGET_SVE)
{
- /* Always reduce using a V4SI. */
- rtx reduc = gen_lowpart (V4SImode, tmp);
- rtx res = gen_reg_rtx (V4SImode);
- emit_insn (gen_aarch64_umaxpv4si (res, reduc, reduc));
- emit_move_insn (tmp, gen_lowpart (<MODE>mode, res));
+ machine_mode full_mode = aarch64_full_sve_mode (<VEL>mode).require ();
+ rtx in1 = lowpart_subreg (full_mode, operands[1], <MODE>mode);
+ rtx in2 = lowpart_subreg (full_mode, operands[2], <MODE>mode);
+
+ machine_mode pred_mode = aarch64_sve_pred_mode (full_mode);
+ rtx_vector_builder builder (VNx16BImode, 16, 2);
+ for (unsigned int i = 0; i < 16; ++i)
+ builder.quick_push (CONST1_RTX (BImode));
+ for (unsigned int i = 0; i < 16; ++i)
+ builder.quick_push (CONST0_RTX (BImode));
+ rtx ptrue = force_reg (VNx16BImode, builder.build ());
+ rtx cast_ptrue = gen_lowpart (pred_mode, ptrue);
+ rtx ptrue_flag = gen_int_mode (SVE_KNOWN_PTRUE, SImode);
+
+ rtx tmp = gen_reg_rtx (pred_mode);
+ aarch64_expand_sve_vec_cmp_int (tmp, reverse_condition (code), in1, in2);
+ emit_insn (gen_aarch64_ptest (pred_mode, ptrue, cast_ptrue, ptrue_flag, tmp));
+ operands[1] = gen_rtx_REG (CC_NZCmode, CC_REGNUM);
+ operands[2] = const0_rtx;
}
+ else
+ {
+ rtx tmp = operands[1];
- rtx val = gen_reg_rtx (DImode);
- emit_move_insn (val, gen_lowpart (DImode, tmp));
+ /* If comparing against a non-zero vector we have to do a comparison first
+ so we can have a != 0 comparison with the result. */
+ if (operands[2] != CONST0_RTX (<MODE>mode))
+ emit_insn (gen_vec_cmp<mode><mode> (tmp, operands[0], operands[1],
+ operands[2]));
- rtx cc_reg = aarch64_gen_compare_reg (code, val, const0_rtx);
- rtx cmp_rtx = gen_rtx_fmt_ee (code, DImode, cc_reg, const0_rtx);
- emit_jump_insn (gen_condjump (cmp_rtx, cc_reg, operands[3]));
- DONE;
+ /* For 64-bit vectors we need no reductions. */
+ if (known_eq (128, GET_MODE_BITSIZE (<MODE>mode)))
+ {
+ /* Always reduce using a V4SI. */
+ rtx reduc = gen_lowpart (V4SImode, tmp);
+ rtx res = gen_reg_rtx (V4SImode);
+ emit_insn (gen_aarch64_umaxpv4si (res, reduc, reduc));
+ emit_move_insn (tmp, gen_lowpart (<MODE>mode, res));
+ }
+
+ rtx val = gen_reg_rtx (DImode);
+ emit_move_insn (val, gen_lowpart (DImode, tmp));
+
+ rtx cc_reg = aarch64_gen_compare_reg (code, val, const0_rtx);
+ rtx cmp_rtx = gen_rtx_fmt_ee (code, DImode, cc_reg, const0_rtx);
+ emit_jump_insn (gen_condjump (cmp_rtx, cc_reg, operands[3]));
+ DONE;
+ }
})
;; Advanced SIMD lacks a vector != comparison, but this is quite a common
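
A note on the predicate constant built in the hunk above:
rtx_vector_builder (VNx16BImode, 16, 2) encodes 16 patterns of 2
elements each, so lanes 0-15 take the first pushed value (true) and
every later lane repeats the second (false).  The "ptrue" therefore
governs exactly the low 128 bits of an SVE register, i.e. the Adv. SIMD
part that actually holds data.  A minimal model of the lane layout,
assuming a hypothetical 256-bit vector length:

    #include <stdio.h>

    /* Model (not GCC code) of the VNx16BI constant built above:
       lanes 0-15 are active, every later lane repeats inactive.  */
    static int
    predicate_lane (unsigned lane)
    {
      return lane < 16;
    }

    int
    main (void)
    {
      /* At a hypothetical 256-bit vector length there are 32 byte-sized
         predicate lanes; only the low 16 -- covering the 128-bit
         Adv. SIMD portion of the register -- are set.  */
      for (unsigned lane = 0; lane < 32; ++lane)
        printf ("%d", predicate_lane (lane));
      printf ("\n");
      return 0;
    }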
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 5a652d8536a..d9cc5c7e562 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -8123,6 +8123,105 @@
"cmp<cmp_op>\t%0.<Vetype>, %1/z, %2.<Vetype>, %3.d"
)
+;; Predicated integer comparisons over Advanced SIMD arguments in which only
+;; the flags result is interesting.
+(define_insn "*aarch64_pred_cmp<UCOMPARISONS:cmp_op><mode><EQL:code>_neon_ptest"
+ [(set (reg:CC_NZC CC_REGNUM)
+ (unspec:CC_NZC
+ [(match_operand:VNx16BI 1 "register_operand" "Upl")
+ (match_operand 4)
+ (match_operand:SI 5 "aarch64_sve_ptrue_flag")
+ (unspec:VNx4BI
+ [(match_operand:VNx4BI 6 "register_operand" "Upl")
+ (match_operand:SI 7 "aarch64_sve_ptrue_flag")
+ (EQL:VNx4BI
+ (subreg:SVE_FULL_BHSI
+ (neg:<V128>
+ (UCOMPARISONS:<V128>
+ (match_operand:<V128> 2 "register_operand" "w")
+ (match_operand:<V128> 3 "aarch64_simd_reg_or_zero" "w"))) 0)
+ (match_operand:SVE_FULL_BHSI 8 "aarch64_simd_imm_zero" "Dz"))]
+ UNSPEC_PRED_Z)]
+ UNSPEC_PTEST))
+ (clobber (match_scratch:VNx4BI 0 "=Upa"))]
+ "TARGET_SVE
+ && aarch64_sve_same_pred_for_ptest_p (&operands[4], &operands[6])"
+{
+ operands[2] = lowpart_subreg (<MODE>mode, operands[2], <V128>mode);
+ operands[3] = lowpart_subreg (<MODE>mode, operands[3], <V128>mode);
+ if (EQ == <EQL:CODE>)
+ std::swap (operands[2], operands[3]);
+
+ return "cmp<UCOMPARISONS:cmp_op>\t%0.<Vetype>, %1/z, %2.<Vetype>, %3.<Vetype>";
+}
+)
+
+;; Predicated integer comparisons over Advanced SIMD arguments in which only
+;; the flags result is interesting.
+(define_insn "*aarch64_pred_cmpeq<mode><EQL:code>_neon_ptest"
+ [(set (reg:CC_NZC CC_REGNUM)
+ (unspec:CC_NZC
+ [(match_operand:VNx16BI 1 "register_operand" "Upl")
+ (match_operand 4)
+ (match_operand:SI 5 "aarch64_sve_ptrue_flag")
+ (unspec:VNx4BI
+ [(match_operand:VNx4BI 6 "register_operand" "Upl")
+ (match_operand:SI 7 "aarch64_sve_ptrue_flag")
+ (EQL:VNx4BI
+ (subreg:SVE_FULL_BHSI
+ (neg:<V128>
+ (eq:<V128>
+ (match_operand:<V128> 2 "register_operand" "w")
+ (match_operand:<V128> 3 "aarch64_simd_reg_or_zero" "w"))) 0)
+ (match_operand:SVE_FULL_BHSI 8 "aarch64_simd_imm_zero" "Dz"))]
+ UNSPEC_PRED_Z)]
+ UNSPEC_PTEST))
+ (clobber (match_scratch:VNx4BI 0 "=Upa"))]
+ "TARGET_SVE
+ && aarch64_sve_same_pred_for_ptest_p (&operands[4], &operands[6])"
+{
+ operands[2] = lowpart_subreg (<MODE>mode, operands[2], <V128>mode);
+ operands[3] = lowpart_subreg (<MODE>mode, operands[3], <V128>mode);
+ if (EQ == <EQL:CODE>)
+ std::swap (operands[2], operands[3]);
+
+ return "cmpeq\t%0.<Vetype>, %1/z, %2.<Vetype>, %3.<Vetype>";
+}
+)
+
+;; Same as the above, but a version for == and !=.
+(define_insn "*aarch64_pred_cmpne<mode><EQL:code>_neon_ptest"
+ [(set (reg:CC_NZC CC_REGNUM)
+ (unspec:CC_NZC
+ [(match_operand:VNx16BI 1 "register_operand" "Upl")
+ (match_operand 4)
+ (match_operand:SI 5 "aarch64_sve_ptrue_flag")
+ (unspec:VNx4BI
+ [(match_operand:VNx4BI 6 "register_operand" "Upl")
+ (match_operand:SI 7 "aarch64_sve_ptrue_flag")
+ (EQL:VNx4BI
+ (subreg:SVE_FULL_BHSI
+ (plus:<V128>
+ (eq:<V128>
+ (match_operand:<V128> 2 "register_operand" "w")
+ (match_operand:<V128> 3 "aarch64_simd_reg_or_zero" "w"))
+ (match_operand:<V128> 9 "aarch64_simd_imm_minus_one" "i")) 0)
+ (match_operand:SVE_FULL_BHSI 8 "aarch64_simd_imm_zero" "Dz"))]
+ UNSPEC_PRED_Z)]
+ UNSPEC_PTEST))
+ (clobber (match_scratch:VNx4BI 0 "=Upa"))]
+ "TARGET_SVE
+ && aarch64_sve_same_pred_for_ptest_p (&operands[4], &operands[6])"
+{
+ operands[2] = lowpart_subreg (<MODE>mode, operands[2], <V128>mode);
+ operands[3] = lowpart_subreg (<MODE>mode, operands[3], <V128>mode);
+ if (EQ == <EQL:CODE>)
+ std::swap (operands[2], operands[3]);
+
+ return "cmpne\t%0.<Vetype>, %1/z, %2.<Vetype>, %3.<Vetype>";
+}
+)
+
;; -------------------------------------------------------------------------
;; ---- [INT] While tests
;; -------------------------------------------------------------------------
@@ -8602,7 +8701,7 @@
)
;; See "Description of UNSPEC_PTEST" above for details.
-(define_insn "aarch64_ptest<mode>"
+(define_insn "@aarch64_ptest<mode>"
[(set (reg:CC_NZC CC_REGNUM)
(unspec:CC_NZC [(match_operand:VNx16BI 0 "register_operand" "Upa")
(match_operand 1)
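
The *_neon_ptest patterns added above are combine targets: once the
cbranch expander from the first hunk emits an Adv. SIMD compare followed
by an SVE predicate test, combine can fuse the two into a single
flag-setting SVE compare.  A sketch of the intended end result for a !=
exit, assuming the conditions of vect-early-break-cbranch_2.c below
(-O3 with --param=aarch64-autovec-preference=1; the function name is
illustrative):

    void
    g (int *restrict a, int *restrict b)
    {
      for (int i = 0; i < 640; i++)
        {
          b[i] += a[i];
          /* Expected (per the test's scan patterns):
               cmpne  p?.s, p?/z, z?.s, z?.s
               b.any  .L?  */
          if (a[i] != 0)
            break;
        }
    }

For the ordered compares (>, >=, <, <=) the Adv. SIMD cmgt/cmge/cmlt/cmle
is kept and only the reduction is replaced by cmpne + b.any.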
diff --git a/gcc/genemit.cc b/gcc/genemit.cc
index 471fd46a10b..33122bbe6b4 100644
--- a/gcc/genemit.cc
+++ b/gcc/genemit.cc
@@ -895,6 +895,7 @@ from the machine description file `md'. */\n\n");
fprintf (file, "#include \"tm-constrs.h\"\n");
fprintf (file, "#include \"ggc.h\"\n");
fprintf (file, "#include \"target.h\"\n\n");
+ fprintf (file, "#include \"rtx-vector-builder.h\"\n\n");
}
auto_vec<const char *, 10> output_files;
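
The extra include is needed because the cbranch expander now runs
rtx_vector_builder code inside a generated insn-emit file.  An assumed
excerpt of the generated prologue, reconstructed from the fprintf calls
above:

    /* Reconstructed (assumed) tail of the header block genemit.cc
       writes into each generated insn-emit*.cc.  */
    #include "tm-constrs.h"
    #include "ggc.h"
    #include "target.h"

    #include "rtx-vector-builder.h"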
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vect-early-break-cbranch_1.c b/gcc/testsuite/gcc.target/aarch64/sve/vect-early-break-cbranch_1.c
new file mode 100644
index 00000000000..c281cfccbe1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/vect-early-break-cbranch_1.c
@@ -0,0 +1,117 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#define N 640
+int a[N] = {0};
+int b[N] = {0};
+
+
+/*
+** f1:
+** ...
+** cmpgt p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, #0
+** ptest p[0-9]+, p[0-9]+.b
+** b.any \.L[0-9]+
+** ...
+*/
+void f1 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] > 0)
+ break;
+ }
+}
+
+/*
+** f2:
+** ...
+** cmpge p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, #0
+** ptest p[0-9]+, p[0-9]+.b
+** b.any \.L[0-9]+
+** ...
+*/
+void f2 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] >= 0)
+ break;
+ }
+}
+
+/*
+** f3:
+** ...
+** cmpeq p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, #0
+** ptest p[0-9]+, p[0-9]+.b
+** b.any \.L[0-9]+
+** ...
+*/
+void f3 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] == 0)
+ break;
+ }
+}
+
+/*
+** f4:
+** ...
+** cmpne p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, #0
+** ptest p[0-9]+, p[0-9]+.b
+** b.any \.L[0-9]+
+** ...
+*/
+void f4 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] != 0)
+ break;
+ }
+}
+
+/*
+** f5:
+** ...
+** cmplt p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, #0
+** ptest p[0-9]+, p[0-9]+.b
+** b.any \.L[0-9]+
+** ...
+*/
+void f5 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] < 0)
+ break;
+ }
+}
+
+/*
+** f6:
+** ...
+** cmple p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, #0
+** ptest p[0-9]+, p[0-9]+.b
+** b.any \.L[0-9]+
+** ...
+*/
+void f6 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] <= 0)
+ break;
+ }
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vect-early-break-cbranch_2.c b/gcc/testsuite/gcc.target/aarch64/sve/vect-early-break-cbranch_2.c
new file mode 100644
index 00000000000..f1ca3eafc5a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/vect-early-break-cbranch_2.c
@@ -0,0 +1,114 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 --param=aarch64-autovec-preference=1" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#define N 640
+int a[N] = {0};
+int b[N] = {0};
+
+
+/*
+** f1:
+** ...
+** cmgt v[0-9]+.4s, v[0-9]+.4s, #0
+** cmpne p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, #0
+** b.any \.L[0-9]+
+** ...
+*/
+void f1 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] > 0)
+ break;
+ }
+}
+
+/*
+** f2:
+** ...
+** cmge v[0-9]+.4s, v[0-9]+.4s, #0
+** cmpne p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, #0
+** b.any \.L[0-9]+
+** ...
+*/
+void f2 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] >= 0)
+ break;
+ }
+}
+
+/*
+** f3:
+** ...
+** cmpeq p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, z[0-9]+.s
+** b.any \.L[0-9]+
+** ...
+*/
+void f3 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] == 0)
+ break;
+ }
+}
+
+/*
+** f4:
+** ...
+** cmpne p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, z[0-9]+.s
+** b.any \.L[0-9]+
+** ...
+*/
+void f4 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] != 0)
+ break;
+ }
+}
+
+/*
+** f5:
+** ...
+** cmlt v[0-9]+.4s, v[0-9]+.4s, #0
+** cmpne p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, #0
+** b.any \.L[0-9]+
+** ...
+*/
+void f5 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] < 0)
+ break;
+ }
+}
+
+/*
+** f6:
+** ...
+** cmle v[0-9]+.4s, v[0-9]+.4s, #0
+** cmpne p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, #0
+** b.any \.L[0-9]+
+** ...
+*/
+void f6 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] <= 0)
+ break;
+ }
+}