* [PATCH 20/21]Arm: Add Advanced SIMD cbranch implementation
@ 2023-12-29 14:42 Tamar Christina
2024-01-04 11:06 ` Tamar Christina
0 siblings, 1 reply; 6+ messages in thread
From: Tamar Christina @ 2023-12-29 14:42 UTC (permalink / raw)
To: gcc-patches
Cc: nd, Ramana.Radhakrishnan, Richard.Earnshaw, nickc, Kyrylo.Tkachov
[-- Attachment #1: Type: text/plain, Size: 11342 bytes --]
Hi All,
This adds an implementation for conditional branch optab for AArch32.
The previous version only allowed operand 0 but it looks like cbranch
expansion does not check with the target and so we have to implement all.
I therefore did not commit it. This is a larger version.
For e.g.
void f1 ()
{
for (int i = 0; i < N; i++)
{
b[i] += a[i];
if (a[i] > 0)
break;
}
}
For 128-bit vectors we generate:
vcgt.s32 q8, q9, #0
vpmax.u32 d7, d16, d17
vpmax.u32 d7, d7, d7
vmov r3, s14 @ int
cmp r3, #0
and for 64-bit vectors we can omit one vpmax as we still need to compress to
32-bits.
Bootstrapped Regtested on arm-none-linux-gnueabihf and no issues.
Ok for master?
Thanks,
Tamar
gcc/ChangeLog:
* config/arm/neon.md (cbranch<mode>4): New.
gcc/testsuite/ChangeLog:
* gcc.dg/vect/vect-early-break_2.c: Skip Arm.
* gcc.dg/vect/vect-early-break_7.c: Likewise.
* gcc.dg/vect/vect-early-break_75.c: Likewise.
* gcc.dg/vect/vect-early-break_77.c: Likewise.
* gcc.dg/vect/vect-early-break_82.c: Likewise.
* gcc.dg/vect/vect-early-break_88.c: Likewise.
* lib/target-supports.exp (add_options_for_vect_early_break,
check_effective_target_vect_early_break_hw,
check_effective_target_vect_early_break): Support AArch32.
* gcc.target/arm/vect-early-break-cbranch.c: New test.
--- inline copy of patch --
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index d213369ffc38fb88ad0357d848cc7da5af73bab7..0f088a51d31e6882bc0fabbad99862b8b465dd22 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -408,6 +408,54 @@ (define_insn "vec_extract<mode><V_elem_l>"
[(set_attr "type" "neon_store1_one_lane<q>,neon_to_gp<q>")]
)
+;; Patterns that compare two vectors and conditionally jump.
+;; Advanced SIMD lacks a vector != comparison, but this is a quite common
+;; operation. To not pay the penalty for inverting == we can map our any
+;; comparisons to all i.e. any(~x) => all(x).
+;;
+;; However unlike the AArch64 version, we can't optimize this further as the
+;; chain is too long for combine due to these being unspecs so it doesn't fold
+;; the operation to something simpler.
+(define_expand "cbranch<mode>4"
+ [(set (pc) (if_then_else
+ (match_operator 0 "expandable_comparison_operator"
+ [(match_operand:VDQI 1 "register_operand")
+ (match_operand:VDQI 2 "reg_or_zero_operand")])
+ (label_ref (match_operand 3 "" ""))
+ (pc)))]
+ "TARGET_NEON"
+{
+ rtx mask = operands[1];
+
+ /* If comparing against a non-zero vector we have to do a comparison first
+ so we can have a != 0 comparison with the result. */
+ if (operands[2] != CONST0_RTX (<MODE>mode))
+ {
+ mask = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_xor<mode>3 (mask, operands[1], operands[2]));
+ }
+
+ /* For 128-bit vectors we need an additional reduction. */
+ if (known_eq (128, GET_MODE_BITSIZE (<MODE>mode)))
+ {
+ /* Always reduce using a V4SI. */
+ mask = gen_reg_rtx (V2SImode);
+ rtx low = gen_reg_rtx (V2SImode);
+ rtx high = gen_reg_rtx (V2SImode);
+ rtx op1 = simplify_gen_subreg (V4SImode, operands[1], <MODE>mode, 0);
+ emit_insn (gen_neon_vget_lowv4si (low, op1));
+ emit_insn (gen_neon_vget_highv4si (high, op1));
+ emit_insn (gen_neon_vpumaxv2si (mask, low, high));
+ }
+
+ emit_insn (gen_neon_vpumaxv2si (mask, mask, mask));
+
+ rtx val = gen_reg_rtx (SImode);
+ emit_move_insn (val, gen_lowpart (SImode, mask));
+ emit_jump_insn (gen_cbranch_cc (operands[0], val, const0_rtx, operands[3]));
+ DONE;
+})
+
;; This pattern is renamed from "vec_extract<mode><V_elem_l>" to
;; "neon_vec_extract<mode><V_elem_l>" and this pattern is called
;; by define_expand in vec-common.md file.
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c
index 5c32bf94409e9743e72429985ab3bf13aab8f2c1..dec0b492ab883de6e02944a95fd554a109a68a39 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c
@@ -5,7 +5,7 @@
/* { dg-additional-options "-Ofast" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-*" } } } } */
#include <complex.h>
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c
index 8c86c5034d7522b3733543fb384a23c5d6ed0fcf..d218a0686719fee4c167684dcf26402851b53260 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c
@@ -5,7 +5,7 @@
/* { dg-additional-options "-Ofast" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-*" } } } } */
#include <complex.h>
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c
index ed27f8635730ff0d8803517c72693625a2feddef..9dcc3372acd657458df8d94ce36c4bd96f02fd52 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c
@@ -3,7 +3,7 @@
/* { dg-require-effective-target vect_int } */
/* { dg-additional-options "-O3" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "x86_64-*-* i?86-*-*" } } } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "x86_64-*-* i?86-*-* arm*-*-*" } } } } */
#include <limits.h>
#include <assert.h>
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c
index 225106aab0a3efc7536de6f6e45bc6ff16210ea8..9fa7e6948ebfb5f1723833653fd6ad1fc65f4e8e 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c
@@ -3,7 +3,7 @@
/* { dg-require-effective-target vect_int } */
/* { dg-additional-options "-O3" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-*" } } } } */
#include "tree-vect.h"
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c
index 0e9b2d8d385c556063a3c6fcb14383317b056a79..7cd21d33485f3abb823e1943c87e9481c41fd2c3 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c
@@ -5,7 +5,7 @@
/* { dg-additional-options "-Ofast" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-*" } } } } */
#include <complex.h>
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c
index b392dd46553994d813761da41c42989a79b90119..59ed57c5fb5f3e8197fc20058eeb0a81a55815cc 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c
@@ -3,7 +3,7 @@
/* { dg-require-effective-target vect_int } */
/* { dg-additional-options "-Ofast --param vect-partial-vector-usage=2" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-*" } } } } */
#include "tree-vect.h"
diff --git a/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c b/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c
new file mode 100644
index 0000000000000000000000000000000000000000..0e9a39d231fdf4cb56590945e7cedfabd11d39b5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c
@@ -0,0 +1,138 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-require-effective-target arm32 } */
+/* { dg-options "-O3 -march=armv8-a+simd -mfpu=auto -mfloat-abi=hard -fno-schedule-insns -fno-reorder-blocks -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#define N 640
+int a[N] = {0};
+int b[N] = {0};
+
+/*
+** f1:
+** ...
+** vcgt.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f1 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] > 0)
+ break;
+ }
+}
+
+/*
+** f2:
+** ...
+** vcge.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f2 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] >= 0)
+ break;
+ }
+}
+
+/*
+** f3:
+** ...
+** vceq.i32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f3 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] == 0)
+ break;
+ }
+}
+
+/*
+** f4:
+** ...
+** vceq.i32 q[0-9]+, q[0-9]+, #0
+** vmvn q[0-9]+, q[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f4 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] != 0)
+ break;
+ }
+}
+
+/*
+** f5:
+** ...
+** vclt.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f5 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] < 0)
+ break;
+ }
+}
+
+/*
+** f6:
+** ...
+** vcle.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f6 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] <= 0)
+ break;
+ }
+}
+
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 05fc417877bcd658931061b7245eb8ba5abd2e09..24a937dbb59b5723af038bd9e0b89369595fcf87 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -4059,6 +4059,7 @@ proc check_effective_target_vect_early_break { } {
return [check_cached_effective_target_indexed vect_early_break {
expr {
[istarget aarch64*-*-*]
+ || [check_effective_target_arm_v8_neon_ok]
|| [check_effective_target_sse4]
}}]
}
@@ -4072,6 +4073,7 @@ proc check_effective_target_vect_early_break_hw { } {
return [check_cached_effective_target_indexed vect_early_break_hw {
expr {
[istarget aarch64*-*-*]
+ || [check_effective_target_arm_v8_neon_hw]
|| [check_sse4_hw_available]
}}]
}
@@ -4081,6 +4083,11 @@ proc add_options_for_vect_early_break { flags } {
return "$flags"
}
+ if { [check_effective_target_arm_v8_neon_ok] } {
+ global et_arm_v8_neon_flags
+ return "$flags $et_arm_v8_neon_flags -march=armv8-a"
+ }
+
if { [check_effective_target_sse4] } {
return "$flags -msse4.1"
}
--
[-- Attachment #2: rb17512.patch --]
[-- Type: text/plain, Size: 9932 bytes --]
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index d213369ffc38fb88ad0357d848cc7da5af73bab7..0f088a51d31e6882bc0fabbad99862b8b465dd22 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -408,6 +408,54 @@ (define_insn "vec_extract<mode><V_elem_l>"
[(set_attr "type" "neon_store1_one_lane<q>,neon_to_gp<q>")]
)
+;; Patterns that compare two vectors and conditionally jump.
+;; Advanced SIMD lacks a vector != comparison, but this is a quite common
+;; operation. To not pay the penalty for inverting == we can map our any
+;; comparisons to all i.e. any(~x) => all(x).
+;;
+;; However unlike the AArch64 version, we can't optimize this further as the
+;; chain is too long for combine due to these being unspecs so it doesn't fold
+;; the operation to something simpler.
+(define_expand "cbranch<mode>4"
+ [(set (pc) (if_then_else
+ (match_operator 0 "expandable_comparison_operator"
+ [(match_operand:VDQI 1 "register_operand")
+ (match_operand:VDQI 2 "reg_or_zero_operand")])
+ (label_ref (match_operand 3 "" ""))
+ (pc)))]
+ "TARGET_NEON"
+{
+ rtx mask = operands[1];
+
+ /* If comparing against a non-zero vector we have to do a comparison first
+ so we can have a != 0 comparison with the result. */
+ if (operands[2] != CONST0_RTX (<MODE>mode))
+ {
+ mask = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_xor<mode>3 (mask, operands[1], operands[2]));
+ }
+
+ /* For 128-bit vectors we need an additional reduction. */
+ if (known_eq (128, GET_MODE_BITSIZE (<MODE>mode)))
+ {
+ /* Always reduce using a V4SI. */
+ mask = gen_reg_rtx (V2SImode);
+ rtx low = gen_reg_rtx (V2SImode);
+ rtx high = gen_reg_rtx (V2SImode);
+ rtx op1 = simplify_gen_subreg (V4SImode, operands[1], <MODE>mode, 0);
+ emit_insn (gen_neon_vget_lowv4si (low, op1));
+ emit_insn (gen_neon_vget_highv4si (high, op1));
+ emit_insn (gen_neon_vpumaxv2si (mask, low, high));
+ }
+
+ emit_insn (gen_neon_vpumaxv2si (mask, mask, mask));
+
+ rtx val = gen_reg_rtx (SImode);
+ emit_move_insn (val, gen_lowpart (SImode, mask));
+ emit_jump_insn (gen_cbranch_cc (operands[0], val, const0_rtx, operands[3]));
+ DONE;
+})
+
;; This pattern is renamed from "vec_extract<mode><V_elem_l>" to
;; "neon_vec_extract<mode><V_elem_l>" and this pattern is called
;; by define_expand in vec-common.md file.
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c
index 5c32bf94409e9743e72429985ab3bf13aab8f2c1..dec0b492ab883de6e02944a95fd554a109a68a39 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c
@@ -5,7 +5,7 @@
/* { dg-additional-options "-Ofast" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-*" } } } } */
#include <complex.h>
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c
index 8c86c5034d7522b3733543fb384a23c5d6ed0fcf..d218a0686719fee4c167684dcf26402851b53260 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c
@@ -5,7 +5,7 @@
/* { dg-additional-options "-Ofast" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-*" } } } } */
#include <complex.h>
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c
index ed27f8635730ff0d8803517c72693625a2feddef..9dcc3372acd657458df8d94ce36c4bd96f02fd52 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c
@@ -3,7 +3,7 @@
/* { dg-require-effective-target vect_int } */
/* { dg-additional-options "-O3" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "x86_64-*-* i?86-*-*" } } } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "x86_64-*-* i?86-*-* arm*-*-*" } } } } */
#include <limits.h>
#include <assert.h>
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c
index 225106aab0a3efc7536de6f6e45bc6ff16210ea8..9fa7e6948ebfb5f1723833653fd6ad1fc65f4e8e 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c
@@ -3,7 +3,7 @@
/* { dg-require-effective-target vect_int } */
/* { dg-additional-options "-O3" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-*" } } } } */
#include "tree-vect.h"
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c
index 0e9b2d8d385c556063a3c6fcb14383317b056a79..7cd21d33485f3abb823e1943c87e9481c41fd2c3 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c
@@ -5,7 +5,7 @@
/* { dg-additional-options "-Ofast" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-*" } } } } */
#include <complex.h>
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c
index b392dd46553994d813761da41c42989a79b90119..59ed57c5fb5f3e8197fc20058eeb0a81a55815cc 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c
@@ -3,7 +3,7 @@
/* { dg-require-effective-target vect_int } */
/* { dg-additional-options "-Ofast --param vect-partial-vector-usage=2" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-*" } } } } */
#include "tree-vect.h"
diff --git a/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c b/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c
new file mode 100644
index 0000000000000000000000000000000000000000..0e9a39d231fdf4cb56590945e7cedfabd11d39b5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c
@@ -0,0 +1,138 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-require-effective-target arm32 } */
+/* { dg-options "-O3 -march=armv8-a+simd -mfpu=auto -mfloat-abi=hard -fno-schedule-insns -fno-reorder-blocks -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#define N 640
+int a[N] = {0};
+int b[N] = {0};
+
+/*
+** f1:
+** ...
+** vcgt.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f1 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] > 0)
+ break;
+ }
+}
+
+/*
+** f2:
+** ...
+** vcge.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f2 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] >= 0)
+ break;
+ }
+}
+
+/*
+** f3:
+** ...
+** vceq.i32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f3 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] == 0)
+ break;
+ }
+}
+
+/*
+** f4:
+** ...
+** vceq.i32 q[0-9]+, q[0-9]+, #0
+** vmvn q[0-9]+, q[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f4 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] != 0)
+ break;
+ }
+}
+
+/*
+** f5:
+** ...
+** vclt.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f5 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] < 0)
+ break;
+ }
+}
+
+/*
+** f6:
+** ...
+** vcle.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f6 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] <= 0)
+ break;
+ }
+}
+
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 05fc417877bcd658931061b7245eb8ba5abd2e09..24a937dbb59b5723af038bd9e0b89369595fcf87 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -4059,6 +4059,7 @@ proc check_effective_target_vect_early_break { } {
return [check_cached_effective_target_indexed vect_early_break {
expr {
[istarget aarch64*-*-*]
+ || [check_effective_target_arm_v8_neon_ok]
|| [check_effective_target_sse4]
}}]
}
@@ -4072,6 +4073,7 @@ proc check_effective_target_vect_early_break_hw { } {
return [check_cached_effective_target_indexed vect_early_break_hw {
expr {
[istarget aarch64*-*-*]
+ || [check_effective_target_arm_v8_neon_hw]
|| [check_sse4_hw_available]
}}]
}
@@ -4081,6 +4083,11 @@ proc add_options_for_vect_early_break { flags } {
return "$flags"
}
+ if { [check_effective_target_arm_v8_neon_ok] } {
+ global et_arm_v8_neon_flags
+ return "$flags $et_arm_v8_neon_flags -march=armv8-a"
+ }
+
if { [check_effective_target_sse4] } {
return "$flags -msse4.1"
}
^ permalink raw reply [flat|nested] 6+ messages in thread
* RE: [PATCH 20/21]Arm: Add Advanced SIMD cbranch implementation
2023-12-29 14:42 [PATCH 20/21]Arm: Add Advanced SIMD cbranch implementation Tamar Christina
@ 2024-01-04 11:06 ` Tamar Christina
2024-01-04 11:12 ` Kyrylo Tkachov
0 siblings, 1 reply; 6+ messages in thread
From: Tamar Christina @ 2024-01-04 11:06 UTC (permalink / raw)
To: Tamar Christina, gcc-patches
Cc: nd, Ramana Radhakrishnan, Richard Earnshaw, nickc, Kyrylo Tkachov
[-- Attachment #1: Type: text/plain, Size: 12232 bytes --]
Ping,
---
Hi All,
This adds an implementation for conditional branch optab for AArch32.
The previous version only allowed operand 0 but it looks like cbranch
expansion does not check with the target and so we have to implement all.
I therefore did not commit it. This is a larger version. I've also dropped the MVE
version because the mid-end can rewrite the comparison into comparing two
predicates without checking with the backend. Since MVE only has 1 predicate
register this would need to go through memory and two MRS calls. It's unlikely
to be beneficial and so that's for GCC 15 when I can fix the middle-end.
The cases where AArch32 is skipped in the testsuite are all missed-optimizations
due to AArch32 missing some optabs.
For e.g.
void f1 ()
{
for (int i = 0; i < N; i++)
{
b[i] += a[i];
if (a[i] > 0)
break;
}
}
For 128-bit vectors we generate:
vcgt.s32 q8, q9, #0
vpmax.u32 d7, d16, d17
vpmax.u32 d7, d7, d7
vmov r3, s14 @ int
cmp r3, #0
and for 64-bit vectors we can omit one vpmax as we still need to compress to
32-bits.
Bootstrapped Regtested on arm-none-linux-gnueabihf and no issues.
Ok for master?
Thanks,
Tamar
gcc/ChangeLog:
* config/arm/neon.md (cbranch<mode>4): New.
gcc/testsuite/ChangeLog:
* gcc.dg/vect/vect-early-break_2.c: Skip Arm.
* gcc.dg/vect/vect-early-break_7.c: Likewise.
* gcc.dg/vect/vect-early-break_75.c: Likewise.
* gcc.dg/vect/vect-early-break_77.c: Likewise.
* gcc.dg/vect/vect-early-break_82.c: Likewise.
* gcc.dg/vect/vect-early-break_88.c: Likewise.
* lib/target-supports.exp (add_options_for_vect_early_break,
check_effective_target_vect_early_break_hw,
check_effective_target_vect_early_break): Support AArch32.
* gcc.target/arm/vect-early-break-cbranch.c: New test.
--- inline version of patch ---
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index d213369ffc38fb88ad0357d848cc7da5af73bab7..ed659ab736862da416d1ff6241d0d3e6c6b96ff1 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -408,6 +408,55 @@ (define_insn "vec_extract<mode><V_elem_l>"
[(set_attr "type" "neon_store1_one_lane<q>,neon_to_gp<q>")]
)
+;; Patterns that compare two vectors and conditionally jump.
+;; Advanced SIMD lacks a vector != comparison, but this is a quite common
+;; operation. To not pay the penalty for inverting == we can map our any
+;; comparisons to all i.e. any(~x) => all(x).
+;;
+;; However unlike the AArch64 version, we can't optimize this further as the
+;; chain is too long for combine due to these being unspecs so it doesn't fold
+;; the operation to something simpler.
+(define_expand "cbranch<mode>4"
+ [(set (pc) (if_then_else
+ (match_operator 0 "expandable_comparison_operator"
+ [(match_operand:VDQI 1 "register_operand")
+ (match_operand:VDQI 2 "reg_or_zero_operand")])
+ (label_ref (match_operand 3 "" ""))
+ (pc)))]
+ "TARGET_NEON"
+{
+ rtx mask = operands[1];
+
+ /* If comparing against a non-zero vector we have to do a comparison first
+ so we can have a != 0 comparison with the result. */
+ if (operands[2] != CONST0_RTX (<MODE>mode))
+ {
+ mask = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_xor<mode>3 (mask, operands[1], operands[2]));
+ }
+
+ /* For 128-bit vectors we need an additional reduction. */
+ if (known_eq (128, GET_MODE_BITSIZE (<MODE>mode)))
+ {
+ /* Always reduce using a V4SI. */
+ mask = gen_reg_rtx (V2SImode);
+ rtx low = gen_reg_rtx (V2SImode);
+ rtx high = gen_reg_rtx (V2SImode);
+ rtx op1 = lowpart_subreg (V4SImode, operands[1], <MODE>mode);
+ emit_insn (gen_neon_vget_lowv4si (low, op1));
+ emit_insn (gen_neon_vget_highv4si (high, op1));
+ emit_insn (gen_neon_vpumaxv2si (mask, low, high));
+ }
+
+ rtx op1 = lowpart_subreg (V2SImode, mask, GET_MODE (mask));
+ emit_insn (gen_neon_vpumaxv2si (op1, op1, op1));
+
+ rtx val = gen_reg_rtx (SImode);
+ emit_move_insn (val, gen_lowpart (SImode, mask));
+ emit_jump_insn (gen_cbranch_cc (operands[0], val, const0_rtx, operands[3]));
+ DONE;
+})
+
;; This pattern is renamed from "vec_extract<mode><V_elem_l>" to
;; "neon_vec_extract<mode><V_elem_l>" and this pattern is called
;; by define_expand in vec-common.md file.
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c
index 5c32bf94409e9743e72429985ab3bf13aab8f2c1..dec0b492ab883de6e02944a95fd554a109a68a39 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c
@@ -5,7 +5,7 @@
/* { dg-additional-options "-Ofast" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-*" } } } } */
#include <complex.h>
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c
index 8c86c5034d7522b3733543fb384a23c5d6ed0fcf..d218a0686719fee4c167684dcf26402851b53260 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c
@@ -5,7 +5,7 @@
/* { dg-additional-options "-Ofast" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-*" } } } } */
#include <complex.h>
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c
index ed27f8635730ff0d8803517c72693625a2feddef..9dcc3372acd657458df8d94ce36c4bd96f02fd52 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c
@@ -3,7 +3,7 @@
/* { dg-require-effective-target vect_int } */
/* { dg-additional-options "-O3" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "x86_64-*-* i?86-*-*" } } } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "x86_64-*-* i?86-*-* arm*-*-*" } } } } */
#include <limits.h>
#include <assert.h>
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c
index 225106aab0a3efc7536de6f6e45bc6ff16210ea8..9fa7e6948ebfb5f1723833653fd6ad1fc65f4e8e 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c
@@ -3,7 +3,7 @@
/* { dg-require-effective-target vect_int } */
/* { dg-additional-options "-O3" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-*" } } } } */
#include "tree-vect.h"
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c
index 0e9b2d8d385c556063a3c6fcb14383317b056a79..7cd21d33485f3abb823e1943c87e9481c41fd2c3 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c
@@ -5,7 +5,7 @@
/* { dg-additional-options "-Ofast" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-*" } } } } */
#include <complex.h>
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c
index b392dd46553994d813761da41c42989a79b90119..59ed57c5fb5f3e8197fc20058eeb0a81a55815cc 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c
@@ -3,7 +3,7 @@
/* { dg-require-effective-target vect_int } */
/* { dg-additional-options "-Ofast --param vect-partial-vector-usage=2" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-*" } } } } */
#include "tree-vect.h"
diff --git a/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c b/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c
new file mode 100644
index 0000000000000000000000000000000000000000..0e9a39d231fdf4cb56590945e7cedfabd11d39b5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c
@@ -0,0 +1,138 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-require-effective-target arm32 } */
+/* { dg-options "-O3 -march=armv8-a+simd -mfpu=auto -mfloat-abi=hard -fno-schedule-insns -fno-reorder-blocks -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#define N 640
+int a[N] = {0};
+int b[N] = {0};
+
+/*
+** f1:
+** ...
+** vcgt.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f1 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] > 0)
+ break;
+ }
+}
+
+/*
+** f2:
+** ...
+** vcge.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f2 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] >= 0)
+ break;
+ }
+}
+
+/*
+** f3:
+** ...
+** vceq.i32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f3 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] == 0)
+ break;
+ }
+}
+
+/*
+** f4:
+** ...
+** vceq.i32 q[0-9]+, q[0-9]+, #0
+** vmvn q[0-9]+, q[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f4 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] != 0)
+ break;
+ }
+}
+
+/*
+** f5:
+** ...
+** vclt.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f5 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] < 0)
+ break;
+ }
+}
+
+/*
+** f6:
+** ...
+** vcle.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f6 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] <= 0)
+ break;
+ }
+}
+
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 05fc417877bcd658931061b7245eb8ba5abd2e09..24a937dbb59b5723af038bd9e0b89369595fcf87 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -4059,6 +4059,7 @@ proc check_effective_target_vect_early_break { } {
return [check_cached_effective_target_indexed vect_early_break {
expr {
[istarget aarch64*-*-*]
+ || [check_effective_target_arm_v8_neon_ok]
|| [check_effective_target_sse4]
}}]
}
@@ -4072,6 +4073,7 @@ proc check_effective_target_vect_early_break_hw { } {
return [check_cached_effective_target_indexed vect_early_break_hw {
expr {
[istarget aarch64*-*-*]
+ || [check_effective_target_arm_v8_neon_hw]
|| [check_sse4_hw_available]
}}]
}
@@ -4081,6 +4083,11 @@ proc add_options_for_vect_early_break { flags } {
return "$flags"
}
+ if { [check_effective_target_arm_v8_neon_ok] } {
+ global et_arm_v8_neon_flags
+ return "$flags $et_arm_v8_neon_flags -march=armv8-a"
+ }
+
if { [check_effective_target_sse4] } {
return "$flags -msse4.1"
}
[-- Attachment #2: rb17512.patch --]
[-- Type: application/octet-stream, Size: 9981 bytes --]
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index d213369ffc38fb88ad0357d848cc7da5af73bab7..ed659ab736862da416d1ff6241d0d3e6c6b96ff1 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -408,6 +408,55 @@ (define_insn "vec_extract<mode><V_elem_l>"
[(set_attr "type" "neon_store1_one_lane<q>,neon_to_gp<q>")]
)
+;; Patterns comparing two vectors and conditionally jump.
+;; Advanced SIMD lacks a vector != comparison, but this is a quite common
+;; operation. To not pay the penalty for inverting == we can map our any
+;; comparisons to all i.e. any(~x) => all(x).
+;;
+;; However unlike the AArch64 version, we can't optimize this further as the
+;; chain is too long for combine due to these being unspecs so it doesn't fold
+;; the operation to something simpler.
+(define_expand "cbranch<mode>4"
+ [(set (pc) (if_then_else
+ (match_operator 0 "expandable_comparison_operator"
+ [(match_operand:VDQI 1 "register_operand")
+ (match_operand:VDQI 2 "reg_or_zero_operand")])
+ (label_ref (match_operand 3 "" ""))
+ (pc)))]
+ "TARGET_NEON"
+{
+ rtx mask = operands[1];
+
+ /* If comparing against a non-zero vector we have to do a comparison first
+ so we can have a != 0 comparison with the result. */
+ if (operands[2] != CONST0_RTX (<MODE>mode))
+ {
+ mask = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_xor<mode>3 (mask, operands[1], operands[2]));
+ }
+
+ /* For 128-bit vectors we need an additional reduction. */
+ if (known_eq (128, GET_MODE_BITSIZE (<MODE>mode)))
+ {
+ /* Always reduce using a V4SI. */
+ mask = gen_reg_rtx (V2SImode);
+ rtx low = gen_reg_rtx (V2SImode);
+ rtx high = gen_reg_rtx (V2SImode);
+ rtx op1 = lowpart_subreg (V4SImode, operands[1], <MODE>mode);
+ emit_insn (gen_neon_vget_lowv4si (low, op1));
+ emit_insn (gen_neon_vget_highv4si (high, op1));
+ emit_insn (gen_neon_vpumaxv2si (mask, low, high));
+ }
+
+ rtx op1 = lowpart_subreg (V2SImode, mask, GET_MODE (mask));
+ emit_insn (gen_neon_vpumaxv2si (op1, op1, op1));
+
+ rtx val = gen_reg_rtx (SImode);
+ emit_move_insn (val, gen_lowpart (SImode, mask));
+ emit_jump_insn (gen_cbranch_cc (operands[0], val, const0_rtx, operands[3]));
+ DONE;
+})
+
;; This pattern is renamed from "vec_extract<mode><V_elem_l>" to
;; "neon_vec_extract<mode><V_elem_l>" and this pattern is called
;; by define_expand in vec-common.md file.
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c
index 5c32bf94409e9743e72429985ab3bf13aab8f2c1..dec0b492ab883de6e02944a95fd554a109a68a39 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c
@@ -5,7 +5,7 @@
/* { dg-additional-options "-Ofast" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-*" } } } } */
#include <complex.h>
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c
index 8c86c5034d7522b3733543fb384a23c5d6ed0fcf..d218a0686719fee4c167684dcf26402851b53260 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c
@@ -5,7 +5,7 @@
/* { dg-additional-options "-Ofast" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-*" } } } } */
#include <complex.h>
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c
index ed27f8635730ff0d8803517c72693625a2feddef..9dcc3372acd657458df8d94ce36c4bd96f02fd52 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c
@@ -3,7 +3,7 @@
/* { dg-require-effective-target vect_int } */
/* { dg-additional-options "-O3" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "x86_64-*-* i?86-*-*" } } } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "x86_64-*-* i?86-*-* arm*-*-*" } } } } */
#include <limits.h>
#include <assert.h>
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c
index 225106aab0a3efc7536de6f6e45bc6ff16210ea8..9fa7e6948ebfb5f1723833653fd6ad1fc65f4e8e 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c
@@ -3,7 +3,7 @@
/* { dg-require-effective-target vect_int } */
/* { dg-additional-options "-O3" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-*" } } } } */
#include "tree-vect.h"
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c
index 0e9b2d8d385c556063a3c6fcb14383317b056a79..7cd21d33485f3abb823e1943c87e9481c41fd2c3 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c
@@ -5,7 +5,7 @@
/* { dg-additional-options "-Ofast" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-*" } } } } */
#include <complex.h>
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c
index b392dd46553994d813761da41c42989a79b90119..59ed57c5fb5f3e8197fc20058eeb0a81a55815cc 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c
@@ -3,7 +3,7 @@
/* { dg-require-effective-target vect_int } */
/* { dg-additional-options "-Ofast --param vect-partial-vector-usage=2" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-*" } } } } */
#include "tree-vect.h"
diff --git a/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c b/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c
new file mode 100644
index 0000000000000000000000000000000000000000..0e9a39d231fdf4cb56590945e7cedfabd11d39b5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c
@@ -0,0 +1,138 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-require-effective-target arm32 } */
+/* { dg-options "-O3 -march=armv8-a+simd -mfpu=auto -mfloat-abi=hard -fno-schedule-insns -fno-reorder-blocks -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#define N 640
+int a[N] = {0};
+int b[N] = {0};
+
+/*
+** f1:
+** ...
+** vcgt.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f1 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] > 0)
+ break;
+ }
+}
+
+/*
+** f2:
+** ...
+** vcge.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f2 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] >= 0)
+ break;
+ }
+}
+
+/*
+** f3:
+** ...
+** vceq.i32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f3 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] == 0)
+ break;
+ }
+}
+
+/*
+** f4:
+** ...
+** vceq.i32 q[0-9]+, q[0-9]+, #0
+** vmvn q[0-9]+, q[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f4 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] != 0)
+ break;
+ }
+}
+
+/*
+** f5:
+** ...
+** vclt.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f5 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] < 0)
+ break;
+ }
+}
+
+/*
+** f6:
+** ...
+** vcle.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f6 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] <= 0)
+ break;
+ }
+}
+
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 05fc417877bcd658931061b7245eb8ba5abd2e09..24a937dbb59b5723af038bd9e0b89369595fcf87 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -4059,6 +4059,7 @@ proc check_effective_target_vect_early_break { } {
return [check_cached_effective_target_indexed vect_early_break {
expr {
[istarget aarch64*-*-*]
+ || [check_effective_target_arm_v8_neon_ok]
|| [check_effective_target_sse4]
}}]
}
@@ -4072,6 +4073,7 @@ proc check_effective_target_vect_early_break_hw { } {
return [check_cached_effective_target_indexed vect_early_break_hw {
expr {
[istarget aarch64*-*-*]
+ || [check_effective_target_arm_v8_neon_hw]
|| [check_sse4_hw_available]
}}]
}
@@ -4081,6 +4083,11 @@ proc add_options_for_vect_early_break { flags } {
return "$flags"
}
+ if { [check_effective_target_arm_v8_neon_ok] } {
+ global et_arm_v8_neon_flags
+ return "$flags $et_arm_v8_neon_flags -march=armv8-a"
+ }
+
if { [check_effective_target_sse4] } {
return "$flags -msse4.1"
}
^ permalink raw reply [flat|nested] 6+ messages in thread
* RE: [PATCH 20/21]Arm: Add Advanced SIMD cbranch implementation
2024-01-04 11:06 ` Tamar Christina
@ 2024-01-04 11:12 ` Kyrylo Tkachov
2024-01-04 11:26 ` Tamar Christina
0 siblings, 1 reply; 6+ messages in thread
From: Kyrylo Tkachov @ 2024-01-04 11:12 UTC (permalink / raw)
To: Tamar Christina, gcc-patches
Cc: nd, Ramana Radhakrishnan, Richard Earnshaw, nickc
Hi Tamar,
> -----Original Message-----
> From: Tamar Christina <Tamar.Christina@arm.com>
> Sent: Thursday, January 4, 2024 11:06 AM
> To: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org
> Cc: nd <nd@arm.com>; Ramana Radhakrishnan
> <Ramana.Radhakrishnan@arm.com>; Richard Earnshaw
> <Richard.Earnshaw@arm.com>; nickc@redhat.com; Kyrylo Tkachov
> <Kyrylo.Tkachov@arm.com>
> Subject: RE: [PATCH 20/21]Arm: Add Advanced SIMD cbranch implementation
>
> Ping,
>
> ---
>
> Hi All,
>
> This adds an implementation for conditional branch optab for AArch32.
> The previous version only allowed operand 0 but it looks like cbranch
> expansion does not check with the target and so we have to implement all.
>
> I therefore did not commit it. This is a larger version. I've also dropped the MVE
> version because the mid-end can rewrite the comparison into comparing two
> predicates without checking with the backend. Since MVE only has 1 predicate
> register this would need to go through memory and two MRS calls. It's unlikely
> to be beneficial and so that's for GCC 15 when I can fix the middle-end.
>
> The cases where AArch32 is skipped in the testsuite are all missed-optimizations
> due to AArch32 missing some optabs.
Does the testsuite have vect_* checks that can be used instead of target arm*?
If so let's use those.
Otherwise it's okay as is.
Thanks,
Kyrill
>
> For e.g.
>
> void f1 ()
> {
> for (int i = 0; i < N; i++)
> {
> b[i] += a[i];
> if (a[i] > 0)
> break;
> }
> }
>
> For 128-bit vectors we generate:
>
> vcgt.s32 q8, q9, #0
> vpmax.u32 d7, d16, d17
> vpmax.u32 d7, d7, d7
> vmov r3, s14 @ int
> cmp r3, #0
>
> and of 64-bit vector we can omit one vpmax as we still need to compress to
> 32-bits.
>
> Bootstrapped Regtested on arm-none-linux-gnueabihf and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> * config/arm/neon.md (cbranch<mode>4): New.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.dg/vect/vect-early-break_2.c: Skip Arm.
> * gcc.dg/vect/vect-early-break_7.c: Likewise.
> * gcc.dg/vect/vect-early-break_75.c: Likewise.
> * gcc.dg/vect/vect-early-break_77.c: Likewise.
> * gcc.dg/vect/vect-early-break_82.c: Likewise.
> * gcc.dg/vect/vect-early-break_88.c: Likewise.
> * lib/target-supports.exp (add_options_for_vect_early_break,
> check_effective_target_vect_early_break_hw,
> check_effective_target_vect_early_break): Support AArch32.
> * gcc.target/arm/vect-early-break-cbranch.c: New test.
>
> --- inline version of patch ---
>
> diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
> index
> d213369ffc38fb88ad0357d848cc7da5af73bab7..ed659ab736862da416d1ff6241d
> 0d3e6c6b96ff1 100644
> --- a/gcc/config/arm/neon.md
> +++ b/gcc/config/arm/neon.md
> @@ -408,6 +408,55 @@ (define_insn "vec_extract<mode><V_elem_l>"
> [(set_attr "type" "neon_store1_one_lane<q>,neon_to_gp<q>")]
> )
>
> +;; Patterns comparing two vectors and conditionally jump.
> +;; Advanced SIMD lacks a vector != comparison, but this is a quite common
> +;; operation. To not pay the penalty for inverting == we can map our any
> +;; comparisons to all i.e. any(~x) => all(x).
> +;;
> +;; However unlike the AArch64 version, we can't optimize this further as the
> +;; chain is too long for combine due to these being unspecs so it doesn't fold
> +;; the operation to something simpler.
> +(define_expand "cbranch<mode>4"
> + [(set (pc) (if_then_else
> + (match_operator 0 "expandable_comparison_operator"
> + [(match_operand:VDQI 1 "register_operand")
> + (match_operand:VDQI 2 "reg_or_zero_operand")])
> + (label_ref (match_operand 3 "" ""))
> + (pc)))]
> + "TARGET_NEON"
> +{
> + rtx mask = operands[1];
> +
> + /* If comparing against a non-zero vector we have to do a comparison first
> + so we can have a != 0 comparison with the result. */
> + if (operands[2] != CONST0_RTX (<MODE>mode))
> + {
> + mask = gen_reg_rtx (<MODE>mode);
> + emit_insn (gen_xor<mode>3 (mask, operands[1], operands[2]));
> + }
> +
> + /* For 128-bit vectors we need an additional reduction. */
> + if (known_eq (128, GET_MODE_BITSIZE (<MODE>mode)))
> + {
> + /* Always reduce using a V4SI. */
> + mask = gen_reg_rtx (V2SImode);
> + rtx low = gen_reg_rtx (V2SImode);
> + rtx high = gen_reg_rtx (V2SImode);
> + rtx op1 = lowpart_subreg (V4SImode, operands[1], <MODE>mode);
> + emit_insn (gen_neon_vget_lowv4si (low, op1));
> + emit_insn (gen_neon_vget_highv4si (high, op1));
> + emit_insn (gen_neon_vpumaxv2si (mask, low, high));
> + }
> +
> + rtx op1 = lowpart_subreg (V2SImode, mask, GET_MODE (mask));
> + emit_insn (gen_neon_vpumaxv2si (op1, op1, op1));
> +
> + rtx val = gen_reg_rtx (SImode);
> + emit_move_insn (val, gen_lowpart (SImode, mask));
> + emit_jump_insn (gen_cbranch_cc (operands[0], val, const0_rtx, operands[3]));
> + DONE;
> +})
> +
> ;; This pattern is renamed from "vec_extract<mode><V_elem_l>" to
> ;; "neon_vec_extract<mode><V_elem_l>" and this pattern is called
> ;; by define_expand in vec-common.md file.
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c
> b/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c
> index
> 5c32bf94409e9743e72429985ab3bf13aab8f2c1..dec0b492ab883de6e02944a95f
> d554a109a68a39 100644
> --- a/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c
> +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c
> @@ -5,7 +5,7 @@
>
> /* { dg-additional-options "-Ofast" } */
>
> -/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-
> *" } } } } */
>
> #include <complex.h>
>
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c
> b/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c
> index
> 8c86c5034d7522b3733543fb384a23c5d6ed0fcf..d218a0686719fee4c167684dcf2
> 6402851b53260 100644
> --- a/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c
> +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c
> @@ -5,7 +5,7 @@
>
> /* { dg-additional-options "-Ofast" } */
>
> -/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-
> *" } } } } */
>
> #include <complex.h>
>
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c
> b/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c
> index
> ed27f8635730ff0d8803517c72693625a2feddef..9dcc3372acd657458df8d94ce36
> c4bd96f02fd52 100644
> --- a/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c
> +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c
> @@ -3,7 +3,7 @@
> /* { dg-require-effective-target vect_int } */
>
> /* { dg-additional-options "-O3" } */
> -/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "x86_64-*-
> * i?86-*-*" } } } } */
> +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "x86_64-
> *-* i?86-*-* arm*-*-*" } } } } */
>
> #include <limits.h>
> #include <assert.h>
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c
> b/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c
> index
> 225106aab0a3efc7536de6f6e45bc6ff16210ea8..9fa7e6948ebfb5f1723833653fd
> 6ad1fc65f4e8e 100644
> --- a/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c
> +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c
> @@ -3,7 +3,7 @@
> /* { dg-require-effective-target vect_int } */
>
> /* { dg-additional-options "-O3" } */
> -/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-
> *" } } } } */
>
> #include "tree-vect.h"
>
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c
> b/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c
> index
> 0e9b2d8d385c556063a3c6fcb14383317b056a79..7cd21d33485f3abb823e1943c
> 87e9481c41fd2c3 100644
> --- a/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c
> +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c
> @@ -5,7 +5,7 @@
>
> /* { dg-additional-options "-Ofast" } */
>
> -/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-
> *" } } } } */
>
> #include <complex.h>
>
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c
> b/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c
> index
> b392dd46553994d813761da41c42989a79b90119..59ed57c5fb5f3e8197fc20058
> eeb0a81a55815cc 100644
> --- a/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c
> +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c
> @@ -3,7 +3,7 @@
> /* { dg-require-effective-target vect_int } */
>
> /* { dg-additional-options "-Ofast --param vect-partial-vector-usage=2" } */
> -/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-
> *" } } } } */
>
> #include "tree-vect.h"
>
> diff --git a/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c
> b/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..0e9a39d231fdf4cb56590945e
> 7cedfabd11d39b5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c
> @@ -0,0 +1,138 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target vect_early_break } */
> +/* { dg-require-effective-target arm_neon_ok } */
> +/* { dg-require-effective-target arm32 } */
> +/* { dg-options "-O3 -march=armv8-a+simd -mfpu=auto -mfloat-abi=hard -fno-
> schedule-insns -fno-reorder-blocks -fno-schedule-insns2" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +#define N 640
> +int a[N] = {0};
> +int b[N] = {0};
> +
> +/*
> +** f1:
> +** ...
> +** vcgt.s32 q[0-9]+, q[0-9]+, #0
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vmov r[0-9]+, s[0-9]+ @ int
> +** cmp r[0-9]+, #0
> +** bne \.L[0-9]+
> +** ...
> +*/
> +void f1 ()
> +{
> + for (int i = 0; i < N; i++)
> + {
> + b[i] += a[i];
> + if (a[i] > 0)
> + break;
> + }
> +}
> +
> +/*
> +** f2:
> +** ...
> +** vcge.s32 q[0-9]+, q[0-9]+, #0
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vmov r[0-9]+, s[0-9]+ @ int
> +** cmp r[0-9]+, #0
> +** bne \.L[0-9]+
> +** ...
> +*/
> +void f2 ()
> +{
> + for (int i = 0; i < N; i++)
> + {
> + b[i] += a[i];
> + if (a[i] >= 0)
> + break;
> + }
> +}
> +
> +/*
> +** f3:
> +** ...
> +** vceq.i32 q[0-9]+, q[0-9]+, #0
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vmov r[0-9]+, s[0-9]+ @ int
> +** cmp r[0-9]+, #0
> +** bne \.L[0-9]+
> +** ...
> +*/
> +void f3 ()
> +{
> + for (int i = 0; i < N; i++)
> + {
> + b[i] += a[i];
> + if (a[i] == 0)
> + break;
> + }
> +}
> +
> +/*
> +** f4:
> +** ...
> +** vceq.i32 q[0-9]+, q[0-9]+, #0
> +** vmvn q[0-9]+, q[0-9]+
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vmov r[0-9]+, s[0-9]+ @ int
> +** cmp r[0-9]+, #0
> +** bne \.L[0-9]+
> +** ...
> +*/
> +void f4 ()
> +{
> + for (int i = 0; i < N; i++)
> + {
> + b[i] += a[i];
> + if (a[i] != 0)
> + break;
> + }
> +}
> +
> +/*
> +** f5:
> +** ...
> +** vclt.s32 q[0-9]+, q[0-9]+, #0
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vmov r[0-9]+, s[0-9]+ @ int
> +** cmp r[0-9]+, #0
> +** bne \.L[0-9]+
> +** ...
> +*/
> +void f5 ()
> +{
> + for (int i = 0; i < N; i++)
> + {
> + b[i] += a[i];
> + if (a[i] < 0)
> + break;
> + }
> +}
> +
> +/*
> +** f6:
> +** ...
> +** vcle.s32 q[0-9]+, q[0-9]+, #0
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vmov r[0-9]+, s[0-9]+ @ int
> +** cmp r[0-9]+, #0
> +** bne \.L[0-9]+
> +** ...
> +*/
> +void f6 ()
> +{
> + for (int i = 0; i < N; i++)
> + {
> + b[i] += a[i];
> + if (a[i] <= 0)
> + break;
> + }
> +}
> +
> diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-
> supports.exp
> index
> 05fc417877bcd658931061b7245eb8ba5abd2e09..24a937dbb59b5723af038bd9e
> 0b89369595fcf87 100644
> --- a/gcc/testsuite/lib/target-supports.exp
> +++ b/gcc/testsuite/lib/target-supports.exp
> @@ -4059,6 +4059,7 @@ proc check_effective_target_vect_early_break { } {
> return [check_cached_effective_target_indexed vect_early_break {
> expr {
> [istarget aarch64*-*-*]
> + || [check_effective_target_arm_v8_neon_ok]
> || [check_effective_target_sse4]
> }}]
> }
> @@ -4072,6 +4073,7 @@ proc check_effective_target_vect_early_break_hw { }
> {
> return [check_cached_effective_target_indexed vect_early_break_hw {
> expr {
> [istarget aarch64*-*-*]
> + || [check_effective_target_arm_v8_neon_hw]
> || [check_sse4_hw_available]
> }}]
> }
> @@ -4081,6 +4083,11 @@ proc add_options_for_vect_early_break { flags } {
> return "$flags"
> }
>
> + if { [check_effective_target_arm_v8_neon_ok] } {
> + global et_arm_v8_neon_flags
> + return "$flags $et_arm_v8_neon_flags -march=armv8-a"
> + }
> +
> if { [check_effective_target_sse4] } {
> return "$flags -msse4.1"
> }
^ permalink raw reply [flat|nested] 6+ messages in thread
* RE: [PATCH 20/21]Arm: Add Advanced SIMD cbranch implementation
2024-01-04 11:12 ` Kyrylo Tkachov
@ 2024-01-04 11:26 ` Tamar Christina
0 siblings, 0 replies; 6+ messages in thread
From: Tamar Christina @ 2024-01-04 11:26 UTC (permalink / raw)
To: Kyrylo Tkachov, gcc-patches
Cc: nd, Ramana Radhakrishnan, Richard Earnshaw, nickc
> -----Original Message-----
> From: Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
> Sent: Thursday, January 4, 2024 11:12 AM
> To: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org
> Cc: nd <nd@arm.com>; Ramana Radhakrishnan
> <Ramana.Radhakrishnan@arm.com>; Richard Earnshaw
> <Richard.Earnshaw@arm.com>; nickc@redhat.com
> Subject: RE: [PATCH 20/21]Arm: Add Advanced SIMD cbranch implementation
>
> Hi Tamar,
>
> > -----Original Message-----
> > From: Tamar Christina <Tamar.Christina@arm.com>
> > Sent: Thursday, January 4, 2024 11:06 AM
> > To: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org
> > Cc: nd <nd@arm.com>; Ramana Radhakrishnan
> > <Ramana.Radhakrishnan@arm.com>; Richard Earnshaw
> > <Richard.Earnshaw@arm.com>; nickc@redhat.com; Kyrylo Tkachov
> > <Kyrylo.Tkachov@arm.com>
> > Subject: RE: [PATCH 20/21]Arm: Add Advanced SIMD cbranch implementation
> >
> > Ping,
> >
> > ---
> >
> > Hi All,
> >
> > This adds an implementation for conditional branch optab for AArch32.
> > The previous version only allowed operand 0 but it looks like cbranch
> > expansion does not check with the target and so we have to implement all.
> >
> > I therefore did not commit it. This is a larger version. I've also dropped the MVE
> > version because the mid-end can rewrite the comparison into comparing two
> > predicates without checking with the backend. Since MVE only has 1 predicate
> > register this would need to go through memory and two MRS calls. It's unlikely
> > to be beneficial and so that's for GCC 15 when I can fix the middle-end.
> >
> > The cases where AArch32 is skipped in the testsuite are all missed-optimizations
> > due to AArch32 missing some optabs.
>
> Does the testsuite have vect_* checks that can be used instead of target arm*?
> If so let's use those.
Unfortunately not, a lot of them center around handling of complex doubles.
Some tests work and some fail, which makes it hard to disable based on a
target effective test. They are things that look easy to fix so I may file some tickets
for them.
Cheers,
Tamar
> Otherwise it's okay as is.
> Thanks,
> Kyrill
>
> >
> > For e.g.
> >
> > void f1 ()
> > {
> > for (int i = 0; i < N; i++)
> > {
> > b[i] += a[i];
> > if (a[i] > 0)
> > break;
> > }
> > }
> >
> > For 128-bit vectors we generate:
> >
> > vcgt.s32 q8, q9, #0
> > vpmax.u32 d7, d16, d17
> > vpmax.u32 d7, d7, d7
> > vmov r3, s14 @ int
> > cmp r3, #0
> >
> > and of 64-bit vector we can omit one vpmax as we still need to compress to
> > 32-bits.
> >
> > Bootstrapped Regtested on arm-none-linux-gnueabihf and no issues.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > * config/arm/neon.md (cbranch<mode>4): New.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.dg/vect/vect-early-break_2.c: Skip Arm.
> > * gcc.dg/vect/vect-early-break_7.c: Likewise.
> > * gcc.dg/vect/vect-early-break_75.c: Likewise.
> > * gcc.dg/vect/vect-early-break_77.c: Likewise.
> > * gcc.dg/vect/vect-early-break_82.c: Likewise.
> > * gcc.dg/vect/vect-early-break_88.c: Likewise.
> > * lib/target-supports.exp (add_options_for_vect_early_break,
> > check_effective_target_vect_early_break_hw,
> > check_effective_target_vect_early_break): Support AArch32.
> > * gcc.target/arm/vect-early-break-cbranch.c: New test.
> >
> > --- inline version of patch ---
> >
> > diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
> > index
> >
> d213369ffc38fb88ad0357d848cc7da5af73bab7..ed659ab736862da416d1ff624
> 1d
> > 0d3e6c6b96ff1 100644
> > --- a/gcc/config/arm/neon.md
> > +++ b/gcc/config/arm/neon.md
> > @@ -408,6 +408,55 @@ (define_insn "vec_extract<mode><V_elem_l>"
> > [(set_attr "type" "neon_store1_one_lane<q>,neon_to_gp<q>")]
> > )
> >
> > +;; Patterns comparing two vectors and conditionally jump.
> > +;; Advanced SIMD lacks a vector != comparison, but this is a quite common
> > +;; operation. To not pay the penalty for inverting == we can map our any
> > +;; comparisons to all i.e. any(~x) => all(x).
> > +;;
> > +;; However unlike the AArch64 version, we can't optimize this further as the
> > +;; chain is too long for combine due to these being unspecs so it doesn't fold
> > +;; the operation to something simpler.
> > +(define_expand "cbranch<mode>4"
> > + [(set (pc) (if_then_else
> > + (match_operator 0 "expandable_comparison_operator"
> > + [(match_operand:VDQI 1 "register_operand")
> > + (match_operand:VDQI 2 "reg_or_zero_operand")])
> > + (label_ref (match_operand 3 "" ""))
> > + (pc)))]
> > + "TARGET_NEON"
> > +{
> > + rtx mask = operands[1];
> > +
> > + /* If comparing against a non-zero vector we have to do a comparison first
> > + so we can have a != 0 comparison with the result. */
> > + if (operands[2] != CONST0_RTX (<MODE>mode))
> > + {
> > + mask = gen_reg_rtx (<MODE>mode);
> > + emit_insn (gen_xor<mode>3 (mask, operands[1], operands[2]));
> > + }
> > +
> > + /* For 128-bit vectors we need an additional reduction. */
> > + if (known_eq (128, GET_MODE_BITSIZE (<MODE>mode)))
> > + {
> > + /* Always reduce using a V4SI. */
> > + mask = gen_reg_rtx (V2SImode);
> > + rtx low = gen_reg_rtx (V2SImode);
> > + rtx high = gen_reg_rtx (V2SImode);
> > + rtx op1 = lowpart_subreg (V4SImode, operands[1], <MODE>mode);
> > + emit_insn (gen_neon_vget_lowv4si (low, op1));
> > + emit_insn (gen_neon_vget_highv4si (high, op1));
> > + emit_insn (gen_neon_vpumaxv2si (mask, low, high));
> > + }
> > +
> > + rtx op1 = lowpart_subreg (V2SImode, mask, GET_MODE (mask));
> > + emit_insn (gen_neon_vpumaxv2si (op1, op1, op1));
> > +
> > + rtx val = gen_reg_rtx (SImode);
> > + emit_move_insn (val, gen_lowpart (SImode, mask));
> > + emit_jump_insn (gen_cbranch_cc (operands[0], val, const0_rtx,
> operands[3]));
> > + DONE;
> > +})
> > +
> > ;; This pattern is renamed from "vec_extract<mode><V_elem_l>" to
> > ;; "neon_vec_extract<mode><V_elem_l>" and this pattern is called
> > ;; by define_expand in vec-common.md file.
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c
> > b/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c
> > index
> >
> 5c32bf94409e9743e72429985ab3bf13aab8f2c1..dec0b492ab883de6e02944a9
> 5f
> > d554a109a68a39 100644
> > --- a/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c
> > @@ -5,7 +5,7 @@
> >
> > /* { dg-additional-options "-Ofast" } */
> >
> > -/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> > +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-
> > *" } } } } */
> >
> > #include <complex.h>
> >
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c
> > b/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c
> > index
> >
> 8c86c5034d7522b3733543fb384a23c5d6ed0fcf..d218a0686719fee4c167684dc
> f2
> > 6402851b53260 100644
> > --- a/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c
> > @@ -5,7 +5,7 @@
> >
> > /* { dg-additional-options "-Ofast" } */
> >
> > -/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> > +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-
> > *" } } } } */
> >
> > #include <complex.h>
> >
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c
> > b/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c
> > index
> >
> ed27f8635730ff0d8803517c72693625a2feddef..9dcc3372acd657458df8d94ce
> 36
> > c4bd96f02fd52 100644
> > --- a/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c
> > @@ -3,7 +3,7 @@
> > /* { dg-require-effective-target vect_int } */
> >
> > /* { dg-additional-options "-O3" } */
> > -/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "x86_64-*-
> > * i?86-*-*" } } } } */
> > +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "x86_64-
> > *-* i?86-*-* arm*-*-*" } } } } */
> >
> > #include <limits.h>
> > #include <assert.h>
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c
> > b/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c
> > index
> >
> 225106aab0a3efc7536de6f6e45bc6ff16210ea8..9fa7e6948ebfb5f1723833653f
> d
> > 6ad1fc65f4e8e 100644
> > --- a/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c
> > @@ -3,7 +3,7 @@
> > /* { dg-require-effective-target vect_int } */
> >
> > /* { dg-additional-options "-O3" } */
> > -/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> > +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-
> > *" } } } } */
> >
> > #include "tree-vect.h"
> >
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c
> > b/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c
> > index
> >
> 0e9b2d8d385c556063a3c6fcb14383317b056a79..7cd21d33485f3abb823e194
> 3c
> > 87e9481c41fd2c3 100644
> > --- a/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c
> > @@ -5,7 +5,7 @@
> >
> > /* { dg-additional-options "-Ofast" } */
> >
> > -/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> > +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-
> > *" } } } } */
> >
> > #include <complex.h>
> >
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c
> > b/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c
> > index
> >
> b392dd46553994d813761da41c42989a79b90119..59ed57c5fb5f3e8197fc200
> 58
> > eeb0a81a55815cc 100644
> > --- a/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c
> > @@ -3,7 +3,7 @@
> > /* { dg-require-effective-target vect_int } */
> >
> > /* { dg-additional-options "-Ofast --param vect-partial-vector-usage=2" } */
> > -/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> > +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-
> > *" } } } } */
> >
> > #include "tree-vect.h"
> >
> > diff --git a/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c
> > b/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..0e9a39d231fdf4cb565909
> 45e
> > 7cedfabd11d39b5
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c
> > @@ -0,0 +1,138 @@
> > +/* { dg-do compile } */
> > +/* { dg-require-effective-target vect_early_break } */
> > +/* { dg-require-effective-target arm_neon_ok } */
> > +/* { dg-require-effective-target arm32 } */
> > +/* { dg-options "-O3 -march=armv8-a+simd -mfpu=auto -mfloat-abi=hard -
> fno-
> > schedule-insns -fno-reorder-blocks -fno-schedule-insns2" } */
> > +/* { dg-final { check-function-bodies "**" "" "" } } */
> > +
> > +#define N 640
> > +int a[N] = {0};
> > +int b[N] = {0};
> > +
> > +/*
> > +** f1:
> > +** ...
> > +** vcgt.s32 q[0-9]+, q[0-9]+, #0
> > +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> > +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> > +** vmov r[0-9]+, s[0-9]+ @ int
> > +** cmp r[0-9]+, #0
> > +** bne \.L[0-9]+
> > +** ...
> > +*/
> > +void f1 ()
> > +{
> > + for (int i = 0; i < N; i++)
> > + {
> > + b[i] += a[i];
> > + if (a[i] > 0)
> > + break;
> > + }
> > +}
> > +
> > +/*
> > +** f2:
> > +** ...
> > +** vcge.s32 q[0-9]+, q[0-9]+, #0
> > +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> > +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> > +** vmov r[0-9]+, s[0-9]+ @ int
> > +** cmp r[0-9]+, #0
> > +** bne \.L[0-9]+
> > +** ...
> > +*/
> > +void f2 ()
> > +{
> > + for (int i = 0; i < N; i++)
> > + {
> > + b[i] += a[i];
> > + if (a[i] >= 0)
> > + break;
> > + }
> > +}
> > +
> > +/*
> > +** f3:
> > +** ...
> > +** vceq.i32 q[0-9]+, q[0-9]+, #0
> > +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> > +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> > +** vmov r[0-9]+, s[0-9]+ @ int
> > +** cmp r[0-9]+, #0
> > +** bne \.L[0-9]+
> > +** ...
> > +*/
> > +void f3 ()
> > +{
> > + for (int i = 0; i < N; i++)
> > + {
> > + b[i] += a[i];
> > + if (a[i] == 0)
> > + break;
> > + }
> > +}
> > +
> > +/*
> > +** f4:
> > +** ...
> > +** vceq.i32 q[0-9]+, q[0-9]+, #0
> > +** vmvn q[0-9]+, q[0-9]+
> > +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> > +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> > +** vmov r[0-9]+, s[0-9]+ @ int
> > +** cmp r[0-9]+, #0
> > +** bne \.L[0-9]+
> > +** ...
> > +*/
> > +void f4 ()
> > +{
> > + for (int i = 0; i < N; i++)
> > + {
> > + b[i] += a[i];
> > + if (a[i] != 0)
> > + break;
> > + }
> > +}
> > +
> > +/*
> > +** f5:
> > +** ...
> > +** vclt.s32 q[0-9]+, q[0-9]+, #0
> > +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> > +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> > +** vmov r[0-9]+, s[0-9]+ @ int
> > +** cmp r[0-9]+, #0
> > +** bne \.L[0-9]+
> > +** ...
> > +*/
> > +void f5 ()
> > +{
> > + for (int i = 0; i < N; i++)
> > + {
> > + b[i] += a[i];
> > + if (a[i] < 0)
> > + break;
> > + }
> > +}
> > +
> > +/*
> > +** f6:
> > +** ...
> > +** vcle.s32 q[0-9]+, q[0-9]+, #0
> > +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> > +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> > +** vmov r[0-9]+, s[0-9]+ @ int
> > +** cmp r[0-9]+, #0
> > +** bne \.L[0-9]+
> > +** ...
> > +*/
> > +void f6 ()
> > +{
> > + for (int i = 0; i < N; i++)
> > + {
> > + b[i] += a[i];
> > + if (a[i] <= 0)
> > + break;
> > + }
> > +}
> > +
> > diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-
> > supports.exp
> > index
> >
> 05fc417877bcd658931061b7245eb8ba5abd2e09..24a937dbb59b5723af038bd
> 9e
> > 0b89369595fcf87 100644
> > --- a/gcc/testsuite/lib/target-supports.exp
> > +++ b/gcc/testsuite/lib/target-supports.exp
> > @@ -4059,6 +4059,7 @@ proc check_effective_target_vect_early_break { } {
> > return [check_cached_effective_target_indexed vect_early_break {
> > expr {
> > [istarget aarch64*-*-*]
> > + || [check_effective_target_arm_v8_neon_ok]
> > || [check_effective_target_sse4]
> > }}]
> > }
> > @@ -4072,6 +4073,7 @@ proc check_effective_target_vect_early_break_hw { }
> > {
> > return [check_cached_effective_target_indexed vect_early_break_hw {
> > expr {
> > [istarget aarch64*-*-*]
> > + || [check_effective_target_arm_v8_neon_hw]
> > || [check_sse4_hw_available]
> > }}]
> > }
> > @@ -4081,6 +4083,11 @@ proc add_options_for_vect_early_break { flags } {
> > return "$flags"
> > }
> >
> > + if { [check_effective_target_arm_v8_neon_ok] } {
> > + global et_arm_v8_neon_flags
> > + return "$flags $et_arm_v8_neon_flags -march=armv8-a"
> > + }
> > +
> > if { [check_effective_target_sse4] } {
> > return "$flags -msse4.1"
> > }
^ permalink raw reply [flat|nested] 6+ messages in thread
* [PATCH v5 0/19] Support early break/return auto-vectorization
@ 2023-06-28 13:40 Tamar Christina
2023-11-06 7:42 ` [PATCH 20/21]Arm: Add Advanced SIMD cbranch implementation Tamar Christina
0 siblings, 1 reply; 6+ messages in thread
From: Tamar Christina @ 2023-06-28 13:40 UTC (permalink / raw)
To: gcc-patches; +Cc: nd, rguenther, jlaw
[-- Attachment #1: Type: text/plain, Size: 10897 bytes --]
Hi All,
This patch adds initial support for early break vectorization in GCC.
The support is added for any target that implements a vector cbranch optab,
this includes both fully masked and non-masked targets.
Depending on the operation, the vectorizer may also require support for boolean
mask reductions using Inclusive OR. This is however only checked when the
comparison would produce multiple statements.
Concretely the kind of loops supported are of the forms:
for (int i = 0; i < N; i++)
{
<statements1>
if (<condition>)
{
...
<action>;
}
<statements2>
}
where <action> can be:
- break
- return
- goto
Any number of statements can be used before the <action> occurs.
Since this is an initial version for GCC 14 it has the following limitations and
features:
- Only fixed sized iterations and buffers are supported. That is to say any
vectors loaded or stored must be to statically allocated arrays with known
sizes. N must also be known. This limitation is because our primary target
for this optimization is SVE. For VLA SVE we can't easily do cross page
iteration checks. The result is likely to also not be beneficial. For that
reason we punt support for variable buffers till we have First-Faulting
support in GCC.
- any stores in <statements1> should not be to the same objects as in
<condition>. Loads are fine as long as they don't have the possibility to
alias. More concretely, we block RAW dependencies when the intermediate value
can't be separated from the store, or the store itself can't be moved.
- The number of loop iterations must be known, this is just a temporarily
limitation that I intend to address in GCC 14 itself as follow on patches.
- Prologue peeling, alignment peeling and loop versioning are supported.
- Fully masked loops, unmasked loops and partially masked loops are supported
- Any number of loop early exits are supported.
- The early exit must be before the natural loop exit/latch. The vectorizer is
designed in a way to propagate phi-nodes downwards. As such supporting this
inverted control flow is hard.
- No support for epilogue vectorization. The only epilogue supported is the
scalar final one. Epilogue vectorization would also not be profitable.
- Early breaks are only supported for inner loop vectorization.
I have pushed a branch to refs/users/tnfchris/heads/gcc-14-early-break
With the help of IPA and LTO this still gets hit quite often. During bootstrap
it hit rather frequently. Additionally TSVC s332, s481 and s482 all pass now
since these are tests for support for early exit vectorization.
This implementation does not support completely handling the early break inside
the vector loop itself but instead supports adding checks such that if we know
that we have to exit in the current iteration then we branch to scalar code to
actually do the final VF iterations which handles all the code in <action>.
niters analysis and the majority of the vectorizer with hardcoded single_exit
have been updated with the use of a new function vec_loop_iv value which returns
the exit the vectorizer wants to use as the main IV exit.
For niters this exit is what determines the overall iterations as
that is the O(iters) for the loop.
For the scalar loop we know that whatever exit you take you have to perform at
most VF iterations. For vector code we only care about the state of fully
performed iterations and reset the scalar code to the (partially) remaining loop.
This new version of the patch does the majority of the work in a new rewritten
loop peeling. This new function maintains LCSSA all the way through and no
longer requires the touch-up functions the vectorizer used to incrementally
adjust them later on. This means that aside from IV updates and guard edge
updates the early exit code is identical to the single exit cases.
When the loop is peeled during the copying I have to go through great lengths to
keep the dominators up to date. All exits from the first loop are rewired to the
loop header of the second loop. But this can change the immediate dominator.
The dominators can change again when we wire in the loop guard, as such peeling
now returns a list of dominators that need to be updated if a new guard edge is
added.
For the loop peeling we rewrite the loop form:
Header
---
|x|
2
|
v
-------3<------
early exit | | |
v v | latch
7 4----->6
| |
| v
| 8
| |
| v
------>5
into
Header
---
|x|
2
|
v
-------3<------
early exit | | |
v v | latch
7 4----->6
| |
| v
| 8
| |
| v
| New Header
| ---
----->|x|
9
|
v
------10<-----
early exit | | |
v v | latch
14 11---->13
| |
| v
| 12
| |
| v
------> 5
That is to say, the first vector loop executes so long as the early exit isn't
needed. Once the exit is taken, the scalar code will perform at most VF extra
iterations. The exact number depending on peeling and iteration start and which
exit was taken (natural or early). For this scalar loop, all early exits are
treated the same.
When we vectorize we move any statement not related to the early break itself
and that would be incorrect to execute before the break (i.e. has side effects)
to after the break. If this is not possible we decline to vectorize.
This means that we check at the start of iterations whether we are going to exit
or not. During the analysis phase we check whether we are allowed to do this
moving of statements. Also note that we only move the scalar statements, but
only do so after peeling but just before we start transforming statements.
Codegen:
for e.g.
#define N 803
unsigned vect_a[N];
unsigned vect_b[N];
unsigned test4(unsigned x)
{
unsigned ret = 0;
for (int i = 0; i < N; i++)
{
vect_b[i] = x + i;
if (vect_a[i] > x)
break;
vect_a[i] = x;
}
return ret;
}
We generate for Adv. SIMD:
test4:
adrp x2, .LC0
adrp x3, .LANCHOR0
dup v2.4s, w0
add x3, x3, :lo12:.LANCHOR0
movi v4.4s, 0x4
add x4, x3, 3216
ldr q1, [x2, #:lo12:.LC0]
mov x1, 0
mov w2, 0
.p2align 3,,7
.L3:
ldr q0, [x3, x1]
add v3.4s, v1.4s, v2.4s
add v1.4s, v1.4s, v4.4s
cmhi v0.4s, v0.4s, v2.4s
umaxp v0.4s, v0.4s, v0.4s
fmov x5, d0
cbnz x5, .L6
add w2, w2, 1
str q3, [x1, x4]
str q2, [x3, x1]
add x1, x1, 16
cmp w2, 200
bne .L3
mov w7, 3
.L2:
lsl w2, w2, 2
add x5, x3, 3216
add w6, w2, w0
sxtw x4, w2
ldr w1, [x3, x4, lsl 2]
str w6, [x5, x4, lsl 2]
cmp w0, w1
bcc .L4
add w1, w2, 1
str w0, [x3, x4, lsl 2]
add w6, w1, w0
sxtw x1, w1
ldr w4, [x3, x1, lsl 2]
str w6, [x5, x1, lsl 2]
cmp w0, w4
bcc .L4
add w4, w2, 2
str w0, [x3, x1, lsl 2]
sxtw x1, w4
add w6, w1, w0
ldr w4, [x3, x1, lsl 2]
str w6, [x5, x1, lsl 2]
cmp w0, w4
bcc .L4
str w0, [x3, x1, lsl 2]
add w2, w2, 3
cmp w7, 3
beq .L4
sxtw x1, w2
add w2, w2, w0
ldr w4, [x3, x1, lsl 2]
str w2, [x5, x1, lsl 2]
cmp w0, w4
bcc .L4
str w0, [x3, x1, lsl 2]
.L4:
mov w0, 0
ret
.p2align 2,,3
.L6:
mov w7, 4
b .L2
and for SVE:
test4:
adrp x2, .LANCHOR0
add x2, x2, :lo12:.LANCHOR0
add x5, x2, 3216
mov x3, 0
mov w1, 0
cntw x4
mov z1.s, w0
index z0.s, #0, #1
ptrue p1.b, all
ptrue p0.s, all
.p2align 3,,7
.L3:
ld1w z2.s, p1/z, [x2, x3, lsl 2]
add z3.s, z0.s, z1.s
cmplo p2.s, p0/z, z1.s, z2.s
b.any .L2
st1w z3.s, p1, [x5, x3, lsl 2]
add w1, w1, 1
st1w z1.s, p1, [x2, x3, lsl 2]
add x3, x3, x4
incw z0.s
cmp w3, 803
bls .L3
.L5:
mov w0, 0
ret
.p2align 2,,3
.L2:
cntw x5
mul w1, w1, w5
cbz w5, .L5
sxtw x1, w1
sub w5, w5, #1
add x5, x5, x1
add x6, x2, 3216
b .L6
.p2align 2,,3
.L14:
str w0, [x2, x1, lsl 2]
cmp x1, x5
beq .L5
mov x1, x4
.L6:
ldr w3, [x2, x1, lsl 2]
add w4, w0, w1
str w4, [x6, x1, lsl 2]
add x4, x1, 1
cmp w0, w3
bcs .L14
mov w0, 0
ret
On the workloads this work is based on we see between 2-3x performance uplift
using this patch.
Follow up plan:
- Boolean vectorization has several shortcomings. I've filed PR110223 with the
bigger ones that cause vectorization to fail with this patch.
- SLP support. This is planned for GCC 15 as for majority of the cases build
SLP itself fails. This means I'll need to spend time in making this more
robust first. Additionally it requires:
* Adding support for vectorizing CFG (gconds)
* Support for CFG to differ between vector and scalar loops.
Both of which would be disruptive to the tree and I suspect I'll be handling
fallouts from this patch for a while. So I plan to work on the surrounding
building blocks first for the remainder of the year.
Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
Also ran across various workloads and no issues.
When closer to acceptance I will run on other targets as well and clean up
related testsuite fallouts there.
--- inline copy of patch --
--
[-- Attachment #2: rb17494.patch --]
[-- Type: text/plain, Size: 0 bytes --]
^ permalink raw reply [flat|nested] 6+ messages in thread
* [PATCH 20/21]Arm: Add Advanced SIMD cbranch implementation
2023-06-28 13:40 [PATCH v5 0/19] Support early break/return auto-vectorization Tamar Christina
@ 2023-11-06 7:42 ` Tamar Christina
2023-11-27 12:48 ` Kyrylo Tkachov
0 siblings, 1 reply; 6+ messages in thread
From: Tamar Christina @ 2023-11-06 7:42 UTC (permalink / raw)
To: gcc-patches
Cc: nd, Ramana.Radhakrishnan, Richard.Earnshaw, nickc, Kyrylo.Tkachov
[-- Attachment #1: Type: text/plain, Size: 6094 bytes --]
Hi All,
This adds an implementation for conditional branch optab for AArch32.
For e.g.
void f1 ()
{
for (int i = 0; i < N; i++)
{
b[i] += a[i];
if (a[i] > 0)
break;
}
}
For 128-bit vectors we generate:
vcgt.s32 q8, q9, #0
vpmax.u32 d7, d16, d17
vpmax.u32 d7, d7, d7
vmov r3, s14 @ int
cmp r3, #0
and of 64-bit vector we can omit one vpmax as we still need to compress to
32-bits.
Bootstrapped Regtested on arm-none-linux-gnueabihf and no issues.
Ok for master?
Thanks,
Tamar
gcc/ChangeLog:
* config/arm/neon.md (cbranch<mode>4): New.
gcc/testsuite/ChangeLog:
* lib/target-supports.exp (vect_early_break): Add AArch32.
* gcc.target/arm/vect-early-break-cbranch.c: New test.
--- inline copy of patch --
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index d213369ffc38fb88ad0357d848cc7da5af73bab7..130efbc37cfe3128533599dfadc344d2243dcb63 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -408,6 +408,45 @@ (define_insn "vec_extract<mode><V_elem_l>"
[(set_attr "type" "neon_store1_one_lane<q>,neon_to_gp<q>")]
)
+;; Patterns comparing two vectors and conditionally jump.
+;; Avdanced SIMD lacks a vector != comparison, but this is a quite common
+;; operation. To not pay the penalty for inverting == we can map our any
+;; comparisons to all i.e. any(~x) => all(x).
+;;
+;; However unlike the AArch64 version, we can't optimize this further as the
+;; chain is too long for combine due to these being unspecs so it doesn't fold
+;; the operation to something simpler.
+(define_expand "cbranch<mode>4"
+ [(set (pc) (if_then_else
+ (match_operator 0 "expandable_comparison_operator"
+ [(match_operand:VDQI 1 "register_operand")
+ (match_operand:VDQI 2 "zero_operand")])
+ (label_ref (match_operand 3 "" ""))
+ (pc)))]
+ "TARGET_NEON"
+{
+ rtx mask = operands[1];
+
+ /* For 128-bit vectors we need an additional reductions. */
+ if (known_eq (128, GET_MODE_BITSIZE (<MODE>mode)))
+ {
+ /* Always reduce using a V4SI. */
+ mask = gen_reg_rtx (V2SImode);
+ rtx low = gen_reg_rtx (V2SImode);
+ rtx high = gen_reg_rtx (V2SImode);
+ emit_insn (gen_neon_vget_lowv4si (low, operands[1]));
+ emit_insn (gen_neon_vget_highv4si (high, operands[1]));
+ emit_insn (gen_neon_vpumaxv2si (mask, low, high));
+ }
+
+ emit_insn (gen_neon_vpumaxv2si (mask, mask, mask));
+
+ rtx val = gen_reg_rtx (SImode);
+ emit_move_insn (val, gen_lowpart (SImode, mask));
+ emit_jump_insn (gen_cbranch_cc (operands[0], val, const0_rtx, operands[3]));
+ DONE;
+})
+
;; This pattern is renamed from "vec_extract<mode><V_elem_l>" to
;; "neon_vec_extract<mode><V_elem_l>" and this pattern is called
;; by define_expand in vec-common.md file.
diff --git a/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c b/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c
new file mode 100644
index 0000000000000000000000000000000000000000..2c05aa10d26ed4ac9785672e6e3b4355cef046dc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c
@@ -0,0 +1,136 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-require-effective-target arm32 } */
+/* { dg-options "-O3 -march=armv8-a+simd -mfpu=auto -mfloat-abi=hard" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#define N 640
+int a[N] = {0};
+int b[N] = {0};
+
+/* f1:
+** ...
+** vcgt.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f1 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] > 0)
+ break;
+ }
+}
+
+/*
+** f2:
+** ...
+** vcge.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f2 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] >= 0)
+ break;
+ }
+}
+
+/*
+** f3:
+** ...
+** vceq.i32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f3 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] == 0)
+ break;
+ }
+}
+
+/*
+** f4:
+** ...
+** vceq.i32 q[0-9]+, q[0-9]+, #0
+** vmvn q[0-9]+, q[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f4 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] != 0)
+ break;
+ }
+}
+
+/*
+** f5:
+** ...
+** vclt.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f5 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] < 0)
+ break;
+ }
+}
+
+/*
+** f6:
+** ...
+** vcle.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f6 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] <= 0)
+ break;
+ }
+}
+
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 5516188dc0aa86d161d67dea5a7769e3c3d72f85..8f58671e6cfd3546c6a98e40341fe31c6492594b 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -3784,6 +3784,7 @@ proc check_effective_target_vect_early_break { } {
return [check_cached_effective_target_indexed vect_early_break {
expr {
[istarget aarch64*-*-*]
+ || [check_effective_target_arm_neon_ok]
}}]
}
# Return 1 if the target supports hardware vectorization of complex additions of
--
[-- Attachment #2: rb17512.patch --]
[-- Type: text/plain, Size: 5281 bytes --]
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index d213369ffc38fb88ad0357d848cc7da5af73bab7..130efbc37cfe3128533599dfadc344d2243dcb63 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -408,6 +408,45 @@ (define_insn "vec_extract<mode><V_elem_l>"
[(set_attr "type" "neon_store1_one_lane<q>,neon_to_gp<q>")]
)
+;; Patterns comparing two vectors and conditionally jump.
+;; Avdanced SIMD lacks a vector != comparison, but this is a quite common
+;; operation. To not pay the penalty for inverting == we can map our any
+;; comparisons to all i.e. any(~x) => all(x).
+;;
+;; However unlike the AArch64 version, we can't optimize this further as the
+;; chain is too long for combine due to these being unspecs so it doesn't fold
+;; the operation to something simpler.
+(define_expand "cbranch<mode>4"
+ [(set (pc) (if_then_else
+ (match_operator 0 "expandable_comparison_operator"
+ [(match_operand:VDQI 1 "register_operand")
+ (match_operand:VDQI 2 "zero_operand")])
+ (label_ref (match_operand 3 "" ""))
+ (pc)))]
+ "TARGET_NEON"
+{
+ rtx mask = operands[1];
+
+ /* For 128-bit vectors we need an additional reductions. */
+ if (known_eq (128, GET_MODE_BITSIZE (<MODE>mode)))
+ {
+ /* Always reduce using a V4SI. */
+ mask = gen_reg_rtx (V2SImode);
+ rtx low = gen_reg_rtx (V2SImode);
+ rtx high = gen_reg_rtx (V2SImode);
+ emit_insn (gen_neon_vget_lowv4si (low, operands[1]));
+ emit_insn (gen_neon_vget_highv4si (high, operands[1]));
+ emit_insn (gen_neon_vpumaxv2si (mask, low, high));
+ }
+
+ emit_insn (gen_neon_vpumaxv2si (mask, mask, mask));
+
+ rtx val = gen_reg_rtx (SImode);
+ emit_move_insn (val, gen_lowpart (SImode, mask));
+ emit_jump_insn (gen_cbranch_cc (operands[0], val, const0_rtx, operands[3]));
+ DONE;
+})
+
;; This pattern is renamed from "vec_extract<mode><V_elem_l>" to
;; "neon_vec_extract<mode><V_elem_l>" and this pattern is called
;; by define_expand in vec-common.md file.
diff --git a/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c b/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c
new file mode 100644
index 0000000000000000000000000000000000000000..2c05aa10d26ed4ac9785672e6e3b4355cef046dc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c
@@ -0,0 +1,136 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-require-effective-target arm32 } */
+/* { dg-options "-O3 -march=armv8-a+simd -mfpu=auto -mfloat-abi=hard" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#define N 640
+int a[N] = {0};
+int b[N] = {0};
+
+/* f1:
+** ...
+** vcgt.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f1 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] > 0)
+ break;
+ }
+}
+
+/*
+** f2:
+** ...
+** vcge.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f2 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] >= 0)
+ break;
+ }
+}
+
+/*
+** f3:
+** ...
+** vceq.i32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f3 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] == 0)
+ break;
+ }
+}
+
+/*
+** f4:
+** ...
+** vceq.i32 q[0-9]+, q[0-9]+, #0
+** vmvn q[0-9]+, q[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f4 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] != 0)
+ break;
+ }
+}
+
+/*
+** f5:
+** ...
+** vclt.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f5 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] < 0)
+ break;
+ }
+}
+
+/*
+** f6:
+** ...
+** vcle.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f6 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] <= 0)
+ break;
+ }
+}
+
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 5516188dc0aa86d161d67dea5a7769e3c3d72f85..8f58671e6cfd3546c6a98e40341fe31c6492594b 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -3784,6 +3784,7 @@ proc check_effective_target_vect_early_break { } {
return [check_cached_effective_target_indexed vect_early_break {
expr {
[istarget aarch64*-*-*]
+ || [check_effective_target_arm_neon_ok]
}}]
}
# Return 1 if the target supports hardware vectorization of complex additions of
^ permalink raw reply [flat|nested] 6+ messages in thread
* RE: [PATCH 20/21]Arm: Add Advanced SIMD cbranch implementation
2023-11-06 7:42 ` [PATCH 20/21]Arm: Add Advanced SIMD cbranch implementation Tamar Christina
@ 2023-11-27 12:48 ` Kyrylo Tkachov
0 siblings, 0 replies; 6+ messages in thread
From: Kyrylo Tkachov @ 2023-11-27 12:48 UTC (permalink / raw)
To: Tamar Christina, gcc-patches
Cc: nd, Ramana Radhakrishnan, Richard Earnshaw, nickc
Hi Tamar,
> -----Original Message-----
> From: Tamar Christina <Tamar.Christina@arm.com>
> Sent: Monday, November 6, 2023 7:43 AM
> To: gcc-patches@gcc.gnu.org
> Cc: nd <nd@arm.com>; Ramana Radhakrishnan
> <Ramana.Radhakrishnan@arm.com>; Richard Earnshaw
> <Richard.Earnshaw@arm.com>; nickc@redhat.com; Kyrylo Tkachov
> <Kyrylo.Tkachov@arm.com>
> Subject: [PATCH 20/21]Arm: Add Advanced SIMD cbranch implementation
>
> Hi All,
>
> This adds an implementation for conditional branch optab for AArch32.
>
> For e.g.
>
> void f1 ()
> {
> for (int i = 0; i < N; i++)
> {
> b[i] += a[i];
> if (a[i] > 0)
> break;
> }
> }
>
> For 128-bit vectors we generate:
>
> vcgt.s32 q8, q9, #0
> vpmax.u32 d7, d16, d17
> vpmax.u32 d7, d7, d7
> vmov r3, s14 @ int
> cmp r3, #0
>
> and for 64-bit vectors we can omit one vpmax as we still need to compress to
> 32-bits.
>
> Bootstrapped Regtested on arm-none-linux-gnueabihf and no issues.
>
> Ok for master?
>
This is okay once the prerequisites go in.
Thanks,
Kyrill
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> * config/arm/neon.md (cbranch<mode>4): New.
>
> gcc/testsuite/ChangeLog:
>
> * lib/target-supports.exp (vect_early_break): Add AArch32.
> * gcc.target/arm/vect-early-break-cbranch.c: New test.
>
> --- inline copy of patch --
> diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
> index
> d213369ffc38fb88ad0357d848cc7da5af73bab7..130efbc37cfe3128533599dfadc
> 344d2243dcb63 100644
> --- a/gcc/config/arm/neon.md
> +++ b/gcc/config/arm/neon.md
> @@ -408,6 +408,45 @@ (define_insn "vec_extract<mode><V_elem_l>"
> [(set_attr "type" "neon_store1_one_lane<q>,neon_to_gp<q>")]
> )
>
> +;; Patterns comparing two vectors and conditionally jump.
> +;; Advanced SIMD lacks a vector != comparison, but this is a quite common
> +;; operation. To not pay the penalty for inverting == we can map our any
> +;; comparisons to all i.e. any(~x) => all(x).
> +;;
> +;; However unlike the AArch64 version, we can't optimize this further as the
> +;; chain is too long for combine due to these being unspecs so it doesn't fold
> +;; the operation to something simpler.
> +(define_expand "cbranch<mode>4"
> + [(set (pc) (if_then_else
> + (match_operator 0 "expandable_comparison_operator"
> + [(match_operand:VDQI 1 "register_operand")
> + (match_operand:VDQI 2 "zero_operand")])
> + (label_ref (match_operand 3 "" ""))
> + (pc)))]
> + "TARGET_NEON"
> +{
> + rtx mask = operands[1];
> +
> + /* For 128-bit vectors we need an additional reduction. */
> + if (known_eq (128, GET_MODE_BITSIZE (<MODE>mode)))
> + {
> + /* Always reduce using a V4SI. */
> + mask = gen_reg_rtx (V2SImode);
> + rtx low = gen_reg_rtx (V2SImode);
> + rtx high = gen_reg_rtx (V2SImode);
> + emit_insn (gen_neon_vget_lowv4si (low, operands[1]));
> + emit_insn (gen_neon_vget_highv4si (high, operands[1]));
> + emit_insn (gen_neon_vpumaxv2si (mask, low, high));
> + }
> +
> + emit_insn (gen_neon_vpumaxv2si (mask, mask, mask));
> +
> + rtx val = gen_reg_rtx (SImode);
> + emit_move_insn (val, gen_lowpart (SImode, mask));
> + emit_jump_insn (gen_cbranch_cc (operands[0], val, const0_rtx, operands[3]));
> + DONE;
> +})
> +
> ;; This pattern is renamed from "vec_extract<mode><V_elem_l>" to
> ;; "neon_vec_extract<mode><V_elem_l>" and this pattern is called
> ;; by define_expand in vec-common.md file.
> diff --git a/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c
> b/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..2c05aa10d26ed4ac9785672e
> 6e3b4355cef046dc
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c
> @@ -0,0 +1,136 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target arm_neon_ok } */
> +/* { dg-require-effective-target arm32 } */
> +/* { dg-options "-O3 -march=armv8-a+simd -mfpu=auto -mfloat-abi=hard" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +#define N 640
> +int a[N] = {0};
> +int b[N] = {0};
> +
> +/* f1:
> +** ...
> +** vcgt.s32 q[0-9]+, q[0-9]+, #0
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vmov r[0-9]+, s[0-9]+ @ int
> +** cmp r[0-9]+, #0
> +** bne \.L[0-9]+
> +** ...
> +*/
> +void f1 ()
> +{
> + for (int i = 0; i < N; i++)
> + {
> + b[i] += a[i];
> + if (a[i] > 0)
> + break;
> + }
> +}
> +
> +/*
> +** f2:
> +** ...
> +** vcge.s32 q[0-9]+, q[0-9]+, #0
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vmov r[0-9]+, s[0-9]+ @ int
> +** cmp r[0-9]+, #0
> +** bne \.L[0-9]+
> +** ...
> +*/
> +void f2 ()
> +{
> + for (int i = 0; i < N; i++)
> + {
> + b[i] += a[i];
> + if (a[i] >= 0)
> + break;
> + }
> +}
> +
> +/*
> +** f3:
> +** ...
> +** vceq.i32 q[0-9]+, q[0-9]+, #0
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vmov r[0-9]+, s[0-9]+ @ int
> +** cmp r[0-9]+, #0
> +** bne \.L[0-9]+
> +** ...
> +*/
> +void f3 ()
> +{
> + for (int i = 0; i < N; i++)
> + {
> + b[i] += a[i];
> + if (a[i] == 0)
> + break;
> + }
> +}
> +
> +/*
> +** f4:
> +** ...
> +** vceq.i32 q[0-9]+, q[0-9]+, #0
> +** vmvn q[0-9]+, q[0-9]+
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vmov r[0-9]+, s[0-9]+ @ int
> +** cmp r[0-9]+, #0
> +** bne \.L[0-9]+
> +** ...
> +*/
> +void f4 ()
> +{
> + for (int i = 0; i < N; i++)
> + {
> + b[i] += a[i];
> + if (a[i] != 0)
> + break;
> + }
> +}
> +
> +/*
> +** f5:
> +** ...
> +** vclt.s32 q[0-9]+, q[0-9]+, #0
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vmov r[0-9]+, s[0-9]+ @ int
> +** cmp r[0-9]+, #0
> +** bne \.L[0-9]+
> +** ...
> +*/
> +void f5 ()
> +{
> + for (int i = 0; i < N; i++)
> + {
> + b[i] += a[i];
> + if (a[i] < 0)
> + break;
> + }
> +}
> +
> +/*
> +** f6:
> +** ...
> +** vcle.s32 q[0-9]+, q[0-9]+, #0
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vmov r[0-9]+, s[0-9]+ @ int
> +** cmp r[0-9]+, #0
> +** bne \.L[0-9]+
> +** ...
> +*/
> +void f6 ()
> +{
> + for (int i = 0; i < N; i++)
> + {
> + b[i] += a[i];
> + if (a[i] <= 0)
> + break;
> + }
> +}
> +
> diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-
> supports.exp
> index
> 5516188dc0aa86d161d67dea5a7769e3c3d72f85..8f58671e6cfd3546c6a98e4034
> 1fe31c6492594b 100644
> --- a/gcc/testsuite/lib/target-supports.exp
> +++ b/gcc/testsuite/lib/target-supports.exp
> @@ -3784,6 +3784,7 @@ proc check_effective_target_vect_early_break { } {
> return [check_cached_effective_target_indexed vect_early_break {
> expr {
> [istarget aarch64*-*-*]
> + || [check_effective_target_arm_neon_ok]
> }}]
> }
> # Return 1 if the target supports hardware vectorization of complex additions of
>
>
>
>
> --
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2024-01-04 11:27 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-12-29 14:42 [PATCH 20/21]Arm: Add Advanced SIMD cbranch implementation Tamar Christina
2024-01-04 11:06 ` Tamar Christina
2024-01-04 11:12 ` Kyrylo Tkachov
2024-01-04 11:26 ` Tamar Christina
-- strict thread matches above, loose matches on Subject: below --
2023-06-28 13:40 [PATCH v5 0/19] Support early break/return auto-vectorization Tamar Christina
2023-11-06 7:42 ` [PATCH 20/21]Arm: Add Advanced SIMD cbranch implementation Tamar Christina
2023-11-27 12:48 ` Kyrylo Tkachov
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).