* [PATCH 20/21]Arm: Add Advanced SIMD cbranch implementation
@ 2023-12-29 14:42 Tamar Christina
2024-01-04 11:06 ` Tamar Christina
0 siblings, 1 reply; 6+ messages in thread
From: Tamar Christina @ 2023-12-29 14:42 UTC (permalink / raw)
To: gcc-patches
Cc: nd, Ramana.Radhakrishnan, Richard.Earnshaw, nickc, Kyrylo.Tkachov
[-- Attachment #1: Type: text/plain, Size: 11342 bytes --]
Hi All,
This adds an implementation for conditional branch optab for AArch32.
The previous version only allowed operand 0 but it looks like cbranch
expansion does not check with the target and so we have to implement all.
I therefore did not commit it. This is a larger version.
For e.g.
void f1 ()
{
for (int i = 0; i < N; i++)
{
b[i] += a[i];
if (a[i] > 0)
break;
}
}
For 128-bit vectors we generate:
vcgt.s32 q8, q9, #0
vpmax.u32 d7, d16, d17
vpmax.u32 d7, d7, d7
vmov r3, s14 @ int
cmp r3, #0
and for 64-bit vectors we can omit one vpmax as we still need to compress to
32-bits.
Bootstrapped Regtested on arm-none-linux-gnueabihf and no issues.
Ok for master?
Thanks,
Tamar
gcc/ChangeLog:
* config/arm/neon.md (cbranch<mode>4): New.
gcc/testsuite/ChangeLog:
* gcc.dg/vect/vect-early-break_2.c: Skip Arm.
* gcc.dg/vect/vect-early-break_7.c: Likewise.
* gcc.dg/vect/vect-early-break_75.c: Likewise.
* gcc.dg/vect/vect-early-break_77.c: Likewise.
* gcc.dg/vect/vect-early-break_82.c: Likewise.
* gcc.dg/vect/vect-early-break_88.c: Likewise.
* lib/target-supports.exp (add_options_for_vect_early_break,
check_effective_target_vect_early_break_hw,
check_effective_target_vect_early_break): Support AArch32.
* gcc.target/arm/vect-early-break-cbranch.c: New test.
--- inline copy of patch --
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index d213369ffc38fb88ad0357d848cc7da5af73bab7..0f088a51d31e6882bc0fabbad99862b8b465dd22 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -408,6 +408,54 @@ (define_insn "vec_extract<mode><V_elem_l>"
[(set_attr "type" "neon_store1_one_lane<q>,neon_to_gp<q>")]
)
+;; Patterns that compare two vectors and conditionally jump.
+;; Advanced SIMD lacks a vector != comparison, but this is a quite common
+;; operation. To not pay the penalty for inverting == we can map our any
+;; comparisons to all i.e. any(~x) => all(x).
+;;
+;; However unlike the AArch64 version, we can't optimize this further as the
+;; chain is too long for combine due to these being unspecs so it doesn't fold
+;; the operation to something simpler.
+(define_expand "cbranch<mode>4"
+ [(set (pc) (if_then_else
+ (match_operator 0 "expandable_comparison_operator"
+ [(match_operand:VDQI 1 "register_operand")
+ (match_operand:VDQI 2 "reg_or_zero_operand")])
+ (label_ref (match_operand 3 "" ""))
+ (pc)))]
+ "TARGET_NEON"
+{
+ rtx mask = operands[1];
+
+ /* If comparing against a non-zero vector we have to do a comparison first
+ so we can have a != 0 comparison with the result. */
+ if (operands[2] != CONST0_RTX (<MODE>mode))
+ {
+ mask = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_xor<mode>3 (mask, operands[1], operands[2]));
+ }
+
+ /* For 128-bit vectors we need an additional reduction. */
+ if (known_eq (128, GET_MODE_BITSIZE (<MODE>mode)))
+ {
+ /* Always reduce using a V4SI. */
+ mask = gen_reg_rtx (V2SImode);
+ rtx low = gen_reg_rtx (V2SImode);
+ rtx high = gen_reg_rtx (V2SImode);
+ rtx op1 = simplify_gen_subreg (V4SImode, operands[1], <MODE>mode, 0);
+ emit_insn (gen_neon_vget_lowv4si (low, op1));
+ emit_insn (gen_neon_vget_highv4si (high, op1));
+ emit_insn (gen_neon_vpumaxv2si (mask, low, high));
+ }
+
+ emit_insn (gen_neon_vpumaxv2si (mask, mask, mask));
+
+ rtx val = gen_reg_rtx (SImode);
+ emit_move_insn (val, gen_lowpart (SImode, mask));
+ emit_jump_insn (gen_cbranch_cc (operands[0], val, const0_rtx, operands[3]));
+ DONE;
+})
+
;; This pattern is renamed from "vec_extract<mode><V_elem_l>" to
;; "neon_vec_extract<mode><V_elem_l>" and this pattern is called
;; by define_expand in vec-common.md file.
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c
index 5c32bf94409e9743e72429985ab3bf13aab8f2c1..dec0b492ab883de6e02944a95fd554a109a68a39 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c
@@ -5,7 +5,7 @@
/* { dg-additional-options "-Ofast" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-*" } } } } */
#include <complex.h>
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c
index 8c86c5034d7522b3733543fb384a23c5d6ed0fcf..d218a0686719fee4c167684dcf26402851b53260 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c
@@ -5,7 +5,7 @@
/* { dg-additional-options "-Ofast" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-*" } } } } */
#include <complex.h>
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c
index ed27f8635730ff0d8803517c72693625a2feddef..9dcc3372acd657458df8d94ce36c4bd96f02fd52 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c
@@ -3,7 +3,7 @@
/* { dg-require-effective-target vect_int } */
/* { dg-additional-options "-O3" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "x86_64-*-* i?86-*-*" } } } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "x86_64-*-* i?86-*-* arm*-*-*" } } } } */
#include <limits.h>
#include <assert.h>
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c
index 225106aab0a3efc7536de6f6e45bc6ff16210ea8..9fa7e6948ebfb5f1723833653fd6ad1fc65f4e8e 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c
@@ -3,7 +3,7 @@
/* { dg-require-effective-target vect_int } */
/* { dg-additional-options "-O3" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-*" } } } } */
#include "tree-vect.h"
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c
index 0e9b2d8d385c556063a3c6fcb14383317b056a79..7cd21d33485f3abb823e1943c87e9481c41fd2c3 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c
@@ -5,7 +5,7 @@
/* { dg-additional-options "-Ofast" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-*" } } } } */
#include <complex.h>
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c
index b392dd46553994d813761da41c42989a79b90119..59ed57c5fb5f3e8197fc20058eeb0a81a55815cc 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c
@@ -3,7 +3,7 @@
/* { dg-require-effective-target vect_int } */
/* { dg-additional-options "-Ofast --param vect-partial-vector-usage=2" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-*" } } } } */
#include "tree-vect.h"
diff --git a/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c b/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c
new file mode 100644
index 0000000000000000000000000000000000000000..0e9a39d231fdf4cb56590945e7cedfabd11d39b5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c
@@ -0,0 +1,138 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-require-effective-target arm32 } */
+/* { dg-options "-O3 -march=armv8-a+simd -mfpu=auto -mfloat-abi=hard -fno-schedule-insns -fno-reorder-blocks -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#define N 640
+int a[N] = {0};
+int b[N] = {0};
+
+/*
+** f1:
+** ...
+** vcgt.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f1 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] > 0)
+ break;
+ }
+}
+
+/*
+** f2:
+** ...
+** vcge.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f2 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] >= 0)
+ break;
+ }
+}
+
+/*
+** f3:
+** ...
+** vceq.i32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f3 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] == 0)
+ break;
+ }
+}
+
+/*
+** f4:
+** ...
+** vceq.i32 q[0-9]+, q[0-9]+, #0
+** vmvn q[0-9]+, q[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f4 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] != 0)
+ break;
+ }
+}
+
+/*
+** f5:
+** ...
+** vclt.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f5 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] < 0)
+ break;
+ }
+}
+
+/*
+** f6:
+** ...
+** vcle.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f6 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] <= 0)
+ break;
+ }
+}
+
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 05fc417877bcd658931061b7245eb8ba5abd2e09..24a937dbb59b5723af038bd9e0b89369595fcf87 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -4059,6 +4059,7 @@ proc check_effective_target_vect_early_break { } {
return [check_cached_effective_target_indexed vect_early_break {
expr {
[istarget aarch64*-*-*]
+ || [check_effective_target_arm_v8_neon_ok]
|| [check_effective_target_sse4]
}}]
}
@@ -4072,6 +4073,7 @@ proc check_effective_target_vect_early_break_hw { } {
return [check_cached_effective_target_indexed vect_early_break_hw {
expr {
[istarget aarch64*-*-*]
+ || [check_effective_target_arm_v8_neon_hw]
|| [check_sse4_hw_available]
}}]
}
@@ -4081,6 +4083,11 @@ proc add_options_for_vect_early_break { flags } {
return "$flags"
}
+ if { [check_effective_target_arm_v8_neon_ok] } {
+ global et_arm_v8_neon_flags
+ return "$flags $et_arm_v8_neon_flags -march=armv8-a"
+ }
+
if { [check_effective_target_sse4] } {
return "$flags -msse4.1"
}
--
[-- Attachment #2: rb17512.patch --]
[-- Type: text/plain, Size: 9932 bytes --]
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index d213369ffc38fb88ad0357d848cc7da5af73bab7..0f088a51d31e6882bc0fabbad99862b8b465dd22 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -408,6 +408,54 @@ (define_insn "vec_extract<mode><V_elem_l>"
[(set_attr "type" "neon_store1_one_lane<q>,neon_to_gp<q>")]
)
+;; Patterns that compare two vectors and conditionally jump.
+;; Advanced SIMD lacks a vector != comparison, but this is a quite common
+;; operation. To not pay the penalty for inverting == we can map our any
+;; comparisons to all i.e. any(~x) => all(x).
+;;
+;; However unlike the AArch64 version, we can't optimize this further as the
+;; chain is too long for combine due to these being unspecs so it doesn't fold
+;; the operation to something simpler.
+(define_expand "cbranch<mode>4"
+ [(set (pc) (if_then_else
+ (match_operator 0 "expandable_comparison_operator"
+ [(match_operand:VDQI 1 "register_operand")
+ (match_operand:VDQI 2 "reg_or_zero_operand")])
+ (label_ref (match_operand 3 "" ""))
+ (pc)))]
+ "TARGET_NEON"
+{
+ rtx mask = operands[1];
+
+ /* If comparing against a non-zero vector we have to do a comparison first
+ so we can have a != 0 comparison with the result. */
+ if (operands[2] != CONST0_RTX (<MODE>mode))
+ {
+ mask = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_xor<mode>3 (mask, operands[1], operands[2]));
+ }
+
+ /* For 128-bit vectors we need an additional reduction. */
+ if (known_eq (128, GET_MODE_BITSIZE (<MODE>mode)))
+ {
+ /* Always reduce using a V4SI. */
+ mask = gen_reg_rtx (V2SImode);
+ rtx low = gen_reg_rtx (V2SImode);
+ rtx high = gen_reg_rtx (V2SImode);
+ rtx op1 = simplify_gen_subreg (V4SImode, operands[1], <MODE>mode, 0);
+ emit_insn (gen_neon_vget_lowv4si (low, op1));
+ emit_insn (gen_neon_vget_highv4si (high, op1));
+ emit_insn (gen_neon_vpumaxv2si (mask, low, high));
+ }
+
+ emit_insn (gen_neon_vpumaxv2si (mask, mask, mask));
+
+ rtx val = gen_reg_rtx (SImode);
+ emit_move_insn (val, gen_lowpart (SImode, mask));
+ emit_jump_insn (gen_cbranch_cc (operands[0], val, const0_rtx, operands[3]));
+ DONE;
+})
+
;; This pattern is renamed from "vec_extract<mode><V_elem_l>" to
;; "neon_vec_extract<mode><V_elem_l>" and this pattern is called
;; by define_expand in vec-common.md file.
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c
index 5c32bf94409e9743e72429985ab3bf13aab8f2c1..dec0b492ab883de6e02944a95fd554a109a68a39 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c
@@ -5,7 +5,7 @@
/* { dg-additional-options "-Ofast" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-*" } } } } */
#include <complex.h>
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c
index 8c86c5034d7522b3733543fb384a23c5d6ed0fcf..d218a0686719fee4c167684dcf26402851b53260 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c
@@ -5,7 +5,7 @@
/* { dg-additional-options "-Ofast" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-*" } } } } */
#include <complex.h>
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c
index ed27f8635730ff0d8803517c72693625a2feddef..9dcc3372acd657458df8d94ce36c4bd96f02fd52 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c
@@ -3,7 +3,7 @@
/* { dg-require-effective-target vect_int } */
/* { dg-additional-options "-O3" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "x86_64-*-* i?86-*-*" } } } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "x86_64-*-* i?86-*-* arm*-*-*" } } } } */
#include <limits.h>
#include <assert.h>
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c
index 225106aab0a3efc7536de6f6e45bc6ff16210ea8..9fa7e6948ebfb5f1723833653fd6ad1fc65f4e8e 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c
@@ -3,7 +3,7 @@
/* { dg-require-effective-target vect_int } */
/* { dg-additional-options "-O3" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-*" } } } } */
#include "tree-vect.h"
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c
index 0e9b2d8d385c556063a3c6fcb14383317b056a79..7cd21d33485f3abb823e1943c87e9481c41fd2c3 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c
@@ -5,7 +5,7 @@
/* { dg-additional-options "-Ofast" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-*" } } } } */
#include <complex.h>
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c
index b392dd46553994d813761da41c42989a79b90119..59ed57c5fb5f3e8197fc20058eeb0a81a55815cc 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c
@@ -3,7 +3,7 @@
/* { dg-require-effective-target vect_int } */
/* { dg-additional-options "-Ofast --param vect-partial-vector-usage=2" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-*" } } } } */
#include "tree-vect.h"
diff --git a/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c b/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c
new file mode 100644
index 0000000000000000000000000000000000000000..0e9a39d231fdf4cb56590945e7cedfabd11d39b5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c
@@ -0,0 +1,138 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-require-effective-target arm32 } */
+/* { dg-options "-O3 -march=armv8-a+simd -mfpu=auto -mfloat-abi=hard -fno-schedule-insns -fno-reorder-blocks -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#define N 640
+int a[N] = {0};
+int b[N] = {0};
+
+/*
+** f1:
+** ...
+** vcgt.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f1 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] > 0)
+ break;
+ }
+}
+
+/*
+** f2:
+** ...
+** vcge.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f2 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] >= 0)
+ break;
+ }
+}
+
+/*
+** f3:
+** ...
+** vceq.i32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f3 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] == 0)
+ break;
+ }
+}
+
+/*
+** f4:
+** ...
+** vceq.i32 q[0-9]+, q[0-9]+, #0
+** vmvn q[0-9]+, q[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f4 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] != 0)
+ break;
+ }
+}
+
+/*
+** f5:
+** ...
+** vclt.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f5 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] < 0)
+ break;
+ }
+}
+
+/*
+** f6:
+** ...
+** vcle.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f6 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] <= 0)
+ break;
+ }
+}
+
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 05fc417877bcd658931061b7245eb8ba5abd2e09..24a937dbb59b5723af038bd9e0b89369595fcf87 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -4059,6 +4059,7 @@ proc check_effective_target_vect_early_break { } {
return [check_cached_effective_target_indexed vect_early_break {
expr {
[istarget aarch64*-*-*]
+ || [check_effective_target_arm_v8_neon_ok]
|| [check_effective_target_sse4]
}}]
}
@@ -4072,6 +4073,7 @@ proc check_effective_target_vect_early_break_hw { } {
return [check_cached_effective_target_indexed vect_early_break_hw {
expr {
[istarget aarch64*-*-*]
+ || [check_effective_target_arm_v8_neon_hw]
|| [check_sse4_hw_available]
}}]
}
@@ -4081,6 +4083,11 @@ proc add_options_for_vect_early_break { flags } {
return "$flags"
}
+ if { [check_effective_target_arm_v8_neon_ok] } {
+ global et_arm_v8_neon_flags
+ return "$flags $et_arm_v8_neon_flags -march=armv8-a"
+ }
+
if { [check_effective_target_sse4] } {
return "$flags -msse4.1"
}
^ permalink raw reply [flat|nested] 6+ messages in thread
* RE: [PATCH 20/21]Arm: Add Advanced SIMD cbranch implementation
2023-12-29 14:42 [PATCH 20/21]Arm: Add Advanced SIMD cbranch implementation Tamar Christina
@ 2024-01-04 11:06 ` Tamar Christina
2024-01-04 11:12 ` Kyrylo Tkachov
0 siblings, 1 reply; 6+ messages in thread
From: Tamar Christina @ 2024-01-04 11:06 UTC (permalink / raw)
To: Tamar Christina, gcc-patches
Cc: nd, Ramana Radhakrishnan, Richard Earnshaw, nickc, Kyrylo Tkachov
[-- Attachment #1: Type: text/plain, Size: 12232 bytes --]
Ping,
---
Hi All,
This adds an implementation for conditional branch optab for AArch32.
The previous version only allowed operand 0 but it looks like cbranch
expansion does not check with the target and so we have to implement all.
I therefore did not commit it. This is a larger version. I've also dropped the MVE
version because the mid-end can rewrite the comparison into comparing two
predicates without checking with the backend. Since MVE only has 1 predicate
register this would need to go through memory and two MRS calls. It's unlikely
to be beneficial and so that's for GCC 15 when I can fix the middle-end.
The cases where AArch32 is skipped in the testsuite are all missed-optimizations
due to AArch32 missing some optabs.
For e.g.
void f1 ()
{
for (int i = 0; i < N; i++)
{
b[i] += a[i];
if (a[i] > 0)
break;
}
}
For 128-bit vectors we generate:
vcgt.s32 q8, q9, #0
vpmax.u32 d7, d16, d17
vpmax.u32 d7, d7, d7
vmov r3, s14 @ int
cmp r3, #0
and for 64-bit vectors we can omit one vpmax as we still need to compress to
32-bits.
Bootstrapped Regtested on arm-none-linux-gnueabihf and no issues.
Ok for master?
Thanks,
Tamar
gcc/ChangeLog:
* config/arm/neon.md (cbranch<mode>4): New.
gcc/testsuite/ChangeLog:
* gcc.dg/vect/vect-early-break_2.c: Skip Arm.
* gcc.dg/vect/vect-early-break_7.c: Likewise.
* gcc.dg/vect/vect-early-break_75.c: Likewise.
* gcc.dg/vect/vect-early-break_77.c: Likewise.
* gcc.dg/vect/vect-early-break_82.c: Likewise.
* gcc.dg/vect/vect-early-break_88.c: Likewise.
* lib/target-supports.exp (add_options_for_vect_early_break,
check_effective_target_vect_early_break_hw,
check_effective_target_vect_early_break): Support AArch32.
* gcc.target/arm/vect-early-break-cbranch.c: New test.
--- inline version of patch ---
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index d213369ffc38fb88ad0357d848cc7da5af73bab7..ed659ab736862da416d1ff6241d0d3e6c6b96ff1 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -408,6 +408,55 @@ (define_insn "vec_extract<mode><V_elem_l>"
[(set_attr "type" "neon_store1_one_lane<q>,neon_to_gp<q>")]
)
+;; Patterns that compare two vectors and conditionally jump.
+;; Advanced SIMD lacks a vector != comparison, but this is a quite common
+;; operation. To not pay the penalty for inverting == we can map our any
+;; comparisons to all i.e. any(~x) => all(x).
+;;
+;; However unlike the AArch64 version, we can't optimize this further as the
+;; chain is too long for combine due to these being unspecs so it doesn't fold
+;; the operation to something simpler.
+(define_expand "cbranch<mode>4"
+ [(set (pc) (if_then_else
+ (match_operator 0 "expandable_comparison_operator"
+ [(match_operand:VDQI 1 "register_operand")
+ (match_operand:VDQI 2 "reg_or_zero_operand")])
+ (label_ref (match_operand 3 "" ""))
+ (pc)))]
+ "TARGET_NEON"
+{
+ rtx mask = operands[1];
+
+ /* If comparing against a non-zero vector we have to do a comparison first
+ so we can have a != 0 comparison with the result. */
+ if (operands[2] != CONST0_RTX (<MODE>mode))
+ {
+ mask = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_xor<mode>3 (mask, operands[1], operands[2]));
+ }
+
+ /* For 128-bit vectors we need an additional reduction. */
+ if (known_eq (128, GET_MODE_BITSIZE (<MODE>mode)))
+ {
+ /* Always reduce using a V4SI. */
+ mask = gen_reg_rtx (V2SImode);
+ rtx low = gen_reg_rtx (V2SImode);
+ rtx high = gen_reg_rtx (V2SImode);
+ rtx op1 = lowpart_subreg (V4SImode, operands[1], <MODE>mode);
+ emit_insn (gen_neon_vget_lowv4si (low, op1));
+ emit_insn (gen_neon_vget_highv4si (high, op1));
+ emit_insn (gen_neon_vpumaxv2si (mask, low, high));
+ }
+
+ rtx op1 = lowpart_subreg (V2SImode, mask, GET_MODE (mask));
+ emit_insn (gen_neon_vpumaxv2si (op1, op1, op1));
+
+ rtx val = gen_reg_rtx (SImode);
+ emit_move_insn (val, gen_lowpart (SImode, mask));
+ emit_jump_insn (gen_cbranch_cc (operands[0], val, const0_rtx, operands[3]));
+ DONE;
+})
+
;; This pattern is renamed from "vec_extract<mode><V_elem_l>" to
;; "neon_vec_extract<mode><V_elem_l>" and this pattern is called
;; by define_expand in vec-common.md file.
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c
index 5c32bf94409e9743e72429985ab3bf13aab8f2c1..dec0b492ab883de6e02944a95fd554a109a68a39 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c
@@ -5,7 +5,7 @@
/* { dg-additional-options "-Ofast" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-*" } } } } */
#include <complex.h>
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c
index 8c86c5034d7522b3733543fb384a23c5d6ed0fcf..d218a0686719fee4c167684dcf26402851b53260 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c
@@ -5,7 +5,7 @@
/* { dg-additional-options "-Ofast" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-*" } } } } */
#include <complex.h>
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c
index ed27f8635730ff0d8803517c72693625a2feddef..9dcc3372acd657458df8d94ce36c4bd96f02fd52 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c
@@ -3,7 +3,7 @@
/* { dg-require-effective-target vect_int } */
/* { dg-additional-options "-O3" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "x86_64-*-* i?86-*-*" } } } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "x86_64-*-* i?86-*-* arm*-*-*" } } } } */
#include <limits.h>
#include <assert.h>
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c
index 225106aab0a3efc7536de6f6e45bc6ff16210ea8..9fa7e6948ebfb5f1723833653fd6ad1fc65f4e8e 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c
@@ -3,7 +3,7 @@
/* { dg-require-effective-target vect_int } */
/* { dg-additional-options "-O3" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-*" } } } } */
#include "tree-vect.h"
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c
index 0e9b2d8d385c556063a3c6fcb14383317b056a79..7cd21d33485f3abb823e1943c87e9481c41fd2c3 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c
@@ -5,7 +5,7 @@
/* { dg-additional-options "-Ofast" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-*" } } } } */
#include <complex.h>
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c
index b392dd46553994d813761da41c42989a79b90119..59ed57c5fb5f3e8197fc20058eeb0a81a55815cc 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c
@@ -3,7 +3,7 @@
/* { dg-require-effective-target vect_int } */
/* { dg-additional-options "-Ofast --param vect-partial-vector-usage=2" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-*" } } } } */
#include "tree-vect.h"
diff --git a/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c b/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c
new file mode 100644
index 0000000000000000000000000000000000000000..0e9a39d231fdf4cb56590945e7cedfabd11d39b5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c
@@ -0,0 +1,138 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-require-effective-target arm32 } */
+/* { dg-options "-O3 -march=armv8-a+simd -mfpu=auto -mfloat-abi=hard -fno-schedule-insns -fno-reorder-blocks -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#define N 640
+int a[N] = {0};
+int b[N] = {0};
+
+/*
+** f1:
+** ...
+** vcgt.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f1 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] > 0)
+ break;
+ }
+}
+
+/*
+** f2:
+** ...
+** vcge.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f2 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] >= 0)
+ break;
+ }
+}
+
+/*
+** f3:
+** ...
+** vceq.i32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f3 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] == 0)
+ break;
+ }
+}
+
+/*
+** f4:
+** ...
+** vceq.i32 q[0-9]+, q[0-9]+, #0
+** vmvn q[0-9]+, q[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f4 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] != 0)
+ break;
+ }
+}
+
+/*
+** f5:
+** ...
+** vclt.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f5 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] < 0)
+ break;
+ }
+}
+
+/*
+** f6:
+** ...
+** vcle.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f6 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] <= 0)
+ break;
+ }
+}
+
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 05fc417877bcd658931061b7245eb8ba5abd2e09..24a937dbb59b5723af038bd9e0b89369595fcf87 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -4059,6 +4059,7 @@ proc check_effective_target_vect_early_break { } {
return [check_cached_effective_target_indexed vect_early_break {
expr {
[istarget aarch64*-*-*]
+ || [check_effective_target_arm_v8_neon_ok]
|| [check_effective_target_sse4]
}}]
}
@@ -4072,6 +4073,7 @@ proc check_effective_target_vect_early_break_hw { } {
return [check_cached_effective_target_indexed vect_early_break_hw {
expr {
[istarget aarch64*-*-*]
+ || [check_effective_target_arm_v8_neon_hw]
|| [check_sse4_hw_available]
}}]
}
@@ -4081,6 +4083,11 @@ proc add_options_for_vect_early_break { flags } {
return "$flags"
}
+ if { [check_effective_target_arm_v8_neon_ok] } {
+ global et_arm_v8_neon_flags
+ return "$flags $et_arm_v8_neon_flags -march=armv8-a"
+ }
+
if { [check_effective_target_sse4] } {
return "$flags -msse4.1"
}
[-- Attachment #2: rb17512.patch --]
[-- Type: application/octet-stream, Size: 9981 bytes --]
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index d213369ffc38fb88ad0357d848cc7da5af73bab7..ed659ab736862da416d1ff6241d0d3e6c6b96ff1 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -408,6 +408,55 @@ (define_insn "vec_extract<mode><V_elem_l>"
[(set_attr "type" "neon_store1_one_lane<q>,neon_to_gp<q>")]
)
+;; Patterns comparing two vectors and conditionally jump.
+;; Advanced SIMD lacks a vector != comparison, but this is a quite common
+;; operation. To not pay the penalty for inverting == we can map our any
+;; comparisons to all i.e. any(~x) => all(x).
+;;
+;; However unlike the AArch64 version, we can't optimize this further as the
+;; chain is too long for combine due to these being unspecs so it doesn't fold
+;; the operation to something simpler.
+(define_expand "cbranch<mode>4"
+ [(set (pc) (if_then_else
+ (match_operator 0 "expandable_comparison_operator"
+ [(match_operand:VDQI 1 "register_operand")
+ (match_operand:VDQI 2 "reg_or_zero_operand")])
+ (label_ref (match_operand 3 "" ""))
+ (pc)))]
+ "TARGET_NEON"
+{
+ rtx mask = operands[1];
+
+ /* If comparing against a non-zero vector we have to do a comparison first
+ so we can have a != 0 comparison with the result. */
+ if (operands[2] != CONST0_RTX (<MODE>mode))
+ {
+ mask = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_xor<mode>3 (mask, operands[1], operands[2]));
+ }
+
+ /* For 128-bit vectors we need an additional reduction. */
+ if (known_eq (128, GET_MODE_BITSIZE (<MODE>mode)))
+ {
+ /* Always reduce using a V4SI. */
+ mask = gen_reg_rtx (V2SImode);
+ rtx low = gen_reg_rtx (V2SImode);
+ rtx high = gen_reg_rtx (V2SImode);
+ rtx op1 = lowpart_subreg (V4SImode, operands[1], <MODE>mode);
+ emit_insn (gen_neon_vget_lowv4si (low, op1));
+ emit_insn (gen_neon_vget_highv4si (high, op1));
+ emit_insn (gen_neon_vpumaxv2si (mask, low, high));
+ }
+
+ rtx op1 = lowpart_subreg (V2SImode, mask, GET_MODE (mask));
+ emit_insn (gen_neon_vpumaxv2si (op1, op1, op1));
+
+ rtx val = gen_reg_rtx (SImode);
+ emit_move_insn (val, gen_lowpart (SImode, mask));
+ emit_jump_insn (gen_cbranch_cc (operands[0], val, const0_rtx, operands[3]));
+ DONE;
+})
+
;; This pattern is renamed from "vec_extract<mode><V_elem_l>" to
;; "neon_vec_extract<mode><V_elem_l>" and this pattern is called
;; by define_expand in vec-common.md file.
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c
index 5c32bf94409e9743e72429985ab3bf13aab8f2c1..dec0b492ab883de6e02944a95fd554a109a68a39 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c
@@ -5,7 +5,7 @@
/* { dg-additional-options "-Ofast" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-*" } } } } */
#include <complex.h>
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c
index 8c86c5034d7522b3733543fb384a23c5d6ed0fcf..d218a0686719fee4c167684dcf26402851b53260 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c
@@ -5,7 +5,7 @@
/* { dg-additional-options "-Ofast" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-*" } } } } */
#include <complex.h>
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c
index ed27f8635730ff0d8803517c72693625a2feddef..9dcc3372acd657458df8d94ce36c4bd96f02fd52 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c
@@ -3,7 +3,7 @@
/* { dg-require-effective-target vect_int } */
/* { dg-additional-options "-O3" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "x86_64-*-* i?86-*-*" } } } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "x86_64-*-* i?86-*-* arm*-*-*" } } } } */
#include <limits.h>
#include <assert.h>
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c
index 225106aab0a3efc7536de6f6e45bc6ff16210ea8..9fa7e6948ebfb5f1723833653fd6ad1fc65f4e8e 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c
@@ -3,7 +3,7 @@
/* { dg-require-effective-target vect_int } */
/* { dg-additional-options "-O3" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-*" } } } } */
#include "tree-vect.h"
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c
index 0e9b2d8d385c556063a3c6fcb14383317b056a79..7cd21d33485f3abb823e1943c87e9481c41fd2c3 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c
@@ -5,7 +5,7 @@
/* { dg-additional-options "-Ofast" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-*" } } } } */
#include <complex.h>
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c
index b392dd46553994d813761da41c42989a79b90119..59ed57c5fb5f3e8197fc20058eeb0a81a55815cc 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c
@@ -3,7 +3,7 @@
/* { dg-require-effective-target vect_int } */
/* { dg-additional-options "-Ofast --param vect-partial-vector-usage=2" } */
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-*" } } } } */
#include "tree-vect.h"
diff --git a/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c b/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c
new file mode 100644
index 0000000000000000000000000000000000000000..0e9a39d231fdf4cb56590945e7cedfabd11d39b5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c
@@ -0,0 +1,138 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-require-effective-target arm32 } */
+/* { dg-options "-O3 -march=armv8-a+simd -mfpu=auto -mfloat-abi=hard -fno-schedule-insns -fno-reorder-blocks -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#define N 640
+int a[N] = {0};
+int b[N] = {0};
+
+/*
+** f1:
+** ...
+** vcgt.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f1 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] > 0)
+ break;
+ }
+}
+
+/*
+** f2:
+** ...
+** vcge.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f2 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] >= 0)
+ break;
+ }
+}
+
+/*
+** f3:
+** ...
+** vceq.i32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f3 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] == 0)
+ break;
+ }
+}
+
+/*
+** f4:
+** ...
+** vceq.i32 q[0-9]+, q[0-9]+, #0
+** vmvn q[0-9]+, q[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f4 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] != 0)
+ break;
+ }
+}
+
+/*
+** f5:
+** ...
+** vclt.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f5 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] < 0)
+ break;
+ }
+}
+
+/*
+** f6:
+** ...
+** vcle.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f6 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] <= 0)
+ break;
+ }
+}
+
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 05fc417877bcd658931061b7245eb8ba5abd2e09..24a937dbb59b5723af038bd9e0b89369595fcf87 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -4059,6 +4059,7 @@ proc check_effective_target_vect_early_break { } {
return [check_cached_effective_target_indexed vect_early_break {
expr {
[istarget aarch64*-*-*]
+ || [check_effective_target_arm_v8_neon_ok]
|| [check_effective_target_sse4]
}}]
}
@@ -4072,6 +4073,7 @@ proc check_effective_target_vect_early_break_hw { } {
return [check_cached_effective_target_indexed vect_early_break_hw {
expr {
[istarget aarch64*-*-*]
+ || [check_effective_target_arm_v8_neon_hw]
|| [check_sse4_hw_available]
}}]
}
@@ -4081,6 +4083,11 @@ proc add_options_for_vect_early_break { flags } {
return "$flags"
}
+ if { [check_effective_target_arm_v8_neon_ok] } {
+ global et_arm_v8_neon_flags
+ return "$flags $et_arm_v8_neon_flags -march=armv8-a"
+ }
+
if { [check_effective_target_sse4] } {
return "$flags -msse4.1"
}
^ permalink raw reply [flat|nested] 6+ messages in thread
* RE: [PATCH 20/21]Arm: Add Advanced SIMD cbranch implementation
2024-01-04 11:06 ` Tamar Christina
@ 2024-01-04 11:12 ` Kyrylo Tkachov
2024-01-04 11:26 ` Tamar Christina
0 siblings, 1 reply; 6+ messages in thread
From: Kyrylo Tkachov @ 2024-01-04 11:12 UTC (permalink / raw)
To: Tamar Christina, gcc-patches
Cc: nd, Ramana Radhakrishnan, Richard Earnshaw, nickc
Hi Tamar,
> -----Original Message-----
> From: Tamar Christina <Tamar.Christina@arm.com>
> Sent: Thursday, January 4, 2024 11:06 AM
> To: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org
> Cc: nd <nd@arm.com>; Ramana Radhakrishnan
> <Ramana.Radhakrishnan@arm.com>; Richard Earnshaw
> <Richard.Earnshaw@arm.com>; nickc@redhat.com; Kyrylo Tkachov
> <Kyrylo.Tkachov@arm.com>
> Subject: RE: [PATCH 20/21]Arm: Add Advanced SIMD cbranch implementation
>
> Ping,
>
> ---
>
> Hi All,
>
> This adds an implementation for conditional branch optab for AArch32.
> The previous version only allowed operand 0 but it looks like cbranch
> expansion does not check with the target and so we have to implement all.
>
> I therefore did not commit it. This is a larger version. I've also dropped the MVE
> version because the mid-end can rewrite the comparison into comparing two
> predicates without checking with the backend. Since MVE only has 1 predicate
> register this would need to go through memory and two MRS calls. It's unlikely
> to be beneficial and so that's for GCC 15 when I can fix the middle-end.
>
> The cases where AArch32 is skipped in the testsuite are all missed-optimizations
> due to AArch32 missing some optabs.
Does the testsuite have vect_* checks that can be used instead of target arm*?
If so let's use those.
Otherwise it's okay as is.
Thanks,
Kyrill
>
> For e.g.
>
> void f1 ()
> {
> for (int i = 0; i < N; i++)
> {
> b[i] += a[i];
> if (a[i] > 0)
> break;
> }
> }
>
> For 128-bit vectors we generate:
>
> vcgt.s32 q8, q9, #0
> vpmax.u32 d7, d16, d17
> vpmax.u32 d7, d7, d7
> vmov r3, s14 @ int
> cmp r3, #0
>
> and of 64-bit vector we can omit one vpmax as we still need to compress to
> 32-bits.
>
> Bootstrapped Regtested on arm-none-linux-gnueabihf and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> * config/arm/neon.md (cbranch<mode>4): New.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.dg/vect/vect-early-break_2.c: Skip Arm.
> * gcc.dg/vect/vect-early-break_7.c: Likewise.
> * gcc.dg/vect/vect-early-break_75.c: Likewise.
> * gcc.dg/vect/vect-early-break_77.c: Likewise.
> * gcc.dg/vect/vect-early-break_82.c: Likewise.
> * gcc.dg/vect/vect-early-break_88.c: Likewise.
> * lib/target-supports.exp (add_options_for_vect_early_break,
> check_effective_target_vect_early_break_hw,
> check_effective_target_vect_early_break): Support AArch32.
> * gcc.target/arm/vect-early-break-cbranch.c: New test.
>
> --- inline version of patch ---
>
> diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
> index
> d213369ffc38fb88ad0357d848cc7da5af73bab7..ed659ab736862da416d1ff6241d
> 0d3e6c6b96ff1 100644
> --- a/gcc/config/arm/neon.md
> +++ b/gcc/config/arm/neon.md
> @@ -408,6 +408,55 @@ (define_insn "vec_extract<mode><V_elem_l>"
> [(set_attr "type" "neon_store1_one_lane<q>,neon_to_gp<q>")]
> )
>
> +;; Patterns comparing two vectors and conditionally jump.
> +;; Advanced SIMD lacks a vector != comparison, but this is a quite common
> +;; operation. To not pay the penalty for inverting == we can map our any
> +;; comparisons to all i.e. any(~x) => all(x).
> +;;
> +;; However unlike the AArch64 version, we can't optimize this further as the
> +;; chain is too long for combine due to these being unspecs so it doesn't fold
> +;; the operation to something simpler.
> +(define_expand "cbranch<mode>4"
> + [(set (pc) (if_then_else
> + (match_operator 0 "expandable_comparison_operator"
> + [(match_operand:VDQI 1 "register_operand")
> + (match_operand:VDQI 2 "reg_or_zero_operand")])
> + (label_ref (match_operand 3 "" ""))
> + (pc)))]
> + "TARGET_NEON"
> +{
> + rtx mask = operands[1];
> +
> + /* If comparing against a non-zero vector we have to do a comparison first
> + so we can have a != 0 comparison with the result. */
> + if (operands[2] != CONST0_RTX (<MODE>mode))
> + {
> + mask = gen_reg_rtx (<MODE>mode);
> + emit_insn (gen_xor<mode>3 (mask, operands[1], operands[2]));
> + }
> +
> + /* For 128-bit vectors we need an additional reduction. */
> + if (known_eq (128, GET_MODE_BITSIZE (<MODE>mode)))
> + {
> + /* Always reduce using a V4SI. */
> + mask = gen_reg_rtx (V2SImode);
> + rtx low = gen_reg_rtx (V2SImode);
> + rtx high = gen_reg_rtx (V2SImode);
> + rtx op1 = lowpart_subreg (V4SImode, operands[1], <MODE>mode);
> + emit_insn (gen_neon_vget_lowv4si (low, op1));
> + emit_insn (gen_neon_vget_highv4si (high, op1));
> + emit_insn (gen_neon_vpumaxv2si (mask, low, high));
> + }
> +
> + rtx op1 = lowpart_subreg (V2SImode, mask, GET_MODE (mask));
> + emit_insn (gen_neon_vpumaxv2si (op1, op1, op1));
> +
> + rtx val = gen_reg_rtx (SImode);
> + emit_move_insn (val, gen_lowpart (SImode, mask));
> + emit_jump_insn (gen_cbranch_cc (operands[0], val, const0_rtx, operands[3]));
> + DONE;
> +})
> +
> ;; This pattern is renamed from "vec_extract<mode><V_elem_l>" to
> ;; "neon_vec_extract<mode><V_elem_l>" and this pattern is called
> ;; by define_expand in vec-common.md file.
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c
> b/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c
> index
> 5c32bf94409e9743e72429985ab3bf13aab8f2c1..dec0b492ab883de6e02944a95f
> d554a109a68a39 100644
> --- a/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c
> +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c
> @@ -5,7 +5,7 @@
>
> /* { dg-additional-options "-Ofast" } */
>
> -/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-
> *" } } } } */
>
> #include <complex.h>
>
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c
> b/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c
> index
> 8c86c5034d7522b3733543fb384a23c5d6ed0fcf..d218a0686719fee4c167684dcf2
> 6402851b53260 100644
> --- a/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c
> +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c
> @@ -5,7 +5,7 @@
>
> /* { dg-additional-options "-Ofast" } */
>
> -/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-
> *" } } } } */
>
> #include <complex.h>
>
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c
> b/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c
> index
> ed27f8635730ff0d8803517c72693625a2feddef..9dcc3372acd657458df8d94ce36
> c4bd96f02fd52 100644
> --- a/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c
> +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c
> @@ -3,7 +3,7 @@
> /* { dg-require-effective-target vect_int } */
>
> /* { dg-additional-options "-O3" } */
> -/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "x86_64-*-
> * i?86-*-*" } } } } */
> +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "x86_64-
> *-* i?86-*-* arm*-*-*" } } } } */
>
> #include <limits.h>
> #include <assert.h>
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c
> b/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c
> index
> 225106aab0a3efc7536de6f6e45bc6ff16210ea8..9fa7e6948ebfb5f1723833653fd
> 6ad1fc65f4e8e 100644
> --- a/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c
> +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c
> @@ -3,7 +3,7 @@
> /* { dg-require-effective-target vect_int } */
>
> /* { dg-additional-options "-O3" } */
> -/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-
> *" } } } } */
>
> #include "tree-vect.h"
>
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c
> b/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c
> index
> 0e9b2d8d385c556063a3c6fcb14383317b056a79..7cd21d33485f3abb823e1943c
> 87e9481c41fd2c3 100644
> --- a/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c
> +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c
> @@ -5,7 +5,7 @@
>
> /* { dg-additional-options "-Ofast" } */
>
> -/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-
> *" } } } } */
>
> #include <complex.h>
>
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c
> b/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c
> index
> b392dd46553994d813761da41c42989a79b90119..59ed57c5fb5f3e8197fc20058
> eeb0a81a55815cc 100644
> --- a/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c
> +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c
> @@ -3,7 +3,7 @@
> /* { dg-require-effective-target vect_int } */
>
> /* { dg-additional-options "-Ofast --param vect-partial-vector-usage=2" } */
> -/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-
> *" } } } } */
>
> #include "tree-vect.h"
>
> diff --git a/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c
> b/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..0e9a39d231fdf4cb56590945e
> 7cedfabd11d39b5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c
> @@ -0,0 +1,138 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target vect_early_break } */
> +/* { dg-require-effective-target arm_neon_ok } */
> +/* { dg-require-effective-target arm32 } */
> +/* { dg-options "-O3 -march=armv8-a+simd -mfpu=auto -mfloat-abi=hard -fno-
> schedule-insns -fno-reorder-blocks -fno-schedule-insns2" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +#define N 640
> +int a[N] = {0};
> +int b[N] = {0};
> +
> +/*
> +** f1:
> +** ...
> +** vcgt.s32 q[0-9]+, q[0-9]+, #0
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vmov r[0-9]+, s[0-9]+ @ int
> +** cmp r[0-9]+, #0
> +** bne \.L[0-9]+
> +** ...
> +*/
> +void f1 ()
> +{
> + for (int i = 0; i < N; i++)
> + {
> + b[i] += a[i];
> + if (a[i] > 0)
> + break;
> + }
> +}
> +
> +/*
> +** f2:
> +** ...
> +** vcge.s32 q[0-9]+, q[0-9]+, #0
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vmov r[0-9]+, s[0-9]+ @ int
> +** cmp r[0-9]+, #0
> +** bne \.L[0-9]+
> +** ...
> +*/
> +void f2 ()
> +{
> + for (int i = 0; i < N; i++)
> + {
> + b[i] += a[i];
> + if (a[i] >= 0)
> + break;
> + }
> +}
> +
> +/*
> +** f3:
> +** ...
> +** vceq.i32 q[0-9]+, q[0-9]+, #0
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vmov r[0-9]+, s[0-9]+ @ int
> +** cmp r[0-9]+, #0
> +** bne \.L[0-9]+
> +** ...
> +*/
> +void f3 ()
> +{
> + for (int i = 0; i < N; i++)
> + {
> + b[i] += a[i];
> + if (a[i] == 0)
> + break;
> + }
> +}
> +
> +/*
> +** f4:
> +** ...
> +** vceq.i32 q[0-9]+, q[0-9]+, #0
> +** vmvn q[0-9]+, q[0-9]+
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vmov r[0-9]+, s[0-9]+ @ int
> +** cmp r[0-9]+, #0
> +** bne \.L[0-9]+
> +** ...
> +*/
> +void f4 ()
> +{
> + for (int i = 0; i < N; i++)
> + {
> + b[i] += a[i];
> + if (a[i] != 0)
> + break;
> + }
> +}
> +
> +/*
> +** f5:
> +** ...
> +** vclt.s32 q[0-9]+, q[0-9]+, #0
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vmov r[0-9]+, s[0-9]+ @ int
> +** cmp r[0-9]+, #0
> +** bne \.L[0-9]+
> +** ...
> +*/
> +void f5 ()
> +{
> + for (int i = 0; i < N; i++)
> + {
> + b[i] += a[i];
> + if (a[i] < 0)
> + break;
> + }
> +}
> +
> +/*
> +** f6:
> +** ...
> +** vcle.s32 q[0-9]+, q[0-9]+, #0
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vmov r[0-9]+, s[0-9]+ @ int
> +** cmp r[0-9]+, #0
> +** bne \.L[0-9]+
> +** ...
> +*/
> +void f6 ()
> +{
> + for (int i = 0; i < N; i++)
> + {
> + b[i] += a[i];
> + if (a[i] <= 0)
> + break;
> + }
> +}
> +
> diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-
> supports.exp
> index
> 05fc417877bcd658931061b7245eb8ba5abd2e09..24a937dbb59b5723af038bd9e
> 0b89369595fcf87 100644
> --- a/gcc/testsuite/lib/target-supports.exp
> +++ b/gcc/testsuite/lib/target-supports.exp
> @@ -4059,6 +4059,7 @@ proc check_effective_target_vect_early_break { } {
> return [check_cached_effective_target_indexed vect_early_break {
> expr {
> [istarget aarch64*-*-*]
> + || [check_effective_target_arm_v8_neon_ok]
> || [check_effective_target_sse4]
> }}]
> }
> @@ -4072,6 +4073,7 @@ proc check_effective_target_vect_early_break_hw { }
> {
> return [check_cached_effective_target_indexed vect_early_break_hw {
> expr {
> [istarget aarch64*-*-*]
> + || [check_effective_target_arm_v8_neon_hw]
> || [check_sse4_hw_available]
> }}]
> }
> @@ -4081,6 +4083,11 @@ proc add_options_for_vect_early_break { flags } {
> return "$flags"
> }
>
> + if { [check_effective_target_arm_v8_neon_ok] } {
> + global et_arm_v8_neon_flags
> + return "$flags $et_arm_v8_neon_flags -march=armv8-a"
> + }
> +
> if { [check_effective_target_sse4] } {
> return "$flags -msse4.1"
> }
^ permalink raw reply [flat|nested] 6+ messages in thread
* RE: [PATCH 20/21]Arm: Add Advanced SIMD cbranch implementation
2024-01-04 11:12 ` Kyrylo Tkachov
@ 2024-01-04 11:26 ` Tamar Christina
0 siblings, 0 replies; 6+ messages in thread
From: Tamar Christina @ 2024-01-04 11:26 UTC (permalink / raw)
To: Kyrylo Tkachov, gcc-patches
Cc: nd, Ramana Radhakrishnan, Richard Earnshaw, nickc
> -----Original Message-----
> From: Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
> Sent: Thursday, January 4, 2024 11:12 AM
> To: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org
> Cc: nd <nd@arm.com>; Ramana Radhakrishnan
> <Ramana.Radhakrishnan@arm.com>; Richard Earnshaw
> <Richard.Earnshaw@arm.com>; nickc@redhat.com
> Subject: RE: [PATCH 20/21]Arm: Add Advanced SIMD cbranch implementation
>
> Hi Tamar,
>
> > -----Original Message-----
> > From: Tamar Christina <Tamar.Christina@arm.com>
> > Sent: Thursday, January 4, 2024 11:06 AM
> > To: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org
> > Cc: nd <nd@arm.com>; Ramana Radhakrishnan
> > <Ramana.Radhakrishnan@arm.com>; Richard Earnshaw
> > <Richard.Earnshaw@arm.com>; nickc@redhat.com; Kyrylo Tkachov
> > <Kyrylo.Tkachov@arm.com>
> > Subject: RE: [PATCH 20/21]Arm: Add Advanced SIMD cbranch implementation
> >
> > Ping,
> >
> > ---
> >
> > Hi All,
> >
> > This adds an implementation for conditional branch optab for AArch32.
> > The previous version only allowed operand 0 but it looks like cbranch
> > expansion does not check with the target and so we have to implement all.
> >
> > I therefore did not commit it. This is a larger version. I've also dropped the MVE
> > version because the mid-end can rewrite the comparison into comparing two
> > predicates without checking with the backend. Since MVE only has 1 predicate
> > register this would need to go through memory and two MRS calls. It's unlikely
> > to be beneficial and so that's for GCC 15 when I can fix the middle-end.
> >
> > The cases where AArch32 is skipped in the testsuite are all missed-optimizations
> > due to AArch32 missing some optabs.
>
> Does the testsuite have vect_* checks that can be used instead of target arm*?
> If so let's use those.
Unfortunately not, a lot of them center around handling of complex doubles.
Some tests work and some fail, which makes it hard to disable based on a
target effective test. They are things that look easy to fix so I may file some tickets
for them.
Cheers,
Tamar
> Otherwise it's okay as is.
> Thanks,
> Kyrill
>
> >
> > For e.g.
> >
> > void f1 ()
> > {
> > for (int i = 0; i < N; i++)
> > {
> > b[i] += a[i];
> > if (a[i] > 0)
> > break;
> > }
> > }
> >
> > For 128-bit vectors we generate:
> >
> > vcgt.s32 q8, q9, #0
> > vpmax.u32 d7, d16, d17
> > vpmax.u32 d7, d7, d7
> > vmov r3, s14 @ int
> > cmp r3, #0
> >
> > and of 64-bit vector we can omit one vpmax as we still need to compress to
> > 32-bits.
> >
> > Bootstrapped Regtested on arm-none-linux-gnueabihf and no issues.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > * config/arm/neon.md (cbranch<mode>4): New.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.dg/vect/vect-early-break_2.c: Skip Arm.
> > * gcc.dg/vect/vect-early-break_7.c: Likewise.
> > * gcc.dg/vect/vect-early-break_75.c: Likewise.
> > * gcc.dg/vect/vect-early-break_77.c: Likewise.
> > * gcc.dg/vect/vect-early-break_82.c: Likewise.
> > * gcc.dg/vect/vect-early-break_88.c: Likewise.
> > * lib/target-supports.exp (add_options_for_vect_early_break,
> > check_effective_target_vect_early_break_hw,
> > check_effective_target_vect_early_break): Support AArch32.
> > * gcc.target/arm/vect-early-break-cbranch.c: New test.
> >
> > --- inline version of patch ---
> >
> > diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
> > index
> >
> d213369ffc38fb88ad0357d848cc7da5af73bab7..ed659ab736862da416d1ff624
> 1d
> > 0d3e6c6b96ff1 100644
> > --- a/gcc/config/arm/neon.md
> > +++ b/gcc/config/arm/neon.md
> > @@ -408,6 +408,55 @@ (define_insn "vec_extract<mode><V_elem_l>"
> > [(set_attr "type" "neon_store1_one_lane<q>,neon_to_gp<q>")]
> > )
> >
> > +;; Patterns comparing two vectors and conditionally jump.
> > +;; Advanced SIMD lacks a vector != comparison, but this is a quite common
> > +;; operation. To not pay the penalty for inverting == we can map our any
> > +;; comparisons to all i.e. any(~x) => all(x).
> > +;;
> > +;; However unlike the AArch64 version, we can't optimize this further as the
> > +;; chain is too long for combine due to these being unspecs so it doesn't fold
> > +;; the operation to something simpler.
> > +(define_expand "cbranch<mode>4"
> > + [(set (pc) (if_then_else
> > + (match_operator 0 "expandable_comparison_operator"
> > + [(match_operand:VDQI 1 "register_operand")
> > + (match_operand:VDQI 2 "reg_or_zero_operand")])
> > + (label_ref (match_operand 3 "" ""))
> > + (pc)))]
> > + "TARGET_NEON"
> > +{
> > + rtx mask = operands[1];
> > +
> > + /* If comparing against a non-zero vector we have to do a comparison first
> > + so we can have a != 0 comparison with the result. */
> > + if (operands[2] != CONST0_RTX (<MODE>mode))
> > + {
> > + mask = gen_reg_rtx (<MODE>mode);
> > + emit_insn (gen_xor<mode>3 (mask, operands[1], operands[2]));
> > + }
> > +
> > + /* For 128-bit vectors we need an additional reduction. */
> > + if (known_eq (128, GET_MODE_BITSIZE (<MODE>mode)))
> > + {
> > + /* Always reduce using a V4SI. */
> > + mask = gen_reg_rtx (V2SImode);
> > + rtx low = gen_reg_rtx (V2SImode);
> > + rtx high = gen_reg_rtx (V2SImode);
> > + rtx op1 = lowpart_subreg (V4SImode, operands[1], <MODE>mode);
> > + emit_insn (gen_neon_vget_lowv4si (low, op1));
> > + emit_insn (gen_neon_vget_highv4si (high, op1));
> > + emit_insn (gen_neon_vpumaxv2si (mask, low, high));
> > + }
> > +
> > + rtx op1 = lowpart_subreg (V2SImode, mask, GET_MODE (mask));
> > + emit_insn (gen_neon_vpumaxv2si (op1, op1, op1));
> > +
> > + rtx val = gen_reg_rtx (SImode);
> > + emit_move_insn (val, gen_lowpart (SImode, mask));
> > + emit_jump_insn (gen_cbranch_cc (operands[0], val, const0_rtx,
> operands[3]));
> > + DONE;
> > +})
> > +
> > ;; This pattern is renamed from "vec_extract<mode><V_elem_l>" to
> > ;; "neon_vec_extract<mode><V_elem_l>" and this pattern is called
> > ;; by define_expand in vec-common.md file.
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c
> > b/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c
> > index
> >
> 5c32bf94409e9743e72429985ab3bf13aab8f2c1..dec0b492ab883de6e02944a9
> 5f
> > d554a109a68a39 100644
> > --- a/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_2.c
> > @@ -5,7 +5,7 @@
> >
> > /* { dg-additional-options "-Ofast" } */
> >
> > -/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> > +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-
> > *" } } } } */
> >
> > #include <complex.h>
> >
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c
> > b/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c
> > index
> >
> 8c86c5034d7522b3733543fb384a23c5d6ed0fcf..d218a0686719fee4c167684dc
> f2
> > 6402851b53260 100644
> > --- a/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_7.c
> > @@ -5,7 +5,7 @@
> >
> > /* { dg-additional-options "-Ofast" } */
> >
> > -/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> > +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-
> > *" } } } } */
> >
> > #include <complex.h>
> >
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c
> > b/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c
> > index
> >
> ed27f8635730ff0d8803517c72693625a2feddef..9dcc3372acd657458df8d94ce
> 36
> > c4bd96f02fd52 100644
> > --- a/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_75.c
> > @@ -3,7 +3,7 @@
> > /* { dg-require-effective-target vect_int } */
> >
> > /* { dg-additional-options "-O3" } */
> > -/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "x86_64-*-
> > * i?86-*-*" } } } } */
> > +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "x86_64-
> > *-* i?86-*-* arm*-*-*" } } } } */
> >
> > #include <limits.h>
> > #include <assert.h>
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c
> > b/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c
> > index
> >
> 225106aab0a3efc7536de6f6e45bc6ff16210ea8..9fa7e6948ebfb5f1723833653f
> d
> > 6ad1fc65f4e8e 100644
> > --- a/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_77.c
> > @@ -3,7 +3,7 @@
> > /* { dg-require-effective-target vect_int } */
> >
> > /* { dg-additional-options "-O3" } */
> > -/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> > +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-
> > *" } } } } */
> >
> > #include "tree-vect.h"
> >
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c
> > b/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c
> > index
> >
> 0e9b2d8d385c556063a3c6fcb14383317b056a79..7cd21d33485f3abb823e194
> 3c
> > 87e9481c41fd2c3 100644
> > --- a/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_82.c
> > @@ -5,7 +5,7 @@
> >
> > /* { dg-additional-options "-Ofast" } */
> >
> > -/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> > +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-
> > *" } } } } */
> >
> > #include <complex.h>
> >
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c
> > b/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c
> > index
> >
> b392dd46553994d813761da41c42989a79b90119..59ed57c5fb5f3e8197fc200
> 58
> > eeb0a81a55815cc 100644
> > --- a/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_88.c
> > @@ -3,7 +3,7 @@
> > /* { dg-require-effective-target vect_int } */
> >
> > /* { dg-additional-options "-Ofast --param vect-partial-vector-usage=2" } */
> > -/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> > +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! "arm*-*-
> > *" } } } } */
> >
> > #include "tree-vect.h"
> >
> > diff --git a/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c
> > b/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..0e9a39d231fdf4cb565909
> 45e
> > 7cedfabd11d39b5
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c
> > @@ -0,0 +1,138 @@
> > +/* { dg-do compile } */
> > +/* { dg-require-effective-target vect_early_break } */
> > +/* { dg-require-effective-target arm_neon_ok } */
> > +/* { dg-require-effective-target arm32 } */
> > +/* { dg-options "-O3 -march=armv8-a+simd -mfpu=auto -mfloat-abi=hard -
> fno-
> > schedule-insns -fno-reorder-blocks -fno-schedule-insns2" } */
> > +/* { dg-final { check-function-bodies "**" "" "" } } */
> > +
> > +#define N 640
> > +int a[N] = {0};
> > +int b[N] = {0};
> > +
> > +/*
> > +** f1:
> > +** ...
> > +** vcgt.s32 q[0-9]+, q[0-9]+, #0
> > +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> > +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> > +** vmov r[0-9]+, s[0-9]+ @ int
> > +** cmp r[0-9]+, #0
> > +** bne \.L[0-9]+
> > +** ...
> > +*/
> > +void f1 ()
> > +{
> > + for (int i = 0; i < N; i++)
> > + {
> > + b[i] += a[i];
> > + if (a[i] > 0)
> > + break;
> > + }
> > +}
> > +
> > +/*
> > +** f2:
> > +** ...
> > +** vcge.s32 q[0-9]+, q[0-9]+, #0
> > +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> > +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> > +** vmov r[0-9]+, s[0-9]+ @ int
> > +** cmp r[0-9]+, #0
> > +** bne \.L[0-9]+
> > +** ...
> > +*/
> > +void f2 ()
> > +{
> > + for (int i = 0; i < N; i++)
> > + {
> > + b[i] += a[i];
> > + if (a[i] >= 0)
> > + break;
> > + }
> > +}
> > +
> > +/*
> > +** f3:
> > +** ...
> > +** vceq.i32 q[0-9]+, q[0-9]+, #0
> > +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> > +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> > +** vmov r[0-9]+, s[0-9]+ @ int
> > +** cmp r[0-9]+, #0
> > +** bne \.L[0-9]+
> > +** ...
> > +*/
> > +void f3 ()
> > +{
> > + for (int i = 0; i < N; i++)
> > + {
> > + b[i] += a[i];
> > + if (a[i] == 0)
> > + break;
> > + }
> > +}
> > +
> > +/*
> > +** f4:
> > +** ...
> > +** vceq.i32 q[0-9]+, q[0-9]+, #0
> > +** vmvn q[0-9]+, q[0-9]+
> > +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> > +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> > +** vmov r[0-9]+, s[0-9]+ @ int
> > +** cmp r[0-9]+, #0
> > +** bne \.L[0-9]+
> > +** ...
> > +*/
> > +void f4 ()
> > +{
> > + for (int i = 0; i < N; i++)
> > + {
> > + b[i] += a[i];
> > + if (a[i] != 0)
> > + break;
> > + }
> > +}
> > +
> > +/*
> > +** f5:
> > +** ...
> > +** vclt.s32 q[0-9]+, q[0-9]+, #0
> > +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> > +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> > +** vmov r[0-9]+, s[0-9]+ @ int
> > +** cmp r[0-9]+, #0
> > +** bne \.L[0-9]+
> > +** ...
> > +*/
> > +void f5 ()
> > +{
> > + for (int i = 0; i < N; i++)
> > + {
> > + b[i] += a[i];
> > + if (a[i] < 0)
> > + break;
> > + }
> > +}
> > +
> > +/*
> > +** f6:
> > +** ...
> > +** vcle.s32 q[0-9]+, q[0-9]+, #0
> > +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> > +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> > +** vmov r[0-9]+, s[0-9]+ @ int
> > +** cmp r[0-9]+, #0
> > +** bne \.L[0-9]+
> > +** ...
> > +*/
> > +void f6 ()
> > +{
> > + for (int i = 0; i < N; i++)
> > + {
> > + b[i] += a[i];
> > + if (a[i] <= 0)
> > + break;
> > + }
> > +}
> > +
> > diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-
> > supports.exp
> > index
> >
> 05fc417877bcd658931061b7245eb8ba5abd2e09..24a937dbb59b5723af038bd
> 9e
> > 0b89369595fcf87 100644
> > --- a/gcc/testsuite/lib/target-supports.exp
> > +++ b/gcc/testsuite/lib/target-supports.exp
> > @@ -4059,6 +4059,7 @@ proc check_effective_target_vect_early_break { } {
> > return [check_cached_effective_target_indexed vect_early_break {
> > expr {
> > [istarget aarch64*-*-*]
> > + || [check_effective_target_arm_v8_neon_ok]
> > || [check_effective_target_sse4]
> > }}]
> > }
> > @@ -4072,6 +4073,7 @@ proc check_effective_target_vect_early_break_hw { }
> > {
> > return [check_cached_effective_target_indexed vect_early_break_hw {
> > expr {
> > [istarget aarch64*-*-*]
> > + || [check_effective_target_arm_v8_neon_hw]
> > || [check_sse4_hw_available]
> > }}]
> > }
> > @@ -4081,6 +4083,11 @@ proc add_options_for_vect_early_break { flags } {
> > return "$flags"
> > }
> >
> > + if { [check_effective_target_arm_v8_neon_ok] } {
> > + global et_arm_v8_neon_flags
> > + return "$flags $et_arm_v8_neon_flags -march=armv8-a"
> > + }
> > +
> > if { [check_effective_target_sse4] } {
> > return "$flags -msse4.1"
> > }
^ permalink raw reply [flat|nested] 6+ messages in thread
* [PATCH v5 0/19] Support early break/return auto-vectorization
@ 2023-06-28 13:40 Tamar Christina
2023-11-06 7:42 ` [PATCH 20/21]Arm: Add Advanced SIMD cbranch implementation Tamar Christina
0 siblings, 1 reply; 6+ messages in thread
From: Tamar Christina @ 2023-06-28 13:40 UTC (permalink / raw)
To: gcc-patches; +Cc: nd, rguenther, jlaw
[-- Attachment #1: Type: text/plain, Size: 10897 bytes --]
Hi All,
This patch adds initial support for early break vectorization in GCC.
The support is added for any target that implements a vector cbranch optab,
this includes both fully masked and non-masked targets.
Depending on the operation, the vectorizer may also require support for boolean
mask reductions using Inclusive OR. This is however only checked when the
comparison would produce multiple statements.
Concretely the kind of loops supported are of the forms:
for (int i = 0; i < N; i++)
{
<statements1>
if (<condition>)
{
...
<action>;
}
<statements2>
}
where <action> can be:
- break
- return
- goto
Any number of statements can be used before the <action> occurs.
Since this is an initial version for GCC 14 it has the following limitations and
features:
- Only fixed sized iterations and buffers are supported. That is to say any
vectors loaded or stored must be to statically allocated arrays with known
sizes. N must also be known. This limitation is because our primary target
for this optimization is SVE. For VLA SVE we can't easily do cross page
iteration checks. The result is likely to also not be beneficial. For that
reason we punt support for variable buffers till we have First-Faulting
support in GCC.
- any stores in <statements1> should not be to the same objects as in
<condition>. Loads are fine as long as they don't have the possibility to
alias. More concretely, we block RAW dependencies when the intermediate value
can't be separated from the store, or the store itself can't be moved.
- The number of loop iterations must be known, this is just a temporarily
limitation that I intend to address in GCC 14 itself as follow on patches.
- Prologue peeling, alignment peeling and loop versioning are supported.
- Fully masked loops, unmasked loops and partially masked loops are supported
- Any number of loop early exits are supported.
- The early exit must be before the natural loop exit/latch. The vectorizer is
designed in a way to propagate phi-nodes downwards. As such supporting this
inverted control flow is hard.
- No support for epilogue vectorization. The only epilogue supported is the
scalar final one. Epilogue vectorization would also not be profitable.
- Early breaks are only supported for inner loop vectorization.
I have pushed a branch to refs/users/tnfchris/heads/gcc-14-early-break
With the help of IPA and LTO this still gets hit quite often. During bootstrap
it hit rather frequently. Additionally TSVC s332, s481 and s482 all pass now
since these are tests for support for early exit vectorization.
This implementation does not support completely handling the early break inside
the vector loop itself but instead supports adding checks such that if we know
that we have to exit in the current iteration then we branch to scalar code to
actually do the final VF iterations which handles all the code in <action>.
niters analysis and the majority of the vectorizer with hardcoded single_exit
have been updated with the use of a new function vec_loop_iv value which returns
the exit the vectorizer wants to use as the main IV exit.
For niters this exit is what determines the overall iterations as
that is the O(iters) for the loop.
For the scalar loop we know that whatever exit you take you have to perform at
most VF iterations. For vector code we only care about the state of fully
performed iterations and reset the scalar code to the (partially) remaining loop.
This new version of the patch does the majority of the work in a new rewritten
loop peeling. This new function maintains LCSSA all the way through and no
longer requires the touch-up functions the vectorizer used to incrementally
adjust them later on. This means that aside from IV updates and guard edge
updates the early exit code is identical to the single exit cases.
When the loop is peeled during the copying I have to go through great lengths to
keep the dominators up to date. All exits from the first loop are rewired to the
loop header of the second loop. But this can change the immediate dominator.
The dominators can change again when we wire in the loop guard, as such peeling
now returns a list of dominators that need to be updated if a new guard edge is
added.
For the loop peeling we rewrite the loop form:
Header
---
|x|
2
|
v
-------3<------
early exit | | |
v v | latch
7 4----->6
| |
| v
| 8
| |
| v
------>5
into
Header
---
|x|
2
|
v
-------3<------
early exit | | |
v v | latch
7 4----->6
| |
| v
| 8
| |
| v
| New Header
| ---
----->|x|
9
|
v
------10<-----
early exit | | |
v v | latch
14 11---->13
| |
| v
| 12
| |
| v
------> 5
That is to say, the first vector loop executes so long as the early exit isn't
needed. Once the exit is taken, the scalar code will perform at most VF extra
iterations. The exact number depending on peeling and iteration start and which
exit was taken (natural or early). For this scalar loop, all early exits are
treated the same.
When we vectorize we move any statement not related to the early break itself
and that would be incorrect to execute before the break (i.e. has side effects)
to after the break. If this is not possible we decline to vectorize.
This means that we check at the start of iterations whether we are going to exit
or not. During the analysis phase we check whether we are allowed to do this
moving of statements. Also note that we only move the scalar statements, but
only do so after peeling but just before we start transforming statements.
Codegen:
for e.g.
#define N 803
unsigned vect_a[N];
unsigned vect_b[N];
unsigned test4(unsigned x)
{
unsigned ret = 0;
for (int i = 0; i < N; i++)
{
vect_b[i] = x + i;
if (vect_a[i] > x)
break;
vect_a[i] = x;
}
return ret;
}
We generate for Adv. SIMD:
test4:
adrp x2, .LC0
adrp x3, .LANCHOR0
dup v2.4s, w0
add x3, x3, :lo12:.LANCHOR0
movi v4.4s, 0x4
add x4, x3, 3216
ldr q1, [x2, #:lo12:.LC0]
mov x1, 0
mov w2, 0
.p2align 3,,7
.L3:
ldr q0, [x3, x1]
add v3.4s, v1.4s, v2.4s
add v1.4s, v1.4s, v4.4s
cmhi v0.4s, v0.4s, v2.4s
umaxp v0.4s, v0.4s, v0.4s
fmov x5, d0
cbnz x5, .L6
add w2, w2, 1
str q3, [x1, x4]
str q2, [x3, x1]
add x1, x1, 16
cmp w2, 200
bne .L3
mov w7, 3
.L2:
lsl w2, w2, 2
add x5, x3, 3216
add w6, w2, w0
sxtw x4, w2
ldr w1, [x3, x4, lsl 2]
str w6, [x5, x4, lsl 2]
cmp w0, w1
bcc .L4
add w1, w2, 1
str w0, [x3, x4, lsl 2]
add w6, w1, w0
sxtw x1, w1
ldr w4, [x3, x1, lsl 2]
str w6, [x5, x1, lsl 2]
cmp w0, w4
bcc .L4
add w4, w2, 2
str w0, [x3, x1, lsl 2]
sxtw x1, w4
add w6, w1, w0
ldr w4, [x3, x1, lsl 2]
str w6, [x5, x1, lsl 2]
cmp w0, w4
bcc .L4
str w0, [x3, x1, lsl 2]
add w2, w2, 3
cmp w7, 3
beq .L4
sxtw x1, w2
add w2, w2, w0
ldr w4, [x3, x1, lsl 2]
str w2, [x5, x1, lsl 2]
cmp w0, w4
bcc .L4
str w0, [x3, x1, lsl 2]
.L4:
mov w0, 0
ret
.p2align 2,,3
.L6:
mov w7, 4
b .L2
and for SVE:
test4:
adrp x2, .LANCHOR0
add x2, x2, :lo12:.LANCHOR0
add x5, x2, 3216
mov x3, 0
mov w1, 0
cntw x4
mov z1.s, w0
index z0.s, #0, #1
ptrue p1.b, all
ptrue p0.s, all
.p2align 3,,7
.L3:
ld1w z2.s, p1/z, [x2, x3, lsl 2]
add z3.s, z0.s, z1.s
cmplo p2.s, p0/z, z1.s, z2.s
b.any .L2
st1w z3.s, p1, [x5, x3, lsl 2]
add w1, w1, 1
st1w z1.s, p1, [x2, x3, lsl 2]
add x3, x3, x4
incw z0.s
cmp w3, 803
bls .L3
.L5:
mov w0, 0
ret
.p2align 2,,3
.L2:
cntw x5
mul w1, w1, w5
cbz w5, .L5
sxtw x1, w1
sub w5, w5, #1
add x5, x5, x1
add x6, x2, 3216
b .L6
.p2align 2,,3
.L14:
str w0, [x2, x1, lsl 2]
cmp x1, x5
beq .L5
mov x1, x4
.L6:
ldr w3, [x2, x1, lsl 2]
add w4, w0, w1
str w4, [x6, x1, lsl 2]
add x4, x1, 1
cmp w0, w3
bcs .L14
mov w0, 0
ret
On the workloads this work is based on we see between 2-3x performance uplift
using this patch.
Follow up plan:
- Boolean vectorization has several shortcomings. I've filed PR110223 with the
bigger ones that cause vectorization to fail with this patch.
- SLP support. This is planned for GCC 15 as for majority of the cases build
SLP itself fails. This means I'll need to spend time in making this more
robust first. Additionally it requires:
* Adding support for vectorizing CFG (gconds)
* Support for CFG to differ between vector and scalar loops.
Both of which would be disruptive to the tree and I suspect I'll be handling
fallouts from this patch for a while. So I plan to work on the surrounding
building blocks first for the remainder of the year.
Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
Also ran across various workloads and no issues.
When closer to acceptance I will run on other targets as well and clean up
related testsuite fallouts there.
--- inline copy of patch --
--
[-- Attachment #2: rb17494.patch --]
[-- Type: text/plain, Size: 0 bytes --]
^ permalink raw reply [flat|nested] 6+ messages in thread
* [PATCH 20/21]Arm: Add Advanced SIMD cbranch implementation
2023-06-28 13:40 [PATCH v5 0/19] Support early break/return auto-vectorization Tamar Christina
@ 2023-11-06 7:42 ` Tamar Christina
2023-11-27 12:48 ` Kyrylo Tkachov
0 siblings, 1 reply; 6+ messages in thread
From: Tamar Christina @ 2023-11-06 7:42 UTC (permalink / raw)
To: gcc-patches
Cc: nd, Ramana.Radhakrishnan, Richard.Earnshaw, nickc, Kyrylo.Tkachov
[-- Attachment #1: Type: text/plain, Size: 6094 bytes --]
Hi All,
This adds an implementation for conditional branch optab for AArch32.
For e.g.
void f1 ()
{
for (int i = 0; i < N; i++)
{
b[i] += a[i];
if (a[i] > 0)
break;
}
}
For 128-bit vectors we generate:
vcgt.s32 q8, q9, #0
vpmax.u32 d7, d16, d17
vpmax.u32 d7, d7, d7
vmov r3, s14 @ int
cmp r3, #0
and of 64-bit vector we can omit one vpmax as we still need to compress to
32-bits.
Bootstrapped Regtested on arm-none-linux-gnueabihf and no issues.
Ok for master?
Thanks,
Tamar
gcc/ChangeLog:
* config/arm/neon.md (cbranch<mode>4): New.
gcc/testsuite/ChangeLog:
* lib/target-supports.exp (vect_early_break): Add AArch32.
* gcc.target/arm/vect-early-break-cbranch.c: New test.
--- inline copy of patch --
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index d213369ffc38fb88ad0357d848cc7da5af73bab7..130efbc37cfe3128533599dfadc344d2243dcb63 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -408,6 +408,45 @@ (define_insn "vec_extract<mode><V_elem_l>"
[(set_attr "type" "neon_store1_one_lane<q>,neon_to_gp<q>")]
)
+;; Patterns comparing two vectors and conditionally jump.
+;; Avdanced SIMD lacks a vector != comparison, but this is a quite common
+;; operation. To not pay the penalty for inverting == we can map our any
+;; comparisons to all i.e. any(~x) => all(x).
+;;
+;; However unlike the AArch64 version, we can't optimize this further as the
+;; chain is too long for combine due to these being unspecs so it doesn't fold
+;; the operation to something simpler.
+(define_expand "cbranch<mode>4"
+ [(set (pc) (if_then_else
+ (match_operator 0 "expandable_comparison_operator"
+ [(match_operand:VDQI 1 "register_operand")
+ (match_operand:VDQI 2 "zero_operand")])
+ (label_ref (match_operand 3 "" ""))
+ (pc)))]
+ "TARGET_NEON"
+{
+ rtx mask = operands[1];
+
+ /* For 128-bit vectors we need an additional reductions. */
+ if (known_eq (128, GET_MODE_BITSIZE (<MODE>mode)))
+ {
+ /* Always reduce using a V4SI. */
+ mask = gen_reg_rtx (V2SImode);
+ rtx low = gen_reg_rtx (V2SImode);
+ rtx high = gen_reg_rtx (V2SImode);
+ emit_insn (gen_neon_vget_lowv4si (low, operands[1]));
+ emit_insn (gen_neon_vget_highv4si (high, operands[1]));
+ emit_insn (gen_neon_vpumaxv2si (mask, low, high));
+ }
+
+ emit_insn (gen_neon_vpumaxv2si (mask, mask, mask));
+
+ rtx val = gen_reg_rtx (SImode);
+ emit_move_insn (val, gen_lowpart (SImode, mask));
+ emit_jump_insn (gen_cbranch_cc (operands[0], val, const0_rtx, operands[3]));
+ DONE;
+})
+
;; This pattern is renamed from "vec_extract<mode><V_elem_l>" to
;; "neon_vec_extract<mode><V_elem_l>" and this pattern is called
;; by define_expand in vec-common.md file.
diff --git a/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c b/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c
new file mode 100644
index 0000000000000000000000000000000000000000..2c05aa10d26ed4ac9785672e6e3b4355cef046dc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c
@@ -0,0 +1,136 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-require-effective-target arm32 } */
+/* { dg-options "-O3 -march=armv8-a+simd -mfpu=auto -mfloat-abi=hard" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#define N 640
+int a[N] = {0};
+int b[N] = {0};
+
+/* f1:
+** ...
+** vcgt.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f1 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] > 0)
+ break;
+ }
+}
+
+/*
+** f2:
+** ...
+** vcge.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f2 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] >= 0)
+ break;
+ }
+}
+
+/*
+** f3:
+** ...
+** vceq.i32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f3 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] == 0)
+ break;
+ }
+}
+
+/*
+** f4:
+** ...
+** vceq.i32 q[0-9]+, q[0-9]+, #0
+** vmvn q[0-9]+, q[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f4 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] != 0)
+ break;
+ }
+}
+
+/*
+** f5:
+** ...
+** vclt.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f5 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] < 0)
+ break;
+ }
+}
+
+/*
+** f6:
+** ...
+** vcle.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f6 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] <= 0)
+ break;
+ }
+}
+
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 5516188dc0aa86d161d67dea5a7769e3c3d72f85..8f58671e6cfd3546c6a98e40341fe31c6492594b 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -3784,6 +3784,7 @@ proc check_effective_target_vect_early_break { } {
return [check_cached_effective_target_indexed vect_early_break {
expr {
[istarget aarch64*-*-*]
+ || [check_effective_target_arm_neon_ok]
}}]
}
# Return 1 if the target supports hardware vectorization of complex additions of
--
[-- Attachment #2: rb17512.patch --]
[-- Type: text/plain, Size: 5281 bytes --]
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index d213369ffc38fb88ad0357d848cc7da5af73bab7..130efbc37cfe3128533599dfadc344d2243dcb63 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -408,6 +408,45 @@ (define_insn "vec_extract<mode><V_elem_l>"
[(set_attr "type" "neon_store1_one_lane<q>,neon_to_gp<q>")]
)
+;; Patterns comparing two vectors and conditionally jump.
+;; Avdanced SIMD lacks a vector != comparison, but this is a quite common
+;; operation. To not pay the penalty for inverting == we can map our any
+;; comparisons to all i.e. any(~x) => all(x).
+;;
+;; However unlike the AArch64 version, we can't optimize this further as the
+;; chain is too long for combine due to these being unspecs so it doesn't fold
+;; the operation to something simpler.
+(define_expand "cbranch<mode>4"
+ [(set (pc) (if_then_else
+ (match_operator 0 "expandable_comparison_operator"
+ [(match_operand:VDQI 1 "register_operand")
+ (match_operand:VDQI 2 "zero_operand")])
+ (label_ref (match_operand 3 "" ""))
+ (pc)))]
+ "TARGET_NEON"
+{
+ rtx mask = operands[1];
+
+ /* For 128-bit vectors we need an additional reductions. */
+ if (known_eq (128, GET_MODE_BITSIZE (<MODE>mode)))
+ {
+ /* Always reduce using a V4SI. */
+ mask = gen_reg_rtx (V2SImode);
+ rtx low = gen_reg_rtx (V2SImode);
+ rtx high = gen_reg_rtx (V2SImode);
+ emit_insn (gen_neon_vget_lowv4si (low, operands[1]));
+ emit_insn (gen_neon_vget_highv4si (high, operands[1]));
+ emit_insn (gen_neon_vpumaxv2si (mask, low, high));
+ }
+
+ emit_insn (gen_neon_vpumaxv2si (mask, mask, mask));
+
+ rtx val = gen_reg_rtx (SImode);
+ emit_move_insn (val, gen_lowpart (SImode, mask));
+ emit_jump_insn (gen_cbranch_cc (operands[0], val, const0_rtx, operands[3]));
+ DONE;
+})
+
;; This pattern is renamed from "vec_extract<mode><V_elem_l>" to
;; "neon_vec_extract<mode><V_elem_l>" and this pattern is called
;; by define_expand in vec-common.md file.
diff --git a/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c b/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c
new file mode 100644
index 0000000000000000000000000000000000000000..2c05aa10d26ed4ac9785672e6e3b4355cef046dc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c
@@ -0,0 +1,136 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-require-effective-target arm32 } */
+/* { dg-options "-O3 -march=armv8-a+simd -mfpu=auto -mfloat-abi=hard" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#define N 640
+int a[N] = {0};
+int b[N] = {0};
+
+/* f1:
+** ...
+** vcgt.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f1 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] > 0)
+ break;
+ }
+}
+
+/*
+** f2:
+** ...
+** vcge.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f2 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] >= 0)
+ break;
+ }
+}
+
+/*
+** f3:
+** ...
+** vceq.i32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f3 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] == 0)
+ break;
+ }
+}
+
+/*
+** f4:
+** ...
+** vceq.i32 q[0-9]+, q[0-9]+, #0
+** vmvn q[0-9]+, q[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f4 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] != 0)
+ break;
+ }
+}
+
+/*
+** f5:
+** ...
+** vclt.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f5 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] < 0)
+ break;
+ }
+}
+
+/*
+** f6:
+** ...
+** vcle.s32 q[0-9]+, q[0-9]+, #0
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
+** vmov r[0-9]+, s[0-9]+ @ int
+** cmp r[0-9]+, #0
+** bne \.L[0-9]+
+** ...
+*/
+void f6 ()
+{
+ for (int i = 0; i < N; i++)
+ {
+ b[i] += a[i];
+ if (a[i] <= 0)
+ break;
+ }
+}
+
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 5516188dc0aa86d161d67dea5a7769e3c3d72f85..8f58671e6cfd3546c6a98e40341fe31c6492594b 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -3784,6 +3784,7 @@ proc check_effective_target_vect_early_break { } {
return [check_cached_effective_target_indexed vect_early_break {
expr {
[istarget aarch64*-*-*]
+ || [check_effective_target_arm_neon_ok]
}}]
}
# Return 1 if the target supports hardware vectorization of complex additions of
^ permalink raw reply [flat|nested] 6+ messages in thread
* RE: [PATCH 20/21]Arm: Add Advanced SIMD cbranch implementation
2023-11-06 7:42 ` [PATCH 20/21]Arm: Add Advanced SIMD cbranch implementation Tamar Christina
@ 2023-11-27 12:48 ` Kyrylo Tkachov
0 siblings, 0 replies; 6+ messages in thread
From: Kyrylo Tkachov @ 2023-11-27 12:48 UTC (permalink / raw)
To: Tamar Christina, gcc-patches
Cc: nd, Ramana Radhakrishnan, Richard Earnshaw, nickc
Hi Tamar,
> -----Original Message-----
> From: Tamar Christina <Tamar.Christina@arm.com>
> Sent: Monday, November 6, 2023 7:43 AM
> To: gcc-patches@gcc.gnu.org
> Cc: nd <nd@arm.com>; Ramana Radhakrishnan
> <Ramana.Radhakrishnan@arm.com>; Richard Earnshaw
> <Richard.Earnshaw@arm.com>; nickc@redhat.com; Kyrylo Tkachov
> <Kyrylo.Tkachov@arm.com>
> Subject: [PATCH 20/21]Arm: Add Advanced SIMD cbranch implementation
>
> Hi All,
>
> This adds an implementation for conditional branch optab for AArch32.
>
> For e.g.
>
> void f1 ()
> {
> for (int i = 0; i < N; i++)
> {
> b[i] += a[i];
> if (a[i] > 0)
> break;
> }
> }
>
> For 128-bit vectors we generate:
>
> vcgt.s32 q8, q9, #0
> vpmax.u32 d7, d16, d17
> vpmax.u32 d7, d7, d7
> vmov r3, s14 @ int
> cmp r3, #0
>
> and for 64-bit vectors we can omit one vpmax as we still need to compress to
> 32-bits.
>
> Bootstrapped Regtested on arm-none-linux-gnueabihf and no issues.
>
> Ok for master?
>
This is okay once the prerequisites go in.
Thanks,
Kyrill
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> * config/arm/neon.md (cbranch<mode>4): New.
>
> gcc/testsuite/ChangeLog:
>
> * lib/target-supports.exp (vect_early_break): Add AArch32.
> * gcc.target/arm/vect-early-break-cbranch.c: New test.
>
> --- inline copy of patch --
> diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
> index
> d213369ffc38fb88ad0357d848cc7da5af73bab7..130efbc37cfe3128533599dfadc
> 344d2243dcb63 100644
> --- a/gcc/config/arm/neon.md
> +++ b/gcc/config/arm/neon.md
> @@ -408,6 +408,45 @@ (define_insn "vec_extract<mode><V_elem_l>"
> [(set_attr "type" "neon_store1_one_lane<q>,neon_to_gp<q>")]
> )
>
> +;; Patterns comparing two vectors and conditionally jump.
> +;; Advanced SIMD lacks a vector != comparison, but this is a quite common
> +;; operation. To not pay the penalty for inverting == we can map our any
> +;; comparisons to all i.e. any(~x) => all(x).
> +;;
> +;; However unlike the AArch64 version, we can't optimize this further as the
> +;; chain is too long for combine due to these being unspecs so it doesn't fold
> +;; the operation to something simpler.
> +(define_expand "cbranch<mode>4"
> + [(set (pc) (if_then_else
> + (match_operator 0 "expandable_comparison_operator"
> + [(match_operand:VDQI 1 "register_operand")
> + (match_operand:VDQI 2 "zero_operand")])
> + (label_ref (match_operand 3 "" ""))
> + (pc)))]
> + "TARGET_NEON"
> +{
> + rtx mask = operands[1];
> +
> + /* For 128-bit vectors we need an additional reduction. */
> + if (known_eq (128, GET_MODE_BITSIZE (<MODE>mode)))
> + {
> + /* Always reduce using a V4SI. */
> + mask = gen_reg_rtx (V2SImode);
> + rtx low = gen_reg_rtx (V2SImode);
> + rtx high = gen_reg_rtx (V2SImode);
> + emit_insn (gen_neon_vget_lowv4si (low, operands[1]));
> + emit_insn (gen_neon_vget_highv4si (high, operands[1]));
> + emit_insn (gen_neon_vpumaxv2si (mask, low, high));
> + }
> +
> + emit_insn (gen_neon_vpumaxv2si (mask, mask, mask));
> +
> + rtx val = gen_reg_rtx (SImode);
> + emit_move_insn (val, gen_lowpart (SImode, mask));
> + emit_jump_insn (gen_cbranch_cc (operands[0], val, const0_rtx, operands[3]));
> + DONE;
> +})
> +
> ;; This pattern is renamed from "vec_extract<mode><V_elem_l>" to
> ;; "neon_vec_extract<mode><V_elem_l>" and this pattern is called
> ;; by define_expand in vec-common.md file.
> diff --git a/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c
> b/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..2c05aa10d26ed4ac9785672e
> 6e3b4355cef046dc
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/vect-early-break-cbranch.c
> @@ -0,0 +1,136 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target arm_neon_ok } */
> +/* { dg-require-effective-target arm32 } */
> +/* { dg-options "-O3 -march=armv8-a+simd -mfpu=auto -mfloat-abi=hard" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +#define N 640
> +int a[N] = {0};
> +int b[N] = {0};
> +
> +/* f1:
> +** ...
> +** vcgt.s32 q[0-9]+, q[0-9]+, #0
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vmov r[0-9]+, s[0-9]+ @ int
> +** cmp r[0-9]+, #0
> +** bne \.L[0-9]+
> +** ...
> +*/
> +void f1 ()
> +{
> + for (int i = 0; i < N; i++)
> + {
> + b[i] += a[i];
> + if (a[i] > 0)
> + break;
> + }
> +}
> +
> +/*
> +** f2:
> +** ...
> +** vcge.s32 q[0-9]+, q[0-9]+, #0
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vmov r[0-9]+, s[0-9]+ @ int
> +** cmp r[0-9]+, #0
> +** bne \.L[0-9]+
> +** ...
> +*/
> +void f2 ()
> +{
> + for (int i = 0; i < N; i++)
> + {
> + b[i] += a[i];
> + if (a[i] >= 0)
> + break;
> + }
> +}
> +
> +/*
> +** f3:
> +** ...
> +** vceq.i32 q[0-9]+, q[0-9]+, #0
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vmov r[0-9]+, s[0-9]+ @ int
> +** cmp r[0-9]+, #0
> +** bne \.L[0-9]+
> +** ...
> +*/
> +void f3 ()
> +{
> + for (int i = 0; i < N; i++)
> + {
> + b[i] += a[i];
> + if (a[i] == 0)
> + break;
> + }
> +}
> +
> +/*
> +** f4:
> +** ...
> +** vceq.i32 q[0-9]+, q[0-9]+, #0
> +** vmvn q[0-9]+, q[0-9]+
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vmov r[0-9]+, s[0-9]+ @ int
> +** cmp r[0-9]+, #0
> +** bne \.L[0-9]+
> +** ...
> +*/
> +void f4 ()
> +{
> + for (int i = 0; i < N; i++)
> + {
> + b[i] += a[i];
> + if (a[i] != 0)
> + break;
> + }
> +}
> +
> +/*
> +** f5:
> +** ...
> +** vclt.s32 q[0-9]+, q[0-9]+, #0
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vmov r[0-9]+, s[0-9]+ @ int
> +** cmp r[0-9]+, #0
> +** bne \.L[0-9]+
> +** ...
> +*/
> +void f5 ()
> +{
> + for (int i = 0; i < N; i++)
> + {
> + b[i] += a[i];
> + if (a[i] < 0)
> + break;
> + }
> +}
> +
> +/*
> +** f6:
> +** ...
> +** vcle.s32 q[0-9]+, q[0-9]+, #0
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vpmax.u32 d[0-9]+, d[0-9]+, d[0-9]+
> +** vmov r[0-9]+, s[0-9]+ @ int
> +** cmp r[0-9]+, #0
> +** bne \.L[0-9]+
> +** ...
> +*/
> +void f6 ()
> +{
> + for (int i = 0; i < N; i++)
> + {
> + b[i] += a[i];
> + if (a[i] <= 0)
> + break;
> + }
> +}
> +
> diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-
> supports.exp
> index
> 5516188dc0aa86d161d67dea5a7769e3c3d72f85..8f58671e6cfd3546c6a98e4034
> 1fe31c6492594b 100644
> --- a/gcc/testsuite/lib/target-supports.exp
> +++ b/gcc/testsuite/lib/target-supports.exp
> @@ -3784,6 +3784,7 @@ proc check_effective_target_vect_early_break { } {
> return [check_cached_effective_target_indexed vect_early_break {
> expr {
> [istarget aarch64*-*-*]
> + || [check_effective_target_arm_neon_ok]
> }}]
> }
> # Return 1 if the target supports hardware vectorization of complex additions of
>
>
>
>
> --
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2024-01-04 11:27 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-12-29 14:42 [PATCH 20/21]Arm: Add Advanced SIMD cbranch implementation Tamar Christina
2024-01-04 11:06 ` Tamar Christina
2024-01-04 11:12 ` Kyrylo Tkachov
2024-01-04 11:26 ` Tamar Christina
-- strict thread matches above, loose matches on Subject: below --
2023-06-28 13:40 [PATCH v5 0/19] Support early break/return auto-vectorization Tamar Christina
2023-11-06 7:42 ` [PATCH 20/21]Arm: Add Advanced SIMD cbranch implementation Tamar Christina
2023-11-27 12:48 ` Kyrylo Tkachov
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).