* [pushed] aarch64: Remove redundant zeroing/merging in SVE intrinsics [PR106326]
@ 2023-11-27 14:45 Richard Sandiford
0 siblings, 0 replies; only message in thread
From: Richard Sandiford @ 2023-11-27 14:45 UTC (permalink / raw)
To: gcc-patches
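Many predicated SVE intrinsics provide three forms of predication:
zeroing (_z), merging (_m), and any/don't-care (_x). All three are
equivalent when the predicate is all-true for the elements being
operated on, so this patch redirects _z and _m calls to the _x form
in that case, which drops the redundant zeroing and merging.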
Tested on aarch64-linux-gnu & pushed.
Richard
gcc/
PR target/106326
* config/aarch64/aarch64-sve-builtins.h (is_ptrue): Declare.
(gimple_folder::redirect_pred_x): Likewise.
* config/aarch64/aarch64-sve-builtins.cc (is_ptrue): New function.
(gimple_folder::redirect_pred_x): Likewise.
(gimple_folder::fold): Use it.
gcc/testsuite/
PR target/106326
* gcc.target/aarch64/sve/acle/general/pr106326_1.c: New test.
---
gcc/config/aarch64/aarch64-sve-builtins.cc | 46 +++
gcc/config/aarch64/aarch64-sve-builtins.h | 3 +
.../aarch64/sve/acle/general/pr106326_1.c | 378 ++++++++++++++++++
3 files changed, 427 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr106326_1.c
diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc b/gcc/config/aarch64/aarch64-sve-builtins.cc
index b61156302cf..ee81282a0be 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins.cc
@@ -2561,6 +2561,17 @@ vector_cst_all_same (tree v, unsigned int step)
return true;
}
+/* Return true if V is a constant predicate that acts as a ptrue when
+ predicating STEP-byte elements. All four checks below must hold; any other V gives false. */
+bool
+is_ptrue (tree v, unsigned int step)
+{
+ return (TREE_CODE (v) == VECTOR_CST /* V must be a constant vector... */
+ && TYPE_MODE (TREE_TYPE (v)) == VNx16BImode /* ...whose type has mode VNx16BI... */
+ && integer_nonzerop (VECTOR_CST_ENCODED_ELT (v, 0)) /* ...whose encoded element 0 is nonzero... */
+ && vector_cst_all_same (v, step)); /* NOTE(review): presumably "elements STEP apart all match" -- confirm against vector_cst_all_same. */
+}
+
gimple_folder::gimple_folder (const function_instance &instance, tree fndecl,
gimple_stmt_iterator *gsi_in, gcall *call_in)
: function_call_info (gimple_location (call_in), instance, fndecl),
@@ -2635,6 +2646,37 @@ gimple_folder::redirect_call (const function_instance &instance)
return call;
}
+/* Redirect _z and _m calls to _x functions if the predicate is all-true.
+ This allows us to use unpredicated instructions, where available. Return the redirected call on success; otherwise return null and leave the call alone. */
+gimple *
+gimple_folder::redirect_pred_x ()
+{
+ if (pred != PRED_z && pred != PRED_m) /* Only the zeroing and merging forms are candidates. */
+ return nullptr;
+
+ if (gimple_call_num_args (call) < 2) /* Argument 0 and the second argument's type are both read below. */
+ return nullptr;
+
+ tree lhs_type = TREE_TYPE (TREE_TYPE (fndecl)); /* Type of the function's return value. */
+ tree arg0_type = type_argument_type (TREE_TYPE (fndecl), 1); /* NOTE(review): argument numbers given to type_argument_type look 1-based -- confirm. */
+ tree arg1_type = type_argument_type (TREE_TYPE (fndecl), 2);
+ if (!VECTOR_TYPE_P (lhs_type) /* The element-size reasoning below only applies to vector types. */
+ || !VECTOR_TYPE_P (arg0_type)
+ || !VECTOR_TYPE_P (arg1_type))
+ return nullptr;
+
+ auto lhs_step = element_precision (lhs_type); /* Element width of the result, in bits. */
+ auto rhs_step = element_precision (arg1_type); /* Element width of the second argument, in bits. */
+ auto step = MAX (lhs_step, rhs_step); /* Test the predicate at the wider of the two element sizes. */
+ if (!multiple_p (step, BITS_PER_UNIT) /* is_ptrue takes a size in bytes, so reject sizes that are not whole bytes. */
+ || !is_ptrue (gimple_call_arg (call, 0), step / BITS_PER_UNIT)) /* Argument 0 is the value tested as the predicate. */
+ return nullptr;
+
+ function_instance instance (*this); /* Start from a copy of the current function instance... */
+ instance.pred = PRED_x; /* ...and change only its predication to the _x form. */
+ return redirect_call (instance);
+}
+
/* Fold the call to constant VAL. */
gimple *
gimple_folder::fold_to_cstu (poly_uint64 val)
@@ -2707,6 +2749,10 @@ gimple_folder::fold ()
if (!lhs && TREE_TYPE (gimple_call_fntype (call)) != void_type_node)
return NULL;
+ /* First try some simplifications that are common to many functions. */
+ if (auto *call = redirect_pred_x ())
+ return call;
+
return base->fold (*this);
}
diff --git a/gcc/config/aarch64/aarch64-sve-builtins.h b/gcc/config/aarch64/aarch64-sve-builtins.h
index d646df1c026..b9148c51b28 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins.h
+++ b/gcc/config/aarch64/aarch64-sve-builtins.h
@@ -500,6 +500,8 @@ public:
tree load_store_cookie (tree);
gimple *redirect_call (const function_instance &);
+ gimple *redirect_pred_x ();
+
gimple *fold_to_cstu (poly_uint64);
gimple *fold_to_pfalse ();
gimple *fold_to_ptrue ();
@@ -673,6 +675,7 @@ extern tree acle_svpattern;
extern tree acle_svprfop;
bool vector_cst_all_same (tree, unsigned int);
+bool is_ptrue (tree, unsigned int);
/* Return the ACLE type svbool_t. */
inline tree
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr106326_1.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr106326_1.c
new file mode 100644
index 00000000000..34604a8df6c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr106326_1.c
@@ -0,0 +1,378 @@
+/* { dg-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_sve.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+** add1:
+** add z0\.s, (z1\.s, z0\.s|z0\.s, z1\.s)
+** ret
+*/
+svint32_t
+add1 (svint32_t x, svint32_t y)
+{
+ return svadd_z (svptrue_b8 (), x, y); /* _z, b8 ptrue, 32-bit elements: the pattern above expects a single unpredicated add. */
+}
+
+/*
+** add2:
+** add z0\.s, (z1\.s, z0\.s|z0\.s, z1\.s)
+** ret
+*/
+svint32_t
+add2 (svint32_t x, svint32_t y)
+{
+ return svadd_z (svptrue_b16 (), x, y); /* _z, b16 ptrue, 32-bit elements: the pattern above expects a single unpredicated add. */
+}
+
+/*
+** add3:
+** add z0\.s, (z1\.s, z0\.s|z0\.s, z1\.s)
+** ret
+*/
+svint32_t
+add3 (svint32_t x, svint32_t y)
+{
+ return svadd_z (svptrue_b32 (), x, y); /* _z, b32 ptrue, 32-bit elements: the pattern above expects a single unpredicated add. */
+}
+
+/*
+** add4:
+** ...
+** movprfx [^\n]+
+** ...
+** ret
+*/
+svint32_t
+add4 (svint32_t x, svint32_t y)
+{
+ return svadd_z (svptrue_b64 (), x, y); /* _z, b64 ptrue, 32-bit elements: the pattern above still expects a movprfx, i.e. the zeroing is apparently kept. */
+}
+
+/*
+** add5:
+** add z0\.s, (z1\.s, z0\.s|z0\.s, z1\.s)
+** ret
+*/
+svint32_t
+add5 (svint32_t x, svint32_t y)
+{
+ return svadd_m (svptrue_b8 (), x, y); /* _m, b8 ptrue, 32-bit elements: the pattern above expects a single unpredicated add. */
+}
+
+/*
+** add6:
+** add z0\.s, (z1\.s, z0\.s|z0\.s, z1\.s)
+** ret
+*/
+svint32_t
+add6 (svint32_t x, svint32_t y)
+{
+ return svadd_m (svptrue_b16 (), x, y); /* _m, b16 ptrue, 32-bit elements: the pattern above expects a single unpredicated add. */
+}
+
+/*
+** add7:
+** add z0\.s, (z1\.s, z0\.s|z0\.s, z1\.s)
+** ret
+*/
+svint32_t
+add7 (svint32_t x, svint32_t y)
+{
+ return svadd_m (svptrue_b32 (), x, y); /* _m, b32 ptrue, 32-bit elements: the pattern above expects a single unpredicated add. */
+}
+
+/*
+** add8:
+** ptrue (p[0-7])\.d(?:, all)?
+** add z0\.s, \1/m, z0\.s, z1\.s
+** ret
+*/
+svint32_t
+add8 (svint32_t x, svint32_t y)
+{
+ return svadd_m (svptrue_b64 (), x, y); /* _m, b64 ptrue, 32-bit elements: the pattern above expects the ptrue .d and the /m add to remain. */
+}
+
+/*
+** add9:
+** ptrue (p[0-7])\.s(?:, all)?
+** add z0\.h, \1/m, z0\.h, z1\.h
+** ret
+*/
+svint16_t
+add9 (svint16_t x, svint16_t y)
+{
+ return svadd_m (svptrue_b32 (), x, y); /* _m, b32 ptrue, 16-bit elements: the pattern above expects the ptrue .s and the /m add to remain. */
+}
+
+/*
+** and1:
+** and z0\.s, z0\.s, #(?:0x)?1
+** ret
+*/
+svint32_t
+and1 (svint32_t x)
+{
+ return svand_z (svptrue_b8 (), x, 1); /* _z, b8 ptrue, 32-bit elements: the pattern above expects an unpredicated AND with immediate 1. */
+}
+
+/*
+** and2:
+** and z0\.s, z0\.s, #(?:0x)?1
+** ret
+*/
+svint32_t
+and2 (svint32_t x)
+{
+ return svand_z (svptrue_b16 (), x, 1); /* _z, b16 ptrue, 32-bit elements: the pattern above expects an unpredicated AND with immediate 1. */
+}
+
+/*
+** and3:
+** and z0\.s, z0\.s, #(?:0x)?1
+** ret
+*/
+svint32_t
+and3 (svint32_t x)
+{
+ return svand_z (svptrue_b32 (), x, 1); /* _z, b32 ptrue, 32-bit elements: the pattern above expects an unpredicated AND with immediate 1. */
+}
+
+/*
+** and4:
+** (?!and z0\.s, z0\.s, #).*
+** ret
+*/
+svint32_t
+and4 (svint32_t x)
+{
+ return svand_z (svptrue_b64 (), x, 1); /* _z, b64 ptrue, 32-bit elements: the pattern above rejects a first instruction of the form "and z0.s, z0.s, #...". */
+}
+
+/*
+** and5:
+** and z0\.s, z0\.s, #(?:0x)?1
+** ret
+*/
+svint32_t
+and5 (svint32_t x)
+{
+ return svand_m (svptrue_b8 (), x, 1); /* _m, b8 ptrue, 32-bit elements: the pattern above expects an unpredicated AND with immediate 1. */
+}
+
+/*
+** and6:
+** and z0\.s, z0\.s, #(?:0x)?1
+** ret
+*/
+svint32_t
+and6 (svint32_t x)
+{
+ return svand_m (svptrue_b16 (), x, 1); /* _m, b16 ptrue, 32-bit elements: the pattern above expects an unpredicated AND with immediate 1. */
+}
+
+/*
+** and7:
+** and z0\.s, z0\.s, #(?:0x)?1
+** ret
+*/
+svint32_t
+and7 (svint32_t x)
+{
+ return svand_m (svptrue_b32 (), x, 1); /* _m, b32 ptrue, 32-bit elements: the pattern above expects an unpredicated AND with immediate 1. */
+}
+
+/*
+** and8:
+** (?!and z0\.s, z0\.s, #).*
+** ret
+*/
+svint32_t
+and8 (svint32_t x)
+{
+ return svand_m (svptrue_b64 (), x, 1); /* _m, b64 ptrue, 32-bit elements: the pattern above rejects a first instruction of the form "and z0.s, z0.s, #...". */
+}
+
+/*
+** and9:
+** (
+** and p0\.b, p0/z, p1\.b, p1\.b
+** |
+** and p0\.b, p1/z, p0\.b, p0\.b
+** )
+** ret
+*/
+svbool_t
+and9 (svbool_t x, svbool_t y)
+{
+ return svand_z (svptrue_b8 (), x, y); /* svbool_t operands: the pattern above expects one predicate AND that still uses /z. */
+}
+
+/*
+** not1:
+** ptrue (p[0-7])\.b(?:, all)?
+** not z0\.s, \1/m, z1\.s
+** ret
+*/
+svint32_t
+not1 (svint32_t x, svint32_t y)
+{
+ return svnot_m (x, svptrue_b8 (), y); /* _m form whose first argument is x rather than the predicate: the pattern above expects ptrue .b plus a /m NOT. */
+}
+
+/*
+** cvt1:
+** ptrue (p[0-7])\.b(?:, all)?
+** fcvtzs z0\.s, \1/m, z0\.h
+** ret
+*/
+svint32_t
+cvt1 (svfloat16_t x)
+{
+ return svcvt_s32_z (svptrue_b8 (), x); /* f16 -> s32, _z, b8 ptrue: the pattern above expects only ptrue .b plus fcvtzs /m (no movprfx). */
+}
+
+/*
+** cvt2:
+** ptrue (p[0-7])\.b(?:, all)?
+** fcvtzs z0\.s, \1/m, z0\.h
+** ret
+*/
+svint32_t
+cvt2 (svfloat16_t x)
+{
+ return svcvt_s32_z (svptrue_b16 (), x); /* f16 -> s32, _z, b16 ptrue: the pattern above expects only ptrue .b plus fcvtzs /m (no movprfx). */
+}
+
+/*
+** cvt3:
+** ptrue (p[0-7])\.b(?:, all)?
+** fcvtzs z0\.s, \1/m, z0\.h
+** ret
+*/
+svint32_t
+cvt3 (svfloat16_t x)
+{
+ return svcvt_s32_z (svptrue_b32 (), x); /* f16 -> s32, _z, b32 ptrue: the pattern above expects only ptrue .b plus fcvtzs /m (no movprfx). */
+}
+
+/*
+** cvt4:
+** ...
+** movprfx [^\n]+
+** ...
+** ret
+*/
+svint32_t
+cvt4 (svfloat16_t x)
+{
+ return svcvt_s32_z (svptrue_b64 (), x); /* f16 -> s32, _z, b64 ptrue: the pattern above still expects a movprfx, i.e. the zeroing is apparently kept. */
+}
+
+/*
+** cvt5:
+** ptrue (p[0-7])\.b(?:, all)?
+** fcvt z0\.h, \1/m, z0\.s
+** ret
+*/
+svfloat16_t
+cvt5 (svfloat32_t x)
+{
+ return svcvt_f16_z (svptrue_b8 (), x); /* f32 -> f16, _z, b8 ptrue: the pattern above expects only ptrue .b plus fcvt /m (no movprfx). */
+}
+
+/*
+** cvt6:
+** ptrue (p[0-7])\.b(?:, all)?
+** fcvt z0\.h, \1/m, z0\.s
+** ret
+*/
+svfloat16_t
+cvt6 (svfloat32_t x)
+{
+ return svcvt_f16_z (svptrue_b16 (), x); /* f32 -> f16, _z, b16 ptrue: the pattern above expects only ptrue .b plus fcvt /m (no movprfx). */
+}
+
+/*
+** cvt7:
+** ptrue (p[0-7])\.b(?:, all)?
+** fcvt z0\.h, \1/m, z0\.s
+** ret
+*/
+svfloat16_t
+cvt7 (svfloat32_t x)
+{
+ return svcvt_f16_z (svptrue_b32 (), x); /* f32 -> f16, _z, b32 ptrue: the pattern above expects only ptrue .b plus fcvt /m (no movprfx). */
+}
+
+/*
+** cvt8:
+** ...
+** movprfx [^\n]+
+** ...
+** ret
+*/
+svfloat16_t
+cvt8 (svfloat32_t x)
+{
+ return svcvt_f16_z (svptrue_b64 (), x); /* f32 -> f16, _z, b64 ptrue: the pattern above still expects a movprfx, i.e. the zeroing is apparently kept. */
+}
+
+/*
+** cvt9:
+** ptrue (p[0-7])\.b(?:, all)?
+** scvtf z0\.h, \1/m, z0\.h
+** ret
+*/
+svfloat16_t
+cvt9 (svint16_t x)
+{
+ return svcvt_f16_z (svptrue_b8 (), x); /* s16 -> f16, _z, b8 ptrue: the pattern above expects only ptrue .b plus scvtf /m (no movprfx). */
+}
+
+/*
+** cvt10:
+** ptrue (p[0-7])\.b(?:, all)?
+** scvtf z0\.h, \1/m, z0\.h
+** ret
+*/
+svfloat16_t
+cvt10 (svint16_t x)
+{
+ return svcvt_f16_z (svptrue_b16 (), x); /* s16 -> f16, _z, b16 ptrue: the pattern above expects only ptrue .b plus scvtf /m (no movprfx). */
+}
+
+/*
+** cvt11:
+** ...
+** movprfx [^\n]+
+** ...
+** ret
+*/
+svfloat16_t
+cvt11 (svint16_t x)
+{
+ return svcvt_f16_z (svptrue_b32 (), x); /* s16 -> f16, _z, b32 ptrue on 16-bit elements: the pattern above still expects a movprfx, i.e. the zeroing is apparently kept. */
+}
+
+/*
+** cvt12:
+** ...
+** movprfx [^\n]+
+** ...
+** ret
+*/
+svfloat16_t
+cvt12 (svint16_t x)
+{
+ return svcvt_f16_z (svptrue_b64 (), x); /* s16 -> f16, _z, b64 ptrue on 16-bit elements: the pattern above still expects a movprfx, i.e. the zeroing is apparently kept. */
+}
+
+#ifdef __cplusplus
+}
+#endif
--
2.25.1
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2023-11-27 14:45 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-11-27 14:45 [pushed] aarch64: Remove redundant zeroing/merging in SVE intrinsics [PR106326] Richard Sandiford
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).