* [PATCH]middle-end match.pd: optimize fneg (fabs (x)) to x | (1 << signbit(x)) [PR109154]
@ 2023-09-27 0:50 Tamar Christina
2023-09-27 1:17 ` Andrew Pinski
2023-09-29 15:00 ` Jeff Law
0 siblings, 2 replies; 17+ messages in thread
From: Tamar Christina @ 2023-09-27 0:50 UTC (permalink / raw)
To: gcc-patches; +Cc: nd, rguenther, jlaw
[-- Attachment #1: Type: text/plain, Size: 11489 bytes --]
Hi All,
For targets that allow conversion between int and float modes this adds a new
optimization transforming fneg (fabs (x)) into x | (1 << signbit(x)). Such
sequences are common in scientific code working with gradients.
The transformed instruction, if the target has an inclusive-OR that takes an
immediate, is both shorter and faster.  For those that don't, the immediate has
to be separately constructed, but this still ends up being faster as the immediate
construction is not on the critical path.
Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
Ok for master?
Thanks,
Tamar
gcc/ChangeLog:
PR tree-optimization/109154
* match.pd: Add new neg+abs rule.
gcc/testsuite/ChangeLog:
PR tree-optimization/109154
* gcc.target/aarch64/fneg-abs_1.c: New test.
* gcc.target/aarch64/fneg-abs_2.c: New test.
* gcc.target/aarch64/fneg-abs_3.c: New test.
* gcc.target/aarch64/fneg-abs_4.c: New test.
* gcc.target/aarch64/sve/fneg-abs_1.c: New test.
* gcc.target/aarch64/sve/fneg-abs_2.c: New test.
* gcc.target/aarch64/sve/fneg-abs_3.c: New test.
* gcc.target/aarch64/sve/fneg-abs_4.c: New test.
--- inline copy of patch --
diff --git a/gcc/match.pd b/gcc/match.pd
index 39c7ea1088f25538ed8bd26ee89711566141a71f..8ebde06dcd4b26d694826cffad0fb17e1136600a 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -9476,3 +9476,57 @@ and,
}
(if (full_perm_p)
(vec_perm (op@3 @0 @1) @3 @2))))))
+
+/* Transform fneg (fabs (X)) -> X | 1 << signbit (X). */
+
+(simplify
+ (negate (abs @0))
+ (if (FLOAT_TYPE_P (type)
+ /* We have to delay this rewriting till after forward prop because otherwise
+ it's harder to do trigonometry optimizations. e.g. cos(-fabs(x)) is not
+ matched in one go. Instead cos (-x) is matched first followed by cos(|x|).
+ The bottom op approach makes this rule match first and it's not untill
+ fwdprop that we match top down. There are manu such simplications so we
+ delay this optimization till later on. */
+ && canonicalize_math_after_vectorization_p ())
+ (with {
+ tree itype = unsigned_type_for (type);
+ machine_mode mode = TYPE_MODE (type);
+ const struct real_format *float_fmt = FLOAT_MODE_FORMAT (mode);
+ auto optab = VECTOR_TYPE_P (type) ? optab_vector : optab_default; }
+ (if (float_fmt
+ && float_fmt->signbit_rw >= 0
+ && targetm.can_change_mode_class (TYPE_MODE (itype),
+ TYPE_MODE (type), ALL_REGS)
+ && target_supports_op_p (itype, BIT_IOR_EXPR, optab))
+ (with { wide_int wone = wi::one (element_precision (type));
+ int sbit = float_fmt->signbit_rw;
+ auto stype = VECTOR_TYPE_P (type) ? TREE_TYPE (itype) : itype;
+ tree sign_bit = wide_int_to_tree (stype, wi::lshift (wone, sbit));}
+ (view_convert:type
+ (bit_ior (view_convert:itype @0)
+ { build_uniform_cst (itype, sign_bit); } )))))))
+
+/* Repeat the same but for conditional negate. */
+
+(simplify
+ (IFN_COND_NEG @1 (abs @0) @2)
+ (if (FLOAT_TYPE_P (type))
+ (with {
+ tree itype = unsigned_type_for (type);
+ machine_mode mode = TYPE_MODE (type);
+ const struct real_format *float_fmt = FLOAT_MODE_FORMAT (mode);
+ auto optab = VECTOR_TYPE_P (type) ? optab_vector : optab_default; }
+ (if (float_fmt
+ && float_fmt->signbit_rw >= 0
+ && targetm.can_change_mode_class (TYPE_MODE (itype),
+ TYPE_MODE (type), ALL_REGS)
+ && target_supports_op_p (itype, BIT_IOR_EXPR, optab))
+ (with { wide_int wone = wi::one (element_precision (type));
+ int sbit = float_fmt->signbit_rw;
+ auto stype = VECTOR_TYPE_P (type) ? TREE_TYPE (itype) : itype;
+ tree sign_bit = wide_int_to_tree (stype, wi::lshift (wone, sbit));}
+ (view_convert:type
+ (IFN_COND_IOR @1 (view_convert:itype @0)
+ { build_uniform_cst (itype, sign_bit); }
+ (view_convert:itype @2) )))))))
\ No newline at end of file
diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_1.c b/gcc/testsuite/gcc.target/aarch64/fneg-abs_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..f823013c3ddf6b3a266c3abfcbf2642fc2a75fa6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_1.c
@@ -0,0 +1,39 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#pragma GCC target "+nosve"
+
+#include <arm_neon.h>
+
+/*
+** t1:
+** orr v[0-9]+.2s, #128, lsl #24
+** ret
+*/
+float32x2_t t1 (float32x2_t a)
+{
+ return vneg_f32 (vabs_f32 (a));
+}
+
+/*
+** t2:
+** orr v[0-9]+.4s, #128, lsl #24
+** ret
+*/
+float32x4_t t2 (float32x4_t a)
+{
+ return vnegq_f32 (vabsq_f32 (a));
+}
+
+/*
+** t3:
+** adrp x0, .LC[0-9]+
+** ldr q[0-9]+, \[x0, #:lo12:.LC0\]
+** orr v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+** ret
+*/
+float64x2_t t3 (float64x2_t a)
+{
+ return vnegq_f64 (vabsq_f64 (a));
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c b/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..141121176b309e4b2aa413dc55271a6e3c93d5e1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#pragma GCC target "+nosve"
+
+#include <arm_neon.h>
+#include <math.h>
+
+/*
+** f1:
+** movi v[0-9]+.2s, 0x80, lsl 24
+** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
+** ret
+*/
+float32_t f1 (float32_t a)
+{
+ return -fabsf (a);
+}
+
+/*
+** f2:
+** mov x0, -9223372036854775808
+** fmov d[0-9]+, x0
+** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
+** ret
+*/
+float64_t f2 (float64_t a)
+{
+ return -fabs (a);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_3.c b/gcc/testsuite/gcc.target/aarch64/fneg-abs_3.c
new file mode 100644
index 0000000000000000000000000000000000000000..b4652173a95d104ddfa70c497f0627a61ea89d3b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_3.c
@@ -0,0 +1,36 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#pragma GCC target "+nosve"
+
+#include <arm_neon.h>
+#include <math.h>
+
+/*
+** f1:
+** ...
+** ldr q[0-9]+, \[x0\]
+** orr v[0-9]+.4s, #128, lsl #24
+** str q[0-9]+, \[x0\], 16
+** ...
+*/
+void f1 (float32_t *a, int n)
+{
+ for (int i = 0; i < (n & -8); i++)
+ a[i] = -fabsf (a[i]);
+}
+
+/*
+** f2:
+** ...
+** ldr q[0-9]+, \[x0\]
+** orr v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+** str q[0-9]+, \[x0\], 16
+** ...
+*/
+void f2 (float64_t *a, int n)
+{
+ for (int i = 0; i < (n & -8); i++)
+ a[i] = -fabs (a[i]);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c b/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c
new file mode 100644
index 0000000000000000000000000000000000000000..10879dea74462d34b26160eeb0bd54ead063166b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c
@@ -0,0 +1,39 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#pragma GCC target "+nosve"
+
+#include <string.h>
+
+/*
+** negabs:
+** mov x0, -9223372036854775808
+** fmov d[0-9]+, x0
+** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
+** ret
+*/
+double negabs (double x)
+{
+ unsigned long long y;
+ memcpy (&y, &x, sizeof(double));
+ y = y | (1UL << 63);
+ memcpy (&x, &y, sizeof(double));
+ return x;
+}
+
+/*
+** negabsf:
+** movi v[0-9]+.2s, 0x80, lsl 24
+** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
+** ret
+*/
+float negabsf (float x)
+{
+ unsigned int y;
+ memcpy (&y, &x, sizeof(float));
+ y = y | (1U << 31);
+ memcpy (&x, &y, sizeof(float));
+ return x;
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..0c7664e6de77a497682952653ffd417453854d52
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#include <arm_neon.h>
+
+/*
+** t1:
+** orr v[0-9]+.2s, #128, lsl #24
+** ret
+*/
+float32x2_t t1 (float32x2_t a)
+{
+ return vneg_f32 (vabs_f32 (a));
+}
+
+/*
+** t2:
+** orr v[0-9]+.4s, #128, lsl #24
+** ret
+*/
+float32x4_t t2 (float32x4_t a)
+{
+ return vnegq_f32 (vabsq_f32 (a));
+}
+
+/*
+** t3:
+** adrp x0, .LC[0-9]+
+** ldr q[0-9]+, \[x0, #:lo12:.LC0\]
+** orr v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+** ret
+*/
+float64x2_t t3 (float64x2_t a)
+{
+ return vnegq_f64 (vabsq_f64 (a));
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..a60cd31b9294af2dac69eed1c93f899bd5c78fca
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#include <arm_neon.h>
+#include <math.h>
+
+/*
+** f1:
+** movi v[0-9]+.2s, 0x80, lsl 24
+** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
+** ret
+*/
+float32_t f1 (float32_t a)
+{
+ return -fabsf (a);
+}
+
+/*
+** f2:
+** mov x0, -9223372036854775808
+** fmov d[0-9]+, x0
+** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
+** ret
+*/
+float64_t f2 (float64_t a)
+{
+ return -fabs (a);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_3.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_3.c
new file mode 100644
index 0000000000000000000000000000000000000000..1bf34328d8841de8e6b0a5458562a9f00e31c275
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_3.c
@@ -0,0 +1,34 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#include <arm_neon.h>
+#include <math.h>
+
+/*
+** f1:
+** ...
+** ld1w z[0-9]+.s, p[0-9]+/z, \[x0, x2, lsl 2\]
+** orr z[0-9]+.s, z[0-9]+.s, #0x80000000
+** st1w z[0-9]+.s, p[0-9]+, \[x0, x2, lsl 2\]
+** ...
+*/
+void f1 (float32_t *a, int n)
+{
+ for (int i = 0; i < (n & -8); i++)
+ a[i] = -fabsf (a[i]);
+}
+
+/*
+** f2:
+** ...
+** ld1d z[0-9]+.d, p[0-9]+/z, \[x0, x2, lsl 3\]
+** orr z[0-9]+.d, z[0-9]+.d, #0x8000000000000000
+** st1d z[0-9]+.d, p[0-9]+, \[x0, x2, lsl 3\]
+** ...
+*/
+void f2 (float64_t *a, int n)
+{
+ for (int i = 0; i < (n & -8); i++)
+ a[i] = -fabs (a[i]);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_4.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_4.c
new file mode 100644
index 0000000000000000000000000000000000000000..21f2a8da2a5d44e3d01f6604ca7be87e3744d494
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_4.c
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#include <string.h>
+
+/*
+** negabs:
+** mov x0, -9223372036854775808
+** fmov d[0-9]+, x0
+** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
+** ret
+*/
+double negabs (double x)
+{
+ unsigned long long y;
+ memcpy (&y, &x, sizeof(double));
+ y = y | (1UL << 63);
+ memcpy (&x, &y, sizeof(double));
+ return x;
+}
+
+/*
+** negabsf:
+** movi v[0-9]+.2s, 0x80, lsl 24
+** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
+** ret
+*/
+float negabsf (float x)
+{
+ unsigned int y;
+ memcpy (&y, &x, sizeof(float));
+ y = y | (1U << 31);
+ memcpy (&x, &y, sizeof(float));
+ return x;
+}
+
--
[-- Attachment #2: rb17718.patch --]
[-- Type: text/plain, Size: 10334 bytes --]
diff --git a/gcc/match.pd b/gcc/match.pd
index 39c7ea1088f25538ed8bd26ee89711566141a71f..8ebde06dcd4b26d694826cffad0fb17e1136600a 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -9476,3 +9476,57 @@ and,
}
(if (full_perm_p)
(vec_perm (op@3 @0 @1) @3 @2))))))
+
+/* Transform fneg (fabs (X)) -> X | 1 << signbit (X). */
+
+(simplify
+ (negate (abs @0))
+ (if (FLOAT_TYPE_P (type)
+ /* We have to delay this rewriting till after forward prop because otherwise
+ it's harder to do trigonometry optimizations. e.g. cos(-fabs(x)) is not
+ matched in one go. Instead cos (-x) is matched first followed by cos(|x|).
+ The bottom op approach makes this rule match first and it's not untill
+ fwdprop that we match top down. There are manu such simplications so we
+ delay this optimization till later on. */
+ && canonicalize_math_after_vectorization_p ())
+ (with {
+ tree itype = unsigned_type_for (type);
+ machine_mode mode = TYPE_MODE (type);
+ const struct real_format *float_fmt = FLOAT_MODE_FORMAT (mode);
+ auto optab = VECTOR_TYPE_P (type) ? optab_vector : optab_default; }
+ (if (float_fmt
+ && float_fmt->signbit_rw >= 0
+ && targetm.can_change_mode_class (TYPE_MODE (itype),
+ TYPE_MODE (type), ALL_REGS)
+ && target_supports_op_p (itype, BIT_IOR_EXPR, optab))
+ (with { wide_int wone = wi::one (element_precision (type));
+ int sbit = float_fmt->signbit_rw;
+ auto stype = VECTOR_TYPE_P (type) ? TREE_TYPE (itype) : itype;
+ tree sign_bit = wide_int_to_tree (stype, wi::lshift (wone, sbit));}
+ (view_convert:type
+ (bit_ior (view_convert:itype @0)
+ { build_uniform_cst (itype, sign_bit); } )))))))
+
+/* Repeat the same but for conditional negate. */
+
+(simplify
+ (IFN_COND_NEG @1 (abs @0) @2)
+ (if (FLOAT_TYPE_P (type))
+ (with {
+ tree itype = unsigned_type_for (type);
+ machine_mode mode = TYPE_MODE (type);
+ const struct real_format *float_fmt = FLOAT_MODE_FORMAT (mode);
+ auto optab = VECTOR_TYPE_P (type) ? optab_vector : optab_default; }
+ (if (float_fmt
+ && float_fmt->signbit_rw >= 0
+ && targetm.can_change_mode_class (TYPE_MODE (itype),
+ TYPE_MODE (type), ALL_REGS)
+ && target_supports_op_p (itype, BIT_IOR_EXPR, optab))
+ (with { wide_int wone = wi::one (element_precision (type));
+ int sbit = float_fmt->signbit_rw;
+ auto stype = VECTOR_TYPE_P (type) ? TREE_TYPE (itype) : itype;
+ tree sign_bit = wide_int_to_tree (stype, wi::lshift (wone, sbit));}
+ (view_convert:type
+ (IFN_COND_IOR @1 (view_convert:itype @0)
+ { build_uniform_cst (itype, sign_bit); }
+ (view_convert:itype @2) )))))))
\ No newline at end of file
diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_1.c b/gcc/testsuite/gcc.target/aarch64/fneg-abs_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..f823013c3ddf6b3a266c3abfcbf2642fc2a75fa6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_1.c
@@ -0,0 +1,39 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#pragma GCC target "+nosve"
+
+#include <arm_neon.h>
+
+/*
+** t1:
+** orr v[0-9]+.2s, #128, lsl #24
+** ret
+*/
+float32x2_t t1 (float32x2_t a)
+{
+ return vneg_f32 (vabs_f32 (a));
+}
+
+/*
+** t2:
+** orr v[0-9]+.4s, #128, lsl #24
+** ret
+*/
+float32x4_t t2 (float32x4_t a)
+{
+ return vnegq_f32 (vabsq_f32 (a));
+}
+
+/*
+** t3:
+** adrp x0, .LC[0-9]+
+** ldr q[0-9]+, \[x0, #:lo12:.LC0\]
+** orr v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+** ret
+*/
+float64x2_t t3 (float64x2_t a)
+{
+ return vnegq_f64 (vabsq_f64 (a));
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c b/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..141121176b309e4b2aa413dc55271a6e3c93d5e1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#pragma GCC target "+nosve"
+
+#include <arm_neon.h>
+#include <math.h>
+
+/*
+** f1:
+** movi v[0-9]+.2s, 0x80, lsl 24
+** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
+** ret
+*/
+float32_t f1 (float32_t a)
+{
+ return -fabsf (a);
+}
+
+/*
+** f2:
+** mov x0, -9223372036854775808
+** fmov d[0-9]+, x0
+** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
+** ret
+*/
+float64_t f2 (float64_t a)
+{
+ return -fabs (a);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_3.c b/gcc/testsuite/gcc.target/aarch64/fneg-abs_3.c
new file mode 100644
index 0000000000000000000000000000000000000000..b4652173a95d104ddfa70c497f0627a61ea89d3b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_3.c
@@ -0,0 +1,36 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#pragma GCC target "+nosve"
+
+#include <arm_neon.h>
+#include <math.h>
+
+/*
+** f1:
+** ...
+** ldr q[0-9]+, \[x0\]
+** orr v[0-9]+.4s, #128, lsl #24
+** str q[0-9]+, \[x0\], 16
+** ...
+*/
+void f1 (float32_t *a, int n)
+{
+ for (int i = 0; i < (n & -8); i++)
+ a[i] = -fabsf (a[i]);
+}
+
+/*
+** f2:
+** ...
+** ldr q[0-9]+, \[x0\]
+** orr v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+** str q[0-9]+, \[x0\], 16
+** ...
+*/
+void f2 (float64_t *a, int n)
+{
+ for (int i = 0; i < (n & -8); i++)
+ a[i] = -fabs (a[i]);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c b/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c
new file mode 100644
index 0000000000000000000000000000000000000000..10879dea74462d34b26160eeb0bd54ead063166b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c
@@ -0,0 +1,39 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#pragma GCC target "+nosve"
+
+#include <string.h>
+
+/*
+** negabs:
+** mov x0, -9223372036854775808
+** fmov d[0-9]+, x0
+** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
+** ret
+*/
+double negabs (double x)
+{
+ unsigned long long y;
+ memcpy (&y, &x, sizeof(double));
+ y = y | (1UL << 63);
+ memcpy (&x, &y, sizeof(double));
+ return x;
+}
+
+/*
+** negabsf:
+** movi v[0-9]+.2s, 0x80, lsl 24
+** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
+** ret
+*/
+float negabsf (float x)
+{
+ unsigned int y;
+ memcpy (&y, &x, sizeof(float));
+ y = y | (1U << 31);
+ memcpy (&x, &y, sizeof(float));
+ return x;
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..0c7664e6de77a497682952653ffd417453854d52
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#include <arm_neon.h>
+
+/*
+** t1:
+** orr v[0-9]+.2s, #128, lsl #24
+** ret
+*/
+float32x2_t t1 (float32x2_t a)
+{
+ return vneg_f32 (vabs_f32 (a));
+}
+
+/*
+** t2:
+** orr v[0-9]+.4s, #128, lsl #24
+** ret
+*/
+float32x4_t t2 (float32x4_t a)
+{
+ return vnegq_f32 (vabsq_f32 (a));
+}
+
+/*
+** t3:
+** adrp x0, .LC[0-9]+
+** ldr q[0-9]+, \[x0, #:lo12:.LC0\]
+** orr v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+** ret
+*/
+float64x2_t t3 (float64x2_t a)
+{
+ return vnegq_f64 (vabsq_f64 (a));
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..a60cd31b9294af2dac69eed1c93f899bd5c78fca
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#include <arm_neon.h>
+#include <math.h>
+
+/*
+** f1:
+** movi v[0-9]+.2s, 0x80, lsl 24
+** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
+** ret
+*/
+float32_t f1 (float32_t a)
+{
+ return -fabsf (a);
+}
+
+/*
+** f2:
+** mov x0, -9223372036854775808
+** fmov d[0-9]+, x0
+** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
+** ret
+*/
+float64_t f2 (float64_t a)
+{
+ return -fabs (a);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_3.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_3.c
new file mode 100644
index 0000000000000000000000000000000000000000..1bf34328d8841de8e6b0a5458562a9f00e31c275
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_3.c
@@ -0,0 +1,34 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#include <arm_neon.h>
+#include <math.h>
+
+/*
+** f1:
+** ...
+** ld1w z[0-9]+.s, p[0-9]+/z, \[x0, x2, lsl 2\]
+** orr z[0-9]+.s, z[0-9]+.s, #0x80000000
+** st1w z[0-9]+.s, p[0-9]+, \[x0, x2, lsl 2\]
+** ...
+*/
+void f1 (float32_t *a, int n)
+{
+ for (int i = 0; i < (n & -8); i++)
+ a[i] = -fabsf (a[i]);
+}
+
+/*
+** f2:
+** ...
+** ld1d z[0-9]+.d, p[0-9]+/z, \[x0, x2, lsl 3\]
+** orr z[0-9]+.d, z[0-9]+.d, #0x8000000000000000
+** st1d z[0-9]+.d, p[0-9]+, \[x0, x2, lsl 3\]
+** ...
+*/
+void f2 (float64_t *a, int n)
+{
+ for (int i = 0; i < (n & -8); i++)
+ a[i] = -fabs (a[i]);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_4.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_4.c
new file mode 100644
index 0000000000000000000000000000000000000000..21f2a8da2a5d44e3d01f6604ca7be87e3744d494
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_4.c
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#include <string.h>
+
+/*
+** negabs:
+** mov x0, -9223372036854775808
+** fmov d[0-9]+, x0
+** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
+** ret
+*/
+double negabs (double x)
+{
+ unsigned long long y;
+ memcpy (&y, &x, sizeof(double));
+ y = y | (1UL << 63);
+ memcpy (&x, &y, sizeof(double));
+ return x;
+}
+
+/*
+** negabsf:
+** movi v[0-9]+.2s, 0x80, lsl 24
+** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
+** ret
+*/
+float negabsf (float x)
+{
+ unsigned int y;
+ memcpy (&y, &x, sizeof(float));
+ y = y | (1U << 31);
+ memcpy (&x, &y, sizeof(float));
+ return x;
+}
+
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH]middle-end match.pd: optimize fneg (fabs (x)) to x | (1 << signbit(x)) [PR109154]
2023-09-27 0:50 [PATCH]middle-end match.pd: optimize fneg (fabs (x)) to x | (1 << signbit(x)) [PR109154] Tamar Christina
@ 2023-09-27 1:17 ` Andrew Pinski
2023-09-27 2:31 ` Tamar Christina
2023-09-29 15:00 ` Jeff Law
1 sibling, 1 reply; 17+ messages in thread
From: Andrew Pinski @ 2023-09-27 1:17 UTC (permalink / raw)
To: Tamar Christina; +Cc: gcc-patches, nd, rguenther, jlaw
On Tue, Sep 26, 2023 at 5:51 PM Tamar Christina <tamar.christina@arm.com> wrote:
>
> Hi All,
>
> For targets that allow conversion between int and float modes this adds a new
> optimization transforming fneg (fabs (x)) into x | (1 << signbit(x)). Such
> sequences are common in scientific code working with gradients.
>
> The transformed instruction if the target has an inclusive-OR that takes an
> immediate is both shorter an faster. For those that don't the immediate has
> to be seperate constructed but this still ends up being faster as the immediate
> construction is not on the critical path.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
I think this should be part of isel instead of match.
Maybe we could use genmatch to generate the code that does the
transformations but this does not belong as part of match really.
Thanks,
Andrew
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> PR tree-optimization/109154
> * match.pd: Add new neg+abs rule.
>
> gcc/testsuite/ChangeLog:
>
> PR tree-optimization/109154
> * gcc.target/aarch64/fneg-abs_1.c: New test.
> * gcc.target/aarch64/fneg-abs_2.c: New test.
> * gcc.target/aarch64/fneg-abs_3.c: New test.
> * gcc.target/aarch64/fneg-abs_4.c: New test.
> * gcc.target/aarch64/sve/fneg-abs_1.c: New test.
> * gcc.target/aarch64/sve/fneg-abs_2.c: New test.
> * gcc.target/aarch64/sve/fneg-abs_3.c: New test.
> * gcc.target/aarch64/sve/fneg-abs_4.c: New test.
>
> --- inline copy of patch --
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 39c7ea1088f25538ed8bd26ee89711566141a71f..8ebde06dcd4b26d694826cffad0fb17e1136600a 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -9476,3 +9476,57 @@ and,
> }
> (if (full_perm_p)
> (vec_perm (op@3 @0 @1) @3 @2))))))
> +
> +/* Transform fneg (fabs (X)) -> X | 1 << signbit (X). */
> +
> +(simplify
> + (negate (abs @0))
> + (if (FLOAT_TYPE_P (type)
> + /* We have to delay this rewriting till after forward prop because otherwise
> + it's harder to do trigonometry optimizations. e.g. cos(-fabs(x)) is not
> + matched in one go. Instead cos (-x) is matched first followed by cos(|x|).
> + The bottom op approach makes this rule match first and it's not untill
> + fwdprop that we match top down. There are manu such simplications so we
> + delay this optimization till later on. */
> + && canonicalize_math_after_vectorization_p ())
> + (with {
> + tree itype = unsigned_type_for (type);
> + machine_mode mode = TYPE_MODE (type);
> + const struct real_format *float_fmt = FLOAT_MODE_FORMAT (mode);
> + auto optab = VECTOR_TYPE_P (type) ? optab_vector : optab_default; }
> + (if (float_fmt
> + && float_fmt->signbit_rw >= 0
> + && targetm.can_change_mode_class (TYPE_MODE (itype),
> + TYPE_MODE (type), ALL_REGS)
> + && target_supports_op_p (itype, BIT_IOR_EXPR, optab))
> + (with { wide_int wone = wi::one (element_precision (type));
> + int sbit = float_fmt->signbit_rw;
> + auto stype = VECTOR_TYPE_P (type) ? TREE_TYPE (itype) : itype;
> + tree sign_bit = wide_int_to_tree (stype, wi::lshift (wone, sbit));}
> + (view_convert:type
> + (bit_ior (view_convert:itype @0)
> + { build_uniform_cst (itype, sign_bit); } )))))))
> +
> +/* Repeat the same but for conditional negate. */
> +
> +(simplify
> + (IFN_COND_NEG @1 (abs @0) @2)
> + (if (FLOAT_TYPE_P (type))
> + (with {
> + tree itype = unsigned_type_for (type);
> + machine_mode mode = TYPE_MODE (type);
> + const struct real_format *float_fmt = FLOAT_MODE_FORMAT (mode);
> + auto optab = VECTOR_TYPE_P (type) ? optab_vector : optab_default; }
> + (if (float_fmt
> + && float_fmt->signbit_rw >= 0
> + && targetm.can_change_mode_class (TYPE_MODE (itype),
> + TYPE_MODE (type), ALL_REGS)
> + && target_supports_op_p (itype, BIT_IOR_EXPR, optab))
> + (with { wide_int wone = wi::one (element_precision (type));
> + int sbit = float_fmt->signbit_rw;
> + auto stype = VECTOR_TYPE_P (type) ? TREE_TYPE (itype) : itype;
> + tree sign_bit = wide_int_to_tree (stype, wi::lshift (wone, sbit));}
> + (view_convert:type
> + (IFN_COND_IOR @1 (view_convert:itype @0)
> + { build_uniform_cst (itype, sign_bit); }
> + (view_convert:itype @2) )))))))
> \ No newline at end of file
> diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_1.c b/gcc/testsuite/gcc.target/aarch64/fneg-abs_1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..f823013c3ddf6b3a266c3abfcbf2642fc2a75fa6
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_1.c
> @@ -0,0 +1,39 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
> +
> +#pragma GCC target "+nosve"
> +
> +#include <arm_neon.h>
> +
> +/*
> +** t1:
> +** orr v[0-9]+.2s, #128, lsl #24
> +** ret
> +*/
> +float32x2_t t1 (float32x2_t a)
> +{
> + return vneg_f32 (vabs_f32 (a));
> +}
> +
> +/*
> +** t2:
> +** orr v[0-9]+.4s, #128, lsl #24
> +** ret
> +*/
> +float32x4_t t2 (float32x4_t a)
> +{
> + return vnegq_f32 (vabsq_f32 (a));
> +}
> +
> +/*
> +** t3:
> +** adrp x0, .LC[0-9]+
> +** ldr q[0-9]+, \[x0, #:lo12:.LC0\]
> +** orr v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> +** ret
> +*/
> +float64x2_t t3 (float64x2_t a)
> +{
> + return vnegq_f64 (vabsq_f64 (a));
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c b/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..141121176b309e4b2aa413dc55271a6e3c93d5e1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
> @@ -0,0 +1,31 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
> +
> +#pragma GCC target "+nosve"
> +
> +#include <arm_neon.h>
> +#include <math.h>
> +
> +/*
> +** f1:
> +** movi v[0-9]+.2s, 0x80, lsl 24
> +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> +** ret
> +*/
> +float32_t f1 (float32_t a)
> +{
> + return -fabsf (a);
> +}
> +
> +/*
> +** f2:
> +** mov x0, -9223372036854775808
> +** fmov d[0-9]+, x0
> +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> +** ret
> +*/
> +float64_t f2 (float64_t a)
> +{
> + return -fabs (a);
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_3.c b/gcc/testsuite/gcc.target/aarch64/fneg-abs_3.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..b4652173a95d104ddfa70c497f0627a61ea89d3b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_3.c
> @@ -0,0 +1,36 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
> +
> +#pragma GCC target "+nosve"
> +
> +#include <arm_neon.h>
> +#include <math.h>
> +
> +/*
> +** f1:
> +** ...
> +** ldr q[0-9]+, \[x0\]
> +** orr v[0-9]+.4s, #128, lsl #24
> +** str q[0-9]+, \[x0\], 16
> +** ...
> +*/
> +void f1 (float32_t *a, int n)
> +{
> + for (int i = 0; i < (n & -8); i++)
> + a[i] = -fabsf (a[i]);
> +}
> +
> +/*
> +** f2:
> +** ...
> +** ldr q[0-9]+, \[x0\]
> +** orr v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> +** str q[0-9]+, \[x0\], 16
> +** ...
> +*/
> +void f2 (float64_t *a, int n)
> +{
> + for (int i = 0; i < (n & -8); i++)
> + a[i] = -fabs (a[i]);
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c b/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..10879dea74462d34b26160eeb0bd54ead063166b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c
> @@ -0,0 +1,39 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
> +
> +#pragma GCC target "+nosve"
> +
> +#include <string.h>
> +
> +/*
> +** negabs:
> +** mov x0, -9223372036854775808
> +** fmov d[0-9]+, x0
> +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> +** ret
> +*/
> +double negabs (double x)
> +{
> + unsigned long long y;
> + memcpy (&y, &x, sizeof(double));
> + y = y | (1UL << 63);
> + memcpy (&x, &y, sizeof(double));
> + return x;
> +}
> +
> +/*
> +** negabsf:
> +** movi v[0-9]+.2s, 0x80, lsl 24
> +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> +** ret
> +*/
> +float negabsf (float x)
> +{
> + unsigned int y;
> + memcpy (&y, &x, sizeof(float));
> + y = y | (1U << 31);
> + memcpy (&x, &y, sizeof(float));
> + return x;
> +}
> +
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..0c7664e6de77a497682952653ffd417453854d52
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c
> @@ -0,0 +1,37 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
> +
> +#include <arm_neon.h>
> +
> +/*
> +** t1:
> +** orr v[0-9]+.2s, #128, lsl #24
> +** ret
> +*/
> +float32x2_t t1 (float32x2_t a)
> +{
> + return vneg_f32 (vabs_f32 (a));
> +}
> +
> +/*
> +** t2:
> +** orr v[0-9]+.4s, #128, lsl #24
> +** ret
> +*/
> +float32x4_t t2 (float32x4_t a)
> +{
> + return vnegq_f32 (vabsq_f32 (a));
> +}
> +
> +/*
> +** t3:
> +** adrp x0, .LC[0-9]+
> +** ldr q[0-9]+, \[x0, #:lo12:.LC0\]
> +** orr v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> +** ret
> +*/
> +float64x2_t t3 (float64x2_t a)
> +{
> + return vnegq_f64 (vabsq_f64 (a));
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..a60cd31b9294af2dac69eed1c93f899bd5c78fca
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
> @@ -0,0 +1,29 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
> +
> +#include <arm_neon.h>
> +#include <math.h>
> +
> +/*
> +** f1:
> +** movi v[0-9]+.2s, 0x80, lsl 24
> +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> +** ret
> +*/
> +float32_t f1 (float32_t a)
> +{
> + return -fabsf (a);
> +}
> +
> +/*
> +** f2:
> +** mov x0, -9223372036854775808
> +** fmov d[0-9]+, x0
> +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> +** ret
> +*/
> +float64_t f2 (float64_t a)
> +{
> + return -fabs (a);
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_3.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_3.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..1bf34328d8841de8e6b0a5458562a9f00e31c275
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_3.c
> @@ -0,0 +1,34 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
> +
> +#include <arm_neon.h>
> +#include <math.h>
> +
> +/*
> +** f1:
> +** ...
> +** ld1w z[0-9]+.s, p[0-9]+/z, \[x0, x2, lsl 2\]
> +** orr z[0-9]+.s, z[0-9]+.s, #0x80000000
> +** st1w z[0-9]+.s, p[0-9]+, \[x0, x2, lsl 2\]
> +** ...
> +*/
> +void f1 (float32_t *a, int n)
> +{
> + for (int i = 0; i < (n & -8); i++)
> + a[i] = -fabsf (a[i]);
> +}
> +
> +/*
> +** f2:
> +** ...
> +** ld1d z[0-9]+.d, p[0-9]+/z, \[x0, x2, lsl 3\]
> +** orr z[0-9]+.d, z[0-9]+.d, #0x8000000000000000
> +** st1d z[0-9]+.d, p[0-9]+, \[x0, x2, lsl 3\]
> +** ...
> +*/
> +void f2 (float64_t *a, int n)
> +{
> + for (int i = 0; i < (n & -8); i++)
> + a[i] = -fabs (a[i]);
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_4.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_4.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..21f2a8da2a5d44e3d01f6604ca7be87e3744d494
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_4.c
> @@ -0,0 +1,37 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
> +
> +#include <string.h>
> +
> +/*
> +** negabs:
> +** mov x0, -9223372036854775808
> +** fmov d[0-9]+, x0
> +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> +** ret
> +*/
> +double negabs (double x)
> +{
> + unsigned long long y;
> + memcpy (&y, &x, sizeof(double));
> + y = y | (1UL << 63);
> + memcpy (&x, &y, sizeof(double));
> + return x;
> +}
> +
> +/*
> +** negabsf:
> +** movi v[0-9]+.2s, 0x80, lsl 24
> +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> +** ret
> +*/
> +float negabsf (float x)
> +{
> + unsigned int y;
> + memcpy (&y, &x, sizeof(float));
> + y = y | (1U << 31);
> + memcpy (&x, &y, sizeof(float));
> + return x;
> +}
> +
>
>
>
>
> --
^ permalink raw reply [flat|nested] 17+ messages in thread
* RE: [PATCH]middle-end match.pd: optimize fneg (fabs (x)) to x | (1 << signbit(x)) [PR109154]
2023-09-27 1:17 ` Andrew Pinski
@ 2023-09-27 2:31 ` Tamar Christina
2023-09-27 7:11 ` Richard Biener
0 siblings, 1 reply; 17+ messages in thread
From: Tamar Christina @ 2023-09-27 2:31 UTC (permalink / raw)
To: Andrew Pinski; +Cc: gcc-patches, nd, rguenther, jlaw
> -----Original Message-----
> From: Andrew Pinski <pinskia@gmail.com>
> Sent: Wednesday, September 27, 2023 2:17 AM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; rguenther@suse.de;
> jlaw@ventanamicro.com
> Subject: Re: [PATCH]middle-end match.pd: optimize fneg (fabs (x)) to x | (1 <<
> signbit(x)) [PR109154]
>
> On Tue, Sep 26, 2023 at 5:51 PM Tamar Christina <tamar.christina@arm.com>
> wrote:
> >
> > Hi All,
> >
> > For targets that allow conversion between int and float modes this
> > adds a new optimization transforming fneg (fabs (x)) into x | (1 <<
> > signbit(x)). Such sequences are common in scientific code working with
> gradients.
> >
> > The transformed instruction if the target has an inclusive-OR that
> > takes an immediate is both shorter and faster. For those that don't
> > the immediate has to be separately constructed but this still ends up
> > being faster as the immediate construction is not on the critical path.
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> >
> > Ok for master?
>
> I think this should be part of isel instead of match.
> Maybe we could use genmatch to generate the code that does the
> transformations but this does not belong as part of match really.
I disagree. I don't think this belongs in isel. Isel is for structural transformations.
If there is a case for something else I'd imagine backwardprop is a better choice.
But I don't see why it doesn't belong here considering it *is* a mathematical optimization
and the file has plenty of transformations such as mask optimizations and vector conditional
rewriting.
Regards,
Tamar
>
> Thanks,
> Andrew
>
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > PR tree-optimization/109154
> > * match.pd: Add new neg+abs rule.
> >
> > gcc/testsuite/ChangeLog:
> >
> > PR tree-optimization/109154
> > * gcc.target/aarch64/fneg-abs_1.c: New test.
> > * gcc.target/aarch64/fneg-abs_2.c: New test.
> > * gcc.target/aarch64/fneg-abs_3.c: New test.
> > * gcc.target/aarch64/fneg-abs_4.c: New test.
> > * gcc.target/aarch64/sve/fneg-abs_1.c: New test.
> > * gcc.target/aarch64/sve/fneg-abs_2.c: New test.
> > * gcc.target/aarch64/sve/fneg-abs_3.c: New test.
> > * gcc.target/aarch64/sve/fneg-abs_4.c: New test.
> >
> > --- inline copy of patch --
> > diff --git a/gcc/match.pd b/gcc/match.pd index
> >
> 39c7ea1088f25538ed8bd26ee89711566141a71f..8ebde06dcd4b26d69482
> 6cffad0f
> > b17e1136600a 100644
> > --- a/gcc/match.pd
> > +++ b/gcc/match.pd
> > @@ -9476,3 +9476,57 @@ and,
> > }
> > (if (full_perm_p)
> > (vec_perm (op@3 @0 @1) @3 @2))))))
> > +
> > +/* Transform fneg (fabs (X)) -> X | 1 << signbit (X). */
> > +
> > +(simplify
> > + (negate (abs @0))
> > + (if (FLOAT_TYPE_P (type)
> > + /* We have to delay this rewriting till after forward prop because
> otherwise
> > + it's harder to do trigonometry optimizations. e.g. cos(-fabs(x)) is not
> > + matched in one go. Instead cos (-x) is matched first followed by
> cos(|x|).
> > > + The bottom up approach makes this rule match first and it's not until
> > > + fwdprop that we match top down. There are many such simplifications
> > so we
> > > + delay this optimization till later on. */
> > + && canonicalize_math_after_vectorization_p ())
> > + (with {
> > + tree itype = unsigned_type_for (type);
> > + machine_mode mode = TYPE_MODE (type);
> > + const struct real_format *float_fmt = FLOAT_MODE_FORMAT (mode);
> > + auto optab = VECTOR_TYPE_P (type) ? optab_vector : optab_default; }
> > + (if (float_fmt
> > + && float_fmt->signbit_rw >= 0
> > + && targetm.can_change_mode_class (TYPE_MODE (itype),
> > + TYPE_MODE (type), ALL_REGS)
> > + && target_supports_op_p (itype, BIT_IOR_EXPR, optab))
> > + (with { wide_int wone = wi::one (element_precision (type));
> > + int sbit = float_fmt->signbit_rw;
> > + auto stype = VECTOR_TYPE_P (type) ? TREE_TYPE (itype) : itype;
> > + tree sign_bit = wide_int_to_tree (stype, wi::lshift (wone, sbit));}
> > + (view_convert:type
> > + (bit_ior (view_convert:itype @0)
> > + { build_uniform_cst (itype, sign_bit); } )))))))
> > +
> > +/* Repeat the same but for conditional negate. */
> > +
> > +(simplify
> > + (IFN_COND_NEG @1 (abs @0) @2)
> > + (if (FLOAT_TYPE_P (type))
> > + (with {
> > + tree itype = unsigned_type_for (type);
> > + machine_mode mode = TYPE_MODE (type);
> > + const struct real_format *float_fmt = FLOAT_MODE_FORMAT (mode);
> > + auto optab = VECTOR_TYPE_P (type) ? optab_vector : optab_default; }
> > + (if (float_fmt
> > + && float_fmt->signbit_rw >= 0
> > + && targetm.can_change_mode_class (TYPE_MODE (itype),
> > + TYPE_MODE (type), ALL_REGS)
> > + && target_supports_op_p (itype, BIT_IOR_EXPR, optab))
> > + (with { wide_int wone = wi::one (element_precision (type));
> > + int sbit = float_fmt->signbit_rw;
> > + auto stype = VECTOR_TYPE_P (type) ? TREE_TYPE (itype) : itype;
> > + tree sign_bit = wide_int_to_tree (stype, wi::lshift (wone, sbit));}
> > + (view_convert:type
> > + (IFN_COND_IOR @1 (view_convert:itype @0)
> > + { build_uniform_cst (itype, sign_bit); }
> > + (view_convert:itype @2) )))))))
> > \ No newline at end of file
> > diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_1.c
> > b/gcc/testsuite/gcc.target/aarch64/fneg-abs_1.c
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..f823013c3ddf6b3a266
> c3abfcbf2
> > 642fc2a75fa6
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_1.c
> > @@ -0,0 +1,39 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3" } */
> > +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } }
> > +*/
> > +
> > +#pragma GCC target "+nosve"
> > +
> > +#include <arm_neon.h>
> > +
> > +/*
> > +** t1:
> > +** orr v[0-9]+.2s, #128, lsl #24
> > +** ret
> > +*/
> > +float32x2_t t1 (float32x2_t a)
> > +{
> > + return vneg_f32 (vabs_f32 (a));
> > +}
> > +
> > +/*
> > +** t2:
> > +** orr v[0-9]+.4s, #128, lsl #24
> > +** ret
> > +*/
> > +float32x4_t t2 (float32x4_t a)
> > +{
> > + return vnegq_f32 (vabsq_f32 (a));
> > +}
> > +
> > +/*
> > +** t3:
> > +** adrp x0, .LC[0-9]+
> > +** ldr q[0-9]+, \[x0, #:lo12:.LC0\]
> > +** orr v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> > +** ret
> > +*/
> > +float64x2_t t3 (float64x2_t a)
> > +{
> > + return vnegq_f64 (vabsq_f64 (a));
> > +}
> > diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
> > b/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..141121176b309e4b2a
> a413dc5527
> > 1a6e3c93d5e1
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
> > @@ -0,0 +1,31 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3" } */
> > +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } }
> > +*/
> > +
> > +#pragma GCC target "+nosve"
> > +
> > +#include <arm_neon.h>
> > +#include <math.h>
> > +
> > +/*
> > +** f1:
> > +** movi v[0-9]+.2s, 0x80, lsl 24
> > +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> > +** ret
> > +*/
> > +float32_t f1 (float32_t a)
> > +{
> > + return -fabsf (a);
> > +}
> > +
> > +/*
> > +** f2:
> > +** mov x0, -9223372036854775808
> > +** fmov d[0-9]+, x0
> > +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> > +** ret
> > +*/
> > +float64_t f2 (float64_t a)
> > +{
> > + return -fabs (a);
> > +}
> > diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_3.c
> > b/gcc/testsuite/gcc.target/aarch64/fneg-abs_3.c
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..b4652173a95d104ddf
> a70c497f06
> > 27a61ea89d3b
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_3.c
> > @@ -0,0 +1,36 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3" } */
> > +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } }
> > +*/
> > +
> > +#pragma GCC target "+nosve"
> > +
> > +#include <arm_neon.h>
> > +#include <math.h>
> > +
> > +/*
> > +** f1:
> > +** ...
> > +** ldr q[0-9]+, \[x0\]
> > +** orr v[0-9]+.4s, #128, lsl #24
> > +** str q[0-9]+, \[x0\], 16
> > +** ...
> > +*/
> > +void f1 (float32_t *a, int n)
> > +{
> > + for (int i = 0; i < (n & -8); i++)
> > + a[i] = -fabsf (a[i]);
> > +}
> > +
> > +/*
> > +** f2:
> > +** ...
> > +** ldr q[0-9]+, \[x0\]
> > +** orr v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> > +** str q[0-9]+, \[x0\], 16
> > +** ...
> > +*/
> > +void f2 (float64_t *a, int n)
> > +{
> > + for (int i = 0; i < (n & -8); i++)
> > + a[i] = -fabs (a[i]);
> > +}
> > diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c
> > b/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..10879dea74462d34b2
> 6160eeb0bd
> > 54ead063166b
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c
> > @@ -0,0 +1,39 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3" } */
> > +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } }
> > +*/
> > +
> > +#pragma GCC target "+nosve"
> > +
> > +#include <string.h>
> > +
> > +/*
> > +** negabs:
> > +** mov x0, -9223372036854775808
> > +** fmov d[0-9]+, x0
> > +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> > +** ret
> > +*/
> > +double negabs (double x)
> > +{
> > + unsigned long long y;
> > + memcpy (&y, &x, sizeof(double));
> > + y = y | (1UL << 63);
> > + memcpy (&x, &y, sizeof(double));
> > + return x;
> > +}
> > +
> > +/*
> > +** negabsf:
> > +** movi v[0-9]+.2s, 0x80, lsl 24
> > +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> > +** ret
> > +*/
> > +float negabsf (float x)
> > +{
> > + unsigned int y;
> > + memcpy (&y, &x, sizeof(float));
> > + y = y | (1U << 31);
> > + memcpy (&x, &y, sizeof(float));
> > + return x;
> > +}
> > +
> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c
> > b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..0c7664e6de77a49768
> 2952653ffd
> > 417453854d52
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c
> > @@ -0,0 +1,37 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3" } */
> > +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } }
> > +*/
> > +
> > +#include <arm_neon.h>
> > +
> > +/*
> > +** t1:
> > +** orr v[0-9]+.2s, #128, lsl #24
> > +** ret
> > +*/
> > +float32x2_t t1 (float32x2_t a)
> > +{
> > + return vneg_f32 (vabs_f32 (a));
> > +}
> > +
> > +/*
> > +** t2:
> > +** orr v[0-9]+.4s, #128, lsl #24
> > +** ret
> > +*/
> > +float32x4_t t2 (float32x4_t a)
> > +{
> > + return vnegq_f32 (vabsq_f32 (a));
> > +}
> > +
> > +/*
> > +** t3:
> > +** adrp x0, .LC[0-9]+
> > +** ldr q[0-9]+, \[x0, #:lo12:.LC0\]
> > +** orr v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> > +** ret
> > +*/
> > +float64x2_t t3 (float64x2_t a)
> > +{
> > + return vnegq_f64 (vabsq_f64 (a));
> > +}
> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
> > b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..a60cd31b9294af2dac6
> 9eed1c93f
> > 899bd5c78fca
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
> > @@ -0,0 +1,29 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3" } */
> > +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } }
> > +*/
> > +
> > +#include <arm_neon.h>
> > +#include <math.h>
> > +
> > +/*
> > +** f1:
> > +** movi v[0-9]+.2s, 0x80, lsl 24
> > +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> > +** ret
> > +*/
> > +float32_t f1 (float32_t a)
> > +{
> > + return -fabsf (a);
> > +}
> > +
> > +/*
> > +** f2:
> > +** mov x0, -9223372036854775808
> > +** fmov d[0-9]+, x0
> > +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> > +** ret
> > +*/
> > +float64_t f2 (float64_t a)
> > +{
> > + return -fabs (a);
> > +}
> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_3.c
> > b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_3.c
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..1bf34328d8841de8e6
> b0a5458562
> > a9f00e31c275
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_3.c
> > @@ -0,0 +1,34 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3" } */
> > +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } }
> > +*/
> > +
> > +#include <arm_neon.h>
> > +#include <math.h>
> > +
> > +/*
> > +** f1:
> > +** ...
> > +** ld1w z[0-9]+.s, p[0-9]+/z, \[x0, x2, lsl 2\]
> > +** orr z[0-9]+.s, z[0-9]+.s, #0x80000000
> > +** st1w z[0-9]+.s, p[0-9]+, \[x0, x2, lsl 2\]
> > +** ...
> > +*/
> > +void f1 (float32_t *a, int n)
> > +{
> > + for (int i = 0; i < (n & -8); i++)
> > + a[i] = -fabsf (a[i]);
> > +}
> > +
> > +/*
> > +** f2:
> > +** ...
> > +** ld1d z[0-9]+.d, p[0-9]+/z, \[x0, x2, lsl 3\]
> > +** orr z[0-9]+.d, z[0-9]+.d, #0x8000000000000000
> > +** st1d z[0-9]+.d, p[0-9]+, \[x0, x2, lsl 3\]
> > +** ...
> > +*/
> > +void f2 (float64_t *a, int n)
> > +{
> > + for (int i = 0; i < (n & -8); i++)
> > + a[i] = -fabs (a[i]);
> > +}
> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_4.c
> > b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_4.c
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..21f2a8da2a5d44e3d0
> 1f6604ca7b
> > e87e3744d494
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_4.c
> > @@ -0,0 +1,37 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3" } */
> > +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } }
> > +*/
> > +
> > +#include <string.h>
> > +
> > +/*
> > +** negabs:
> > +** mov x0, -9223372036854775808
> > +** fmov d[0-9]+, x0
> > +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> > +** ret
> > +*/
> > +double negabs (double x)
> > +{
> > + unsigned long long y;
> > + memcpy (&y, &x, sizeof(double));
> > + y = y | (1UL << 63);
> > + memcpy (&x, &y, sizeof(double));
> > + return x;
> > +}
> > +
> > +/*
> > +** negabsf:
> > +** movi v[0-9]+.2s, 0x80, lsl 24
> > +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> > +** ret
> > +*/
> > +float negabsf (float x)
> > +{
> > + unsigned int y;
> > + memcpy (&y, &x, sizeof(float));
> > + y = y | (1U << 31);
> > + memcpy (&x, &y, sizeof(float));
> > + return x;
> > +}
> > +
> >
> >
> >
> >
> > --
^ permalink raw reply [flat|nested] 17+ messages in thread
* RE: [PATCH]middle-end match.pd: optimize fneg (fabs (x)) to x | (1 << signbit(x)) [PR109154]
2023-09-27 2:31 ` Tamar Christina
@ 2023-09-27 7:11 ` Richard Biener
2023-09-27 7:56 ` Tamar Christina
0 siblings, 1 reply; 17+ messages in thread
From: Richard Biener @ 2023-09-27 7:11 UTC (permalink / raw)
To: Tamar Christina; +Cc: Andrew Pinski, gcc-patches, nd, jlaw
On Wed, 27 Sep 2023, Tamar Christina wrote:
> > -----Original Message-----
> > From: Andrew Pinski <pinskia@gmail.com>
> > Sent: Wednesday, September 27, 2023 2:17 AM
> > To: Tamar Christina <Tamar.Christina@arm.com>
> > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; rguenther@suse.de;
> > jlaw@ventanamicro.com
> > Subject: Re: [PATCH]middle-end match.pd: optimize fneg (fabs (x)) to x | (1 <<
> > signbit(x)) [PR109154]
> >
> > On Tue, Sep 26, 2023 at 5:51?PM Tamar Christina <tamar.christina@arm.com>
> > wrote:
> > >
> > > Hi All,
> > >
> > > For targets that allow conversion between int and float modes this
> > > adds a new optimization transforming fneg (fabs (x)) into x | (1 <<
> > > signbit(x)). Such sequences are common in scientific code working with
> > gradients.
> > >
> > > The transformed instruction if the target has an inclusive-OR that
> > > takes an immediate is both shorter an faster. For those that don't
> > > the immediate has to be seperate constructed but this still ends up
> > > being faster as the immediate construction is not on the critical path.
> > >
> > > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> > >
> > > Ok for master?
> >
> > I think this should be part of isel instead of match.
> > Maybe we could use genmatch to generate the code that does the
> > transformations but this does not belong as part of match really.
>
> I disagree. I don't think this belongs in isel. Isel is for structural transformations.
> If there is a case for something else I'd imagine backwardprop is a better choice.
>
> But I don't see why it doesn't belong here considering it *is* a mathematical optimization
> and the file has plenty of transformations such as mask optimizations and vector conditional
> rewriting.
But the mathematical transform would more generally be
fneg (fabs (x)) -> copysign (x, -1.) and that can be optimally expanded
at RTL expansion time?
Richard.
> Regards,
> Tamar
>
> >
> > Thanks,
> > Andrew
> >
> > >
> > > Thanks,
> > > Tamar
> > >
> > > gcc/ChangeLog:
> > >
> > > PR tree-optimization/109154
> > > * match.pd: Add new neg+abs rule.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > > PR tree-optimization/109154
> > > * gcc.target/aarch64/fneg-abs_1.c: New test.
> > > * gcc.target/aarch64/fneg-abs_2.c: New test.
> > > * gcc.target/aarch64/fneg-abs_3.c: New test.
> > > * gcc.target/aarch64/fneg-abs_4.c: New test.
> > > * gcc.target/aarch64/sve/fneg-abs_1.c: New test.
> > > * gcc.target/aarch64/sve/fneg-abs_2.c: New test.
> > > * gcc.target/aarch64/sve/fneg-abs_3.c: New test.
> > > * gcc.target/aarch64/sve/fneg-abs_4.c: New test.
> > >
> > > --- inline copy of patch --
> > > diff --git a/gcc/match.pd b/gcc/match.pd index
> > >
> > 39c7ea1088f25538ed8bd26ee89711566141a71f..8ebde06dcd4b26d69482
> > 6cffad0f
> > > b17e1136600a 100644
> > > --- a/gcc/match.pd
> > > +++ b/gcc/match.pd
> > > @@ -9476,3 +9476,57 @@ and,
> > > }
> > > (if (full_perm_p)
> > > (vec_perm (op@3 @0 @1) @3 @2))))))
> > > +
> > > +/* Transform fneg (fabs (X)) -> X | 1 << signbit (X). */
> > > +
> > > +(simplify
> > > + (negate (abs @0))
> > > + (if (FLOAT_TYPE_P (type)
> > > + /* We have to delay this rewriting till after forward prop because
> > otherwise
> > > + it's harder to do trigonometry optimizations. e.g. cos(-fabs(x)) is not
> > > + matched in one go. Instead cos (-x) is matched first followed by
> > cos(|x|).
> > > + The bottom up approach makes this rule match first and it's not until
> > > + fwdprop that we match top down. There are many such simplifications
> > so we
> > > + delay this optimization till later on. */
> > > + && canonicalize_math_after_vectorization_p ())
> > > + (with {
> > > + tree itype = unsigned_type_for (type);
> > > + machine_mode mode = TYPE_MODE (type);
> > > + const struct real_format *float_fmt = FLOAT_MODE_FORMAT (mode);
> > > + auto optab = VECTOR_TYPE_P (type) ? optab_vector : optab_default; }
> > > + (if (float_fmt
> > > + && float_fmt->signbit_rw >= 0
> > > + && targetm.can_change_mode_class (TYPE_MODE (itype),
> > > + TYPE_MODE (type), ALL_REGS)
> > > + && target_supports_op_p (itype, BIT_IOR_EXPR, optab))
> > > + (with { wide_int wone = wi::one (element_precision (type));
> > > + int sbit = float_fmt->signbit_rw;
> > > + auto stype = VECTOR_TYPE_P (type) ? TREE_TYPE (itype) : itype;
> > > + tree sign_bit = wide_int_to_tree (stype, wi::lshift (wone, sbit));}
> > > + (view_convert:type
> > > + (bit_ior (view_convert:itype @0)
> > > + { build_uniform_cst (itype, sign_bit); } )))))))
> > > +
> > > +/* Repeat the same but for conditional negate. */
> > > +
> > > +(simplify
> > > + (IFN_COND_NEG @1 (abs @0) @2)
> > > + (if (FLOAT_TYPE_P (type))
> > > + (with {
> > > + tree itype = unsigned_type_for (type);
> > > + machine_mode mode = TYPE_MODE (type);
> > > + const struct real_format *float_fmt = FLOAT_MODE_FORMAT (mode);
> > > + auto optab = VECTOR_TYPE_P (type) ? optab_vector : optab_default; }
> > > + (if (float_fmt
> > > + && float_fmt->signbit_rw >= 0
> > > + && targetm.can_change_mode_class (TYPE_MODE (itype),
> > > + TYPE_MODE (type), ALL_REGS)
> > > + && target_supports_op_p (itype, BIT_IOR_EXPR, optab))
> > > + (with { wide_int wone = wi::one (element_precision (type));
> > > + int sbit = float_fmt->signbit_rw;
> > > + auto stype = VECTOR_TYPE_P (type) ? TREE_TYPE (itype) : itype;
> > > + tree sign_bit = wide_int_to_tree (stype, wi::lshift (wone, sbit));}
> > > + (view_convert:type
> > > + (IFN_COND_IOR @1 (view_convert:itype @0)
> > > + { build_uniform_cst (itype, sign_bit); }
> > > + (view_convert:itype @2) )))))))
> > > \ No newline at end of file
> > > diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_1.c
> > > b/gcc/testsuite/gcc.target/aarch64/fneg-abs_1.c
> > > new file mode 100644
> > > index
> > >
> > 0000000000000000000000000000000000000000..f823013c3ddf6b3a266
> > c3abfcbf2
> > > 642fc2a75fa6
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_1.c
> > > @@ -0,0 +1,39 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O3" } */
> > > +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } }
> > > +*/
> > > +
> > > +#pragma GCC target "+nosve"
> > > +
> > > +#include <arm_neon.h>
> > > +
> > > +/*
> > > +** t1:
> > > +** orr v[0-9]+.2s, #128, lsl #24
> > > +** ret
> > > +*/
> > > +float32x2_t t1 (float32x2_t a)
> > > +{
> > > + return vneg_f32 (vabs_f32 (a));
> > > +}
> > > +
> > > +/*
> > > +** t2:
> > > +** orr v[0-9]+.4s, #128, lsl #24
> > > +** ret
> > > +*/
> > > +float32x4_t t2 (float32x4_t a)
> > > +{
> > > + return vnegq_f32 (vabsq_f32 (a));
> > > +}
> > > +
> > > +/*
> > > +** t3:
> > > +** adrp x0, .LC[0-9]+
> > > +** ldr q[0-9]+, \[x0, #:lo12:.LC0\]
> > > +** orr v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> > > +** ret
> > > +*/
> > > +float64x2_t t3 (float64x2_t a)
> > > +{
> > > + return vnegq_f64 (vabsq_f64 (a));
> > > +}
> > > diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
> > > b/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
> > > new file mode 100644
> > > index
> > >
> > 0000000000000000000000000000000000000000..141121176b309e4b2a
> > a413dc5527
> > > 1a6e3c93d5e1
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
> > > @@ -0,0 +1,31 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O3" } */
> > > +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } }
> > > +*/
> > > +
> > > +#pragma GCC target "+nosve"
> > > +
> > > +#include <arm_neon.h>
> > > +#include <math.h>
> > > +
> > > +/*
> > > +** f1:
> > > +** movi v[0-9]+.2s, 0x80, lsl 24
> > > +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> > > +** ret
> > > +*/
> > > +float32_t f1 (float32_t a)
> > > +{
> > > + return -fabsf (a);
> > > +}
> > > +
> > > +/*
> > > +** f2:
> > > +** mov x0, -9223372036854775808
> > > +** fmov d[0-9]+, x0
> > > +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> > > +** ret
> > > +*/
> > > +float64_t f2 (float64_t a)
> > > +{
> > > + return -fabs (a);
> > > +}
> > > diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_3.c
> > > b/gcc/testsuite/gcc.target/aarch64/fneg-abs_3.c
> > > new file mode 100644
> > > index
> > >
> > 0000000000000000000000000000000000000000..b4652173a95d104ddf
> > a70c497f06
> > > 27a61ea89d3b
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_3.c
> > > @@ -0,0 +1,36 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O3" } */
> > > +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } }
> > > +*/
> > > +
> > > +#pragma GCC target "+nosve"
> > > +
> > > +#include <arm_neon.h>
> > > +#include <math.h>
> > > +
> > > +/*
> > > +** f1:
> > > +** ...
> > > +** ldr q[0-9]+, \[x0\]
> > > +** orr v[0-9]+.4s, #128, lsl #24
> > > +** str q[0-9]+, \[x0\], 16
> > > +** ...
> > > +*/
> > > +void f1 (float32_t *a, int n)
> > > +{
> > > + for (int i = 0; i < (n & -8); i++)
> > > + a[i] = -fabsf (a[i]);
> > > +}
> > > +
> > > +/*
> > > +** f2:
> > > +** ...
> > > +** ldr q[0-9]+, \[x0\]
> > > +** orr v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> > > +** str q[0-9]+, \[x0\], 16
> > > +** ...
> > > +*/
> > > +void f2 (float64_t *a, int n)
> > > +{
> > > + for (int i = 0; i < (n & -8); i++)
> > > + a[i] = -fabs (a[i]);
> > > +}
> > > diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c
> > > b/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c
> > > new file mode 100644
> > > index
> > >
> > 0000000000000000000000000000000000000000..10879dea74462d34b2
> > 6160eeb0bd
> > > 54ead063166b
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c
> > > @@ -0,0 +1,39 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O3" } */
> > > +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } }
> > > +*/
> > > +
> > > +#pragma GCC target "+nosve"
> > > +
> > > +#include <string.h>
> > > +
> > > +/*
> > > +** negabs:
> > > +** mov x0, -9223372036854775808
> > > +** fmov d[0-9]+, x0
> > > +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> > > +** ret
> > > +*/
> > > +double negabs (double x)
> > > +{
> > > + unsigned long long y;
> > > + memcpy (&y, &x, sizeof(double));
> > > + y = y | (1UL << 63);
> > > + memcpy (&x, &y, sizeof(double));
> > > + return x;
> > > +}
> > > +
> > > +/*
> > > +** negabsf:
> > > +** movi v[0-9]+.2s, 0x80, lsl 24
> > > +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> > > +** ret
> > > +*/
> > > +float negabsf (float x)
> > > +{
> > > + unsigned int y;
> > > + memcpy (&y, &x, sizeof(float));
> > > + y = y | (1U << 31);
> > > + memcpy (&x, &y, sizeof(float));
> > > + return x;
> > > +}
> > > +
> > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c
> > > b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c
> > > new file mode 100644
> > > index
> > >
> > 0000000000000000000000000000000000000000..0c7664e6de77a49768
> > 2952653ffd
> > > 417453854d52
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c
> > > @@ -0,0 +1,37 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O3" } */
> > > +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } }
> > > +*/
> > > +
> > > +#include <arm_neon.h>
> > > +
> > > +/*
> > > +** t1:
> > > +** orr v[0-9]+.2s, #128, lsl #24
> > > +** ret
> > > +*/
> > > +float32x2_t t1 (float32x2_t a)
> > > +{
> > > + return vneg_f32 (vabs_f32 (a));
> > > +}
> > > +
> > > +/*
> > > +** t2:
> > > +** orr v[0-9]+.4s, #128, lsl #24
> > > +** ret
> > > +*/
> > > +float32x4_t t2 (float32x4_t a)
> > > +{
> > > + return vnegq_f32 (vabsq_f32 (a));
> > > +}
> > > +
> > > +/*
> > > +** t3:
> > > +** adrp x0, .LC[0-9]+
> > > +** ldr q[0-9]+, \[x0, #:lo12:.LC0\]
> > > +** orr v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> > > +** ret
> > > +*/
> > > +float64x2_t t3 (float64x2_t a)
> > > +{
> > > + return vnegq_f64 (vabsq_f64 (a));
> > > +}
> > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
> > > b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
> > > new file mode 100644
> > > index
> > >
> > 0000000000000000000000000000000000000000..a60cd31b9294af2dac6
> > 9eed1c93f
> > > 899bd5c78fca
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
> > > @@ -0,0 +1,29 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O3" } */
> > > +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } }
> > > +*/
> > > +
> > > +#include <arm_neon.h>
> > > +#include <math.h>
> > > +
> > > +/*
> > > +** f1:
> > > +** movi v[0-9]+.2s, 0x80, lsl 24
> > > +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> > > +** ret
> > > +*/
> > > +float32_t f1 (float32_t a)
> > > +{
> > > + return -fabsf (a);
> > > +}
> > > +
> > > +/*
> > > +** f2:
> > > +** mov x0, -9223372036854775808
> > > +** fmov d[0-9]+, x0
> > > +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> > > +** ret
> > > +*/
> > > +float64_t f2 (float64_t a)
> > > +{
> > > + return -fabs (a);
> > > +}
> > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_3.c
> > > b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_3.c
> > > new file mode 100644
> > > index
> > >
> > 0000000000000000000000000000000000000000..1bf34328d8841de8e6
> > b0a5458562
> > > a9f00e31c275
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_3.c
> > > @@ -0,0 +1,34 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O3" } */
> > > +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } }
> > > +*/
> > > +
> > > +#include <arm_neon.h>
> > > +#include <math.h>
> > > +
> > > +/*
> > > +** f1:
> > > +** ...
> > > +** ld1w z[0-9]+.s, p[0-9]+/z, \[x0, x2, lsl 2\]
> > > +** orr z[0-9]+.s, z[0-9]+.s, #0x80000000
> > > +** st1w z[0-9]+.s, p[0-9]+, \[x0, x2, lsl 2\]
> > > +** ...
> > > +*/
> > > +void f1 (float32_t *a, int n)
> > > +{
> > > + for (int i = 0; i < (n & -8); i++)
> > > + a[i] = -fabsf (a[i]);
> > > +}
> > > +
> > > +/*
> > > +** f2:
> > > +** ...
> > > +** ld1d z[0-9]+.d, p[0-9]+/z, \[x0, x2, lsl 3\]
> > > +** orr z[0-9]+.d, z[0-9]+.d, #0x8000000000000000
> > > +** st1d z[0-9]+.d, p[0-9]+, \[x0, x2, lsl 3\]
> > > +** ...
> > > +*/
> > > +void f2 (float64_t *a, int n)
> > > +{
> > > + for (int i = 0; i < (n & -8); i++)
> > > + a[i] = -fabs (a[i]);
> > > +}
> > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_4.c
> > > b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_4.c
> > > new file mode 100644
> > > index
> > >
> > 0000000000000000000000000000000000000000..21f2a8da2a5d44e3d0
> > 1f6604ca7b
> > > e87e3744d494
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_4.c
> > > @@ -0,0 +1,37 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O3" } */
> > > +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } }
> > > +*/
> > > +
> > > +#include <string.h>
> > > +
> > > +/*
> > > +** negabs:
> > > +** mov x0, -9223372036854775808
> > > +** fmov d[0-9]+, x0
> > > +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> > > +** ret
> > > +*/
> > > +double negabs (double x)
> > > +{
> > > + unsigned long long y;
> > > + memcpy (&y, &x, sizeof(double));
> > > + y = y | (1UL << 63);
> > > + memcpy (&x, &y, sizeof(double));
> > > + return x;
> > > +}
> > > +
> > > +/*
> > > +** negabsf:
> > > +** movi v[0-9]+.2s, 0x80, lsl 24
> > > +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> > > +** ret
> > > +*/
> > > +float negabsf (float x)
> > > +{
> > > + unsigned int y;
> > > + memcpy (&y, &x, sizeof(float));
> > > + y = y | (1U << 31);
> > > + memcpy (&x, &y, sizeof(float));
> > > + return x;
> > > +}
> > > +
> > >
> > >
> > >
> > >
> > > --
>
--
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
^ permalink raw reply [flat|nested] 17+ messages in thread
* RE: [PATCH]middle-end match.pd: optimize fneg (fabs (x)) to x | (1 << signbit(x)) [PR109154]
2023-09-27 7:11 ` Richard Biener
@ 2023-09-27 7:56 ` Tamar Christina
2023-09-27 9:35 ` Tamar Christina
0 siblings, 1 reply; 17+ messages in thread
From: Tamar Christina @ 2023-09-27 7:56 UTC (permalink / raw)
To: Richard Biener; +Cc: Andrew Pinski, gcc-patches, nd, jlaw
> -----Original Message-----
> From: Richard Biener <rguenther@suse.de>
> Sent: Wednesday, September 27, 2023 8:12 AM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: Andrew Pinski <pinskia@gmail.com>; gcc-patches@gcc.gnu.org; nd
> <nd@arm.com>; jlaw@ventanamicro.com
> Subject: RE: [PATCH]middle-end match.pd: optimize fneg (fabs (x)) to x | (1 <<
> signbit(x)) [PR109154]
>
> On Wed, 27 Sep 2023, Tamar Christina wrote:
>
> > > -----Original Message-----
> > > From: Andrew Pinski <pinskia@gmail.com>
> > > Sent: Wednesday, September 27, 2023 2:17 AM
> > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; rguenther@suse.de;
> > > jlaw@ventanamicro.com
> > > Subject: Re: [PATCH]middle-end match.pd: optimize fneg (fabs (x)) to
> > > x | (1 <<
> > > signbit(x)) [PR109154]
> > >
> > > On Tue, Sep 26, 2023 at 5:51?PM Tamar Christina
> > > <tamar.christina@arm.com>
> > > wrote:
> > > >
> > > > Hi All,
> > > >
> > > > For targets that allow conversion between int and float modes this
> > > > adds a new optimization transforming fneg (fabs (x)) into x | (1
> > > > << signbit(x)). Such sequences are common in scientific code
> > > > working with
> > > gradients.
> > > >
> > > > The transformed instruction if the target has an inclusive-OR that
> > > > takes an immediate is both shorter an faster. For those that
> > > > don't the immediate has to be seperate constructed but this still
> > > > ends up being faster as the immediate construction is not on the critical
> path.
> > > >
> > > > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> > > >
> > > > Ok for master?
> > >
> > > I think this should be part of isel instead of match.
> > > Maybe we could use genmatch to generate the code that does the
> > > transformations but this does not belong as part of match really.
> >
> > I disagree.. I don't think this belongs in isel. Isel is for structural
> transformations.
> > If there is a case for something else I'd imagine backwardprop is a better
> choice.
> >
> > But I don't see why it doesn't belong here considering it *is* a
> > mathematical optimization and the file has plenty of transformations
> > such as mask optimizations and vector conditional rewriting.
>
> But the mathematical transform would more generally be fneg (fabs (x)) ->
> copysign (x, -1.) and that can be optimally expanded at RTL expansion time?
Ah sure, atm I did copysign (x, -1) -> x | 1 << signbit. I can do it the other way
around. And I guess since copysign (-x, y), copysign(|x|, y) -> copysign (x, y) that
should solve the trigonometry problem too.
Cool will do that instead, thanks!
Tamar
>
> Richard.
>
> > Regards,
> > Tamar
> >
> > >
> > > Thanks,
> > > Andrew
> > >
> > > >
> > > > Thanks,
> > > > Tamar
> > > >
> > > > gcc/ChangeLog:
> > > >
> > > > PR tree-optimization/109154
> > > > * match.pd: Add new neg+abs rule.
> > > >
> > > > gcc/testsuite/ChangeLog:
> > > >
> > > > PR tree-optimization/109154
> > > > * gcc.target/aarch64/fneg-abs_1.c: New test.
> > > > * gcc.target/aarch64/fneg-abs_2.c: New test.
> > > > * gcc.target/aarch64/fneg-abs_3.c: New test.
> > > > * gcc.target/aarch64/fneg-abs_4.c: New test.
> > > > * gcc.target/aarch64/sve/fneg-abs_1.c: New test.
> > > > * gcc.target/aarch64/sve/fneg-abs_2.c: New test.
> > > > * gcc.target/aarch64/sve/fneg-abs_3.c: New test.
> > > > * gcc.target/aarch64/sve/fneg-abs_4.c: New test.
> > > >
> > > > --- inline copy of patch --
> > > > diff --git a/gcc/match.pd b/gcc/match.pd index
> > > >
> > >
> 39c7ea1088f25538ed8bd26ee89711566141a71f..8ebde06dcd4b26d69482
> > > 6cffad0f
> > > > b17e1136600a 100644
> > > > --- a/gcc/match.pd
> > > > +++ b/gcc/match.pd
> > > > @@ -9476,3 +9476,57 @@ and,
> > > > }
> > > > (if (full_perm_p)
> > > > (vec_perm (op@3 @0 @1) @3 @2))))))
> > > > +
> > > > +/* Transform fneg (fabs (X)) -> X | 1 << signbit (X). */
> > > > +
> > > > +(simplify
> > > > + (negate (abs @0))
> > > > + (if (FLOAT_TYPE_P (type)
> > > > + /* We have to delay this rewriting till after forward prop
> > > > +because
> > > otherwise
> > > > + it's harder to do trigonometry optimizations. e.g. cos(-fabs(x)) is not
> > > > + matched in one go. Instead cos (-x) is matched first
> > > > + followed by
> > > cos(|x|).
> > > > + The bottom op approach makes this rule match first and it's not
> untill
> > > > + fwdprop that we match top down. There are manu such
> > > > + simplications
> > > so we
> > > > + delay this optimization till later on. */
> > > > + && canonicalize_math_after_vectorization_p ()) (with {
> > > > + tree itype = unsigned_type_for (type);
> > > > + machine_mode mode = TYPE_MODE (type);
> > > > + const struct real_format *float_fmt = FLOAT_MODE_FORMAT (mode);
> > > > + auto optab = VECTOR_TYPE_P (type) ? optab_vector : optab_default; }
> > > > + (if (float_fmt
> > > > + && float_fmt->signbit_rw >= 0
> > > > + && targetm.can_change_mode_class (TYPE_MODE (itype),
> > > > + TYPE_MODE (type), ALL_REGS)
> > > > + && target_supports_op_p (itype, BIT_IOR_EXPR, optab))
> > > > + (with { wide_int wone = wi::one (element_precision (type));
> > > > + int sbit = float_fmt->signbit_rw;
> > > > + auto stype = VECTOR_TYPE_P (type) ? TREE_TYPE (itype) : itype;
> > > > + tree sign_bit = wide_int_to_tree (stype, wi::lshift (wone, sbit));}
> > > > + (view_convert:type
> > > > + (bit_ior (view_convert:itype @0)
> > > > + { build_uniform_cst (itype, sign_bit); } )))))))
> > > > +
> > > > +/* Repeat the same but for conditional negate. */
> > > > +
> > > > +(simplify
> > > > + (IFN_COND_NEG @1 (abs @0) @2)
> > > > + (if (FLOAT_TYPE_P (type))
> > > > + (with {
> > > > + tree itype = unsigned_type_for (type);
> > > > + machine_mode mode = TYPE_MODE (type);
> > > > + const struct real_format *float_fmt = FLOAT_MODE_FORMAT (mode);
> > > > + auto optab = VECTOR_TYPE_P (type) ? optab_vector : optab_default; }
> > > > + (if (float_fmt
> > > > + && float_fmt->signbit_rw >= 0
> > > > + && targetm.can_change_mode_class (TYPE_MODE (itype),
> > > > + TYPE_MODE (type), ALL_REGS)
> > > > + && target_supports_op_p (itype, BIT_IOR_EXPR, optab))
> > > > + (with { wide_int wone = wi::one (element_precision (type));
> > > > + int sbit = float_fmt->signbit_rw;
> > > > + auto stype = VECTOR_TYPE_P (type) ? TREE_TYPE (itype) : itype;
> > > > + tree sign_bit = wide_int_to_tree (stype, wi::lshift (wone, sbit));}
> > > > + (view_convert:type
> > > > + (IFN_COND_IOR @1 (view_convert:itype @0)
> > > > + { build_uniform_cst (itype, sign_bit); }
> > > > + (view_convert:itype @2) )))))))
> > > > \ No newline at end of file
> > > > diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_1.c
> > > > b/gcc/testsuite/gcc.target/aarch64/fneg-abs_1.c
> > > > new file mode 100644
> > > > index
> > > >
> > >
> 0000000000000000000000000000000000000000..f823013c3ddf6b3a266
> > > c3abfcbf2
> > > > 642fc2a75fa6
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_1.c
> > > > @@ -0,0 +1,39 @@
> > > > +/* { dg-do compile } */
> > > > +/* { dg-options "-O3" } */
> > > > +/* { dg-final { check-function-bodies "**" "" "" { target lp64 }
> > > > +} } */
> > > > +
> > > > +#pragma GCC target "+nosve"
> > > > +
> > > > +#include <arm_neon.h>
> > > > +
> > > > +/*
> > > > +** t1:
> > > > +** orr v[0-9]+.2s, #128, lsl #24
> > > > +** ret
> > > > +*/
> > > > +float32x2_t t1 (float32x2_t a)
> > > > +{
> > > > + return vneg_f32 (vabs_f32 (a)); }
> > > > +
> > > > +/*
> > > > +** t2:
> > > > +** orr v[0-9]+.4s, #128, lsl #24
> > > > +** ret
> > > > +*/
> > > > +float32x4_t t2 (float32x4_t a)
> > > > +{
> > > > + return vnegq_f32 (vabsq_f32 (a)); }
> > > > +
> > > > +/*
> > > > +** t3:
> > > > +** adrp x0, .LC[0-9]+
> > > > +** ldr q[0-9]+, \[x0, #:lo12:.LC0\]
> > > > +** orr v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> > > > +** ret
> > > > +*/
> > > > +float64x2_t t3 (float64x2_t a)
> > > > +{
> > > > + return vnegq_f64 (vabsq_f64 (a)); }
> > > > diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
> > > > b/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
> > > > new file mode 100644
> > > > index
> > > >
> > >
> 0000000000000000000000000000000000000000..141121176b309e4b2a
> > > a413dc5527
> > > > 1a6e3c93d5e1
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
> > > > @@ -0,0 +1,31 @@
> > > > +/* { dg-do compile } */
> > > > +/* { dg-options "-O3" } */
> > > > +/* { dg-final { check-function-bodies "**" "" "" { target lp64 }
> > > > +} } */
> > > > +
> > > > +#pragma GCC target "+nosve"
> > > > +
> > > > +#include <arm_neon.h>
> > > > +#include <math.h>
> > > > +
> > > > +/*
> > > > +** f1:
> > > > +** movi v[0-9]+.2s, 0x80, lsl 24
> > > > +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> > > > +** ret
> > > > +*/
> > > > +float32_t f1 (float32_t a)
> > > > +{
> > > > + return -fabsf (a);
> > > > +}
> > > > +
> > > > +/*
> > > > +** f2:
> > > > +** mov x0, -9223372036854775808
> > > > +** fmov d[0-9]+, x0
> > > > +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> > > > +** ret
> > > > +*/
> > > > +float64_t f2 (float64_t a)
> > > > +{
> > > > + return -fabs (a);
> > > > +}
> > > > diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_3.c
> > > > b/gcc/testsuite/gcc.target/aarch64/fneg-abs_3.c
> > > > new file mode 100644
> > > > index
> > > >
> > >
> 0000000000000000000000000000000000000000..b4652173a95d104ddf
> > > a70c497f06
> > > > 27a61ea89d3b
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_3.c
> > > > @@ -0,0 +1,36 @@
> > > > +/* { dg-do compile } */
> > > > +/* { dg-options "-O3" } */
> > > > +/* { dg-final { check-function-bodies "**" "" "" { target lp64 }
> > > > +} } */
> > > > +
> > > > +#pragma GCC target "+nosve"
> > > > +
> > > > +#include <arm_neon.h>
> > > > +#include <math.h>
> > > > +
> > > > +/*
> > > > +** f1:
> > > > +** ...
> > > > +** ldr q[0-9]+, \[x0\]
> > > > +** orr v[0-9]+.4s, #128, lsl #24
> > > > +** str q[0-9]+, \[x0\], 16
> > > > +** ...
> > > > +*/
> > > > +void f1 (float32_t *a, int n)
> > > > +{
> > > > + for (int i = 0; i < (n & -8); i++)
> > > > + a[i] = -fabsf (a[i]);
> > > > +}
> > > > +
> > > > +/*
> > > > +** f2:
> > > > +** ...
> > > > +** ldr q[0-9]+, \[x0\]
> > > > +** orr v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> > > > +** str q[0-9]+, \[x0\], 16
> > > > +** ...
> > > > +*/
> > > > +void f2 (float64_t *a, int n)
> > > > +{
> > > > + for (int i = 0; i < (n & -8); i++)
> > > > + a[i] = -fabs (a[i]);
> > > > +}
> > > > diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c
> > > > b/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c
> > > > new file mode 100644
> > > > index
> > > >
> > >
> 0000000000000000000000000000000000000000..10879dea74462d34b2
> > > 6160eeb0bd
> > > > 54ead063166b
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c
> > > > @@ -0,0 +1,39 @@
> > > > +/* { dg-do compile } */
> > > > +/* { dg-options "-O3" } */
> > > > +/* { dg-final { check-function-bodies "**" "" "" { target lp64 }
> > > > +} } */
> > > > +
> > > > +#pragma GCC target "+nosve"
> > > > +
> > > > +#include <string.h>
> > > > +
> > > > +/*
> > > > +** negabs:
> > > > +** mov x0, -9223372036854775808
> > > > +** fmov d[0-9]+, x0
> > > > +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> > > > +** ret
> > > > +*/
> > > > +double negabs (double x)
> > > > +{
> > > > + unsigned long long y;
> > > > + memcpy (&y, &x, sizeof(double));
> > > > + y = y | (1UL << 63);
> > > > + memcpy (&x, &y, sizeof(double));
> > > > + return x;
> > > > +}
> > > > +
> > > > +/*
> > > > +** negabsf:
> > > > +** movi v[0-9]+.2s, 0x80, lsl 24
> > > > +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> > > > +** ret
> > > > +*/
> > > > +float negabsf (float x)
> > > > +{
> > > > + unsigned int y;
> > > > + memcpy (&y, &x, sizeof(float));
> > > > + y = y | (1U << 31);
> > > > + memcpy (&x, &y, sizeof(float));
> > > > + return x;
> > > > +}
> > > > +
> > > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c
> > > > b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c
> > > > new file mode 100644
> > > > index
> > > >
> > >
> 0000000000000000000000000000000000000000..0c7664e6de77a49768
> > > 2952653ffd
> > > > 417453854d52
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c
> > > > @@ -0,0 +1,37 @@
> > > > +/* { dg-do compile } */
> > > > +/* { dg-options "-O3" } */
> > > > +/* { dg-final { check-function-bodies "**" "" "" { target lp64 }
> > > > +} } */
> > > > +
> > > > +#include <arm_neon.h>
> > > > +
> > > > +/*
> > > > +** t1:
> > > > +** orr v[0-9]+.2s, #128, lsl #24
> > > > +** ret
> > > > +*/
> > > > +float32x2_t t1 (float32x2_t a)
> > > > +{
> > > > + return vneg_f32 (vabs_f32 (a)); }
> > > > +
> > > > +/*
> > > > +** t2:
> > > > +** orr v[0-9]+.4s, #128, lsl #24
> > > > +** ret
> > > > +*/
> > > > +float32x4_t t2 (float32x4_t a)
> > > > +{
> > > > + return vnegq_f32 (vabsq_f32 (a)); }
> > > > +
> > > > +/*
> > > > +** t3:
> > > > +** adrp x0, .LC[0-9]+
> > > > +** ldr q[0-9]+, \[x0, #:lo12:.LC0\]
> > > > +** orr v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> > > > +** ret
> > > > +*/
> > > > +float64x2_t t3 (float64x2_t a)
> > > > +{
> > > > + return vnegq_f64 (vabsq_f64 (a)); }
> > > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
> > > > b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
> > > > new file mode 100644
> > > > index
> > > >
> > >
> 0000000000000000000000000000000000000000..a60cd31b9294af2dac6
> > > 9eed1c93f
> > > > 899bd5c78fca
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
> > > > @@ -0,0 +1,29 @@
> > > > +/* { dg-do compile } */
> > > > +/* { dg-options "-O3" } */
> > > > +/* { dg-final { check-function-bodies "**" "" "" { target lp64 }
> > > > +} } */
> > > > +
> > > > +#include <arm_neon.h>
> > > > +#include <math.h>
> > > > +
> > > > +/*
> > > > +** f1:
> > > > +** movi v[0-9]+.2s, 0x80, lsl 24
> > > > +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> > > > +** ret
> > > > +*/
> > > > +float32_t f1 (float32_t a)
> > > > +{
> > > > + return -fabsf (a);
> > > > +}
> > > > +
> > > > +/*
> > > > +** f2:
> > > > +** mov x0, -9223372036854775808
> > > > +** fmov d[0-9]+, x0
> > > > +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> > > > +** ret
> > > > +*/
> > > > +float64_t f2 (float64_t a)
> > > > +{
> > > > + return -fabs (a);
> > > > +}
> > > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_3.c
> > > > b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_3.c
> > > > new file mode 100644
> > > > index
> > > >
> > >
> 0000000000000000000000000000000000000000..1bf34328d8841de8e6
> > > b0a5458562
> > > > a9f00e31c275
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_3.c
> > > > @@ -0,0 +1,34 @@
> > > > +/* { dg-do compile } */
> > > > +/* { dg-options "-O3" } */
> > > > +/* { dg-final { check-function-bodies "**" "" "" { target lp64 }
> > > > +} } */
> > > > +
> > > > +#include <arm_neon.h>
> > > > +#include <math.h>
> > > > +
> > > > +/*
> > > > +** f1:
> > > > +** ...
> > > > +** ld1w z[0-9]+.s, p[0-9]+/z, \[x0, x2, lsl 2\]
> > > > +** orr z[0-9]+.s, z[0-9]+.s, #0x80000000
> > > > +** st1w z[0-9]+.s, p[0-9]+, \[x0, x2, lsl 2\]
> > > > +** ...
> > > > +*/
> > > > +void f1 (float32_t *a, int n)
> > > > +{
> > > > + for (int i = 0; i < (n & -8); i++)
> > > > + a[i] = -fabsf (a[i]);
> > > > +}
> > > > +
> > > > +/*
> > > > +** f2:
> > > > +** ...
> > > > +** ld1d z[0-9]+.d, p[0-9]+/z, \[x0, x2, lsl 3\]
> > > > +** orr z[0-9]+.d, z[0-9]+.d, #0x8000000000000000
> > > > +** st1d z[0-9]+.d, p[0-9]+, \[x0, x2, lsl 3\]
> > > > +** ...
> > > > +*/
> > > > +void f2 (float64_t *a, int n)
> > > > +{
> > > > + for (int i = 0; i < (n & -8); i++)
> > > > + a[i] = -fabs (a[i]);
> > > > +}
> > > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_4.c
> > > > b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_4.c
> > > > new file mode 100644
> > > > index
> > > >
> > >
> 0000000000000000000000000000000000000000..21f2a8da2a5d44e3d0
> > > 1f6604ca7b
> > > > e87e3744d494
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_4.c
> > > > @@ -0,0 +1,37 @@
> > > > +/* { dg-do compile } */
> > > > +/* { dg-options "-O3" } */
> > > > +/* { dg-final { check-function-bodies "**" "" "" { target lp64 }
> > > > +} } */
> > > > +
> > > > +#include <string.h>
> > > > +
> > > > +/*
> > > > +** negabs:
> > > > +** mov x0, -9223372036854775808
> > > > +** fmov d[0-9]+, x0
> > > > +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> > > > +** ret
> > > > +*/
> > > > +double negabs (double x)
> > > > +{
> > > > + unsigned long long y;
> > > > + memcpy (&y, &x, sizeof(double));
> > > > + y = y | (1UL << 63);
> > > > + memcpy (&x, &y, sizeof(double));
> > > > + return x;
> > > > +}
> > > > +
> > > > +/*
> > > > +** negabsf:
> > > > +** movi v[0-9]+.2s, 0x80, lsl 24
> > > > +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> > > > +** ret
> > > > +*/
> > > > +float negabsf (float x)
> > > > +{
> > > > + unsigned int y;
> > > > + memcpy (&y, &x, sizeof(float));
> > > > + y = y | (1U << 31);
> > > > + memcpy (&x, &y, sizeof(float));
> > > > + return x;
> > > > +}
> > > > +
> > > >
> > > >
> > > >
> > > >
> > > > --
> >
>
> --
> Richard Biener <rguenther@suse.de>
> SUSE Software Solutions Germany GmbH,
> Frankenstrasse 146, 90461 Nuernberg, Germany;
> GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> Nuernberg)
^ permalink raw reply [flat|nested] 17+ messages in thread
* RE: [PATCH]middle-end match.pd: optimize fneg (fabs (x)) to x | (1 << signbit(x)) [PR109154]
2023-09-27 7:56 ` Tamar Christina
@ 2023-09-27 9:35 ` Tamar Christina
2023-09-27 9:39 ` Richard Biener
0 siblings, 1 reply; 17+ messages in thread
From: Tamar Christina @ 2023-09-27 9:35 UTC (permalink / raw)
To: Tamar Christina, Richard Biener; +Cc: Andrew Pinski, gcc-patches, nd, jlaw
> -----Original Message-----
> From: Tamar Christina <Tamar.Christina@arm.com>
> Sent: Wednesday, September 27, 2023 8:57 AM
> To: Richard Biener <rguenther@suse.de>
> Cc: Andrew Pinski <pinskia@gmail.com>; gcc-patches@gcc.gnu.org; nd
> <nd@arm.com>; jlaw@ventanamicro.com
> Subject: RE: [PATCH]middle-end match.pd: optimize fneg (fabs (x)) to x | (1 <<
> signbit(x)) [PR109154]
>
> > -----Original Message-----
> > From: Richard Biener <rguenther@suse.de>
> > Sent: Wednesday, September 27, 2023 8:12 AM
> > To: Tamar Christina <Tamar.Christina@arm.com>
> > Cc: Andrew Pinski <pinskia@gmail.com>; gcc-patches@gcc.gnu.org; nd
> > <nd@arm.com>; jlaw@ventanamicro.com
> > Subject: RE: [PATCH]middle-end match.pd: optimize fneg (fabs (x)) to x
> > | (1 <<
> > signbit(x)) [PR109154]
> >
> > On Wed, 27 Sep 2023, Tamar Christina wrote:
> >
> > > > -----Original Message-----
> > > > From: Andrew Pinski <pinskia@gmail.com>
> > > > Sent: Wednesday, September 27, 2023 2:17 AM
> > > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; rguenther@suse.de;
> > > > jlaw@ventanamicro.com
> > > > Subject: Re: [PATCH]middle-end match.pd: optimize fneg (fabs (x))
> > > > to x | (1 <<
> > > > signbit(x)) [PR109154]
> > > >
> > > > On Tue, Sep 26, 2023 at 5:51?PM Tamar Christina
> > > > <tamar.christina@arm.com>
> > > > wrote:
> > > > >
> > > > > Hi All,
> > > > >
> > > > > For targets that allow conversion between int and float modes
> > > > > this adds a new optimization transforming fneg (fabs (x)) into x
> > > > > | (1 << signbit(x)). Such sequences are common in scientific
> > > > > code working with
> > > > gradients.
> > > > >
> > > > > The transformed instruction if the target has an inclusive-OR
> > > > > that takes an immediate is both shorter an faster. For those
> > > > > that don't the immediate has to be seperate constructed but this
> > > > > still ends up being faster as the immediate construction is not
> > > > > on the critical
> > path.
> > > > >
> > > > > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> > > > >
> > > > > Ok for master?
> > > >
> > > > I think this should be part of isel instead of match.
> > > > Maybe we could use genmatch to generate the code that does the
> > > > transformations but this does not belong as part of match really.
> > >
> > > I disagree.. I don't think this belongs in isel. Isel is for
> > > structural
> > transformations.
> > > If there is a case for something else I'd imagine backwardprop is a
> > > better
> > choice.
> > >
> > > But I don't see why it doesn't belong here considering it *is* a
> > > mathematical optimization and the file has plenty of transformations
> > > such as mask optimizations and vector conditional rewriting.
> >
> > But the mathematical transform would more generally be fneg (fabs (x))
> > -> copysign (x, -1.) and that can be optimally expanded at RTL expansion
> time?
>
> Ah sure, atm I did copysign (x, -1) -> x | 1 << signbits. I can do it the other way
> around. And I guess since copysign (-x, y), copysign(|x|, y) -> copysign (x, y)
> that should solve the trigonometry problem too.
>
> Cool will do that instead, thanks!
Hmm this seems to conflict with the pattern
/* copysign(x, CST) -> [-]abs (x). */
(for copysigns (COPYSIGN_ALL)
(simplify
(copysigns @0 REAL_CST@1)
(if (REAL_VALUE_NEGATIVE (TREE_REAL_CST (@1)))
(negate (abs @0))
(abs @0))))
Which does the opposite transformation.
Should I try removing this?
Thanks,
Tamar
>
> Tamar
>
> >
> > Richard.
> >
> > > Regards,
> > > Tamar
> > >
> > > >
> > > > Thanks,
> > > > Andrew
> > > >
> > > > >
> > > > > Thanks,
> > > > > Tamar
> > > > >
> > > > > gcc/ChangeLog:
> > > > >
> > > > > PR tree-optimization/109154
> > > > > * match.pd: Add new neg+abs rule.
> > > > >
> > > > > gcc/testsuite/ChangeLog:
> > > > >
> > > > > PR tree-optimization/109154
> > > > > * gcc.target/aarch64/fneg-abs_1.c: New test.
> > > > > * gcc.target/aarch64/fneg-abs_2.c: New test.
> > > > > * gcc.target/aarch64/fneg-abs_3.c: New test.
> > > > > * gcc.target/aarch64/fneg-abs_4.c: New test.
> > > > > * gcc.target/aarch64/sve/fneg-abs_1.c: New test.
> > > > > * gcc.target/aarch64/sve/fneg-abs_2.c: New test.
> > > > > * gcc.target/aarch64/sve/fneg-abs_3.c: New test.
> > > > > * gcc.target/aarch64/sve/fneg-abs_4.c: New test.
> > > > >
> > > > > --- inline copy of patch --
> > > > > diff --git a/gcc/match.pd b/gcc/match.pd index
> > > > >
> > > >
> >
> 39c7ea1088f25538ed8bd26ee89711566141a71f..8ebde06dcd4b26d69482
> > > > 6cffad0f
> > > > > b17e1136600a 100644
> > > > > --- a/gcc/match.pd
> > > > > +++ b/gcc/match.pd
> > > > > @@ -9476,3 +9476,57 @@ and,
> > > > > }
> > > > > (if (full_perm_p)
> > > > > (vec_perm (op@3 @0 @1) @3 @2))))))
> > > > > +
> > > > > +/* Transform fneg (fabs (X)) -> X | 1 << signbit (X). */
> > > > > +
> > > > > +(simplify
> > > > > + (negate (abs @0))
> > > > > + (if (FLOAT_TYPE_P (type)
> > > > > + /* We have to delay this rewriting till after forward
> > > > > +prop because
> > > > otherwise
> > > > > + it's harder to do trigonometry optimizations. e.g. cos(-fabs(x)) is
> not
> > > > > + matched in one go. Instead cos (-x) is matched first
> > > > > + followed by
> > > > cos(|x|).
> > > > > + The bottom op approach makes this rule match first and
> > > > > + it's not
> > untill
> > > > > + fwdprop that we match top down. There are manu such
> > > > > + simplications
> > > > so we
> > > > > + delay this optimization till later on. */
> > > > > + && canonicalize_math_after_vectorization_p ()) (with {
> > > > > + tree itype = unsigned_type_for (type);
> > > > > + machine_mode mode = TYPE_MODE (type);
> > > > > + const struct real_format *float_fmt = FLOAT_MODE_FORMAT
> (mode);
> > > > > + auto optab = VECTOR_TYPE_P (type) ? optab_vector :
> optab_default; }
> > > > > + (if (float_fmt
> > > > > + && float_fmt->signbit_rw >= 0
> > > > > + && targetm.can_change_mode_class (TYPE_MODE (itype),
> > > > > + TYPE_MODE (type), ALL_REGS)
> > > > > + && target_supports_op_p (itype, BIT_IOR_EXPR, optab))
> > > > > + (with { wide_int wone = wi::one (element_precision (type));
> > > > > + int sbit = float_fmt->signbit_rw;
> > > > > + auto stype = VECTOR_TYPE_P (type) ? TREE_TYPE (itype) : itype;
> > > > > + tree sign_bit = wide_int_to_tree (stype, wi::lshift (wone, sbit));}
> > > > > + (view_convert:type
> > > > > + (bit_ior (view_convert:itype @0)
> > > > > + { build_uniform_cst (itype, sign_bit); } )))))))
> > > > > +
> > > > > +/* Repeat the same but for conditional negate. */
> > > > > +
> > > > > +(simplify
> > > > > + (IFN_COND_NEG @1 (abs @0) @2)
> > > > > + (if (FLOAT_TYPE_P (type))
> > > > > + (with {
> > > > > + tree itype = unsigned_type_for (type);
> > > > > + machine_mode mode = TYPE_MODE (type);
> > > > > + const struct real_format *float_fmt = FLOAT_MODE_FORMAT
> (mode);
> > > > > + auto optab = VECTOR_TYPE_P (type) ? optab_vector :
> optab_default; }
> > > > > + (if (float_fmt
> > > > > + && float_fmt->signbit_rw >= 0
> > > > > + && targetm.can_change_mode_class (TYPE_MODE (itype),
> > > > > + TYPE_MODE (type), ALL_REGS)
> > > > > + && target_supports_op_p (itype, BIT_IOR_EXPR, optab))
> > > > > + (with { wide_int wone = wi::one (element_precision (type));
> > > > > + int sbit = float_fmt->signbit_rw;
> > > > > + auto stype = VECTOR_TYPE_P (type) ? TREE_TYPE (itype) : itype;
> > > > > + tree sign_bit = wide_int_to_tree (stype, wi::lshift (wone, sbit));}
> > > > > + (view_convert:type
> > > > > + (IFN_COND_IOR @1 (view_convert:itype @0)
> > > > > + { build_uniform_cst (itype, sign_bit); }
> > > > > + (view_convert:itype @2) )))))))
> > > > > \ No newline at end of file
> > > > > diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_1.c
> > > > > b/gcc/testsuite/gcc.target/aarch64/fneg-abs_1.c
> > > > > new file mode 100644
> > > > > index
> > > > >
> > > >
> >
> 0000000000000000000000000000000000000000..f823013c3ddf6b3a266
> > > > c3abfcbf2
> > > > > 642fc2a75fa6
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_1.c
> > > > > @@ -0,0 +1,39 @@
> > > > > +/* { dg-do compile } */
> > > > > +/* { dg-options "-O3" } */
> > > > > +/* { dg-final { check-function-bodies "**" "" "" { target lp64
> > > > > +} } } */
> > > > > +
> > > > > +#pragma GCC target "+nosve"
> > > > > +
> > > > > +#include <arm_neon.h>
> > > > > +
> > > > > +/*
> > > > > +** t1:
> > > > > +** orr v[0-9]+.2s, #128, lsl #24
> > > > > +** ret
> > > > > +*/
> > > > > +float32x2_t t1 (float32x2_t a)
> > > > > +{
> > > > > + return vneg_f32 (vabs_f32 (a)); }
> > > > > +
> > > > > +/*
> > > > > +** t2:
> > > > > +** orr v[0-9]+.4s, #128, lsl #24
> > > > > +** ret
> > > > > +*/
> > > > > +float32x4_t t2 (float32x4_t a)
> > > > > +{
> > > > > + return vnegq_f32 (vabsq_f32 (a)); }
> > > > > +
> > > > > +/*
> > > > > +** t3:
> > > > > +** adrp x0, .LC[0-9]+
> > > > > +** ldr q[0-9]+, \[x0, #:lo12:.LC0\]
> > > > > +** orr v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> > > > > +** ret
> > > > > +*/
> > > > > +float64x2_t t3 (float64x2_t a)
> > > > > +{
> > > > > + return vnegq_f64 (vabsq_f64 (a)); }
> > > > > diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
> > > > > b/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
> > > > > new file mode 100644
> > > > > index
> > > > >
> > > >
> >
> 0000000000000000000000000000000000000000..141121176b309e4b2a
> > > > a413dc5527
> > > > > 1a6e3c93d5e1
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
> > > > > @@ -0,0 +1,31 @@
> > > > > +/* { dg-do compile } */
> > > > > +/* { dg-options "-O3" } */
> > > > > +/* { dg-final { check-function-bodies "**" "" "" { target lp64
> > > > > +} } } */
> > > > > +
> > > > > +#pragma GCC target "+nosve"
> > > > > +
> > > > > +#include <arm_neon.h>
> > > > > +#include <math.h>
> > > > > +
> > > > > +/*
> > > > > +** f1:
> > > > > +** movi v[0-9]+.2s, 0x80, lsl 24
> > > > > +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> > > > > +** ret
> > > > > +*/
> > > > > +float32_t f1 (float32_t a)
> > > > > +{
> > > > > + return -fabsf (a);
> > > > > +}
> > > > > +
> > > > > +/*
> > > > > +** f2:
> > > > > +** mov x0, -9223372036854775808
> > > > > +** fmov d[0-9]+, x0
> > > > > +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> > > > > +** ret
> > > > > +*/
> > > > > +float64_t f2 (float64_t a)
> > > > > +{
> > > > > + return -fabs (a);
> > > > > +}
> > > > > diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_3.c
> > > > > b/gcc/testsuite/gcc.target/aarch64/fneg-abs_3.c
> > > > > new file mode 100644
> > > > > index
> > > > >
> > > >
> >
> 0000000000000000000000000000000000000000..b4652173a95d104ddf
> > > > a70c497f06
> > > > > 27a61ea89d3b
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_3.c
> > > > > @@ -0,0 +1,36 @@
> > > > > +/* { dg-do compile } */
> > > > > +/* { dg-options "-O3" } */
> > > > > +/* { dg-final { check-function-bodies "**" "" "" { target lp64
> > > > > +} } } */
> > > > > +
> > > > > +#pragma GCC target "+nosve"
> > > > > +
> > > > > +#include <arm_neon.h>
> > > > > +#include <math.h>
> > > > > +
> > > > > +/*
> > > > > +** f1:
> > > > > +** ...
> > > > > +** ldr q[0-9]+, \[x0\]
> > > > > +** orr v[0-9]+.4s, #128, lsl #24
> > > > > +** str q[0-9]+, \[x0\], 16
> > > > > +** ...
> > > > > +*/
> > > > > +void f1 (float32_t *a, int n)
> > > > > +{
> > > > > + for (int i = 0; i < (n & -8); i++)
> > > > > + a[i] = -fabsf (a[i]);
> > > > > +}
> > > > > +
> > > > > +/*
> > > > > +** f2:
> > > > > +** ...
> > > > > +** ldr q[0-9]+, \[x0\]
> > > > > +** orr v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> > > > > +** str q[0-9]+, \[x0\], 16
> > > > > +** ...
> > > > > +*/
> > > > > +void f2 (float64_t *a, int n)
> > > > > +{
> > > > > + for (int i = 0; i < (n & -8); i++)
> > > > > + a[i] = -fabs (a[i]);
> > > > > +}
> > > > > diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c
> > > > > b/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c
> > > > > new file mode 100644
> > > > > index
> > > > >
> > > >
> >
> 0000000000000000000000000000000000000000..10879dea74462d34b2
> > > > 6160eeb0bd
> > > > > 54ead063166b
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c
> > > > > @@ -0,0 +1,39 @@
> > > > > +/* { dg-do compile } */
> > > > > +/* { dg-options "-O3" } */
> > > > > +/* { dg-final { check-function-bodies "**" "" "" { target lp64
> > > > > +} } } */
> > > > > +
> > > > > +#pragma GCC target "+nosve"
> > > > > +
> > > > > +#include <string.h>
> > > > > +
> > > > > +/*
> > > > > +** negabs:
> > > > > +** mov x0, -9223372036854775808
> > > > > +** fmov d[0-9]+, x0
> > > > > +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> > > > > +** ret
> > > > > +*/
> > > > > +double negabs (double x)
> > > > > +{
> > > > > + unsigned long long y;
> > > > > + memcpy (&y, &x, sizeof(double));
> > > > > + y = y | (1UL << 63);
> > > > > + memcpy (&x, &y, sizeof(double));
> > > > > + return x;
> > > > > +}
> > > > > +
> > > > > +/*
> > > > > +** negabsf:
> > > > > +** movi v[0-9]+.2s, 0x80, lsl 24
> > > > > +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> > > > > +** ret
> > > > > +*/
> > > > > +float negabsf (float x)
> > > > > +{
> > > > > + unsigned int y;
> > > > > + memcpy (&y, &x, sizeof(float));
> > > > > + y = y | (1U << 31);
> > > > > + memcpy (&x, &y, sizeof(float));
> > > > > + return x;
> > > > > +}
> > > > > +
> > > > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c
> > > > > b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c
> > > > > new file mode 100644
> > > > > index
> > > > >
> > > >
> >
> 0000000000000000000000000000000000000000..0c7664e6de77a49768
> > > > 2952653ffd
> > > > > 417453854d52
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c
> > > > > @@ -0,0 +1,37 @@
> > > > > +/* { dg-do compile } */
> > > > > +/* { dg-options "-O3" } */
> > > > > +/* { dg-final { check-function-bodies "**" "" "" { target lp64
> > > > > +} } } */
> > > > > +
> > > > > +#include <arm_neon.h>
> > > > > +
> > > > > +/*
> > > > > +** t1:
> > > > > +** orr v[0-9]+.2s, #128, lsl #24
> > > > > +** ret
> > > > > +*/
> > > > > +float32x2_t t1 (float32x2_t a)
> > > > > +{
> > > > > + return vneg_f32 (vabs_f32 (a)); }
> > > > > +
> > > > > +/*
> > > > > +** t2:
> > > > > +** orr v[0-9]+.4s, #128, lsl #24
> > > > > +** ret
> > > > > +*/
> > > > > +float32x4_t t2 (float32x4_t a)
> > > > > +{
> > > > > + return vnegq_f32 (vabsq_f32 (a)); }
> > > > > +
> > > > > +/*
> > > > > +** t3:
> > > > > +** adrp x0, .LC[0-9]+
> > > > > +** ldr q[0-9]+, \[x0, #:lo12:.LC0\]
> > > > > +** orr v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> > > > > +** ret
> > > > > +*/
> > > > > +float64x2_t t3 (float64x2_t a)
> > > > > +{
> > > > > + return vnegq_f64 (vabsq_f64 (a)); }
> > > > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
> > > > > b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
> > > > > new file mode 100644
> > > > > index
> > > > >
> > > >
> >
> 0000000000000000000000000000000000000000..a60cd31b9294af2dac6
> > > > 9eed1c93f
> > > > > 899bd5c78fca
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
> > > > > @@ -0,0 +1,29 @@
> > > > > +/* { dg-do compile } */
> > > > > +/* { dg-options "-O3" } */
> > > > > +/* { dg-final { check-function-bodies "**" "" "" { target lp64
> > > > > +} } } */
> > > > > +
> > > > > +#include <arm_neon.h>
> > > > > +#include <math.h>
> > > > > +
> > > > > +/*
> > > > > +** f1:
> > > > > +** movi v[0-9]+.2s, 0x80, lsl 24
> > > > > +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> > > > > +** ret
> > > > > +*/
> > > > > +float32_t f1 (float32_t a)
> > > > > +{
> > > > > + return -fabsf (a);
> > > > > +}
> > > > > +
> > > > > +/*
> > > > > +** f2:
> > > > > +** mov x0, -9223372036854775808
> > > > > +** fmov d[0-9]+, x0
> > > > > +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> > > > > +** ret
> > > > > +*/
> > > > > +float64_t f2 (float64_t a)
> > > > > +{
> > > > > + return -fabs (a);
> > > > > +}
> > > > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_3.c
> > > > > b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_3.c
> > > > > new file mode 100644
> > > > > index
> > > > >
> > > >
> >
> 0000000000000000000000000000000000000000..1bf34328d8841de8e6
> > > > b0a5458562
> > > > > a9f00e31c275
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_3.c
> > > > > @@ -0,0 +1,34 @@
> > > > > +/* { dg-do compile } */
> > > > > +/* { dg-options "-O3" } */
> > > > > +/* { dg-final { check-function-bodies "**" "" "" { target lp64
> > > > > +} } } */
> > > > > +
> > > > > +#include <arm_neon.h>
> > > > > +#include <math.h>
> > > > > +
> > > > > +/*
> > > > > +** f1:
> > > > > +** ...
> > > > > +** ld1w z[0-9]+.s, p[0-9]+/z, \[x0, x2, lsl 2\]
> > > > > +** orr z[0-9]+.s, z[0-9]+.s, #0x80000000
> > > > > +** st1w z[0-9]+.s, p[0-9]+, \[x0, x2, lsl 2\]
> > > > > +** ...
> > > > > +*/
> > > > > +void f1 (float32_t *a, int n)
> > > > > +{
> > > > > + for (int i = 0; i < (n & -8); i++)
> > > > > + a[i] = -fabsf (a[i]);
> > > > > +}
> > > > > +
> > > > > +/*
> > > > > +** f2:
> > > > > +** ...
> > > > > +** ld1d z[0-9]+.d, p[0-9]+/z, \[x0, x2, lsl 3\]
> > > > > +** orr z[0-9]+.d, z[0-9]+.d, #0x8000000000000000
> > > > > +** st1d z[0-9]+.d, p[0-9]+, \[x0, x2, lsl 3\]
> > > > > +** ...
> > > > > +*/
> > > > > +void f2 (float64_t *a, int n)
> > > > > +{
> > > > > + for (int i = 0; i < (n & -8); i++)
> > > > > + a[i] = -fabs (a[i]);
> > > > > +}
> > > > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_4.c
> > > > > b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_4.c
> > > > > new file mode 100644
> > > > > index
> > > > >
> > > >
> >
> 0000000000000000000000000000000000000000..21f2a8da2a5d44e3d0
> > > > 1f6604ca7b
> > > > > e87e3744d494
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_4.c
> > > > > @@ -0,0 +1,37 @@
> > > > > +/* { dg-do compile } */
> > > > > +/* { dg-options "-O3" } */
> > > > > +/* { dg-final { check-function-bodies "**" "" "" { target lp64
> > > > > +} } } */
> > > > > +
> > > > > +#include <string.h>
> > > > > +
> > > > > +/*
> > > > > +** negabs:
> > > > > +** mov x0, -9223372036854775808
> > > > > +** fmov d[0-9]+, x0
> > > > > +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> > > > > +** ret
> > > > > +*/
> > > > > +double negabs (double x)
> > > > > +{
> > > > > + unsigned long long y;
> > > > > + memcpy (&y, &x, sizeof(double));
> > > > > + y = y | (1UL << 63);
> > > > > + memcpy (&x, &y, sizeof(double));
> > > > > + return x;
> > > > > +}
> > > > > +
> > > > > +/*
> > > > > +** negabsf:
> > > > > +** movi v[0-9]+.2s, 0x80, lsl 24
> > > > > +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> > > > > +** ret
> > > > > +*/
> > > > > +float negabsf (float x)
> > > > > +{
> > > > > + unsigned int y;
> > > > > + memcpy (&y, &x, sizeof(float));
> > > > > + y = y | (1U << 31);
> > > > > + memcpy (&x, &y, sizeof(float));
> > > > > + return x;
> > > > > +}
> > > > > +
> > > > >
> > > > >
> > > > >
> > > > >
> > > > > --
> > >
> >
> > --
> > Richard Biener <rguenther@suse.de>
> > SUSE Software Solutions Germany GmbH,
> > Frankenstrasse 146, 90461 Nuernberg, Germany;
> > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> > Nuernberg)
^ permalink raw reply [flat|nested] 17+ messages in thread
* RE: [PATCH]middle-end match.pd: optimize fneg (fabs (x)) to x | (1 << signbit(x)) [PR109154]
2023-09-27 9:35 ` Tamar Christina
@ 2023-09-27 9:39 ` Richard Biener
2023-10-05 18:11 ` Tamar Christina
0 siblings, 1 reply; 17+ messages in thread
From: Richard Biener @ 2023-09-27 9:39 UTC (permalink / raw)
To: Tamar Christina; +Cc: Andrew Pinski, gcc-patches, nd, jlaw
On Wed, 27 Sep 2023, Tamar Christina wrote:
> > -----Original Message-----
> > From: Tamar Christina <Tamar.Christina@arm.com>
> > Sent: Wednesday, September 27, 2023 8:57 AM
> > To: Richard Biener <rguenther@suse.de>
> > Cc: Andrew Pinski <pinskia@gmail.com>; gcc-patches@gcc.gnu.org; nd
> > <nd@arm.com>; jlaw@ventanamicro.com
> > Subject: RE: [PATCH]middle-end match.pd: optimize fneg (fabs (x)) to x | (1 <<
> > signbit(x)) [PR109154]
> >
> > > -----Original Message-----
> > > From: Richard Biener <rguenther@suse.de>
> > > Sent: Wednesday, September 27, 2023 8:12 AM
> > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > Cc: Andrew Pinski <pinskia@gmail.com>; gcc-patches@gcc.gnu.org; nd
> > > <nd@arm.com>; jlaw@ventanamicro.com
> > > Subject: RE: [PATCH]middle-end match.pd: optimize fneg (fabs (x)) to x
> > > | (1 <<
> > > signbit(x)) [PR109154]
> > >
> > > On Wed, 27 Sep 2023, Tamar Christina wrote:
> > >
> > > > > -----Original Message-----
> > > > > From: Andrew Pinski <pinskia@gmail.com>
> > > > > Sent: Wednesday, September 27, 2023 2:17 AM
> > > > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > > > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; rguenther@suse.de;
> > > > > jlaw@ventanamicro.com
> > > > > Subject: Re: [PATCH]middle-end match.pd: optimize fneg (fabs (x))
> > > > > to x | (1 <<
> > > > > signbit(x)) [PR109154]
> > > > >
> > > > > On Tue, Sep 26, 2023 at 5:51?PM Tamar Christina
> > > > > <tamar.christina@arm.com>
> > > > > wrote:
> > > > > >
> > > > > > Hi All,
> > > > > >
> > > > > > For targets that allow conversion between int and float modes
> > > > > > this adds a new optimization transforming fneg (fabs (x)) into x
> > > > > > | (1 << signbit(x)). Such sequences are common in scientific
> > > > > > code working with
> > > > > gradients.
> > > > > >
> > > > > > The transformed instruction if the target has an inclusive-OR
> > > > > > that takes an immediate is both shorter an faster. For those
> > > > > > that don't the immediate has to be seperate constructed but this
> > > > > > still ends up being faster as the immediate construction is not
> > > > > > on the critical
> > > path.
> > > > > >
> > > > > > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> > > > > >
> > > > > > Ok for master?
> > > > >
> > > > > I think this should be part of isel instead of match.
> > > > > Maybe we could use genmatch to generate the code that does the
> > > > > transformations but this does not belong as part of match really.
> > > >
> > > > I disagree.. I don't think this belongs in isel. Isel is for
> > > > structural
> > > transformations.
> > > > If there is a case for something else I'd imagine backwardprop is a
> > > > better
> > > choice.
> > > >
> > > > But I don't see why it doesn't belong here considering it *is* a
> > > > mathematical optimization and the file has plenty of transformations
> > > > such as mask optimizations and vector conditional rewriting.
> > >
> > > But the mathematical transform would more generally be fneg (fabs (x))
> > > -> copysign (x, -1.) and that can be optimally expanded at RTL expansion
> > time?
> >
> > Ah sure, atm I did copysign (x, -1) -> x | 1 << signbits. I can do it the other way
> > around. And I guess since copysign (-x, y), copysign(|x|, y) -> copysign (x, y)
> > that should solve the trigonometry problem too.
> >
> > Cool will do that instead, thanks!
>
> Hmm this seems to conflict with the pattern
>
> /* copysign(x, CST) -> [-]abs (x). */
> (for copysigns (COPYSIGN_ALL)
> (simplify
> (copysigns @0 REAL_CST@1)
> (if (REAL_VALUE_NEGATIVE (TREE_REAL_CST (@1)))
> (negate (abs @0))
> (abs @0))))
>
> Which does the opposite transformation.
I suppose the idea is that -abs(x) might be easier to optimize with
other patterns (consider a - copysign(x,...), optimizing to a + abs(x)).
For abs vs copysign it's a canonicalization, but (negate (abs @0))
is less canonical than copysign.
> Should I try removing this?
I'd say yes (and put the reverse canonicalization next to this pattern).
Richard.
> Thanks,
> Tamar
>
> >
> > Tamar
> >
> > >
> > > Richard.
> > >
> > > > Regards,
> > > > Tamar
> > > >
> > > > >
> > > > > Thanks,
> > > > > Andrew
> > > > >
> > > > > >
> > > > > > Thanks,
> > > > > > Tamar
> > > > > >
> > > > > > gcc/ChangeLog:
> > > > > >
> > > > > > PR tree-optimization/109154
> > > > > > * match.pd: Add new neg+abs rule.
> > > > > >
> > > > > > gcc/testsuite/ChangeLog:
> > > > > >
> > > > > > PR tree-optimization/109154
> > > > > > * gcc.target/aarch64/fneg-abs_1.c: New test.
> > > > > > * gcc.target/aarch64/fneg-abs_2.c: New test.
> > > > > > * gcc.target/aarch64/fneg-abs_3.c: New test.
> > > > > > * gcc.target/aarch64/fneg-abs_4.c: New test.
> > > > > > * gcc.target/aarch64/sve/fneg-abs_1.c: New test.
> > > > > > * gcc.target/aarch64/sve/fneg-abs_2.c: New test.
> > > > > > * gcc.target/aarch64/sve/fneg-abs_3.c: New test.
> > > > > > * gcc.target/aarch64/sve/fneg-abs_4.c: New test.
> > > > > >
> > > > > > --- inline copy of patch --
> > > > > > diff --git a/gcc/match.pd b/gcc/match.pd index
> > > > > >
> > > > >
> > >
> > 39c7ea1088f25538ed8bd26ee89711566141a71f..8ebde06dcd4b26d69482
> > > > > 6cffad0f
> > > > > > b17e1136600a 100644
> > > > > > --- a/gcc/match.pd
> > > > > > +++ b/gcc/match.pd
> > > > > > @@ -9476,3 +9476,57 @@ and,
> > > > > > }
> > > > > > (if (full_perm_p)
> > > > > > (vec_perm (op@3 @0 @1) @3 @2))))))
> > > > > > +
> > > > > > +/* Transform fneg (fabs (X)) -> X | 1 << signbit (X). */
> > > > > > +
> > > > > > +(simplify
> > > > > > + (negate (abs @0))
> > > > > > + (if (FLOAT_TYPE_P (type)
> > > > > > + /* We have to delay this rewriting till after forward
> > > > > > +prop because
> > > > > otherwise
> > > > > > + it's harder to do trigonometry optimizations. e.g. cos(-fabs(x)) is
> > not
> > > > > > + matched in one go. Instead cos (-x) is matched first
> > > > > > + followed by
> > > > > cos(|x|).
> > > > > > + The bottom op approach makes this rule match first and
> > > > > > + it's not
> > > untill
> > > > > > + fwdprop that we match top down. There are manu such
> > > > > > + simplications
> > > > > so we
> > > > > > + delay this optimization till later on. */
> > > > > > + && canonicalize_math_after_vectorization_p ()) (with {
> > > > > > + tree itype = unsigned_type_for (type);
> > > > > > + machine_mode mode = TYPE_MODE (type);
> > > > > > + const struct real_format *float_fmt = FLOAT_MODE_FORMAT
> > (mode);
> > > > > > + auto optab = VECTOR_TYPE_P (type) ? optab_vector :
> > optab_default; }
> > > > > > + (if (float_fmt
> > > > > > + && float_fmt->signbit_rw >= 0
> > > > > > + && targetm.can_change_mode_class (TYPE_MODE (itype),
> > > > > > + TYPE_MODE (type), ALL_REGS)
> > > > > > + && target_supports_op_p (itype, BIT_IOR_EXPR, optab))
> > > > > > + (with { wide_int wone = wi::one (element_precision (type));
> > > > > > + int sbit = float_fmt->signbit_rw;
> > > > > > + auto stype = VECTOR_TYPE_P (type) ? TREE_TYPE (itype) : itype;
> > > > > > + tree sign_bit = wide_int_to_tree (stype, wi::lshift (wone, sbit));}
> > > > > > + (view_convert:type
> > > > > > + (bit_ior (view_convert:itype @0)
> > > > > > + { build_uniform_cst (itype, sign_bit); } )))))))
> > > > > > +
> > > > > > +/* Repeat the same but for conditional negate. */
> > > > > > +
> > > > > > +(simplify
> > > > > > + (IFN_COND_NEG @1 (abs @0) @2)
> > > > > > + (if (FLOAT_TYPE_P (type))
> > > > > > + (with {
> > > > > > + tree itype = unsigned_type_for (type);
> > > > > > + machine_mode mode = TYPE_MODE (type);
> > > > > > + const struct real_format *float_fmt = FLOAT_MODE_FORMAT
> > (mode);
> > > > > > + auto optab = VECTOR_TYPE_P (type) ? optab_vector :
> > optab_default; }
> > > > > > + (if (float_fmt
> > > > > > + && float_fmt->signbit_rw >= 0
> > > > > > + && targetm.can_change_mode_class (TYPE_MODE (itype),
> > > > > > + TYPE_MODE (type), ALL_REGS)
> > > > > > + && target_supports_op_p (itype, BIT_IOR_EXPR, optab))
> > > > > > + (with { wide_int wone = wi::one (element_precision (type));
> > > > > > + int sbit = float_fmt->signbit_rw;
> > > > > > + auto stype = VECTOR_TYPE_P (type) ? TREE_TYPE (itype) : itype;
> > > > > > + tree sign_bit = wide_int_to_tree (stype, wi::lshift (wone, sbit));}
> > > > > > + (view_convert:type
> > > > > > + (IFN_COND_IOR @1 (view_convert:itype @0)
> > > > > > + { build_uniform_cst (itype, sign_bit); }
> > > > > > + (view_convert:itype @2) )))))))
> > > > > > \ No newline at end of file
> > > > > > diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_1.c
> > > > > > b/gcc/testsuite/gcc.target/aarch64/fneg-abs_1.c
> > > > > > new file mode 100644
> > > > > > index
> > > > > >
> > > > >
> > >
> > 0000000000000000000000000000000000000000..f823013c3ddf6b3a266
> > > > > c3abfcbf2
> > > > > > 642fc2a75fa6
> > > > > > --- /dev/null
> > > > > > +++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_1.c
> > > > > > @@ -0,0 +1,39 @@
> > > > > > +/* { dg-do compile } */
> > > > > > +/* { dg-options "-O3" } */
> > > > > > +/* { dg-final { check-function-bodies "**" "" "" { target lp64
> > > > > > +} } } */
> > > > > > +
> > > > > > +#pragma GCC target "+nosve"
> > > > > > +
> > > > > > +#include <arm_neon.h>
> > > > > > +
> > > > > > +/*
> > > > > > +** t1:
> > > > > > +** orr v[0-9]+.2s, #128, lsl #24
> > > > > > +** ret
> > > > > > +*/
> > > > > > +float32x2_t t1 (float32x2_t a)
> > > > > > +{
> > > > > > + return vneg_f32 (vabs_f32 (a)); }
> > > > > > +
> > > > > > +/*
> > > > > > +** t2:
> > > > > > +** orr v[0-9]+.4s, #128, lsl #24
> > > > > > +** ret
> > > > > > +*/
> > > > > > +float32x4_t t2 (float32x4_t a)
> > > > > > +{
> > > > > > + return vnegq_f32 (vabsq_f32 (a)); }
> > > > > > +
> > > > > > +/*
> > > > > > +** t3:
> > > > > > +** adrp x0, .LC[0-9]+
> > > > > > +** ldr q[0-9]+, \[x0, #:lo12:.LC0\]
> > > > > > +** orr v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> > > > > > +** ret
> > > > > > +*/
> > > > > > +float64x2_t t3 (float64x2_t a)
> > > > > > +{
> > > > > > + return vnegq_f64 (vabsq_f64 (a)); }
> > > > > > diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
> > > > > > b/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
> > > > > > new file mode 100644
> > > > > > index
> > > > > >
> > > > >
> > >
> > 0000000000000000000000000000000000000000..141121176b309e4b2a
> > > > > a413dc5527
> > > > > > 1a6e3c93d5e1
> > > > > > --- /dev/null
> > > > > > +++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
> > > > > > @@ -0,0 +1,31 @@
> > > > > > +/* { dg-do compile } */
> > > > > > +/* { dg-options "-O3" } */
> > > > > > +/* { dg-final { check-function-bodies "**" "" "" { target lp64
> > > > > > +} } } */
> > > > > > +
> > > > > > +#pragma GCC target "+nosve"
> > > > > > +
> > > > > > +#include <arm_neon.h>
> > > > > > +#include <math.h>
> > > > > > +
> > > > > > +/*
> > > > > > +** f1:
> > > > > > +** movi v[0-9]+.2s, 0x80, lsl 24
> > > > > > +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> > > > > > +** ret
> > > > > > +*/
> > > > > > +float32_t f1 (float32_t a)
> > > > > > +{
> > > > > > + return -fabsf (a);
> > > > > > +}
> > > > > > +
> > > > > > +/*
> > > > > > +** f2:
> > > > > > +** mov x0, -9223372036854775808
> > > > > > +** fmov d[0-9]+, x0
> > > > > > +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> > > > > > +** ret
> > > > > > +*/
> > > > > > +float64_t f2 (float64_t a)
> > > > > > +{
> > > > > > + return -fabs (a);
> > > > > > +}
> > > > > > diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_3.c
> > > > > > b/gcc/testsuite/gcc.target/aarch64/fneg-abs_3.c
> > > > > > new file mode 100644
> > > > > > index
> > > > > >
> > > > >
> > >
> > 0000000000000000000000000000000000000000..b4652173a95d104ddf
> > > > > a70c497f06
> > > > > > 27a61ea89d3b
> > > > > > --- /dev/null
> > > > > > +++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_3.c
> > > > > > @@ -0,0 +1,36 @@
> > > > > > +/* { dg-do compile } */
> > > > > > +/* { dg-options "-O3" } */
> > > > > > +/* { dg-final { check-function-bodies "**" "" "" { target lp64
> > > > > > +} } } */
> > > > > > +
> > > > > > +#pragma GCC target "+nosve"
> > > > > > +
> > > > > > +#include <arm_neon.h>
> > > > > > +#include <math.h>
> > > > > > +
> > > > > > +/*
> > > > > > +** f1:
> > > > > > +** ...
> > > > > > +** ldr q[0-9]+, \[x0\]
> > > > > > +** orr v[0-9]+.4s, #128, lsl #24
> > > > > > +** str q[0-9]+, \[x0\], 16
> > > > > > +** ...
> > > > > > +*/
> > > > > > +void f1 (float32_t *a, int n)
> > > > > > +{
> > > > > > + for (int i = 0; i < (n & -8); i++)
> > > > > > + a[i] = -fabsf (a[i]);
> > > > > > +}
> > > > > > +
> > > > > > +/*
> > > > > > +** f2:
> > > > > > +** ...
> > > > > > +** ldr q[0-9]+, \[x0\]
> > > > > > +** orr v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> > > > > > +** str q[0-9]+, \[x0\], 16
> > > > > > +** ...
> > > > > > +*/
> > > > > > +void f2 (float64_t *a, int n)
> > > > > > +{
> > > > > > + for (int i = 0; i < (n & -8); i++)
> > > > > > + a[i] = -fabs (a[i]);
> > > > > > +}
> > > > > > diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c
> > > > > > b/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c
> > > > > > new file mode 100644
> > > > > > index
> > > > > >
> > > > >
> > >
> > 0000000000000000000000000000000000000000..10879dea74462d34b2
> > > > > 6160eeb0bd
> > > > > > 54ead063166b
> > > > > > --- /dev/null
> > > > > > +++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c
> > > > > > @@ -0,0 +1,39 @@
> > > > > > +/* { dg-do compile } */
> > > > > > +/* { dg-options "-O3" } */
> > > > > > +/* { dg-final { check-function-bodies "**" "" "" { target lp64
> > > > > > +} } } */
> > > > > > +
> > > > > > +#pragma GCC target "+nosve"
> > > > > > +
> > > > > > +#include <string.h>
> > > > > > +
> > > > > > +/*
> > > > > > +** negabs:
> > > > > > +** mov x0, -9223372036854775808
> > > > > > +** fmov d[0-9]+, x0
> > > > > > +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> > > > > > +** ret
> > > > > > +*/
> > > > > > +double negabs (double x)
> > > > > > +{
> > > > > > + unsigned long long y;
> > > > > > + memcpy (&y, &x, sizeof(double));
> > > > > > + y = y | (1UL << 63);
> > > > > > + memcpy (&x, &y, sizeof(double));
> > > > > > + return x;
> > > > > > +}
> > > > > > +
> > > > > > +/*
> > > > > > +** negabsf:
> > > > > > +** movi v[0-9]+.2s, 0x80, lsl 24
> > > > > > +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> > > > > > +** ret
> > > > > > +*/
> > > > > > +float negabsf (float x)
> > > > > > +{
> > > > > > + unsigned int y;
> > > > > > + memcpy (&y, &x, sizeof(float));
> > > > > > + y = y | (1U << 31);
> > > > > > + memcpy (&x, &y, sizeof(float));
> > > > > > + return x;
> > > > > > +}
> > > > > > +
> > > > > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c
> > > > > > b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c
> > > > > > new file mode 100644
> > > > > > index
> > > > > >
> > > > >
> > >
> > 0000000000000000000000000000000000000000..0c7664e6de77a49768
> > > > > 2952653ffd
> > > > > > 417453854d52
> > > > > > --- /dev/null
> > > > > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c
> > > > > > @@ -0,0 +1,37 @@
> > > > > > +/* { dg-do compile } */
> > > > > > +/* { dg-options "-O3" } */
> > > > > > +/* { dg-final { check-function-bodies "**" "" "" { target lp64
> > > > > > +} } } */
> > > > > > +
> > > > > > +#include <arm_neon.h>
> > > > > > +
> > > > > > +/*
> > > > > > +** t1:
> > > > > > +** orr v[0-9]+.2s, #128, lsl #24
> > > > > > +** ret
> > > > > > +*/
> > > > > > +float32x2_t t1 (float32x2_t a)
> > > > > > +{
> > > > > > + return vneg_f32 (vabs_f32 (a)); }
> > > > > > +
> > > > > > +/*
> > > > > > +** t2:
> > > > > > +** orr v[0-9]+.4s, #128, lsl #24
> > > > > > +** ret
> > > > > > +*/
> > > > > > +float32x4_t t2 (float32x4_t a)
> > > > > > +{
> > > > > > + return vnegq_f32 (vabsq_f32 (a)); }
> > > > > > +
> > > > > > +/*
> > > > > > +** t3:
> > > > > > +** adrp x0, .LC[0-9]+
> > > > > > +** ldr q[0-9]+, \[x0, #:lo12:.LC0\]
> > > > > > +** orr v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> > > > > > +** ret
> > > > > > +*/
> > > > > > +float64x2_t t3 (float64x2_t a)
> > > > > > +{
> > > > > > + return vnegq_f64 (vabsq_f64 (a)); }
> > > > > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
> > > > > > b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
> > > > > > new file mode 100644
> > > > > > index
> > > > > >
> > > > >
> > >
> > 0000000000000000000000000000000000000000..a60cd31b9294af2dac6
> > > > > 9eed1c93f
> > > > > > 899bd5c78fca
> > > > > > --- /dev/null
> > > > > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
> > > > > > @@ -0,0 +1,29 @@
> > > > > > +/* { dg-do compile } */
> > > > > > +/* { dg-options "-O3" } */
> > > > > > +/* { dg-final { check-function-bodies "**" "" "" { target lp64
> > > > > > +} } } */
> > > > > > +
> > > > > > +#include <arm_neon.h>
> > > > > > +#include <math.h>
> > > > > > +
> > > > > > +/*
> > > > > > +** f1:
> > > > > > +** movi v[0-9]+.2s, 0x80, lsl 24
> > > > > > +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> > > > > > +** ret
> > > > > > +*/
> > > > > > +float32_t f1 (float32_t a)
> > > > > > +{
> > > > > > + return -fabsf (a);
> > > > > > +}
> > > > > > +
> > > > > > +/*
> > > > > > +** f2:
> > > > > > +** mov x0, -9223372036854775808
> > > > > > +** fmov d[0-9]+, x0
> > > > > > +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> > > > > > +** ret
> > > > > > +*/
> > > > > > +float64_t f2 (float64_t a)
> > > > > > +{
> > > > > > + return -fabs (a);
> > > > > > +}
> > > > > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_3.c
> > > > > > b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_3.c
> > > > > > new file mode 100644
> > > > > > index
> > > > > >
> > > > >
> > >
> > 0000000000000000000000000000000000000000..1bf34328d8841de8e6
> > > > > b0a5458562
> > > > > > a9f00e31c275
> > > > > > --- /dev/null
> > > > > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_3.c
> > > > > > @@ -0,0 +1,34 @@
> > > > > > +/* { dg-do compile } */
> > > > > > +/* { dg-options "-O3" } */
> > > > > > +/* { dg-final { check-function-bodies "**" "" "" { target lp64
> > > > > > +} } } */
> > > > > > +
> > > > > > +#include <arm_neon.h>
> > > > > > +#include <math.h>
> > > > > > +
> > > > > > +/*
> > > > > > +** f1:
> > > > > > +** ...
> > > > > > +** ld1w z[0-9]+.s, p[0-9]+/z, \[x0, x2, lsl 2\]
> > > > > > +** orr z[0-9]+.s, z[0-9]+.s, #0x80000000
> > > > > > +** st1w z[0-9]+.s, p[0-9]+, \[x0, x2, lsl 2\]
> > > > > > +** ...
> > > > > > +*/
> > > > > > +void f1 (float32_t *a, int n)
> > > > > > +{
> > > > > > + for (int i = 0; i < (n & -8); i++)
> > > > > > + a[i] = -fabsf (a[i]);
> > > > > > +}
> > > > > > +
> > > > > > +/*
> > > > > > +** f2:
> > > > > > +** ...
> > > > > > +** ld1d z[0-9]+.d, p[0-9]+/z, \[x0, x2, lsl 3\]
> > > > > > +** orr z[0-9]+.d, z[0-9]+.d, #0x8000000000000000
> > > > > > +** st1d z[0-9]+.d, p[0-9]+, \[x0, x2, lsl 3\]
> > > > > > +** ...
> > > > > > +*/
> > > > > > +void f2 (float64_t *a, int n)
> > > > > > +{
> > > > > > + for (int i = 0; i < (n & -8); i++)
> > > > > > + a[i] = -fabs (a[i]);
> > > > > > +}
> > > > > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_4.c
> > > > > > b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_4.c
> > > > > > new file mode 100644
> > > > > > index
> > > > > >
> > > > >
> > >
> > 0000000000000000000000000000000000000000..21f2a8da2a5d44e3d0
> > > > > 1f6604ca7b
> > > > > > e87e3744d494
> > > > > > --- /dev/null
> > > > > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_4.c
> > > > > > @@ -0,0 +1,37 @@
> > > > > > +/* { dg-do compile } */
> > > > > > +/* { dg-options "-O3" } */
> > > > > > +/* { dg-final { check-function-bodies "**" "" "" { target lp64
> > > > > > +} } } */
> > > > > > +
> > > > > > +#include <string.h>
> > > > > > +
> > > > > > +/*
> > > > > > +** negabs:
> > > > > > +** mov x0, -9223372036854775808
> > > > > > +** fmov d[0-9]+, x0
> > > > > > +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> > > > > > +** ret
> > > > > > +*/
> > > > > > +double negabs (double x)
> > > > > > +{
> > > > > > + unsigned long long y;
> > > > > > + memcpy (&y, &x, sizeof(double));
> > > > > > + y = y | (1UL << 63);
> > > > > > + memcpy (&x, &y, sizeof(double));
> > > > > > + return x;
> > > > > > +}
> > > > > > +
> > > > > > +/*
> > > > > > +** negabsf:
> > > > > > +** movi v[0-9]+.2s, 0x80, lsl 24
> > > > > > +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> > > > > > +** ret
> > > > > > +*/
> > > > > > +float negabsf (float x)
> > > > > > +{
> > > > > > + unsigned int y;
> > > > > > + memcpy (&y, &x, sizeof(float));
> > > > > > + y = y | (1U << 31);
> > > > > > + memcpy (&x, &y, sizeof(float));
> > > > > > + return x;
> > > > > > +}
> > > > > > +
> > > > > >
> > > > > >
> > > > > >
> > > > > >
> > > > > > --
> > > >
> > >
> > > --
> > > Richard Biener <rguenther@suse.de>
> > > SUSE Software Solutions Germany GmbH,
> > > Frankenstrasse 146, 90461 Nuernberg, Germany;
> > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> > > Nuernberg)
>
--
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH]middle-end match.pd: optimize fneg (fabs (x)) to x | (1 << signbit(x)) [PR109154]
2023-09-27 0:50 [PATCH]middle-end match.pd: optimize fneg (fabs (x)) to x | (1 << signbit(x)) [PR109154] Tamar Christina
2023-09-27 1:17 ` Andrew Pinski
@ 2023-09-29 15:00 ` Jeff Law
2023-10-05 18:09 ` Tamar Christina
1 sibling, 1 reply; 17+ messages in thread
From: Jeff Law @ 2023-09-29 15:00 UTC (permalink / raw)
To: Tamar Christina, gcc-patches; +Cc: nd, rguenther, jlaw
On 9/26/23 18:50, Tamar Christina wrote:
> Hi All,
>
> For targets that allow conversion between int and float modes this adds a new
> optimization transforming fneg (fabs (x)) into x | (1 << signbit(x)). Such
> sequences are common in scientific code working with gradients.
>
> The transformed instruction if the target has an inclusive-OR that takes an
> immediate is both shorter an faster. For those that don't the immediate has
> to be seperate constructed but this still ends up being faster as the immediate
> construction is not on the critical path.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> PR tree-optimization/109154
> * match.pd: Add new neg+abs rule.
>
> gcc/testsuite/ChangeLog:
>
> PR tree-optimization/109154
> * gcc.target/aarch64/fneg-abs_1.c: New test.
> * gcc.target/aarch64/fneg-abs_2.c: New test.
> * gcc.target/aarch64/fneg-abs_3.c: New test.
> * gcc.target/aarch64/fneg-abs_4.c: New test.
> * gcc.target/aarch64/sve/fneg-abs_1.c: New test.
> * gcc.target/aarch64/sve/fneg-abs_2.c: New test.
> * gcc.target/aarch64/sve/fneg-abs_3.c: New test.
> * gcc.target/aarch64/sve/fneg-abs_4.c: New test.
>
> --- inline copy of patch --
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 39c7ea1088f25538ed8bd26ee89711566141a71f..8ebde06dcd4b26d694826cffad0fb17e1136600a 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -9476,3 +9476,57 @@ and,
> }
> (if (full_perm_p)
> (vec_perm (op@3 @0 @1) @3 @2))))))
> +
> +/* Transform fneg (fabs (X)) -> X | 1 << signbit (X). */
> +
> +(simplify
> + (negate (abs @0))
> + (if (FLOAT_TYPE_P (type)
> + /* We have to delay this rewriting till after forward prop because otherwise
> + it's harder to do trigonometry optimizations. e.g. cos(-fabs(x)) is not
> + matched in one go. Instead cos (-x) is matched first followed by cos(|x|).
> + The bottom op approach makes this rule match first and it's not untill
> + fwdprop that we match top down. There are manu such simplications so we
Multiple typos this line. fwdprop->fwprop manu->many
simplications->simplifications.
OK with the typos fixed.
Thanks. I meant to say hi at the Cauldron, but never seemed to get away
long enough to find you..
jeff
^ permalink raw reply [flat|nested] 17+ messages in thread
* RE: [PATCH]middle-end match.pd: optimize fneg (fabs (x)) to x | (1 << signbit(x)) [PR109154]
2023-09-29 15:00 ` Jeff Law
@ 2023-10-05 18:09 ` Tamar Christina
0 siblings, 0 replies; 17+ messages in thread
From: Tamar Christina @ 2023-10-05 18:09 UTC (permalink / raw)
To: Jeff Law, gcc-patches; +Cc: nd, rguenther, jlaw
> > b17e1136600a 100644
> > --- a/gcc/match.pd
> > +++ b/gcc/match.pd
> > @@ -9476,3 +9476,57 @@ and,
> > }
> > (if (full_perm_p)
> > (vec_perm (op@3 @0 @1) @3 @2))))))
> > +
> > +/* Transform fneg (fabs (X)) -> X | 1 << signbit (X). */
> > +
> > +(simplify
> > + (negate (abs @0))
> > + (if (FLOAT_TYPE_P (type)
> > + /* We have to delay this rewriting till after forward prop because
> otherwise
> > + it's harder to do trigonometry optimizations. e.g. cos(-fabs(x)) is not
> > + matched in one go. Instead cos (-x) is matched first followed by
> cos(|x|).
> > + The bottom op approach makes this rule match first and it's not untill
> > + fwdprop that we match top down. There are manu such
> simplications
> > +so we
> Multiple typos this line. fwdprop->fwprop manu->many
> simplications->simplifications.
>
> OK with the typos fixed.
Ah I think you missed the previous emails from Richi, who wanted this canonicalized to
copysign instead. I've just finished doing so and will send the updated patch 😊
>
> Thanks. I meant to say hi at the Cauldron, but never seemed to get away long
> enough to find you..
Hehehe, indeed — I think I only saw you once and then *poof*, like a ninja you were gone!
Next time 😊
Cheers,
Tamar
>
> jeff
^ permalink raw reply [flat|nested] 17+ messages in thread
* RE: [PATCH]middle-end match.pd: optimize fneg (fabs (x)) to x | (1 << signbit(x)) [PR109154]
2023-09-27 9:39 ` Richard Biener
@ 2023-10-05 18:11 ` Tamar Christina
2023-10-06 6:24 ` Richard Biener
0 siblings, 1 reply; 17+ messages in thread
From: Tamar Christina @ 2023-10-05 18:11 UTC (permalink / raw)
To: Richard Biener; +Cc: Andrew Pinski, gcc-patches, nd, jlaw
[-- Attachment #1: Type: text/plain, Size: 16449 bytes --]
> I suppose the idea is that -abs(x) might be easier to optimize with other
> patterns (consider a - copysign(x,...), optimizing to a + abs(x)).
>
> For abs vs copysign it's a canonicalization, but (negate (abs @0)) is less
> canonical than copysign.
>
> > Should I try removing this?
>
> I'd say yes (and put the reverse canonicalization next to this pattern).
>
This patch transforms fneg (fabs (x)) into copysign (x, -1) which is more
canonical and allows a target to expand this sequence efficiently. Such
sequences are common in scientific code working with gradients.
Various optimizations in match.pd only happened on COPYSIGN but not COPYSIGN_ALL
which means they exclude IFN_COPYSIGN. COPYSIGN however is restricted to only
the C99 builtins and so doesn't work for vectors.
The patch expands these optimizations to work on COPYSIGN_ALL.
There is an existing canonicalization of copysign (x, -1) to fneg (fabs (x))
which I remove since this is a less efficient form. The testsuite is also
updated in light of this.
Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
Ok for master?
Thanks,
Tamar
gcc/ChangeLog:
PR tree-optimization/109154
* match.pd: Add new neg+abs rule, remove inverse copysign rule and
expand existing copysign optimizations.
gcc/testsuite/ChangeLog:
PR tree-optimization/109154
* gcc.dg/fold-copysign-1.c: Updated.
* gcc.dg/pr55152-2.c: Updated.
* gcc.dg/tree-ssa/abs-4.c: Updated.
* gcc.dg/tree-ssa/backprop-6.c: Updated.
* gcc.dg/tree-ssa/copy-sign-2.c: Updated.
* gcc.dg/tree-ssa/mult-abs-2.c: Updated.
* gcc.target/aarch64/fneg-abs_1.c: New test.
* gcc.target/aarch64/fneg-abs_2.c: New test.
* gcc.target/aarch64/fneg-abs_3.c: New test.
* gcc.target/aarch64/fneg-abs_4.c: New test.
* gcc.target/aarch64/sve/fneg-abs_1.c: New test.
* gcc.target/aarch64/sve/fneg-abs_2.c: New test.
* gcc.target/aarch64/sve/fneg-abs_3.c: New test.
* gcc.target/aarch64/sve/fneg-abs_4.c: New test.
--- inline copy of patch ---
diff --git a/gcc/match.pd b/gcc/match.pd
index 4bdd83e6e061b16dbdb2845b9398fcfb8a6c9739..bd6599d36021e119f51a4928354f580ffe82c6e2 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -1074,45 +1074,43 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
/* cos(copysign(x, y)) -> cos(x). Similarly for cosh. */
(for coss (COS COSH)
- copysigns (COPYSIGN)
- (simplify
- (coss (copysigns @0 @1))
- (coss @0)))
+ (for copysigns (COPYSIGN_ALL)
+ (simplify
+ (coss (copysigns @0 @1))
+ (coss @0))))
/* pow(copysign(x, y), z) -> pow(x, z) if z is an even integer. */
(for pows (POW)
- copysigns (COPYSIGN)
- (simplify
- (pows (copysigns @0 @2) REAL_CST@1)
- (with { HOST_WIDE_INT n; }
- (if (real_isinteger (&TREE_REAL_CST (@1), &n) && (n & 1) == 0)
- (pows @0 @1)))))
+ (for copysigns (COPYSIGN_ALL)
+ (simplify
+ (pows (copysigns @0 @2) REAL_CST@1)
+ (with { HOST_WIDE_INT n; }
+ (if (real_isinteger (&TREE_REAL_CST (@1), &n) && (n & 1) == 0)
+ (pows @0 @1))))))
/* Likewise for powi. */
(for pows (POWI)
- copysigns (COPYSIGN)
- (simplify
- (pows (copysigns @0 @2) INTEGER_CST@1)
- (if ((wi::to_wide (@1) & 1) == 0)
- (pows @0 @1))))
+ (for copysigns (COPYSIGN_ALL)
+ (simplify
+ (pows (copysigns @0 @2) INTEGER_CST@1)
+ (if ((wi::to_wide (@1) & 1) == 0)
+ (pows @0 @1)))))
(for hypots (HYPOT)
- copysigns (COPYSIGN)
- /* hypot(copysign(x, y), z) -> hypot(x, z). */
- (simplify
- (hypots (copysigns @0 @1) @2)
- (hypots @0 @2))
- /* hypot(x, copysign(y, z)) -> hypot(x, y). */
- (simplify
- (hypots @0 (copysigns @1 @2))
- (hypots @0 @1)))
+ (for copysigns (COPYSIGN)
+ /* hypot(copysign(x, y), z) -> hypot(x, z). */
+ (simplify
+ (hypots (copysigns @0 @1) @2)
+ (hypots @0 @2))
+ /* hypot(x, copysign(y, z)) -> hypot(x, y). */
+ (simplify
+ (hypots @0 (copysigns @1 @2))
+ (hypots @0 @1))))
-/* copysign(x, CST) -> [-]abs (x). */
-(for copysigns (COPYSIGN_ALL)
- (simplify
- (copysigns @0 REAL_CST@1)
- (if (REAL_VALUE_NEGATIVE (TREE_REAL_CST (@1)))
- (negate (abs @0))
- (abs @0))))
+/* Transform fneg (fabs (X)) -> copysign (X, -1). */
+
+(simplify
+ (negate (abs @0))
+ (IFN_COPYSIGN @0 { build_minus_one_cst (type); }))
/* copysign(copysign(x, y), z) -> copysign(x, z). */
(for copysigns (COPYSIGN_ALL)
diff --git a/gcc/testsuite/gcc.dg/fold-copysign-1.c b/gcc/testsuite/gcc.dg/fold-copysign-1.c
index f17d65c24ee4dca9867827d040fe0a404c515e7b..f9cafd14ab05f5e8ab2f6f68e62801d21c2df6a6 100644
--- a/gcc/testsuite/gcc.dg/fold-copysign-1.c
+++ b/gcc/testsuite/gcc.dg/fold-copysign-1.c
@@ -12,5 +12,5 @@ double bar (double x)
return __builtin_copysign (x, minuszero);
}
-/* { dg-final { scan-tree-dump-times "= -" 1 "cddce1" } } */
-/* { dg-final { scan-tree-dump-times "= ABS_EXPR" 2 "cddce1" } } */
+/* { dg-final { scan-tree-dump-times "__builtin_copysign" 1 "cddce1" } } */
+/* { dg-final { scan-tree-dump-times "= ABS_EXPR" 1 "cddce1" } } */
diff --git a/gcc/testsuite/gcc.dg/pr55152-2.c b/gcc/testsuite/gcc.dg/pr55152-2.c
index 54db0f2062da105a829d6690ac8ed9891fe2b588..605f202ed6bc7aa8fe921457b02ff0b88cc63ce6 100644
--- a/gcc/testsuite/gcc.dg/pr55152-2.c
+++ b/gcc/testsuite/gcc.dg/pr55152-2.c
@@ -10,4 +10,5 @@ int f(int a)
return (a<-a)?a:-a;
}
-/* { dg-final { scan-tree-dump-times "ABS_EXPR" 2 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "\.COPYSIGN" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "ABS_EXPR" 1 "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/abs-4.c b/gcc/testsuite/gcc.dg/tree-ssa/abs-4.c
index 6197519faf7b55aed7bc162cd0a14dd2145210ca..e1b825f37f69ac3c4666b3a52d733368805ad31d 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/abs-4.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/abs-4.c
@@ -9,5 +9,6 @@ long double abs_ld(long double x) { return __builtin_signbit(x) ? x : -x; }
/* __builtin_signbit(x) ? x : -x. Should be convert into - ABS_EXP<x> */
/* { dg-final { scan-tree-dump-not "signbit" "optimized"} } */
-/* { dg-final { scan-tree-dump-times "= ABS_EXPR" 3 "optimized"} } */
-/* { dg-final { scan-tree-dump-times "= -" 3 "optimized"} } */
+/* { dg-final { scan-tree-dump-times "= ABS_EXPR" 1 "optimized"} } */
+/* { dg-final { scan-tree-dump-times "= -" 1 "optimized"} } */
+/* { dg-final { scan-tree-dump-times "= \.COPYSIGN" 2 "optimized"} } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/backprop-6.c b/gcc/testsuite/gcc.dg/tree-ssa/backprop-6.c
index 31f05716f1498dc709cac95fa20fb5796642c77e..c3a138642d6ff7be984e91fa1343cb2718db7ae1 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/backprop-6.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/backprop-6.c
@@ -26,5 +26,6 @@ TEST_FUNCTION (float, f)
TEST_FUNCTION (double, )
TEST_FUNCTION (long double, l)
-/* { dg-final { scan-tree-dump-times {Deleting[^\n]* = -} 6 "backprop" } } */
-/* { dg-final { scan-tree-dump-times {Deleting[^\n]* = ABS_EXPR <} 3 "backprop" } } */
+/* { dg-final { scan-tree-dump-times {Deleting[^\n]* = -} 4 "backprop" } } */
+/* { dg-final { scan-tree-dump-times {Deleting[^\n]* = \.COPYSIGN} 2 "backprop" } } */
+/* { dg-final { scan-tree-dump-times {Deleting[^\n]* = ABS_EXPR <} 1 "backprop" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/copy-sign-2.c b/gcc/testsuite/gcc.dg/tree-ssa/copy-sign-2.c
index de52c5f7c8062958353d91f5031193defc9f3f91..e5d565c4b9832c00106588ef411fbd8c292a5cad 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/copy-sign-2.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/copy-sign-2.c
@@ -10,4 +10,5 @@ float f1(float x)
float t = __builtin_copysignf (1.0f, -x);
return x * t;
}
-/* { dg-final { scan-tree-dump-times "ABS" 2 "optimized"} } */
+/* { dg-final { scan-tree-dump-times "ABS" 1 "optimized"} } */
+/* { dg-final { scan-tree-dump-times ".COPYSIGN" 1 "optimized"} } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/mult-abs-2.c b/gcc/testsuite/gcc.dg/tree-ssa/mult-abs-2.c
index a41f1baf25669a4fd301a586a49ba5e3c5b966ab..a22896b21c8b5a4d5d8e28bd8ae0db896e63ade0 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/mult-abs-2.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/mult-abs-2.c
@@ -34,4 +34,5 @@ float i1(float x)
{
return x * (x <= 0.f ? 1.f : -1.f);
}
-/* { dg-final { scan-tree-dump-times "ABS" 8 "gimple"} } */
+/* { dg-final { scan-tree-dump-times "ABS" 4 "gimple"} } */
+/* { dg-final { scan-tree-dump-times "\.COPYSIGN" 4 "gimple"} } */
diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_1.c b/gcc/testsuite/gcc.target/aarch64/fneg-abs_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..f823013c3ddf6b3a266c3abfcbf2642fc2a75fa6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_1.c
@@ -0,0 +1,39 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#pragma GCC target "+nosve"
+
+#include <arm_neon.h>
+
+/*
+** t1:
+** orr v[0-9]+.2s, #128, lsl #24
+** ret
+*/
+float32x2_t t1 (float32x2_t a)
+{
+ return vneg_f32 (vabs_f32 (a));
+}
+
+/*
+** t2:
+** orr v[0-9]+.4s, #128, lsl #24
+** ret
+*/
+float32x4_t t2 (float32x4_t a)
+{
+ return vnegq_f32 (vabsq_f32 (a));
+}
+
+/*
+** t3:
+** adrp x0, .LC[0-9]+
+** ldr q[0-9]+, \[x0, #:lo12:.LC0\]
+** orr v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+** ret
+*/
+float64x2_t t3 (float64x2_t a)
+{
+ return vnegq_f64 (vabsq_f64 (a));
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c b/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..141121176b309e4b2aa413dc55271a6e3c93d5e1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#pragma GCC target "+nosve"
+
+#include <arm_neon.h>
+#include <math.h>
+
+/*
+** f1:
+** movi v[0-9]+.2s, 0x80, lsl 24
+** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
+** ret
+*/
+float32_t f1 (float32_t a)
+{
+ return -fabsf (a);
+}
+
+/*
+** f2:
+** mov x0, -9223372036854775808
+** fmov d[0-9]+, x0
+** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
+** ret
+*/
+float64_t f2 (float64_t a)
+{
+ return -fabs (a);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_3.c b/gcc/testsuite/gcc.target/aarch64/fneg-abs_3.c
new file mode 100644
index 0000000000000000000000000000000000000000..b4652173a95d104ddfa70c497f0627a61ea89d3b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_3.c
@@ -0,0 +1,36 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#pragma GCC target "+nosve"
+
+#include <arm_neon.h>
+#include <math.h>
+
+/*
+** f1:
+** ...
+** ldr q[0-9]+, \[x0\]
+** orr v[0-9]+.4s, #128, lsl #24
+** str q[0-9]+, \[x0\], 16
+** ...
+*/
+void f1 (float32_t *a, int n)
+{
+ for (int i = 0; i < (n & -8); i++)
+ a[i] = -fabsf (a[i]);
+}
+
+/*
+** f2:
+** ...
+** ldr q[0-9]+, \[x0\]
+** orr v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+** str q[0-9]+, \[x0\], 16
+** ...
+*/
+void f2 (float64_t *a, int n)
+{
+ for (int i = 0; i < (n & -8); i++)
+ a[i] = -fabs (a[i]);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c b/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c
new file mode 100644
index 0000000000000000000000000000000000000000..10879dea74462d34b26160eeb0bd54ead063166b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c
@@ -0,0 +1,39 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#pragma GCC target "+nosve"
+
+#include <string.h>
+
+/*
+** negabs:
+** mov x0, -9223372036854775808
+** fmov d[0-9]+, x0
+** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
+** ret
+*/
+double negabs (double x)
+{
+ unsigned long long y;
+ memcpy (&y, &x, sizeof(double));
+ y = y | (1UL << 63);
+ memcpy (&x, &y, sizeof(double));
+ return x;
+}
+
+/*
+** negabsf:
+** movi v[0-9]+.2s, 0x80, lsl 24
+** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
+** ret
+*/
+float negabsf (float x)
+{
+ unsigned int y;
+ memcpy (&y, &x, sizeof(float));
+ y = y | (1U << 31);
+ memcpy (&x, &y, sizeof(float));
+ return x;
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..0c7664e6de77a497682952653ffd417453854d52
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#include <arm_neon.h>
+
+/*
+** t1:
+** orr v[0-9]+.2s, #128, lsl #24
+** ret
+*/
+float32x2_t t1 (float32x2_t a)
+{
+ return vneg_f32 (vabs_f32 (a));
+}
+
+/*
+** t2:
+** orr v[0-9]+.4s, #128, lsl #24
+** ret
+*/
+float32x4_t t2 (float32x4_t a)
+{
+ return vnegq_f32 (vabsq_f32 (a));
+}
+
+/*
+** t3:
+** adrp x0, .LC[0-9]+
+** ldr q[0-9]+, \[x0, #:lo12:.LC0\]
+** orr v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+** ret
+*/
+float64x2_t t3 (float64x2_t a)
+{
+ return vnegq_f64 (vabsq_f64 (a));
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..a60cd31b9294af2dac69eed1c93f899bd5c78fca
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#include <arm_neon.h>
+#include <math.h>
+
+/*
+** f1:
+** movi v[0-9]+.2s, 0x80, lsl 24
+** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
+** ret
+*/
+float32_t f1 (float32_t a)
+{
+ return -fabsf (a);
+}
+
+/*
+** f2:
+** mov x0, -9223372036854775808
+** fmov d[0-9]+, x0
+** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
+** ret
+*/
+float64_t f2 (float64_t a)
+{
+ return -fabs (a);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_3.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_3.c
new file mode 100644
index 0000000000000000000000000000000000000000..1bf34328d8841de8e6b0a5458562a9f00e31c275
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_3.c
@@ -0,0 +1,34 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#include <arm_neon.h>
+#include <math.h>
+
+/*
+** f1:
+** ...
+** ld1w z[0-9]+.s, p[0-9]+/z, \[x0, x2, lsl 2\]
+** orr z[0-9]+.s, z[0-9]+.s, #0x80000000
+** st1w z[0-9]+.s, p[0-9]+, \[x0, x2, lsl 2\]
+** ...
+*/
+void f1 (float32_t *a, int n)
+{
+ for (int i = 0; i < (n & -8); i++)
+ a[i] = -fabsf (a[i]);
+}
+
+/*
+** f2:
+** ...
+** ld1d z[0-9]+.d, p[0-9]+/z, \[x0, x2, lsl 3\]
+** orr z[0-9]+.d, z[0-9]+.d, #0x8000000000000000
+** st1d z[0-9]+.d, p[0-9]+, \[x0, x2, lsl 3\]
+** ...
+*/
+void f2 (float64_t *a, int n)
+{
+ for (int i = 0; i < (n & -8); i++)
+ a[i] = -fabs (a[i]);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_4.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_4.c
new file mode 100644
index 0000000000000000000000000000000000000000..21f2a8da2a5d44e3d01f6604ca7be87e3744d494
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_4.c
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#include <string.h>
+
+/*
+** negabs:
+** mov x0, -9223372036854775808
+** fmov d[0-9]+, x0
+** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
+** ret
+*/
+double negabs (double x)
+{
+ unsigned long long y;
+ memcpy (&y, &x, sizeof(double));
+ y = y | (1UL << 63);
+ memcpy (&x, &y, sizeof(double));
+ return x;
+}
+
+/*
+** negabsf:
+** movi v[0-9]+.2s, 0x80, lsl 24
+** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
+** ret
+*/
+float negabsf (float x)
+{
+ unsigned int y;
+ memcpy (&y, &x, sizeof(float));
+ y = y | (1U << 31);
+ memcpy (&x, &y, sizeof(float));
+ return x;
+}
+
[-- Attachment #2: rb17718.patch --]
[-- Type: application/octet-stream, Size: 13921 bytes --]
diff --git a/gcc/match.pd b/gcc/match.pd
index 4bdd83e6e061b16dbdb2845b9398fcfb8a6c9739..bd6599d36021e119f51a4928354f580ffe82c6e2 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -1074,45 +1074,43 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
/* cos(copysign(x, y)) -> cos(x). Similarly for cosh. */
(for coss (COS COSH)
- copysigns (COPYSIGN)
- (simplify
- (coss (copysigns @0 @1))
- (coss @0)))
+ (for copysigns (COPYSIGN_ALL)
+ (simplify
+ (coss (copysigns @0 @1))
+ (coss @0))))
/* pow(copysign(x, y), z) -> pow(x, z) if z is an even integer. */
(for pows (POW)
- copysigns (COPYSIGN)
- (simplify
- (pows (copysigns @0 @2) REAL_CST@1)
- (with { HOST_WIDE_INT n; }
- (if (real_isinteger (&TREE_REAL_CST (@1), &n) && (n & 1) == 0)
- (pows @0 @1)))))
+ (for copysigns (COPYSIGN_ALL)
+ (simplify
+ (pows (copysigns @0 @2) REAL_CST@1)
+ (with { HOST_WIDE_INT n; }
+ (if (real_isinteger (&TREE_REAL_CST (@1), &n) && (n & 1) == 0)
+ (pows @0 @1))))))
/* Likewise for powi. */
(for pows (POWI)
- copysigns (COPYSIGN)
- (simplify
- (pows (copysigns @0 @2) INTEGER_CST@1)
- (if ((wi::to_wide (@1) & 1) == 0)
- (pows @0 @1))))
+ (for copysigns (COPYSIGN_ALL)
+ (simplify
+ (pows (copysigns @0 @2) INTEGER_CST@1)
+ (if ((wi::to_wide (@1) & 1) == 0)
+ (pows @0 @1)))))
(for hypots (HYPOT)
- copysigns (COPYSIGN)
- /* hypot(copysign(x, y), z) -> hypot(x, z). */
- (simplify
- (hypots (copysigns @0 @1) @2)
- (hypots @0 @2))
- /* hypot(x, copysign(y, z)) -> hypot(x, y). */
- (simplify
- (hypots @0 (copysigns @1 @2))
- (hypots @0 @1)))
+ (for copysigns (COPYSIGN)
+ /* hypot(copysign(x, y), z) -> hypot(x, z). */
+ (simplify
+ (hypots (copysigns @0 @1) @2)
+ (hypots @0 @2))
+ /* hypot(x, copysign(y, z)) -> hypot(x, y). */
+ (simplify
+ (hypots @0 (copysigns @1 @2))
+ (hypots @0 @1))))
-/* copysign(x, CST) -> [-]abs (x). */
-(for copysigns (COPYSIGN_ALL)
- (simplify
- (copysigns @0 REAL_CST@1)
- (if (REAL_VALUE_NEGATIVE (TREE_REAL_CST (@1)))
- (negate (abs @0))
- (abs @0))))
+/* Transform fneg (fabs (X)) -> copysign (X, -1). */
+
+(simplify
+ (negate (abs @0))
+ (IFN_COPYSIGN @0 { build_minus_one_cst (type); }))
/* copysign(copysign(x, y), z) -> copysign(x, z). */
(for copysigns (COPYSIGN_ALL)
diff --git a/gcc/testsuite/gcc.dg/fold-copysign-1.c b/gcc/testsuite/gcc.dg/fold-copysign-1.c
index f17d65c24ee4dca9867827d040fe0a404c515e7b..f9cafd14ab05f5e8ab2f6f68e62801d21c2df6a6 100644
--- a/gcc/testsuite/gcc.dg/fold-copysign-1.c
+++ b/gcc/testsuite/gcc.dg/fold-copysign-1.c
@@ -12,5 +12,5 @@ double bar (double x)
return __builtin_copysign (x, minuszero);
}
-/* { dg-final { scan-tree-dump-times "= -" 1 "cddce1" } } */
-/* { dg-final { scan-tree-dump-times "= ABS_EXPR" 2 "cddce1" } } */
+/* { dg-final { scan-tree-dump-times "__builtin_copysign" 1 "cddce1" } } */
+/* { dg-final { scan-tree-dump-times "= ABS_EXPR" 1 "cddce1" } } */
diff --git a/gcc/testsuite/gcc.dg/pr55152-2.c b/gcc/testsuite/gcc.dg/pr55152-2.c
index 54db0f2062da105a829d6690ac8ed9891fe2b588..605f202ed6bc7aa8fe921457b02ff0b88cc63ce6 100644
--- a/gcc/testsuite/gcc.dg/pr55152-2.c
+++ b/gcc/testsuite/gcc.dg/pr55152-2.c
@@ -10,4 +10,5 @@ int f(int a)
return (a<-a)?a:-a;
}
-/* { dg-final { scan-tree-dump-times "ABS_EXPR" 2 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "\.COPYSIGN" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "ABS_EXPR" 1 "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/abs-4.c b/gcc/testsuite/gcc.dg/tree-ssa/abs-4.c
index 6197519faf7b55aed7bc162cd0a14dd2145210ca..e1b825f37f69ac3c4666b3a52d733368805ad31d 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/abs-4.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/abs-4.c
@@ -9,5 +9,6 @@ long double abs_ld(long double x) { return __builtin_signbit(x) ? x : -x; }
/* __builtin_signbit(x) ? x : -x. Should be convert into - ABS_EXP<x> */
/* { dg-final { scan-tree-dump-not "signbit" "optimized"} } */
-/* { dg-final { scan-tree-dump-times "= ABS_EXPR" 3 "optimized"} } */
-/* { dg-final { scan-tree-dump-times "= -" 3 "optimized"} } */
+/* { dg-final { scan-tree-dump-times "= ABS_EXPR" 1 "optimized"} } */
+/* { dg-final { scan-tree-dump-times "= -" 1 "optimized"} } */
+/* { dg-final { scan-tree-dump-times "= \.COPYSIGN" 2 "optimized"} } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/backprop-6.c b/gcc/testsuite/gcc.dg/tree-ssa/backprop-6.c
index 31f05716f1498dc709cac95fa20fb5796642c77e..c3a138642d6ff7be984e91fa1343cb2718db7ae1 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/backprop-6.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/backprop-6.c
@@ -26,5 +26,6 @@ TEST_FUNCTION (float, f)
TEST_FUNCTION (double, )
TEST_FUNCTION (long double, l)
-/* { dg-final { scan-tree-dump-times {Deleting[^\n]* = -} 6 "backprop" } } */
-/* { dg-final { scan-tree-dump-times {Deleting[^\n]* = ABS_EXPR <} 3 "backprop" } } */
+/* { dg-final { scan-tree-dump-times {Deleting[^\n]* = -} 4 "backprop" } } */
+/* { dg-final { scan-tree-dump-times {Deleting[^\n]* = \.COPYSIGN} 2 "backprop" } } */
+/* { dg-final { scan-tree-dump-times {Deleting[^\n]* = ABS_EXPR <} 1 "backprop" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/copy-sign-2.c b/gcc/testsuite/gcc.dg/tree-ssa/copy-sign-2.c
index de52c5f7c8062958353d91f5031193defc9f3f91..e5d565c4b9832c00106588ef411fbd8c292a5cad 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/copy-sign-2.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/copy-sign-2.c
@@ -10,4 +10,5 @@ float f1(float x)
float t = __builtin_copysignf (1.0f, -x);
return x * t;
}
-/* { dg-final { scan-tree-dump-times "ABS" 2 "optimized"} } */
+/* { dg-final { scan-tree-dump-times "ABS" 1 "optimized"} } */
+/* { dg-final { scan-tree-dump-times ".COPYSIGN" 1 "optimized"} } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/mult-abs-2.c b/gcc/testsuite/gcc.dg/tree-ssa/mult-abs-2.c
index a41f1baf25669a4fd301a586a49ba5e3c5b966ab..a22896b21c8b5a4d5d8e28bd8ae0db896e63ade0 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/mult-abs-2.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/mult-abs-2.c
@@ -34,4 +34,5 @@ float i1(float x)
{
return x * (x <= 0.f ? 1.f : -1.f);
}
-/* { dg-final { scan-tree-dump-times "ABS" 8 "gimple"} } */
+/* { dg-final { scan-tree-dump-times "ABS" 4 "gimple"} } */
+/* { dg-final { scan-tree-dump-times "\.COPYSIGN" 4 "gimple"} } */
diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_1.c b/gcc/testsuite/gcc.target/aarch64/fneg-abs_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..f823013c3ddf6b3a266c3abfcbf2642fc2a75fa6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_1.c
@@ -0,0 +1,39 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#pragma GCC target "+nosve"
+
+#include <arm_neon.h>
+
+/*
+** t1:
+** orr v[0-9]+.2s, #128, lsl #24
+** ret
+*/
+float32x2_t t1 (float32x2_t a)
+{
+ return vneg_f32 (vabs_f32 (a));
+}
+
+/*
+** t2:
+** orr v[0-9]+.4s, #128, lsl #24
+** ret
+*/
+float32x4_t t2 (float32x4_t a)
+{
+ return vnegq_f32 (vabsq_f32 (a));
+}
+
+/*
+** t3:
+** adrp x0, .LC[0-9]+
+** ldr q[0-9]+, \[x0, #:lo12:.LC0\]
+** orr v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+** ret
+*/
+float64x2_t t3 (float64x2_t a)
+{
+ return vnegq_f64 (vabsq_f64 (a));
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c b/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..141121176b309e4b2aa413dc55271a6e3c93d5e1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#pragma GCC target "+nosve"
+
+#include <arm_neon.h>
+#include <math.h>
+
+/*
+** f1:
+** movi v[0-9]+.2s, 0x80, lsl 24
+** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
+** ret
+*/
+float32_t f1 (float32_t a)
+{
+ return -fabsf (a);
+}
+
+/*
+** f2:
+** mov x0, -9223372036854775808
+** fmov d[0-9]+, x0
+** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
+** ret
+*/
+float64_t f2 (float64_t a)
+{
+ return -fabs (a);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_3.c b/gcc/testsuite/gcc.target/aarch64/fneg-abs_3.c
new file mode 100644
index 0000000000000000000000000000000000000000..b4652173a95d104ddfa70c497f0627a61ea89d3b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_3.c
@@ -0,0 +1,36 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#pragma GCC target "+nosve"
+
+#include <arm_neon.h>
+#include <math.h>
+
+/*
+** f1:
+** ...
+** ldr q[0-9]+, \[x0\]
+** orr v[0-9]+.4s, #128, lsl #24
+** str q[0-9]+, \[x0\], 16
+** ...
+*/
+void f1 (float32_t *a, int n)
+{
+ for (int i = 0; i < (n & -8); i++)
+ a[i] = -fabsf (a[i]);
+}
+
+/*
+** f2:
+** ...
+** ldr q[0-9]+, \[x0\]
+** orr v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+** str q[0-9]+, \[x0\], 16
+** ...
+*/
+void f2 (float64_t *a, int n)
+{
+ for (int i = 0; i < (n & -8); i++)
+ a[i] = -fabs (a[i]);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c b/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c
new file mode 100644
index 0000000000000000000000000000000000000000..10879dea74462d34b26160eeb0bd54ead063166b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c
@@ -0,0 +1,39 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#pragma GCC target "+nosve"
+
+#include <string.h>
+
+/*
+** negabs:
+** mov x0, -9223372036854775808
+** fmov d[0-9]+, x0
+** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
+** ret
+*/
+double negabs (double x)
+{
+ unsigned long long y;
+ memcpy (&y, &x, sizeof(double));
+ y = y | (1UL << 63);
+ memcpy (&x, &y, sizeof(double));
+ return x;
+}
+
+/*
+** negabsf:
+** movi v[0-9]+.2s, 0x80, lsl 24
+** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
+** ret
+*/
+float negabsf (float x)
+{
+ unsigned int y;
+ memcpy (&y, &x, sizeof(float));
+ y = y | (1U << 31);
+ memcpy (&x, &y, sizeof(float));
+ return x;
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..0c7664e6de77a497682952653ffd417453854d52
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#include <arm_neon.h>
+
+/*
+** t1:
+** orr v[0-9]+.2s, #128, lsl #24
+** ret
+*/
+float32x2_t t1 (float32x2_t a)
+{
+ return vneg_f32 (vabs_f32 (a));
+}
+
+/*
+** t2:
+** orr v[0-9]+.4s, #128, lsl #24
+** ret
+*/
+float32x4_t t2 (float32x4_t a)
+{
+ return vnegq_f32 (vabsq_f32 (a));
+}
+
+/*
+** t3:
+** adrp x0, .LC[0-9]+
+** ldr q[0-9]+, \[x0, #:lo12:.LC0\]
+** orr v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+** ret
+*/
+float64x2_t t3 (float64x2_t a)
+{
+ return vnegq_f64 (vabsq_f64 (a));
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..a60cd31b9294af2dac69eed1c93f899bd5c78fca
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#include <arm_neon.h>
+#include <math.h>
+
+/*
+** f1:
+** movi v[0-9]+.2s, 0x80, lsl 24
+** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
+** ret
+*/
+float32_t f1 (float32_t a)
+{
+ return -fabsf (a);
+}
+
+/*
+** f2:
+** mov x0, -9223372036854775808
+** fmov d[0-9]+, x0
+** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
+** ret
+*/
+float64_t f2 (float64_t a)
+{
+ return -fabs (a);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_3.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_3.c
new file mode 100644
index 0000000000000000000000000000000000000000..1bf34328d8841de8e6b0a5458562a9f00e31c275
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_3.c
@@ -0,0 +1,34 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#include <arm_neon.h>
+#include <math.h>
+
+/*
+** f1:
+** ...
+** ld1w z[0-9]+.s, p[0-9]+/z, \[x0, x2, lsl 2\]
+** orr z[0-9]+.s, z[0-9]+.s, #0x80000000
+** st1w z[0-9]+.s, p[0-9]+, \[x0, x2, lsl 2\]
+** ...
+*/
+void f1 (float32_t *a, int n)
+{
+ for (int i = 0; i < (n & -8); i++)
+ a[i] = -fabsf (a[i]);
+}
+
+/*
+** f2:
+** ...
+** ld1d z[0-9]+.d, p[0-9]+/z, \[x0, x2, lsl 3\]
+** orr z[0-9]+.d, z[0-9]+.d, #0x8000000000000000
+** st1d z[0-9]+.d, p[0-9]+, \[x0, x2, lsl 3\]
+** ...
+*/
+void f2 (float64_t *a, int n)
+{
+ for (int i = 0; i < (n & -8); i++)
+ a[i] = -fabs (a[i]);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_4.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_4.c
new file mode 100644
index 0000000000000000000000000000000000000000..21f2a8da2a5d44e3d01f6604ca7be87e3744d494
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_4.c
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#include <string.h>
+
+/*
+** negabs:
+** mov x0, -9223372036854775808
+** fmov d[0-9]+, x0
+** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
+** ret
+*/
+double negabs (double x)
+{
+ unsigned long long y;
+ memcpy (&y, &x, sizeof(double));
+ y = y | (1UL << 63);
+ memcpy (&x, &y, sizeof(double));
+ return x;
+}
+
+/*
+** negabsf:
+** movi v[0-9]+.2s, 0x80, lsl 24
+** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
+** ret
+*/
+float negabsf (float x)
+{
+ unsigned int y;
+ memcpy (&y, &x, sizeof(float));
+ y = y | (1U << 31);
+ memcpy (&x, &y, sizeof(float));
+ return x;
+}
+
^ permalink raw reply [flat|nested] 17+ messages in thread
* RE: [PATCH]middle-end match.pd: optimize fneg (fabs (x)) to x | (1 << signbit(x)) [PR109154]
2023-10-05 18:11 ` Tamar Christina
@ 2023-10-06 6:24 ` Richard Biener
2023-10-07 9:22 ` Richard Sandiford
0 siblings, 1 reply; 17+ messages in thread
From: Richard Biener @ 2023-10-06 6:24 UTC (permalink / raw)
To: Tamar Christina; +Cc: Andrew Pinski, gcc-patches, nd, jlaw, richard.sandiford
On Thu, 5 Oct 2023, Tamar Christina wrote:
> > I suppose the idea is that -abs(x) might be easier to optimize with other
> > patterns (consider a - copysign(x,...), optimizing to a + abs(x)).
> >
> > For abs vs copysign it's a canonicalization, but (negate (abs @0)) is less
> > canonical than copysign.
> >
> > > Should I try removing this?
> >
> > I'd say yes (and put the reverse canonicalization next to this pattern).
> >
>
> This patch transforms fneg (fabs (x)) into copysign (x, -1) which is more
> canonical and allows a target to expand this sequence efficiently. Such
> sequences are common in scientific code working with gradients.
>
> various optimizations in match.pd only happened on COPYSIGN but not COPYSIGN_ALL
> which means they exclude IFN_COPYSIGN. COPYSIGN however is restricted to only
That's not true:
(define_operator_list COPYSIGN
BUILT_IN_COPYSIGNF
BUILT_IN_COPYSIGN
BUILT_IN_COPYSIGNL
IFN_COPYSIGN)
but they miss the extended float builtin variants like
__builtin_copysignf16. Also see below
> the C99 builtins and so doesn't work for vectors.
>
> The patch expands these optimizations to work on COPYSIGN_ALL.
>
> There is an existing canonicalization of copysign (x, -1) to fneg (fabs (x))
> which I remove since this is a less efficient form. The testsuite is also
> updated in light of this.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> PR tree-optimization/109154
> * match.pd: Add new neg+abs rule, remove inverse copysign rule and
> expand existing copysign optimizations.
>
> gcc/testsuite/ChangeLog:
>
> PR tree-optimization/109154
> * gcc.dg/fold-copysign-1.c: Updated.
> * gcc.dg/pr55152-2.c: Updated.
> * gcc.dg/tree-ssa/abs-4.c: Updated.
> * gcc.dg/tree-ssa/backprop-6.c: Updated.
> * gcc.dg/tree-ssa/copy-sign-2.c: Updated.
> * gcc.dg/tree-ssa/mult-abs-2.c: Updated.
> * gcc.target/aarch64/fneg-abs_1.c: New test.
> * gcc.target/aarch64/fneg-abs_2.c: New test.
> * gcc.target/aarch64/fneg-abs_3.c: New test.
> * gcc.target/aarch64/fneg-abs_4.c: New test.
> * gcc.target/aarch64/sve/fneg-abs_1.c: New test.
> * gcc.target/aarch64/sve/fneg-abs_2.c: New test.
> * gcc.target/aarch64/sve/fneg-abs_3.c: New test.
> * gcc.target/aarch64/sve/fneg-abs_4.c: New test.
>
> --- inline copy of patch ---
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 4bdd83e6e061b16dbdb2845b9398fcfb8a6c9739..bd6599d36021e119f51a4928354f580ffe82c6e2 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -1074,45 +1074,43 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>
> /* cos(copysign(x, y)) -> cos(x). Similarly for cosh. */
> (for coss (COS COSH)
> - copysigns (COPYSIGN)
> - (simplify
> - (coss (copysigns @0 @1))
> - (coss @0)))
> + (for copysigns (COPYSIGN_ALL)
So this ends up generating for example the match
(cosf (copysignl ...)) which doesn't make much sense.
The lock-step iteration did
(cosf (copysignf ..)) ... (ifn_cos (ifn_copysign ...))
which is leaner but misses the case of
(cosf (ifn_copysign ..)) - that's probably what you are
after with this change.
That said, there isn't a nice solution (without altering the match.pd
IL). There's the explicit solution, spelling out all combinations.
So if we want to go with your pragmatic solution changing this
to use COPYSIGN_ALL isn't necessary, only changing the lock-step
for iteration to a cross product for iteration is.
Changing just this pattern to
(for coss (COS COSH)
(for copysigns (COPYSIGN)
(simplify
(coss (copysigns @0 @1))
(coss @0))))
increases the total number of gimple-match-x.cc lines from
234988 to 235324.
The alternative is to do
(for coss (COS COSH)
copysigns (COPYSIGN)
(simplify
(coss (copysigns @0 @1))
(coss @0))
(simplify
(coss (IFN_COPYSIGN @0 @1))
(coss @0)))
which properly will diagnose a duplicate pattern. There are
currently no operator lists with just builtins defined (that
could be fixed, see gencfn-macros.cc), supposed we'd have
COS_C we could do
(for coss (COS_C COSH_C IFN_COS IFN_COSH)
copysigns (COPYSIGN_C COPYSIGN_C IFN_COPYSIGN IFN_COPYSIGN
IFN_COPYSIGN IFN_COPYSIGN IFN_COPYSIGN IFN_COPYSIGN IFN_COPYSIGN
IFN_COPYSIGN)
(simplify
(coss (copysigns @0 @1))
(coss @0)))
which of course still looks ugly ;) (some syntax extension like
allowing to specify IFN_COPYSIGN*8 would be nice here and easy
enough to do)
Can you split out the part changing COPYSIGN to COPYSIGN_ALL,
re-do it to only split the fors, keeping COPYSIGN and provide
some statistics on the gimple-match-* size? I think this might
be the pragmatic solution for now.
Richard - can you think of a clever way to express the desired
iteration? How do RTL macro iterations address cases like this?
Richard.
> + (simplify
> + (coss (copysigns @0 @1))
> + (coss @0))))
>
> /* pow(copysign(x, y), z) -> pow(x, z) if z is an even integer. */
> (for pows (POW)
> - copysigns (COPYSIGN)
> - (simplify
> - (pows (copysigns @0 @2) REAL_CST@1)
> - (with { HOST_WIDE_INT n; }
> - (if (real_isinteger (&TREE_REAL_CST (@1), &n) && (n & 1) == 0)
> - (pows @0 @1)))))
> + (for copysigns (COPYSIGN_ALL)
> + (simplify
> + (pows (copysigns @0 @2) REAL_CST@1)
> + (with { HOST_WIDE_INT n; }
> + (if (real_isinteger (&TREE_REAL_CST (@1), &n) && (n & 1) == 0)
> + (pows @0 @1))))))
> /* Likewise for powi. */
> (for pows (POWI)
> - copysigns (COPYSIGN)
> - (simplify
> - (pows (copysigns @0 @2) INTEGER_CST@1)
> - (if ((wi::to_wide (@1) & 1) == 0)
> - (pows @0 @1))))
> + (for copysigns (COPYSIGN_ALL)
> + (simplify
> + (pows (copysigns @0 @2) INTEGER_CST@1)
> + (if ((wi::to_wide (@1) & 1) == 0)
> + (pows @0 @1)))))
>
> (for hypots (HYPOT)
> - copysigns (COPYSIGN)
> - /* hypot(copysign(x, y), z) -> hypot(x, z). */
> - (simplify
> - (hypots (copysigns @0 @1) @2)
> - (hypots @0 @2))
> - /* hypot(x, copysign(y, z)) -> hypot(x, y). */
> - (simplify
> - (hypots @0 (copysigns @1 @2))
> - (hypots @0 @1)))
> + (for copysigns (COPYSIGN)
> + /* hypot(copysign(x, y), z) -> hypot(x, z). */
> + (simplify
> + (hypots (copysigns @0 @1) @2)
> + (hypots @0 @2))
> + /* hypot(x, copysign(y, z)) -> hypot(x, y). */
> + (simplify
> + (hypots @0 (copysigns @1 @2))
> + (hypots @0 @1))))
>
> -/* copysign(x, CST) -> [-]abs (x). */
> -(for copysigns (COPYSIGN_ALL)
> - (simplify
> - (copysigns @0 REAL_CST@1)
> - (if (REAL_VALUE_NEGATIVE (TREE_REAL_CST (@1)))
> - (negate (abs @0))
> - (abs @0))))
> +/* Transform fneg (fabs (X)) -> copysign (X, -1). */
> +
> +(simplify
> + (negate (abs @0))
> + (IFN_COPYSIGN @0 { build_minus_one_cst (type); }))
>
> /* copysign(copysign(x, y), z) -> copysign(x, z). */
> (for copysigns (COPYSIGN_ALL)
> diff --git a/gcc/testsuite/gcc.dg/fold-copysign-1.c b/gcc/testsuite/gcc.dg/fold-copysign-1.c
> index f17d65c24ee4dca9867827d040fe0a404c515e7b..f9cafd14ab05f5e8ab2f6f68e62801d21c2df6a6 100644
> --- a/gcc/testsuite/gcc.dg/fold-copysign-1.c
> +++ b/gcc/testsuite/gcc.dg/fold-copysign-1.c
> @@ -12,5 +12,5 @@ double bar (double x)
> return __builtin_copysign (x, minuszero);
> }
>
> -/* { dg-final { scan-tree-dump-times "= -" 1 "cddce1" } } */
> -/* { dg-final { scan-tree-dump-times "= ABS_EXPR" 2 "cddce1" } } */
> +/* { dg-final { scan-tree-dump-times "__builtin_copysign" 1 "cddce1" } } */
> +/* { dg-final { scan-tree-dump-times "= ABS_EXPR" 1 "cddce1" } } */
> diff --git a/gcc/testsuite/gcc.dg/pr55152-2.c b/gcc/testsuite/gcc.dg/pr55152-2.c
> index 54db0f2062da105a829d6690ac8ed9891fe2b588..605f202ed6bc7aa8fe921457b02ff0b88cc63ce6 100644
> --- a/gcc/testsuite/gcc.dg/pr55152-2.c
> +++ b/gcc/testsuite/gcc.dg/pr55152-2.c
> @@ -10,4 +10,5 @@ int f(int a)
> return (a<-a)?a:-a;
> }
>
> -/* { dg-final { scan-tree-dump-times "ABS_EXPR" 2 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times "\.COPYSIGN" 1 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times "ABS_EXPR" 1 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/abs-4.c b/gcc/testsuite/gcc.dg/tree-ssa/abs-4.c
> index 6197519faf7b55aed7bc162cd0a14dd2145210ca..e1b825f37f69ac3c4666b3a52d733368805ad31d 100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/abs-4.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/abs-4.c
> @@ -9,5 +9,6 @@ long double abs_ld(long double x) { return __builtin_signbit(x) ? x : -x; }
>
> /* __builtin_signbit(x) ? x : -x. Should be convert into - ABS_EXP<x> */
> /* { dg-final { scan-tree-dump-not "signbit" "optimized"} } */
> -/* { dg-final { scan-tree-dump-times "= ABS_EXPR" 3 "optimized"} } */
> -/* { dg-final { scan-tree-dump-times "= -" 3 "optimized"} } */
> +/* { dg-final { scan-tree-dump-times "= ABS_EXPR" 1 "optimized"} } */
> +/* { dg-final { scan-tree-dump-times "= -" 1 "optimized"} } */
> +/* { dg-final { scan-tree-dump-times "= \.COPYSIGN" 2 "optimized"} } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/backprop-6.c b/gcc/testsuite/gcc.dg/tree-ssa/backprop-6.c
> index 31f05716f1498dc709cac95fa20fb5796642c77e..c3a138642d6ff7be984e91fa1343cb2718db7ae1 100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/backprop-6.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/backprop-6.c
> @@ -26,5 +26,6 @@ TEST_FUNCTION (float, f)
> TEST_FUNCTION (double, )
> TEST_FUNCTION (long double, l)
>
> -/* { dg-final { scan-tree-dump-times {Deleting[^\n]* = -} 6 "backprop" } } */
> -/* { dg-final { scan-tree-dump-times {Deleting[^\n]* = ABS_EXPR <} 3 "backprop" } } */
> +/* { dg-final { scan-tree-dump-times {Deleting[^\n]* = -} 4 "backprop" } } */
> +/* { dg-final { scan-tree-dump-times {Deleting[^\n]* = \.COPYSIGN} 2 "backprop" } } */
> +/* { dg-final { scan-tree-dump-times {Deleting[^\n]* = ABS_EXPR <} 1 "backprop" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/copy-sign-2.c b/gcc/testsuite/gcc.dg/tree-ssa/copy-sign-2.c
> index de52c5f7c8062958353d91f5031193defc9f3f91..e5d565c4b9832c00106588ef411fbd8c292a5cad 100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/copy-sign-2.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/copy-sign-2.c
> @@ -10,4 +10,5 @@ float f1(float x)
> float t = __builtin_copysignf (1.0f, -x);
> return x * t;
> }
> -/* { dg-final { scan-tree-dump-times "ABS" 2 "optimized"} } */
> +/* { dg-final { scan-tree-dump-times "ABS" 1 "optimized"} } */
> +/* { dg-final { scan-tree-dump-times ".COPYSIGN" 1 "optimized"} } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/mult-abs-2.c b/gcc/testsuite/gcc.dg/tree-ssa/mult-abs-2.c
> index a41f1baf25669a4fd301a586a49ba5e3c5b966ab..a22896b21c8b5a4d5d8e28bd8ae0db896e63ade0 100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/mult-abs-2.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/mult-abs-2.c
> @@ -34,4 +34,5 @@ float i1(float x)
> {
> return x * (x <= 0.f ? 1.f : -1.f);
> }
> -/* { dg-final { scan-tree-dump-times "ABS" 8 "gimple"} } */
> +/* { dg-final { scan-tree-dump-times "ABS" 4 "gimple"} } */
> +/* { dg-final { scan-tree-dump-times "\.COPYSIGN" 4 "gimple"} } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_1.c b/gcc/testsuite/gcc.target/aarch64/fneg-abs_1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..f823013c3ddf6b3a266c3abfcbf2642fc2a75fa6
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_1.c
> @@ -0,0 +1,39 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
> +
> +#pragma GCC target "+nosve"
> +
> +#include <arm_neon.h>
> +
> +/*
> +** t1:
> +** orr v[0-9]+.2s, #128, lsl #24
> +** ret
> +*/
> +float32x2_t t1 (float32x2_t a)
> +{
> + return vneg_f32 (vabs_f32 (a));
> +}
> +
> +/*
> +** t2:
> +** orr v[0-9]+.4s, #128, lsl #24
> +** ret
> +*/
> +float32x4_t t2 (float32x4_t a)
> +{
> + return vnegq_f32 (vabsq_f32 (a));
> +}
> +
> +/*
> +** t3:
> +** adrp x0, .LC[0-9]+
> +** ldr q[0-9]+, \[x0, #:lo12:.LC0\]
> +** orr v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> +** ret
> +*/
> +float64x2_t t3 (float64x2_t a)
> +{
> + return vnegq_f64 (vabsq_f64 (a));
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c b/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..141121176b309e4b2aa413dc55271a6e3c93d5e1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
> @@ -0,0 +1,31 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
> +
> +#pragma GCC target "+nosve"
> +
> +#include <arm_neon.h>
> +#include <math.h>
> +
> +/*
> +** f1:
> +** movi v[0-9]+.2s, 0x80, lsl 24
> +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> +** ret
> +*/
> +float32_t f1 (float32_t a)
> +{
> + return -fabsf (a);
> +}
> +
> +/*
> +** f2:
> +** mov x0, -9223372036854775808
> +** fmov d[0-9]+, x0
> +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> +** ret
> +*/
> +float64_t f2 (float64_t a)
> +{
> + return -fabs (a);
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_3.c b/gcc/testsuite/gcc.target/aarch64/fneg-abs_3.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..b4652173a95d104ddfa70c497f0627a61ea89d3b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_3.c
> @@ -0,0 +1,36 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
> +
> +#pragma GCC target "+nosve"
> +
> +#include <arm_neon.h>
> +#include <math.h>
> +
> +/*
> +** f1:
> +** ...
> +** ldr q[0-9]+, \[x0\]
> +** orr v[0-9]+.4s, #128, lsl #24
> +** str q[0-9]+, \[x0\], 16
> +** ...
> +*/
> +void f1 (float32_t *a, int n)
> +{
> + for (int i = 0; i < (n & -8); i++)
> + a[i] = -fabsf (a[i]);
> +}
> +
> +/*
> +** f2:
> +** ...
> +** ldr q[0-9]+, \[x0\]
> +** orr v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> +** str q[0-9]+, \[x0\], 16
> +** ...
> +*/
> +void f2 (float64_t *a, int n)
> +{
> + for (int i = 0; i < (n & -8); i++)
> + a[i] = -fabs (a[i]);
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c b/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..10879dea74462d34b26160eeb0bd54ead063166b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c
> @@ -0,0 +1,39 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
> +
> +#pragma GCC target "+nosve"
> +
> +#include <string.h>
> +
> +/*
> +** negabs:
> +** mov x0, -9223372036854775808
> +** fmov d[0-9]+, x0
> +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> +** ret
> +*/
> +double negabs (double x)
> +{
> + unsigned long long y;
> + memcpy (&y, &x, sizeof(double));
> + y = y | (1UL << 63);
> + memcpy (&x, &y, sizeof(double));
> + return x;
> +}
> +
> +/*
> +** negabsf:
> +** movi v[0-9]+.2s, 0x80, lsl 24
> +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> +** ret
> +*/
> +float negabsf (float x)
> +{
> + unsigned int y;
> + memcpy (&y, &x, sizeof(float));
> + y = y | (1U << 31);
> + memcpy (&x, &y, sizeof(float));
> + return x;
> +}
> +
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..0c7664e6de77a497682952653ffd417453854d52
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c
> @@ -0,0 +1,37 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
> +
> +#include <arm_neon.h>
> +
> +/*
> +** t1:
> +** orr v[0-9]+.2s, #128, lsl #24
> +** ret
> +*/
> +float32x2_t t1 (float32x2_t a)
> +{
> + return vneg_f32 (vabs_f32 (a));
> +}
> +
> +/*
> +** t2:
> +** orr v[0-9]+.4s, #128, lsl #24
> +** ret
> +*/
> +float32x4_t t2 (float32x4_t a)
> +{
> + return vnegq_f32 (vabsq_f32 (a));
> +}
> +
> +/*
> +** t3:
> +** adrp x0, .LC[0-9]+
> +** ldr q[0-9]+, \[x0, #:lo12:.LC0\]
> +** orr v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> +** ret
> +*/
> +float64x2_t t3 (float64x2_t a)
> +{
> + return vnegq_f64 (vabsq_f64 (a));
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..a60cd31b9294af2dac69eed1c93f899bd5c78fca
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
> @@ -0,0 +1,29 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
> +
> +#include <arm_neon.h>
> +#include <math.h>
> +
> +/*
> +** f1:
> +** movi v[0-9]+.2s, 0x80, lsl 24
> +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> +** ret
> +*/
> +float32_t f1 (float32_t a)
> +{
> + return -fabsf (a);
> +}
> +
> +/*
> +** f2:
> +** mov x0, -9223372036854775808
> +** fmov d[0-9]+, x0
> +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> +** ret
> +*/
> +float64_t f2 (float64_t a)
> +{
> + return -fabs (a);
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_3.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_3.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..1bf34328d8841de8e6b0a5458562a9f00e31c275
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_3.c
> @@ -0,0 +1,34 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
> +
> +#include <arm_neon.h>
> +#include <math.h>
> +
> +/*
> +** f1:
> +** ...
> +** ld1w z[0-9]+.s, p[0-9]+/z, \[x0, x2, lsl 2\]
> +** orr z[0-9]+.s, z[0-9]+.s, #0x80000000
> +** st1w z[0-9]+.s, p[0-9]+, \[x0, x2, lsl 2\]
> +** ...
> +*/
> +void f1 (float32_t *a, int n)
> +{
> + for (int i = 0; i < (n & -8); i++)
> + a[i] = -fabsf (a[i]);
> +}
> +
> +/*
> +** f2:
> +** ...
> +** ld1d z[0-9]+.d, p[0-9]+/z, \[x0, x2, lsl 3\]
> +** orr z[0-9]+.d, z[0-9]+.d, #0x8000000000000000
> +** st1d z[0-9]+.d, p[0-9]+, \[x0, x2, lsl 3\]
> +** ...
> +*/
> +void f2 (float64_t *a, int n)
> +{
> + for (int i = 0; i < (n & -8); i++)
> + a[i] = -fabs (a[i]);
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_4.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_4.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..21f2a8da2a5d44e3d01f6604ca7be87e3744d494
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_4.c
> @@ -0,0 +1,37 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
> +
> +#include <string.h>
> +
> +/*
> +** negabs:
> +** mov x0, -9223372036854775808
> +** fmov d[0-9]+, x0
> +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> +** ret
> +*/
> +double negabs (double x)
> +{
> + unsigned long long y;
> + memcpy (&y, &x, sizeof(double));
> + y = y | (1UL << 63);
> + memcpy (&x, &y, sizeof(double));
> + return x;
> +}
> +
> +/*
> +** negabsf:
> +** movi v[0-9]+.2s, 0x80, lsl 24
> +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> +** ret
> +*/
> +float negabsf (float x)
> +{
> + unsigned int y;
> + memcpy (&y, &x, sizeof(float));
> + y = y | (1U << 31);
> + memcpy (&x, &y, sizeof(float));
> + return x;
> +}
> +
>
--
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH]middle-end match.pd: optimize fneg (fabs (x)) to x | (1 << signbit(x)) [PR109154]
2023-10-06 6:24 ` Richard Biener
@ 2023-10-07 9:22 ` Richard Sandiford
2023-10-07 10:34 ` Richard Biener
0 siblings, 1 reply; 17+ messages in thread
From: Richard Sandiford @ 2023-10-07 9:22 UTC (permalink / raw)
To: Richard Biener; +Cc: Tamar Christina, Andrew Pinski, gcc-patches, nd, jlaw
Richard Biener <rguenther@suse.de> writes:
> On Thu, 5 Oct 2023, Tamar Christina wrote:
>
>> > I suppose the idea is that -abs(x) might be easier to optimize with other
>> > patterns (consider a - copysign(x,...), optimizing to a + abs(x)).
>> >
>> > For abs vs copysign it's a canonicalization, but (negate (abs @0)) is less
>> > canonical than copysign.
>> >
>> > > Should I try removing this?
>> >
>> > I'd say yes (and put the reverse canonicalization next to this pattern).
>> >
>>
>> This patch transforms fneg (fabs (x)) into copysign (x, -1) which is more
>> canonical and allows a target to expand this sequence efficiently. Such
>> sequences are common in scientific code working with gradients.
>>
>> various optimizations in match.pd only happened on COPYSIGN but not COPYSIGN_ALL
>> which means they exclude IFN_COPYSIGN. COPYSIGN however is restricted to only
>
> That's not true:
>
> (define_operator_list COPYSIGN
> BUILT_IN_COPYSIGNF
> BUILT_IN_COPYSIGN
> BUILT_IN_COPYSIGNL
> IFN_COPYSIGN)
>
> but they miss the extended float builtin variants like
> __builtin_copysignf16. Also see below
>
>> the C99 builtins and so doesn't work for vectors.
>>
>> The patch expands these optimizations to work on COPYSIGN_ALL.
>>
>> There is an existing canonicalization of copysign (x, -1) to fneg (fabs (x))
>> which I remove since this is a less efficient form. The testsuite is also
>> updated in light of this.
>>
>> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>>
>> Ok for master?
>>
>> Thanks,
>> Tamar
>>
>> gcc/ChangeLog:
>>
>> PR tree-optimization/109154
>> * match.pd: Add new neg+abs rule, remove inverse copysign rule and
>> expand existing copysign optimizations.
>>
>> gcc/testsuite/ChangeLog:
>>
>> PR tree-optimization/109154
>> * gcc.dg/fold-copysign-1.c: Updated.
>> * gcc.dg/pr55152-2.c: Updated.
>> * gcc.dg/tree-ssa/abs-4.c: Updated.
>> * gcc.dg/tree-ssa/backprop-6.c: Updated.
>> * gcc.dg/tree-ssa/copy-sign-2.c: Updated.
>> * gcc.dg/tree-ssa/mult-abs-2.c: Updated.
>> * gcc.target/aarch64/fneg-abs_1.c: New test.
>> * gcc.target/aarch64/fneg-abs_2.c: New test.
>> * gcc.target/aarch64/fneg-abs_3.c: New test.
>> * gcc.target/aarch64/fneg-abs_4.c: New test.
>> * gcc.target/aarch64/sve/fneg-abs_1.c: New test.
>> * gcc.target/aarch64/sve/fneg-abs_2.c: New test.
>> * gcc.target/aarch64/sve/fneg-abs_3.c: New test.
>> * gcc.target/aarch64/sve/fneg-abs_4.c: New test.
>>
>> --- inline copy of patch ---
>>
>> diff --git a/gcc/match.pd b/gcc/match.pd
>> index 4bdd83e6e061b16dbdb2845b9398fcfb8a6c9739..bd6599d36021e119f51a4928354f580ffe82c6e2 100644
>> --- a/gcc/match.pd
>> +++ b/gcc/match.pd
>> @@ -1074,45 +1074,43 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>>
>> /* cos(copysign(x, y)) -> cos(x). Similarly for cosh. */
>> (for coss (COS COSH)
>> - copysigns (COPYSIGN)
>> - (simplify
>> - (coss (copysigns @0 @1))
>> - (coss @0)))
>> + (for copysigns (COPYSIGN_ALL)
>
> So this ends up generating for example the match
> (cosf (copysignl ...)) which doesn't make much sense.
>
> The lock-step iteration did
> (cosf (copysignf ..)) ... (ifn_cos (ifn_copysign ...))
> which is leaner but misses the case of
> (cosf (ifn_copysign ..)) - that's probably what you are
> after with this change.
>
> That said, there isn't a nice solution (without altering the match.pd
> IL). There's the explicit solution, spelling out all combinations.
>
> So if we want to go with yout pragmatic solution changing this
> to use COPYSIGN_ALL isn't necessary, only changing the lock-step
> for iteration to a cross product for iteration is.
>
> Changing just this pattern to
>
> (for coss (COS COSH)
> (for copysigns (COPYSIGN)
> (simplify
> (coss (copysigns @0 @1))
> (coss @0))))
>
> increases the total number of gimple-match-x.cc lines from
> 234988 to 235324.
I guess the difference between this and the later suggestions is that
this one allows builtin copysign to be paired with ifn cos, which would
be potentially useful in other situations. (It isn't here because
ifn_cos is rarely provided.) How much of the growth is due to that,
and how much of it is from nonsensical combinations like
(builtin_cosf (builtin_copysignl ...))?
If it's mostly from nonsensical combinations then would it be possible
to make genmatch drop them?
> The alternative is to do
>
> (for coss (COS COSH)
> copysigns (COPYSIGN)
> (simplify
> (coss (copysigns @0 @1))
> (coss @0))
> (simplify
> (coss (IFN_COPYSIGN @0 @1))
> (coss @0)))
>
> which properly will diagnose a duplicate pattern. Ther are
> currently no operator lists with just builtins defined (that
> could be fixed, see gencfn-macros.cc), supposed we'd have
> COS_C we could do
>
> (for coss (COS_C COSH_C IFN_COS IFN_COSH)
> copysigns (COPYSIGN_C COPYSIGN_C IFN_COPYSIGN IFN_COPYSIGN
> IFN_COPYSIGN IFN_COPYSIGN IFN_COPYSIGN IFN_COPYSIGN IFN_COPYSIGN
> IFN_COPYSIGN)
> (simplify
> (coss (copysigns @0 @1))
> (coss @0)))
>
> which of course still looks ugly ;) (some syntax extension like
> allowing to specify IFN_COPYSIGN*8 would be nice here and easy
> enough to do)
>
> Can you split out the part changing COPYSIGN to COPYSIGN_ALL,
> re-do it to only split the fors, keeping COPYSIGN and provide
> some statistics on the gimple-match-* size? I think this might
> be the pragmatic solution for now.
>
> Richard - can you think of a clever way to express the desired
> iteration? How do RTL macro iterations address cases like this?
I don't think .md files have an equivalent construct, unfortunately.
(I also regret some of the choices I made for .md iterators, but that's
another story.)
Perhaps an alternative to the *8 thing would be "IFN_COPYSIGN...",
with the "..." meaning "fill to match the longest operator list
in the loop".
Thanks,
Richard
> Richard.
>
>> + (simplify
>> + (coss (copysigns @0 @1))
>> + (coss @0))))
>>
>> /* pow(copysign(x, y), z) -> pow(x, z) if z is an even integer. */
>> (for pows (POW)
>> - copysigns (COPYSIGN)
>> - (simplify
>> - (pows (copysigns @0 @2) REAL_CST@1)
>> - (with { HOST_WIDE_INT n; }
>> - (if (real_isinteger (&TREE_REAL_CST (@1), &n) && (n & 1) == 0)
>> - (pows @0 @1)))))
>> + (for copysigns (COPYSIGN_ALL)
>> + (simplify
>> + (pows (copysigns @0 @2) REAL_CST@1)
>> + (with { HOST_WIDE_INT n; }
>> + (if (real_isinteger (&TREE_REAL_CST (@1), &n) && (n & 1) == 0)
>> + (pows @0 @1))))))
>> /* Likewise for powi. */
>> (for pows (POWI)
>> - copysigns (COPYSIGN)
>> - (simplify
>> - (pows (copysigns @0 @2) INTEGER_CST@1)
>> - (if ((wi::to_wide (@1) & 1) == 0)
>> - (pows @0 @1))))
>> + (for copysigns (COPYSIGN_ALL)
>> + (simplify
>> + (pows (copysigns @0 @2) INTEGER_CST@1)
>> + (if ((wi::to_wide (@1) & 1) == 0)
>> + (pows @0 @1)))))
>>
>> (for hypots (HYPOT)
>> - copysigns (COPYSIGN)
>> - /* hypot(copysign(x, y), z) -> hypot(x, z). */
>> - (simplify
>> - (hypots (copysigns @0 @1) @2)
>> - (hypots @0 @2))
>> - /* hypot(x, copysign(y, z)) -> hypot(x, y). */
>> - (simplify
>> - (hypots @0 (copysigns @1 @2))
>> - (hypots @0 @1)))
>> + (for copysigns (COPYSIGN)
>> + /* hypot(copysign(x, y), z) -> hypot(x, z). */
>> + (simplify
>> + (hypots (copysigns @0 @1) @2)
>> + (hypots @0 @2))
>> + /* hypot(x, copysign(y, z)) -> hypot(x, y). */
>> + (simplify
>> + (hypots @0 (copysigns @1 @2))
>> + (hypots @0 @1))))
>>
>> -/* copysign(x, CST) -> [-]abs (x). */
>> -(for copysigns (COPYSIGN_ALL)
>> - (simplify
>> - (copysigns @0 REAL_CST@1)
>> - (if (REAL_VALUE_NEGATIVE (TREE_REAL_CST (@1)))
>> - (negate (abs @0))
>> - (abs @0))))
>> +/* Transform fneg (fabs (X)) -> copysign (X, -1). */
>> +
>> +(simplify
>> + (negate (abs @0))
>> + (IFN_COPYSIGN @0 { build_minus_one_cst (type); }))
>>
>> /* copysign(copysign(x, y), z) -> copysign(x, z). */
>> (for copysigns (COPYSIGN_ALL)
>> diff --git a/gcc/testsuite/gcc.dg/fold-copysign-1.c b/gcc/testsuite/gcc.dg/fold-copysign-1.c
>> index f17d65c24ee4dca9867827d040fe0a404c515e7b..f9cafd14ab05f5e8ab2f6f68e62801d21c2df6a6 100644
>> --- a/gcc/testsuite/gcc.dg/fold-copysign-1.c
>> +++ b/gcc/testsuite/gcc.dg/fold-copysign-1.c
>> @@ -12,5 +12,5 @@ double bar (double x)
>> return __builtin_copysign (x, minuszero);
>> }
>>
>> -/* { dg-final { scan-tree-dump-times "= -" 1 "cddce1" } } */
>> -/* { dg-final { scan-tree-dump-times "= ABS_EXPR" 2 "cddce1" } } */
>> +/* { dg-final { scan-tree-dump-times "__builtin_copysign" 1 "cddce1" } } */
>> +/* { dg-final { scan-tree-dump-times "= ABS_EXPR" 1 "cddce1" } } */
>> diff --git a/gcc/testsuite/gcc.dg/pr55152-2.c b/gcc/testsuite/gcc.dg/pr55152-2.c
>> index 54db0f2062da105a829d6690ac8ed9891fe2b588..605f202ed6bc7aa8fe921457b02ff0b88cc63ce6 100644
>> --- a/gcc/testsuite/gcc.dg/pr55152-2.c
>> +++ b/gcc/testsuite/gcc.dg/pr55152-2.c
>> @@ -10,4 +10,5 @@ int f(int a)
>> return (a<-a)?a:-a;
>> }
>>
>> -/* { dg-final { scan-tree-dump-times "ABS_EXPR" 2 "optimized" } } */
>> +/* { dg-final { scan-tree-dump-times "\.COPYSIGN" 1 "optimized" } } */
>> +/* { dg-final { scan-tree-dump-times "ABS_EXPR" 1 "optimized" } } */
>> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/abs-4.c b/gcc/testsuite/gcc.dg/tree-ssa/abs-4.c
>> index 6197519faf7b55aed7bc162cd0a14dd2145210ca..e1b825f37f69ac3c4666b3a52d733368805ad31d 100644
>> --- a/gcc/testsuite/gcc.dg/tree-ssa/abs-4.c
>> +++ b/gcc/testsuite/gcc.dg/tree-ssa/abs-4.c
>> @@ -9,5 +9,6 @@ long double abs_ld(long double x) { return __builtin_signbit(x) ? x : -x; }
>>
>> /* __builtin_signbit(x) ? x : -x. Should be convert into - ABS_EXP<x> */
>> /* { dg-final { scan-tree-dump-not "signbit" "optimized"} } */
>> -/* { dg-final { scan-tree-dump-times "= ABS_EXPR" 3 "optimized"} } */
>> -/* { dg-final { scan-tree-dump-times "= -" 3 "optimized"} } */
>> +/* { dg-final { scan-tree-dump-times "= ABS_EXPR" 1 "optimized"} } */
>> +/* { dg-final { scan-tree-dump-times "= -" 1 "optimized"} } */
>> +/* { dg-final { scan-tree-dump-times "= \.COPYSIGN" 2 "optimized"} } */
>> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/backprop-6.c b/gcc/testsuite/gcc.dg/tree-ssa/backprop-6.c
>> index 31f05716f1498dc709cac95fa20fb5796642c77e..c3a138642d6ff7be984e91fa1343cb2718db7ae1 100644
>> --- a/gcc/testsuite/gcc.dg/tree-ssa/backprop-6.c
>> +++ b/gcc/testsuite/gcc.dg/tree-ssa/backprop-6.c
>> @@ -26,5 +26,6 @@ TEST_FUNCTION (float, f)
>> TEST_FUNCTION (double, )
>> TEST_FUNCTION (long double, l)
>>
>> -/* { dg-final { scan-tree-dump-times {Deleting[^\n]* = -} 6 "backprop" } } */
>> -/* { dg-final { scan-tree-dump-times {Deleting[^\n]* = ABS_EXPR <} 3 "backprop" } } */
>> +/* { dg-final { scan-tree-dump-times {Deleting[^\n]* = -} 4 "backprop" } } */
>> +/* { dg-final { scan-tree-dump-times {Deleting[^\n]* = \.COPYSIGN} 2 "backprop" } } */
>> +/* { dg-final { scan-tree-dump-times {Deleting[^\n]* = ABS_EXPR <} 1 "backprop" } } */
>> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/copy-sign-2.c b/gcc/testsuite/gcc.dg/tree-ssa/copy-sign-2.c
>> index de52c5f7c8062958353d91f5031193defc9f3f91..e5d565c4b9832c00106588ef411fbd8c292a5cad 100644
>> --- a/gcc/testsuite/gcc.dg/tree-ssa/copy-sign-2.c
>> +++ b/gcc/testsuite/gcc.dg/tree-ssa/copy-sign-2.c
>> @@ -10,4 +10,5 @@ float f1(float x)
>> float t = __builtin_copysignf (1.0f, -x);
>> return x * t;
>> }
>> -/* { dg-final { scan-tree-dump-times "ABS" 2 "optimized"} } */
>> +/* { dg-final { scan-tree-dump-times "ABS" 1 "optimized"} } */
>> +/* { dg-final { scan-tree-dump-times ".COPYSIGN" 1 "optimized"} } */
>> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/mult-abs-2.c b/gcc/testsuite/gcc.dg/tree-ssa/mult-abs-2.c
>> index a41f1baf25669a4fd301a586a49ba5e3c5b966ab..a22896b21c8b5a4d5d8e28bd8ae0db896e63ade0 100644
>> --- a/gcc/testsuite/gcc.dg/tree-ssa/mult-abs-2.c
>> +++ b/gcc/testsuite/gcc.dg/tree-ssa/mult-abs-2.c
>> @@ -34,4 +34,5 @@ float i1(float x)
>> {
>> return x * (x <= 0.f ? 1.f : -1.f);
>> }
>> -/* { dg-final { scan-tree-dump-times "ABS" 8 "gimple"} } */
>> +/* { dg-final { scan-tree-dump-times "ABS" 4 "gimple"} } */
>> +/* { dg-final { scan-tree-dump-times "\.COPYSIGN" 4 "gimple"} } */
>> diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_1.c b/gcc/testsuite/gcc.target/aarch64/fneg-abs_1.c
>> new file mode 100644
>> index 0000000000000000000000000000000000000000..f823013c3ddf6b3a266c3abfcbf2642fc2a75fa6
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_1.c
>> @@ -0,0 +1,39 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O3" } */
>> +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
>> +
>> +#pragma GCC target "+nosve"
>> +
>> +#include <arm_neon.h>
>> +
>> +/*
>> +** t1:
>> +** orr v[0-9]+.2s, #128, lsl #24
>> +** ret
>> +*/
>> +float32x2_t t1 (float32x2_t a)
>> +{
>> + return vneg_f32 (vabs_f32 (a));
>> +}
>> +
>> +/*
>> +** t2:
>> +** orr v[0-9]+.4s, #128, lsl #24
>> +** ret
>> +*/
>> +float32x4_t t2 (float32x4_t a)
>> +{
>> + return vnegq_f32 (vabsq_f32 (a));
>> +}
>> +
>> +/*
>> +** t3:
>> +** adrp x0, .LC[0-9]+
>> +** ldr q[0-9]+, \[x0, #:lo12:.LC0\]
>> +** orr v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
>> +** ret
>> +*/
>> +float64x2_t t3 (float64x2_t a)
>> +{
>> + return vnegq_f64 (vabsq_f64 (a));
>> +}
>> diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c b/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
>> new file mode 100644
>> index 0000000000000000000000000000000000000000..141121176b309e4b2aa413dc55271a6e3c93d5e1
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
>> @@ -0,0 +1,31 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O3" } */
>> +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
>> +
>> +#pragma GCC target "+nosve"
>> +
>> +#include <arm_neon.h>
>> +#include <math.h>
>> +
>> +/*
>> +** f1:
>> +** movi v[0-9]+.2s, 0x80, lsl 24
>> +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
>> +** ret
>> +*/
>> +float32_t f1 (float32_t a)
>> +{
>> + return -fabsf (a);
>> +}
>> +
>> +/*
>> +** f2:
>> +** mov x0, -9223372036854775808
>> +** fmov d[0-9]+, x0
>> +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
>> +** ret
>> +*/
>> +float64_t f2 (float64_t a)
>> +{
>> + return -fabs (a);
>> +}
>> diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_3.c b/gcc/testsuite/gcc.target/aarch64/fneg-abs_3.c
>> new file mode 100644
>> index 0000000000000000000000000000000000000000..b4652173a95d104ddfa70c497f0627a61ea89d3b
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_3.c
>> @@ -0,0 +1,36 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O3" } */
>> +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
>> +
>> +#pragma GCC target "+nosve"
>> +
>> +#include <arm_neon.h>
>> +#include <math.h>
>> +
>> +/*
>> +** f1:
>> +** ...
>> +** ldr q[0-9]+, \[x0\]
>> +** orr v[0-9]+.4s, #128, lsl #24
>> +** str q[0-9]+, \[x0\], 16
>> +** ...
>> +*/
>> +void f1 (float32_t *a, int n)
>> +{
>> + for (int i = 0; i < (n & -8); i++)
>> + a[i] = -fabsf (a[i]);
>> +}
>> +
>> +/*
>> +** f2:
>> +** ...
>> +** ldr q[0-9]+, \[x0\]
>> +** orr v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
>> +** str q[0-9]+, \[x0\], 16
>> +** ...
>> +*/
>> +void f2 (float64_t *a, int n)
>> +{
>> + for (int i = 0; i < (n & -8); i++)
>> + a[i] = -fabs (a[i]);
>> +}
>> diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c b/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c
>> new file mode 100644
>> index 0000000000000000000000000000000000000000..10879dea74462d34b26160eeb0bd54ead063166b
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c
>> @@ -0,0 +1,39 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O3" } */
>> +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
>> +
>> +#pragma GCC target "+nosve"
>> +
>> +#include <string.h>
>> +
>> +/*
>> +** negabs:
>> +** mov x0, -9223372036854775808
>> +** fmov d[0-9]+, x0
>> +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
>> +** ret
>> +*/
>> +double negabs (double x)
>> +{
>> + unsigned long long y;
>> + memcpy (&y, &x, sizeof(double));
>> + y = y | (1UL << 63);
>> + memcpy (&x, &y, sizeof(double));
>> + return x;
>> +}
>> +
>> +/*
>> +** negabsf:
>> +** movi v[0-9]+.2s, 0x80, lsl 24
>> +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
>> +** ret
>> +*/
>> +float negabsf (float x)
>> +{
>> + unsigned int y;
>> + memcpy (&y, &x, sizeof(float));
>> + y = y | (1U << 31);
>> + memcpy (&x, &y, sizeof(float));
>> + return x;
>> +}
>> +
>> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c
>> new file mode 100644
>> index 0000000000000000000000000000000000000000..0c7664e6de77a497682952653ffd417453854d52
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c
>> @@ -0,0 +1,37 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O3" } */
>> +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
>> +
>> +#include <arm_neon.h>
>> +
>> +/*
>> +** t1:
>> +** orr v[0-9]+.2s, #128, lsl #24
>> +** ret
>> +*/
>> +float32x2_t t1 (float32x2_t a)
>> +{
>> + return vneg_f32 (vabs_f32 (a));
>> +}
>> +
>> +/*
>> +** t2:
>> +** orr v[0-9]+.4s, #128, lsl #24
>> +** ret
>> +*/
>> +float32x4_t t2 (float32x4_t a)
>> +{
>> + return vnegq_f32 (vabsq_f32 (a));
>> +}
>> +
>> +/*
>> +** t3:
>> +** adrp x0, .LC[0-9]+
>> +** ldr q[0-9]+, \[x0, #:lo12:.LC0\]
>> +** orr v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
>> +** ret
>> +*/
>> +float64x2_t t3 (float64x2_t a)
>> +{
>> + return vnegq_f64 (vabsq_f64 (a));
>> +}
>> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
>> new file mode 100644
>> index 0000000000000000000000000000000000000000..a60cd31b9294af2dac69eed1c93f899bd5c78fca
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
>> @@ -0,0 +1,29 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O3" } */
>> +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
>> +
>> +#include <arm_neon.h>
>> +#include <math.h>
>> +
>> +/*
>> +** f1:
>> +** movi v[0-9]+.2s, 0x80, lsl 24
>> +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
>> +** ret
>> +*/
>> +float32_t f1 (float32_t a)
>> +{
>> + return -fabsf (a);
>> +}
>> +
>> +/*
>> +** f2:
>> +** mov x0, -9223372036854775808
>> +** fmov d[0-9]+, x0
>> +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
>> +** ret
>> +*/
>> +float64_t f2 (float64_t a)
>> +{
>> + return -fabs (a);
>> +}
>> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_3.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_3.c
>> new file mode 100644
>> index 0000000000000000000000000000000000000000..1bf34328d8841de8e6b0a5458562a9f00e31c275
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_3.c
>> @@ -0,0 +1,34 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O3" } */
>> +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
>> +
>> +#include <arm_neon.h>
>> +#include <math.h>
>> +
>> +/*
>> +** f1:
>> +** ...
>> +** ld1w z[0-9]+.s, p[0-9]+/z, \[x0, x2, lsl 2\]
>> +** orr z[0-9]+.s, z[0-9]+.s, #0x80000000
>> +** st1w z[0-9]+.s, p[0-9]+, \[x0, x2, lsl 2\]
>> +** ...
>> +*/
>> +void f1 (float32_t *a, int n)
>> +{
>> + for (int i = 0; i < (n & -8); i++)
>> + a[i] = -fabsf (a[i]);
>> +}
>> +
>> +/*
>> +** f2:
>> +** ...
>> +** ld1d z[0-9]+.d, p[0-9]+/z, \[x0, x2, lsl 3\]
>> +** orr z[0-9]+.d, z[0-9]+.d, #0x8000000000000000
>> +** st1d z[0-9]+.d, p[0-9]+, \[x0, x2, lsl 3\]
>> +** ...
>> +*/
>> +void f2 (float64_t *a, int n)
>> +{
>> + for (int i = 0; i < (n & -8); i++)
>> + a[i] = -fabs (a[i]);
>> +}
>> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_4.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_4.c
>> new file mode 100644
>> index 0000000000000000000000000000000000000000..21f2a8da2a5d44e3d01f6604ca7be87e3744d494
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_4.c
>> @@ -0,0 +1,37 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O3" } */
>> +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
>> +
>> +#include <string.h>
>> +
>> +/*
>> +** negabs:
>> +** mov x0, -9223372036854775808
>> +** fmov d[0-9]+, x0
>> +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
>> +** ret
>> +*/
>> +double negabs (double x)
>> +{
>> + unsigned long long y;
>> + memcpy (&y, &x, sizeof(double));
>> + y = y | (1UL << 63);
>> + memcpy (&x, &y, sizeof(double));
>> + return x;
>> +}
>> +
>> +/*
>> +** negabsf:
>> +** movi v[0-9]+.2s, 0x80, lsl 24
>> +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
>> +** ret
>> +*/
>> +float negabsf (float x)
>> +{
>> + unsigned int y;
>> + memcpy (&y, &x, sizeof(float));
>> + y = y | (1U << 31);
>> + memcpy (&x, &y, sizeof(float));
>> + return x;
>> +}
>> +
>>
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH]middle-end match.pd: optimize fneg (fabs (x)) to x | (1 << signbit(x)) [PR109154]
2023-10-07 9:22 ` Richard Sandiford
@ 2023-10-07 10:34 ` Richard Biener
2023-10-07 11:34 ` Richard Sandiford
0 siblings, 1 reply; 17+ messages in thread
From: Richard Biener @ 2023-10-07 10:34 UTC (permalink / raw)
To: Richard Sandiford; +Cc: Tamar Christina, Andrew Pinski, gcc-patches, nd, jlaw
> Am 07.10.2023 um 11:23 schrieb Richard Sandiford <richard.sandiford@arm.com>:
>
> Richard Biener <rguenther@suse.de> writes:
>> On Thu, 5 Oct 2023, Tamar Christina wrote:
>>
>>>> I suppose the idea is that -abs(x) might be easier to optimize with other
>>>> patterns (consider a - copysign(x,...), optimizing to a + abs(x)).
>>>>
>>>> For abs vs copysign it's a canonicalization, but (negate (abs @0)) is less
>>>> canonical than copysign.
>>>>
>>>>> Should I try removing this?
>>>>
>>>> I'd say yes (and put the reverse canonicalization next to this pattern).
>>>>
>>>
>>> This patch transforms fneg (fabs (x)) into copysign (x, -1) which is more
>>> canonical and allows a target to expand this sequence efficiently. Such
>>> sequences are common in scientific code working with gradients.
>>>
>>> various optimizations in match.pd only happened on COPYSIGN but not COPYSIGN_ALL
>>> which means they exclude IFN_COPYSIGN. COPYSIGN however is restricted to only
>>
>> That's not true:
>>
>> (define_operator_list COPYSIGN
>> BUILT_IN_COPYSIGNF
>> BUILT_IN_COPYSIGN
>> BUILT_IN_COPYSIGNL
>> IFN_COPYSIGN)
>>
>> but they miss the extended float builtin variants like
>> __builtin_copysignf16. Also see below
>>
>>> the C99 builtins and so doesn't work for vectors.
>>>
>>> The patch expands these optimizations to work on COPYSIGN_ALL.
>>>
>>> There is an existing canonicalization of copysign (x, -1) to fneg (fabs (x))
>>> which I remove since this is a less efficient form. The testsuite is also
>>> updated in light of this.
>>>
>>> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>>>
>>> Ok for master?
>>>
>>> Thanks,
>>> Tamar
>>>
>>> gcc/ChangeLog:
>>>
>>> PR tree-optimization/109154
>>> * match.pd: Add new neg+abs rule, remove inverse copysign rule and
>>> expand existing copysign optimizations.
>>>
>>> gcc/testsuite/ChangeLog:
>>>
>>> PR tree-optimization/109154
>>> * gcc.dg/fold-copysign-1.c: Updated.
>>> * gcc.dg/pr55152-2.c: Updated.
>>> * gcc.dg/tree-ssa/abs-4.c: Updated.
>>> * gcc.dg/tree-ssa/backprop-6.c: Updated.
>>> * gcc.dg/tree-ssa/copy-sign-2.c: Updated.
>>> * gcc.dg/tree-ssa/mult-abs-2.c: Updated.
>>> * gcc.target/aarch64/fneg-abs_1.c: New test.
>>> * gcc.target/aarch64/fneg-abs_2.c: New test.
>>> * gcc.target/aarch64/fneg-abs_3.c: New test.
>>> * gcc.target/aarch64/fneg-abs_4.c: New test.
>>> * gcc.target/aarch64/sve/fneg-abs_1.c: New test.
>>> * gcc.target/aarch64/sve/fneg-abs_2.c: New test.
>>> * gcc.target/aarch64/sve/fneg-abs_3.c: New test.
>>> * gcc.target/aarch64/sve/fneg-abs_4.c: New test.
>>>
>>> --- inline copy of patch ---
>>>
>>> diff --git a/gcc/match.pd b/gcc/match.pd
>>> index 4bdd83e6e061b16dbdb2845b9398fcfb8a6c9739..bd6599d36021e119f51a4928354f580ffe82c6e2 100644
>>> --- a/gcc/match.pd
>>> +++ b/gcc/match.pd
>>> @@ -1074,45 +1074,43 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>>>
>>> /* cos(copysign(x, y)) -> cos(x). Similarly for cosh. */
>>> (for coss (COS COSH)
>>> - copysigns (COPYSIGN)
>>> - (simplify
>>> - (coss (copysigns @0 @1))
>>> - (coss @0)))
>>> + (for copysigns (COPYSIGN_ALL)
>>
>> So this ends up generating for example the match
>> (cosf (copysignl ...)) which doesn't make much sense.
>>
>> The lock-step iteration did
>> (cosf (copysignf ..)) ... (ifn_cos (ifn_copysign ...))
>> which is leaner but misses the case of
>> (cosf (ifn_copysign ..)) - that's probably what you are
>> after with this change.
>>
>> That said, there isn't a nice solution (without altering the match.pd
>> IL). There's the explicit solution, spelling out all combinations.
>>
>> So if we want to go with your pragmatic solution changing this
>> to use COPYSIGN_ALL isn't necessary, only changing the lock-step
>> for iteration to a cross product for iteration is.
>>
>> Changing just this pattern to
>>
>> (for coss (COS COSH)
>> (for copysigns (COPYSIGN)
>> (simplify
>> (coss (copysigns @0 @1))
>> (coss @0))))
>>
>> increases the total number of gimple-match-x.cc lines from
>> 234988 to 235324.
>
> I guess the difference between this and the later suggestions is that
> this one allows builtin copysign to be paired with ifn cos, which would
> be potentially useful in other situations. (It isn't here because
> ifn_cos is rarely provided.) How much of the growth is due to that,
> and how much of it is from nonsensical combinations like
> (builtin_cosf (builtin_copysignl ...))?
>
> If it's mostly from nonsensical combinations then would it be possible
> to make genmatch drop them?
>
>> The alternative is to do
>>
>> (for coss (COS COSH)
>> copysigns (COPYSIGN)
>> (simplify
>> (coss (copysigns @0 @1))
>> (coss @0))
>> (simplify
>> (coss (IFN_COPYSIGN @0 @1))
>> (coss @0)))
>>
>> which properly will diagnose a duplicate pattern. There are
>> currently no operator lists with just builtins defined (that
>> could be fixed, see gencfn-macros.cc), suppose we'd have
>> COS_C we could do
>>
>> (for coss (COS_C COSH_C IFN_COS IFN_COSH)
>> copysigns (COPYSIGN_C COPYSIGN_C IFN_COPYSIGN IFN_COPYSIGN
>> IFN_COPYSIGN IFN_COPYSIGN IFN_COPYSIGN IFN_COPYSIGN IFN_COPYSIGN
>> IFN_COPYSIGN)
>> (simplify
>> (coss (copysigns @0 @1))
>> (coss @0)))
>>
>> which of course still looks ugly ;) (some syntax extension like
>> allowing to specify IFN_COPYSIGN*8 would be nice here and easy
>> enough to do)
>>
>> Can you split out the part changing COPYSIGN to COPYSIGN_ALL,
>> re-do it to only split the fors, keeping COPYSIGN and provide
>> some statistics on the gimple-match-* size? I think this might
>> be the pragmatic solution for now.
>>
>> Richard - can you think of a clever way to express the desired
>> iteration? How do RTL macro iterations address cases like this?
>
> I don't think .md files have an equivalent construct, unfortunately.
> (I also regret some of the choices I made for .md iterators, but that's
> another story.)
>
> Perhaps an alternative to the *8 thing would be "IFN_COPYSIGN...",
> with the "..." meaning "fill to match the longest operator list
> in the loop".
Hm, I’ll think about this. It would be useful to have a function like
Internal_fn ifn_for (combined_fn);
So we can indirectly match all builtins with a switch on the ifn code.
Richard
> Thanks,
> Richard
>
>> Richard.
>>
>>> + (simplify
>>> + (coss (copysigns @0 @1))
>>> + (coss @0))))
>>>
>>> /* pow(copysign(x, y), z) -> pow(x, z) if z is an even integer. */
>>> (for pows (POW)
>>> - copysigns (COPYSIGN)
>>> - (simplify
>>> - (pows (copysigns @0 @2) REAL_CST@1)
>>> - (with { HOST_WIDE_INT n; }
>>> - (if (real_isinteger (&TREE_REAL_CST (@1), &n) && (n & 1) == 0)
>>> - (pows @0 @1)))))
>>> + (for copysigns (COPYSIGN_ALL)
>>> + (simplify
>>> + (pows (copysigns @0 @2) REAL_CST@1)
>>> + (with { HOST_WIDE_INT n; }
>>> + (if (real_isinteger (&TREE_REAL_CST (@1), &n) && (n & 1) == 0)
>>> + (pows @0 @1))))))
>>> /* Likewise for powi. */
>>> (for pows (POWI)
>>> - copysigns (COPYSIGN)
>>> - (simplify
>>> - (pows (copysigns @0 @2) INTEGER_CST@1)
>>> - (if ((wi::to_wide (@1) & 1) == 0)
>>> - (pows @0 @1))))
>>> + (for copysigns (COPYSIGN_ALL)
>>> + (simplify
>>> + (pows (copysigns @0 @2) INTEGER_CST@1)
>>> + (if ((wi::to_wide (@1) & 1) == 0)
>>> + (pows @0 @1)))))
>>>
>>> (for hypots (HYPOT)
>>> - copysigns (COPYSIGN)
>>> - /* hypot(copysign(x, y), z) -> hypot(x, z). */
>>> - (simplify
>>> - (hypots (copysigns @0 @1) @2)
>>> - (hypots @0 @2))
>>> - /* hypot(x, copysign(y, z)) -> hypot(x, y). */
>>> - (simplify
>>> - (hypots @0 (copysigns @1 @2))
>>> - (hypots @0 @1)))
>>> + (for copysigns (COPYSIGN)
>>> + /* hypot(copysign(x, y), z) -> hypot(x, z). */
>>> + (simplify
>>> + (hypots (copysigns @0 @1) @2)
>>> + (hypots @0 @2))
>>> + /* hypot(x, copysign(y, z)) -> hypot(x, y). */
>>> + (simplify
>>> + (hypots @0 (copysigns @1 @2))
>>> + (hypots @0 @1))))
>>>
>>> -/* copysign(x, CST) -> [-]abs (x). */
>>> -(for copysigns (COPYSIGN_ALL)
>>> - (simplify
>>> - (copysigns @0 REAL_CST@1)
>>> - (if (REAL_VALUE_NEGATIVE (TREE_REAL_CST (@1)))
>>> - (negate (abs @0))
>>> - (abs @0))))
>>> +/* Transform fneg (fabs (X)) -> copysign (X, -1). */
>>> +
>>> +(simplify
>>> + (negate (abs @0))
>>> + (IFN_COPYSIGN @0 { build_minus_one_cst (type); }))
>>>
>>> /* copysign(copysign(x, y), z) -> copysign(x, z). */
>>> (for copysigns (COPYSIGN_ALL)
>>> diff --git a/gcc/testsuite/gcc.dg/fold-copysign-1.c b/gcc/testsuite/gcc.dg/fold-copysign-1.c
>>> index f17d65c24ee4dca9867827d040fe0a404c515e7b..f9cafd14ab05f5e8ab2f6f68e62801d21c2df6a6 100644
>>> --- a/gcc/testsuite/gcc.dg/fold-copysign-1.c
>>> +++ b/gcc/testsuite/gcc.dg/fold-copysign-1.c
>>> @@ -12,5 +12,5 @@ double bar (double x)
>>> return __builtin_copysign (x, minuszero);
>>> }
>>>
>>> -/* { dg-final { scan-tree-dump-times "= -" 1 "cddce1" } } */
>>> -/* { dg-final { scan-tree-dump-times "= ABS_EXPR" 2 "cddce1" } } */
>>> +/* { dg-final { scan-tree-dump-times "__builtin_copysign" 1 "cddce1" } } */
>>> +/* { dg-final { scan-tree-dump-times "= ABS_EXPR" 1 "cddce1" } } */
>>> diff --git a/gcc/testsuite/gcc.dg/pr55152-2.c b/gcc/testsuite/gcc.dg/pr55152-2.c
>>> index 54db0f2062da105a829d6690ac8ed9891fe2b588..605f202ed6bc7aa8fe921457b02ff0b88cc63ce6 100644
>>> --- a/gcc/testsuite/gcc.dg/pr55152-2.c
>>> +++ b/gcc/testsuite/gcc.dg/pr55152-2.c
>>> @@ -10,4 +10,5 @@ int f(int a)
>>> return (a<-a)?a:-a;
>>> }
>>>
>>> -/* { dg-final { scan-tree-dump-times "ABS_EXPR" 2 "optimized" } } */
>>> +/* { dg-final { scan-tree-dump-times "\.COPYSIGN" 1 "optimized" } } */
>>> +/* { dg-final { scan-tree-dump-times "ABS_EXPR" 1 "optimized" } } */
>>> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/abs-4.c b/gcc/testsuite/gcc.dg/tree-ssa/abs-4.c
>>> index 6197519faf7b55aed7bc162cd0a14dd2145210ca..e1b825f37f69ac3c4666b3a52d733368805ad31d 100644
>>> --- a/gcc/testsuite/gcc.dg/tree-ssa/abs-4.c
>>> +++ b/gcc/testsuite/gcc.dg/tree-ssa/abs-4.c
>>> @@ -9,5 +9,6 @@ long double abs_ld(long double x) { return __builtin_signbit(x) ? x : -x; }
>>>
>>> /* __builtin_signbit(x) ? x : -x. Should be convert into - ABS_EXP<x> */
>>> /* { dg-final { scan-tree-dump-not "signbit" "optimized"} } */
>>> -/* { dg-final { scan-tree-dump-times "= ABS_EXPR" 3 "optimized"} } */
>>> -/* { dg-final { scan-tree-dump-times "= -" 3 "optimized"} } */
>>> +/* { dg-final { scan-tree-dump-times "= ABS_EXPR" 1 "optimized"} } */
>>> +/* { dg-final { scan-tree-dump-times "= -" 1 "optimized"} } */
>>> +/* { dg-final { scan-tree-dump-times "= \.COPYSIGN" 2 "optimized"} } */
>>> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/backprop-6.c b/gcc/testsuite/gcc.dg/tree-ssa/backprop-6.c
>>> index 31f05716f1498dc709cac95fa20fb5796642c77e..c3a138642d6ff7be984e91fa1343cb2718db7ae1 100644
>>> --- a/gcc/testsuite/gcc.dg/tree-ssa/backprop-6.c
>>> +++ b/gcc/testsuite/gcc.dg/tree-ssa/backprop-6.c
>>> @@ -26,5 +26,6 @@ TEST_FUNCTION (float, f)
>>> TEST_FUNCTION (double, )
>>> TEST_FUNCTION (long double, l)
>>>
>>> -/* { dg-final { scan-tree-dump-times {Deleting[^\n]* = -} 6 "backprop" } } */
>>> -/* { dg-final { scan-tree-dump-times {Deleting[^\n]* = ABS_EXPR <} 3 "backprop" } } */
>>> +/* { dg-final { scan-tree-dump-times {Deleting[^\n]* = -} 4 "backprop" } } */
>>> +/* { dg-final { scan-tree-dump-times {Deleting[^\n]* = \.COPYSIGN} 2 "backprop" } } */
>>> +/* { dg-final { scan-tree-dump-times {Deleting[^\n]* = ABS_EXPR <} 1 "backprop" } } */
>>> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/copy-sign-2.c b/gcc/testsuite/gcc.dg/tree-ssa/copy-sign-2.c
>>> index de52c5f7c8062958353d91f5031193defc9f3f91..e5d565c4b9832c00106588ef411fbd8c292a5cad 100644
>>> --- a/gcc/testsuite/gcc.dg/tree-ssa/copy-sign-2.c
>>> +++ b/gcc/testsuite/gcc.dg/tree-ssa/copy-sign-2.c
>>> @@ -10,4 +10,5 @@ float f1(float x)
>>> float t = __builtin_copysignf (1.0f, -x);
>>> return x * t;
>>> }
>>> -/* { dg-final { scan-tree-dump-times "ABS" 2 "optimized"} } */
>>> +/* { dg-final { scan-tree-dump-times "ABS" 1 "optimized"} } */
>>> +/* { dg-final { scan-tree-dump-times ".COPYSIGN" 1 "optimized"} } */
>>> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/mult-abs-2.c b/gcc/testsuite/gcc.dg/tree-ssa/mult-abs-2.c
>>> index a41f1baf25669a4fd301a586a49ba5e3c5b966ab..a22896b21c8b5a4d5d8e28bd8ae0db896e63ade0 100644
>>> --- a/gcc/testsuite/gcc.dg/tree-ssa/mult-abs-2.c
>>> +++ b/gcc/testsuite/gcc.dg/tree-ssa/mult-abs-2.c
>>> @@ -34,4 +34,5 @@ float i1(float x)
>>> {
>>> return x * (x <= 0.f ? 1.f : -1.f);
>>> }
>>> -/* { dg-final { scan-tree-dump-times "ABS" 8 "gimple"} } */
>>> +/* { dg-final { scan-tree-dump-times "ABS" 4 "gimple"} } */
>>> +/* { dg-final { scan-tree-dump-times "\.COPYSIGN" 4 "gimple"} } */
>>> diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_1.c b/gcc/testsuite/gcc.target/aarch64/fneg-abs_1.c
>>> new file mode 100644
>>> index 0000000000000000000000000000000000000000..f823013c3ddf6b3a266c3abfcbf2642fc2a75fa6
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_1.c
>>> @@ -0,0 +1,39 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-options "-O3" } */
>>> +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
>>> +
>>> +#pragma GCC target "+nosve"
>>> +
>>> +#include <arm_neon.h>
>>> +
>>> +/*
>>> +** t1:
>>> +** orr v[0-9]+.2s, #128, lsl #24
>>> +** ret
>>> +*/
>>> +float32x2_t t1 (float32x2_t a)
>>> +{
>>> + return vneg_f32 (vabs_f32 (a));
>>> +}
>>> +
>>> +/*
>>> +** t2:
>>> +** orr v[0-9]+.4s, #128, lsl #24
>>> +** ret
>>> +*/
>>> +float32x4_t t2 (float32x4_t a)
>>> +{
>>> + return vnegq_f32 (vabsq_f32 (a));
>>> +}
>>> +
>>> +/*
>>> +** t3:
>>> +** adrp x0, .LC[0-9]+
>>> +** ldr q[0-9]+, \[x0, #:lo12:.LC0\]
>>> +** orr v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
>>> +** ret
>>> +*/
>>> +float64x2_t t3 (float64x2_t a)
>>> +{
>>> + return vnegq_f64 (vabsq_f64 (a));
>>> +}
>>> diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c b/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
>>> new file mode 100644
>>> index 0000000000000000000000000000000000000000..141121176b309e4b2aa413dc55271a6e3c93d5e1
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
>>> @@ -0,0 +1,31 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-options "-O3" } */
>>> +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
>>> +
>>> +#pragma GCC target "+nosve"
>>> +
>>> +#include <arm_neon.h>
>>> +#include <math.h>
>>> +
>>> +/*
>>> +** f1:
>>> +** movi v[0-9]+.2s, 0x80, lsl 24
>>> +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
>>> +** ret
>>> +*/
>>> +float32_t f1 (float32_t a)
>>> +{
>>> + return -fabsf (a);
>>> +}
>>> +
>>> +/*
>>> +** f2:
>>> +** mov x0, -9223372036854775808
>>> +** fmov d[0-9]+, x0
>>> +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
>>> +** ret
>>> +*/
>>> +float64_t f2 (float64_t a)
>>> +{
>>> + return -fabs (a);
>>> +}
>>> diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_3.c b/gcc/testsuite/gcc.target/aarch64/fneg-abs_3.c
>>> new file mode 100644
>>> index 0000000000000000000000000000000000000000..b4652173a95d104ddfa70c497f0627a61ea89d3b
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_3.c
>>> @@ -0,0 +1,36 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-options "-O3" } */
>>> +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
>>> +
>>> +#pragma GCC target "+nosve"
>>> +
>>> +#include <arm_neon.h>
>>> +#include <math.h>
>>> +
>>> +/*
>>> +** f1:
>>> +** ...
>>> +** ldr q[0-9]+, \[x0\]
>>> +** orr v[0-9]+.4s, #128, lsl #24
>>> +** str q[0-9]+, \[x0\], 16
>>> +** ...
>>> +*/
>>> +void f1 (float32_t *a, int n)
>>> +{
>>> + for (int i = 0; i < (n & -8); i++)
>>> + a[i] = -fabsf (a[i]);
>>> +}
>>> +
>>> +/*
>>> +** f2:
>>> +** ...
>>> +** ldr q[0-9]+, \[x0\]
>>> +** orr v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
>>> +** str q[0-9]+, \[x0\], 16
>>> +** ...
>>> +*/
>>> +void f2 (float64_t *a, int n)
>>> +{
>>> + for (int i = 0; i < (n & -8); i++)
>>> + a[i] = -fabs (a[i]);
>>> +}
>>> diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c b/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c
>>> new file mode 100644
>>> index 0000000000000000000000000000000000000000..10879dea74462d34b26160eeb0bd54ead063166b
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c
>>> @@ -0,0 +1,39 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-options "-O3" } */
>>> +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
>>> +
>>> +#pragma GCC target "+nosve"
>>> +
>>> +#include <string.h>
>>> +
>>> +/*
>>> +** negabs:
>>> +** mov x0, -9223372036854775808
>>> +** fmov d[0-9]+, x0
>>> +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
>>> +** ret
>>> +*/
>>> +double negabs (double x)
>>> +{
>>> + unsigned long long y;
>>> + memcpy (&y, &x, sizeof(double));
>>> + y = y | (1UL << 63);
>>> + memcpy (&x, &y, sizeof(double));
>>> + return x;
>>> +}
>>> +
>>> +/*
>>> +** negabsf:
>>> +** movi v[0-9]+.2s, 0x80, lsl 24
>>> +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
>>> +** ret
>>> +*/
>>> +float negabsf (float x)
>>> +{
>>> + unsigned int y;
>>> + memcpy (&y, &x, sizeof(float));
>>> + y = y | (1U << 31);
>>> + memcpy (&x, &y, sizeof(float));
>>> + return x;
>>> +}
>>> +
>>> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c
>>> new file mode 100644
>>> index 0000000000000000000000000000000000000000..0c7664e6de77a497682952653ffd417453854d52
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c
>>> @@ -0,0 +1,37 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-options "-O3" } */
>>> +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
>>> +
>>> +#include <arm_neon.h>
>>> +
>>> +/*
>>> +** t1:
>>> +** orr v[0-9]+.2s, #128, lsl #24
>>> +** ret
>>> +*/
>>> +float32x2_t t1 (float32x2_t a)
>>> +{
>>> + return vneg_f32 (vabs_f32 (a));
>>> +}
>>> +
>>> +/*
>>> +** t2:
>>> +** orr v[0-9]+.4s, #128, lsl #24
>>> +** ret
>>> +*/
>>> +float32x4_t t2 (float32x4_t a)
>>> +{
>>> + return vnegq_f32 (vabsq_f32 (a));
>>> +}
>>> +
>>> +/*
>>> +** t3:
>>> +** adrp x0, .LC[0-9]+
>>> +** ldr q[0-9]+, \[x0, #:lo12:.LC0\]
>>> +** orr v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
>>> +** ret
>>> +*/
>>> +float64x2_t t3 (float64x2_t a)
>>> +{
>>> + return vnegq_f64 (vabsq_f64 (a));
>>> +}
>>> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
>>> new file mode 100644
>>> index 0000000000000000000000000000000000000000..a60cd31b9294af2dac69eed1c93f899bd5c78fca
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
>>> @@ -0,0 +1,29 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-options "-O3" } */
>>> +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
>>> +
>>> +#include <arm_neon.h>
>>> +#include <math.h>
>>> +
>>> +/*
>>> +** f1:
>>> +** movi v[0-9]+.2s, 0x80, lsl 24
>>> +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
>>> +** ret
>>> +*/
>>> +float32_t f1 (float32_t a)
>>> +{
>>> + return -fabsf (a);
>>> +}
>>> +
>>> +/*
>>> +** f2:
>>> +** mov x0, -9223372036854775808
>>> +** fmov d[0-9]+, x0
>>> +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
>>> +** ret
>>> +*/
>>> +float64_t f2 (float64_t a)
>>> +{
>>> + return -fabs (a);
>>> +}
>>> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_3.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_3.c
>>> new file mode 100644
>>> index 0000000000000000000000000000000000000000..1bf34328d8841de8e6b0a5458562a9f00e31c275
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_3.c
>>> @@ -0,0 +1,34 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-options "-O3" } */
>>> +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
>>> +
>>> +#include <arm_neon.h>
>>> +#include <math.h>
>>> +
>>> +/*
>>> +** f1:
>>> +** ...
>>> +** ld1w z[0-9]+.s, p[0-9]+/z, \[x0, x2, lsl 2\]
>>> +** orr z[0-9]+.s, z[0-9]+.s, #0x80000000
>>> +** st1w z[0-9]+.s, p[0-9]+, \[x0, x2, lsl 2\]
>>> +** ...
>>> +*/
>>> +void f1 (float32_t *a, int n)
>>> +{
>>> + for (int i = 0; i < (n & -8); i++)
>>> + a[i] = -fabsf (a[i]);
>>> +}
>>> +
>>> +/*
>>> +** f2:
>>> +** ...
>>> +** ld1d z[0-9]+.d, p[0-9]+/z, \[x0, x2, lsl 3\]
>>> +** orr z[0-9]+.d, z[0-9]+.d, #0x8000000000000000
>>> +** st1d z[0-9]+.d, p[0-9]+, \[x0, x2, lsl 3\]
>>> +** ...
>>> +*/
>>> +void f2 (float64_t *a, int n)
>>> +{
>>> + for (int i = 0; i < (n & -8); i++)
>>> + a[i] = -fabs (a[i]);
>>> +}
>>> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_4.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_4.c
>>> new file mode 100644
>>> index 0000000000000000000000000000000000000000..21f2a8da2a5d44e3d01f6604ca7be87e3744d494
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_4.c
>>> @@ -0,0 +1,37 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-options "-O3" } */
>>> +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
>>> +
>>> +#include <string.h>
>>> +
>>> +/*
>>> +** negabs:
>>> +** mov x0, -9223372036854775808
>>> +** fmov d[0-9]+, x0
>>> +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
>>> +** ret
>>> +*/
>>> +double negabs (double x)
>>> +{
>>> + unsigned long long y;
>>> + memcpy (&y, &x, sizeof(double));
>>> + y = y | (1UL << 63);
>>> + memcpy (&x, &y, sizeof(double));
>>> + return x;
>>> +}
>>> +
>>> +/*
>>> +** negabsf:
>>> +** movi v[0-9]+.2s, 0x80, lsl 24
>>> +** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
>>> +** ret
>>> +*/
>>> +float negabsf (float x)
>>> +{
>>> + unsigned int y;
>>> + memcpy (&y, &x, sizeof(float));
>>> + y = y | (1U << 31);
>>> + memcpy (&x, &y, sizeof(float));
>>> + return x;
>>> +}
>>> +
>>>
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH]middle-end match.pd: optimize fneg (fabs (x)) to x | (1 << signbit(x)) [PR109154]
2023-10-07 10:34 ` Richard Biener
@ 2023-10-07 11:34 ` Richard Sandiford
2023-10-09 7:20 ` Richard Biener
0 siblings, 1 reply; 17+ messages in thread
From: Richard Sandiford @ 2023-10-07 11:34 UTC (permalink / raw)
To: Richard Biener; +Cc: Tamar Christina, Andrew Pinski, gcc-patches, nd, jlaw
Richard Biener <rguenther@suse.de> writes:
>> Am 07.10.2023 um 11:23 schrieb Richard Sandiford <richard.sandiford@arm.com>>> Richard Biener <rguenther@suse.de> writes:
>>> On Thu, 5 Oct 2023, Tamar Christina wrote:
>>>
>>>>> I suppose the idea is that -abs(x) might be easier to optimize with other
>>>>> patterns (consider a - copysign(x,...), optimizing to a + abs(x)).
>>>>>
>>>>> For abs vs copysign it's a canonicalization, but (negate (abs @0)) is less
>>>>> canonical than copysign.
>>>>>
>>>>>> Should I try removing this?
>>>>>
>>>>> I'd say yes (and put the reverse canonicalization next to this pattern).
>>>>>
>>>>
>>>> This patch transforms fneg (fabs (x)) into copysign (x, -1) which is more
>>>> canonical and allows a target to expand this sequence efficiently. Such
>>>> sequences are common in scientific code working with gradients.
>>>>
>>>> various optimizations in match.pd only happened on COPYSIGN but not COPYSIGN_ALL
>>>> which means they exclude IFN_COPYSIGN. COPYSIGN however is restricted to only
>>>
>>> That's not true:
>>>
>>> (define_operator_list COPYSIGN
>>> BUILT_IN_COPYSIGNF
>>> BUILT_IN_COPYSIGN
>>> BUILT_IN_COPYSIGNL
>>> IFN_COPYSIGN)
>>>
>>> but they miss the extended float builtin variants like
>>> __builtin_copysignf16. Also see below
>>>
>>>> the C99 builtins and so doesn't work for vectors.
>>>>
>>>> The patch expands these optimizations to work on COPYSIGN_ALL.
>>>>
>>>> There is an existing canonicalization of copysign (x, -1) to fneg (fabs (x))
>>>> which I remove since this is a less efficient form. The testsuite is also
>>>> updated in light of this.
>>>>
>>>> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>>>>
>>>> Ok for master?
>>>>
>>>> Thanks,
>>>> Tamar
>>>>
>>>> gcc/ChangeLog:
>>>>
>>>> PR tree-optimization/109154
>>>> * match.pd: Add new neg+abs rule, remove inverse copysign rule and
>>>> expand existing copysign optimizations.
>>>>
>>>> gcc/testsuite/ChangeLog:
>>>>
>>>> PR tree-optimization/109154
>>>> * gcc.dg/fold-copysign-1.c: Updated.
>>>> * gcc.dg/pr55152-2.c: Updated.
>>>> * gcc.dg/tree-ssa/abs-4.c: Updated.
>>>> * gcc.dg/tree-ssa/backprop-6.c: Updated.
>>>> * gcc.dg/tree-ssa/copy-sign-2.c: Updated.
>>>> * gcc.dg/tree-ssa/mult-abs-2.c: Updated.
>>>> * gcc.target/aarch64/fneg-abs_1.c: New test.
>>>> * gcc.target/aarch64/fneg-abs_2.c: New test.
>>>> * gcc.target/aarch64/fneg-abs_3.c: New test.
>>>> * gcc.target/aarch64/fneg-abs_4.c: New test.
>>>> * gcc.target/aarch64/sve/fneg-abs_1.c: New test.
>>>> * gcc.target/aarch64/sve/fneg-abs_2.c: New test.
>>>> * gcc.target/aarch64/sve/fneg-abs_3.c: New test.
>>>> * gcc.target/aarch64/sve/fneg-abs_4.c: New test.
>>>>
>>>> --- inline copy of patch ---
>>>>
>>>> diff --git a/gcc/match.pd b/gcc/match.pd
>>>> index 4bdd83e6e061b16dbdb2845b9398fcfb8a6c9739..bd6599d36021e119f51a4928354f580ffe82c6e2 100644
>>>> --- a/gcc/match.pd
>>>> +++ b/gcc/match.pd
>>>> @@ -1074,45 +1074,43 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>>>>
>>>> /* cos(copysign(x, y)) -> cos(x). Similarly for cosh. */
>>>> (for coss (COS COSH)
>>>> - copysigns (COPYSIGN)
>>>> - (simplify
>>>> - (coss (copysigns @0 @1))
>>>> - (coss @0)))
>>>> + (for copysigns (COPYSIGN_ALL)
>>>
>>> So this ends up generating for example the match
>>> (cosf (copysignl ...)) which doesn't make much sense.
>>>
>>> The lock-step iteration did
>>> (cosf (copysignf ..)) ... (ifn_cos (ifn_copysign ...))
>>> which is leaner but misses the case of
>>> (cosf (ifn_copysign ..)) - that's probably what you are
>>> after with this change.
>>>
>>> That said, there isn't a nice solution (without altering the match.pd
>>> IL). There's the explicit solution, spelling out all combinations.
>>>
>>> So if we want to go with yout pragmatic solution changing this
>>> to use COPYSIGN_ALL isn't necessary, only changing the lock-step
>>> for iteration to a cross product for iteration is.
>>>
>>> Changing just this pattern to
>>>
>>> (for coss (COS COSH)
>>> (for copysigns (COPYSIGN)
>>> (simplify
>>> (coss (copysigns @0 @1))
>>> (coss @0))))
>>>
>>> increases the total number of gimple-match-x.cc lines from
>>> 234988 to 235324.
>>
>> I guess the difference between this and the later suggestions is that
>> this one allows builtin copysign to be paired with ifn cos, which would
>> be potentially useful in other situations. (It isn't here because
>> ifn_cos is rarely provided.) How much of the growth is due to that,
>> and much of it is from nonsensical combinations like
>> (builtin_cosf (builtin_copysignl ...))?
>>
>> If it's mostly from nonsensical combinations then would it be possible
>> to make genmatch drop them?
>>
>>> The alternative is to do
>>>
>>> (for coss (COS COSH)
>>> copysigns (COPYSIGN)
>>> (simplify
>>> (coss (copysigns @0 @1))
>>> (coss @0))
>>> (simplify
>>> (coss (IFN_COPYSIGN @0 @1))
>>> (coss @0)))
>>>
>>> which properly will diagnose a duplicate pattern. Ther are
>>> currently no operator lists with just builtins defined (that
>>> could be fixed, see gencfn-macros.cc), supposed we'd have
>>> COS_C we could do
>>>
>>> (for coss (COS_C COSH_C IFN_COS IFN_COSH)
>>> copysigns (COPYSIGN_C COPYSIGN_C IFN_COPYSIGN IFN_COPYSIGN
>>> IFN_COPYSIGN IFN_COPYSIGN IFN_COPYSIGN IFN_COPYSIGN IFN_COPYSIGN
>>> IFN_COPYSIGN)
>>> (simplify
>>> (coss (copysigns @0 @1))
>>> (coss @0)))
>>>
>>> which of course still looks ugly ;) (some syntax extension like
>>> allowing to specify IFN_COPYSIGN*8 would be nice here and easy
>>> enough to do)
>>>
>>> Can you split out the part changing COPYSIGN to COPYSIGN_ALL,
>>> re-do it to only split the fors, keeping COPYSIGN and provide
>>> some statistics on the gimple-match-* size? I think this might
>>> be the pragmatic solution for now.
>>>
>>> Richard - can you think of a clever way to express the desired
>>> iteration? How do RTL macro iterations address cases like this?
>>
>> I don't think .md files have an equivalent construct, unfortunately.
>> (I also regret some of the choices I made for .md iterators, but that's
>> another story.)
>>
>> Perhaps an alternative to the *8 thing would be "IFN_COPYSIGN...",
>> with the "..." meaning "fill to match the longest operator list
>> in the loop".
>
> Hm, I’ll think about this. It would be useful to have a function like
>
> Internal_fn ifn_for (combined_fn);
>
> So we can indirectly match all builtins with a switch on the ifn code.
There's:
extern internal_fn associated_internal_fn (combined_fn, tree);
extern internal_fn associated_internal_fn (tree);
extern internal_fn replacement_internal_fn (gcall *);
where the first one requires the return type, and the second one
operates on CALL_EXPRs.
Thanks,
Richard
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH]middle-end match.pd: optimize fneg (fabs (x)) to x | (1 << signbit(x)) [PR109154]
2023-10-07 11:34 ` Richard Sandiford
@ 2023-10-09 7:20 ` Richard Biener
2023-10-09 7:36 ` Andrew Pinski
0 siblings, 1 reply; 17+ messages in thread
From: Richard Biener @ 2023-10-09 7:20 UTC (permalink / raw)
To: Richard Sandiford; +Cc: Tamar Christina, Andrew Pinski, gcc-patches, nd, jlaw
[-- Attachment #1: Type: text/plain, Size: 10759 bytes --]
On Sat, 7 Oct 2023, Richard Sandiford wrote:
> Richard Biener <rguenther@suse.de> writes:
> >> Am 07.10.2023 um 11:23 schrieb Richard Sandiford <richard.sandiford@arm.com>>> Richard Biener <rguenther@suse.de> writes:
> >>> On Thu, 5 Oct 2023, Tamar Christina wrote:
> >>>
> >>>>> I suppose the idea is that -abs(x) might be easier to optimize with other
> >>>>> patterns (consider a - copysign(x,...), optimizing to a + abs(x)).
> >>>>>
> >>>>> For abs vs copysign it's a canonicalization, but (negate (abs @0)) is less
> >>>>> canonical than copysign.
> >>>>>
> >>>>>> Should I try removing this?
> >>>>>
> >>>>> I'd say yes (and put the reverse canonicalization next to this pattern).
> >>>>>
> >>>>
> >>>> This patch transforms fneg (fabs (x)) into copysign (x, -1) which is more
> >>>> canonical and allows a target to expand this sequence efficiently. Such
> >>>> sequences are common in scientific code working with gradients.
> >>>>
> >>>> various optimizations in match.pd only happened on COPYSIGN but not COPYSIGN_ALL
> >>>> which means they exclude IFN_COPYSIGN. COPYSIGN however is restricted to only
> >>>
> >>> That's not true:
> >>>
> >>> (define_operator_list COPYSIGN
> >>> BUILT_IN_COPYSIGNF
> >>> BUILT_IN_COPYSIGN
> >>> BUILT_IN_COPYSIGNL
> >>> IFN_COPYSIGN)
> >>>
> >>> but they miss the extended float builtin variants like
> >>> __builtin_copysignf16. Also see below
> >>>
> >>>> the C99 builtins and so doesn't work for vectors.
> >>>>
> >>>> The patch expands these optimizations to work on COPYSIGN_ALL.
> >>>>
> >>>> There is an existing canonicalization of copysign (x, -1) to fneg (fabs (x))
> >>>> which I remove since this is a less efficient form. The testsuite is also
> >>>> updated in light of this.
> >>>>
> >>>> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> >>>>
> >>>> Ok for master?
> >>>>
> >>>> Thanks,
> >>>> Tamar
> >>>>
> >>>> gcc/ChangeLog:
> >>>>
> >>>> PR tree-optimization/109154
> >>>> * match.pd: Add new neg+abs rule, remove inverse copysign rule and
> >>>> expand existing copysign optimizations.
> >>>>
> >>>> gcc/testsuite/ChangeLog:
> >>>>
> >>>> PR tree-optimization/109154
> >>>> * gcc.dg/fold-copysign-1.c: Updated.
> >>>> * gcc.dg/pr55152-2.c: Updated.
> >>>> * gcc.dg/tree-ssa/abs-4.c: Updated.
> >>>> * gcc.dg/tree-ssa/backprop-6.c: Updated.
> >>>> * gcc.dg/tree-ssa/copy-sign-2.c: Updated.
> >>>> * gcc.dg/tree-ssa/mult-abs-2.c: Updated.
> >>>> * gcc.target/aarch64/fneg-abs_1.c: New test.
> >>>> * gcc.target/aarch64/fneg-abs_2.c: New test.
> >>>> * gcc.target/aarch64/fneg-abs_3.c: New test.
> >>>> * gcc.target/aarch64/fneg-abs_4.c: New test.
> >>>> * gcc.target/aarch64/sve/fneg-abs_1.c: New test.
> >>>> * gcc.target/aarch64/sve/fneg-abs_2.c: New test.
> >>>> * gcc.target/aarch64/sve/fneg-abs_3.c: New test.
> >>>> * gcc.target/aarch64/sve/fneg-abs_4.c: New test.
> >>>>
> >>>> --- inline copy of patch ---
> >>>>
> >>>> diff --git a/gcc/match.pd b/gcc/match.pd
> >>>> index 4bdd83e6e061b16dbdb2845b9398fcfb8a6c9739..bd6599d36021e119f51a4928354f580ffe82c6e2 100644
> >>>> --- a/gcc/match.pd
> >>>> +++ b/gcc/match.pd
> >>>> @@ -1074,45 +1074,43 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> >>>>
> >>>> /* cos(copysign(x, y)) -> cos(x). Similarly for cosh. */
> >>>> (for coss (COS COSH)
> >>>> - copysigns (COPYSIGN)
> >>>> - (simplify
> >>>> - (coss (copysigns @0 @1))
> >>>> - (coss @0)))
> >>>> + (for copysigns (COPYSIGN_ALL)
> >>>
> >>> So this ends up generating for example the match
> >>> (cosf (copysignl ...)) which doesn't make much sense.
> >>>
> >>> The lock-step iteration did
> >>> (cosf (copysignf ..)) ... (ifn_cos (ifn_copysign ...))
> >>> which is leaner but misses the case of
> >>> (cosf (ifn_copysign ..)) - that's probably what you are
> >>> after with this change.
> >>>
> >>> That said, there isn't a nice solution (without altering the match.pd
> >>> IL). There's the explicit solution, spelling out all combinations.
> >>>
> >>> So if we want to go with yout pragmatic solution changing this
> >>> to use COPYSIGN_ALL isn't necessary, only changing the lock-step
> >>> for iteration to a cross product for iteration is.
> >>>
> >>> Changing just this pattern to
> >>>
> >>> (for coss (COS COSH)
> >>> (for copysigns (COPYSIGN)
> >>> (simplify
> >>> (coss (copysigns @0 @1))
> >>> (coss @0))))
> >>>
> >>> increases the total number of gimple-match-x.cc lines from
> >>> 234988 to 235324.
> >>
> >> I guess the difference between this and the later suggestions is that
> >> this one allows builtin copysign to be paired with ifn cos, which would
> >> be potentially useful in other situations. (It isn't here because
> >> ifn_cos is rarely provided.) How much of the growth is due to that,
> >> and much of it is from nonsensical combinations like
> >> (builtin_cosf (builtin_copysignl ...))?
> >>
> >> If it's mostly from nonsensical combinations then would it be possible
> >> to make genmatch drop them?
> >>
> >>> The alternative is to do
> >>>
> >>> (for coss (COS COSH)
> >>> copysigns (COPYSIGN)
> >>> (simplify
> >>> (coss (copysigns @0 @1))
> >>> (coss @0))
> >>> (simplify
> >>> (coss (IFN_COPYSIGN @0 @1))
> >>> (coss @0)))
> >>>
> >>> which properly will diagnose a duplicate pattern. Ther are
> >>> currently no operator lists with just builtins defined (that
> >>> could be fixed, see gencfn-macros.cc), supposed we'd have
> >>> COS_C we could do
> >>>
> >>> (for coss (COS_C COSH_C IFN_COS IFN_COSH)
> >>> copysigns (COPYSIGN_C COPYSIGN_C IFN_COPYSIGN IFN_COPYSIGN
> >>> IFN_COPYSIGN IFN_COPYSIGN IFN_COPYSIGN IFN_COPYSIGN IFN_COPYSIGN
> >>> IFN_COPYSIGN)
> >>> (simplify
> >>> (coss (copysigns @0 @1))
> >>> (coss @0)))
> >>>
> >>> which of course still looks ugly ;) (some syntax extension like
> >>> allowing to specify IFN_COPYSIGN*8 would be nice here and easy
> >>> enough to do)
> >>>
> >>> Can you split out the part changing COPYSIGN to COPYSIGN_ALL,
> >>> re-do it to only split the fors, keeping COPYSIGN and provide
> >>> some statistics on the gimple-match-* size? I think this might
> >>> be the pragmatic solution for now.
> >>>
> >>> Richard - can you think of a clever way to express the desired
> >>> iteration? How do RTL macro iterations address cases like this?
> >>
> >> I don't think .md files have an equivalent construct, unfortunately.
> >> (I also regret some of the choices I made for .md iterators, but that's
> >> another story.)
> >>
> >> Perhaps an alternative to the *8 thing would be "IFN_COPYSIGN...",
> >> with the "..." meaning "fill to match the longest operator list
> >> in the loop".
> >
> > Hm, I'll think about this. It would be useful to have a function like
> >
> > Internal_fn ifn_for (combined_fn);
> >
> > So we can indirectly match all builtins with a switch on the ifn code.
>
> There's:
>
> extern internal_fn associated_internal_fn (combined_fn, tree);
> extern internal_fn associated_internal_fn (tree);
> extern internal_fn replacement_internal_fn (gcall *);
>
> where the first one requires the return type, and the second one
> operates on CALL_EXPRs.
Hmm, for full generality the way we code-generate would need to change
quite a bit. Instead I've come up with the following quite limited
approach. You can write
(for coss (COS COSH)
(simplify
(coss (ANY_COPYSIGN @0 @1))
(coss @0))))
with it. For each internal function the following patch adds a
ANY_<name> identifier. The use is somewhat limited - you cannot
use it as the outermost operation in the match part and you cannot
use it in the replacement part at all. The nice thing is there's
no "iteration" at all, the ANY_COPYSIGN doesn't cause any pattern
duplication, instead we match it via CASE_CFN_<name> so it will
happily match mis-matched (typewise) calls (but those shouldn't
be there...).
The patch doesn't contain any defensiveness in the parser for the
use restriction, but you should get compile failures for misuses
at least.
It should match quite some of the copysign cases, I suspect it's
of no use for most of the operators so maybe less general handling
and only specifically introducing ANY_COPYSIGN would be better.
At least I cannot think of any other functions that are matched
but disappear in the resulting replacement?
Richard.
diff --git a/gcc/genmatch.cc b/gcc/genmatch.cc
index 03d325efdf6..f7d3f51c013 100644
--- a/gcc/genmatch.cc
+++ b/gcc/genmatch.cc
@@ -524,10 +524,14 @@ class fn_id : public id_base
{
public:
fn_id (enum built_in_function fn_, const char *id_)
- : id_base (id_base::FN, id_), fn (fn_) {}
+ : id_base (id_base::FN, id_), fn (fn_), case_macro (nullptr) {}
fn_id (enum internal_fn fn_, const char *id_)
- : id_base (id_base::FN, id_), fn (int (END_BUILTINS) + int (fn_)) {}
+ : id_base (id_base::FN, id_), fn (int (END_BUILTINS) + int (fn_)),
+ case_macro (nullptr) {}
+ fn_id (const char *case_macro_, const char *id_)
+ : id_base (id_base::FN, id_), fn (-1U), case_macro (case_macro_) {}
unsigned int fn;
+ const char *case_macro;
};
class simplify;
@@ -3262,6 +3266,10 @@ dt_node::gen_kids_1 (FILE *f, int indent, bool gimple, int depth,
if (user_id *u = dyn_cast <user_id *> (e->operation))
for (auto id : u->substitutes)
fprintf_indent (f, indent, "case %s:\n", id->id);
+ else if (is_a <fn_id *> (e->operation)
+ && as_a <fn_id *> (e->operation)->case_macro)
+ fprintf_indent (f, indent, "%s:\n",
+ as_a <fn_id *> (e->operation)->case_macro);
else
fprintf_indent (f, indent, "case %s:\n", e->operation->id);
/* We need to be defensive against bogus prototypes allowing
@@ -3337,9 +3345,12 @@ dt_node::gen_kids_1 (FILE *f, int indent, bool gimple, int depth,
for (unsigned j = 0; j < generic_fns.length (); ++j)
{
expr *e = as_a <expr *>(generic_fns[j]->op);
- gcc_assert (e->operation->kind == id_base::FN);
+ fn_id *oper = as_a <fn_id *> (e->operation);
- fprintf_indent (f, indent, "case %s:\n", e->operation->id);
+ if (oper->case_macro)
+ fprintf_indent (f, indent, "%s:\n", oper->case_macro);
+ else
+ fprintf_indent (f, indent, "case %s:\n", e->operation->id);
fprintf_indent (f, indent, " if (call_expr_nargs (%s) == %d)\n"
" {\n", kid_opname, e->ops.length ());
generic_fns[j]->gen (f, indent + 6, false, depth);
@@ -5496,7 +5507,8 @@ main (int argc, char **argv)
#include "builtins.def"
#define DEF_INTERNAL_FN(CODE, NAME, FNSPEC) \
- add_function (IFN_##CODE, "CFN_" #CODE);
+ add_function (IFN_##CODE, "CFN_" #CODE); \
+ add_function ("CASE_CFN_" # CODE, "ANY_" # CODE);
#include "internal-fn.def"
/* Parse ahead! */
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH]middle-end match.pd: optimize fneg (fabs (x)) to x | (1 << signbit(x)) [PR109154]
2023-10-09 7:20 ` Richard Biener
@ 2023-10-09 7:36 ` Andrew Pinski
2023-10-09 9:06 ` Richard Biener
0 siblings, 1 reply; 17+ messages in thread
From: Andrew Pinski @ 2023-10-09 7:36 UTC (permalink / raw)
To: Richard Biener; +Cc: Richard Sandiford, Tamar Christina, gcc-patches, nd, jlaw
On Mon, Oct 9, 2023 at 12:20 AM Richard Biener <rguenther@suse.de> wrote:
>
> On Sat, 7 Oct 2023, Richard Sandiford wrote:
>
> > Richard Biener <rguenther@suse.de> writes:
> > >> Am 07.10.2023 um 11:23 schrieb Richard Sandiford <richard.sandiford@arm.com>>> Richard Biener <rguenther@suse.de> writes:
> > >>> On Thu, 5 Oct 2023, Tamar Christina wrote:
> > >>>
> > >>>>> I suppose the idea is that -abs(x) might be easier to optimize with other
> > >>>>> patterns (consider a - copysign(x,...), optimizing to a + abs(x)).
> > >>>>>
> > >>>>> For abs vs copysign it's a canonicalization, but (negate (abs @0)) is less
> > >>>>> canonical than copysign.
> > >>>>>
> > >>>>>> Should I try removing this?
> > >>>>>
> > >>>>> I'd say yes (and put the reverse canonicalization next to this pattern).
> > >>>>>
> > >>>>
> > >>>> This patch transforms fneg (fabs (x)) into copysign (x, -1) which is more
> > >>>> canonical and allows a target to expand this sequence efficiently. Such
> > >>>> sequences are common in scientific code working with gradients.
> > >>>>
> > >>>> various optimizations in match.pd only happened on COPYSIGN but not COPYSIGN_ALL
> > >>>> which means they exclude IFN_COPYSIGN. COPYSIGN however is restricted to only
> > >>>
> > >>> That's not true:
> > >>>
> > >>> (define_operator_list COPYSIGN
> > >>> BUILT_IN_COPYSIGNF
> > >>> BUILT_IN_COPYSIGN
> > >>> BUILT_IN_COPYSIGNL
> > >>> IFN_COPYSIGN)
> > >>>
> > >>> but they miss the extended float builtin variants like
> > >>> __builtin_copysignf16. Also see below
> > >>>
> > >>>> the C99 builtins and so doesn't work for vectors.
> > >>>>
> > >>>> The patch expands these optimizations to work on COPYSIGN_ALL.
> > >>>>
> > >>>> There is an existing canonicalization of copysign (x, -1) to fneg (fabs (x))
> > >>>> which I remove since this is a less efficient form. The testsuite is also
> > >>>> updated in light of this.
> > >>>>
> > >>>> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> > >>>>
> > >>>> Ok for master?
> > >>>>
> > >>>> Thanks,
> > >>>> Tamar
> > >>>>
> > >>>> gcc/ChangeLog:
> > >>>>
> > >>>> PR tree-optimization/109154
> > >>>> * match.pd: Add new neg+abs rule, remove inverse copysign rule and
> > >>>> expand existing copysign optimizations.
> > >>>>
> > >>>> gcc/testsuite/ChangeLog:
> > >>>>
> > >>>> PR tree-optimization/109154
> > >>>> * gcc.dg/fold-copysign-1.c: Updated.
> > >>>> * gcc.dg/pr55152-2.c: Updated.
> > >>>> * gcc.dg/tree-ssa/abs-4.c: Updated.
> > >>>> * gcc.dg/tree-ssa/backprop-6.c: Updated.
> > >>>> * gcc.dg/tree-ssa/copy-sign-2.c: Updated.
> > >>>> * gcc.dg/tree-ssa/mult-abs-2.c: Updated.
> > >>>> * gcc.target/aarch64/fneg-abs_1.c: New test.
> > >>>> * gcc.target/aarch64/fneg-abs_2.c: New test.
> > >>>> * gcc.target/aarch64/fneg-abs_3.c: New test.
> > >>>> * gcc.target/aarch64/fneg-abs_4.c: New test.
> > >>>> * gcc.target/aarch64/sve/fneg-abs_1.c: New test.
> > >>>> * gcc.target/aarch64/sve/fneg-abs_2.c: New test.
> > >>>> * gcc.target/aarch64/sve/fneg-abs_3.c: New test.
> > >>>> * gcc.target/aarch64/sve/fneg-abs_4.c: New test.
> > >>>>
> > >>>> --- inline copy of patch ---
> > >>>>
> > >>>> diff --git a/gcc/match.pd b/gcc/match.pd
> > >>>> index 4bdd83e6e061b16dbdb2845b9398fcfb8a6c9739..bd6599d36021e119f51a4928354f580ffe82c6e2 100644
> > >>>> --- a/gcc/match.pd
> > >>>> +++ b/gcc/match.pd
> > >>>> @@ -1074,45 +1074,43 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> > >>>>
> > >>>> /* cos(copysign(x, y)) -> cos(x). Similarly for cosh. */
> > >>>> (for coss (COS COSH)
> > >>>> - copysigns (COPYSIGN)
> > >>>> - (simplify
> > >>>> - (coss (copysigns @0 @1))
> > >>>> - (coss @0)))
> > >>>> + (for copysigns (COPYSIGN_ALL)
> > >>>
> > >>> So this ends up generating for example the match
> > >>> (cosf (copysignl ...)) which doesn't make much sense.
> > >>>
> > >>> The lock-step iteration did
> > >>> (cosf (copysignf ..)) ... (ifn_cos (ifn_copysign ...))
> > >>> which is leaner but misses the case of
> > >>> (cosf (ifn_copysign ..)) - that's probably what you are
> > >>> after with this change.
> > >>>
> > >>> That said, there isn't a nice solution (without altering the match.pd
> > >>> IL). There's the explicit solution, spelling out all combinations.
> > >>>
> > >>> So if we want to go with yout pragmatic solution changing this
> > >>> to use COPYSIGN_ALL isn't necessary, only changing the lock-step
> > >>> for iteration to a cross product for iteration is.
> > >>>
> > >>> Changing just this pattern to
> > >>>
> > >>> (for coss (COS COSH)
> > >>> (for copysigns (COPYSIGN)
> > >>> (simplify
> > >>> (coss (copysigns @0 @1))
> > >>> (coss @0))))
> > >>>
> > >>> increases the total number of gimple-match-x.cc lines from
> > >>> 234988 to 235324.
> > >>
> > >> I guess the difference between this and the later suggestions is that
> > >> this one allows builtin copysign to be paired with ifn cos, which would
> > >> be potentially useful in other situations. (It isn't here because
> > >> ifn_cos is rarely provided.) How much of the growth is due to that,
> > >> and much of it is from nonsensical combinations like
> > >> (builtin_cosf (builtin_copysignl ...))?
> > >>
> > >> If it's mostly from nonsensical combinations then would it be possible
> > >> to make genmatch drop them?
> > >>
> > >>> The alternative is to do
> > >>>
> > >>> (for coss (COS COSH)
> > >>> copysigns (COPYSIGN)
> > >>> (simplify
> > >>> (coss (copysigns @0 @1))
> > >>> (coss @0))
> > >>> (simplify
> > >>> (coss (IFN_COPYSIGN @0 @1))
> > >>> (coss @0)))
> > >>>
> > >>> which properly will diagnose a duplicate pattern. Ther are
> > >>> currently no operator lists with just builtins defined (that
> > >>> could be fixed, see gencfn-macros.cc), supposed we'd have
> > >>> COS_C we could do
> > >>>
> > >>> (for coss (COS_C COSH_C IFN_COS IFN_COSH)
> > >>> copysigns (COPYSIGN_C COPYSIGN_C IFN_COPYSIGN IFN_COPYSIGN
> > >>> IFN_COPYSIGN IFN_COPYSIGN IFN_COPYSIGN IFN_COPYSIGN IFN_COPYSIGN
> > >>> IFN_COPYSIGN)
> > >>> (simplify
> > >>> (coss (copysigns @0 @1))
> > >>> (coss @0)))
> > >>>
> > >>> which of course still looks ugly ;) (some syntax extension like
> > >>> allowing to specify IFN_COPYSIGN*8 would be nice here and easy
> > >>> enough to do)
> > >>>
> > >>> Can you split out the part changing COPYSIGN to COPYSIGN_ALL,
> > >>> re-do it to only split the fors, keeping COPYSIGN and provide
> > >>> some statistics on the gimple-match-* size? I think this might
> > >>> be the pragmatic solution for now.
> > >>>
> > >>> Richard - can you think of a clever way to express the desired
> > >>> iteration? How do RTL macro iterations address cases like this?
> > >>
> > >> I don't think .md files have an equivalent construct, unfortunately.
> > >> (I also regret some of the choices I made for .md iterators, but that's
> > >> another story.)
> > >>
> > >> Perhaps an alternative to the *8 thing would be "IFN_COPYSIGN...",
> > >> with the "..." meaning "fill to match the longest operator list
> > >> in the loop".
> > >
> > > Hm, I'll think about this. It would be useful to have a function like
> > >
> > > Internal_fn ifn_for (combined_fn);
> > >
> > > So we can indirectly match all builtins with a switch on the ifn code.
> >
> > There's:
> >
> > extern internal_fn associated_internal_fn (combined_fn, tree);
> > extern internal_fn associated_internal_fn (tree);
> > extern internal_fn replacement_internal_fn (gcall *);
> >
> > where the first one requires the return type, and the second one
> > operates on CALL_EXPRs.
>
> Hmm, for full generality the way we code-generate would need to change
> quite a bit. Instead I've come up with the following quite limited
> approach. You can write
>
> (for coss (COS COSH)
> (simplify
> (coss (ANY_COPYSIGN @0 @1))
> (coss @0))))
This optimization is also handled by backprop (gimple-ssa-backprop.cc)
in a better way than the match code handles it.
So maybe we don't really need to extend match-and-simplify here.
Right now backprop is only run once, early, after inlining. Maybe running it
once more, later in the pipeline, would help?
Thanks,
Andrew
>
> with it. For each internal function the following patch adds a
> ANY_<name> identifier. The use is somewhat limited - you cannot
> use it as the outermost operation in the match part and you cannot
> use it in the replacement part at all. The nice thing is there's
> no "iteration" at all, the ANY_COPYSIGN doesn't cause any pattern
> duplication, instead we match it via CASE_CFN_<name> so it will
> happily match mis-matched (typewise) calls (but those shouldn't
> be there...).
>
> The patch doesn't contain any defensiveness in the parser for the
> use restriction, but you should get compile failures for misuses
> at least.
>
> It should match quite some of the copysign cases, I suspect its
> of no use for most of the operators so maybe less general handling
> and only specifically introducing ANY_COPYSIGN would be better.
> At least I cannot think of any other functions that are matched
> but disappear in the resulting replacement?
>
> Richard.
>
> diff --git a/gcc/genmatch.cc b/gcc/genmatch.cc
> index 03d325efdf6..f7d3f51c013 100644
> --- a/gcc/genmatch.cc
> +++ b/gcc/genmatch.cc
> @@ -524,10 +524,14 @@ class fn_id : public id_base
> {
> public:
> fn_id (enum built_in_function fn_, const char *id_)
> - : id_base (id_base::FN, id_), fn (fn_) {}
> + : id_base (id_base::FN, id_), fn (fn_), case_macro (nullptr) {}
> fn_id (enum internal_fn fn_, const char *id_)
> - : id_base (id_base::FN, id_), fn (int (END_BUILTINS) + int (fn_)) {}
> + : id_base (id_base::FN, id_), fn (int (END_BUILTINS) + int (fn_)),
> + case_macro (nullptr) {}
> + fn_id (const char *case_macro_, const char *id_)
> + : id_base (id_base::FN, id_), fn (-1U), case_macro (case_macro_) {}
> unsigned int fn;
> + const char *case_macro;
> };
>
> class simplify;
> @@ -3262,6 +3266,10 @@ dt_node::gen_kids_1 (FILE *f, int indent, bool gimple, int depth,
> if (user_id *u = dyn_cast <user_id *> (e->operation))
> for (auto id : u->substitutes)
> fprintf_indent (f, indent, "case %s:\n", id->id);
> + else if (is_a <fn_id *> (e->operation)
> + && as_a <fn_id *> (e->operation)->case_macro)
> + fprintf_indent (f, indent, "%s:\n",
> + as_a <fn_id *> (e->operation)->case_macro);
> else
> fprintf_indent (f, indent, "case %s:\n", e->operation->id);
> /* We need to be defensive against bogus prototypes allowing
> @@ -3337,9 +3345,12 @@ dt_node::gen_kids_1 (FILE *f, int indent, bool gimple, int depth,
> for (unsigned j = 0; j < generic_fns.length (); ++j)
> {
> expr *e = as_a <expr *>(generic_fns[j]->op);
> - gcc_assert (e->operation->kind == id_base::FN);
> + fn_id *oper = as_a <fn_id *> (e->operation);
>
> - fprintf_indent (f, indent, "case %s:\n", e->operation->id);
> + if (oper->case_macro)
> + fprintf_indent (f, indent, "%s:\n", oper->case_macro);
> + else
> + fprintf_indent (f, indent, "case %s:\n", e->operation->id);
> fprintf_indent (f, indent, " if (call_expr_nargs (%s) == %d)\n"
> " {\n", kid_opname, e->ops.length ());
> generic_fns[j]->gen (f, indent + 6, false, depth);
> @@ -5496,7 +5507,8 @@ main (int argc, char **argv)
> #include "builtins.def"
>
> #define DEF_INTERNAL_FN(CODE, NAME, FNSPEC) \
> - add_function (IFN_##CODE, "CFN_" #CODE);
> + add_function (IFN_##CODE, "CFN_" #CODE); \
> + add_function ("CASE_CFN_" # CODE, "ANY_" # CODE);
> #include "internal-fn.def"
>
> /* Parse ahead! */
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH]middle-end match.pd: optimize fneg (fabs (x)) to x | (1 << signbit(x)) [PR109154]
2023-10-09 7:36 ` Andrew Pinski
@ 2023-10-09 9:06 ` Richard Biener
0 siblings, 0 replies; 17+ messages in thread
From: Richard Biener @ 2023-10-09 9:06 UTC (permalink / raw)
To: Andrew Pinski; +Cc: Richard Sandiford, Tamar Christina, gcc-patches, nd, jlaw
[-- Attachment #1: Type: text/plain, Size: 13730 bytes --]
On Mon, 9 Oct 2023, Andrew Pinski wrote:
> On Mon, Oct 9, 2023 at 12:20 AM Richard Biener <rguenther@suse.de> wrote:
> >
> > On Sat, 7 Oct 2023, Richard Sandiford wrote:
> >
> > > Richard Biener <rguenther@suse.de> writes:
> > > >> Am 07.10.2023 um 11:23 schrieb Richard Sandiford <richard.sandiford@arm.com>:
> > > >>> Richard Biener <rguenther@suse.de> writes:
> > > >>> On Thu, 5 Oct 2023, Tamar Christina wrote:
> > > >>>
> > > >>>>> I suppose the idea is that -abs(x) might be easier to optimize with other
> > > >>>>> patterns (consider a - copysign(x,...), optimizing to a + abs(x)).
> > > >>>>>
> > > >>>>> For abs vs copysign it's a canonicalization, but (negate (abs @0)) is less
> > > >>>>> canonical than copysign.
> > > >>>>>
> > > >>>>>> Should I try removing this?
> > > >>>>>
> > > >>>>> I'd say yes (and put the reverse canonicalization next to this pattern).
> > > >>>>>
> > > >>>>
> > > >>>> This patch transforms fneg (fabs (x)) into copysign (x, -1) which is more
> > > >>>> canonical and allows a target to expand this sequence efficiently. Such
> > > >>>> sequences are common in scientific code working with gradients.
> > > >>>>
> > > >>>> various optimizations in match.pd only happened on COPYSIGN but not COPYSIGN_ALL
> > > >>>> which means they exclude IFN_COPYSIGN. COPYSIGN however is restricted to only
> > > >>>
> > > >>> That's not true:
> > > >>>
> > > >>> (define_operator_list COPYSIGN
> > > >>> BUILT_IN_COPYSIGNF
> > > >>> BUILT_IN_COPYSIGN
> > > >>> BUILT_IN_COPYSIGNL
> > > >>> IFN_COPYSIGN)
> > > >>>
> > > >>> but they miss the extended float builtin variants like
> > > >>> __builtin_copysignf16. Also see below
> > > >>>
> > > >>>> the C99 builtins and so doesn't work for vectors.
> > > >>>>
> > > >>>> The patch expands these optimizations to work on COPYSIGN_ALL.
> > > >>>>
> > > >>>> There is an existing canonicalization of copysign (x, -1) to fneg (fabs (x))
> > > >>>> which I remove since this is a less efficient form. The testsuite is also
> > > >>>> updated in light of this.
> > > >>>>
> > > >>>> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> > > >>>>
> > > >>>> Ok for master?
> > > >>>>
> > > >>>> Thanks,
> > > >>>> Tamar
> > > >>>>
> > > >>>> gcc/ChangeLog:
> > > >>>>
> > > >>>> PR tree-optimization/109154
> > > >>>> * match.pd: Add new neg+abs rule, remove inverse copysign rule and
> > > >>>> expand existing copysign optimizations.
> > > >>>>
> > > >>>> gcc/testsuite/ChangeLog:
> > > >>>>
> > > >>>> PR tree-optimization/109154
> > > >>>> * gcc.dg/fold-copysign-1.c: Updated.
> > > >>>> * gcc.dg/pr55152-2.c: Updated.
> > > >>>> * gcc.dg/tree-ssa/abs-4.c: Updated.
> > > >>>> * gcc.dg/tree-ssa/backprop-6.c: Updated.
> > > >>>> * gcc.dg/tree-ssa/copy-sign-2.c: Updated.
> > > >>>> * gcc.dg/tree-ssa/mult-abs-2.c: Updated.
> > > >>>> * gcc.target/aarch64/fneg-abs_1.c: New test.
> > > >>>> * gcc.target/aarch64/fneg-abs_2.c: New test.
> > > >>>> * gcc.target/aarch64/fneg-abs_3.c: New test.
> > > >>>> * gcc.target/aarch64/fneg-abs_4.c: New test.
> > > >>>> * gcc.target/aarch64/sve/fneg-abs_1.c: New test.
> > > >>>> * gcc.target/aarch64/sve/fneg-abs_2.c: New test.
> > > >>>> * gcc.target/aarch64/sve/fneg-abs_3.c: New test.
> > > >>>> * gcc.target/aarch64/sve/fneg-abs_4.c: New test.
> > > >>>>
> > > >>>> --- inline copy of patch ---
> > > >>>>
> > > >>>> diff --git a/gcc/match.pd b/gcc/match.pd
> > > >>>> index 4bdd83e6e061b16dbdb2845b9398fcfb8a6c9739..bd6599d36021e119f51a4928354f580ffe82c6e2 100644
> > > >>>> --- a/gcc/match.pd
> > > >>>> +++ b/gcc/match.pd
> > > >>>> @@ -1074,45 +1074,43 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> > > >>>>
> > > >>>> /* cos(copysign(x, y)) -> cos(x). Similarly for cosh. */
> > > >>>> (for coss (COS COSH)
> > > >>>> - copysigns (COPYSIGN)
> > > >>>> - (simplify
> > > >>>> - (coss (copysigns @0 @1))
> > > >>>> - (coss @0)))
> > > >>>> + (for copysigns (COPYSIGN_ALL)
> > > >>>
> > > >>> So this ends up generating for example the match
> > > >>> (cosf (copysignl ...)) which doesn't make much sense.
> > > >>>
> > > >>> The lock-step iteration did
> > > >>> (cosf (copysignf ..)) ... (ifn_cos (ifn_copysign ...))
> > > >>> which is leaner but misses the case of
> > > >>> (cosf (ifn_copysign ..)) - that's probably what you are
> > > >>> after with this change.
> > > >>>
> > > >>> That said, there isn't a nice solution (without altering the match.pd
> > > >>> IL). There's the explicit solution, spelling out all combinations.
> > > >>>
> > > >>> So if we want to go with yout pragmatic solution changing this
> > > >>> to use COPYSIGN_ALL isn't necessary, only changing the lock-step
> > > >>> for iteration to a cross product for iteration is.
> > > >>>
> > > >>> Changing just this pattern to
> > > >>>
> > > >>> (for coss (COS COSH)
> > > >>> (for copysigns (COPYSIGN)
> > > >>> (simplify
> > > >>> (coss (copysigns @0 @1))
> > > >>> (coss @0))))
> > > >>>
> > > >>> increases the total number of gimple-match-x.cc lines from
> > > >>> 234988 to 235324.
> > > >>
> > > >> I guess the difference between this and the later suggestions is that
> > > >> this one allows builtin copysign to be paired with ifn cos, which would
> > > >> be potentially useful in other situations. (It isn't here because
> > > >> ifn_cos is rarely provided.) How much of the growth is due to that,
> > > >> and much of it is from nonsensical combinations like
> > > >> (builtin_cosf (builtin_copysignl ...))?
> > > >>
> > > >> If it's mostly from nonsensical combinations then would it be possible
> > > >> to make genmatch drop them?
> > > >>
> > > >>> The alternative is to do
> > > >>>
> > > >>> (for coss (COS COSH)
> > > >>> copysigns (COPYSIGN)
> > > >>> (simplify
> > > >>> (coss (copysigns @0 @1))
> > > >>> (coss @0))
> > > >>> (simplify
> > > >>> (coss (IFN_COPYSIGN @0 @1))
> > > >>> (coss @0)))
> > > >>>
> > > >>> which properly will diagnose a duplicate pattern. Ther are
> > > >>> currently no operator lists with just builtins defined (that
> > > >>> could be fixed, see gencfn-macros.cc), supposed we'd have
> > > >>> COS_C we could do
> > > >>>
> > > >>> (for coss (COS_C COSH_C IFN_COS IFN_COSH)
> > > >>> copysigns (COPYSIGN_C COPYSIGN_C IFN_COPYSIGN IFN_COPYSIGN
> > > >>> IFN_COPYSIGN IFN_COPYSIGN IFN_COPYSIGN IFN_COPYSIGN IFN_COPYSIGN
> > > >>> IFN_COPYSIGN)
> > > >>> (simplify
> > > >>> (coss (copysigns @0 @1))
> > > >>> (coss @0)))
> > > >>>
> > > >>> which of course still looks ugly ;) (some syntax extension like
> > > >>> allowing to specify IFN_COPYSIGN*8 would be nice here and easy
> > > >>> enough to do)
> > > >>>
> > > >>> Can you split out the part changing COPYSIGN to COPYSIGN_ALL,
> > > >>> re-do it to only split the fors, keeping COPYSIGN and provide
> > > >>> some statistics on the gimple-match-* size? I think this might
> > > >>> be the pragmatic solution for now.
> > > >>>
> > > >>> Richard - can you think of a clever way to express the desired
> > > >>> iteration? How do RTL macro iterations address cases like this?
> > > >>
> > > >> I don't think .md files have an equivalent construct, unfortunately.
> > > >> (I also regret some of the choices I made for .md iterators, but that's
> > > >> another story.)
> > > >>
> > > >> Perhaps an alternative to the *8 thing would be "IFN_COPYSIGN...",
> > > >> with the "..." meaning "fill to match the longest operator list
> > > >> in the loop".
> > > >
> > > > Hm, I'll think about this. It would be useful to have a function like
> > > >
> > > > Internal_fn ifn_for (combined_fn);
> > > >
> > > > So we can indirectly match all builtins with a switch on the ifn code.
> > >
> > > There's:
> > >
> > > extern internal_fn associated_internal_fn (combined_fn, tree);
> > > extern internal_fn associated_internal_fn (tree);
> > > extern internal_fn replacement_internal_fn (gcall *);
> > >
> > > where the first one requires the return type, and the second one
> > > operates on CALL_EXPRs.
> >
> > Hmm, for full generality the way we code-generate would need to change
> > quite a bit. Instead I've come up with the following quite limited
> > approach. You can write
> >
> > (for coss (COS COSH)
> > (simplify
> > (coss (ANY_COPYSIGN @0 @1))
> > (coss @0))))
>
> This optimization is also handled by backprop (gimple-ssa-backprop.cc)
> in a better way than the match code handle.
> So maybe we don't really need to extend match-and-simplify here.
> Right now backprop is only ran once early after inlining. Maybe run it
> once more late would help?
I think it generally makes sense to clean up simple things during
building/folding and not wait for specialized passes.
The question here is mostly whether we are fine with some bloat
in {generic,gimple}-match-?.cc or not. The change proposed likely
doesn't make a big impact as it's going to be of limited use.
Any change exposing semantics of the builtins to genmatch so it
can rule out say combining BUILT_IN_SINF and BUILT_IN_COS is going
to be quite difficult if you consider
(for coss (BUILT_IN_COS BUILT_IN_COSL)
sins (BUILT_IN_SINF BUILT_IN_SIN)
(simplify
(coss (convert (sins @0)))
...
or so. The current operator-list handling is handling them ordered,
we might want to introduce a semantically different operator-set
iteration, treating them unordered. That's basically how
(for list1 (...)
(for list2 (...)
works. There would be the opportunity to change code generation
for such cases to a catch-all case, but the way we generate the
decision tree makes this difficult I think. I've filed PR111732
to track this genmatch optimization opportunity.
Richard.
> Thanks,
> Andrew
>
>
> >
> > with it. For each internal function the following patch adds a
> > ANY_<name> identifier. The use is somewhat limited - you cannot
> > use it as the outermost operation in the match part and you cannot
> > use it in the replacement part at all. The nice thing is there's
> > no "iteration" at all, the ANY_COPYSIGN doesn't cause any pattern
> > duplication, instead we match it via CASE_CFN_<name> so it will
> > happily match mis-matched (typewise) calls (but those shouldn't
> > be there...).
> >
> > The patch doesn't contain any defensiveness in the parser for the
> > use restriction, but you should get compile failures for misuses
> > at least.
> >
> > It should match quite some of the copysign cases, I suspect its
> > of no use for most of the operators so maybe less general handling
> > and only specifically introducing ANY_COPYSIGN would be better.
> > At least I cannot think of any other functions that are matched
> > but disappear in the resulting replacement?
> >
> > Richard.
> >
> > diff --git a/gcc/genmatch.cc b/gcc/genmatch.cc
> > index 03d325efdf6..f7d3f51c013 100644
> > --- a/gcc/genmatch.cc
> > +++ b/gcc/genmatch.cc
> > @@ -524,10 +524,14 @@ class fn_id : public id_base
> > {
> > public:
> > fn_id (enum built_in_function fn_, const char *id_)
> > - : id_base (id_base::FN, id_), fn (fn_) {}
> > + : id_base (id_base::FN, id_), fn (fn_), case_macro (nullptr) {}
> > fn_id (enum internal_fn fn_, const char *id_)
> > - : id_base (id_base::FN, id_), fn (int (END_BUILTINS) + int (fn_)) {}
> > + : id_base (id_base::FN, id_), fn (int (END_BUILTINS) + int (fn_)),
> > + case_macro (nullptr) {}
> > + fn_id (const char *case_macro_, const char *id_)
> > + : id_base (id_base::FN, id_), fn (-1U), case_macro (case_macro_) {}
> > unsigned int fn;
> > + const char *case_macro;
> > };
> >
> > class simplify;
> > @@ -3262,6 +3266,10 @@ dt_node::gen_kids_1 (FILE *f, int indent, bool gimple, int depth,
> > if (user_id *u = dyn_cast <user_id *> (e->operation))
> > for (auto id : u->substitutes)
> > fprintf_indent (f, indent, "case %s:\n", id->id);
> > + else if (is_a <fn_id *> (e->operation)
> > + && as_a <fn_id *> (e->operation)->case_macro)
> > + fprintf_indent (f, indent, "%s:\n",
> > + as_a <fn_id *> (e->operation)->case_macro);
> > else
> > fprintf_indent (f, indent, "case %s:\n", e->operation->id);
> > /* We need to be defensive against bogus prototypes allowing
> > @@ -3337,9 +3345,12 @@ dt_node::gen_kids_1 (FILE *f, int indent, bool gimple, int depth,
> > for (unsigned j = 0; j < generic_fns.length (); ++j)
> > {
> > expr *e = as_a <expr *>(generic_fns[j]->op);
> > - gcc_assert (e->operation->kind == id_base::FN);
> > + fn_id *oper = as_a <fn_id *> (e->operation);
> >
> > - fprintf_indent (f, indent, "case %s:\n", e->operation->id);
> > + if (oper->case_macro)
> > + fprintf_indent (f, indent, "%s:\n", oper->case_macro);
> > + else
> > + fprintf_indent (f, indent, "case %s:\n", e->operation->id);
> > fprintf_indent (f, indent, " if (call_expr_nargs (%s) == %d)\n"
> > " {\n", kid_opname, e->ops.length ());
> > generic_fns[j]->gen (f, indent + 6, false, depth);
> > @@ -5496,7 +5507,8 @@ main (int argc, char **argv)
> > #include "builtins.def"
> >
> > #define DEF_INTERNAL_FN(CODE, NAME, FNSPEC) \
> > - add_function (IFN_##CODE, "CFN_" #CODE);
> > + add_function (IFN_##CODE, "CFN_" #CODE); \
> > + add_function ("CASE_CFN_" # CODE, "ANY_" # CODE);
> > #include "internal-fn.def"
> >
> > /* Parse ahead! */
>
--
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
^ permalink raw reply [flat|nested] 17+ messages in thread
end of thread, other threads:[~2023-10-09 9:06 UTC | newest]
Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-09-27 0:50 [PATCH]middle-end match.pd: optimize fneg (fabs (x)) to x | (1 << signbit(x)) [PR109154] Tamar Christina
2023-09-27 1:17 ` Andrew Pinski
2023-09-27 2:31 ` Tamar Christina
2023-09-27 7:11 ` Richard Biener
2023-09-27 7:56 ` Tamar Christina
2023-09-27 9:35 ` Tamar Christina
2023-09-27 9:39 ` Richard Biener
2023-10-05 18:11 ` Tamar Christina
2023-10-06 6:24 ` Richard Biener
2023-10-07 9:22 ` Richard Sandiford
2023-10-07 10:34 ` Richard Biener
2023-10-07 11:34 ` Richard Sandiford
2023-10-09 7:20 ` Richard Biener
2023-10-09 7:36 ` Andrew Pinski
2023-10-09 9:06 ` Richard Biener
2023-09-29 15:00 ` Jeff Law
2023-10-05 18:09 ` Tamar Christina
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).