public inbox for gcc-patches@gcc.gnu.org
* [PATCH]AArch64 Add SVE implementation for cond_copysign.
@ 2023-10-05 18:21 Tamar Christina
  2023-10-05 19:28 ` Richard Sandiford
  2023-10-19 21:29 ` Richard Sandiford
  0 siblings, 2 replies; 16+ messages in thread
From: Tamar Christina @ 2023-10-05 18:21 UTC (permalink / raw)
  To: gcc-patches
  Cc: nd, Richard.Earnshaw, Marcus.Shawcroft, Kyrylo.Tkachov,
	richard.sandiford

Hi All,

This adds an implementation for masked copysign along with an optimized
pattern for masked copysign (x, -1).
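
For reference, the per-element operation the new expander implements is
roughly the following scalar sketch (the function name and the explicit
mask argument are illustrative only, not part of the patch):

#include <math.h>

/* cond_copysign (mask, a, b, fallback): active lanes take the magnitude
   of A combined with the sign of B, inactive lanes take FALLBACK.  */
static double
cond_copysign_elem (int active, double a, double b, double fallback)
{
  return active ? copysign (a, b) : fallback;
}

When B is the constant -1 the result is simply A with its sign bit set,
which is why that case is expanded as a predicated ORR of the sign bit.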

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	PR tree-optimization/109154
	* config/aarch64/aarch64-sve.md (cond_copysign<mode>): New.

gcc/testsuite/ChangeLog:

	PR tree-optimization/109154
	* gcc.target/aarch64/sve/fneg-abs_5.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 071400c820a5b106ddf9dc9faebb117975d74ea0..00ca30c24624dc661254568f45b61a14aa11c305 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -6429,6 +6429,57 @@ (define_expand "copysign<mode>3"
   }
 )
 
+(define_expand "cond_copysign<mode>"
+  [(match_operand:SVE_FULL_F 0 "register_operand")
+   (match_operand:<VPRED> 1 "register_operand")
+   (match_operand:SVE_FULL_F 2 "register_operand")
+   (match_operand:SVE_FULL_F 3 "nonmemory_operand")
+   (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")]
+  "TARGET_SVE"
+  {
+    rtx sign = gen_reg_rtx (<V_INT_EQUIV>mode);
+    rtx mant = gen_reg_rtx (<V_INT_EQUIV>mode);
+    rtx int_res = gen_reg_rtx (<V_INT_EQUIV>mode);
+    int bits = GET_MODE_UNIT_BITSIZE (<MODE>mode) - 1;
+
+    rtx arg2 = lowpart_subreg (<V_INT_EQUIV>mode, operands[2], <MODE>mode);
+    rtx arg3 = lowpart_subreg (<V_INT_EQUIV>mode, operands[3], <MODE>mode);
+    rtx arg4 = lowpart_subreg (<V_INT_EQUIV>mode, operands[4], <MODE>mode);
+
+    rtx v_sign_bitmask
+      = aarch64_simd_gen_const_vector_dup (<V_INT_EQUIV>mode,
+					   HOST_WIDE_INT_M1U << bits);
+
+    /* copysign (x, -1) should instead be expanded as orr with the sign
+       bit.  */
+    if (!REG_P (operands[3]))
+      {
+	auto r0
+	  = CONST_DOUBLE_REAL_VALUE (unwrap_const_vec_duplicate (operands[3]));
+	if (-1 == real_to_integer (r0))
+	  {
+	    arg3 = force_reg (<V_INT_EQUIV>mode, v_sign_bitmask);
+	    emit_insn (gen_cond_ior<v_int_equiv> (int_res, operands[1], arg2,
+						  arg3, arg4));
+	    emit_move_insn (operands[0], gen_lowpart (<MODE>mode, int_res));
+	    DONE;
+	  }
+      }
+
+    operands[2] = force_reg (<MODE>mode, operands[3]);
+    emit_insn (gen_and<v_int_equiv>3 (sign, arg3, v_sign_bitmask));
+    emit_insn (gen_and<v_int_equiv>3
+	       (mant, arg2,
+		aarch64_simd_gen_const_vector_dup (<V_INT_EQUIV>mode,
+						   ~(HOST_WIDE_INT_M1U
+						     << bits))));
+    emit_insn (gen_cond_ior<v_int_equiv> (int_res, operands[1], sign, mant,
+					  arg4));
+    emit_move_insn (operands[0], gen_lowpart (<MODE>mode, int_res));
+    DONE;
+  }
+)
+
 (define_expand "xorsign<mode>3"
   [(match_operand:SVE_FULL_F 0 "register_operand")
    (match_operand:SVE_FULL_F 1 "register_operand")
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_5.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_5.c
new file mode 100644
index 0000000000000000000000000000000000000000..f4ecbeecbe1290134e688f46a4389d17155e4a0a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_5.c
@@ -0,0 +1,36 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
+
+#include <arm_neon.h>
+#include <math.h>
+
+/*
+** f1:
+**	...
+**	orr	z[0-9]+.s, p[0-9]+/m, z[0-9]+.s, z[0-9]+.s
+**	...
+*/
+void f1 (float32_t *a, int n)
+{
+  for (int i = 0; i < (n & -8); i++)
+   if (a[i] > n)
+     a[i] = -fabsf (a[i]);
+   else
+     a[i] = n;
+}
+
+/*
+** f2:
+**	...
+**	orr	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d
+**	...
+*/
+void f2 (float64_t *a, int n)
+{
+  for (int i = 0; i < (n & -8); i++)
+   if (a[i] > n)
+     a[i] = -fabs (a[i]);
+   else
+     a[i] = n;
+}





* Re: [PATCH]AArch64 Add SVE implementation for cond_copysign.
  2023-10-05 18:21 [PATCH]AArch64 Add SVE implementation for cond_copysign Tamar Christina
@ 2023-10-05 19:28 ` Richard Sandiford
  2023-10-05 19:47   ` Tamar Christina
  2023-10-19 21:29 ` Richard Sandiford
  1 sibling, 1 reply; 16+ messages in thread
From: Richard Sandiford @ 2023-10-05 19:28 UTC (permalink / raw)
  To: Tamar Christina
  Cc: gcc-patches, nd, Richard.Earnshaw, Marcus.Shawcroft, Kyrylo.Tkachov

Tamar Christina <tamar.christina@arm.com> writes:
> Hi All,
>
> This adds an implementation for masked copysign along with an optimized
> pattern for masked copysign (x, -1).

It feels like we're ending up with a lot of AArch64-specific code that
just hard-codes the observation that changing the sign is equivalent to
changing the top bit.  We then need to make sure that we choose the best
way of changing the top bit for any given situation.

Hard-coding the -1/negative case is one instance of that.  But it looks
like we also fail to use the best sequence for SVE2.  E.g.
[https://godbolt.org/z/ajh3MM5jv]:

#include <stdint.h>

void f(double *restrict a, double *restrict b) {
    for (int i = 0; i < 100; ++i)
        a[i] = __builtin_copysign(a[i], b[i]);
}

void g(uint64_t *restrict a, uint64_t *restrict b, uint64_t c) {
    for (int i = 0; i < 100; ++i)
        a[i] = (a[i] & ~c) | (b[i] & c);
}

gives:

f:
        mov     x2, 0
        mov     w3, 100
        whilelo p7.d, wzr, w3
.L2:
        ld1d    z30.d, p7/z, [x0, x2, lsl 3]
        ld1d    z31.d, p7/z, [x1, x2, lsl 3]
        and     z30.d, z30.d, #0x7fffffffffffffff
        and     z31.d, z31.d, #0x8000000000000000
        orr     z31.d, z31.d, z30.d
        st1d    z31.d, p7, [x0, x2, lsl 3]
        incd    x2
        whilelo p7.d, w2, w3
        b.any   .L2
        ret
g:
        mov     x3, 0
        mov     w4, 100
        mov     z29.d, x2
        whilelo p7.d, wzr, w4
.L6:
        ld1d    z30.d, p7/z, [x0, x3, lsl 3]
        ld1d    z31.d, p7/z, [x1, x3, lsl 3]
        bsl     z31.d, z31.d, z30.d, z29.d
        st1d    z31.d, p7, [x0, x3, lsl 3]
        incd    x3
        whilelo p7.d, w3, w4
        b.any   .L6
        ret

I saw that you originally tried to do this in match.pd and that the
decision was to fold to copysign instead.  But perhaps there's a compromise
where isel does something with the (new) copysign canonical form?
I.e. could we go with your new version of the match.pd patch, and add
some isel stuff as a follow-on?

Not saying no to this patch, just thought that the above was worth
considering.

[I agree with Andrew's comments FWIW.]

Thanks,
Richard

>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	PR tree-optimization/109154
> 	* config/aarch64/aarch64-sve.md (cond_copysign<mode>): New.
>
> gcc/testsuite/ChangeLog:
>
> 	PR tree-optimization/109154
> 	* gcc.target/aarch64/sve/fneg-abs_5.c: New test.
>
> --- inline copy of patch -- 
> diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
> index 071400c820a5b106ddf9dc9faebb117975d74ea0..00ca30c24624dc661254568f45b61a14aa11c305 100644
> --- a/gcc/config/aarch64/aarch64-sve.md
> +++ b/gcc/config/aarch64/aarch64-sve.md
> @@ -6429,6 +6429,57 @@ (define_expand "copysign<mode>3"
>    }
>  )
>  
> +(define_expand "cond_copysign<mode>"
> +  [(match_operand:SVE_FULL_F 0 "register_operand")
> +   (match_operand:<VPRED> 1 "register_operand")
> +   (match_operand:SVE_FULL_F 2 "register_operand")
> +   (match_operand:SVE_FULL_F 3 "nonmemory_operand")
> +   (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")]
> +  "TARGET_SVE"
> +  {
> +    rtx sign = gen_reg_rtx (<V_INT_EQUIV>mode);
> +    rtx mant = gen_reg_rtx (<V_INT_EQUIV>mode);
> +    rtx int_res = gen_reg_rtx (<V_INT_EQUIV>mode);
> +    int bits = GET_MODE_UNIT_BITSIZE (<MODE>mode) - 1;
> +
> +    rtx arg2 = lowpart_subreg (<V_INT_EQUIV>mode, operands[2], <MODE>mode);
> +    rtx arg3 = lowpart_subreg (<V_INT_EQUIV>mode, operands[3], <MODE>mode);
> +    rtx arg4 = lowpart_subreg (<V_INT_EQUIV>mode, operands[4], <MODE>mode);
> +
> +    rtx v_sign_bitmask
> +      = aarch64_simd_gen_const_vector_dup (<V_INT_EQUIV>mode,
> +					   HOST_WIDE_INT_M1U << bits);
> +
> +    /* copysign (x, -1) should instead be expanded as orr with the sign
> +       bit.  */
> +    if (!REG_P (operands[3]))
> +      {
> +	auto r0
> +	  = CONST_DOUBLE_REAL_VALUE (unwrap_const_vec_duplicate (operands[3]));
> +	if (-1 == real_to_integer (r0))
> +	  {
> +	    arg3 = force_reg (<V_INT_EQUIV>mode, v_sign_bitmask);
> +	    emit_insn (gen_cond_ior<v_int_equiv> (int_res, operands[1], arg2,
> +						  arg3, arg4));
> +	    emit_move_insn (operands[0], gen_lowpart (<MODE>mode, int_res));
> +	    DONE;
> +	  }
> +      }
> +
> +    operands[2] = force_reg (<MODE>mode, operands[3]);
> +    emit_insn (gen_and<v_int_equiv>3 (sign, arg3, v_sign_bitmask));
> +    emit_insn (gen_and<v_int_equiv>3
> +	       (mant, arg2,
> +		aarch64_simd_gen_const_vector_dup (<V_INT_EQUIV>mode,
> +						   ~(HOST_WIDE_INT_M1U
> +						     << bits))));
> +    emit_insn (gen_cond_ior<v_int_equiv> (int_res, operands[1], sign, mant,
> +					  arg4));
> +    emit_move_insn (operands[0], gen_lowpart (<MODE>mode, int_res));
> +    DONE;
> +  }
> +)
> +
>  (define_expand "xorsign<mode>3"
>    [(match_operand:SVE_FULL_F 0 "register_operand")
>     (match_operand:SVE_FULL_F 1 "register_operand")
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_5.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_5.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..f4ecbeecbe1290134e688f46a4389d17155e4a0a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_5.c
> @@ -0,0 +1,36 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
> +
> +#include <arm_neon.h>
> +#include <math.h>
> +
> +/*
> +** f1:
> +**	...
> +**	orr	z[0-9]+.s, p[0-9]+/m, z[0-9]+.s, z[0-9]+.s
> +**	...
> +*/
> +void f1 (float32_t *a, int n)
> +{
> +  for (int i = 0; i < (n & -8); i++)
> +   if (a[i] > n)
> +     a[i] = -fabsf (a[i]);
> +   else
> +     a[i] = n;
> +}
> +
> +/*
> +** f2:
> +**	...
> +**	orr	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d
> +**	...
> +*/
> +void f2 (float64_t *a, int n)
> +{
> +  for (int i = 0; i < (n & -8); i++)
> +   if (a[i] > n)
> +     a[i] = -fabs (a[i]);
> +   else
> +     a[i] = n;
> +}


* RE: [PATCH]AArch64 Add SVE implementation for cond_copysign.
  2023-10-05 19:28 ` Richard Sandiford
@ 2023-10-05 19:47   ` Tamar Christina
  2023-10-05 20:25     ` Richard Sandiford
  2023-10-05 20:34     ` Andrew Pinski
  0 siblings, 2 replies; 16+ messages in thread
From: Tamar Christina @ 2023-10-05 19:47 UTC (permalink / raw)
  To: Richard Sandiford
  Cc: gcc-patches, nd, Richard Earnshaw, Marcus Shawcroft, Kyrylo Tkachov

> -----Original Message-----
> From: Richard Sandiford <richard.sandiford@arm.com>
> Sent: Thursday, October 5, 2023 8:29 PM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
> Subject: Re: [PATCH]AArch64 Add SVE implementation for cond_copysign.
> 
> Tamar Christina <tamar.christina@arm.com> writes:
> > Hi All,
> >
> > This adds an implementation for masked copysign along with an
> > optimized pattern for masked copysign (x, -1).
> 
> It feels like we're ending up with a lot of AArch64-specific code that just hard-
> codes the observation that changing the sign is equivalent to changing the top
> bit.  We then need to make sure that we choose the best way of changing the
> top bit for any given situation.
> 
> Hard-coding the -1/negative case is one instance of that.  But it looks like we
> also fail to use the best sequence for SVE2.  E.g.
> [https://godbolt.org/z/ajh3MM5jv]:
> 
> #include <stdint.h>
> 
> void f(double *restrict a, double *restrict b) {
>     for (int i = 0; i < 100; ++i)
>         a[i] = __builtin_copysign(a[i], b[i]); }
> 
> void g(uint64_t *restrict a, uint64_t *restrict b, uint64_t c) {
>     for (int i = 0; i < 100; ++i)
>         a[i] = (a[i] & ~c) | (b[i] & c); }
> 
> gives:
> 
> f:
>         mov     x2, 0
>         mov     w3, 100
>         whilelo p7.d, wzr, w3
> .L2:
>         ld1d    z30.d, p7/z, [x0, x2, lsl 3]
>         ld1d    z31.d, p7/z, [x1, x2, lsl 3]
>         and     z30.d, z30.d, #0x7fffffffffffffff
>         and     z31.d, z31.d, #0x8000000000000000
>         orr     z31.d, z31.d, z30.d
>         st1d    z31.d, p7, [x0, x2, lsl 3]
>         incd    x2
>         whilelo p7.d, w2, w3
>         b.any   .L2
>         ret
> g:
>         mov     x3, 0
>         mov     w4, 100
>         mov     z29.d, x2
>         whilelo p7.d, wzr, w4
> .L6:
>         ld1d    z30.d, p7/z, [x0, x3, lsl 3]
>         ld1d    z31.d, p7/z, [x1, x3, lsl 3]
>         bsl     z31.d, z31.d, z30.d, z29.d
>         st1d    z31.d, p7, [x0, x3, lsl 3]
>         incd    x3
>         whilelo p7.d, w3, w4
>         b.any   .L6
>         ret
> 
> I saw that you originally tried to do this in match.pd and that the decision was
> to fold to copysign instead.  But perhaps there's a compromise where isel does
> something with the (new) copysign canonical form?
> I.e. could we go with your new version of the match.pd patch, and add some
> isel stuff as a follow-on?
> 

Sure, if that's what's desired... but:

The example you posted above is, for instance, worse for x86
(https://godbolt.org/z/x9ccqxW6T), where the first operation has a dependency
chain of 2 and the latter of 3.  It's likely that any open coding of this
operation is going to hurt a target.

So I'm unsure what isel should transform this into...

Tamar

> Not saying no to this patch, just thought that the above was worth
> considering.
> 
> [I agree with Andrew's comments FWIW.]
> 
> Thanks,
> Richard
> 
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > 	PR tree-optimization/109154
> > 	* config/aarch64/aarch64-sve.md (cond_copysign<mode>): New.
> >
> > gcc/testsuite/ChangeLog:
> >
> > 	PR tree-optimization/109154
> > 	* gcc.target/aarch64/sve/fneg-abs_5.c: New test.
> >
> > --- inline copy of patch --
> > diff --git a/gcc/config/aarch64/aarch64-sve.md
> > b/gcc/config/aarch64/aarch64-sve.md
> > index
> >
> 071400c820a5b106ddf9dc9faebb117975d74ea0..00ca30c24624dc661254
> 568f45b6
> > 1a14aa11c305 100644
> > --- a/gcc/config/aarch64/aarch64-sve.md
> > +++ b/gcc/config/aarch64/aarch64-sve.md
> > @@ -6429,6 +6429,57 @@ (define_expand "copysign<mode>3"
> >    }
> >  )
> >
> > +(define_expand "cond_copysign<mode>"
> > +  [(match_operand:SVE_FULL_F 0 "register_operand")
> > +   (match_operand:<VPRED> 1 "register_operand")
> > +   (match_operand:SVE_FULL_F 2 "register_operand")
> > +   (match_operand:SVE_FULL_F 3 "nonmemory_operand")
> > +   (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")]
> > +  "TARGET_SVE"
> > +  {
> > +    rtx sign = gen_reg_rtx (<V_INT_EQUIV>mode);
> > +    rtx mant = gen_reg_rtx (<V_INT_EQUIV>mode);
> > +    rtx int_res = gen_reg_rtx (<V_INT_EQUIV>mode);
> > +    int bits = GET_MODE_UNIT_BITSIZE (<MODE>mode) - 1;
> > +
> > +    rtx arg2 = lowpart_subreg (<V_INT_EQUIV>mode, operands[2],
> <MODE>mode);
> > +    rtx arg3 = lowpart_subreg (<V_INT_EQUIV>mode, operands[3],
> <MODE>mode);
> > +    rtx arg4 = lowpart_subreg (<V_INT_EQUIV>mode, operands[4],
> > + <MODE>mode);
> > +
> > +    rtx v_sign_bitmask
> > +      = aarch64_simd_gen_const_vector_dup (<V_INT_EQUIV>mode,
> > +					   HOST_WIDE_INT_M1U << bits);
> > +
> > +    /* copysign (x, -1) should instead be expanded as orr with the sign
> > +       bit.  */
> > +    if (!REG_P (operands[3]))
> > +      {
> > +	auto r0
> > +	  = CONST_DOUBLE_REAL_VALUE (unwrap_const_vec_duplicate
> (operands[3]));
> > +	if (-1 == real_to_integer (r0))
> > +	  {
> > +	    arg3 = force_reg (<V_INT_EQUIV>mode, v_sign_bitmask);
> > +	    emit_insn (gen_cond_ior<v_int_equiv> (int_res, operands[1], arg2,
> > +						  arg3, arg4));
> > +	    emit_move_insn (operands[0], gen_lowpart (<MODE>mode,
> int_res));
> > +	    DONE;
> > +	  }
> > +      }
> > +
> > +    operands[2] = force_reg (<MODE>mode, operands[3]);
> > +    emit_insn (gen_and<v_int_equiv>3 (sign, arg3, v_sign_bitmask));
> > +    emit_insn (gen_and<v_int_equiv>3
> > +	       (mant, arg2,
> > +		aarch64_simd_gen_const_vector_dup
> (<V_INT_EQUIV>mode,
> > +						   ~(HOST_WIDE_INT_M1U
> > +						     << bits))));
> > +    emit_insn (gen_cond_ior<v_int_equiv> (int_res, operands[1], sign, mant,
> > +					  arg4));
> > +    emit_move_insn (operands[0], gen_lowpart (<MODE>mode, int_res));
> > +    DONE;
> > +  }
> > +)
> > +
> >  (define_expand "xorsign<mode>3"
> >    [(match_operand:SVE_FULL_F 0 "register_operand")
> >     (match_operand:SVE_FULL_F 1 "register_operand") diff --git
> > a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_5.c
> > b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_5.c
> > new file mode 100644
> > index
> >
> 0000000000000000000000000000000000000000..f4ecbeecbe1290134e6
> 88f46a438
> > 9d17155e4a0a
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_5.c
> > @@ -0,0 +1,36 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3" } */
> > +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } }
> > +*/
> > +
> > +#include <arm_neon.h>
> > +#include <math.h>
> > +
> > +/*
> > +** f1:
> > +**	...
> > +**	orr	z[0-9]+.s, p[0-9]+/m, z[0-9]+.s, z[0-9]+.s
> > +**	...
> > +*/
> > +void f1 (float32_t *a, int n)
> > +{
> > +  for (int i = 0; i < (n & -8); i++)
> > +   if (a[i] > n)
> > +     a[i] = -fabsf (a[i]);
> > +   else
> > +     a[i] = n;
> > +}
> > +
> > +/*
> > +** f2:
> > +**	...
> > +**	orr	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d
> > +**	...
> > +*/
> > +void f2 (float64_t *a, int n)
> > +{
> > +  for (int i = 0; i < (n & -8); i++)
> > +   if (a[i] > n)
> > +     a[i] = -fabs (a[i]);
> > +   else
> > +     a[i] = n;
> > +}


* Re: [PATCH]AArch64 Add SVE implementation for cond_copysign.
  2023-10-05 19:47   ` Tamar Christina
@ 2023-10-05 20:25     ` Richard Sandiford
  2023-10-05 20:45       ` Tamar Christina
  2023-10-05 20:34     ` Andrew Pinski
  1 sibling, 1 reply; 16+ messages in thread
From: Richard Sandiford @ 2023-10-05 20:25 UTC (permalink / raw)
  To: Tamar Christina
  Cc: gcc-patches, nd, Richard Earnshaw, Marcus Shawcroft, Kyrylo Tkachov

Tamar Christina <Tamar.Christina@arm.com> writes:
>> -----Original Message-----
>> From: Richard Sandiford <richard.sandiford@arm.com>
>> Sent: Thursday, October 5, 2023 8:29 PM
>> To: Tamar Christina <Tamar.Christina@arm.com>
>> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
>> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
>> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
>> Subject: Re: [PATCH]AArch64 Add SVE implementation for cond_copysign.
>> 
>> Tamar Christina <tamar.christina@arm.com> writes:
>> > Hi All,
>> >
>> > This adds an implementation for masked copysign along with an
>> > optimized pattern for masked copysign (x, -1).
>> 
>> It feels like we're ending up with a lot of AArch64-specific code that just hard-
>> codes the observation that changing the sign is equivalent to changing the top
>> bit.  We then need to make sure that we choose the best way of changing the
>> top bit for any given situation.
>> 
>> Hard-coding the -1/negative case is one instance of that.  But it looks like we
>> also fail to use the best sequence for SVE2.  E.g.
>> [https://godbolt.org/z/ajh3MM5jv]:
>> 
>> #include <stdint.h>
>> 
>> void f(double *restrict a, double *restrict b) {
>>     for (int i = 0; i < 100; ++i)
>>         a[i] = __builtin_copysign(a[i], b[i]); }
>> 
>> void g(uint64_t *restrict a, uint64_t *restrict b, uint64_t c) {
>>     for (int i = 0; i < 100; ++i)
>>         a[i] = (a[i] & ~c) | (b[i] & c); }
>> 
>> gives:
>> 
>> f:
>>         mov     x2, 0
>>         mov     w3, 100
>>         whilelo p7.d, wzr, w3
>> .L2:
>>         ld1d    z30.d, p7/z, [x0, x2, lsl 3]
>>         ld1d    z31.d, p7/z, [x1, x2, lsl 3]
>>         and     z30.d, z30.d, #0x7fffffffffffffff
>>         and     z31.d, z31.d, #0x8000000000000000
>>         orr     z31.d, z31.d, z30.d
>>         st1d    z31.d, p7, [x0, x2, lsl 3]
>>         incd    x2
>>         whilelo p7.d, w2, w3
>>         b.any   .L2
>>         ret
>> g:
>>         mov     x3, 0
>>         mov     w4, 100
>>         mov     z29.d, x2
>>         whilelo p7.d, wzr, w4
>> .L6:
>>         ld1d    z30.d, p7/z, [x0, x3, lsl 3]
>>         ld1d    z31.d, p7/z, [x1, x3, lsl 3]
>>         bsl     z31.d, z31.d, z30.d, z29.d
>>         st1d    z31.d, p7, [x0, x3, lsl 3]
>>         incd    x3
>>         whilelo p7.d, w3, w4
>>         b.any   .L6
>>         ret
>> 
>> I saw that you originally tried to do this in match.pd and that the decision was
>> to fold to copysign instead.  But perhaps there's a compromise where isel does
>> something with the (new) copysign canonical form?
>> I.e. could we go with your new version of the match.pd patch, and add some
>> isel stuff as a follow-on?
>> 
>
> Sure if that's what's desired.... But..
>
> The example you posted above is for instance worse for x86 https://godbolt.org/z/x9ccqxW6T
> where the first operation has a dependency chain of 2 and the latter of 3.  It's likely any
> open coding of this operation is going to hurt a target.
>
> So I'm unsure what isel transform this into...

I didn't mean that we should go straight to using isel for the general
case, just for the new case.  The example above was instead trying to
show the general point that hiding the logic ops in target code is a
double-edged sword.

The x86_64 example for the -1 case would be https://godbolt.org/z/b9s6MaKs8
where the isel change would be an improvement.  Without that, I guess
x86_64 will need to have a similar patch to the AArch64 one.

That said, https://godbolt.org/z/e6nqoqbMh suggests that powerpc64
is probably relying on the current copysign -> neg/abs transform.
(Not sure why the second function uses different IVs from the first.)

Personally, I wouldn't be against a target hook that indicated whether
float bit manipulation is "free" for a given mode, if it comes to that.

Thanks,
Richard


* Re: [PATCH]AArch64 Add SVE implementation for cond_copysign.
  2023-10-05 19:47   ` Tamar Christina
  2023-10-05 20:25     ` Richard Sandiford
@ 2023-10-05 20:34     ` Andrew Pinski
  1 sibling, 0 replies; 16+ messages in thread
From: Andrew Pinski @ 2023-10-05 20:34 UTC (permalink / raw)
  To: Tamar Christina
  Cc: Richard Sandiford, gcc-patches, nd, Richard Earnshaw,
	Marcus Shawcroft, Kyrylo Tkachov

On Thu, Oct 5, 2023 at 12:48 PM Tamar Christina <Tamar.Christina@arm.com> wrote:
>
> > -----Original Message-----
> > From: Richard Sandiford <richard.sandiford@arm.com>
> > Sent: Thursday, October 5, 2023 8:29 PM
> > To: Tamar Christina <Tamar.Christina@arm.com>
> > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> > <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> > <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
> > Subject: Re: [PATCH]AArch64 Add SVE implementation for cond_copysign.
> >
> > Tamar Christina <tamar.christina@arm.com> writes:
> > > Hi All,
> > >
> > > This adds an implementation for masked copysign along with an
> > > optimized pattern for masked copysign (x, -1).
> >
> > It feels like we're ending up with a lot of AArch64-specific code that just hard-
> > codes the observation that changing the sign is equivalent to changing the top
> > bit.  We then need to make sure that we choose the best way of changing the
> > top bit for any given situation.
> >
> > Hard-coding the -1/negative case is one instance of that.  But it looks like we
> > also fail to use the best sequence for SVE2.  E.g.
> > [https://godbolt.org/z/ajh3MM5jv]:
> >
> > #include <stdint.h>
> >
> > void f(double *restrict a, double *restrict b) {
> >     for (int i = 0; i < 100; ++i)
> >         a[i] = __builtin_copysign(a[i], b[i]); }
> >
> > void g(uint64_t *restrict a, uint64_t *restrict b, uint64_t c) {
> >     for (int i = 0; i < 100; ++i)
> >         a[i] = (a[i] & ~c) | (b[i] & c); }
> >
> > gives:
> >
> > f:
> >         mov     x2, 0
> >         mov     w3, 100
> >         whilelo p7.d, wzr, w3
> > .L2:
> >         ld1d    z30.d, p7/z, [x0, x2, lsl 3]
> >         ld1d    z31.d, p7/z, [x1, x2, lsl 3]
> >         and     z30.d, z30.d, #0x7fffffffffffffff
> >         and     z31.d, z31.d, #0x8000000000000000
> >         orr     z31.d, z31.d, z30.d
> >         st1d    z31.d, p7, [x0, x2, lsl 3]
> >         incd    x2
> >         whilelo p7.d, w2, w3
> >         b.any   .L2
> >         ret
> > g:
> >         mov     x3, 0
> >         mov     w4, 100
> >         mov     z29.d, x2
> >         whilelo p7.d, wzr, w4
> > .L6:
> >         ld1d    z30.d, p7/z, [x0, x3, lsl 3]
> >         ld1d    z31.d, p7/z, [x1, x3, lsl 3]
> >         bsl     z31.d, z31.d, z30.d, z29.d
> >         st1d    z31.d, p7, [x0, x3, lsl 3]
> >         incd    x3
> >         whilelo p7.d, w3, w4
> >         b.any   .L6
> >         ret
> >
> > I saw that you originally tried to do this in match.pd and that the decision was
> > to fold to copysign instead.  But perhaps there's a compromise where isel does
> > something with the (new) copysign canonical form?
> > I.e. could we go with your new version of the match.pd patch, and add some
> > isel stuff as a follow-on?
> >
>
> Sure if that's what's desired.... But..
>
> The example you posted above is for instance worse for x86 https://godbolt.org/z/x9ccqxW6T
> where the first operation has a dependency chain of 2 and the latter of 3.  It's likely any
> open coding of this operation is going to hurt a target.

But that is because it is not using andn when it should be.
That would be https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94790
(scalar fix but not vector) and
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90323 IIRC.
AArch64 already has a pattern to match the above, which is why it works
there but not on x86_64.

Thanks,
Andrew

>
> So I'm unsure what isel transform this into...
>
> Tamar
>
> > Not saying no to this patch, just thought that the above was worth
> > considering.
> >
> > [I agree with Andrew's comments FWIW.]
> >
> > Thanks,
> > Richard
> >
> > >
> > > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> > >
> > > Ok for master?
> > >
> > > Thanks,
> > > Tamar
> > >
> > > gcc/ChangeLog:
> > >
> > >     PR tree-optimization/109154
> > >     * config/aarch64/aarch64-sve.md (cond_copysign<mode>): New.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > >     PR tree-optimization/109154
> > >     * gcc.target/aarch64/sve/fneg-abs_5.c: New test.
> > >
> > > --- inline copy of patch --
> > > diff --git a/gcc/config/aarch64/aarch64-sve.md
> > > b/gcc/config/aarch64/aarch64-sve.md
> > > index
> > >
> > 071400c820a5b106ddf9dc9faebb117975d74ea0..00ca30c24624dc661254
> > 568f45b6
> > > 1a14aa11c305 100644
> > > --- a/gcc/config/aarch64/aarch64-sve.md
> > > +++ b/gcc/config/aarch64/aarch64-sve.md
> > > @@ -6429,6 +6429,57 @@ (define_expand "copysign<mode>3"
> > >    }
> > >  )
> > >
> > > +(define_expand "cond_copysign<mode>"
> > > +  [(match_operand:SVE_FULL_F 0 "register_operand")
> > > +   (match_operand:<VPRED> 1 "register_operand")
> > > +   (match_operand:SVE_FULL_F 2 "register_operand")
> > > +   (match_operand:SVE_FULL_F 3 "nonmemory_operand")
> > > +   (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")]
> > > +  "TARGET_SVE"
> > > +  {
> > > +    rtx sign = gen_reg_rtx (<V_INT_EQUIV>mode);
> > > +    rtx mant = gen_reg_rtx (<V_INT_EQUIV>mode);
> > > +    rtx int_res = gen_reg_rtx (<V_INT_EQUIV>mode);
> > > +    int bits = GET_MODE_UNIT_BITSIZE (<MODE>mode) - 1;
> > > +
> > > +    rtx arg2 = lowpart_subreg (<V_INT_EQUIV>mode, operands[2],
> > <MODE>mode);
> > > +    rtx arg3 = lowpart_subreg (<V_INT_EQUIV>mode, operands[3],
> > <MODE>mode);
> > > +    rtx arg4 = lowpart_subreg (<V_INT_EQUIV>mode, operands[4],
> > > + <MODE>mode);
> > > +
> > > +    rtx v_sign_bitmask
> > > +      = aarch64_simd_gen_const_vector_dup (<V_INT_EQUIV>mode,
> > > +                                      HOST_WIDE_INT_M1U << bits);
> > > +
> > > +    /* copysign (x, -1) should instead be expanded as orr with the sign
> > > +       bit.  */
> > > +    if (!REG_P (operands[3]))
> > > +      {
> > > +   auto r0
> > > +     = CONST_DOUBLE_REAL_VALUE (unwrap_const_vec_duplicate
> > (operands[3]));
> > > +   if (-1 == real_to_integer (r0))
> > > +     {
> > > +       arg3 = force_reg (<V_INT_EQUIV>mode, v_sign_bitmask);
> > > +       emit_insn (gen_cond_ior<v_int_equiv> (int_res, operands[1], arg2,
> > > +                                             arg3, arg4));
> > > +       emit_move_insn (operands[0], gen_lowpart (<MODE>mode,
> > int_res));
> > > +       DONE;
> > > +     }
> > > +      }
> > > +
> > > +    operands[2] = force_reg (<MODE>mode, operands[3]);
> > > +    emit_insn (gen_and<v_int_equiv>3 (sign, arg3, v_sign_bitmask));
> > > +    emit_insn (gen_and<v_int_equiv>3
> > > +          (mant, arg2,
> > > +           aarch64_simd_gen_const_vector_dup
> > (<V_INT_EQUIV>mode,
> > > +                                              ~(HOST_WIDE_INT_M1U
> > > +                                                << bits))));
> > > +    emit_insn (gen_cond_ior<v_int_equiv> (int_res, operands[1], sign, mant,
> > > +                                     arg4));
> > > +    emit_move_insn (operands[0], gen_lowpart (<MODE>mode, int_res));
> > > +    DONE;
> > > +  }
> > > +)
> > > +
> > >  (define_expand "xorsign<mode>3"
> > >    [(match_operand:SVE_FULL_F 0 "register_operand")
> > >     (match_operand:SVE_FULL_F 1 "register_operand") diff --git
> > > a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_5.c
> > > b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_5.c
> > > new file mode 100644
> > > index
> > >
> > 0000000000000000000000000000000000000000..f4ecbeecbe1290134e6
> > 88f46a438
> > > 9d17155e4a0a
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_5.c
> > > @@ -0,0 +1,36 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O3" } */
> > > +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } }
> > > +*/
> > > +
> > > +#include <arm_neon.h>
> > > +#include <math.h>
> > > +
> > > +/*
> > > +** f1:
> > > +** ...
> > > +** orr     z[0-9]+.s, p[0-9]+/m, z[0-9]+.s, z[0-9]+.s
> > > +** ...
> > > +*/
> > > +void f1 (float32_t *a, int n)
> > > +{
> > > +  for (int i = 0; i < (n & -8); i++)
> > > +   if (a[i] > n)
> > > +     a[i] = -fabsf (a[i]);
> > > +   else
> > > +     a[i] = n;
> > > +}
> > > +
> > > +/*
> > > +** f2:
> > > +** ...
> > > +** orr     z[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d
> > > +** ...
> > > +*/
> > > +void f2 (float64_t *a, int n)
> > > +{
> > > +  for (int i = 0; i < (n & -8); i++)
> > > +   if (a[i] > n)
> > > +     a[i] = -fabs (a[i]);
> > > +   else
> > > +     a[i] = n;
> > > +}


* RE: [PATCH]AArch64 Add SVE implementation for cond_copysign.
  2023-10-05 20:25     ` Richard Sandiford
@ 2023-10-05 20:45       ` Tamar Christina
  2023-10-06  7:32         ` Richard Biener
  0 siblings, 1 reply; 16+ messages in thread
From: Tamar Christina @ 2023-10-05 20:45 UTC (permalink / raw)
  To: Richard Sandiford
  Cc: gcc-patches, nd, Richard Earnshaw, Marcus Shawcroft, Kyrylo Tkachov

> -----Original Message-----
> From: Richard Sandiford <richard.sandiford@arm.com>
> Sent: Thursday, October 5, 2023 9:26 PM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
> Subject: Re: [PATCH]AArch64 Add SVE implementation for cond_copysign.
> 
> Tamar Christina <Tamar.Christina@arm.com> writes:
> >> -----Original Message-----
> >> From: Richard Sandiford <richard.sandiford@arm.com>
> >> Sent: Thursday, October 5, 2023 8:29 PM
> >> To: Tamar Christina <Tamar.Christina@arm.com>
> >> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> >> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> >> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> <Kyrylo.Tkachov@arm.com>
> >> Subject: Re: [PATCH]AArch64 Add SVE implementation for cond_copysign.
> >>
> >> Tamar Christina <tamar.christina@arm.com> writes:
> >> > Hi All,
> >> >
> >> > This adds an implementation for masked copysign along with an
> >> > optimized pattern for masked copysign (x, -1).
> >>
> >> It feels like we're ending up with a lot of AArch64-specific code
> >> that just hard- codes the observation that changing the sign is
> >> equivalent to changing the top bit.  We then need to make sure that
> >> we choose the best way of changing the top bit for any given situation.
> >>
> >> Hard-coding the -1/negative case is one instance of that.  But it
> >> looks like we also fail to use the best sequence for SVE2.  E.g.
> >> [https://godbolt.org/z/ajh3MM5jv]:
> >>
> >> #include <stdint.h>
> >>
> >> void f(double *restrict a, double *restrict b) {
> >>     for (int i = 0; i < 100; ++i)
> >>         a[i] = __builtin_copysign(a[i], b[i]); }
> >>
> >> void g(uint64_t *restrict a, uint64_t *restrict b, uint64_t c) {
> >>     for (int i = 0; i < 100; ++i)
> >>         a[i] = (a[i] & ~c) | (b[i] & c); }
> >>
> >> gives:
> >>
> >> f:
> >>         mov     x2, 0
> >>         mov     w3, 100
> >>         whilelo p7.d, wzr, w3
> >> .L2:
> >>         ld1d    z30.d, p7/z, [x0, x2, lsl 3]
> >>         ld1d    z31.d, p7/z, [x1, x2, lsl 3]
> >>         and     z30.d, z30.d, #0x7fffffffffffffff
> >>         and     z31.d, z31.d, #0x8000000000000000
> >>         orr     z31.d, z31.d, z30.d
> >>         st1d    z31.d, p7, [x0, x2, lsl 3]
> >>         incd    x2
> >>         whilelo p7.d, w2, w3
> >>         b.any   .L2
> >>         ret
> >> g:
> >>         mov     x3, 0
> >>         mov     w4, 100
> >>         mov     z29.d, x2
> >>         whilelo p7.d, wzr, w4
> >> .L6:
> >>         ld1d    z30.d, p7/z, [x0, x3, lsl 3]
> >>         ld1d    z31.d, p7/z, [x1, x3, lsl 3]
> >>         bsl     z31.d, z31.d, z30.d, z29.d
> >>         st1d    z31.d, p7, [x0, x3, lsl 3]
> >>         incd    x3
> >>         whilelo p7.d, w3, w4
> >>         b.any   .L6
> >>         ret
> >>
> >> I saw that you originally tried to do this in match.pd and that the
> >> decision was to fold to copysign instead.  But perhaps there's a
> >> compromise where isel does something with the (new) copysign canonical
> form?
> >> I.e. could we go with your new version of the match.pd patch, and add
> >> some isel stuff as a follow-on?
> >>
> >
> > Sure if that's what's desired.... But..
> >
> > The example you posted above is for instance worse for x86
> > https://godbolt.org/z/x9ccqxW6T where the first operation has a
> > dependency chain of 2 and the latter of 3.  It's likely any open coding of this
> operation is going to hurt a target.
> >
> > So I'm unsure what isel transform this into...
> 
> I didn't mean that we should go straight to using isel for the general case, just
> for the new case.  The example above was instead trying to show the general
> point that hiding the logic ops in target code is a double-edged sword.

I see... but the problem here is that transforming copysign (x, -1) into
(x | sign_mask) would require an integer operation on an FP value.  I'm happy
to do it, but it seems like it'll be an AArch64-only thing anyway.

If we want to do this we need to check can_change_mode_class or a hook.
Most targets, including x86, reject the conversion.  So it'll effectively just
be an AArch64 thing.

You're right that the actual equivalent transformation is this https://godbolt.org/z/KesfrMv5z
But the target won't allow it.
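
For the record, the bit-level identity being relied on is just the following
(a minimal scalar sketch, assuming IEEE double; in the vector case the OR has
to happen on the FP/SIMD register directly, which is exactly what
can_change_mode_class has to permit):

#include <stdint.h>
#include <string.h>

/* copysign (x, -1.0): force the sign bit of X on.  */
static double
set_sign_bit (double x)
{
  uint64_t bits;
  memcpy (&bits, &x, sizeof bits);   /* reinterpret the double as bits */
  bits |= UINT64_C (1) << 63;        /* set the sign bit */
  memcpy (&x, &bits, sizeof bits);
  return x;
}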

> 
> The x86_64 example for the -1 case would be
> https://godbolt.org/z/b9s6MaKs8 where the isel change would be an
> improvement.  Without that, I guess
> x86_64 will need to have a similar patch to the AArch64 one.
> 

I think that's to be expected.  It's logical that every target just needs to
implement its optabs optimally.

> That said, https://godbolt.org/z/e6nqoqbMh suggests that powerpc64 is
> probably relying on the current copysign -> neg/abs transform.
> (Not sure why the second function uses different IVs from the first.)
> 
> Personally, I wouldn't be against a target hook that indicated whether float bit
> manipulation is "free" for a given mode, if it comes to that.

I'm really not sure Richi would agree there.  Generally speaking, I don't think
people see doing integer bit operations on FP values as beneficial.  But it's
always the case on AArch64.

But sure, if you believe it to be beneficial I can follow up with a patch, but I'd still
need this one to allow the folding of the VEC_PERM_EXPR away.

Thanks,
Tamar

> 
> Thanks,
> Richard


* Re: [PATCH]AArch64 Add SVE implementation for cond_copysign.
  2023-10-05 20:45       ` Tamar Christina
@ 2023-10-06  7:32         ` Richard Biener
  2023-10-07  9:57           ` Richard Sandiford
  0 siblings, 1 reply; 16+ messages in thread
From: Richard Biener @ 2023-10-06  7:32 UTC (permalink / raw)
  To: Tamar Christina
  Cc: Richard Sandiford, gcc-patches, nd, Richard Earnshaw,
	Marcus Shawcroft, Kyrylo Tkachov

On Thu, Oct 5, 2023 at 10:46 PM Tamar Christina <Tamar.Christina@arm.com> wrote:
>
> > -----Original Message-----
> > From: Richard Sandiford <richard.sandiford@arm.com>
> > Sent: Thursday, October 5, 2023 9:26 PM
> > To: Tamar Christina <Tamar.Christina@arm.com>
> > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> > <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> > <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
> > Subject: Re: [PATCH]AArch64 Add SVE implementation for cond_copysign.
> >
> > Tamar Christina <Tamar.Christina@arm.com> writes:
> > >> -----Original Message-----
> > >> From: Richard Sandiford <richard.sandiford@arm.com>
> > >> Sent: Thursday, October 5, 2023 8:29 PM
> > >> To: Tamar Christina <Tamar.Christina@arm.com>
> > >> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> > >> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> > >> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> > <Kyrylo.Tkachov@arm.com>
> > >> Subject: Re: [PATCH]AArch64 Add SVE implementation for cond_copysign.
> > >>
> > >> Tamar Christina <tamar.christina@arm.com> writes:
> > >> > Hi All,
> > >> >
> > >> > This adds an implementation for masked copysign along with an
> > >> > optimized pattern for masked copysign (x, -1).
> > >>
> > >> It feels like we're ending up with a lot of AArch64-specific code
> > >> that just hard- codes the observation that changing the sign is
> > >> equivalent to changing the top bit.  We then need to make sure that
> > >> we choose the best way of changing the top bit for any given situation.
> > >>
> > >> Hard-coding the -1/negative case is one instance of that.  But it
> > >> looks like we also fail to use the best sequence for SVE2.  E.g.
> > >> [https://godbolt.org/z/ajh3MM5jv]:
> > >>
> > >> #include <stdint.h>
> > >>
> > >> void f(double *restrict a, double *restrict b) {
> > >>     for (int i = 0; i < 100; ++i)
> > >>         a[i] = __builtin_copysign(a[i], b[i]); }
> > >>
> > >> void g(uint64_t *restrict a, uint64_t *restrict b, uint64_t c) {
> > >>     for (int i = 0; i < 100; ++i)
> > >>         a[i] = (a[i] & ~c) | (b[i] & c); }
> > >>
> > >> gives:
> > >>
> > >> f:
> > >>         mov     x2, 0
> > >>         mov     w3, 100
> > >>         whilelo p7.d, wzr, w3
> > >> .L2:
> > >>         ld1d    z30.d, p7/z, [x0, x2, lsl 3]
> > >>         ld1d    z31.d, p7/z, [x1, x2, lsl 3]
> > >>         and     z30.d, z30.d, #0x7fffffffffffffff
> > >>         and     z31.d, z31.d, #0x8000000000000000
> > >>         orr     z31.d, z31.d, z30.d
> > >>         st1d    z31.d, p7, [x0, x2, lsl 3]
> > >>         incd    x2
> > >>         whilelo p7.d, w2, w3
> > >>         b.any   .L2
> > >>         ret
> > >> g:
> > >>         mov     x3, 0
> > >>         mov     w4, 100
> > >>         mov     z29.d, x2
> > >>         whilelo p7.d, wzr, w4
> > >> .L6:
> > >>         ld1d    z30.d, p7/z, [x0, x3, lsl 3]
> > >>         ld1d    z31.d, p7/z, [x1, x3, lsl 3]
> > >>         bsl     z31.d, z31.d, z30.d, z29.d
> > >>         st1d    z31.d, p7, [x0, x3, lsl 3]
> > >>         incd    x3
> > >>         whilelo p7.d, w3, w4
> > >>         b.any   .L6
> > >>         ret
> > >>
> > >> I saw that you originally tried to do this in match.pd and that the
> > >> decision was to fold to copysign instead.  But perhaps there's a
> > >> compromise where isel does something with the (new) copysign canonical
> > form?
> > >> I.e. could we go with your new version of the match.pd patch, and add
> > >> some isel stuff as a follow-on?
> > >>
> > >
> > > Sure if that's what's desired.... But..
> > >
> > > The example you posted above is for instance worse for x86
> > > https://godbolt.org/z/x9ccqxW6T where the first operation has a
> > > dependency chain of 2 and the latter of 3.  It's likely any open coding of this
> > operation is going to hurt a target.
> > >
> > > So I'm unsure what isel transform this into...
> >
> > I didn't mean that we should go straight to using isel for the general case, just
> > for the new case.  The example above was instead trying to show the general
> > point that hiding the logic ops in target code is a double-edged sword.
>
> I see.. but the problem here is that transforming copysign (x, -1) into
> (x | 0x8000000) would require an integer operation on an FP value.  I'm happy to
> do it but it seems like it'll be an AArch64 only thing anyway.
>
> If we want to do this we need to check can_change_mode_class or a hook.
> Most targets including x86 reject the conversion.  So it'll just be effectively an AArch64
> thing.
>
> You're right that the actual equivalent transformation is this https://godbolt.org/z/KesfrMv5z
> But the target won't allow it.
>
> >
> > The x86_64 example for the -1 case would be
> > https://godbolt.org/z/b9s6MaKs8 where the isel change would be an
> > improvement.  Without that, I guess
> > x86_64 will need to have a similar patch to the AArch64 one.
> >
>
> I think that's to be expected.  I think it's logical that every target just needs to implement
> their optabs optimally.
>
> > That said, https://godbolt.org/z/e6nqoqbMh suggests that powerpc64 is
> > probably relying on the current copysign -> neg/abs transform.
> > (Not sure why the second function uses different IVs from the first.)
> >
> > Personally, I wouldn't be against a target hook that indicated whether float bit
> > manipulation is "free" for a given mode, if it comes to that.
>
> I'm really sure Richi would agree there.  Generally speaking I don't think people see
> doing FP operations on Int as beneficial.  But it's always the case on AArch64.

IIRC we're doing fpclassify "expansion" early for example.

Note the issue I see is that the middle-end shouldn't get in the way of
targets that have a copysign optab.  In case it's worthwhile to special-case
a "setsign" thing then maybe provide an optab for that as well.  Now, that
doesn't help if we want setsign to be expanded from the middle-end but
still want the copysign optab (and not require targets to implement both
if they want to escape middle-end generic expansion of setsign).

But yes, I also thought the (x, 1) and (x, -1) cases could be special-cased by
RTL expansion (or ISEL).  But it would need to invoke costing, which likely
means target-specific changes anyway... :/

So I have no good advice here, but I had hoped that even the generic
target-specific copysign implementation with AND & XOR would eventually be
optimized later on RTL for a constant second arg?
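
Concretely, the generic expansion and the fold being hoped for look like this
on the 64-bit images (a sketch only, names made up here):

#include <stdint.h>

/* Generic copysign as bit ops: keep X's magnitude bits, take Y's sign.  */
static uint64_t
copysign_bits (uint64_t x_bits, uint64_t y_bits)
{
  return (x_bits & 0x7fffffffffffffffULL) | (y_bits & 0x8000000000000000ULL);
}

/* With Y == -1.0 the second AND is a constant (the sign mask), so
   (X & C1) | C2 with (C1 | C2) == ~0 should simplify to a single OR.  */
static uint64_t
copysign_minus1_bits (uint64_t x_bits)
{
  return x_bits | 0x8000000000000000ULL;
}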

Richard.

> But sure, if you believe it to be beneficial I can follow up with a patch, but I'd still
> need this one to allow the folding of the VEC_PERM_EXPR away.
>
> Thanks,
> Tamar
>
> >
> > Thanks,
> > Richard


* Re: [PATCH]AArch64 Add SVE implementation for cond_copysign.
  2023-10-06  7:32         ` Richard Biener
@ 2023-10-07  9:57           ` Richard Sandiford
  2023-10-09  9:38             ` Tamar Christina
  0 siblings, 1 reply; 16+ messages in thread
From: Richard Sandiford @ 2023-10-07  9:57 UTC (permalink / raw)
  To: Richard Biener
  Cc: Tamar Christina, gcc-patches, nd, Richard Earnshaw,
	Marcus Shawcroft, Kyrylo Tkachov

Richard Biener <richard.guenther@gmail.com> writes:
> On Thu, Oct 5, 2023 at 10:46 PM Tamar Christina <Tamar.Christina@arm.com> wrote:
>>
>> > -----Original Message-----
>> > From: Richard Sandiford <richard.sandiford@arm.com>
>> > Sent: Thursday, October 5, 2023 9:26 PM
>> > To: Tamar Christina <Tamar.Christina@arm.com>
>> > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
>> > <Richard.Earnshaw@arm.com>; Marcus Shawcroft
>> > <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
>> > Subject: Re: [PATCH]AArch64 Add SVE implementation for cond_copysign.
>> >
>> > Tamar Christina <Tamar.Christina@arm.com> writes:
>> > >> -----Original Message-----
>> > >> From: Richard Sandiford <richard.sandiford@arm.com>
>> > >> Sent: Thursday, October 5, 2023 8:29 PM
>> > >> To: Tamar Christina <Tamar.Christina@arm.com>
>> > >> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
>> > >> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
>> > >> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
>> > <Kyrylo.Tkachov@arm.com>
>> > >> Subject: Re: [PATCH]AArch64 Add SVE implementation for cond_copysign.
>> > >>
>> > >> Tamar Christina <tamar.christina@arm.com> writes:
>> > >> > Hi All,
>> > >> >
>> > >> > This adds an implementation for masked copysign along with an
>> > >> > optimized pattern for masked copysign (x, -1).
>> > >>
>> > >> It feels like we're ending up with a lot of AArch64-specific code
>> > >> that just hard- codes the observation that changing the sign is
>> > >> equivalent to changing the top bit.  We then need to make sure that
>> > >> we choose the best way of changing the top bit for any given situation.
>> > >>
>> > >> Hard-coding the -1/negative case is one instance of that.  But it
>> > >> looks like we also fail to use the best sequence for SVE2.  E.g.
>> > >> [https://godbolt.org/z/ajh3MM5jv]:
>> > >>
>> > >> #include <stdint.h>
>> > >>
>> > >> void f(double *restrict a, double *restrict b) {
>> > >>     for (int i = 0; i < 100; ++i)
>> > >>         a[i] = __builtin_copysign(a[i], b[i]); }
>> > >>
>> > >> void g(uint64_t *restrict a, uint64_t *restrict b, uint64_t c) {
>> > >>     for (int i = 0; i < 100; ++i)
>> > >>         a[i] = (a[i] & ~c) | (b[i] & c); }
>> > >>
>> > >> gives:
>> > >>
>> > >> f:
>> > >>         mov     x2, 0
>> > >>         mov     w3, 100
>> > >>         whilelo p7.d, wzr, w3
>> > >> .L2:
>> > >>         ld1d    z30.d, p7/z, [x0, x2, lsl 3]
>> > >>         ld1d    z31.d, p7/z, [x1, x2, lsl 3]
>> > >>         and     z30.d, z30.d, #0x7fffffffffffffff
>> > >>         and     z31.d, z31.d, #0x8000000000000000
>> > >>         orr     z31.d, z31.d, z30.d
>> > >>         st1d    z31.d, p7, [x0, x2, lsl 3]
>> > >>         incd    x2
>> > >>         whilelo p7.d, w2, w3
>> > >>         b.any   .L2
>> > >>         ret
>> > >> g:
>> > >>         mov     x3, 0
>> > >>         mov     w4, 100
>> > >>         mov     z29.d, x2
>> > >>         whilelo p7.d, wzr, w4
>> > >> .L6:
>> > >>         ld1d    z30.d, p7/z, [x0, x3, lsl 3]
>> > >>         ld1d    z31.d, p7/z, [x1, x3, lsl 3]
>> > >>         bsl     z31.d, z31.d, z30.d, z29.d
>> > >>         st1d    z31.d, p7, [x0, x3, lsl 3]
>> > >>         incd    x3
>> > >>         whilelo p7.d, w3, w4
>> > >>         b.any   .L6
>> > >>         ret
>> > >>
>> > >> I saw that you originally tried to do this in match.pd and that the
>> > >> decision was to fold to copysign instead.  But perhaps there's a
>> > >> compromise where isel does something with the (new) copysign canonical
>> > form?
>> > >> I.e. could we go with your new version of the match.pd patch, and add
>> > >> some isel stuff as a follow-on?
>> > >>
>> > >
>> > > Sure if that's what's desired.... But..
>> > >
>> > > The example you posted above is for instance worse for x86
>> > > https://godbolt.org/z/x9ccqxW6T where the first operation has a
>> > > dependency chain of 2 and the latter of 3.  It's likely any open coding of this
>> > operation is going to hurt a target.
>> > >
>> > > So I'm unsure what isel transform this into...
>> >
>> > I didn't mean that we should go straight to using isel for the general case, just
>> > for the new case.  The example above was instead trying to show the general
>> > point that hiding the logic ops in target code is a double-edged sword.
>>
>> I see.. but the problem here is that transforming copysign (x, -1) into
>> (x | 0x8000000) would require an integer operation on an FP value.  I'm happy to
>> do it but it seems like it'll be an AArch64 only thing anyway.
>>
>> If we want to do this we need to check can_change_mode_class or a hook.
>> Most targets including x86 reject the conversion.  So it'll just be effectively an AArch64
>> thing.
>>
>> You're right that the actual equivalent transformation is this https://godbolt.org/z/KesfrMv5z
>> But the target won't allow it.
>>
>> >
>> > The x86_64 example for the -1 case would be
>> > https://godbolt.org/z/b9s6MaKs8 where the isel change would be an
>> > improvement.  Without that, I guess
>> > x86_64 will need to have a similar patch to the AArch64 one.
>> >
>>
>> I think that's to be expected.  I think it's logical that every target just needs to implement
>> their optabs optimally.
>>
>> > That said, https://godbolt.org/z/e6nqoqbMh suggests that powerpc64 is
>> > probably relying on the current copysign -> neg/abs transform.
>> > (Not sure why the second function uses different IVs from the first.)
>> >
>> > Personally, I wouldn't be against a target hook that indicated whether float bit
>> > manipulation is "free" for a given mode, if it comes to that.
>>
>> I'm really sure Richi would agree there.  Generally speaking I don't think people see
>> doing FP operations on Int as beneficial.  But it's always the case on AArch64.
>
> IIRC we're doing fpclassify "expansion" early for example.
>
> Note the issue I see is that the middle-end shouldn't get in the way of
> targets that have a copysign optab.  In case it's worthwhile to special-case
> a "setsign" thing then maybe provide an optab for that as well.  Now, that
> doesn't help if we want setsign to be expanded from the middle-end but
> still wan the copysign optab (and not require targets to implement both
> if they want to escape middle-end generic expansion of setsign).
>
> But yes, I also thought the , 1 and , -1 cases could be special cased by
> RTL expansion (or ISEL).  But it would need to invoke costing which likely
> means target specific changes anyway... :/

FWIW, if we had the target hook I suggested, I don't think AArch64 would
need to implement copysign or xorsign optabs.  That's probably true of
AArch32 too (at least for all combinations that are likely to matter
these days).

I'd go one step further and say that, if a target wants to do its own
thing for copysign and xorsign, it clearly doesn't meet the requirement
that bit manipulation of floats is "free" for that mode.

> So I have no good advice here but I hoped that even the generic target
> specific copysign implementation with and & xor would eventually be
> optimized later on RTL for constant second arg?

Yeah.  It looks like the required logic is there for scalars, it just
needs extending to vectors.

The patch below (untested beyond simple cases) seems to be enough
to fix it, but using the simplify routines even for CONST_INTs
might be controversial.

Thanks,
Richard


diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
index bd9443dbcc2..5a9b1745673 100644
--- a/gcc/simplify-rtx.cc
+++ b/gcc/simplify-rtx.cc
@@ -3409,20 +3409,20 @@ simplify_context::simplify_binary_operation_1 (rtx_code code,
 
       /* Canonicalize (X & C1) | C2.  */
       if (GET_CODE (op0) == AND
-	  && CONST_INT_P (trueop1)
-	  && CONST_INT_P (XEXP (op0, 1)))
+	  && CONSTANT_P (trueop1)
+	  && CONSTANT_P (XEXP (op0, 1)))
 	{
-	  HOST_WIDE_INT mask = GET_MODE_MASK (mode);
-	  HOST_WIDE_INT c1 = INTVAL (XEXP (op0, 1));
-	  HOST_WIDE_INT c2 = INTVAL (trueop1);
+	  rtx c1 = XEXP (op0, 1);
+	  rtx c2 = trueop1;
 
 	  /* If (C1&C2) == C1, then (X&C1)|C2 becomes C2.  */
-	  if ((c1 & c2) == c1
+	  if (rtx_equal_p (simplify_binary_operation (AND, mode, c1, c2), c1)
 	      && !side_effects_p (XEXP (op0, 0)))
 	    return trueop1;
 
 	  /* If (C1|C2) == ~0 then (X&C1)|C2 becomes X|C2.  */
-	  if (((c1|c2) & mask) == mask)
+	  if (rtx_equal_p (simplify_binary_operation (IOR, mode, c1, c2),
+			   CONSTM1_RTX (mode)))
 	    return simplify_gen_binary (IOR, mode, XEXP (op0, 0), op1);
 	}
 
@@ -3732,7 +3732,7 @@ simplify_context::simplify_binary_operation_1 (rtx_code code,
 	 machines, and also has shorter instruction path length.  */
       if (GET_CODE (op0) == AND
 	  && GET_CODE (XEXP (op0, 0)) == XOR
-	  && CONST_INT_P (XEXP (op0, 1))
+	  && CONSTANT_P (XEXP (op0, 1))
 	  && rtx_equal_p (XEXP (XEXP (op0, 0), 0), trueop1))
 	{
 	  rtx a = trueop1;
@@ -3746,7 +3746,7 @@ simplify_context::simplify_binary_operation_1 (rtx_code code,
       /* Similarly, (xor (and (xor A B) C) B) as (ior (and A C) (and B ~C))  */
       else if (GET_CODE (op0) == AND
 	  && GET_CODE (XEXP (op0, 0)) == XOR
-	  && CONST_INT_P (XEXP (op0, 1))
+	  && CONSTANT_P (XEXP (op0, 1))
 	  && rtx_equal_p (XEXP (XEXP (op0, 0), 1), trueop1))
 	{
 	  rtx a = XEXP (XEXP (op0, 0), 0);
-- 
2.25.1



^ permalink raw reply	[flat|nested] 16+ messages in thread

* RE: [PATCH]AArch64 Add SVE implementation for cond_copysign.
  2023-10-07  9:57           ` Richard Sandiford
@ 2023-10-09  9:38             ` Tamar Christina
  2023-10-09  9:45               ` Richard Biener
  2023-10-09  9:56               ` Richard Sandiford
  0 siblings, 2 replies; 16+ messages in thread
From: Tamar Christina @ 2023-10-09  9:38 UTC (permalink / raw)
  To: Richard Sandiford, Richard Biener
  Cc: gcc-patches, nd, Richard Earnshaw, Marcus Shawcroft, Kyrylo Tkachov

> -----Original Message-----
> From: Richard Sandiford <richard.sandiford@arm.com>
> Sent: Saturday, October 7, 2023 10:58 AM
> To: Richard Biener <richard.guenther@gmail.com>
> Cc: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org;
> nd <nd@arm.com>; Richard Earnshaw <Richard.Earnshaw@arm.com>;
> Marcus Shawcroft <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> <Kyrylo.Tkachov@arm.com>
> Subject: Re: [PATCH]AArch64 Add SVE implementation for cond_copysign.
> 
> Richard Biener <richard.guenther@gmail.com> writes:
> > On Thu, Oct 5, 2023 at 10:46 PM Tamar Christina
> <Tamar.Christina@arm.com> wrote:
> >>
> >> > -----Original Message-----
> >> > From: Richard Sandiford <richard.sandiford@arm.com>
> >> > Sent: Thursday, October 5, 2023 9:26 PM
> >> > To: Tamar Christina <Tamar.Christina@arm.com>
> >> > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> >> > <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> >> > <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> <Kyrylo.Tkachov@arm.com>
> >> > Subject: Re: [PATCH]AArch64 Add SVE implementation for
> cond_copysign.
> >> >
> >> > Tamar Christina <Tamar.Christina@arm.com> writes:
> >> > >> -----Original Message-----
> >> > >> From: Richard Sandiford <richard.sandiford@arm.com>
> >> > >> Sent: Thursday, October 5, 2023 8:29 PM
> >> > >> To: Tamar Christina <Tamar.Christina@arm.com>
> >> > >> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> >> > >> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> >> > >> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> >> > <Kyrylo.Tkachov@arm.com>
> >> > >> Subject: Re: [PATCH]AArch64 Add SVE implementation for
> cond_copysign.
> >> > >>
> >> > >> Tamar Christina <tamar.christina@arm.com> writes:
> >> > >> > Hi All,
> >> > >> >
> >> > >> > This adds an implementation for masked copysign along with an
> >> > >> > optimized pattern for masked copysign (x, -1).
> >> > >>
> >> > >> It feels like we're ending up with a lot of AArch64-specific
> >> > >> code that just hard- codes the observation that changing the
> >> > >> sign is equivalent to changing the top bit.  We then need to
> >> > >> make sure that we choose the best way of changing the top bit for any
> given situation.
> >> > >>
> >> > >> Hard-coding the -1/negative case is one instance of that.  But
> >> > >> it looks like we also fail to use the best sequence for SVE2.  E.g.
> >> > >> [https://godbolt.org/z/ajh3MM5jv]:
> >> > >>
> >> > >> #include <stdint.h>
> >> > >>
> >> > >> void f(double *restrict a, double *restrict b) {
> >> > >>     for (int i = 0; i < 100; ++i)
> >> > >>         a[i] = __builtin_copysign(a[i], b[i]); }
> >> > >>
> >> > >> void g(uint64_t *restrict a, uint64_t *restrict b, uint64_t c) {
> >> > >>     for (int i = 0; i < 100; ++i)
> >> > >>         a[i] = (a[i] & ~c) | (b[i] & c); }
> >> > >>
> >> > >> gives:
> >> > >>
> >> > >> f:
> >> > >>         mov     x2, 0
> >> > >>         mov     w3, 100
> >> > >>         whilelo p7.d, wzr, w3
> >> > >> .L2:
> >> > >>         ld1d    z30.d, p7/z, [x0, x2, lsl 3]
> >> > >>         ld1d    z31.d, p7/z, [x1, x2, lsl 3]
> >> > >>         and     z30.d, z30.d, #0x7fffffffffffffff
> >> > >>         and     z31.d, z31.d, #0x8000000000000000
> >> > >>         orr     z31.d, z31.d, z30.d
> >> > >>         st1d    z31.d, p7, [x0, x2, lsl 3]
> >> > >>         incd    x2
> >> > >>         whilelo p7.d, w2, w3
> >> > >>         b.any   .L2
> >> > >>         ret
> >> > >> g:
> >> > >>         mov     x3, 0
> >> > >>         mov     w4, 100
> >> > >>         mov     z29.d, x2
> >> > >>         whilelo p7.d, wzr, w4
> >> > >> .L6:
> >> > >>         ld1d    z30.d, p7/z, [x0, x3, lsl 3]
> >> > >>         ld1d    z31.d, p7/z, [x1, x3, lsl 3]
> >> > >>         bsl     z31.d, z31.d, z30.d, z29.d
> >> > >>         st1d    z31.d, p7, [x0, x3, lsl 3]
> >> > >>         incd    x3
> >> > >>         whilelo p7.d, w3, w4
> >> > >>         b.any   .L6
> >> > >>         ret
> >> > >>
> >> > >> I saw that you originally tried to do this in match.pd and that
> >> > >> the decision was to fold to copysign instead.  But perhaps
> >> > >> there's a compromise where isel does something with the (new)
> >> > >> copysign canonical
> >> > form?
> >> > >> I.e. could we go with your new version of the match.pd patch,
> >> > >> and add some isel stuff as a follow-on?
> >> > >>
> >> > >
> >> > > Sure if that's what's desired.... But..
> >> > >
> >> > > The example you posted above is for instance worse for x86
> >> > > https://godbolt.org/z/x9ccqxW6T where the first operation has a
> >> > > dependency chain of 2 and the latter of 3.  It's likely any open
> >> > > coding of this
> >> > operation is going to hurt a target.
> >> > >
> >> > > So I'm unsure what isel transform this into...
> >> >
> >> > I didn't mean that we should go straight to using isel for the
> >> > general case, just for the new case.  The example above was instead
> >> > trying to show the general point that hiding the logic ops in target code is
> a double-edged sword.
> >>
> >> I see.. but the problem here is that transforming copysign (x, -1)
> >> into (x | 0x8000000) would require an integer operation on an FP
> >> value.  I'm happy to do it but it seems like it'll be an AArch64 only thing
> anyway.
> >>
> >> If we want to do this we need to check can_change_mode_class or a hook.
> >> Most targets including x86 reject the conversion.  So it'll just be
> >> effectively an AArch64 thing.
> >>
> >> You're right that the actual equivalent transformation is this
> >> https://godbolt.org/z/KesfrMv5z But the target won't allow it.
> >>
> >> >
> >> > The x86_64 example for the -1 case would be
> >> > https://godbolt.org/z/b9s6MaKs8 where the isel change would be an
> >> > improvement.  Without that, I guess
> >> > x86_64 will need to have a similar patch to the AArch64 one.
> >> >
> >>
> >> I think that's to be expected.  I think it's logical that every
> >> target just needs to implement their optabs optimally.
> >>
> >> > That said, https://godbolt.org/z/e6nqoqbMh suggests that powerpc64
> >> > is probably relying on the current copysign -> neg/abs transform.
> >> > (Not sure why the second function uses different IVs from the
> >> > first.)
> >> >
> >> > Personally, I wouldn't be against a target hook that indicated
> >> > whether float bit manipulation is "free" for a given mode, if it comes to
> that.
> >>
> >> I'm really sure Richi would agree there.  Generally speaking I don't
> >> think people see doing FP operations on Int as beneficial.  But it's always
> the case on AArch64.
> >
> > IIRC we're doing fpclassify "expansion" early for example.
> >
> > Note the issue I see is that the middle-end shouldn't get in the way
> > of targets that have a copysign optab.  In case it's worthwhile to
> > special-case a "setsign" thing then maybe provide an optab for that as
> > well.  Now, that doesn't help if we want setsign to be expanded from
> > the middle-end but still want the copysign optab (and not require
> > targets to implement both if they want to escape middle-end generic
> expansion of setsign).
> >
> > But yes, I also thought the , 1 and , -1 cases could be special cased
> > by RTL expansion (or ISEL).  But it would need to invoke costing which
> > likely means target specific changes anyway... :/
> 
> FWIW, if we had the target hook I suggested, I don't think AArch64 would
> need to implement copysign or xorsign optabs.  That's probably true of
> AArch32 too (at least for all combinations that are likely to matter these days).
> 
> I'd go one step further and say that, if a target wants to do its own thing for
> copysign and xorsign, it clearly doesn't meet the requirement that bit
> manipulation of floats is "free" for that mode.
> 

So I'm aware I have no say here, but I'll make one last effort.

The patch isn't just implementing the fneg (fabs ()) optimization just because.
The location where it's implemented makes a big difference.

If this optimization is done late, it doesn't fix the regression fully, because doing
it after all optimization passes have run means it can't properly be optimized.

Doing the rewrite early to ORR accomplished two things:

1. It makes PRE realize that the block it's splitting would only have 1 instruction in it
    and that such a split is not beneficial.  This is why I'm against doing such optimizations
    so late. Optimizations don't happen in isolation, and when they make sense should
    happen early.  Transforming fneg (fabs (..)) in isel not only feels wrong but results in a 4%
    performance loss.

2. Doing it early also gets the ORRs to be reassociated changing where the loop dependent
     variable lands.  Early makes it land in the merging MOVPRFX rather than requiring a SEL
     at the end of the iteration.

Replacing the fneg (fabs (..)) with copysign accomplishes 1 but not 2.  This results in a 2-3% loss,
but I can live with that given doing 1 gets us back to GCC 12 levels.

Doing fneg (fabs (..)) in isel would have no meaning for me and not fix the regression.  I won't be
looking to do that in that case.

If it's acceptable I can transform COPYSIGN (X, -1) in gimple-isel.
So before I start on this, would this be acceptable for you?
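
For concreteness, the identity the isel rewrite would rely on (an untested
scalar sketch of the bit pattern; the real transform would of course be on
the gimple/vector form):

#include <stdint.h>
#include <string.h>

/* copysign (x, -1.0) only ever forces the sign bit on, so it is the same
   as OR-ing x with the sign mask -- the ORR form discussed above.  */
double
setsign_via_orr (double x)
{
  uint64_t bits;
  memcpy (&bits, &x, sizeof bits);
  bits |= UINT64_C (1) << 63;
  memcpy (&x, &bits, sizeof bits);
  return x;
}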

Thanks,
Tamar

> > So I have no good advice here but I hoped that even the generic target
> > specific copysign implementation with and & xor would eventually be
> > optimized later on RTL for constant second arg?
> 
> Yeah.  It looks like the required logic is there for scalars, it just needs extending
> to vectors.
> 
> The patch below (untested beyond simple cases) seems to be enough to fix it,
> but using the simplify routines even for CONST_INTs might be controversial.
> 
> Thanks,
> Richard
> 
> 
> diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc index
> bd9443dbcc2..5a9b1745673 100644
> --- a/gcc/simplify-rtx.cc
> +++ b/gcc/simplify-rtx.cc
> @@ -3409,20 +3409,20 @@ simplify_context::simplify_binary_operation_1
> (rtx_code code,
> 
>        /* Canonicalize (X & C1) | C2.  */
>        if (GET_CODE (op0) == AND
> -	  && CONST_INT_P (trueop1)
> -	  && CONST_INT_P (XEXP (op0, 1)))
> +	  && CONSTANT_P (trueop1)
> +	  && CONSTANT_P (XEXP (op0, 1)))
>  	{
> -	  HOST_WIDE_INT mask = GET_MODE_MASK (mode);
> -	  HOST_WIDE_INT c1 = INTVAL (XEXP (op0, 1));
> -	  HOST_WIDE_INT c2 = INTVAL (trueop1);
> +	  rtx c1 = XEXP (op0, 1);
> +	  rtx c2 = trueop1;
> 
>  	  /* If (C1&C2) == C1, then (X&C1)|C2 becomes C2.  */
> -	  if ((c1 & c2) == c1
> +	  if (rtx_equal_p (simplify_binary_operation (AND, mode, c1, c2), c1)
>  	      && !side_effects_p (XEXP (op0, 0)))
>  	    return trueop1;
> 
>  	  /* If (C1|C2) == ~0 then (X&C1)|C2 becomes X|C2.  */
> -	  if (((c1|c2) & mask) == mask)
> +	  if (rtx_equal_p (simplify_binary_operation (IOR, mode, c1, c2),
> +			   CONSTM1_RTX (mode)))
>  	    return simplify_gen_binary (IOR, mode, XEXP (op0, 0), op1);
>  	}
> 
> @@ -3732,7 +3732,7 @@ simplify_context::simplify_binary_operation_1
> (rtx_code code,
>  	 machines, and also has shorter instruction path length.  */
>        if (GET_CODE (op0) == AND
>  	  && GET_CODE (XEXP (op0, 0)) == XOR
> -	  && CONST_INT_P (XEXP (op0, 1))
> +	  && CONSTANT_P (XEXP (op0, 1))
>  	  && rtx_equal_p (XEXP (XEXP (op0, 0), 0), trueop1))
>  	{
>  	  rtx a = trueop1;
> @@ -3746,7 +3746,7 @@ simplify_context::simplify_binary_operation_1
> (rtx_code code,
>        /* Similarly, (xor (and (xor A B) C) B) as (ior (and A C) (and B ~C))  */
>        else if (GET_CODE (op0) == AND
>  	  && GET_CODE (XEXP (op0, 0)) == XOR
> -	  && CONST_INT_P (XEXP (op0, 1))
> +	  && CONSTANT_P (XEXP (op0, 1))
>  	  && rtx_equal_p (XEXP (XEXP (op0, 0), 1), trueop1))
>  	{
>  	  rtx a = XEXP (XEXP (op0, 0), 0);
> --
> 2.25.1
> 


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH]AArch64 Add SVE implementation for cond_copysign.
  2023-10-09  9:38             ` Tamar Christina
@ 2023-10-09  9:45               ` Richard Biener
  2023-10-09  9:55                 ` Tamar Christina
  2023-10-09  9:56               ` Richard Sandiford
  1 sibling, 1 reply; 16+ messages in thread
From: Richard Biener @ 2023-10-09  9:45 UTC (permalink / raw)
  To: Tamar Christina
  Cc: Richard Sandiford, gcc-patches, nd, Richard Earnshaw,
	Marcus Shawcroft, Kyrylo Tkachov

On Mon, Oct 9, 2023 at 11:39 AM Tamar Christina <Tamar.Christina@arm.com> wrote:
>
> > -----Original Message-----
> > From: Richard Sandiford <richard.sandiford@arm.com>
> > Sent: Saturday, October 7, 2023 10:58 AM
> > To: Richard Biener <richard.guenther@gmail.com>
> > Cc: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org;
> > nd <nd@arm.com>; Richard Earnshaw <Richard.Earnshaw@arm.com>;
> > Marcus Shawcroft <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> > <Kyrylo.Tkachov@arm.com>
> > Subject: Re: [PATCH]AArch64 Add SVE implementation for cond_copysign.
> >
> > Richard Biener <richard.guenther@gmail.com> writes:
> > > On Thu, Oct 5, 2023 at 10:46 PM Tamar Christina
> > <Tamar.Christina@arm.com> wrote:
> > >>
> > >> > -----Original Message-----
> > >> > From: Richard Sandiford <richard.sandiford@arm.com>
> > >> > Sent: Thursday, October 5, 2023 9:26 PM
> > >> > To: Tamar Christina <Tamar.Christina@arm.com>
> > >> > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> > >> > <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> > >> > <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> > <Kyrylo.Tkachov@arm.com>
> > >> > Subject: Re: [PATCH]AArch64 Add SVE implementation for
> > cond_copysign.
> > >> >
> > >> > Tamar Christina <Tamar.Christina@arm.com> writes:
> > >> > >> -----Original Message-----
> > >> > >> From: Richard Sandiford <richard.sandiford@arm.com>
> > >> > >> Sent: Thursday, October 5, 2023 8:29 PM
> > >> > >> To: Tamar Christina <Tamar.Christina@arm.com>
> > >> > >> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> > >> > >> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> > >> > >> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> > >> > <Kyrylo.Tkachov@arm.com>
> > >> > >> Subject: Re: [PATCH]AArch64 Add SVE implementation for
> > cond_copysign.
> > >> > >>
> > >> > >> Tamar Christina <tamar.christina@arm.com> writes:
> > >> > >> > Hi All,
> > >> > >> >
> > >> > >> > This adds an implementation for masked copysign along with an
> > >> > >> > optimized pattern for masked copysign (x, -1).
> > >> > >>
> > >> > >> It feels like we're ending up with a lot of AArch64-specific
> > >> > >> code that just hard- codes the observation that changing the
> > >> > >> sign is equivalent to changing the top bit.  We then need to
> > >> > >> make sure that we choose the best way of changing the top bit for any
> > given situation.
> > >> > >>
> > >> > >> Hard-coding the -1/negative case is one instance of that.  But
> > >> > >> it looks like we also fail to use the best sequence for SVE2.  E.g.
> > >> > >> [https://godbolt.org/z/ajh3MM5jv]:
> > >> > >>
> > >> > >> #include <stdint.h>
> > >> > >>
> > >> > >> void f(double *restrict a, double *restrict b) {
> > >> > >>     for (int i = 0; i < 100; ++i)
> > >> > >>         a[i] = __builtin_copysign(a[i], b[i]); }
> > >> > >>
> > >> > >> void g(uint64_t *restrict a, uint64_t *restrict b, uint64_t c) {
> > >> > >>     for (int i = 0; i < 100; ++i)
> > >> > >>         a[i] = (a[i] & ~c) | (b[i] & c); }
> > >> > >>
> > >> > >> gives:
> > >> > >>
> > >> > >> f:
> > >> > >>         mov     x2, 0
> > >> > >>         mov     w3, 100
> > >> > >>         whilelo p7.d, wzr, w3
> > >> > >> .L2:
> > >> > >>         ld1d    z30.d, p7/z, [x0, x2, lsl 3]
> > >> > >>         ld1d    z31.d, p7/z, [x1, x2, lsl 3]
> > >> > >>         and     z30.d, z30.d, #0x7fffffffffffffff
> > >> > >>         and     z31.d, z31.d, #0x8000000000000000
> > >> > >>         orr     z31.d, z31.d, z30.d
> > >> > >>         st1d    z31.d, p7, [x0, x2, lsl 3]
> > >> > >>         incd    x2
> > >> > >>         whilelo p7.d, w2, w3
> > >> > >>         b.any   .L2
> > >> > >>         ret
> > >> > >> g:
> > >> > >>         mov     x3, 0
> > >> > >>         mov     w4, 100
> > >> > >>         mov     z29.d, x2
> > >> > >>         whilelo p7.d, wzr, w4
> > >> > >> .L6:
> > >> > >>         ld1d    z30.d, p7/z, [x0, x3, lsl 3]
> > >> > >>         ld1d    z31.d, p7/z, [x1, x3, lsl 3]
> > >> > >>         bsl     z31.d, z31.d, z30.d, z29.d
> > >> > >>         st1d    z31.d, p7, [x0, x3, lsl 3]
> > >> > >>         incd    x3
> > >> > >>         whilelo p7.d, w3, w4
> > >> > >>         b.any   .L6
> > >> > >>         ret
> > >> > >>
> > >> > >> I saw that you originally tried to do this in match.pd and that
> > >> > >> the decision was to fold to copysign instead.  But perhaps
> > >> > >> there's a compromise where isel does something with the (new)
> > >> > >> copysign canonical
> > >> > form?
> > >> > >> I.e. could we go with your new version of the match.pd patch,
> > >> > >> and add some isel stuff as a follow-on?
> > >> > >>
> > >> > >
> > >> > > Sure if that's what's desired.... But..
> > >> > >
> > >> > > The example you posted above is for instance worse for x86
> > >> > > https://godbolt.org/z/x9ccqxW6T where the first operation has a
> > >> > > dependency chain of 2 and the latter of 3.  It's likely any open
> > >> > > coding of this
> > >> > operation is going to hurt a target.
> > >> > >
> > >> > > So I'm unsure what isel transform this into...
> > >> >
> > >> > I didn't mean that we should go straight to using isel for the
> > >> > general case, just for the new case.  The example above was instead
> > >> > trying to show the general point that hiding the logic ops in target code is
> > a double-edged sword.
> > >>
> > >> I see.. but the problem here is that transforming copysign (x, -1)
> > >> into (x | 0x8000000) would require an integer operation on an FP
> > >> value.  I'm happy to do it but it seems like it'll be an AArch64 only thing
> > anyway.
> > >>
> > >> If we want to do this we need to check can_change_mode_class or a hook.
> > >> Most targets including x86 reject the conversion.  So it'll just be
> > >> effectively an AArch64 thing.
> > >>
> > >> You're right that the actual equivalent transformation is this
> > >> https://godbolt.org/z/KesfrMv5z But the target won't allow it.
> > >>
> > >> >
> > >> > The x86_64 example for the -1 case would be
> > >> > https://godbolt.org/z/b9s6MaKs8 where the isel change would be an
> > >> > improvement.  Without that, I guess
> > >> > x86_64 will need to have a similar patch to the AArch64 one.
> > >> >
> > >>
> > >> I think that's to be expected.  I think it's logical that every
> > >> target just needs to implement their optabs optimally.
> > >>
> > >> > That said, https://godbolt.org/z/e6nqoqbMh suggests that powerpc64
> > >> > is probably relying on the current copysign -> neg/abs transform.
> > >> > (Not sure why the second function uses different IVs from the
> > >> > first.)
> > >> >
> > >> > Personally, I wouldn't be against a target hook that indicated
> > >> > whether float bit manipulation is "free" for a given mode, if it comes to
> > that.
> > >>
> > >> I'm really sure Richi would agree there.  Generally speaking I don't
> > >> think people see doing FP operations on Int as beneficial.  But it's always
> > the case on AArch64.
> > >
> > > IIRC we're doing fpclassify "expansion" early for example.
> > >
> > > Note the issue I see is that the middle-end shouldn't get in the way
> > > of targets that have a copysign optab.  In case it's worthwhile to
> > > special-case a "setsign" thing then maybe provide an optab for that as
> > > well.  Now, that doesn't help if we want setsign to be expanded from
> > > the middle-end but still want the copysign optab (and not require
> > > targets to implement both if they want to escape middle-end generic
> > expansion of setsign).
> > >
> > > But yes, I also thought the , 1 and , -1 cases could be special cased
> > > by RTL expansion (or ISEL).  But it would need to invoke costing which
> > > likely means target specific changes anyway... :/
> >
> > FWIW, if we had the target hook I suggested, I don't think AArch64 would
> > need to implement copysign or xorsign optabs.  That's probably true of
> > AArch32 too (at least for all combinations that are likely to matter these days).
> >
> > I'd go one step further and say that, if a target wants to do its own thing for
> > copysign and xorsign, it clearly doesn't meet the requirement that bit
> > manipulation of floats is "free" for that mode.
> >
>
> So I'm aware I have no say here, but I'll make one last effort.
>
> The patch isn't just implementing the fneg (fabs ()) optimization just because.
> The location where it's implemented makes a big difference.
>
> If this optimization is done late, it doesn't fix the regression fully, because doing
> it after all optimization passes have run means it can't properly be optimized.
>
> Doing the rewrite early to ORR accomplished two things:
>
> 1. It makes PRE realize that the block it's splitting would only have 1 instruction in it
>     and that such a split is not beneficial.  This is why I'm against doing such optimizations
>     so late. Optimizations don't happen in isolation, and when they make sense should
>     happen early.  Transforming fneg (fabs (..)) in isel not only feels wrong but results in a 4%
>     performance loss.
>
> 2. Doing it early also gets the ORRs to be reassociated changing where the loop dependent
>      variable lands.  Early makes it land in the merging MOVPRFX rather than requiring a SEL
>      at the end of the iteration.
>
> Replacing the fneg (fabs (..)) with copysign accomplishes 1 but not 2.  This results in a 2-3% loss,
> but I can live with that given doing 1 gets us back to GCC 12 levels.
>
> Doing fneg (fabs (..)) in isel would have no meaning for me and not fix the regression.  I won't be
> looking to do that in that case.
>
> If it's acceptable I can transform COPYSIGN (X, -1) in gimple-isel.
> So before I start on this, would this be acceptable for you?

Since copysign (x, -1) is a single statement you can just massage the
internal function expander?
Or the generic expand_copysign which already has a bit operation
fallback.  The question is
what 'copysign' to use during folding of fneg (fabs (...)) when you
remove the backend
expander (because then IFN_COPYSIGN isn't directly expandable ...)
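
Roughly (a C-level sketch of the bit-operation fallback, not the actual
expander code), the generic path amounts to:

#include <stdint.h>
#include <string.h>

/* Clear x's sign bit, isolate y's sign bit, OR the two together.
   With y == -1.0 the (yb & sign_mask) term is a compile-time constant
   equal to sign_mask, so the whole thing can fold to a single OR.  */
double
copysign_fallback (double x, double y)
{
  const uint64_t sign_mask = UINT64_C (1) << 63;
  uint64_t xb, yb;
  memcpy (&xb, &x, sizeof xb);
  memcpy (&yb, &y, sizeof yb);
  xb = (xb & ~sign_mask) | (yb & sign_mask);
  memcpy (&x, &xb, sizeof xb);
  return x;
}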

Richard.

>
> Thanks,
> Tamar
>
> > > So I have no good advice here but I hoped that even the generic target
> > > specific copysign implementation with and & xor would eventually be
> > > optimized later on RTL for constant second arg?
> >
> > Yeah.  It looks like the required logic is there for scalars, it just needs extending
> > to vectors.
> >
> > The patch below (untested beyond simple cases) seems to be enough to fix it,
> > but using the simplify routines even for CONST_INTs might be controversial.
> >
> > Thanks,
> > Richard
> >
> >
> > diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc index
> > bd9443dbcc2..5a9b1745673 100644
> > --- a/gcc/simplify-rtx.cc
> > +++ b/gcc/simplify-rtx.cc
> > @@ -3409,20 +3409,20 @@ simplify_context::simplify_binary_operation_1
> > (rtx_code code,
> >
> >        /* Canonicalize (X & C1) | C2.  */
> >        if (GET_CODE (op0) == AND
> > -       && CONST_INT_P (trueop1)
> > -       && CONST_INT_P (XEXP (op0, 1)))
> > +       && CONSTANT_P (trueop1)
> > +       && CONSTANT_P (XEXP (op0, 1)))
> >       {
> > -       HOST_WIDE_INT mask = GET_MODE_MASK (mode);
> > -       HOST_WIDE_INT c1 = INTVAL (XEXP (op0, 1));
> > -       HOST_WIDE_INT c2 = INTVAL (trueop1);
> > +       rtx c1 = XEXP (op0, 1);
> > +       rtx c2 = trueop1;
> >
> >         /* If (C1&C2) == C1, then (X&C1)|C2 becomes C2.  */
> > -       if ((c1 & c2) == c1
> > +       if (rtx_equal_p (simplify_binary_operation (AND, mode, c1, c2), c1)
> >             && !side_effects_p (XEXP (op0, 0)))
> >           return trueop1;
> >
> >         /* If (C1|C2) == ~0 then (X&C1)|C2 becomes X|C2.  */
> > -       if (((c1|c2) & mask) == mask)
> > +       if (rtx_equal_p (simplify_binary_operation (IOR, mode, c1, c2),
> > +                        CONSTM1_RTX (mode)))
> >           return simplify_gen_binary (IOR, mode, XEXP (op0, 0), op1);
> >       }
> >
> > @@ -3732,7 +3732,7 @@ simplify_context::simplify_binary_operation_1
> > (rtx_code code,
> >        machines, and also has shorter instruction path length.  */
> >        if (GET_CODE (op0) == AND
> >         && GET_CODE (XEXP (op0, 0)) == XOR
> > -       && CONST_INT_P (XEXP (op0, 1))
> > +       && CONSTANT_P (XEXP (op0, 1))
> >         && rtx_equal_p (XEXP (XEXP (op0, 0), 0), trueop1))
> >       {
> >         rtx a = trueop1;
> > @@ -3746,7 +3746,7 @@ simplify_context::simplify_binary_operation_1
> > (rtx_code code,
> >        /* Similarly, (xor (and (xor A B) C) B) as (ior (and A C) (and B ~C))  */
> >        else if (GET_CODE (op0) == AND
> >         && GET_CODE (XEXP (op0, 0)) == XOR
> > -       && CONST_INT_P (XEXP (op0, 1))
> > +       && CONSTANT_P (XEXP (op0, 1))
> >         && rtx_equal_p (XEXP (XEXP (op0, 0), 1), trueop1))
> >       {
> >         rtx a = XEXP (XEXP (op0, 0), 0);
> > --
> > 2.25.1
> >
>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* RE: [PATCH]AArch64 Add SVE implementation for cond_copysign.
  2023-10-09  9:45               ` Richard Biener
@ 2023-10-09  9:55                 ` Tamar Christina
  0 siblings, 0 replies; 16+ messages in thread
From: Tamar Christina @ 2023-10-09  9:55 UTC (permalink / raw)
  To: Richard Biener
  Cc: Richard Sandiford, gcc-patches, nd, Richard Earnshaw,
	Marcus Shawcroft, Kyrylo Tkachov

> -----Original Message-----
> From: Richard Biener <richard.guenther@gmail.com>
> Sent: Monday, October 9, 2023 10:45 AM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: Richard Sandiford <Richard.Sandiford@arm.com>; gcc-
> patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
> Subject: Re: [PATCH]AArch64 Add SVE implementation for cond_copysign.
> 
> On Mon, Oct 9, 2023 at 11:39 AM Tamar Christina
> <Tamar.Christina@arm.com> wrote:
> >
> > > -----Original Message-----
> > > From: Richard Sandiford <richard.sandiford@arm.com>
> > > Sent: Saturday, October 7, 2023 10:58 AM
> > > To: Richard Biener <richard.guenther@gmail.com>
> > > Cc: Tamar Christina <Tamar.Christina@arm.com>;
> > > gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> > > <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> > > <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> <Kyrylo.Tkachov@arm.com>
> > > Subject: Re: [PATCH]AArch64 Add SVE implementation for cond_copysign.
> > >
> > > Richard Biener <richard.guenther@gmail.com> writes:
> > > > On Thu, Oct 5, 2023 at 10:46 PM Tamar Christina
> > > <Tamar.Christina@arm.com> wrote:
> > > >>
> > > >> > -----Original Message-----
> > > >> > From: Richard Sandiford <richard.sandiford@arm.com>
> > > >> > Sent: Thursday, October 5, 2023 9:26 PM
> > > >> > To: Tamar Christina <Tamar.Christina@arm.com>
> > > >> > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> > > >> > <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> > > >> > <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> > > <Kyrylo.Tkachov@arm.com>
> > > >> > Subject: Re: [PATCH]AArch64 Add SVE implementation for
> > > cond_copysign.
> > > >> >
> > > >> > Tamar Christina <Tamar.Christina@arm.com> writes:
> > > >> > >> -----Original Message-----
> > > >> > >> From: Richard Sandiford <richard.sandiford@arm.com>
> > > >> > >> Sent: Thursday, October 5, 2023 8:29 PM
> > > >> > >> To: Tamar Christina <Tamar.Christina@arm.com>
> > > >> > >> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard
> > > >> > >> Earnshaw <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> > > >> > >> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> > > >> > <Kyrylo.Tkachov@arm.com>
> > > >> > >> Subject: Re: [PATCH]AArch64 Add SVE implementation for
> > > cond_copysign.
> > > >> > >>
> > > >> > >> Tamar Christina <tamar.christina@arm.com> writes:
> > > >> > >> > Hi All,
> > > >> > >> >
> > > >> > >> > This adds an implementation for masked copysign along with
> > > >> > >> > an optimized pattern for masked copysign (x, -1).
> > > >> > >>
> > > >> > >> It feels like we're ending up with a lot of AArch64-specific
> > > >> > >> code that just hard- codes the observation that changing the
> > > >> > >> sign is equivalent to changing the top bit.  We then need to
> > > >> > >> make sure that we choose the best way of changing the top
> > > >> > >> bit for any
> > > given situation.
> > > >> > >>
> > > >> > >> Hard-coding the -1/negative case is one instance of that.
> > > >> > >> But it looks like we also fail to use the best sequence for SVE2.  E.g.
> > > >> > >> [https://godbolt.org/z/ajh3MM5jv]:
> > > >> > >>
> > > >> > >> #include <stdint.h>
> > > >> > >>
> > > >> > >> void f(double *restrict a, double *restrict b) {
> > > >> > >>     for (int i = 0; i < 100; ++i)
> > > >> > >>         a[i] = __builtin_copysign(a[i], b[i]); }
> > > >> > >>
> > > >> > >> void g(uint64_t *restrict a, uint64_t *restrict b, uint64_t c) {
> > > >> > >>     for (int i = 0; i < 100; ++i)
> > > >> > >>         a[i] = (a[i] & ~c) | (b[i] & c); }
> > > >> > >>
> > > >> > >> gives:
> > > >> > >>
> > > >> > >> f:
> > > >> > >>         mov     x2, 0
> > > >> > >>         mov     w3, 100
> > > >> > >>         whilelo p7.d, wzr, w3
> > > >> > >> .L2:
> > > >> > >>         ld1d    z30.d, p7/z, [x0, x2, lsl 3]
> > > >> > >>         ld1d    z31.d, p7/z, [x1, x2, lsl 3]
> > > >> > >>         and     z30.d, z30.d, #0x7fffffffffffffff
> > > >> > >>         and     z31.d, z31.d, #0x8000000000000000
> > > >> > >>         orr     z31.d, z31.d, z30.d
> > > >> > >>         st1d    z31.d, p7, [x0, x2, lsl 3]
> > > >> > >>         incd    x2
> > > >> > >>         whilelo p7.d, w2, w3
> > > >> > >>         b.any   .L2
> > > >> > >>         ret
> > > >> > >> g:
> > > >> > >>         mov     x3, 0
> > > >> > >>         mov     w4, 100
> > > >> > >>         mov     z29.d, x2
> > > >> > >>         whilelo p7.d, wzr, w4
> > > >> > >> .L6:
> > > >> > >>         ld1d    z30.d, p7/z, [x0, x3, lsl 3]
> > > >> > >>         ld1d    z31.d, p7/z, [x1, x3, lsl 3]
> > > >> > >>         bsl     z31.d, z31.d, z30.d, z29.d
> > > >> > >>         st1d    z31.d, p7, [x0, x3, lsl 3]
> > > >> > >>         incd    x3
> > > >> > >>         whilelo p7.d, w3, w4
> > > >> > >>         b.any   .L6
> > > >> > >>         ret
> > > >> > >>
> > > >> > >> I saw that you originally tried to do this in match.pd and
> > > >> > >> that the decision was to fold to copysign instead.  But
> > > >> > >> perhaps there's a compromise where isel does something with
> > > >> > >> the (new) copysign canonical
> > > >> > form?
> > > >> > >> I.e. could we go with your new version of the match.pd
> > > >> > >> patch, and add some isel stuff as a follow-on?
> > > >> > >>
> > > >> > >
> > > >> > > Sure if that's what's desired.... But..
> > > >> > >
> > > >> > > The example you posted above is for instance worse for x86
> > > >> > > https://godbolt.org/z/x9ccqxW6T where the first operation has
> > > >> > > a dependency chain of 2 and the latter of 3.  It's likely any
> > > >> > > open coding of this
> > > >> > operation is going to hurt a target.
> > > >> > >
> > > >> > > So I'm unsure what isel transform this into...
> > > >> >
> > > >> > I didn't mean that we should go straight to using isel for the
> > > >> > general case, just for the new case.  The example above was
> > > >> > instead trying to show the general point that hiding the logic
> > > >> > ops in target code is
> > > a double-edged sword.
> > > >>
> > > >> I see.. but the problem here is that transforming copysign (x,
> > > >> -1) into (x | 0x8000000) would require an integer operation on an
> > > >> FP value.  I'm happy to do it but it seems like it'll be an
> > > >> AArch64 only thing
> > > anyway.
> > > >>
> > > >> If we want to do this we need to check can_change_mode_class or a
> hook.
> > > >> Most targets including x86 reject the conversion.  So it'll just
> > > >> be effectively an AArch64 thing.
> > > >>
> > > >> You're right that the actual equivalent transformation is this
> > > >> https://godbolt.org/z/KesfrMv5z But the target won't allow it.
> > > >>
> > > >> >
> > > >> > The x86_64 example for the -1 case would be
> > > >> > https://godbolt.org/z/b9s6MaKs8 where the isel change would be
> > > >> > an improvement.  Without that, I guess
> > > >> > x86_64 will need to have a similar patch to the AArch64 one.
> > > >> >
> > > >>
> > > >> I think that's to be expected.  I think it's logical that every
> > > >> target just needs to implement their optabs optimally.
> > > >>
> > > >> > That said, https://godbolt.org/z/e6nqoqbMh suggests that
> > > >> > powerpc64 is probably relying on the current copysign -> neg/abs
> transform.
> > > >> > (Not sure why the second function uses different IVs from the
> > > >> > first.)
> > > >> >
> > > >> > Personally, I wouldn't be against a target hook that indicated
> > > >> > whether float bit manipulation is "free" for a given mode, if
> > > >> > it comes to
> > > that.
> > > >>
> > > >> I'm really sure Richi would agree there.  Generally speaking I
> > > >> don't think people see doing FP operations on Int as beneficial.
> > > >> But it's always
> > > the case on AArch64.
> > > >
> > > > IIRC we're doing fpclassify "expansion" early for example.
> > > >
> > > > Note the issue I see is that the middle-end shouldn't get in the
> > > > way of targets that have a copysign optab.  In case it's
> > > > worthwhile to special-case a "setsign" thing then maybe provide an
> > > > optab for that as well.  Now, that doesn't help if we want setsign
> > > > to be expanded from the middle-end but still want the copysign
> > > > optab (and not require targets to implement both if they want to
> > > > escape middle-end generic
> > > expansion of setsign).
> > > >
> > > > But yes, I also thought the , 1 and , -1 cases could be special
> > > > cased by RTL expansion (or ISEL).  But it would need to invoke
> > > > costing which likely means target specific changes anyway... :/
> > >
> > > FWIW, if we had the target hook I suggested, I don't think AArch64
> > > would need to implement copysign or xorsign optabs.  That's probably
> > > true of
> > > AArch32 too (at least for all combinations that are likely to matter these
> days).
> > >
> > > I'd go one step further and say that, if a target wants to do its
> > > own thing for copysign and xorsign, it clearly doesn't meet the
> > > requirement that bit manipulation of floats is "free" for that mode.
> > >
> >
> > So I'm aware I have no say here, but I'll make one last effort.
> >
> > The patch isn't just implementing the fneg (fabs ()) optimization just
> because.
> > The location where it's implemented makes a big difference.
> >
> > If this optimization is done late, it doesn't fix the regression
> > fully, because doing it after all optimization passes have run means it can't
> properly be optimized.
> >
> > Doing the rewrite early to ORR accomplished two things:
> >
> > 1. It makes PRE realize that the block it's splitting would only have 1
> instruction in it
> >     and that such a split is not beneficial.  This is why I'm against doing such
> optimizations
> >     so later. Optimizations don’t' happen in isolation, and when they make
> sense should
> >     happen early.  Transforming fneg (fabs (..)) in isel not only feels wrong but
> results in a 4%
> >     performance loss.
> >
> > 2. Doing it early also gets the ORRs to be reassociated changing where the
> loop dependent
> >      variable lands.  Early makes it land in the merging MOVPRFX rather than
> requiring a SEL
> >      at the end of the iteration.
> >
> > Replacing the fneg (fabs (..)) with copysign accomplishes 1 but not 2.
> > This results in a 2-3% loss, but I can live with that given doing 1 gets us back
> to GCC 12 levels.
> >
> > Doing fneg (fabs (..)) in isel would have no meaning for me and not
> > fix the regression.  I won't be looking to do that in that case.
> >
> > If it's acceptable I can transform COPYSIGN (X, -1) in gimple-isel.
> > So before I start on this, would this be acceptable for you?
> 
> Since copysign (x, -1) is a single statement you can just massage the internal
> function expander?
> Or the generic expand_copysign which already has a bit operation fallback.
> The question is what 'copysign' to use during folding of fneg (fabs (...)) when
> you remove the backend expander (because then IFN_COPYSIGN isn't directly
> expandable ...)
> 

Indeed, I forgot to say so in my previous email.  I don't think removing the
pattern makes sense.  It doesn't work for IFN_COPYSIGN and as such wouldn't
work for vectors.  So intrinsics wouldn't work.  I guess we could somehow
make IFN_COPYSIGN synthetic like COPYSIGN, but why?  There's apparently
even an RTL code for copysign that POWER has just implemented.

Regards,
Tamar

> Richard.
> 
> >
> > Thanks,
> > Tamar
> >
> > > > So I have no good advice here but I hoped that even the generic
> > > > target specific copysign implementation with and & xor would
> > > > eventually be optimized later on RTL for constant second arg?
> > >
> > > Yeah.  It looks like the required logic is there for scalars, it
> > > just needs extending to vectors.
> > >
> > > The patch below (untested beyond simple cases) seems to be enough to
> > > fix it, but using the simplify routines even for CONST_INTs might be
> controversial.
> > >
> > > Thanks,
> > > Richard
> > >
> > >
> > > diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc index
> > > bd9443dbcc2..5a9b1745673 100644
> > > --- a/gcc/simplify-rtx.cc
> > > +++ b/gcc/simplify-rtx.cc
> > > @@ -3409,20 +3409,20 @@
> > > simplify_context::simplify_binary_operation_1
> > > (rtx_code code,
> > >
> > >        /* Canonicalize (X & C1) | C2.  */
> > >        if (GET_CODE (op0) == AND
> > > -       && CONST_INT_P (trueop1)
> > > -       && CONST_INT_P (XEXP (op0, 1)))
> > > +       && CONSTANT_P (trueop1)
> > > +       && CONSTANT_P (XEXP (op0, 1)))
> > >       {
> > > -       HOST_WIDE_INT mask = GET_MODE_MASK (mode);
> > > -       HOST_WIDE_INT c1 = INTVAL (XEXP (op0, 1));
> > > -       HOST_WIDE_INT c2 = INTVAL (trueop1);
> > > +       rtx c1 = XEXP (op0, 1);
> > > +       rtx c2 = trueop1;
> > >
> > >         /* If (C1&C2) == C1, then (X&C1)|C2 becomes C2.  */
> > > -       if ((c1 & c2) == c1
> > > +       if (rtx_equal_p (simplify_binary_operation (AND, mode, c1,
> > > + c2), c1)
> > >             && !side_effects_p (XEXP (op0, 0)))
> > >           return trueop1;
> > >
> > >         /* If (C1|C2) == ~0 then (X&C1)|C2 becomes X|C2.  */
> > > -       if (((c1|c2) & mask) == mask)
> > > +       if (rtx_equal_p (simplify_binary_operation (IOR, mode, c1, c2),
> > > +                        CONSTM1_RTX (mode)))
> > >           return simplify_gen_binary (IOR, mode, XEXP (op0, 0), op1);
> > >       }
> > >
> > > @@ -3732,7 +3732,7 @@ simplify_context::simplify_binary_operation_1
> > > (rtx_code code,
> > >        machines, and also has shorter instruction path length.  */
> > >        if (GET_CODE (op0) == AND
> > >         && GET_CODE (XEXP (op0, 0)) == XOR
> > > -       && CONST_INT_P (XEXP (op0, 1))
> > > +       && CONSTANT_P (XEXP (op0, 1))
> > >         && rtx_equal_p (XEXP (XEXP (op0, 0), 0), trueop1))
> > >       {
> > >         rtx a = trueop1;
> > > @@ -3746,7 +3746,7 @@ simplify_context::simplify_binary_operation_1
> > > (rtx_code code,
> > >        /* Similarly, (xor (and (xor A B) C) B) as (ior (and A C) (and B ~C))  */
> > >        else if (GET_CODE (op0) == AND
> > >         && GET_CODE (XEXP (op0, 0)) == XOR
> > > -       && CONST_INT_P (XEXP (op0, 1))
> > > +       && CONSTANT_P (XEXP (op0, 1))
> > >         && rtx_equal_p (XEXP (XEXP (op0, 0), 1), trueop1))
> > >       {
> > >         rtx a = XEXP (XEXP (op0, 0), 0);
> > > --
> > > 2.25.1
> > >
> >

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH]AArch64 Add SVE implementation for cond_copysign.
  2023-10-09  9:38             ` Tamar Christina
  2023-10-09  9:45               ` Richard Biener
@ 2023-10-09  9:56               ` Richard Sandiford
  2023-10-09 10:09                 ` Tamar Christina
  1 sibling, 1 reply; 16+ messages in thread
From: Richard Sandiford @ 2023-10-09  9:56 UTC (permalink / raw)
  To: Tamar Christina
  Cc: Richard Biener, gcc-patches, nd, Richard Earnshaw,
	Marcus Shawcroft, Kyrylo Tkachov

Tamar Christina <Tamar.Christina@arm.com> writes:
>> -----Original Message-----
>> From: Richard Sandiford <richard.sandiford@arm.com>
>> Sent: Saturday, October 7, 2023 10:58 AM
>> To: Richard Biener <richard.guenther@gmail.com>
>> Cc: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org;
>> nd <nd@arm.com>; Richard Earnshaw <Richard.Earnshaw@arm.com>;
>> Marcus Shawcroft <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
>> <Kyrylo.Tkachov@arm.com>
>> Subject: Re: [PATCH]AArch64 Add SVE implementation for cond_copysign.
>> 
>> Richard Biener <richard.guenther@gmail.com> writes:
>> > On Thu, Oct 5, 2023 at 10:46 PM Tamar Christina
>> <Tamar.Christina@arm.com> wrote:
>> >>
>> >> > -----Original Message-----
>> >> > From: Richard Sandiford <richard.sandiford@arm.com>
>> >> > Sent: Thursday, October 5, 2023 9:26 PM
>> >> > To: Tamar Christina <Tamar.Christina@arm.com>
>> >> > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
>> >> > <Richard.Earnshaw@arm.com>; Marcus Shawcroft
>> >> > <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
>> <Kyrylo.Tkachov@arm.com>
>> >> > Subject: Re: [PATCH]AArch64 Add SVE implementation for
>> cond_copysign.
>> >> >
>> >> > Tamar Christina <Tamar.Christina@arm.com> writes:
>> >> > >> -----Original Message-----
>> >> > >> From: Richard Sandiford <richard.sandiford@arm.com>
>> >> > >> Sent: Thursday, October 5, 2023 8:29 PM
>> >> > >> To: Tamar Christina <Tamar.Christina@arm.com>
>> >> > >> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
>> >> > >> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
>> >> > >> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
>> >> > <Kyrylo.Tkachov@arm.com>
>> >> > >> Subject: Re: [PATCH]AArch64 Add SVE implementation for
>> cond_copysign.
>> >> > >>
>> >> > >> Tamar Christina <tamar.christina@arm.com> writes:
>> >> > >> > Hi All,
>> >> > >> >
>> >> > >> > This adds an implementation for masked copysign along with an
>> >> > >> > optimized pattern for masked copysign (x, -1).
>> >> > >>
>> >> > >> It feels like we're ending up with a lot of AArch64-specific
>> >> > >> code that just hard- codes the observation that changing the
>> >> > >> sign is equivalent to changing the top bit.  We then need to
>> >> > >> make sure that we choose the best way of changing the top bit for any
>> given situation.
>> >> > >>
>> >> > >> Hard-coding the -1/negative case is one instance of that.  But
>> >> > >> it looks like we also fail to use the best sequence for SVE2.  E.g.
>> >> > >> [https://godbolt.org/z/ajh3MM5jv]:
>> >> > >>
>> >> > >> #include <stdint.h>
>> >> > >>
>> >> > >> void f(double *restrict a, double *restrict b) {
>> >> > >>     for (int i = 0; i < 100; ++i)
>> >> > >>         a[i] = __builtin_copysign(a[i], b[i]); }
>> >> > >>
>> >> > >> void g(uint64_t *restrict a, uint64_t *restrict b, uint64_t c) {
>> >> > >>     for (int i = 0; i < 100; ++i)
>> >> > >>         a[i] = (a[i] & ~c) | (b[i] & c); }
>> >> > >>
>> >> > >> gives:
>> >> > >>
>> >> > >> f:
>> >> > >>         mov     x2, 0
>> >> > >>         mov     w3, 100
>> >> > >>         whilelo p7.d, wzr, w3
>> >> > >> .L2:
>> >> > >>         ld1d    z30.d, p7/z, [x0, x2, lsl 3]
>> >> > >>         ld1d    z31.d, p7/z, [x1, x2, lsl 3]
>> >> > >>         and     z30.d, z30.d, #0x7fffffffffffffff
>> >> > >>         and     z31.d, z31.d, #0x8000000000000000
>> >> > >>         orr     z31.d, z31.d, z30.d
>> >> > >>         st1d    z31.d, p7, [x0, x2, lsl 3]
>> >> > >>         incd    x2
>> >> > >>         whilelo p7.d, w2, w3
>> >> > >>         b.any   .L2
>> >> > >>         ret
>> >> > >> g:
>> >> > >>         mov     x3, 0
>> >> > >>         mov     w4, 100
>> >> > >>         mov     z29.d, x2
>> >> > >>         whilelo p7.d, wzr, w4
>> >> > >> .L6:
>> >> > >>         ld1d    z30.d, p7/z, [x0, x3, lsl 3]
>> >> > >>         ld1d    z31.d, p7/z, [x1, x3, lsl 3]
>> >> > >>         bsl     z31.d, z31.d, z30.d, z29.d
>> >> > >>         st1d    z31.d, p7, [x0, x3, lsl 3]
>> >> > >>         incd    x3
>> >> > >>         whilelo p7.d, w3, w4
>> >> > >>         b.any   .L6
>> >> > >>         ret
>> >> > >>
>> >> > >> I saw that you originally tried to do this in match.pd and that
>> >> > >> the decision was to fold to copysign instead.  But perhaps
>> >> > >> there's a compromise where isel does something with the (new)
>> >> > >> copysign canonical
>> >> > form?
>> >> > >> I.e. could we go with your new version of the match.pd patch,
>> >> > >> and add some isel stuff as a follow-on?

[A]

>> >> > >>
>> >> > >
>> >> > > Sure if that's what's desired.... But..
>> >> > >
>> >> > > The example you posted above is for instance worse for x86
>> >> > > https://godbolt.org/z/x9ccqxW6T where the first operation has a
>> >> > > dependency chain of 2 and the latter of 3.  It's likely any open
>> >> > > coding of this
>> >> > operation is going to hurt a target.
>> >> > >
>> >> > > So I'm unsure what isel transform this into...
>> >> >
>> >> > I didn't mean that we should go straight to using isel for the
>> >> > general case, just for the new case.  The example above was instead
>> >> > trying to show the general point that hiding the logic ops in target code is
>> a double-edged sword.
>> >>
>> >> I see.. but the problem here is that transforming copysign (x, -1)
>> >> into (x | 0x8000000) would require an integer operation on an FP
>> >> value.  I'm happy to do it but it seems like it'll be an AArch64 only thing
>> anyway.
>> >>
>> >> If we want to do this we need to check can_change_mode_class or a hook.
>> >> Most targets including x86 reject the conversion.  So it'll just be
>> >> effectively an AArch64 thing.
>> >>
>> >> You're right that the actual equivalent transformation is this
>> >> https://godbolt.org/z/KesfrMv5z But the target won't allow it.
>> >>
>> >> >
>> >> > The x86_64 example for the -1 case would be
>> >> > https://godbolt.org/z/b9s6MaKs8 where the isel change would be an
>> >> > improvement.  Without that, I guess
>> >> > x86_64 will need to have a similar patch to the AArch64 one.
>> >> >
>> >>
>> >> I think that's to be expected.  I think it's logical that every
>> >> target just needs to implement their optabs optimally.
>> >>
>> >> > That said, https://godbolt.org/z/e6nqoqbMh suggests that powerpc64
>> >> > is probably relying on the current copysign -> neg/abs transform.
>> >> > (Not sure why the second function uses different IVs from the
>> >> > first.)
>> >> >
>> >> > Personally, I wouldn't be against a target hook that indicated
>> >> > whether float bit manipulation is "free" for a given mode, if it comes to
>> that.
>> >>
>> >> I'm really sure Richi would agree there.  Generally speaking I don't
>> >> think people see doing FP operations on Int as beneficial.  But it's always
>> the case on AArch64.
>> >
>> > IIRC we're doing fpclassify "expansion" early for example.
>> >
>> > Note the issue I see is that the middle-end shouldn't get in the way
>> > of targets that have a copysign optab.  In case it's worthwhile to
>> > special-case a "setsign" thing then maybe provide an optab for that as
>> > well.  Now, that doesn't help if we want setsign to be expanded from
>> > the middle-end but still want the copysign optab (and not require
>> > targets to implement both if they want to escape middle-end generic
>> expansion of setsign).
>> >
>> > But yes, I also thought the , 1 and , -1 cases could be special cased
>> > by RTL expansion (or ISEL).  But it would need to invoke costing which
>> > likely means target specific changes anyway... :/
>> 
>> FWIW, if we had the target hook I suggested, I don't think AArch64 would
>> need to implement copysign or xorsign optabs.  That's probably true of
>> AArch32 too (at least for all combinations that are likely to matter these days).
>> 
>> I'd go one step further and say that, if a target wants to do its own thing for
>> copysign and xorsign, it clearly doesn't meet the requirement that bit
>> manipulation of floats is "free" for that mode.
>> 
>
> So I'm aware I have no say here, but I'll make one last effort.
>
> The patch isn't just implementing the fneg (fabs ()) optimization just because.
> The location where it's implemented makes a big difference.
>
> If this optimization is done late, it doesn't fix the regression fully, because doing
> it after all optimization passes have run means it can't properly be optimized.
>
> Doing the rewrite early to ORR accomplished two things:
>
> 1. It makes PRE realize that the block it's splitting would only have 1 instruction in it
>     and that such a split is not beneficial.  This is why I'm against doing such optimizations
>     so late. Optimizations don't happen in isolation, and when they make sense should
>     happen early.  Transforming fneg (fabs (..)) in isel not only feels wrong but results in a 4%
>     performance loss.
>
> 2. Doing it early also gets the ORRs to be reassociated changing where the loop dependent
>      variable lands.  Early makes it land in the merging MOVPRFX rather than requiring a SEL
>      at the end of the iteration.
>
> Replacing the fneg (fabs (..)) with copysign accomplishes 1 but not 2.  This results in a 2-3% loss,
> but I can live with that given doing 1 gets us back to GCC 12 levels.
>
> Doing fneg (fabs (..)) in isel would have no meaning for me and not fix the regression.  I won't be
> looking to do that in that case.
>
> If it's acceptable I can transform COPYSIGN (X, -1) in gimple-isel.

FWIW, this is what I meant by [A] above.  I.e. go with your latest
target-independent change, and make isel replace COPYSIGN (X, -1)
with logic operations, under control of a new target hook that says that
logic operations on float values are as cheap as logic operations
on integer values (for a given float mode).  In future, hopefully all
COPYSIGNs and XORSIGNs would be handled the same way, under control
of the same hook, but that would be a separate future change.

I wasn't suggesting recognising (fneg (fabs )) in isel.

But like I say, beware of powerpc64.  It seems to rely on the current
reverse transformation of (copysign x -1) to (fneg (fabs x)).
(Could be wrong, but would be worth checking.)
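
(For reference, the equivalence in question, as an untested scalar sketch:
copysign (x, -1.0) and the neg/abs form compute the same value, so which
lowering wins is purely a question of what the target does cheaply.)

/* The FP-side form that the current copysign (x, -1) -> neg/abs
   transform produces; targets like powerpc64 may prefer this over
   integer bit operations.  */
double
setsign_neg_abs (double x)
{
  return -__builtin_fabs (x);
}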

Thanks,
Richard



> So before I start on this, would this be acceptable for you?
>
> Thanks,
> Tamar
>
>> > So I have no good advice here but I hoped that even the generic target
>> > specific copysign implementation with and & xor would eventually be
>> > optimized later on RTL for constant second arg?
>> 
>> Yeah.  It looks like the required logic is there for scalars, it just needs extending
>> to vectors.
>> 
>> The patch below (untested beyond simple cases) seems to be enough to fix it,
>> but using the simplify routines even for CONST_INTs might be controversial.
>> 
>> Thanks,
>> Richard
>> 
>> 
>> diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc index
>> bd9443dbcc2..5a9b1745673 100644
>> --- a/gcc/simplify-rtx.cc
>> +++ b/gcc/simplify-rtx.cc
>> @@ -3409,20 +3409,20 @@ simplify_context::simplify_binary_operation_1
>> (rtx_code code,
>> 
>>        /* Canonicalize (X & C1) | C2.  */
>>        if (GET_CODE (op0) == AND
>> -	  && CONST_INT_P (trueop1)
>> -	  && CONST_INT_P (XEXP (op0, 1)))
>> +	  && CONSTANT_P (trueop1)
>> +	  && CONSTANT_P (XEXP (op0, 1)))
>>  	{
>> -	  HOST_WIDE_INT mask = GET_MODE_MASK (mode);
>> -	  HOST_WIDE_INT c1 = INTVAL (XEXP (op0, 1));
>> -	  HOST_WIDE_INT c2 = INTVAL (trueop1);
>> +	  rtx c1 = XEXP (op0, 1);
>> +	  rtx c2 = trueop1;
>> 
>>  	  /* If (C1&C2) == C1, then (X&C1)|C2 becomes C2.  */
>> -	  if ((c1 & c2) == c1
>> +	  if (rtx_equal_p (simplify_binary_operation (AND, mode, c1, c2), c1)
>>  	      && !side_effects_p (XEXP (op0, 0)))
>>  	    return trueop1;
>> 
>>  	  /* If (C1|C2) == ~0 then (X&C1)|C2 becomes X|C2.  */
>> -	  if (((c1|c2) & mask) == mask)
>> +	  if (rtx_equal_p (simplify_binary_operation (IOR, mode, c1, c2),
>> +			   CONSTM1_RTX (mode)))
>>  	    return simplify_gen_binary (IOR, mode, XEXP (op0, 0), op1);
>>  	}
>> 
>> @@ -3732,7 +3732,7 @@ simplify_context::simplify_binary_operation_1
>> (rtx_code code,
>>  	 machines, and also has shorter instruction path length.  */
>>        if (GET_CODE (op0) == AND
>>  	  && GET_CODE (XEXP (op0, 0)) == XOR
>> -	  && CONST_INT_P (XEXP (op0, 1))
>> +	  && CONSTANT_P (XEXP (op0, 1))
>>  	  && rtx_equal_p (XEXP (XEXP (op0, 0), 0), trueop1))
>>  	{
>>  	  rtx a = trueop1;
>> @@ -3746,7 +3746,7 @@ simplify_context::simplify_binary_operation_1
>> (rtx_code code,
>>        /* Similarly, (xor (and (xor A B) C) B) as (ior (and A C) (and B ~C))  */
>>        else if (GET_CODE (op0) == AND
>>  	  && GET_CODE (XEXP (op0, 0)) == XOR
>> -	  && CONST_INT_P (XEXP (op0, 1))
>> +	  && CONSTANT_P (XEXP (op0, 1))
>>  	  && rtx_equal_p (XEXP (XEXP (op0, 0), 1), trueop1))
>>  	{
>>  	  rtx a = XEXP (XEXP (op0, 0), 0);
>> --
>> 2.25.1
>> 

^ permalink raw reply	[flat|nested] 16+ messages in thread

* RE: [PATCH]AArch64 Add SVE implementation for cond_copysign.
  2023-10-09  9:56               ` Richard Sandiford
@ 2023-10-09 10:09                 ` Tamar Christina
  2023-10-09 10:17                   ` Richard Sandiford
  0 siblings, 1 reply; 16+ messages in thread
From: Tamar Christina @ 2023-10-09 10:09 UTC (permalink / raw)
  To: Richard Sandiford
  Cc: Richard Biener, gcc-patches, nd, Richard Earnshaw,
	Marcus Shawcroft, Kyrylo Tkachov

> -----Original Message-----
> From: Richard Sandiford <richard.sandiford@arm.com>
> Sent: Monday, October 9, 2023 10:56 AM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org;
> nd <nd@arm.com>; Richard Earnshaw <Richard.Earnshaw@arm.com>;
> Marcus Shawcroft <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> <Kyrylo.Tkachov@arm.com>
> Subject: Re: [PATCH]AArch64 Add SVE implementation for cond_copysign.
> 
> Tamar Christina <Tamar.Christina@arm.com> writes:
> >> -----Original Message-----
> >> From: Richard Sandiford <richard.sandiford@arm.com>
> >> Sent: Saturday, October 7, 2023 10:58 AM
> >> To: Richard Biener <richard.guenther@gmail.com>
> >> Cc: Tamar Christina <Tamar.Christina@arm.com>;
> >> gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> >> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> >> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> <Kyrylo.Tkachov@arm.com>
> >> Subject: Re: [PATCH]AArch64 Add SVE implementation for cond_copysign.
> >>
> >> Richard Biener <richard.guenther@gmail.com> writes:
> >> > On Thu, Oct 5, 2023 at 10:46 PM Tamar Christina
> >> <Tamar.Christina@arm.com> wrote:
> >> >>
> >> >> > -----Original Message-----
> >> >> > From: Richard Sandiford <richard.sandiford@arm.com>
> >> >> > Sent: Thursday, October 5, 2023 9:26 PM
> >> >> > To: Tamar Christina <Tamar.Christina@arm.com>
> >> >> > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> >> >> > <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> >> >> > <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> >> <Kyrylo.Tkachov@arm.com>
> >> >> > Subject: Re: [PATCH]AArch64 Add SVE implementation for
> >> cond_copysign.
> >> >> >
> >> >> > Tamar Christina <Tamar.Christina@arm.com> writes:
> >> >> > >> -----Original Message-----
> >> >> > >> From: Richard Sandiford <richard.sandiford@arm.com>
> >> >> > >> Sent: Thursday, October 5, 2023 8:29 PM
> >> >> > >> To: Tamar Christina <Tamar.Christina@arm.com>
> >> >> > >> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard
> >> >> > >> Earnshaw <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> >> >> > >> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> >> >> > <Kyrylo.Tkachov@arm.com>
> >> >> > >> Subject: Re: [PATCH]AArch64 Add SVE implementation for
> >> cond_copysign.
> >> >> > >>
> >> >> > >> Tamar Christina <tamar.christina@arm.com> writes:
> >> >> > >> > Hi All,
> >> >> > >> >
> >> >> > >> > This adds an implementation for masked copysign along with
> >> >> > >> > an optimized pattern for masked copysign (x, -1).
> >> >> > >>
> >> >> > >> It feels like we're ending up with a lot of AArch64-specific
> >> >> > >> code that just hard- codes the observation that changing the
> >> >> > >> sign is equivalent to changing the top bit.  We then need to
> >> >> > >> make sure that we choose the best way of changing the top bit
> >> >> > >> for any
> >> given situation.
> >> >> > >>
> >> >> > >> Hard-coding the -1/negative case is one instance of that.
> >> >> > >> But it looks like we also fail to use the best sequence for SVE2.  E.g.
> >> >> > >> [https://godbolt.org/z/ajh3MM5jv]:
> >> >> > >>
> >> >> > >> #include <stdint.h>
> >> >> > >>
> >> >> > >> void f(double *restrict a, double *restrict b) {
> >> >> > >>     for (int i = 0; i < 100; ++i)
> >> >> > >>         a[i] = __builtin_copysign(a[i], b[i]); }
> >> >> > >>
> >> >> > >> void g(uint64_t *restrict a, uint64_t *restrict b, uint64_t c) {
> >> >> > >>     for (int i = 0; i < 100; ++i)
> >> >> > >>         a[i] = (a[i] & ~c) | (b[i] & c); }
> >> >> > >>
> >> >> > >> gives:
> >> >> > >>
> >> >> > >> f:
> >> >> > >>         mov     x2, 0
> >> >> > >>         mov     w3, 100
> >> >> > >>         whilelo p7.d, wzr, w3
> >> >> > >> .L2:
> >> >> > >>         ld1d    z30.d, p7/z, [x0, x2, lsl 3]
> >> >> > >>         ld1d    z31.d, p7/z, [x1, x2, lsl 3]
> >> >> > >>         and     z30.d, z30.d, #0x7fffffffffffffff
> >> >> > >>         and     z31.d, z31.d, #0x8000000000000000
> >> >> > >>         orr     z31.d, z31.d, z30.d
> >> >> > >>         st1d    z31.d, p7, [x0, x2, lsl 3]
> >> >> > >>         incd    x2
> >> >> > >>         whilelo p7.d, w2, w3
> >> >> > >>         b.any   .L2
> >> >> > >>         ret
> >> >> > >> g:
> >> >> > >>         mov     x3, 0
> >> >> > >>         mov     w4, 100
> >> >> > >>         mov     z29.d, x2
> >> >> > >>         whilelo p7.d, wzr, w4
> >> >> > >> .L6:
> >> >> > >>         ld1d    z30.d, p7/z, [x0, x3, lsl 3]
> >> >> > >>         ld1d    z31.d, p7/z, [x1, x3, lsl 3]
> >> >> > >>         bsl     z31.d, z31.d, z30.d, z29.d
> >> >> > >>         st1d    z31.d, p7, [x0, x3, lsl 3]
> >> >> > >>         incd    x3
> >> >> > >>         whilelo p7.d, w3, w4
> >> >> > >>         b.any   .L6
> >> >> > >>         ret
> >> >> > >>
> >> >> > >> I saw that you originally tried to do this in match.pd and
> >> >> > >> that the decision was to fold to copysign instead.  But
> >> >> > >> perhaps there's a compromise where isel does something with
> >> >> > >> the (new) copysign canonical
> >> >> > form?
> >> >> > >> I.e. could we go with your new version of the match.pd patch,
> >> >> > >> and add some isel stuff as a follow-on?
> 
> [A]
> 
> >> >> > >>
> >> >> > >
> >> >> > > Sure if that's what's desired.... But..
> >> >> > >
> >> >> > > The example you posted above is for instance worse for x86
> >> >> > > https://godbolt.org/z/x9ccqxW6T where the first operation has
> >> >> > > a dependency chain of 2 and the latter of 3.  It's likely any
> >> >> > > open coding of this
> >> >> > operation is going to hurt a target.
> >> >> > >
> >> >> > > So I'm unsure what isel transform this into...
> >> >> >
> >> >> > I didn't mean that we should go straight to using isel for the
> >> >> > general case, just for the new case.  The example above was
> >> >> > instead trying to show the general point that hiding the logic
> >> >> > ops in target code is
> >> a double-edged sword.
> >> >>
> >> >> I see.. but the problem here is that transforming copysign (x, -1)
> >> >> into (x | 0x8000000) would require an integer operation on an FP
> >> >> value.  I'm happy to do it but it seems like it'll be an AArch64
> >> >> only thing
> >> anyway.
> >> >>
> >> >> If we want to do this we need to check can_change_mode_class or a
> hook.
> >> >> Most targets including x86 reject the conversion.  So it'll just
> >> >> be effectively an AArch64 thing.
> >> >>
> >> >> You're right that the actual equivalent transformation is this
> >> >> https://godbolt.org/z/KesfrMv5z But the target won't allow it.
> >> >>
> >> >> >
> >> >> > The x86_64 example for the -1 case would be
> >> >> > https://godbolt.org/z/b9s6MaKs8 where the isel change would be
> >> >> > an improvement.  Without that, I guess
> >> >> > x86_64 will need to have a similar patch to the AArch64 one.
> >> >> >
> >> >>
> >> >> I think that's to be expected.  I think it's logical that every
> >> >> target just needs to implement their optabs optimally.
> >> >>
> >> >> > That said, https://godbolt.org/z/e6nqoqbMh suggests that
> >> >> > powerpc64 is probably relying on the current copysign -> neg/abs
> transform.
> >> >> > (Not sure why the second function uses different IVs from the
> >> >> > first.)
> >> >> >
> >> >> > Personally, I wouldn't be against a target hook that indicated
> >> >> > whether float bit manipulation is "free" for a given mode, if it
> >> >> > comes to
> >> that.
> >> >>
> >> >> I'm really sure Richi would agree there.  Generally speaking I
> >> >> don't think people see doing FP operations on Int as beneficial.
> >> >> But it's always
> >> the case on AArch64.
> >> >
> >> > IIRC we're doing fpclassify "expansion" early for example.
> >> >
> >> > Note the issue I see is that the middle-end shouldn't get in the
> >> > way of targets that have a copysign optab.  In case it's worthwhile
> >> > to special-case a "setsign" thing then maybe provide an optab for
> >> > that as well.  Now, that doesn't help if we want setsign to be
> >> > expanded from the middle-end but still wan the copysign optab (and
> >> > not require targets to implement both if they want to escape
> >> > middle-end generic
> >> expansion of setsign).
> >> >
> >> > But yes, I also thought the , 1 and , -1 cases could be special
> >> > cased by RTL expansion (or ISEL).  But it would need to invoke
> >> > costing which likely means target specific changes anyway... :/
> >>
> >> FWIW, if we had the target hook I suggested, I don't think AArch64
> >> would need to implement copysign or xorsign optabs.  That's probably
> >> true of
> >> AArch32 too (at least for all combinations that are likely to matter these
> days).
> >>
> >> I'd go one step further and say that, if a target wants to do its own
> >> thing for copysign and xorsign, it clearly doesn't meet the
> >> requirement that bit manipulation of floats is "free" for that mode.
> >>
> >
> > So I'm aware I have no say here, but I'll make one last effort.
> >
> > The patch isn't implementing the fneg (fabs ()) optimization just because.
> > The location where it's implemented makes a big difference.
> >
> > If this optimization is done late, it doesn't fix the regression fully,
> > because doing it after all optimization passes have run means it can't
> > properly be optimized.
> >
> > The point of doing the rewrite early to ORR accomplished two things:
> >
> > 1. It makes PRE realize that the block it's splitting would only have 1
> >    instruction in it and that such a split is not beneficial.  This is
> >    why I'm against doing such optimizations so late.  Optimizations don't
> >    happen in isolation, and when they make sense they should happen early.
> >    Transforming fneg (fabs (..)) in isel not only feels wrong but results
> >    in a 4% performance loss.
> >
> > 2. Doing it early also gets the ORRs to be reassociated, changing where
> >    the loop-dependent variable lands.  Early makes it land in the merging
> >    MOVPRFX rather than requiring a SEL at the end of the iteration.
> >
> > Replacing the fneg (fabs (..)) with copysign accomplishes 1 but not 2.
> > This results in a 2-3% loss, but I can live with that, given that doing 1
> > gets us back to GCC 12 levels.
> >
> > Doing fneg (fabs (..)) in isel would not be meaningful to me and would not
> > fix the regression, so I won't be looking to do that in that case.
> >
> > If it's acceptable I can transform COPYSIGN (X, -1) in gimple-isel.
> 
> FWIW, this is what I meant by [A] above.  I.e. go with your latest target-
> independent change, and make isel replace COPYSIGN (X, -1) with logic
> operations, under control of a new target hook that says that logic operations
> on float values are as cheap as logic operations on integer values (for a given
> float mode).  In future, hopefully all COPYSIGNs and XORSIGNs would be
> handled the same way, under control of the same hook, but that would be a
> separate future change.
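
(For reference, a plain-C sketch of what the proposed COPYSIGN (X, -1)
rewrite computes; memcpy stands in for the view-converts a real gimple-isel
transform would emit, and the function name is made up.)

#include <stdint.h>
#include <string.h>

/* copysign (x, -1.0) is -fabs (x): x with its sign bit forced on.
   Under a hook like the one discussed above, isel could lower
   IFN_COPYSIGN (x, -1.0) to this integer IOR instead of leaving it
   to the optab.  */
static double
copysign_minus_one (double x)
{
  uint64_t bits;
  memcpy (&bits, &x, sizeof bits);	/* reinterpret the double */
  bits |= UINT64_C (1) << 63;		/* IOR with the sign-bit mask */
  memcpy (&x, &bits, sizeof x);
  return x;
}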

Ok, would you prefer isel, or the expander approach that Richi suggested?
I assume isel is better if the intention is to later remove the expansion code?

> I wasn't suggesting recognising (fneg (fabs )) in isel.
> 

Ok, I wasn't sure, as there were many interleaved threads.

> But like I say, beware of powerpc64.  It seems to rely on the current reverse
> transformation of (copysign x -1) to (fneg (fabs x)).
> (Could be wrong, but would be worth checking.)

I'll do a check, thanks!

Tamar

> Thanks,
> Richard
> 
> 
> 
> > So before I start on this, Would this be acceptable for you?
> >
> > Thanks,
> > Tamar
> >
> >> > So I have no good advice here but I hoped that even the generic
> >> > target specific copysign implementation with and & xor would
> >> > eventually be optimized later on RTL for constant second arg?
> >>
> >> Yeah.  It looks like the required logic is there for scalars, it just
> >> needs extending to vectors.
> >>
> >> The patch below (untested beyond simple cases) seems to be enough to
> >> fix it, but using the simplify routines even for CONST_INTs might be
> controversial.
> >>
> >> Thanks,
> >> Richard
> >>
> >>
> >> diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc index
> >> bd9443dbcc2..5a9b1745673 100644
> >> --- a/gcc/simplify-rtx.cc
> >> +++ b/gcc/simplify-rtx.cc
> >> @@ -3409,20 +3409,20 @@
> simplify_context::simplify_binary_operation_1
> >> (rtx_code code,
> >>
> >>        /* Canonicalize (X & C1) | C2.  */
> >>        if (GET_CODE (op0) == AND
> >> -	  && CONST_INT_P (trueop1)
> >> -	  && CONST_INT_P (XEXP (op0, 1)))
> >> +	  && CONSTANT_P (trueop1)
> >> +	  && CONSTANT_P (XEXP (op0, 1)))
> >>  	{
> >> -	  HOST_WIDE_INT mask = GET_MODE_MASK (mode);
> >> -	  HOST_WIDE_INT c1 = INTVAL (XEXP (op0, 1));
> >> -	  HOST_WIDE_INT c2 = INTVAL (trueop1);
> >> +	  rtx c1 = XEXP (op0, 1);
> >> +	  rtx c2 = trueop1;
> >>
> >>  	  /* If (C1&C2) == C1, then (X&C1)|C2 becomes C2.  */
> >> -	  if ((c1 & c2) == c1
> >> +	  if (rtx_equal_p (simplify_binary_operation (AND, mode, c1, c2),
> >> +c1)
> >>  	      && !side_effects_p (XEXP (op0, 0)))
> >>  	    return trueop1;
> >>
> >>  	  /* If (C1|C2) == ~0 then (X&C1)|C2 becomes X|C2.  */
> >> -	  if (((c1|c2) & mask) == mask)
> >> +	  if (rtx_equal_p (simplify_binary_operation (IOR, mode, c1, c2),
> >> +			   CONSTM1_RTX (mode)))
> >>  	    return simplify_gen_binary (IOR, mode, XEXP (op0, 0), op1);
> >>  	}
> >>
> >> @@ -3732,7 +3732,7 @@ simplify_context::simplify_binary_operation_1
> >> (rtx_code code,
> >>  	 machines, and also has shorter instruction path length.  */
> >>        if (GET_CODE (op0) == AND
> >>  	  && GET_CODE (XEXP (op0, 0)) == XOR
> >> -	  && CONST_INT_P (XEXP (op0, 1))
> >> +	  && CONSTANT_P (XEXP (op0, 1))
> >>  	  && rtx_equal_p (XEXP (XEXP (op0, 0), 0), trueop1))
> >>  	{
> >>  	  rtx a = trueop1;
> >> @@ -3746,7 +3746,7 @@ simplify_context::simplify_binary_operation_1
> >> (rtx_code code,
> >>        /* Similarly, (xor (and (xor A B) C) B) as (ior (and A C) (and B ~C))  */
> >>        else if (GET_CODE (op0) == AND
> >>  	  && GET_CODE (XEXP (op0, 0)) == XOR
> >> -	  && CONST_INT_P (XEXP (op0, 1))
> >> +	  && CONSTANT_P (XEXP (op0, 1))
> >>  	  && rtx_equal_p (XEXP (XEXP (op0, 0), 1), trueop1))
> >>  	{
> >>  	  rtx a = XEXP (XEXP (op0, 0), 0);
> >> --
> >> 2.25.1
> >>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH]AArch64 Add SVE implementation for cond_copysign.
  2023-10-09 10:09                 ` Tamar Christina
@ 2023-10-09 10:17                   ` Richard Sandiford
  2023-10-09 11:30                     ` Richard Biener
  0 siblings, 1 reply; 16+ messages in thread
From: Richard Sandiford @ 2023-10-09 10:17 UTC (permalink / raw)
  To: Tamar Christina
  Cc: Richard Biener, gcc-patches, nd, Richard Earnshaw,
	Marcus Shawcroft, Kyrylo Tkachov

Tamar Christina <Tamar.Christina@arm.com> writes:
>> -----Original Message-----
>> From: Richard Sandiford <richard.sandiford@arm.com>
>> Sent: Monday, October 9, 2023 10:56 AM
>> To: Tamar Christina <Tamar.Christina@arm.com>
>> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org;
>> nd <nd@arm.com>; Richard Earnshaw <Richard.Earnshaw@arm.com>;
>> Marcus Shawcroft <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
>> <Kyrylo.Tkachov@arm.com>
>> Subject: Re: [PATCH]AArch64 Add SVE implementation for cond_copysign.
>> 
>> Tamar Christina <Tamar.Christina@arm.com> writes:
>> >> -----Original Message-----
>> >> From: Richard Sandiford <richard.sandiford@arm.com>
>> >> Sent: Saturday, October 7, 2023 10:58 AM
>> >> To: Richard Biener <richard.guenther@gmail.com>
>> >> Cc: Tamar Christina <Tamar.Christina@arm.com>;
>> >> gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
>> >> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
>> >> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
>> <Kyrylo.Tkachov@arm.com>
>> >> Subject: Re: [PATCH]AArch64 Add SVE implementation for cond_copysign.
>> >>
>> >> Richard Biener <richard.guenther@gmail.com> writes:
>> >> > On Thu, Oct 5, 2023 at 10:46 PM Tamar Christina
>> >> <Tamar.Christina@arm.com> wrote:
>> >> >>
>> >> >> > -----Original Message-----
>> >> >> > From: Richard Sandiford <richard.sandiford@arm.com>
>> >> >> > Sent: Thursday, October 5, 2023 9:26 PM
>> >> >> > To: Tamar Christina <Tamar.Christina@arm.com>
>> >> >> > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
>> >> >> > <Richard.Earnshaw@arm.com>; Marcus Shawcroft
>> >> >> > <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
>> >> <Kyrylo.Tkachov@arm.com>
>> >> >> > Subject: Re: [PATCH]AArch64 Add SVE implementation for
>> >> cond_copysign.
>> >> >> >
>> >> >> > Tamar Christina <Tamar.Christina@arm.com> writes:
>> >> >> > >> -----Original Message-----
>> >> >> > >> From: Richard Sandiford <richard.sandiford@arm.com>
>> >> >> > >> Sent: Thursday, October 5, 2023 8:29 PM
>> >> >> > >> To: Tamar Christina <Tamar.Christina@arm.com>
>> >> >> > >> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard
>> >> >> > >> Earnshaw <Richard.Earnshaw@arm.com>; Marcus Shawcroft
>> >> >> > >> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
>> >> >> > <Kyrylo.Tkachov@arm.com>
>> >> >> > >> Subject: Re: [PATCH]AArch64 Add SVE implementation for
>> >> cond_copysign.
>> >> >> > >>
>> >> >> > >> Tamar Christina <tamar.christina@arm.com> writes:
>> >> >> > >> > Hi All,
>> >> >> > >> >
>> >> >> > >> > This adds an implementation for masked copysign along with
>> >> >> > >> > an optimized pattern for masked copysign (x, -1).
>> >> >> > >>
>> >> >> > >> It feels like we're ending up with a lot of AArch64-specific
>> >> >> > >> code that just hard- codes the observation that changing the
>> >> >> > >> sign is equivalent to changing the top bit.  We then need to
>> >> >> > >> make sure that we choose the best way of changing the top bit
>> >> >> > >> for any
>> >> given situation.
>> >> >> > >>
>> >> >> > >> Hard-coding the -1/negative case is one instance of that.
>> >> >> > >> But it looks like we also fail to use the best sequence for SVE2.  E.g.
>> >> >> > >> [https://godbolt.org/z/ajh3MM5jv]:
>> >> >> > >>
>> >> >> > >> #include <stdint.h>
>> >> >> > >>
>> >> >> > >> void f(double *restrict a, double *restrict b) {
>> >> >> > >>     for (int i = 0; i < 100; ++i)
>> >> >> > >>         a[i] = __builtin_copysign(a[i], b[i]); }
>> >> >> > >>
>> >> >> > >> void g(uint64_t *restrict a, uint64_t *restrict b, uint64_t c) {
>> >> >> > >>     for (int i = 0; i < 100; ++i)
>> >> >> > >>         a[i] = (a[i] & ~c) | (b[i] & c); }
>> >> >> > >>
>> >> >> > >> gives:
>> >> >> > >>
>> >> >> > >> f:
>> >> >> > >>         mov     x2, 0
>> >> >> > >>         mov     w3, 100
>> >> >> > >>         whilelo p7.d, wzr, w3
>> >> >> > >> .L2:
>> >> >> > >>         ld1d    z30.d, p7/z, [x0, x2, lsl 3]
>> >> >> > >>         ld1d    z31.d, p7/z, [x1, x2, lsl 3]
>> >> >> > >>         and     z30.d, z30.d, #0x7fffffffffffffff
>> >> >> > >>         and     z31.d, z31.d, #0x8000000000000000
>> >> >> > >>         orr     z31.d, z31.d, z30.d
>> >> >> > >>         st1d    z31.d, p7, [x0, x2, lsl 3]
>> >> >> > >>         incd    x2
>> >> >> > >>         whilelo p7.d, w2, w3
>> >> >> > >>         b.any   .L2
>> >> >> > >>         ret
>> >> >> > >> g:
>> >> >> > >>         mov     x3, 0
>> >> >> > >>         mov     w4, 100
>> >> >> > >>         mov     z29.d, x2
>> >> >> > >>         whilelo p7.d, wzr, w4
>> >> >> > >> .L6:
>> >> >> > >>         ld1d    z30.d, p7/z, [x0, x3, lsl 3]
>> >> >> > >>         ld1d    z31.d, p7/z, [x1, x3, lsl 3]
>> >> >> > >>         bsl     z31.d, z31.d, z30.d, z29.d
>> >> >> > >>         st1d    z31.d, p7, [x0, x3, lsl 3]
>> >> >> > >>         incd    x3
>> >> >> > >>         whilelo p7.d, w3, w4
>> >> >> > >>         b.any   .L6
>> >> >> > >>         ret
>> >> >> > >>
>> >> >> > >> I saw that you originally tried to do this in match.pd and
>> >> >> > >> that the decision was to fold to copysign instead.  But
>> >> >> > >> perhaps there's a compromise where isel does something with
>> >> >> > >> the (new) copysign canonical
>> >> >> > form?
>> >> >> > >> I.e. could we go with your new version of the match.pd patch,
>> >> >> > >> and add some isel stuff as a follow-on?
>> 
>> [A]
>> 
>> >> >> > >>
>> >> >> > >
>> >> >> > > Sure if that's what's desired.... But..
>> >> >> > >
>> >> >> > > The example you posted above is for instance worse for x86
>> >> >> > > https://godbolt.org/z/x9ccqxW6T where the first operation has
>> >> >> > > a dependency chain of 2 and the latter of 3.  It's likely any
>> >> >> > > open coding of this
>> >> >> > operation is going to hurt a target.
>> >> >> > >
>> >> >> > > So I'm unsure what isel transform this into...
>> >> >> >
>> >> >> > I didn't mean that we should go straight to using isel for the
>> >> >> > general case, just for the new case.  The example above was
>> >> >> > instead trying to show the general point that hiding the logic
>> >> >> > ops in target code is
>> >> a double-edged sword.
>> >> >>
>> >> >> I see.. but the problem here is that transforming copysign (x, -1)
>> >> >> into (x | 0x8000000) would require an integer operation on an FP
>> >> >> value.  I'm happy to do it but it seems like it'll be an AArch64
>> >> >> only thing
>> >> anyway.
>> >> >>
>> >> >> If we want to do this we need to check can_change_mode_class or a
>> hook.
>> >> >> Most targets including x86 reject the conversion.  So it'll just
>> >> >> be effectively an AArch64 thing.
>> >> >>
>> >> >> You're right that the actual equivalent transformation is this
>> >> >> https://godbolt.org/z/KesfrMv5z But the target won't allow it.
>> >> >>
>> >> >> >
>> >> >> > The x86_64 example for the -1 case would be
>> >> >> > https://godbolt.org/z/b9s6MaKs8 where the isel change would be
>> >> >> > an improvement.  Without that, I guess
>> >> >> > x86_64 will need to have a similar patch to the AArch64 one.
>> >> >> >
>> >> >>
>> >> >> I think that's to be expected.  I think it's logical that every
>> >> >> target just needs to implement their optabs optimally.
>> >> >>
>> >> >> > That said, https://godbolt.org/z/e6nqoqbMh suggests that
>> >> >> > powerpc64 is probably relying on the current copysign -> neg/abs
>> transform.
>> >> >> > (Not sure why the second function uses different IVs from the
>> >> >> > first.)
>> >> >> >
>> >> >> > Personally, I wouldn't be against a target hook that indicated
>> >> >> > whether float bit manipulation is "free" for a given mode, if it
>> >> >> > comes to
>> >> that.
>> >> >>
>> >> >> I'm really sure Richi would agree there.  Generally speaking I
>> >> >> don't think people see doing FP operations on Int as beneficial.
>> >> >> But it's always
>> >> the case on AArch64.
>> >> >
>> >> > IIRC we're doing fpclassify "expansion" early for example.
>> >> >
>> >> > Note the issue I see is that the middle-end shouldn't get in the
>> >> > way of targets that have a copysign optab.  In case it's worthwhile
>> >> > to special-case a "setsign" thing then maybe provide an optab for
>> >> > that as well.  Now, that doesn't help if we want setsign to be
>> >> > expanded from the middle-end but still wan the copysign optab (and
>> >> > not require targets to implement both if they want to escape
>> >> > middle-end generic
>> >> expansion of setsign).
>> >> >
>> >> > But yes, I also thought the , 1 and , -1 cases could be special
>> >> > cased by RTL expansion (or ISEL).  But it would need to invoke
>> >> > costing which likely means target specific changes anyway... :/
>> >>
>> >> FWIW, if we had the target hook I suggested, I don't think AArch64
>> >> would need to implement copysign or xorsign optabs.  That's probably
>> >> true of
>> >> AArch32 too (at least for all combinations that are likely to matter these
>> days).
>> >>
>> >> I'd go one step further and say that, if a target wants to do its own
>> >> thing for copysign and xorsign, it clearly doesn't meet the
>> >> requirement that bit manipulation of floats is "free" for that mode.
>> >>
>> >
>> > So I'm aware I have no say here, but I'll make one last effort.
>> >
>> > The patch isn't just implementing the fneg (fabs ()) optimization just
>> because.
>> > The location where it's implemented makes a big difference.
>> >
>> > If this optimization is done late, it doesn't fix the regression
>> > fully, because doing It after all optimization passes have run means it can't
>> properly be optimized.
>> >
>> > The point of doing the rewrite early to ORR accomplished two things:
>> >
>> > 1. It makes PRE realize that the block it's splitting would only have 1
>> instruction in it
>> >     and that such a split is not beneficial.  This is why I'm against doing such
>> optimizations
>> >     so later. Optimizations don’t' happen in isolation, and when they make
>> sense should
>> >     happen early.  Transforming fneg (fabs (..)) in isel not only feels wrong but
>> results in a 4%
>> >     performance loss.
>> >
>> > 2. Doing it early also gets the ORRs to be reassociated changing where the
>> loop dependent
>> >      variable lands.  Early makes it land in the merging MOVPRFX rather than
>> requiring a SEL
>> >      at the end of the iteration.
>> >
>> > Replacing the fneg (fabs (..)) with copysign accomplishes 1 but no 2.
>> > This results in a 2-3% loss, but I can live with that given doing 1 gets us back
>> to GCC 12 levels.
>> >
>> > Doing fneg (fabs (..)) in isel would have no meaning for me and not
>> > fix the regression.  I won't be looking to do that in that case.
>> >
>> > If it's acceptable I can transform COPYSIGN (X, -1) in gimple-isel.
>> 
>> FWIW, this is what I meant by [A] above.  I.e. go with your latest target-
>> independent change, and make isel replace COPYSIGN (X, -1) with logic
>> operations, under control of a new target hook that says that logic operations
>> on float values are as cheap as logic operations on integer values (for a given
>> float mode).  In future, hopefully all COPYSIGNs and XORSIGNs would be
>> handled the same way, under control of the same hook, but that would be a
>> separate future change.
>
> Ok, would you prefer isel or in the expander that Richi suggested?
> I assume isel is better if the intention is to later remove the expansion code?

Yeah, isel sounds better to me FWIW (for that reason), but Richi would
know better.

At least with isel, there's the theoretical possibility of doing
simple optimisation before expand, or taking surrounding stmts
into account, if that ever becomes necessary for future changes.

Thanks,
Richard

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH]AArch64 Add SVE implementation for cond_copysign.
  2023-10-09 10:17                   ` Richard Sandiford
@ 2023-10-09 11:30                     ` Richard Biener
  0 siblings, 0 replies; 16+ messages in thread
From: Richard Biener @ 2023-10-09 11:30 UTC (permalink / raw)
  To: Tamar Christina, Richard Biener, gcc-patches, nd,
	Richard Earnshaw, Marcus Shawcroft, Kyrylo Tkachov,
	richard.sandiford

On Mon, Oct 9, 2023 at 12:17 PM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Tamar Christina <Tamar.Christina@arm.com> writes:
> >> -----Original Message-----
> >> From: Richard Sandiford <richard.sandiford@arm.com>
> >> Sent: Monday, October 9, 2023 10:56 AM
> >> To: Tamar Christina <Tamar.Christina@arm.com>
> >> Cc: Richard Biener <richard.guenther@gmail.com>; gcc-patches@gcc.gnu.org;
> >> nd <nd@arm.com>; Richard Earnshaw <Richard.Earnshaw@arm.com>;
> >> Marcus Shawcroft <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> >> <Kyrylo.Tkachov@arm.com>
> >> Subject: Re: [PATCH]AArch64 Add SVE implementation for cond_copysign.
> >>
> >> Tamar Christina <Tamar.Christina@arm.com> writes:
> >> >> -----Original Message-----
> >> >> From: Richard Sandiford <richard.sandiford@arm.com>
> >> >> Sent: Saturday, October 7, 2023 10:58 AM
> >> >> To: Richard Biener <richard.guenther@gmail.com>
> >> >> Cc: Tamar Christina <Tamar.Christina@arm.com>;
> >> >> gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> >> >> <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> >> >> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> >> <Kyrylo.Tkachov@arm.com>
> >> >> Subject: Re: [PATCH]AArch64 Add SVE implementation for cond_copysign.
> >> >>
> >> >> Richard Biener <richard.guenther@gmail.com> writes:
> >> >> > On Thu, Oct 5, 2023 at 10:46 PM Tamar Christina
> >> >> <Tamar.Christina@arm.com> wrote:
> >> >> >>
> >> >> >> > -----Original Message-----
> >> >> >> > From: Richard Sandiford <richard.sandiford@arm.com>
> >> >> >> > Sent: Thursday, October 5, 2023 9:26 PM
> >> >> >> > To: Tamar Christina <Tamar.Christina@arm.com>
> >> >> >> > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard Earnshaw
> >> >> >> > <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> >> >> >> > <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> >> >> <Kyrylo.Tkachov@arm.com>
> >> >> >> > Subject: Re: [PATCH]AArch64 Add SVE implementation for
> >> >> cond_copysign.
> >> >> >> >
> >> >> >> > Tamar Christina <Tamar.Christina@arm.com> writes:
> >> >> >> > >> -----Original Message-----
> >> >> >> > >> From: Richard Sandiford <richard.sandiford@arm.com>
> >> >> >> > >> Sent: Thursday, October 5, 2023 8:29 PM
> >> >> >> > >> To: Tamar Christina <Tamar.Christina@arm.com>
> >> >> >> > >> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; Richard
> >> >> >> > >> Earnshaw <Richard.Earnshaw@arm.com>; Marcus Shawcroft
> >> >> >> > >> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov
> >> >> >> > <Kyrylo.Tkachov@arm.com>
> >> >> >> > >> Subject: Re: [PATCH]AArch64 Add SVE implementation for
> >> >> cond_copysign.
> >> >> >> > >>
> >> >> >> > >> Tamar Christina <tamar.christina@arm.com> writes:
> >> >> >> > >> > Hi All,
> >> >> >> > >> >
> >> >> >> > >> > This adds an implementation for masked copysign along with
> >> >> >> > >> > an optimized pattern for masked copysign (x, -1).
> >> >> >> > >>
> >> >> >> > >> It feels like we're ending up with a lot of AArch64-specific
> >> >> >> > >> code that just hard- codes the observation that changing the
> >> >> >> > >> sign is equivalent to changing the top bit.  We then need to
> >> >> >> > >> make sure that we choose the best way of changing the top bit
> >> >> >> > >> for any
> >> >> given situation.
> >> >> >> > >>
> >> >> >> > >> Hard-coding the -1/negative case is one instance of that.
> >> >> >> > >> But it looks like we also fail to use the best sequence for SVE2.  E.g.
> >> >> >> > >> [https://godbolt.org/z/ajh3MM5jv]:
> >> >> >> > >>
> >> >> >> > >> #include <stdint.h>
> >> >> >> > >>
> >> >> >> > >> void f(double *restrict a, double *restrict b) {
> >> >> >> > >>     for (int i = 0; i < 100; ++i)
> >> >> >> > >>         a[i] = __builtin_copysign(a[i], b[i]); }
> >> >> >> > >>
> >> >> >> > >> void g(uint64_t *restrict a, uint64_t *restrict b, uint64_t c) {
> >> >> >> > >>     for (int i = 0; i < 100; ++i)
> >> >> >> > >>         a[i] = (a[i] & ~c) | (b[i] & c); }
> >> >> >> > >>
> >> >> >> > >> gives:
> >> >> >> > >>
> >> >> >> > >> f:
> >> >> >> > >>         mov     x2, 0
> >> >> >> > >>         mov     w3, 100
> >> >> >> > >>         whilelo p7.d, wzr, w3
> >> >> >> > >> .L2:
> >> >> >> > >>         ld1d    z30.d, p7/z, [x0, x2, lsl 3]
> >> >> >> > >>         ld1d    z31.d, p7/z, [x1, x2, lsl 3]
> >> >> >> > >>         and     z30.d, z30.d, #0x7fffffffffffffff
> >> >> >> > >>         and     z31.d, z31.d, #0x8000000000000000
> >> >> >> > >>         orr     z31.d, z31.d, z30.d
> >> >> >> > >>         st1d    z31.d, p7, [x0, x2, lsl 3]
> >> >> >> > >>         incd    x2
> >> >> >> > >>         whilelo p7.d, w2, w3
> >> >> >> > >>         b.any   .L2
> >> >> >> > >>         ret
> >> >> >> > >> g:
> >> >> >> > >>         mov     x3, 0
> >> >> >> > >>         mov     w4, 100
> >> >> >> > >>         mov     z29.d, x2
> >> >> >> > >>         whilelo p7.d, wzr, w4
> >> >> >> > >> .L6:
> >> >> >> > >>         ld1d    z30.d, p7/z, [x0, x3, lsl 3]
> >> >> >> > >>         ld1d    z31.d, p7/z, [x1, x3, lsl 3]
> >> >> >> > >>         bsl     z31.d, z31.d, z30.d, z29.d
> >> >> >> > >>         st1d    z31.d, p7, [x0, x3, lsl 3]
> >> >> >> > >>         incd    x3
> >> >> >> > >>         whilelo p7.d, w3, w4
> >> >> >> > >>         b.any   .L6
> >> >> >> > >>         ret
> >> >> >> > >>
> >> >> >> > >> I saw that you originally tried to do this in match.pd and
> >> >> >> > >> that the decision was to fold to copysign instead.  But
> >> >> >> > >> perhaps there's a compromise where isel does something with
> >> >> >> > >> the (new) copysign canonical
> >> >> >> > form?
> >> >> >> > >> I.e. could we go with your new version of the match.pd patch,
> >> >> >> > >> and add some isel stuff as a follow-on?
> >>
> >> [A]
> >>
> >> >> >> > >>
> >> >> >> > >
> >> >> >> > > Sure if that's what's desired.... But..
> >> >> >> > >
> >> >> >> > > The example you posted above is for instance worse for x86
> >> >> >> > > https://godbolt.org/z/x9ccqxW6T where the first operation has
> >> >> >> > > a dependency chain of 2 and the latter of 3.  It's likely any
> >> >> >> > > open coding of this
> >> >> >> > operation is going to hurt a target.
> >> >> >> > >
> >> >> >> > > So I'm unsure what isel transform this into...
> >> >> >> >
> >> >> >> > I didn't mean that we should go straight to using isel for the
> >> >> >> > general case, just for the new case.  The example above was
> >> >> >> > instead trying to show the general point that hiding the logic
> >> >> >> > ops in target code is
> >> >> a double-edged sword.
> >> >> >>
> >> >> >> I see.. but the problem here is that transforming copysign (x, -1)
> >> >> >> into (x | 0x8000000) would require an integer operation on an FP
> >> >> >> value.  I'm happy to do it but it seems like it'll be an AArch64
> >> >> >> only thing
> >> >> anyway.
> >> >> >>
> >> >> >> If we want to do this we need to check can_change_mode_class or a
> >> hook.
> >> >> >> Most targets including x86 reject the conversion.  So it'll just
> >> >> >> be effectively an AArch64 thing.
> >> >> >>
> >> >> >> You're right that the actual equivalent transformation is this
> >> >> >> https://godbolt.org/z/KesfrMv5z But the target won't allow it.
> >> >> >>
> >> >> >> >
> >> >> >> > The x86_64 example for the -1 case would be
> >> >> >> > https://godbolt.org/z/b9s6MaKs8 where the isel change would be
> >> >> >> > an improvement.  Without that, I guess
> >> >> >> > x86_64 will need to have a similar patch to the AArch64 one.
> >> >> >> >
> >> >> >>
> >> >> >> I think that's to be expected.  I think it's logical that every
> >> >> >> target just needs to implement their optabs optimally.
> >> >> >>
> >> >> >> > That said, https://godbolt.org/z/e6nqoqbMh suggests that
> >> >> >> > powerpc64 is probably relying on the current copysign -> neg/abs
> >> transform.
> >> >> >> > (Not sure why the second function uses different IVs from the
> >> >> >> > first.)
> >> >> >> >
> >> >> >> > Personally, I wouldn't be against a target hook that indicated
> >> >> >> > whether float bit manipulation is "free" for a given mode, if it
> >> >> >> > comes to
> >> >> that.
> >> >> >>
> >> >> >> I'm really sure Richi would agree there.  Generally speaking I
> >> >> >> don't think people see doing FP operations on Int as beneficial.
> >> >> >> But it's always
> >> >> the case on AArch64.
> >> >> >
> >> >> > IIRC we're doing fpclassify "expansion" early for example.
> >> >> >
> >> >> > Note the issue I see is that the middle-end shouldn't get in the
> >> >> > way of targets that have a copysign optab.  In case it's worthwhile
> >> >> > to special-case a "setsign" thing then maybe provide an optab for
> >> >> > that as well.  Now, that doesn't help if we want setsign to be
> >> >> > expanded from the middle-end but still wan the copysign optab (and
> >> >> > not require targets to implement both if they want to escape
> >> >> > middle-end generic
> >> >> expansion of setsign).
> >> >> >
> >> >> > But yes, I also thought the , 1 and , -1 cases could be special
> >> >> > cased by RTL expansion (or ISEL).  But it would need to invoke
> >> >> > costing which likely means target specific changes anyway... :/
> >> >>
> >> >> FWIW, if we had the target hook I suggested, I don't think AArch64
> >> >> would need to implement copysign or xorsign optabs.  That's probably
> >> >> true of
> >> >> AArch32 too (at least for all combinations that are likely to matter these
> >> days).
> >> >>
> >> >> I'd go one step further and say that, if a target wants to do its own
> >> >> thing for copysign and xorsign, it clearly doesn't meet the
> >> >> requirement that bit manipulation of floats is "free" for that mode.
> >> >>
> >> >
> >> > So I'm aware I have no say here, but I'll make one last effort.
> >> >
> >> > The patch isn't just implementing the fneg (fabs ()) optimization just
> >> because.
> >> > The location where it's implemented makes a big difference.
> >> >
> >> > If this optimization is done late, it doesn't fix the regression
> >> > fully, because doing It after all optimization passes have run means it can't
> >> properly be optimized.
> >> >
> >> > The point of doing the rewrite early to ORR accomplished two things:
> >> >
> >> > 1. It makes PRE realize that the block it's splitting would only have 1
> >> instruction in it
> >> >     and that such a split is not beneficial.  This is why I'm against doing such
> >> optimizations
> >> >     so later. Optimizations don’t' happen in isolation, and when they make
> >> sense should
> >> >     happen early.  Transforming fneg (fabs (..)) in isel not only feels wrong but
> >> results in a 4%
> >> >     performance loss.
> >> >
> >> > 2. Doing it early also gets the ORRs to be reassociated changing where the
> >> loop dependent
> >> >      variable lands.  Early makes it land in the merging MOVPRFX rather than
> >> requiring a SEL
> >> >      at the end of the iteration.
> >> >
> >> > Replacing the fneg (fabs (..)) with copysign accomplishes 1 but no 2.
> >> > This results in a 2-3% loss, but I can live with that given doing 1 gets us back
> >> to GCC 12 levels.
> >> >
> >> > Doing fneg (fabs (..)) in isel would have no meaning for me and not
> >> > fix the regression.  I won't be looking to do that in that case.
> >> >
> >> > If it's acceptable I can transform COPYSIGN (X, -1) in gimple-isel.
> >>
> >> FWIW, this is what I meant by [A] above.  I.e. go with your latest target-
> >> independent change, and make isel replace COPYSIGN (X, -1) with logic
> >> operations, under control of a new target hook that says that logic operations
> >> on float values are as cheap as logic operations on integer values (for a given
> >> float mode).  In future, hopefully all COPYSIGNs and XORSIGNs would be
> >> handled the same way, under control of the same hook, but that would be a
> >> separate future change.
> >
> > Ok, would you prefer isel or in the expander that Richi suggested?
> > I assume isel is better if the intention is to later remove the expansion code?
>
> Yeah, isel sounds better to me FWIW (for that reason), but Richi would
> know better.

ISEL is supposed to be "RTL expansion on GIMPLE", and since we already have
copysign builtin expansion with several variants in the optab code, plus
direct IFN expansion, I think it wouldn't be good to add another place
pre-empting those.

> At least with isel, there's the theoretical possibility of doing
> simple optimisation before expand, or taking surrounding stmts
> into account, if that ever becomes necessary for future changes.

Yeah, but I doubt there's anything for copysign (x, -1) here.

Unfortunately IFN_COPYSIGN expansion isn't routed through
expand_copysign but goes through expand_fn_using_insn(?).
I suppose we can go back to DEF_INTERNAL_OPTAB_FN
for it and implement -1 handling in expand_COPYSIGN?
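
(A rough sketch of the constant test such an expand_COPYSIGN would need,
modelled on the check in the aarch64 cond_copysign expander quoted later in
the thread; the helper name is made up and this is not an existing GCC
function.)

/* Hypothetical helper: true if OP is a floating-point constant
   (possibly a duplicated vector constant) equal to -1.0.  */
static bool
fp_const_minus_one_p (rtx op)
{
  rtx elt = unwrap_const_vec_duplicate (op);
  return (CONST_DOUBLE_P (elt)
	  && real_to_integer (CONST_DOUBLE_REAL_VALUE (elt)) == -1);
}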

There's also the case being made for handling this in pattern recognition,
for targets that don't implement a vector copysign expander.

Richard.

>
> Thanks,
> Richard

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH]AArch64 Add SVE implementation for cond_copysign.
  2023-10-05 18:21 [PATCH]AArch64 Add SVE implementation for cond_copysign Tamar Christina
  2023-10-05 19:28 ` Richard Sandiford
@ 2023-10-19 21:29 ` Richard Sandiford
  1 sibling, 0 replies; 16+ messages in thread
From: Richard Sandiford @ 2023-10-19 21:29 UTC (permalink / raw)
  To: Tamar Christina
  Cc: gcc-patches, nd, Richard.Earnshaw, Marcus.Shawcroft, Kyrylo.Tkachov

Tamar Christina <tamar.christina@arm.com> writes:
> Hi All,
>
> This adds an implementation for masked copysign along with an optimized
> pattern for masked copysign (x, -1).
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	PR tree-optimization/109154
> 	* config/aarch64/aarch64-sve.md (cond_copysign<mode>): New.
>
> gcc/testsuite/ChangeLog:
>
> 	PR tree-optimization/109154
> 	* gcc.target/aarch64/sve/fneg-abs_5.c: New test.
>
> --- inline copy of patch -- 
> diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
> index 071400c820a5b106ddf9dc9faebb117975d74ea0..00ca30c24624dc661254568f45b61a14aa11c305 100644
> --- a/gcc/config/aarch64/aarch64-sve.md
> +++ b/gcc/config/aarch64/aarch64-sve.md
> @@ -6429,6 +6429,57 @@ (define_expand "copysign<mode>3"
>    }
>  )
>  
> +(define_expand "cond_copysign<mode>"
> +  [(match_operand:SVE_FULL_F 0 "register_operand")
> +   (match_operand:<VPRED> 1 "register_operand")
> +   (match_operand:SVE_FULL_F 2 "register_operand")
> +   (match_operand:SVE_FULL_F 3 "nonmemory_operand")
> +   (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")]
> +  "TARGET_SVE"
> +  {
> +    rtx sign = gen_reg_rtx (<V_INT_EQUIV>mode);
> +    rtx mant = gen_reg_rtx (<V_INT_EQUIV>mode);
> +    rtx int_res = gen_reg_rtx (<V_INT_EQUIV>mode);
> +    int bits = GET_MODE_UNIT_BITSIZE (<MODE>mode) - 1;
> +
> +    rtx arg2 = lowpart_subreg (<V_INT_EQUIV>mode, operands[2], <MODE>mode);
> +    rtx arg3 = lowpart_subreg (<V_INT_EQUIV>mode, operands[3], <MODE>mode);
> +    rtx arg4 = lowpart_subreg (<V_INT_EQUIV>mode, operands[4], <MODE>mode);
> +
> +    rtx v_sign_bitmask
> +      = aarch64_simd_gen_const_vector_dup (<V_INT_EQUIV>mode,
> +					   HOST_WIDE_INT_M1U << bits);
> +
> +    /* copysign (x, -1) should instead be expanded as orr with the sign
> +       bit.  */
> +    if (!REG_P (operands[3]))
> +      {
> +	auto r0
> +	  = CONST_DOUBLE_REAL_VALUE (unwrap_const_vec_duplicate (operands[3]));
> +	if (-1 == real_to_integer (r0))

OK with the same change and under the same conditions as the FP/SIMD patch.

Thanks,
Richard

> +	  {
> +	    arg3 = force_reg (<V_INT_EQUIV>mode, v_sign_bitmask);
> +	    emit_insn (gen_cond_ior<v_int_equiv> (int_res, operands[1], arg2,
> +						  arg3, arg4));
> +	    emit_move_insn (operands[0], gen_lowpart (<MODE>mode, int_res));
> +	    DONE;
> +	  }
> +      }
> +
> +    operands[2] = force_reg (<MODE>mode, operands[3]);
> +    emit_insn (gen_and<v_int_equiv>3 (sign, arg3, v_sign_bitmask));
> +    emit_insn (gen_and<v_int_equiv>3
> +	       (mant, arg2,
> +		aarch64_simd_gen_const_vector_dup (<V_INT_EQUIV>mode,
> +						   ~(HOST_WIDE_INT_M1U
> +						     << bits))));
> +    emit_insn (gen_cond_ior<v_int_equiv> (int_res, operands[1], sign, mant,
> +					  arg4));
> +    emit_move_insn (operands[0], gen_lowpart (<MODE>mode, int_res));
> +    DONE;
> +  }
> +)
> +
>  (define_expand "xorsign<mode>3"
>    [(match_operand:SVE_FULL_F 0 "register_operand")
>     (match_operand:SVE_FULL_F 1 "register_operand")
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_5.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_5.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..f4ecbeecbe1290134e688f46a4389d17155e4a0a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_5.c
> @@ -0,0 +1,36 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
> +
> +#include <arm_neon.h>
> +#include <math.h>
> +
> +/*
> +** f1:
> +**	...
> +**	orr	z[0-9]+.s, p[0-9]+/m, z[0-9]+.s, z[0-9]+.s
> +**	...
> +*/
> +void f1 (float32_t *a, int n)
> +{
> +  for (int i = 0; i < (n & -8); i++)
> +   if (a[i] > n)
> +     a[i] = -fabsf (a[i]);
> +   else
> +     a[i] = n;
> +}
> +
> +/*
> +** f2:
> +**	...
> +**	orr	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d
> +**	...
> +*/
> +void f2 (float64_t *a, int n)
> +{
> +  for (int i = 0; i < (n & -8); i++)
> +   if (a[i] > n)
> +     a[i] = -fabs (a[i]);
> +   else
> +     a[i] = n;
> +}

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2023-10-19 21:30 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-10-05 18:21 [PATCH]AArch64 Add SVE implementation for cond_copysign Tamar Christina
2023-10-05 19:28 ` Richard Sandiford
2023-10-05 19:47   ` Tamar Christina
2023-10-05 20:25     ` Richard Sandiford
2023-10-05 20:45       ` Tamar Christina
2023-10-06  7:32         ` Richard Biener
2023-10-07  9:57           ` Richard Sandiford
2023-10-09  9:38             ` Tamar Christina
2023-10-09  9:45               ` Richard Biener
2023-10-09  9:55                 ` Tamar Christina
2023-10-09  9:56               ` Richard Sandiford
2023-10-09 10:09                 ` Tamar Christina
2023-10-09 10:17                   ` Richard Sandiford
2023-10-09 11:30                     ` Richard Biener
2023-10-05 20:34     ` Andrew Pinski
2023-10-19 21:29 ` Richard Sandiford
