public inbox for gcc-patches@gcc.gnu.org
* [PATCH v1] [RFC] Improve folding for comparisons with zero in tree-ssa-forwprop.
@ 2023-03-16 15:27 Manolis Tsamis
  2023-03-16 16:41 ` Jeff Law
  2023-03-17  8:31 ` Richard Biener
  0 siblings, 2 replies; 20+ messages in thread
From: Manolis Tsamis @ 2023-03-16 15:27 UTC (permalink / raw)
  To: gcc-patches; +Cc: Philipp Tomsich, Richard Biener, Manolis Tsamis

For this C testcase:

void g();
void f(unsigned int *a)
{
  if (++*a == 1)
    g();
}

GCC will currently emit a comparison with 1 by using the value
of *a after the increment. This can be improved by comparing
against 0 and using the value before the increment. As a result
there is a potentially shorter dependency chain (no need to wait
for the result of +1) and on targets with compare zero instructions
the generated code is one instruction shorter.

Example from AArch64:

Before
        ldr     w1, [x0]
        add     w1, w1, 1
        str     w1, [x0]
        cmp     w1, 1
        beq     .L4
        ret

After
        ldr     w1, [x0]
        add     w2, w1, 1
        str     w2, [x0]
        cbz     w1, .L4
        ret
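
For reference, a hand-written source-level sketch of the two forms (this
is only an illustration in equivalent C, not compiler output; the pass
itself works on GIMPLE):

void g();

/* Current form: compare the incremented value against 1.  */
void f_current(unsigned int *a)
{
  unsigned int newval = *a + 1;
  *a = newval;
  if (newval == 1)
    g();
}

/* Folded form: compare the original value against 0; the branch no
   longer depends on the result of the addition.  */
void f_folded(unsigned int *a)
{
  unsigned int oldval = *a;
  *a = oldval + 1;
  if (oldval == 0)
    g();
}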

gcc/ChangeLog:

        * tree-ssa-forwprop.cc (combine_cond_expr_cond): Optimize
        for zero comparisons.
        (forward_propagate_into_comparison_1): Likewise.

Signed-off-by: Manolis Tsamis <manolis.tsamis@vrull.eu>
---

 gcc/tree-ssa-forwprop.cc | 41 +++++++++++++++++++++++++++-------------
 1 file changed, 28 insertions(+), 13 deletions(-)

diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
index e34f0888954..93d5043821b 100644
--- a/gcc/tree-ssa-forwprop.cc
+++ b/gcc/tree-ssa-forwprop.cc
@@ -373,12 +373,13 @@ rhs_to_tree (tree type, gimple *stmt)
 /* Combine OP0 CODE OP1 in the context of a COND_EXPR.  Returns
    the folded result in a form suitable for COND_EXPR_COND or
    NULL_TREE, if there is no suitable simplified form.  If
-   INVARIANT_ONLY is true only gimple_min_invariant results are
-   considered simplified.  */
+   ALWAYS_COMBINE is false then only combine if the resulting
+   expression is gimple_min_invariant or considered simplified
+   compared to the original.  */
 
 static tree
 combine_cond_expr_cond (gimple *stmt, enum tree_code code, tree type,
-			tree op0, tree op1, bool invariant_only)
+			tree op0, tree op1, bool always_combine)
 {
   tree t;
 
@@ -398,17 +399,31 @@ combine_cond_expr_cond (gimple *stmt, enum tree_code code, tree type,
   /* Canonicalize the combined condition for use in a COND_EXPR.  */
   t = canonicalize_cond_expr_cond (t);
 
-  /* Bail out if we required an invariant but didn't get one.  */
-  if (!t || (invariant_only && !is_gimple_min_invariant (t)))
+  if (!t)
     {
       fold_undefer_overflow_warnings (false, NULL, 0);
       return NULL_TREE;
     }
 
-  bool nowarn = warning_suppressed_p (stmt, OPT_Wstrict_overflow);
-  fold_undefer_overflow_warnings (!nowarn, stmt, 0);
+  if (always_combine || is_gimple_min_invariant (t))
+    {
+      bool nowarn = warning_suppressed_p (stmt, OPT_Wstrict_overflow);
+      fold_undefer_overflow_warnings (!nowarn, stmt, 0);
+      return t;
+    }
 
-  return t;
+  /* If the result of folding is a zero comparison treat it preferentially.  */
+  if (TREE_CODE_CLASS (TREE_CODE (t)) == tcc_comparison
+      && integer_zerop (TREE_OPERAND (t, 1))
+      && !integer_zerop (op1))
+    {
+      bool nowarn = warning_suppressed_p (stmt, OPT_Wstrict_overflow);
+      fold_undefer_overflow_warnings (!nowarn, stmt, 0);
+      return t;
+    }
+
+  fold_undefer_overflow_warnings (false, NULL, 0);
+  return NULL_TREE;
 }
 
 /* Combine the comparison OP0 CODE OP1 at LOC with the defining statements
@@ -432,7 +447,7 @@ forward_propagate_into_comparison_1 (gimple *stmt,
       if (def_stmt && can_propagate_from (def_stmt))
 	{
 	  enum tree_code def_code = gimple_assign_rhs_code (def_stmt);
-	  bool invariant_only_p = !single_use0_p;
+	  bool always_combine = single_use0_p;
 
 	  rhs0 = rhs_to_tree (TREE_TYPE (op1), def_stmt);
 
@@ -442,10 +457,10 @@ forward_propagate_into_comparison_1 (gimple *stmt,
 		   && TREE_CODE (TREE_TYPE (TREE_OPERAND (rhs0, 0)))
 		      == BOOLEAN_TYPE)
 		  || TREE_CODE_CLASS (def_code) == tcc_comparison))
-	    invariant_only_p = false;
+	    always_combine = true;
 
 	  tmp = combine_cond_expr_cond (stmt, code, type,
-					rhs0, op1, invariant_only_p);
+					rhs0, op1, always_combine);
 	  if (tmp)
 	    return tmp;
 	}
@@ -459,7 +474,7 @@ forward_propagate_into_comparison_1 (gimple *stmt,
 	{
 	  rhs1 = rhs_to_tree (TREE_TYPE (op0), def_stmt);
 	  tmp = combine_cond_expr_cond (stmt, code, type,
-					op0, rhs1, !single_use1_p);
+					op0, rhs1, single_use1_p);
 	  if (tmp)
 	    return tmp;
 	}
@@ -470,7 +485,7 @@ forward_propagate_into_comparison_1 (gimple *stmt,
       && rhs1 != NULL_TREE)
     tmp = combine_cond_expr_cond (stmt, code, type,
 				  rhs0, rhs1,
-				  !(single_use0_p && single_use1_p));
+				  single_use0_p && single_use1_p);
 
   return tmp;
 }
-- 
2.34.1



* Re: [PATCH v1] [RFC] Improve folding for comparisons with zero in tree-ssa-forwprop.
  2023-03-16 15:27 [PATCH v1] [RFC] Improve folding for comparisons with zero in tree-ssa-forwprop Manolis Tsamis
@ 2023-03-16 16:41 ` Jeff Law
  2023-03-16 20:32   ` Philipp Tomsich
  2023-03-17  8:31 ` Richard Biener
  1 sibling, 1 reply; 20+ messages in thread
From: Jeff Law @ 2023-03-16 16:41 UTC (permalink / raw)
  To: Manolis Tsamis, gcc-patches; +Cc: Philipp Tomsich, Richard Biener



On 3/16/23 09:27, Manolis Tsamis wrote:
> For this C testcase:
> 
> void g();
> void f(unsigned int *a)
> {
>    if (++*a == 1)
>      g();
> }
> 
> GCC will currently emit a comparison with 1 by using the value
> of *a after the increment. This can be improved by comparing
> against 0 and using the value before the increment. As a result
> there is a potentially shorter dependancy chain (no need to wait
> for the result of +1) and on targets with compare zero instructions
> the generated code is one instruction shorter.
> 
> Example from Aarch64:
> 
> Before
>          ldr     w1, [x0]
>          add     w1, w1, 1
>          str     w1, [x0]
>          cmp     w1, 1
>          beq     .L4
>          ret
> 
> After
>          ldr     w1, [x0]
>          add     w2, w1, 1
>          str     w2, [x0]
>          cbz     w1, .L4
>          ret
> 
> gcc/ChangeLog:
> 
>          * tree-ssa-forwprop.cc (combine_cond_expr_cond):
>          (forward_propagate_into_comparison_1): Optimize
>          for zero comparisons.
Deferring to gcc-14.  Though I'm generally supportive of normalizing to 
a comparison against zero when we safely can :-)

jeff


* Re: [PATCH v1] [RFC] Improve folding for comparisons with zero in tree-ssa-forwprop.
  2023-03-16 16:41 ` Jeff Law
@ 2023-03-16 20:32   ` Philipp Tomsich
  0 siblings, 0 replies; 20+ messages in thread
From: Philipp Tomsich @ 2023-03-16 20:32 UTC (permalink / raw)
  To: Jeff Law; +Cc: Manolis Tsamis, gcc-patches, Richard Biener


Just to add a bit more color on this one...
It was originally observed in (and isolated from)
_ZN11xalanc_1_1027XalanReferenceCountedObject12addReferenceEPS0_ and
reproduces on both AArch64 and RISC-V.

The basic block (annotated with dynamic instructions executed and
percentage of total dynamic instructions) looks as follows:

>   0x0000000000511488 4589868875 0.4638%
> _ZN11xalanc_1_1027XalanReferenceCountedObject12addReferenceEPS0_
>       4518              lw              a4,8(a0)
>       0017029b          addiw           t0,a4,1
>       00552423          sw              t0,8(a0)
>       4685              addi            a3,zero,1
>       00d28363          beq             t0,a3,6         # 0x51149a


This change reduces the instruction count on RISC-V by one compressible
instruction (2 bytes) and on AArch64 by one instruction (4 bytes).
No execution time improvement (measured on Neoverse-N1) — as would be
expected.

--Philipp.


On Thu, 16 Mar 2023 at 17:41, Jeff Law <jeffreyalaw@gmail.com> wrote:

>
>
> On 3/16/23 09:27, Manolis Tsamis wrote:
> > For this C testcase:
> >
> > void g();
> > void f(unsigned int *a)
> > {
> >    if (++*a == 1)
> >      g();
> > }
> >
> > GCC will currently emit a comparison with 1 by using the value
> > of *a after the increment. This can be improved by comparing
> > against 0 and using the value before the increment. As a result
> > there is a potentially shorter dependancy chain (no need to wait
> > for the result of +1) and on targets with compare zero instructions
> > the generated code is one instruction shorter.
> >
> > Example from Aarch64:
> >
> > Before
> >          ldr     w1, [x0]
> >          add     w1, w1, 1
> >          str     w1, [x0]
> >          cmp     w1, 1
> >          beq     .L4
> >          ret
> >
> > After
> >          ldr     w1, [x0]
> >          add     w2, w1, 1
> >          str     w2, [x0]
> >          cbz     w1, .L4
> >          ret
> >
> > gcc/ChangeLog:
> >
> >          * tree-ssa-forwprop.cc (combine_cond_expr_cond):
> >          (forward_propagate_into_comparison_1): Optimize
> >          for zero comparisons.
> Deferring to gcc-14.  Though I'm generally supportive of normalizing to
> a comparison against zero when we safely can :-)
>
> jeff
>


* Re: [PATCH v1] [RFC] Improve folding for comparisons with zero in tree-ssa-forwprop.
  2023-03-16 15:27 [PATCH v1] [RFC] Improve folding for comparisons with zero in tree-ssa-forwprop Manolis Tsamis
  2023-03-16 16:41 ` Jeff Law
@ 2023-03-17  8:31 ` Richard Biener
  2023-03-17 13:15   ` Philipp Tomsich
                     ` (2 more replies)
  1 sibling, 3 replies; 20+ messages in thread
From: Richard Biener @ 2023-03-17  8:31 UTC (permalink / raw)
  To: Manolis Tsamis, Andrew MacLeod; +Cc: gcc-patches, Philipp Tomsich

On Thu, Mar 16, 2023 at 4:27 PM Manolis Tsamis <manolis.tsamis@vrull.eu> wrote:
>
> For this C testcase:
>
> void g();
> void f(unsigned int *a)
> {
>   if (++*a == 1)
>     g();
> }
>
> GCC will currently emit a comparison with 1 by using the value
> of *a after the increment. This can be improved by comparing
> against 0 and using the value before the increment. As a result
> there is a potentially shorter dependancy chain (no need to wait
> for the result of +1) and on targets with compare zero instructions
> the generated code is one instruction shorter.

The downside is we now need two registers and their lifetime overlaps.

Your patch mixes changing / inverting a parameter (which seems unneeded
for the actual change) with preferring compares against zero.

What's the reason to specifically prefer compares against zero?  On x86
we have add that sets flags, so ++*a == 0 would be preferred, but
for your sequence we'd need a test reg, reg; branch on zero, so we do
not save any instruction.

We do have quite a number of bug reports about making VRP's
life harder when splitting things this way.  It's easier for VRP to handle

  _1 = _2 + 1;
  if (_1 == 1)

than it is

  _1 = _2 + 1;
  if (_2 == 0)

where VRP fails to derive a range for _1 on the _2 == 0 branch.  So besides
the life-range issue there are other side effects as well.  Maybe ranger
meanwhile can handle the above case?
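
A small made-up example of the kind of follow-up simplification this
affects (purely illustrative, not taken from a bug report): if the
incremented value is also used on the taken branch, the _1 == 1 form
hands later passes its value directly, while the _2 == 0 form requires
recomputing it from the range of _2.

void g(unsigned int);
void f(unsigned int *a)
{
  unsigned int v = ++*a;   /* _1 = _2 + 1 */
  if (v == 1)              /* after the transform: if (_2 == 0) */
    g(v);                  /* folding this to g(1) needs _1's range on the branch */
}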

What's the overall effect of the change on a larger code base?

Thanks,
Richard.

>
> Example from Aarch64:
>
> Before
>         ldr     w1, [x0]
>         add     w1, w1, 1
>         str     w1, [x0]
>         cmp     w1, 1
>         beq     .L4
>         ret
>
> After
>         ldr     w1, [x0]
>         add     w2, w1, 1
>         str     w2, [x0]
>         cbz     w1, .L4
>         ret
>
> gcc/ChangeLog:
>
>         * tree-ssa-forwprop.cc (combine_cond_expr_cond):
>         (forward_propagate_into_comparison_1): Optimize
>         for zero comparisons.
>
> Signed-off-by: Manolis Tsamis <manolis.tsamis@vrull.eu>
> ---
>
>  gcc/tree-ssa-forwprop.cc | 41 +++++++++++++++++++++++++++-------------
>  1 file changed, 28 insertions(+), 13 deletions(-)
>
> diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
> index e34f0888954..93d5043821b 100644
> --- a/gcc/tree-ssa-forwprop.cc
> +++ b/gcc/tree-ssa-forwprop.cc
> @@ -373,12 +373,13 @@ rhs_to_tree (tree type, gimple *stmt)
>  /* Combine OP0 CODE OP1 in the context of a COND_EXPR.  Returns
>     the folded result in a form suitable for COND_EXPR_COND or
>     NULL_TREE, if there is no suitable simplified form.  If
> -   INVARIANT_ONLY is true only gimple_min_invariant results are
> -   considered simplified.  */
> +   ALWAYS_COMBINE is false then only combine it the resulting
> +   expression is gimple_min_invariant or considered simplified
> +   compared to the original.  */
>
>  static tree
>  combine_cond_expr_cond (gimple *stmt, enum tree_code code, tree type,
> -                       tree op0, tree op1, bool invariant_only)
> +                       tree op0, tree op1, bool always_combine)
>  {
>    tree t;
>
> @@ -398,17 +399,31 @@ combine_cond_expr_cond (gimple *stmt, enum tree_code code, tree type,
>    /* Canonicalize the combined condition for use in a COND_EXPR.  */
>    t = canonicalize_cond_expr_cond (t);
>
> -  /* Bail out if we required an invariant but didn't get one.  */
> -  if (!t || (invariant_only && !is_gimple_min_invariant (t)))
> +  if (!t)
>      {
>        fold_undefer_overflow_warnings (false, NULL, 0);
>        return NULL_TREE;
>      }
>
> -  bool nowarn = warning_suppressed_p (stmt, OPT_Wstrict_overflow);
> -  fold_undefer_overflow_warnings (!nowarn, stmt, 0);
> +  if (always_combine || is_gimple_min_invariant (t))
> +    {
> +      bool nowarn = warning_suppressed_p (stmt, OPT_Wstrict_overflow);
> +      fold_undefer_overflow_warnings (!nowarn, stmt, 0);
> +      return t;
> +    }
>
> -  return t;
> +  /* If the result of folding is a zero comparison treat it preferentially.  */
> +  if (TREE_CODE_CLASS (TREE_CODE (t)) == tcc_comparison
> +      && integer_zerop (TREE_OPERAND (t, 1))
> +      && !integer_zerop (op1))
> +    {
> +      bool nowarn = warning_suppressed_p (stmt, OPT_Wstrict_overflow);
> +      fold_undefer_overflow_warnings (!nowarn, stmt, 0);
> +      return t;
> +    }
> +
> +  fold_undefer_overflow_warnings (false, NULL, 0);
> +  return NULL_TREE;
>  }
>
>  /* Combine the comparison OP0 CODE OP1 at LOC with the defining statements
> @@ -432,7 +447,7 @@ forward_propagate_into_comparison_1 (gimple *stmt,
>        if (def_stmt && can_propagate_from (def_stmt))
>         {
>           enum tree_code def_code = gimple_assign_rhs_code (def_stmt);
> -         bool invariant_only_p = !single_use0_p;
> +         bool always_combine = single_use0_p;
>
>           rhs0 = rhs_to_tree (TREE_TYPE (op1), def_stmt);
>
> @@ -442,10 +457,10 @@ forward_propagate_into_comparison_1 (gimple *stmt,
>                    && TREE_CODE (TREE_TYPE (TREE_OPERAND (rhs0, 0)))
>                       == BOOLEAN_TYPE)
>                   || TREE_CODE_CLASS (def_code) == tcc_comparison))
> -           invariant_only_p = false;
> +           always_combine = true;
>
>           tmp = combine_cond_expr_cond (stmt, code, type,
> -                                       rhs0, op1, invariant_only_p);
> +                                       rhs0, op1, always_combine);
>           if (tmp)
>             return tmp;
>         }
> @@ -459,7 +474,7 @@ forward_propagate_into_comparison_1 (gimple *stmt,
>         {
>           rhs1 = rhs_to_tree (TREE_TYPE (op0), def_stmt);
>           tmp = combine_cond_expr_cond (stmt, code, type,
> -                                       op0, rhs1, !single_use1_p);
> +                                       op0, rhs1, single_use1_p);
>           if (tmp)
>             return tmp;
>         }
> @@ -470,7 +485,7 @@ forward_propagate_into_comparison_1 (gimple *stmt,
>        && rhs1 != NULL_TREE)
>      tmp = combine_cond_expr_cond (stmt, code, type,
>                                   rhs0, rhs1,
> -                                 !(single_use0_p && single_use1_p));
> +                                 single_use0_p && single_use1_p);
>
>    return tmp;
>  }
> --
> 2.34.1
>


* Re: [PATCH v1] [RFC] Improve folding for comparisons with zero in tree-ssa-forwprop.
  2023-03-17  8:31 ` Richard Biener
@ 2023-03-17 13:15   ` Philipp Tomsich
  2023-03-17 14:03     ` Richard Biener
  2023-03-17 20:43     ` Andrew Waterman
  2023-03-17 14:12   ` Andrew MacLeod
  2023-03-20 14:01   ` Manolis Tsamis
  2 siblings, 2 replies; 20+ messages in thread
From: Philipp Tomsich @ 2023-03-17 13:15 UTC (permalink / raw)
  To: Richard Biener; +Cc: Manolis Tsamis, Andrew MacLeod, gcc-patches

On Fri, 17 Mar 2023 at 09:31, Richard Biener <richard.guenther@gmail.com> wrote:
>
> On Thu, Mar 16, 2023 at 4:27 PM Manolis Tsamis <manolis.tsamis@vrull.eu> wrote:
> >
> > For this C testcase:
> >
> > void g();
> > void f(unsigned int *a)
> > {
> >   if (++*a == 1)
> >     g();
> > }
> >
> > GCC will currently emit a comparison with 1 by using the value
> > of *a after the increment. This can be improved by comparing
> > against 0 and using the value before the increment. As a result
> > there is a potentially shorter dependancy chain (no need to wait
> > for the result of +1) and on targets with compare zero instructions
> > the generated code is one instruction shorter.
>
> The downside is we now need two registers and their lifetime overlaps.
>
> Your patch mixes changing / inverting a parameter (which seems unneeded
> for the actual change) with preferring compares against zero.
>
> What's the reason to specifically prefer compares against zero?  On x86
> we have add that sets flags, so ++*a == 0 would be preferred, but
> for your sequence we'd need a test reg, reg; branch on zero, so we do
> not save any instruction.

AArch64, RISC-V and MIPS support a branch-on-(not-)equals-zero, while
comparing against a constant requires loading any non-zero value into
a register first.
This feels a bit like we need to call into the backend to check
whether comparisons against 0 are cheaper.

Obviously, the underlying issue becomes worse if the immediate cannot
be built up in a single instruction.
Using RISC-V as an example (primarily, as RISC-V makes it particularly
easy to run into multi-instruction sequences for constants), we can
construct the following case:

  void f(unsigned int *a)
  {
    if ((*a += 0x900) == 0x900)
       g();
  }

which GCC 12.2.0 (trunk may already be smart enough to reuse the
constant once loaded into a register, but I did not check…) with -O3
turns into:

f:
  lw a4,0(a0)
  li a5,4096
  addiw a5,a5,-1792
  addw a4,a5,a4
  li a5,4096
  sw a4,0(a0)
  addi a5,a5,-1792
  beq a4,a5,.L4
  ret
.L4:
  tail g
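
For completeness, the rewrite is valid here because, with unsigned
wrap-around, (x + 0x900) == 0x900 holds exactly when x == 0, so the
constant would only be needed for the addition itself. A throwaway,
GCC-independent check of that identity:

#include <assert.h>

int main(void)
{
  /* Sample values, including ones that wrap around the addition.  */
  unsigned int samples[] = { 0u, 1u, 0x8ffu, 0x900u, 0x901u,
                             0xfffff6ffu, 0xfffff700u, 0xffffffffu };
  for (unsigned int i = 0; i < sizeof samples / sizeof samples[0]; i++)
    assert ((samples[i] + 0x900u == 0x900u) == (samples[i] == 0u));
  return 0;
}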

Thanks,
Philipp.



* Re: [PATCH v1] [RFC] Improve folding for comparisons with zero in tree-ssa-forwprop.
  2023-03-17 13:15   ` Philipp Tomsich
@ 2023-03-17 14:03     ` Richard Biener
  2023-03-17 20:43     ` Andrew Waterman
  1 sibling, 0 replies; 20+ messages in thread
From: Richard Biener @ 2023-03-17 14:03 UTC (permalink / raw)
  To: Philipp Tomsich; +Cc: Manolis Tsamis, Andrew MacLeod, gcc-patches

On Fri, Mar 17, 2023 at 2:15 PM Philipp Tomsich
<philipp.tomsich@vrull.eu> wrote:
>
> On Fri, 17 Mar 2023 at 09:31, Richard Biener <richard.guenther@gmail.com> wrote:
> >
> > On Thu, Mar 16, 2023 at 4:27 PM Manolis Tsamis <manolis.tsamis@vrull.eu> wrote:
> > >
> > > For this C testcase:
> > >
> > > void g();
> > > void f(unsigned int *a)
> > > {
> > >   if (++*a == 1)
> > >     g();
> > > }
> > >
> > > GCC will currently emit a comparison with 1 by using the value
> > > of *a after the increment. This can be improved by comparing
> > > against 0 and using the value before the increment. As a result
> > > there is a potentially shorter dependancy chain (no need to wait
> > > for the result of +1) and on targets with compare zero instructions
> > > the generated code is one instruction shorter.
> >
> > The downside is we now need two registers and their lifetime overlaps.
> >
> > Your patch mixes changing / inverting a parameter (which seems unneeded
> > for the actual change) with preferring compares against zero.
> >
> > What's the reason to specifically prefer compares against zero?  On x86
> > we have add that sets flags, so ++*a == 0 would be preferred, but
> > for your sequence we'd need a test reg, reg; branch on zero, so we do
> > not save any instruction.
>
> AArch64, RISC-V and MIPS support a branch-on-(not-)equals-zero, while
> comparing against a constant requires to load any non-zero value into
> a register first.
> This feels a bit like we need to call onto the backend to check
> whether comparisons against 0 are cheaper.

Hmm - we should try hard to not introduce too many target dependent IL
differences early.  I do wonder if it's possible to handle this later
(and also the reverse transform).  Ideally we'd know register pressure
and the cost of the constants involved as well.  We do some related
"fixups" when rewriting out-of-SSA, mainly to improve coalescing
around backedges and to avoid overlapping life ranges there for loop
exit compares with before/after IV values that are live after the loop.

> Obviously, the underlying issue become worse if the immediate can not
> be built up in a single instruction.
> Using RISC-V as an example (primarily, as RISC-V makes it particularly
> easy to run into multi-instruction sequences for constants), we can
> construct the following case:
>
>   void f(unsigned int *a)
>   {
>     if ((*a += 0x900) == 0x900)
>        g();
>   }
>
> which GCC 12.2.0 (trunk may already be small enough to reuse the
> constant once loaded into register, but I did not check…) with -O3
> turns into:
>
> f:
>   lw a4,0(a0)
>   li a5,4096
>   addiw a5,a5,-1792
>   addw a4,a5,a4
>   li a5,4096
>   sw a4,0(a0)
>   addi a5,a5,-1792
>   beq a4,a5,.L4
>   ret
> .L4:
>   tail g
>
> Thanks,
> Philipp.
>

* Re: [PATCH v1] [RFC] Improve folding for comparisons with zero in tree-ssa-forwprop.
  2023-03-17  8:31 ` Richard Biener
  2023-03-17 13:15   ` Philipp Tomsich
@ 2023-03-17 14:12   ` Andrew MacLeod
  2023-03-20 14:01   ` Manolis Tsamis
  2 siblings, 0 replies; 20+ messages in thread
From: Andrew MacLeod @ 2023-03-17 14:12 UTC (permalink / raw)
  To: Richard Biener, Manolis Tsamis
  Cc: gcc-patches, Philipp Tomsich, hernandez, aldy


On 3/17/23 04:31, Richard Biener wrote:
> On Thu, Mar 16, 2023 at 4:27 PM Manolis Tsamis <manolis.tsamis@vrull.eu> wrote:
>> For this C testcase:
>>
>> void g();
>> void f(unsigned int *a)
>> {
>>    if (++*a == 1)
>>      g();
>> }
>>
>> GCC will currently emit a comparison with 1 by using the value
>> of *a after the increment. This can be improved by comparing
>> against 0 and using the value before the increment. As a result
>> there is a potentially shorter dependancy chain (no need to wait
>> for the result of +1) and on targets with compare zero instructions
>> the generated code is one instruction shorter.
> The downside is we now need two registers and their lifetime overlaps.
>
> Your patch mixes changing / inverting a parameter (which seems unneeded
> for the actual change) with preferring compares against zero.
>
> What's the reason to specifically prefer compares against zero?  On x86
> we have add that sets flags, so ++*a == 0 would be preferred, but
> for your sequence we'd need a test reg, reg; branch on zero, so we do
> not save any instruction.
>
> We do have quite some number of bugreports with regards to making VRPs
> life harder when splitting things this way.  It's easier for VRP to handle
>
>    _1 = _2 + 1;
>    if (_1 == 1)
>
> than it is
>
>    _1 = _2 + 1;
>    if (_2 == 0)
>
> where VRP fails to derive a range for _1 on the _2 == 0 branch.  So besides


Heh?

     _1 = *a_5(D);
     b_6 = _1 + 1;
     if (_1 == 0)
       goto <bb 3>; [INV]
     else
       goto <bb 4>; [INV]

2->3  (T) _1 :  [irange] unsigned int [0, 0] NONZERO 0x0
2->3  (T) b_6 :         [irange] unsigned int [1, 1] NONZERO 0x1
2->4  (F) _1 :  [irange] unsigned int [1, +INF]
2->4  (F) b_6 :         [irange] unsigned int [0, 0][2, +INF]

I will grant you that if the definition of b_6 is in a different basic 
block from the if (_1 == 0) we may not always get a range for it, but 
generally this should be OK, especially within a basic block.
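
A made-up C shape for that cross-block case (illustrative only):

void g(void);

unsigned int f(unsigned int *a, int flag)
{
  unsigned int old = *a;        /* _1 */
  unsigned int inc = old + 1;   /* b_6, defined in this block */
  *a = inc;
  if (flag)
    g();                        /* the zero test below ends up in a later block */
  if (old == 0)                 /* _1 == 0: inc's range must be recomputed here */
    return inc;
  return 0;
}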

I do have a few re-computation cases to improve upon of course :-P

Andrew





* Re: [PATCH v1] [RFC] Improve folding for comparisons with zero in tree-ssa-forwprop.
  2023-03-17 13:15   ` Philipp Tomsich
  2023-03-17 14:03     ` Richard Biener
@ 2023-03-17 20:43     ` Andrew Waterman
  1 sibling, 0 replies; 20+ messages in thread
From: Andrew Waterman @ 2023-03-17 20:43 UTC (permalink / raw)
  To: Philipp Tomsich
  Cc: Richard Biener, Manolis Tsamis, Andrew MacLeod, gcc-patches

On Fri, Mar 17, 2023 at 6:16 AM Philipp Tomsich
<philipp.tomsich@vrull.eu> wrote:
>
> On Fri, 17 Mar 2023 at 09:31, Richard Biener <richard.guenther@gmail.com> wrote:
> >
> > On Thu, Mar 16, 2023 at 4:27 PM Manolis Tsamis <manolis.tsamis@vrull.eu> wrote:
> > >
> > > For this C testcase:
> > >
> > > void g();
> > > void f(unsigned int *a)
> > > {
> > >   if (++*a == 1)
> > >     g();
> > > }
> > >
> > > GCC will currently emit a comparison with 1 by using the value
> > > of *a after the increment. This can be improved by comparing
> > > against 0 and using the value before the increment. As a result
> > > there is a potentially shorter dependancy chain (no need to wait
> > > for the result of +1) and on targets with compare zero instructions
> > > the generated code is one instruction shorter.
> >
> > The downside is we now need two registers and their lifetime overlaps.
> >
> > Your patch mixes changing / inverting a parameter (which seems unneeded
> > for the actual change) with preferring compares against zero.
> >
> > What's the reason to specifically prefer compares against zero?  On x86
> > we have add that sets flags, so ++*a == 0 would be preferred, but
> > for your sequence we'd need a test reg, reg; branch on zero, so we do
> > not save any instruction.
>
> AArch64, RISC-V and MIPS support a branch-on-(not-)equals-zero, while
> comparing against a constant requires to load any non-zero value into
> a register first.

Equality comparisons against 0 are also slightly cheaper than equality
comparisons against 1 on x86, though it's a code-size difference, not
an instruction-count difference.  Not sure this changes the
story--just pointing out that this optimization might be slightly more
generally applicable than it initially seems.

>
> This feels a bit like we need to call onto the backend to check
> whether comparisons against 0 are cheaper.
>
> Obviously, the underlying issue become worse if the immediate can not
> be built up in a single instruction.
> Using RISC-V as an example (primarily, as RISC-V makes it particularly
> easy to run into multi-instruction sequences for constants), we can
> construct the following case:
>
>   void f(unsigned int *a)
>   {
>     if ((*a += 0x900) == 0x900)
>        g();
>   }
>
> which GCC 12.2.0 (trunk may already be small enough to reuse the
> constant once loaded into register, but I did not check…) with -O3
> turns into:
>
> f:
>   lw a4,0(a0)
>   li a5,4096
>   addiw a5,a5,-1792
>   addw a4,a5,a4
>   li a5,4096
>   sw a4,0(a0)
>   addi a5,a5,-1792
>   beq a4,a5,.L4
>   ret
> .L4:
>   tail g
>
> Thanks,
> Philipp.
>

* Re: [PATCH v1] [RFC] Improve folding for comparisons with zero in tree-ssa-forwprop.
  2023-03-17  8:31 ` Richard Biener
  2023-03-17 13:15   ` Philipp Tomsich
  2023-03-17 14:12   ` Andrew MacLeod
@ 2023-03-20 14:01   ` Manolis Tsamis
  2023-03-23 23:27     ` Jeff Law
  2023-04-21 21:01     ` Philipp Tomsich
  2 siblings, 2 replies; 20+ messages in thread
From: Manolis Tsamis @ 2023-03-20 14:01 UTC (permalink / raw)
  To: Richard Biener; +Cc: Andrew MacLeod, gcc-patches, Philipp Tomsich

On Fri, Mar 17, 2023 at 10:31 AM Richard Biener
<richard.guenther@gmail.com> wrote:
>
> On Thu, Mar 16, 2023 at 4:27 PM Manolis Tsamis <manolis.tsamis@vrull.eu> wrote:
> >
> > For this C testcase:
> >
> > void g();
> > void f(unsigned int *a)
> > {
> >   if (++*a == 1)
> >     g();
> > }
> >
> > GCC will currently emit a comparison with 1 by using the value
> > of *a after the increment. This can be improved by comparing
> > against 0 and using the value before the increment. As a result
> > there is a potentially shorter dependancy chain (no need to wait
> > for the result of +1) and on targets with compare zero instructions
> > the generated code is one instruction shorter.
>
> The downside is we now need two registers and their lifetime overlaps.
>
> Your patch mixes changing / inverting a parameter (which seems unneeded
> for the actual change) with preferring compares against zero.
>

Indeed. I thought that without that change the original names wouldn't properly
describe what the parameter actually does and that's why I've changed it.
I can undo that in the next revision.

> What's the reason to specifically prefer compares against zero?  On x86
> we have add that sets flags, so ++*a == 0 would be preferred, but
> for your sequence we'd need a test reg, reg; branch on zero, so we do
> not save any instruction.
>

My reasoning is that zero is treated preferentially on most if not
all architectures. Some specifically have zero/non-zero comparisons, so
we get one less instruction. X86 doesn't explicitly have that, but I
think that test reg, reg may not always be needed depending on the
rest of the code. From what Andrew mentions below, there may even be
optimizations for zero at the microarchitecture level.

Because this is still an arch-specific thing, I initially tried to make
it arch-dependent by invoking the target's cost functions (e.g., if I
recall correctly, AArch64 will return a lower cost for zero
comparisons). But the code turned out complicated and messy, so I came
up with this alternative that just treats zero preferentially.

If you have a better way in mind that this can be done, I
could try to implement it.

> We do have quite some number of bugreports with regards to making VRPs
> life harder when splitting things this way.  It's easier for VRP to handle
>
>   _1 = _2 + 1;
>   if (_1 == 1)
>
> than it is
>
>   _1 = _2 + 1;
>   if (_2 == 0)
>
> where VRP fails to derive a range for _1 on the _2 == 0 branch.  So besides
> the life-range issue there's other side-effects as well.  Maybe ranger meanwhile
> can handle the above case?
>

Answered by Andrew MacLeod.

> What's the overall effect of the change on a larger code base?
>

I made some quick runs of SPEC2017 and got the following results (# of
folds of zero comparisons):

 gcc          2586
 xalancbmk    1456
 perlbench     375
 x264          307
 omnetpp       137
 leela          24
 deepsjeng      15
 exchange2       4
 xz              4

My test runs on AArch64 do not show any significant change in runtime.
In some cases (e.g. gcc) the binary is smaller in size, but that can
depend on a number of other things.

Thanks,
Manolis

> Thanks,
> Richard.
>
> >
> > Example from Aarch64:
> >
> > Before
> >         ldr     w1, [x0]
> >         add     w1, w1, 1
> >         str     w1, [x0]
> >         cmp     w1, 1
> >         beq     .L4
> >         ret
> >
> > After
> >         ldr     w1, [x0]
> >         add     w2, w1, 1
> >         str     w2, [x0]
> >         cbz     w1, .L4
> >         ret
> >
> > gcc/ChangeLog:
> >
> >         * tree-ssa-forwprop.cc (combine_cond_expr_cond):
> >         (forward_propagate_into_comparison_1): Optimize
> >         for zero comparisons.
> >
> > Signed-off-by: Manolis Tsamis <manolis.tsamis@vrull.eu>
> > ---
> >
> >  gcc/tree-ssa-forwprop.cc | 41 +++++++++++++++++++++++++++-------------
> >  1 file changed, 28 insertions(+), 13 deletions(-)
> >
> > diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
> > index e34f0888954..93d5043821b 100644
> > --- a/gcc/tree-ssa-forwprop.cc
> > +++ b/gcc/tree-ssa-forwprop.cc
> > @@ -373,12 +373,13 @@ rhs_to_tree (tree type, gimple *stmt)
> >  /* Combine OP0 CODE OP1 in the context of a COND_EXPR.  Returns
> >     the folded result in a form suitable for COND_EXPR_COND or
> >     NULL_TREE, if there is no suitable simplified form.  If
> > -   INVARIANT_ONLY is true only gimple_min_invariant results are
> > -   considered simplified.  */
> > +   ALWAYS_COMBINE is false then only combine it the resulting
> > +   expression is gimple_min_invariant or considered simplified
> > +   compared to the original.  */
> >
> >  static tree
> >  combine_cond_expr_cond (gimple *stmt, enum tree_code code, tree type,
> > -                       tree op0, tree op1, bool invariant_only)
> > +                       tree op0, tree op1, bool always_combine)
> >  {
> >    tree t;
> >
> > @@ -398,17 +399,31 @@ combine_cond_expr_cond (gimple *stmt, enum tree_code code, tree type,
> >    /* Canonicalize the combined condition for use in a COND_EXPR.  */
> >    t = canonicalize_cond_expr_cond (t);
> >
> > -  /* Bail out if we required an invariant but didn't get one.  */
> > -  if (!t || (invariant_only && !is_gimple_min_invariant (t)))
> > +  if (!t)
> >      {
> >        fold_undefer_overflow_warnings (false, NULL, 0);
> >        return NULL_TREE;
> >      }
> >
> > -  bool nowarn = warning_suppressed_p (stmt, OPT_Wstrict_overflow);
> > -  fold_undefer_overflow_warnings (!nowarn, stmt, 0);
> > +  if (always_combine || is_gimple_min_invariant (t))
> > +    {
> > +      bool nowarn = warning_suppressed_p (stmt, OPT_Wstrict_overflow);
> > +      fold_undefer_overflow_warnings (!nowarn, stmt, 0);
> > +      return t;
> > +    }
> >
> > -  return t;
> > +  /* If the result of folding is a zero comparison treat it preferentially.  */
> > +  if (TREE_CODE_CLASS (TREE_CODE (t)) == tcc_comparison
> > +      && integer_zerop (TREE_OPERAND (t, 1))
> > +      && !integer_zerop (op1))
> > +    {
> > +      bool nowarn = warning_suppressed_p (stmt, OPT_Wstrict_overflow);
> > +      fold_undefer_overflow_warnings (!nowarn, stmt, 0);
> > +      return t;
> > +    }
> > +
> > +  fold_undefer_overflow_warnings (false, NULL, 0);
> > +  return NULL_TREE;
> >  }
> >
> >  /* Combine the comparison OP0 CODE OP1 at LOC with the defining statements
> > @@ -432,7 +447,7 @@ forward_propagate_into_comparison_1 (gimple *stmt,
> >        if (def_stmt && can_propagate_from (def_stmt))
> >         {
> >           enum tree_code def_code = gimple_assign_rhs_code (def_stmt);
> > -         bool invariant_only_p = !single_use0_p;
> > +         bool always_combine = single_use0_p;
> >
> >           rhs0 = rhs_to_tree (TREE_TYPE (op1), def_stmt);
> >
> > @@ -442,10 +457,10 @@ forward_propagate_into_comparison_1 (gimple *stmt,
> >                    && TREE_CODE (TREE_TYPE (TREE_OPERAND (rhs0, 0)))
> >                       == BOOLEAN_TYPE)
> >                   || TREE_CODE_CLASS (def_code) == tcc_comparison))
> > -           invariant_only_p = false;
> > +           always_combine = true;
> >
> >           tmp = combine_cond_expr_cond (stmt, code, type,
> > -                                       rhs0, op1, invariant_only_p);
> > +                                       rhs0, op1, always_combine);
> >           if (tmp)
> >             return tmp;
> >         }
> > @@ -459,7 +474,7 @@ forward_propagate_into_comparison_1 (gimple *stmt,
> >         {
> >           rhs1 = rhs_to_tree (TREE_TYPE (op0), def_stmt);
> >           tmp = combine_cond_expr_cond (stmt, code, type,
> > -                                       op0, rhs1, !single_use1_p);
> > +                                       op0, rhs1, single_use1_p);
> >           if (tmp)
> >             return tmp;
> >         }
> > @@ -470,7 +485,7 @@ forward_propagate_into_comparison_1 (gimple *stmt,
> >        && rhs1 != NULL_TREE)
> >      tmp = combine_cond_expr_cond (stmt, code, type,
> >                                   rhs0, rhs1,
> > -                                 !(single_use0_p && single_use1_p));
> > +                                 single_use0_p && single_use1_p);
> >
> >    return tmp;
> >  }
> > --
> > 2.34.1
> >

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v1] [RFC] Improve folding for comparisons with zero in tree-ssa-forwprop.
  2023-03-20 14:01   ` Manolis Tsamis
@ 2023-03-23 23:27     ` Jeff Law
  2023-04-21 21:01     ` Philipp Tomsich
  1 sibling, 0 replies; 20+ messages in thread
From: Jeff Law @ 2023-03-23 23:27 UTC (permalink / raw)
  To: gcc-patches



On 3/20/23 08:01, Manolis Tsamis wrote:
> On Fri, Mar 17, 2023 at 10:31 AM Richard Biener
> <richard.guenther@gmail.com> wrote:
>>
>> On Thu, Mar 16, 2023 at 4:27 PM Manolis Tsamis <manolis.tsamis@vrull.eu> wrote:
>>>
>>> For this C testcase:
>>>
>>> void g();
>>> void f(unsigned int *a)
>>> {
>>>    if (++*a == 1)
>>>      g();
>>> }
>>>
>>> GCC will currently emit a comparison with 1 by using the value
>>> of *a after the increment. This can be improved by comparing
>>> against 0 and using the value before the increment. As a result
>>> there is a potentially shorter dependancy chain (no need to wait
>>> for the result of +1) and on targets with compare zero instructions
>>> the generated code is one instruction shorter.
>>
>> The downside is we now need two registers and their lifetime overlaps.
>>
>> Your patch mixes changing / inverting a parameter (which seems unneeded
>> for the actual change) with preferring compares against zero.
>>
> 
> Indeed. I thought that without that change the original names wouldn't properly
> describe what the parameter actually does and that's why I've changed it.
> I can undo that in the next revision.
Typically the thing to do is send that separately.  If it has no 
functional change, then it can often go in immediately.


> 
>> What's the reason to specifically prefer compares against zero?  On x86
>> we have add that sets flags, so ++*a == 0 would be preferred, but
>> for your sequence we'd need a test reg, reg; branch on zero, so we do
>> not save any instruction.
>>
> 
> My reasoning is that zero is treated preferentially  in most if not
> all architectures. Some specifically have zero/non-zero comparisons so
> we get one less instruction. X86 doesn't explicitly have that but I
> think that test reg, reg may not be always needed depending on the
> rest of the code. By what Andrew mentions below there may even be
> optimizations for zero in the microarchitecture level.
There are all kinds of low-level ways a test against zero is better than a 
test against any other value.  I'm not aware of any architecture where 
the opposite is true.

Note that in this specific case rewriting does cause us to need two 
registers, so we'll want to think about the right time to make this 
transformation.  It may be the case that doing it in gimple is too early.
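
To make the live-range point concrete, here is a rough GIMPLE-level 
sketch of the testcase before and after the transform (SSA names are 
invented for illustration, not taken from an actual dump):

   current form -- only the incremented value is live at the branch:

     _1 = *a_4(D);
     _2 = _1 + 1;
     *a_4(D) = _2;
     if (_2 == 1)     ;; _1 is already dead here

   transformed form -- both values are live at the branch:

     _1 = *a_4(D);
     _2 = _1 + 1;
     *a_4(D) = _2;
     if (_1 == 0)     ;; _1 must stay live alongside _2

This mirrors the w1 vs. w1/w2 difference in the AArch64 example from 
the original patch.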



> 
> Because this is still an arch-specific thing I initially tried to make
> it arch-depended by invoking the target's const functions (e.g. If I
> recall correctly aarch64 will return a lower cost for zero
> comparisons). But the code turned out complicated and messy so I came
> up with this alternative that just treats zero preferentially.
> 
> If you have in mind a way that this can be done in a better way I
> could try to implement it.
And in general I think you approached this in the preferred way -- it's 
largely a target independent optimization, so let's tackle it in a 
generic way.

Anyway, we'll dive into it once gcc-14 development opens and try to 
figure out the best way forward.

jeff


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v1] [RFC] Improve folding for comparisons with zero in tree-ssa-forwprop.
  2023-03-20 14:01   ` Manolis Tsamis
  2023-03-23 23:27     ` Jeff Law
@ 2023-04-21 21:01     ` Philipp Tomsich
  2023-04-24  8:06       ` Richard Biener
  1 sibling, 1 reply; 20+ messages in thread
From: Philipp Tomsich @ 2023-04-21 21:01 UTC (permalink / raw)
  To: Manolis Tsamis; +Cc: Richard Biener, Andrew MacLeod, gcc-patches

Any guidance on the next steps for this patch?
I believe that we answered all open questions, but may have missed something.

With trunk open for new development, we would like to revise and land this…

Thanks,
Philipp.

On Mon, 20 Mar 2023 at 15:02, Manolis Tsamis <manolis.tsamis@vrull.eu> wrote:
>
> On Fri, Mar 17, 2023 at 10:31 AM Richard Biener
> <richard.guenther@gmail.com> wrote:
> >
> > On Thu, Mar 16, 2023 at 4:27 PM Manolis Tsamis <manolis.tsamis@vrull.eu> wrote:
> > >
> > > For this C testcase:
> > >
> > > void g();
> > > void f(unsigned int *a)
> > > {
> > >   if (++*a == 1)
> > >     g();
> > > }
> > >
> > > GCC will currently emit a comparison with 1 by using the value
> > > of *a after the increment. This can be improved by comparing
> > > against 0 and using the value before the increment. As a result
> > > there is a potentially shorter dependancy chain (no need to wait
> > > for the result of +1) and on targets with compare zero instructions
> > > the generated code is one instruction shorter.
> >
> > The downside is we now need two registers and their lifetime overlaps.
> >
> > Your patch mixes changing / inverting a parameter (which seems unneeded
> > for the actual change) with preferring compares against zero.
> >
>
> Indeed. I thought that without that change the original names wouldn't properly
> describe what the parameter actually does and that's why I've changed it.
> I can undo that in the next revision.
>
> > What's the reason to specifically prefer compares against zero?  On x86
> > we have add that sets flags, so ++*a == 0 would be preferred, but
> > for your sequence we'd need a test reg, reg; branch on zero, so we do
> > not save any instruction.
> >
>
> My reasoning is that zero is treated preferentially  in most if not
> all architectures. Some specifically have zero/non-zero comparisons so
> we get one less instruction. X86 doesn't explicitly have that but I
> think that test reg, reg may not be always needed depending on the
> rest of the code. By what Andrew mentions below there may even be
> optimizations for zero in the microarchitecture level.
>
> Because this is still an arch-specific thing I initially tried to make
> it arch-depended by invoking the target's const functions (e.g. If I
> recall correctly aarch64 will return a lower cost for zero
> comparisons). But the code turned out complicated and messy so I came
> up with this alternative that just treats zero preferentially.
>
> If you have in mind a way that this can be done in a better way I
> could try to implement it.
>
> > We do have quite some number of bugreports with regards to making VRPs
> > life harder when splitting things this way.  It's easier for VRP to handle
> >
> >   _1 = _2 + 1;
> >   if (_1 == 1)
> >
> > than it is
> >
> >   _1 = _2 + 1;
> >   if (_2 == 0)
> >
> > where VRP fails to derive a range for _1 on the _2 == 0 branch.  So besides
> > the life-range issue there's other side-effects as well.  Maybe ranger meanwhile
> > can handle the above case?
> >
>
> Answered by Andrew MacLeod.
>
> > What's the overall effect of the change on a larger code base?
> >
>
> I made some quick runs of SPEC2017 and got the following results (# of
> folds of zero comparisons):
>
>  gcc             2586
>  xalancbmk 1456
>  perlbench   375
>  x264           307
>  omnetpp     137
>  leela           24
>  deepsjeng  15
>  exchange2  4
>  xz                4
>
> My test runs on Aarch64 do not show any significant change in runtime.
> In some cases (e.g. gcc) the binary is smaller in size, but that can
> depend on a number of other things.
>
> Thanks,
> Manolis
>
> > Thanks,
> > Richard.
> >
> > >
> > > Example from Aarch64:
> > >
> > > Before
> > >         ldr     w1, [x0]
> > >         add     w1, w1, 1
> > >         str     w1, [x0]
> > >         cmp     w1, 1
> > >         beq     .L4
> > >         ret
> > >
> > > After
> > >         ldr     w1, [x0]
> > >         add     w2, w1, 1
> > >         str     w2, [x0]
> > >         cbz     w1, .L4
> > >         ret
> > >
> > > gcc/ChangeLog:
> > >
> > >         * tree-ssa-forwprop.cc (combine_cond_expr_cond):
> > >         (forward_propagate_into_comparison_1): Optimize
> > >         for zero comparisons.
> > >
> > > Signed-off-by: Manolis Tsamis <manolis.tsamis@vrull.eu>
> > > ---
> > >
> > >  gcc/tree-ssa-forwprop.cc | 41 +++++++++++++++++++++++++++-------------
> > >  1 file changed, 28 insertions(+), 13 deletions(-)
> > >
> > > diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
> > > index e34f0888954..93d5043821b 100644
> > > --- a/gcc/tree-ssa-forwprop.cc
> > > +++ b/gcc/tree-ssa-forwprop.cc
> > > @@ -373,12 +373,13 @@ rhs_to_tree (tree type, gimple *stmt)
> > >  /* Combine OP0 CODE OP1 in the context of a COND_EXPR.  Returns
> > >     the folded result in a form suitable for COND_EXPR_COND or
> > >     NULL_TREE, if there is no suitable simplified form.  If
> > > -   INVARIANT_ONLY is true only gimple_min_invariant results are
> > > -   considered simplified.  */
> > > +   ALWAYS_COMBINE is false then only combine it the resulting
> > > +   expression is gimple_min_invariant or considered simplified
> > > +   compared to the original.  */
> > >
> > >  static tree
> > >  combine_cond_expr_cond (gimple *stmt, enum tree_code code, tree type,
> > > -                       tree op0, tree op1, bool invariant_only)
> > > +                       tree op0, tree op1, bool always_combine)
> > >  {
> > >    tree t;
> > >
> > > @@ -398,17 +399,31 @@ combine_cond_expr_cond (gimple *stmt, enum tree_code code, tree type,
> > >    /* Canonicalize the combined condition for use in a COND_EXPR.  */
> > >    t = canonicalize_cond_expr_cond (t);
> > >
> > > -  /* Bail out if we required an invariant but didn't get one.  */
> > > -  if (!t || (invariant_only && !is_gimple_min_invariant (t)))
> > > +  if (!t)
> > >      {
> > >        fold_undefer_overflow_warnings (false, NULL, 0);
> > >        return NULL_TREE;
> > >      }
> > >
> > > -  bool nowarn = warning_suppressed_p (stmt, OPT_Wstrict_overflow);
> > > -  fold_undefer_overflow_warnings (!nowarn, stmt, 0);
> > > +  if (always_combine || is_gimple_min_invariant (t))
> > > +    {
> > > +      bool nowarn = warning_suppressed_p (stmt, OPT_Wstrict_overflow);
> > > +      fold_undefer_overflow_warnings (!nowarn, stmt, 0);
> > > +      return t;
> > > +    }
> > >
> > > -  return t;
> > > +  /* If the result of folding is a zero comparison treat it preferentially.  */
> > > +  if (TREE_CODE_CLASS (TREE_CODE (t)) == tcc_comparison
> > > +      && integer_zerop (TREE_OPERAND (t, 1))
> > > +      && !integer_zerop (op1))
> > > +    {
> > > +      bool nowarn = warning_suppressed_p (stmt, OPT_Wstrict_overflow);
> > > +      fold_undefer_overflow_warnings (!nowarn, stmt, 0);
> > > +      return t;
> > > +    }
> > > +
> > > +  fold_undefer_overflow_warnings (false, NULL, 0);
> > > +  return NULL_TREE;
> > >  }
> > >
> > >  /* Combine the comparison OP0 CODE OP1 at LOC with the defining statements
> > > @@ -432,7 +447,7 @@ forward_propagate_into_comparison_1 (gimple *stmt,
> > >        if (def_stmt && can_propagate_from (def_stmt))
> > >         {
> > >           enum tree_code def_code = gimple_assign_rhs_code (def_stmt);
> > > -         bool invariant_only_p = !single_use0_p;
> > > +         bool always_combine = single_use0_p;
> > >
> > >           rhs0 = rhs_to_tree (TREE_TYPE (op1), def_stmt);
> > >
> > > @@ -442,10 +457,10 @@ forward_propagate_into_comparison_1 (gimple *stmt,
> > >                    && TREE_CODE (TREE_TYPE (TREE_OPERAND (rhs0, 0)))
> > >                       == BOOLEAN_TYPE)
> > >                   || TREE_CODE_CLASS (def_code) == tcc_comparison))
> > > -           invariant_only_p = false;
> > > +           always_combine = true;
> > >
> > >           tmp = combine_cond_expr_cond (stmt, code, type,
> > > -                                       rhs0, op1, invariant_only_p);
> > > +                                       rhs0, op1, always_combine);
> > >           if (tmp)
> > >             return tmp;
> > >         }
> > > @@ -459,7 +474,7 @@ forward_propagate_into_comparison_1 (gimple *stmt,
> > >         {
> > >           rhs1 = rhs_to_tree (TREE_TYPE (op0), def_stmt);
> > >           tmp = combine_cond_expr_cond (stmt, code, type,
> > > -                                       op0, rhs1, !single_use1_p);
> > > +                                       op0, rhs1, single_use1_p);
> > >           if (tmp)
> > >             return tmp;
> > >         }
> > > @@ -470,7 +485,7 @@ forward_propagate_into_comparison_1 (gimple *stmt,
> > >        && rhs1 != NULL_TREE)
> > >      tmp = combine_cond_expr_cond (stmt, code, type,
> > >                                   rhs0, rhs1,
> > > -                                 !(single_use0_p && single_use1_p));
> > > +                                 single_use0_p && single_use1_p);
> > >
> > >    return tmp;
> > >  }
> > > --
> > > 2.34.1
> > >

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v1] [RFC] Improve folding for comparisons with zero in tree-ssa-forwprop.
  2023-04-21 21:01     ` Philipp Tomsich
@ 2023-04-24  8:06       ` Richard Biener
  2023-04-24 23:05         ` Jeff Law
  0 siblings, 1 reply; 20+ messages in thread
From: Richard Biener @ 2023-04-24  8:06 UTC (permalink / raw)
  To: Philipp Tomsich; +Cc: Manolis Tsamis, Andrew MacLeod, gcc-patches

On Fri, Apr 21, 2023 at 11:01 PM Philipp Tomsich
<philipp.tomsich@vrull.eu> wrote:
>
> Any guidance on the next steps for this patch?

I think we want to perform this transform later; in particular, when
the test is a loop exit test we do not want to do it, as it prevents
coalescing of the IV on the backedge at out-of-SSA time.

That means doing the transform in folding and/or before inlining (the
test could become a loop exit test) would be a no-go.  In fact, for
SSA coalescing we'd want the reverse transform in some cases; see PRs
86270 and 70359.

If we can reliably undo it for the loop case, I suppose we can do the
canonicalization to compare against zero.  In any case, please split
up the patch (note I've also hoped we could eventually get rid of that
part of tree-ssa-forwprop.cc in favor of match.pd patterns since it
uses GENERIC folding :/).
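
To illustrate the coalescing concern with a rough sketch (SSA names
invented; roughly the situation behind PRs 86270 and 70359): with the
exit test on the incremented value the IV names can be coalesced, with
the zero-style test they cannot:

  # i_1 = PHI <i_0(preheader), i_2(latch)>
  i_2 = i_1 + 1;
  if (i_2 == 1)        ;; i_1 dies at the add, so i_1 and i_2 can be
    goto exit;         ;; coalesced and the backedge needs no copy

  # i_1 = PHI <i_0(preheader), i_2(latch)>
  i_2 = i_1 + 1;
  if (i_1 == 0)        ;; i_1 stays live past the definition of i_2,
    goto exit;         ;; the two interfere and out-of-SSA has to
                       ;; insert a copy on the backedge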

Richard.

> I believe that we answered all open questions, but may have missed something.
>
> With trunk open for new development, we would like to revise and land this…
>
> Thanks,
> Philipp.
>
> On Mon, 20 Mar 2023 at 15:02, Manolis Tsamis <manolis.tsamis@vrull.eu> wrote:
> >
> > On Fri, Mar 17, 2023 at 10:31 AM Richard Biener
> > <richard.guenther@gmail.com> wrote:
> > >
> > > On Thu, Mar 16, 2023 at 4:27 PM Manolis Tsamis <manolis.tsamis@vrull.eu> wrote:
> > > >
> > > > For this C testcase:
> > > >
> > > > void g();
> > > > void f(unsigned int *a)
> > > > {
> > > >   if (++*a == 1)
> > > >     g();
> > > > }
> > > >
> > > > GCC will currently emit a comparison with 1 by using the value
> > > > of *a after the increment. This can be improved by comparing
> > > > against 0 and using the value before the increment. As a result
> > > > there is a potentially shorter dependancy chain (no need to wait
> > > > for the result of +1) and on targets with compare zero instructions
> > > > the generated code is one instruction shorter.
> > >
> > > The downside is we now need two registers and their lifetime overlaps.
> > >
> > > Your patch mixes changing / inverting a parameter (which seems unneeded
> > > for the actual change) with preferring compares against zero.
> > >
> >
> > Indeed. I thought that without that change the original names wouldn't properly
> > describe what the parameter actually does and that's why I've changed it.
> > I can undo that in the next revision.
> >
> > > What's the reason to specifically prefer compares against zero?  On x86
> > > we have add that sets flags, so ++*a == 0 would be preferred, but
> > > for your sequence we'd need a test reg, reg; branch on zero, so we do
> > > not save any instruction.
> > >
> >
> > My reasoning is that zero is treated preferentially  in most if not
> > all architectures. Some specifically have zero/non-zero comparisons so
> > we get one less instruction. X86 doesn't explicitly have that but I
> > think that test reg, reg may not be always needed depending on the
> > rest of the code. By what Andrew mentions below there may even be
> > optimizations for zero in the microarchitecture level.
> >
> > Because this is still an arch-specific thing I initially tried to make
> > it arch-depended by invoking the target's const functions (e.g. If I
> > recall correctly aarch64 will return a lower cost for zero
> > comparisons). But the code turned out complicated and messy so I came
> > up with this alternative that just treats zero preferentially.
> >
> > If you have in mind a way that this can be done in a better way I
> > could try to implement it.
> >
> > > We do have quite some number of bugreports with regards to making VRPs
> > > life harder when splitting things this way.  It's easier for VRP to handle
> > >
> > >   _1 = _2 + 1;
> > >   if (_1 == 1)
> > >
> > > than it is
> > >
> > >   _1 = _2 + 1;
> > >   if (_2 == 0)
> > >
> > > where VRP fails to derive a range for _1 on the _2 == 0 branch.  So besides
> > > the life-range issue there's other side-effects as well.  Maybe ranger meanwhile
> > > can handle the above case?
> > >
> >
> > Answered by Andrew MacLeod.
> >
> > > What's the overall effect of the change on a larger code base?
> > >
> >
> > I made some quick runs of SPEC2017 and got the following results (# of
> > folds of zero comparisons):
> >
> >  gcc             2586
> >  xalancbmk 1456
> >  perlbench   375
> >  x264           307
> >  omnetpp     137
> >  leela           24
> >  deepsjeng  15
> >  exchange2  4
> >  xz                4
> >
> > My test runs on Aarch64 do not show any significant change in runtime.
> > In some cases (e.g. gcc) the binary is smaller in size, but that can
> > depend on a number of other things.
> >
> > Thanks,
> > Manolis
> >
> > > Thanks,
> > > Richard.
> > >
> > > >
> > > > Example from Aarch64:
> > > >
> > > > Before
> > > >         ldr     w1, [x0]
> > > >         add     w1, w1, 1
> > > >         str     w1, [x0]
> > > >         cmp     w1, 1
> > > >         beq     .L4
> > > >         ret
> > > >
> > > > After
> > > >         ldr     w1, [x0]
> > > >         add     w2, w1, 1
> > > >         str     w2, [x0]
> > > >         cbz     w1, .L4
> > > >         ret
> > > >
> > > > gcc/ChangeLog:
> > > >
> > > >         * tree-ssa-forwprop.cc (combine_cond_expr_cond):
> > > >         (forward_propagate_into_comparison_1): Optimize
> > > >         for zero comparisons.
> > > >
> > > > Signed-off-by: Manolis Tsamis <manolis.tsamis@vrull.eu>
> > > > ---
> > > >
> > > >  gcc/tree-ssa-forwprop.cc | 41 +++++++++++++++++++++++++++-------------
> > > >  1 file changed, 28 insertions(+), 13 deletions(-)
> > > >
> > > > diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
> > > > index e34f0888954..93d5043821b 100644
> > > > --- a/gcc/tree-ssa-forwprop.cc
> > > > +++ b/gcc/tree-ssa-forwprop.cc
> > > > @@ -373,12 +373,13 @@ rhs_to_tree (tree type, gimple *stmt)
> > > >  /* Combine OP0 CODE OP1 in the context of a COND_EXPR.  Returns
> > > >     the folded result in a form suitable for COND_EXPR_COND or
> > > >     NULL_TREE, if there is no suitable simplified form.  If
> > > > -   INVARIANT_ONLY is true only gimple_min_invariant results are
> > > > -   considered simplified.  */
> > > > +   ALWAYS_COMBINE is false then only combine it the resulting
> > > > +   expression is gimple_min_invariant or considered simplified
> > > > +   compared to the original.  */
> > > >
> > > >  static tree
> > > >  combine_cond_expr_cond (gimple *stmt, enum tree_code code, tree type,
> > > > -                       tree op0, tree op1, bool invariant_only)
> > > > +                       tree op0, tree op1, bool always_combine)
> > > >  {
> > > >    tree t;
> > > >
> > > > @@ -398,17 +399,31 @@ combine_cond_expr_cond (gimple *stmt, enum tree_code code, tree type,
> > > >    /* Canonicalize the combined condition for use in a COND_EXPR.  */
> > > >    t = canonicalize_cond_expr_cond (t);
> > > >
> > > > -  /* Bail out if we required an invariant but didn't get one.  */
> > > > -  if (!t || (invariant_only && !is_gimple_min_invariant (t)))
> > > > +  if (!t)
> > > >      {
> > > >        fold_undefer_overflow_warnings (false, NULL, 0);
> > > >        return NULL_TREE;
> > > >      }
> > > >
> > > > -  bool nowarn = warning_suppressed_p (stmt, OPT_Wstrict_overflow);
> > > > -  fold_undefer_overflow_warnings (!nowarn, stmt, 0);
> > > > +  if (always_combine || is_gimple_min_invariant (t))
> > > > +    {
> > > > +      bool nowarn = warning_suppressed_p (stmt, OPT_Wstrict_overflow);
> > > > +      fold_undefer_overflow_warnings (!nowarn, stmt, 0);
> > > > +      return t;
> > > > +    }
> > > >
> > > > -  return t;
> > > > +  /* If the result of folding is a zero comparison treat it preferentially.  */
> > > > +  if (TREE_CODE_CLASS (TREE_CODE (t)) == tcc_comparison
> > > > +      && integer_zerop (TREE_OPERAND (t, 1))
> > > > +      && !integer_zerop (op1))
> > > > +    {
> > > > +      bool nowarn = warning_suppressed_p (stmt, OPT_Wstrict_overflow);
> > > > +      fold_undefer_overflow_warnings (!nowarn, stmt, 0);
> > > > +      return t;
> > > > +    }
> > > > +
> > > > +  fold_undefer_overflow_warnings (false, NULL, 0);
> > > > +  return NULL_TREE;
> > > >  }
> > > >
> > > >  /* Combine the comparison OP0 CODE OP1 at LOC with the defining statements
> > > > @@ -432,7 +447,7 @@ forward_propagate_into_comparison_1 (gimple *stmt,
> > > >        if (def_stmt && can_propagate_from (def_stmt))
> > > >         {
> > > >           enum tree_code def_code = gimple_assign_rhs_code (def_stmt);
> > > > -         bool invariant_only_p = !single_use0_p;
> > > > +         bool always_combine = single_use0_p;
> > > >
> > > >           rhs0 = rhs_to_tree (TREE_TYPE (op1), def_stmt);
> > > >
> > > > @@ -442,10 +457,10 @@ forward_propagate_into_comparison_1 (gimple *stmt,
> > > >                    && TREE_CODE (TREE_TYPE (TREE_OPERAND (rhs0, 0)))
> > > >                       == BOOLEAN_TYPE)
> > > >                   || TREE_CODE_CLASS (def_code) == tcc_comparison))
> > > > -           invariant_only_p = false;
> > > > +           always_combine = true;
> > > >
> > > >           tmp = combine_cond_expr_cond (stmt, code, type,
> > > > -                                       rhs0, op1, invariant_only_p);
> > > > +                                       rhs0, op1, always_combine);
> > > >           if (tmp)
> > > >             return tmp;
> > > >         }
> > > > @@ -459,7 +474,7 @@ forward_propagate_into_comparison_1 (gimple *stmt,
> > > >         {
> > > >           rhs1 = rhs_to_tree (TREE_TYPE (op0), def_stmt);
> > > >           tmp = combine_cond_expr_cond (stmt, code, type,
> > > > -                                       op0, rhs1, !single_use1_p);
> > > > +                                       op0, rhs1, single_use1_p);
> > > >           if (tmp)
> > > >             return tmp;
> > > >         }
> > > > @@ -470,7 +485,7 @@ forward_propagate_into_comparison_1 (gimple *stmt,
> > > >        && rhs1 != NULL_TREE)
> > > >      tmp = combine_cond_expr_cond (stmt, code, type,
> > > >                                   rhs0, rhs1,
> > > > -                                 !(single_use0_p && single_use1_p));
> > > > +                                 single_use0_p && single_use1_p);
> > > >
> > > >    return tmp;
> > > >  }
> > > > --
> > > > 2.34.1
> > > >

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v1] [RFC] Improve folding for comparisons with zero in tree-ssa-forwprop.
  2023-04-24  8:06       ` Richard Biener
@ 2023-04-24 23:05         ` Jeff Law
  2023-04-25  7:21           ` Richard Biener
  0 siblings, 1 reply; 20+ messages in thread
From: Jeff Law @ 2023-04-24 23:05 UTC (permalink / raw)
  To: Richard Biener, Philipp Tomsich
  Cc: Manolis Tsamis, Andrew MacLeod, gcc-patches




On 4/24/23 02:06, Richard Biener via Gcc-patches wrote:
> On Fri, Apr 21, 2023 at 11:01 PM Philipp Tomsich
> <philipp.tomsich@vrull.eu> wrote:
>>
>> Any guidance on the next steps for this patch?
> 
> I think we want to perform this transform later, in particular when 
> the test is a loop exit test we do not want to do it as it prevents
> coalescing of the IV on the backedge at out-of-SSA time.
> 
> That means doing the transform in folding and/or before inlining
> (the test could become a loop exit test) would be a no-go.  In fact
> for SSA coalescing we'd want the reverse transform in some cases, see
> PRs 86270 and 70359.
> 
> If we can reliably undo for the loop case I suppose we can do the 
> canonicalization to compare against zero.  In any case please split
> up the patch (note I've also
> hoped we could eventually get rid of that part of
> tree-ssa-forwprop.cc in favor
> of match.pd patterns since it uses GENERIC folding :/).
> 
Do we have enough information to do this at expansion time?  That would 
avoid introducing the target dependencies to drive this in gimple.

Jeff

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v1] [RFC] Improve folding for comparisons with zero in tree-ssa-forwprop.
  2023-04-24 23:05         ` Jeff Law
@ 2023-04-25  7:21           ` Richard Biener
  2023-04-26  2:30             ` Jeff Law
  0 siblings, 1 reply; 20+ messages in thread
From: Richard Biener @ 2023-04-25  7:21 UTC (permalink / raw)
  To: Jeff Law; +Cc: Philipp Tomsich, Manolis Tsamis, Andrew MacLeod, gcc-patches

On Tue, Apr 25, 2023 at 1:05 AM Jeff Law <jeffreyalaw@gmail.com> wrote
>
>
>
>
> On 4/24/23 02:06, Richard Biener via Gcc-patches wrote:
> > On Fri, Apr 21, 2023 at 11:01 PM Philipp Tomsich
> > <philipp.tomsich@vrull.eu> wrote:
> >>
> >> Any guidance on the next steps for this patch?
> >
> > I think we want to perform this transform later, in particular when
> > the test is a loop exit test we do not want to do it as it prevents
> > coalescing of the IV on the backedge at out-of-SSA time.
> >
> > That means doing the transform in folding and/or before inlining
> > (the test could become a loop exit test) would be a no-go.  In fact
> > for SSA coalescing we'd want the reverse transform in some cases, see
> > PRs 86270 and 70359.
> >
> > If we can reliably undo for the loop case I suppose we can do the
> > canonicalization to compare against zero.  In any case please split
> > up the patch (note I've also
> > hoped we could eventually get rid of that part of
> > tree-ssa-forwprop.cc in favor
> > of match.pd patterns since it uses GENERIC folding :/).
> >
> Do we have enough information to do this at expansion time?  That would
> avoid introducing the target dependencies to drive this in gimple.

I think so, but there isn't any convenient place to do this I think.  I suppose
there's no hope to catch it during RTL opts?

> Jeff

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v1] [RFC] Improve folding for comparisons with zero in tree-ssa-forwprop.
  2023-04-25  7:21           ` Richard Biener
@ 2023-04-26  2:30             ` Jeff Law
  2023-04-26  6:41               ` Richard Biener
  0 siblings, 1 reply; 20+ messages in thread
From: Jeff Law @ 2023-04-26  2:30 UTC (permalink / raw)
  To: Richard Biener
  Cc: Philipp Tomsich, Manolis Tsamis, Andrew MacLeod, gcc-patches



On 4/25/23 01:21, Richard Biener wrote:
> On Tue, Apr 25, 2023 at 1:05 AM Jeff Law <jeffreyalaw@gmail.com> wrote
>>
>>
>>
>>
>> On 4/24/23 02:06, Richard Biener via Gcc-patches wrote:
>>> On Fri, Apr 21, 2023 at 11:01 PM Philipp Tomsich
>>> <philipp.tomsich@vrull.eu> wrote:
>>>>
>>>> Any guidance on the next steps for this patch?
>>>
>>> I think we want to perform this transform later, in particular when
>>> the test is a loop exit test we do not want to do it as it prevents
>>> coalescing of the IV on the backedge at out-of-SSA time.
>>>
>>> That means doing the transform in folding and/or before inlining
>>> (the test could become a loop exit test) would be a no-go.  In fact
>>> for SSA coalescing we'd want the reverse transform in some cases, see
>>> PRs 86270 and 70359.
>>>
>>> If we can reliably undo for the loop case I suppose we can do the
>>> canonicalization to compare against zero.  In any case please split
>>> up the patch (note I've also
>>> hoped we could eventually get rid of that part of
>>> tree-ssa-forwprop.cc in favor
>>> of match.pd patterns since it uses GENERIC folding :/).
>>>
>> Do we have enough information to do this at expansion time?  That would
>> avoid introducing the target dependencies to drive this in gimple.
> 
> I think so, but there isn't any convenient place to do this I think.  I suppose
> there's no hope to catch it during RTL opts?
Combine would be the most natural place in the RTL pipeline, but it'd be 
a 2->2 combination which would be rejected.

We could possibly do it as a define_insn_and_split, but the gimple->RTL 
interface seems like a better fit to me.  If TER has done its job, we 
should see a complex enough tree node to do the right thing.

jeff

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v1] [RFC] Improve folding for comparisons with zero in tree-ssa-forwprop.
  2023-04-26  2:30             ` Jeff Law
@ 2023-04-26  6:41               ` Richard Biener
  2023-08-02 14:07                 ` Manolis Tsamis
  0 siblings, 1 reply; 20+ messages in thread
From: Richard Biener @ 2023-04-26  6:41 UTC (permalink / raw)
  To: Jeff Law; +Cc: Philipp Tomsich, Manolis Tsamis, Andrew MacLeod, gcc-patches

On Wed, Apr 26, 2023 at 4:30 AM Jeff Law <jeffreyalaw@gmail.com> wrote:
>
>
>
> On 4/25/23 01:21, Richard Biener wrote:
> > On Tue, Apr 25, 2023 at 1:05 AM Jeff Law <jeffreyalaw@gmail.com> wrote
> >>
> >>
> >>
> >>
> >> On 4/24/23 02:06, Richard Biener via Gcc-patches wrote:
> >>> On Fri, Apr 21, 2023 at 11:01 PM Philipp Tomsich
> >>> <philipp.tomsich@vrull.eu> wrote:
> >>>>
> >>>> Any guidance on the next steps for this patch?
> >>>
> >>> I think we want to perform this transform later, in particular when
> >>> the test is a loop exit test we do not want to do it as it prevents
> >>> coalescing of the IV on the backedge at out-of-SSA time.
> >>>
> >>> That means doing the transform in folding and/or before inlining
> >>> (the test could become a loop exit test) would be a no-go.  In fact
> >>> for SSA coalescing we'd want the reverse transform in some cases, see
> >>> PRs 86270 and 70359.
> >>>
> >>> If we can reliably undo for the loop case I suppose we can do the
> >>> canonicalization to compare against zero.  In any case please split
> >>> up the patch (note I've also
> >>> hoped we could eventually get rid of that part of
> >>> tree-ssa-forwprop.cc in favor
> >>> of match.pd patterns since it uses GENERIC folding :/).
> >>>
> >> Do we have enough information to do this at expansion time?  That would
> >> avoid introducing the target dependencies to drive this in gimple.
> >
> > I think so, but there isn't any convenient place to do this I think.  I suppose
> > there's no hope to catch it during RTL opts?
> Combine would be the most natural place in the RTL pipeline, but it'd be
> a 2->2 combination which would be rejected.
>
> We could possibly do it as a define_insn_and_split, but the gimple->RTL
> interface seems like a better fit to me.  If TER has done its job, we
> should see a complex enough tree node to do the right thing.

Of course we'd want to get rid of TER in favor of ISEL

Richard.

> jeff

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v1] [RFC] Improve folding for comparisons with zero in tree-ssa-forwprop.
  2023-04-26  6:41               ` Richard Biener
@ 2023-08-02 14:07                 ` Manolis Tsamis
  2023-08-03  7:04                   ` Richard Biener
  0 siblings, 1 reply; 20+ messages in thread
From: Manolis Tsamis @ 2023-08-02 14:07 UTC (permalink / raw)
  To: Richard Biener; +Cc: Jeff Law, Philipp Tomsich, Andrew MacLeod, gcc-patches

Hi all,

I'm pinging to discuss again whether we want to move this forward for GCC 14.

I did some testing again and I haven't been able to find obvious
regressions, including testing the code from PR86270 and PR70359 that
Richard mentioned.
I still believe that zero can be considered a special case even for
hardware that doesn't directly benefit from the comparison itself.
For example, it happens that the testcase from the commit compiles to
one fewer instruction on x86:

.LFB0:
    movl    (%rdi), %eax
    leal    1(%rax), %edx
    movl    %edx, (%rdi)
    testl    %eax, %eax
    je    .L4
    ret
.L4:
    jmp    g

vs

.LFB0:
    movl    (%rdi), %eax
    addl    $1, %eax
    movl    %eax, (%rdi)
    cmpl    $1, %eax
    je    .L4
    ret
.L4:
    xorl    %eax, %eax
    jmp    g

(The xorl is not emitted when testl is used. LLVM uses testl but also
does xor eax, eax :) )
Although this is accidental, I believe it also showcases that zero is
a preferential value in various ways.

I'm running benchmarks comparing the effects of this change and I'm
also still looking for testcases that result in problematic
regressions.
Any feedback or other concerns about this are appreciated!

Thanks,
Manolis

On Wed, Apr 26, 2023 at 9:43 AM Richard Biener
<richard.guenther@gmail.com> wrote:
>
> On Wed, Apr 26, 2023 at 4:30 AM Jeff Law <jeffreyalaw@gmail.com> wrote:
> >
> >
> >
> > On 4/25/23 01:21, Richard Biener wrote:
> > > On Tue, Apr 25, 2023 at 1:05 AM Jeff Law <jeffreyalaw@gmail.com> wrote
> > >>
> > >>
> > >>
> > >>
> > >> On 4/24/23 02:06, Richard Biener via Gcc-patches wrote:
> > >>> On Fri, Apr 21, 2023 at 11:01 PM Philipp Tomsich
> > >>> <philipp.tomsich@vrull.eu> wrote:
> > >>>>
> > >>>> Any guidance on the next steps for this patch?
> > >>>
> > >>> I think we want to perform this transform later, in particular when
> > >>> the test is a loop exit test we do not want to do it as it prevents
> > >>> coalescing of the IV on the backedge at out-of-SSA time.
> > >>>
> > >>> That means doing the transform in folding and/or before inlining
> > >>> (the test could become a loop exit test) would be a no-go.  In fact
> > >>> for SSA coalescing we'd want the reverse transform in some cases, see
> > >>> PRs 86270 and 70359.
> > >>>
> > >>> If we can reliably undo for the loop case I suppose we can do the
> > >>> canonicalization to compare against zero.  In any case please split
> > >>> up the patch (note I've also
> > >>> hoped we could eventually get rid of that part of
> > >>> tree-ssa-forwprop.cc in favor
> > >>> of match.pd patterns since it uses GENERIC folding :/).
> > >>>
> > >> Do we have enough information to do this at expansion time?  That would
> > >> avoid introducing the target dependencies to drive this in gimple.
> > >
> > > I think so, but there isn't any convenient place to do this I think.  I suppose
> > > there's no hope to catch it during RTL opts?
> > Combine would be the most natural place in the RTL pipeline, but it'd be
> > a 2->2 combination which would be rejected.
> >
> > We could possibly do it as a define_insn_and_split, but the gimple->RTL
> > interface seems like a better fit to me.  If TER has done its job, we
> > should see a complex enough tree node to do the right thing.
>
> Of course we'd want to get rid of TER in favor of ISEL
>
> Richard.
>
> > jeff

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v1] [RFC] Improve folding for comparisons with zero in tree-ssa-forwprop.
  2023-08-02 14:07                 ` Manolis Tsamis
@ 2023-08-03  7:04                   ` Richard Biener
  2023-08-03 15:21                     ` Jeff Law
  0 siblings, 1 reply; 20+ messages in thread
From: Richard Biener @ 2023-08-03  7:04 UTC (permalink / raw)
  To: Manolis Tsamis; +Cc: Jeff Law, Philipp Tomsich, Andrew MacLeod, gcc-patches

On Wed, Aug 2, 2023 at 4:08 PM Manolis Tsamis <manolis.tsamis@vrull.eu> wrote:
>
> Hi all,
>
> I'm pinging to discuss again if we want to move this forward for GCC14.
>
> I did some testing again and I haven't been able to find obvious
> regressions, including testing the code from PR86270 and PR70359 that
> Richard mentioned.
> I still believe that zero can be considered a special case even for
> hardware that doesn't directly benefit in the comparison.
> For example it happens that the testcase from the commit compiles to
> one instruction less in x86:
>
> .LFB0:
>     movl    (%rdi), %eax
>     leal    1(%rax), %edx
>     movl    %edx, (%rdi)
>     testl    %eax, %eax
>     je    .L4
>     ret
> .L4:
>     jmp    g
>
> vs
>
> .LFB0:
>     movl    (%rdi), %eax
>     addl    $1, %eax
>     movl    %eax, (%rdi)
>     cmpl    $1, %eax
>     je    .L4
>     ret
> .L4:
>     xorl    %eax, %eax
>     jmp    g
>
> (The xorl is not emitted  when testl is used. LLVM uses testl but also
> does xor eax, eax :) )
> Although this is accidental, I believe it also showcases that zero is
> a preferential value in various ways.
>
> I'm running benchmarks comparing the effects of this change and I'm
> also still looking for testcases that result in problematic
> regressions.
> Any feedback or other concerns about this are appreciated!

My comment from Apr 24th still holds: IMO this is something for
instruction selection (aka the ISEL pass) or the out-of-SSA tweaks
we do during RTL expansion (see insert_backedge_copies).

Richard.

> Thanks,
> Manolis
>
> On Wed, Apr 26, 2023 at 9:43 AM Richard Biener
> <richard.guenther@gmail.com> wrote:
> >
> > On Wed, Apr 26, 2023 at 4:30 AM Jeff Law <jeffreyalaw@gmail.com> wrote:
> > >
> > >
> > >
> > > On 4/25/23 01:21, Richard Biener wrote:
> > > > On Tue, Apr 25, 2023 at 1:05 AM Jeff Law <jeffreyalaw@gmail.com> wrote
> > > >>
> > > >>
> > > >>
> > > >>
> > > >> On 4/24/23 02:06, Richard Biener via Gcc-patches wrote:
> > > >>> On Fri, Apr 21, 2023 at 11:01 PM Philipp Tomsich
> > > >>> <philipp.tomsich@vrull.eu> wrote:
> > > >>>>
> > > >>>> Any guidance on the next steps for this patch?
> > > >>>
> > > >>> I think we want to perform this transform later, in particular when
> > > >>> the test is a loop exit test we do not want to do it as it prevents
> > > >>> coalescing of the IV on the backedge at out-of-SSA time.
> > > >>>
> > > >>> That means doing the transform in folding and/or before inlining
> > > >>> (the test could become a loop exit test) would be a no-go.  In fact
> > > >>> for SSA coalescing we'd want the reverse transform in some cases, see
> > > >>> PRs 86270 and 70359.
> > > >>>
> > > >>> If we can reliably undo for the loop case I suppose we can do the
> > > >>> canonicalization to compare against zero.  In any case please split
> > > >>> up the patch (note I've also
> > > >>> hoped we could eventually get rid of that part of
> > > >>> tree-ssa-forwprop.cc in favor
> > > >>> of match.pd patterns since it uses GENERIC folding :/).
> > > >>>
> > > >> Do we have enough information to do this at expansion time?  That would
> > > >> avoid introducing the target dependencies to drive this in gimple.
> > > >
> > > > I think so, but there isn't any convenient place to do this I think.  I suppose
> > > > there's no hope to catch it during RTL opts?
> > > Combine would be the most natural place in the RTL pipeline, but it'd be
> > > a 2->2 combination which would be rejected.
> > >
> > > We could possibly do it as a define_insn_and_split, but the gimple->RTL
> > > interface seems like a better fit to me.  If TER has done its job, we
> > > should see a complex enough tree node to do the right thing.
> >
> > Of course we'd want to get rid of TER in favor of ISEL
> >
> > Richard.
> >
> > > jeff

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v1] [RFC] Improve folding for comparisons with zero in tree-ssa-forwprop.
  2023-08-03  7:04                   ` Richard Biener
@ 2023-08-03 15:21                     ` Jeff Law
  2023-08-04  6:37                       ` Richard Biener
  0 siblings, 1 reply; 20+ messages in thread
From: Jeff Law @ 2023-08-03 15:21 UTC (permalink / raw)
  To: Richard Biener, Manolis Tsamis
  Cc: Philipp Tomsich, Andrew MacLeod, gcc-patches



On 8/3/23 01:04, Richard Biener wrote:
> On Wed, Aug 2, 2023 at 4:08 PM Manolis Tsamis <manolis.tsamis@vrull.eu> wrote:
>>
>> Hi all,
>>
>> I'm pinging to discuss again if we want to move this forward for GCC14.
>>
>> I did some testing again and I haven't been able to find obvious
>> regressions, including testing the code from PR86270 and PR70359 that
>> Richard mentioned.
>> I still believe that zero can be considered a special case even for
>> hardware that doesn't directly benefit in the comparison.
>> For example it happens that the testcase from the commit compiles to
>> one instruction less in x86:
>>
>> .LFB0:
>>      movl    (%rdi), %eax
>>      leal    1(%rax), %edx
>>      movl    %edx, (%rdi)
>>      testl    %eax, %eax
>>      je    .L4
>>      ret
>> .L4:
>>      jmp    g
>>
>> vs
>>
>> .LFB0:
>>      movl    (%rdi), %eax
>>      addl    $1, %eax
>>      movl    %eax, (%rdi)
>>      cmpl    $1, %eax
>>      je    .L4
>>      ret
>> .L4:
>>      xorl    %eax, %eax
>>      jmp    g
>>
>> (The xorl is not emitted  when testl is used. LLVM uses testl but also
>> does xor eax, eax :) )
>> Although this is accidental, I believe it also showcases that zero is
>> a preferential value in various ways.
>>
>> I'm running benchmarks comparing the effects of this change and I'm
>> also still looking for testcases that result in problematic
>> regressions.
>> Any feedback or other concerns about this are appreciated!
> 
> My comment from Apr 24th still holds, IMO this is something for
> instruction selection (aka the ISEL pass) or the out-of-SSA tweaks
> we do during RTL expansion (see insert_backedge_copies)
I'm still generally supportive of biasing to zero, but as Richi has 
noted the current implementation needs to be pushed further back into 
the pipeline, preferably all the way to isel or gimple->rtl expansion.

Jeff

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v1] [RFC] Improve folding for comparisons with zero in tree-ssa-forwprop.
  2023-08-03 15:21                     ` Jeff Law
@ 2023-08-04  6:37                       ` Richard Biener
  0 siblings, 0 replies; 20+ messages in thread
From: Richard Biener @ 2023-08-04  6:37 UTC (permalink / raw)
  To: Jeff Law; +Cc: Manolis Tsamis, Philipp Tomsich, Andrew MacLeod, gcc-patches

On Thu, Aug 3, 2023 at 5:21 PM Jeff Law <jeffreyalaw@gmail.com> wrote:
>
>
>
> On 8/3/23 01:04, Richard Biener wrote:
> > On Wed, Aug 2, 2023 at 4:08 PM Manolis Tsamis <manolis.tsamis@vrull.eu> wrote:
> >>
> >> Hi all,
> >>
> >> I'm pinging to discuss again if we want to move this forward for GCC14.
> >>
> >> I did some testing again and I haven't been able to find obvious
> >> regressions, including testing the code from PR86270 and PR70359 that
> >> Richard mentioned.
> >> I still believe that zero can be considered a special case even for
> >> hardware that doesn't directly benefit in the comparison.
> >> For example it happens that the testcase from the commit compiles to
> >> one instruction less in x86:
> >>
> >> .LFB0:
> >>      movl    (%rdi), %eax
> >>      leal    1(%rax), %edx
> >>      movl    %edx, (%rdi)
> >>      testl    %eax, %eax
> >>      je    .L4
> >>      ret
> >> .L4:
> >>      jmp    g
> >>
> >> vs
> >>
> >> .LFB0:
> >>      movl    (%rdi), %eax
> >>      addl    $1, %eax
> >>      movl    %eax, (%rdi)
> >>      cmpl    $1, %eax
> >>      je    .L4
> >>      ret
> >> .L4:
> >>      xorl    %eax, %eax
> >>      jmp    g
> >>
> >> (The xorl is not emitted  when testl is used. LLVM uses testl but also
> >> does xor eax, eax :) )
> >> Although this is accidental, I believe it also showcases that zero is
> >> a preferential value in various ways.
> >>
> >> I'm running benchmarks comparing the effects of this change and I'm
> >> also still looking for testcases that result in problematic
> >> regressions.
> >> Any feedback or other concerns about this are appreciated!
> >
> > My comment from Apr 24th still holds, IMO this is something for
> > instruction selection (aka the ISEL pass) or the out-of-SSA tweaks
> > we do during RTL expansion (see insert_backedge_copies)
> I'm still generally supportive of biasing to zero, but as Richi has
> noted the current implementation needs to be pushed further back into
> the pipeline, preferably all the way to isel or gimple->rtl expansion.

Note the main reason is that if you only "fix" forwprop you miss other places
that will happily undo this.  As the intent is to get better instruction
selection, doing the canonicalization at RTL expansion sounds like the best
idea to me.

Richard.

> Jeff

^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2023-08-04  6:37 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-03-16 15:27 [PATCH v1] [RFC] Improve folding for comparisons with zero in tree-ssa-forwprop Manolis Tsamis
2023-03-16 16:41 ` Jeff Law
2023-03-16 20:32   ` Philipp Tomsich
2023-03-17  8:31 ` Richard Biener
2023-03-17 13:15   ` Philipp Tomsich
2023-03-17 14:03     ` Richard Biener
2023-03-17 20:43     ` Andrew Waterman
2023-03-17 14:12   ` Andrew MacLeod
2023-03-20 14:01   ` Manolis Tsamis
2023-03-23 23:27     ` Jeff Law
2023-04-21 21:01     ` Philipp Tomsich
2023-04-24  8:06       ` Richard Biener
2023-04-24 23:05         ` Jeff Law
2023-04-25  7:21           ` Richard Biener
2023-04-26  2:30             ` Jeff Law
2023-04-26  6:41               ` Richard Biener
2023-08-02 14:07                 ` Manolis Tsamis
2023-08-03  7:04                   ` Richard Biener
2023-08-03 15:21                     ` Jeff Law
2023-08-04  6:37                       ` Richard Biener

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).