public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug tree-optimization/115629] New: Inefficient if-convert of masked conditionals
@ 2024-06-25  5:44 tnfchris at gcc dot gnu.org
  2024-06-25  5:47 ` [Bug tree-optimization/115629] " pinskia at gcc dot gnu.org
                   ` (3 more replies)
  0 siblings, 4 replies; 5+ messages in thread
From: tnfchris at gcc dot gnu.org @ 2024-06-25  5:44 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115629

            Bug ID: 115629
           Summary: Inefficient if-convert of masked conditionals
           Product: gcc
           Version: 15.0
            Status: UNCONFIRMED
          Keywords: missed-optimization
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: tnfchris at gcc dot gnu.org
  Target Milestone: ---

With cases such as:

void foo1 (int *restrict a, int *restrict b, int *restrict c,
          int *restrict d, int *restrict res, int n)
{
    for (int i = 0; i < n; i++)
      res[i] = a[i] ? b[i] : (c[i] ? b[i] : d[i]);
}

we generate:

foo1:
        cmp     w5, 0
        ble     .L1
        mov     x6, 0
        whilelo p7.s, wzr, w5
        ptrue   p3.b, all
.L3:
        ld1w    z31.s, p7/z, [x0, x6, lsl 2]
        cmpeq   p6.s, p7/z, z31.s, #0
        cmpne   p5.s, p7/z, z31.s, #0
        ld1w    z0.s, p6/z, [x2, x6, lsl 2]
        ld1w    z30.s, p5/z, [x1, x6, lsl 2]
        cmpne   p15.s, p3/z, z0.s, #0
        orr     z0.d, z31.d, z0.d
        and     p6.b, p15/z, p6.b, p6.b
        cmpeq   p4.s, p7/z, z0.s, #0
        ld1w    z28.s, p6/z, [x1, x6, lsl 2]
        ld1w    z29.s, p4/z, [x3, x6, lsl 2]
        sel     z29.s, p15, z28.s, z29.s
        sel     z29.s, p5, z30.s, z29.s
        st1w    z29.s, p7, [x4, x6, lsl 2]
        incw    x6
        whilelo p7.s, w6, w5
        b.any   .L3

where b is loaded twice, once with the mask a[i] != 0, and once a[i] == 0 &&
c[i] != 0.
clearly we don't need the second compare nor the second load.  This loop can be
optimally handled as:

foo1:
        cmp     w5, 0
        ble     .L1
        mov     x6, 0
        cntw    x7
        whilelo p7.s, wzr, w5
        .p2align 5,,15
.L3:
        ld1w    z1.s, p7/z, [x0, x6, lsl 2]
        ld1w    z0.s, p7/z, [x2, x6, lsl 2]
        orr     z0.d, z1.d, z0.d
        cmpne   p6.s, p7/z, z0.s, #0
        cmpeq   p5.s, p7/z, z0.s, #0
        ld1w    z31.s, p6/z, [x1, x6, lsl 2]
        ld1w    z30.s, p5/z, [x3, x6, lsl 2]
        sel     z30.s, p6, z31.s, z30.s
        st1w    z30.s, p7, [x4, x6, lsl 2]
        add     x6, x6, x7
        whilelo p7.s, w6, w5
        b.any   .L3
.L1:
        ret

i.e. transform a ? b : (c ? b : d) into (a || c) ? b : d.

This transform is actually also beneficial for scalar, but that's not the case
when one of the conditions have to be inverted. i.e. cases 2 to 4 are
beneficial for vector masked operations but not scalar:

/* Convert a ? b : (c ? b : d) into (a || c) ? b : d.  */
void foo1 (int *restrict a, int *restrict b, int *restrict c,
          int *restrict d, int *restrict res, int n)
{
    for (int i = 0; i < n; i++)
      res[i] = a[i] ? b[i] : (c[i] ? b[i] : d[i]);
}

/* Convert a ? (c ? b : d) : b into (-a || c) ? b : d.  */
void foo2 (int *restrict a, int *restrict b, int *restrict c,
          int *restrict d, int *restrict res, int n)
{
    for (int i = 0; i < n; i++)
      res[i] = a[i] ? (c[i] ? b[i] : d[i]) : b[i];
}

/* Convert a ? (c ? d :b) : b into (-a || -c) ? b : d.  */
void foo3 (int *restrict a, int *restrict b, int *restrict c,
          int *restrict d, int *restrict res, int n)
{
    for (int i = 0; i < n; i++)
      res[i] = a[i] ? (c[i] ? d[i] : b[i]) : b[i];
}

/* Convert a ? b : (c ? d : b) into (a || -c) ? b : d.  */
void foo4 (int *restrict a, int *restrict b, int *restrict c,
          int *restrict d, int *restrict res, int n)
{
    for (int i = 0; i < n; i++)
      res[i] = a[i] ? b[i] : (c[i] ? d[i] : b[i]);
}

for instance case 3 is currently vectorized as:

foo3:
        cmp     w5, 0
        ble     .L10
        mov     x6, 0
        whilelo p7.s, wzr, w5
        ptrue   p3.b, all
.L12:
        ld1w    z1.s, p7/z, [x0, x6, lsl 2]
        cmpeq   p5.s, p7/z, z1.s, #0
        cmpne   p6.s, p7/z, z1.s, #0
        ld1w    z29.s, p5/z, [x1, x6, lsl 2]
        ld1w    z0.s, p6/z, [x2, x6, lsl 2]
        cmpne   p15.s, p3/z, z0.s, #0
        cmpeq   p4.s, p6/z, z0.s, #0
        and     p5.b, p15/z, p6.b, p6.b
        ld1w    z30.s, p4/z, [x1, x6, lsl 2]
        ld1w    z31.s, p5/z, [x3, x6, lsl 2]
        sel     z30.s, p15, z31.s, z30.s
        sel     z30.s, p6, z30.s, z29.s
        st1w    z30.s, p7, [x4, x6, lsl 2]
        incw    x6
        whilelo p7.s, w6, w5
        b.any   .L12

but can be

foo3:
        cmp     w5, 0
        ble     .L10
        mov     x6, 0
        cntw    x7
        whilelo p6.s, wzr, w5
        ptrue   p5.b, all
        .p2align 5,,15
.L12:
        ld1w    z29.s, p6/z, [x0, x6, lsl 2]
        ld1w    z28.s, p6/z, [x2, x6, lsl 2]
        cmpeq   p15.s, p5/z, z29.s, #0
        cmpeq   p14.s, p5/z, z28.s, #0
        orr     p15.b, p5/z, p15.b, p14.b
        and     p4.b, p6/z, p15.b, p15.b
        bic     p7.b, p5/z, p6.b, p15.b
        ld1w    z31.s, p4/z, [x1, x6, lsl 2]
        ld1w    z30.s, p7/z, [x3, x6, lsl 2]
        sel     z30.s, p4, z31.s, z30.s
        st1w    z30.s, p6, [x4, x6, lsl 2]
        add     x6, x6, x7
        whilelo p6.s, w6, w5
        b.any   .L12

with those match.pd rule, however this likely needs to be done in if-convert
since not all the transforms are beneficial for scalar because the negation
adds an extra instruction before the branch.

I think this would have to be done before conditional load lowering, because
we'd have to transform:

  _32 = _4 != 0;
  iftmp.0_22 = .MASK_LOAD (_5, 32B, _32);
  _61 = _4 == 0;
  _7 = .MASK_LOAD (_6, 32B, _61);
  _26 = _7 != 0;
  _13 = _26 & _61;
  iftmp.0_21 = .MASK_LOAD (_5, 32B, _13);
  _66 = _4 | _7;
  _36 = _66 == 0;
  iftmp.0_19 = .MASK_LOAD (_9, 32B, _36);
  _ifc__56 = _26 ? iftmp.0_21 : iftmp.0_19;
  iftmp.0_12 = _32 ? iftmp.0_22 : _ifc__56;

into

  _4 = *_3;
  _6 = *_5;
  _7 = _4 | _6;
  _35 = _7 != 0;
  iftmp.0_21 = .MASK_LOAD (_8, 32B, _35);
  _51 = _7 == 0;
  iftmp.0_19 = .MASK_LOAD (_9, 32B, _51);
  iftmp.0_12 = _35 ? iftmp.0_21 : iftmp.0_19;

which would be quite hard.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [Bug tree-optimization/115629] Inefficient if-convert of masked conditionals
  2024-06-25  5:44 [Bug tree-optimization/115629] New: Inefficient if-convert of masked conditionals tnfchris at gcc dot gnu.org
@ 2024-06-25  5:47 ` pinskia at gcc dot gnu.org
  2024-06-25  9:41 ` rguenth at gcc dot gnu.org
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 5+ messages in thread
From: pinskia at gcc dot gnu.org @ 2024-06-25  5:47 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115629

Andrew Pinski <pinskia at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |pinskia at gcc dot gnu.org
           Severity|normal                      |enhancement
             Blocks|                            |53947


Referenced Bugs:

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53947
[Bug 53947] [meta-bug] vectorizer missed-optimizations

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [Bug tree-optimization/115629] Inefficient if-convert of masked conditionals
  2024-06-25  5:44 [Bug tree-optimization/115629] New: Inefficient if-convert of masked conditionals tnfchris at gcc dot gnu.org
  2024-06-25  5:47 ` [Bug tree-optimization/115629] " pinskia at gcc dot gnu.org
@ 2024-06-25  9:41 ` rguenth at gcc dot gnu.org
  2024-06-26 16:53 ` cvs-commit at gcc dot gnu.org
  2024-06-26 16:55 ` rguenth at gcc dot gnu.org
  3 siblings, 0 replies; 5+ messages in thread
From: rguenth at gcc dot gnu.org @ 2024-06-25  9:41 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115629

Richard Biener <rguenth at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|UNCONFIRMED                 |NEW
   Last reconfirmed|                            |2024-06-25
     Ever confirmed|0                           |1

--- Comment #1 from Richard Biener <rguenth at gcc dot gnu.org> ---
I think the transform is profitable for scalar for all cases as well.

Now, if-conversion records BB predicates and uses those to predicate
stmts/PHIs, I'm not sure how to go optimize it on that level.

It's more a CFG transform of a if (A) x; else { if (B) x; else y; } to
if (A || B) x; else y;  with the downside that in this case evaluating
'B' has side-effects (memory dereference which can trap).

Due to the trapping this needs to be somehow done in if-conversion
where you also need to determine that x == x' (both copies of x
evaluate to the same code).  Some scalar GIMPLE opt could have
de-duplicated the BB with 'x' (tail-merging to the rescue?  it considers
the cands but somehow doesn't merge - I think the PHI node isn't
supported).

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [Bug tree-optimization/115629] Inefficient if-convert of masked conditionals
  2024-06-25  5:44 [Bug tree-optimization/115629] New: Inefficient if-convert of masked conditionals tnfchris at gcc dot gnu.org
  2024-06-25  5:47 ` [Bug tree-optimization/115629] " pinskia at gcc dot gnu.org
  2024-06-25  9:41 ` rguenth at gcc dot gnu.org
@ 2024-06-26 16:53 ` cvs-commit at gcc dot gnu.org
  2024-06-26 16:55 ` rguenth at gcc dot gnu.org
  3 siblings, 0 replies; 5+ messages in thread
From: cvs-commit at gcc dot gnu.org @ 2024-06-26 16:53 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115629

--- Comment #2 from GCC Commits <cvs-commit at gcc dot gnu.org> ---
The master branch has been updated by Richard Biener <rguenth@gcc.gnu.org>:

https://gcc.gnu.org/g:629257bcb81434117f1e9c68479032563176dc0c

commit r15-1662-g629257bcb81434117f1e9c68479032563176dc0c
Author: Richard Biener <rguenther@suse.de>
Date:   Tue Jun 25 14:04:31 2024 +0200

    tree-optimization/115629 - missed tail merging

    The following fixes a missed tail-merging observed for the testcase
    in PR115629.  The issue is that when deps_ok_for_redirect doesn't
    compute both would be valid prevailing blocks it rejects the merge.
    The following instead makes sure to record the working block as
    prevailing.  Also stmt comparison fails for indirect references
    and is not handling memory references thoroughly, failing to unify
    array indices and pointers indirected.  The following attempts to
    fix this.

            PR tree-optimization/115629
            * tree-ssa-tail-merge.cc (gimple_equal_p): Handle
            memory references better.
            (deps_ok_for_redirect): Handle the case not both blocks
            are considered a valid prevailing block.

            * gcc.dg/tree-ssa/tail-merge-1.c: New testcase.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [Bug tree-optimization/115629] Inefficient if-convert of masked conditionals
  2024-06-25  5:44 [Bug tree-optimization/115629] New: Inefficient if-convert of masked conditionals tnfchris at gcc dot gnu.org
                   ` (2 preceding siblings ...)
  2024-06-26 16:53 ` cvs-commit at gcc dot gnu.org
@ 2024-06-26 16:55 ` rguenth at gcc dot gnu.org
  3 siblings, 0 replies; 5+ messages in thread
From: rguenth at gcc dot gnu.org @ 2024-06-26 16:55 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115629

--- Comment #3 from Richard Biener <rguenth at gcc dot gnu.org> ---
So we now tail-merge the two b[i] loading blocks.  Can you check SVE code-gen
with this?  If that fixes the PR consider adding a SVE testcase.

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2024-06-26 16:55 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-06-25  5:44 [Bug tree-optimization/115629] New: Inefficient if-convert of masked conditionals tnfchris at gcc dot gnu.org
2024-06-25  5:47 ` [Bug tree-optimization/115629] " pinskia at gcc dot gnu.org
2024-06-25  9:41 ` rguenth at gcc dot gnu.org
2024-06-26 16:53 ` cvs-commit at gcc dot gnu.org
2024-06-26 16:55 ` rguenth at gcc dot gnu.org

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).