public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug tree-optimization/115629] New: Inefficient if-convert of masked conditionals
@ 2024-06-25  5:44 tnfchris at gcc dot gnu.org
  2024-06-25  5:47 ` [Bug tree-optimization/115629] " pinskia at gcc dot gnu.org
                   ` (9 more replies)
  0 siblings, 10 replies; 11+ messages in thread
From: tnfchris at gcc dot gnu.org @ 2024-06-25  5:44 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115629

            Bug ID: 115629
           Summary: Inefficient if-convert of masked conditionals
           Product: gcc
           Version: 15.0
            Status: UNCONFIRMED
          Keywords: missed-optimization
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: tnfchris at gcc dot gnu.org
  Target Milestone: ---

With cases such as:

void foo1 (int *restrict a, int *restrict b, int *restrict c,
          int *restrict d, int *restrict res, int n)
{
    for (int i = 0; i < n; i++)
      res[i] = a[i] ? b[i] : (c[i] ? b[i] : d[i]);
}

we generate:

foo1:
        cmp     w5, 0
        ble     .L1
        mov     x6, 0
        whilelo p7.s, wzr, w5
        ptrue   p3.b, all
.L3:
        ld1w    z31.s, p7/z, [x0, x6, lsl 2]
        cmpeq   p6.s, p7/z, z31.s, #0
        cmpne   p5.s, p7/z, z31.s, #0
        ld1w    z0.s, p6/z, [x2, x6, lsl 2]
        ld1w    z30.s, p5/z, [x1, x6, lsl 2]
        cmpne   p15.s, p3/z, z0.s, #0
        orr     z0.d, z31.d, z0.d
        and     p6.b, p15/z, p6.b, p6.b
        cmpeq   p4.s, p7/z, z0.s, #0
        ld1w    z28.s, p6/z, [x1, x6, lsl 2]
        ld1w    z29.s, p4/z, [x3, x6, lsl 2]
        sel     z29.s, p15, z28.s, z29.s
        sel     z29.s, p5, z30.s, z29.s
        st1w    z29.s, p7, [x4, x6, lsl 2]
        incw    x6
        whilelo p7.s, w6, w5
        b.any   .L3

where b is loaded twice, once with the mask a[i] != 0, and once a[i] == 0 &&
c[i] != 0.
Clearly we don't need the second compare nor the second load.  This loop can be
optimally handled as:

foo1:
        cmp     w5, 0
        ble     .L1
        mov     x6, 0
        cntw    x7
        whilelo p7.s, wzr, w5
        .p2align 5,,15
.L3:
        ld1w    z1.s, p7/z, [x0, x6, lsl 2]
        ld1w    z0.s, p7/z, [x2, x6, lsl 2]
        orr     z0.d, z1.d, z0.d
        cmpne   p6.s, p7/z, z0.s, #0
        cmpeq   p5.s, p7/z, z0.s, #0
        ld1w    z31.s, p6/z, [x1, x6, lsl 2]
        ld1w    z30.s, p5/z, [x3, x6, lsl 2]
        sel     z30.s, p6, z31.s, z30.s
        st1w    z30.s, p7, [x4, x6, lsl 2]
        add     x6, x6, x7
        whilelo p7.s, w6, w5
        b.any   .L3
.L1:
        ret

i.e. transform a ? b : (c ? b : d) into (a || c) ? b : d.

This transform is actually also beneficial for scalar, but that's not the case
when one of the conditions has to be inverted. i.e. cases 2 to 4 are
beneficial for vector masked operations but not scalar:

/* Convert a ? b : (c ? b : d) into (a || c) ? b : d.  */
void foo1 (int *restrict a, int *restrict b, int *restrict c,
          int *restrict d, int *restrict res, int n)
{
    for (int i = 0; i < n; i++)
      res[i] = a[i] ? b[i] : (c[i] ? b[i] : d[i]);
}

/* Convert a ? (c ? b : d) : b into (-a || c) ? b : d.  */
void foo2 (int *restrict a, int *restrict b, int *restrict c,
          int *restrict d, int *restrict res, int n)
{
    for (int i = 0; i < n; i++)
      res[i] = a[i] ? (c[i] ? b[i] : d[i]) : b[i];
}

/* Convert a ? (c ? d : b) : b into (-a || -c) ? b : d.  */
void foo3 (int *restrict a, int *restrict b, int *restrict c,
          int *restrict d, int *restrict res, int n)
{
    for (int i = 0; i < n; i++)
      res[i] = a[i] ? (c[i] ? d[i] : b[i]) : b[i];
}

/* Convert a ? b : (c ? d : b) into (a || -c) ? b : d.  */
void foo4 (int *restrict a, int *restrict b, int *restrict c,
          int *restrict d, int *restrict res, int n)
{
    for (int i = 0; i < n; i++)
      res[i] = a[i] ? b[i] : (c[i] ? d[i] : b[i]);
}

for instance case 3 is currently vectorized as:

foo3:
        cmp     w5, 0
        ble     .L10
        mov     x6, 0
        whilelo p7.s, wzr, w5
        ptrue   p3.b, all
.L12:
        ld1w    z1.s, p7/z, [x0, x6, lsl 2]
        cmpeq   p5.s, p7/z, z1.s, #0
        cmpne   p6.s, p7/z, z1.s, #0
        ld1w    z29.s, p5/z, [x1, x6, lsl 2]
        ld1w    z0.s, p6/z, [x2, x6, lsl 2]
        cmpne   p15.s, p3/z, z0.s, #0
        cmpeq   p4.s, p6/z, z0.s, #0
        and     p5.b, p15/z, p6.b, p6.b
        ld1w    z30.s, p4/z, [x1, x6, lsl 2]
        ld1w    z31.s, p5/z, [x3, x6, lsl 2]
        sel     z30.s, p15, z31.s, z30.s
        sel     z30.s, p6, z30.s, z29.s
        st1w    z30.s, p7, [x4, x6, lsl 2]
        incw    x6
        whilelo p7.s, w6, w5
        b.any   .L12

but can be

foo3:
        cmp     w5, 0
        ble     .L10
        mov     x6, 0
        cntw    x7
        whilelo p6.s, wzr, w5
        ptrue   p5.b, all
        .p2align 5,,15
.L12:
        ld1w    z29.s, p6/z, [x0, x6, lsl 2]
        ld1w    z28.s, p6/z, [x2, x6, lsl 2]
        cmpeq   p15.s, p5/z, z29.s, #0
        cmpeq   p14.s, p5/z, z28.s, #0
        orr     p15.b, p5/z, p15.b, p14.b
        and     p4.b, p6/z, p15.b, p15.b
        bic     p7.b, p5/z, p6.b, p15.b
        ld1w    z31.s, p4/z, [x1, x6, lsl 2]
        ld1w    z30.s, p7/z, [x3, x6, lsl 2]
        sel     z30.s, p4, z31.s, z30.s
        st1w    z30.s, p6, [x4, x6, lsl 2]
        add     x6, x6, x7
        whilelo p6.s, w6, w5
        b.any   .L12

with those match.pd rules, however this likely needs to be done in if-convert
since not all the transforms are beneficial for scalar because the negation
adds an extra instruction before the branch.

I think this would have to be done before conditional load lowering, because
we'd have to transform:

  _32 = _4 != 0;
  iftmp.0_22 = .MASK_LOAD (_5, 32B, _32);
  _61 = _4 == 0;
  _7 = .MASK_LOAD (_6, 32B, _61);
  _26 = _7 != 0;
  _13 = _26 & _61;
  iftmp.0_21 = .MASK_LOAD (_5, 32B, _13);
  _66 = _4 | _7;
  _36 = _66 == 0;
  iftmp.0_19 = .MASK_LOAD (_9, 32B, _36);
  _ifc__56 = _26 ? iftmp.0_21 : iftmp.0_19;
  iftmp.0_12 = _32 ? iftmp.0_22 : _ifc__56;

into

  _4 = *_3;
  _6 = *_5;
  _7 = _4 | _6;
  _35 = _7 != 0;
  iftmp.0_21 = .MASK_LOAD (_8, 32B, _35);
  _51 = _7 == 0;
  iftmp.0_19 = .MASK_LOAD (_9, 32B, _51);
  iftmp.0_12 = _35 ? iftmp.0_21 : iftmp.0_19;

which would be quite hard.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [Bug tree-optimization/115629] Inefficient if-convert of masked conditionals
  2024-06-25  5:44 [Bug tree-optimization/115629] New: Inefficient if-convert of masked conditionals tnfchris at gcc dot gnu.org
@ 2024-06-25  5:47 ` pinskia at gcc dot gnu.org
  2024-06-25  9:41 ` rguenth at gcc dot gnu.org
                   ` (8 subsequent siblings)
  9 siblings, 0 replies; 11+ messages in thread
From: pinskia at gcc dot gnu.org @ 2024-06-25  5:47 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115629

Andrew Pinski <pinskia at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |pinskia at gcc dot gnu.org
           Severity|normal                      |enhancement
             Blocks|                            |53947


Referenced Bugs:

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53947
[Bug 53947] [meta-bug] vectorizer missed-optimizations

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [Bug tree-optimization/115629] Inefficient if-convert of masked conditionals
  2024-06-25  5:44 [Bug tree-optimization/115629] New: Inefficient if-convert of masked conditionals tnfchris at gcc dot gnu.org
  2024-06-25  5:47 ` [Bug tree-optimization/115629] " pinskia at gcc dot gnu.org
@ 2024-06-25  9:41 ` rguenth at gcc dot gnu.org
  2024-06-26 16:53 ` cvs-commit at gcc dot gnu.org
                   ` (7 subsequent siblings)
  9 siblings, 0 replies; 11+ messages in thread
From: rguenth at gcc dot gnu.org @ 2024-06-25  9:41 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115629

Richard Biener <rguenth at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|UNCONFIRMED                 |NEW
   Last reconfirmed|                            |2024-06-25
     Ever confirmed|0                           |1

--- Comment #1 from Richard Biener <rguenth at gcc dot gnu.org> ---
I think the transform is profitable for scalar for all cases as well.

Now, if-conversion records BB predicates and uses those to predicate
stmts/PHIs, I'm not sure how to go optimize it on that level.

It's more a CFG transform of a if (A) x; else { if (B) x; else y; } to
if (A || B) x; else y;  with the downside that in this case evaluating
'B' has side-effects (memory dereference which can trap).

Due to the trapping this needs to be somehow done in if-conversion
where you also need to determine that x == x' (both copies of x
evaluate to the same code).  Some scalar GIMPLE opt could have
de-duplicated the BB with 'x' (tail-merging to the rescue?  it considers
the cands but somehow doesn't merge - I think the PHI node isn't
supported).

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [Bug tree-optimization/115629] Inefficient if-convert of masked conditionals
  2024-06-25  5:44 [Bug tree-optimization/115629] New: Inefficient if-convert of masked conditionals tnfchris at gcc dot gnu.org
  2024-06-25  5:47 ` [Bug tree-optimization/115629] " pinskia at gcc dot gnu.org
  2024-06-25  9:41 ` rguenth at gcc dot gnu.org
@ 2024-06-26 16:53 ` cvs-commit at gcc dot gnu.org
  2024-06-26 16:55 ` rguenth at gcc dot gnu.org
                   ` (6 subsequent siblings)
  9 siblings, 0 replies; 11+ messages in thread
From: cvs-commit at gcc dot gnu.org @ 2024-06-26 16:53 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115629

--- Comment #2 from GCC Commits <cvs-commit at gcc dot gnu.org> ---
The master branch has been updated by Richard Biener <rguenth@gcc.gnu.org>:

https://gcc.gnu.org/g:629257bcb81434117f1e9c68479032563176dc0c

commit r15-1662-g629257bcb81434117f1e9c68479032563176dc0c
Author: Richard Biener <rguenther@suse.de>
Date:   Tue Jun 25 14:04:31 2024 +0200

    tree-optimization/115629 - missed tail merging

    The following fixes a missed tail-merging observed for the testcase
    in PR115629.  The issue is that when deps_ok_for_redirect doesn't
    compute both would be valid prevailing blocks it rejects the merge.
    The following instead makes sure to record the working block as
    prevailing.  Also stmt comparison fails for indirect references
    and is not handling memory references thoroughly, failing to unify
    array indices and pointers indirected.  The following attempts to
    fix this.

            PR tree-optimization/115629
            * tree-ssa-tail-merge.cc (gimple_equal_p): Handle
            memory references better.
            (deps_ok_for_redirect): Handle the case not both blocks
            are considered a valid prevailing block.

            * gcc.dg/tree-ssa/tail-merge-1.c: New testcase.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [Bug tree-optimization/115629] Inefficient if-convert of masked conditionals
  2024-06-25  5:44 [Bug tree-optimization/115629] New: Inefficient if-convert of masked conditionals tnfchris at gcc dot gnu.org
                   ` (2 preceding siblings ...)
  2024-06-26 16:53 ` cvs-commit at gcc dot gnu.org
@ 2024-06-26 16:55 ` rguenth at gcc dot gnu.org
  2024-07-01 10:10 ` tnfchris at gcc dot gnu.org
                   ` (5 subsequent siblings)
  9 siblings, 0 replies; 11+ messages in thread
From: rguenth at gcc dot gnu.org @ 2024-06-26 16:55 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115629

--- Comment #3 from Richard Biener <rguenth at gcc dot gnu.org> ---
So we now tail-merge the two b[i] loading blocks.  Can you check SVE code-gen
with this?  If that fixes the PR consider adding a SVE testcase.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [Bug tree-optimization/115629] Inefficient if-convert of masked conditionals
  2024-06-25  5:44 [Bug tree-optimization/115629] New: Inefficient if-convert of masked conditionals tnfchris at gcc dot gnu.org
                   ` (3 preceding siblings ...)
  2024-06-26 16:55 ` rguenth at gcc dot gnu.org
@ 2024-07-01 10:10 ` tnfchris at gcc dot gnu.org
  2024-07-01 12:27 ` rguenther at suse dot de
                   ` (4 subsequent siblings)
  9 siblings, 0 replies; 11+ messages in thread
From: tnfchris at gcc dot gnu.org @ 2024-07-01 10:10 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115629

--- Comment #4 from Tamar Christina <tnfchris at gcc dot gnu.org> ---
(In reply to Richard Biener from comment #3)
> So we now tail-merge the two b[i] loading blocks.  Can you check SVE
> code-gen with this?  If that fixes the PR consider adding a SVE testcase.

Thanks, the codegen is much better now, but shows some other missing mask
tracking in the vectorizer.

Atm we generate:

.L3:
        ld1w    z31.s, p6/z, [x0, x6, lsl 2] <-- load a
        cmpeq   p7.s, p6/z, z31.s, #0        <-- a == 0, !a
        ld1w    z0.s, p7/z, [x2, x6, lsl 2]  <-- load c conditionally on !a
        cmpeq   p7.s, p7/z, z0.s, #0         <-- !a && !c
        orr     z0.d, z31.d, z0.d            <-- a || c
        ld1w    z29.s, p7/z, [x3, x6, lsl 2] <--- load d where !a && !c
        cmpne   p5.s, p6/z, z0.s, #0         <--- (a || c) & loop_mask
        and     p7.b, p6/z, p7.b, p7.b       <--- ((!a && !c) && (!a && !c)) &
loop_mask 
        ld1w    z30.s, p5/z, [x1, x6, lsl 2] <-- load b conditionally on (a ||
c)
        sel     z30.s, p7, z29.s, z30.s      <-- select (!a && !c, d, b)
        st1w    z30.s, p6, [x4, x6, lsl 2]
        add     x6, x6, x7
        whilelo p6.s, w6, w5
        b.any   .L3

which corresponds to:

  # loop_mask_63 = PHI <next_mask_95(10), max_mask_94(20)>
  vect__4.10_64 = .MASK_LOAD (vectp_a.8_53, 32B, loop_mask_63);
  mask__31.11_66 = vect__4.10_64 != { 0, ... };
  mask__56.12_67 = ~mask__31.11_66;
  vec_mask_and_70 = mask__56.12_67 & loop_mask_63;
  vect__7.15_71 = .MASK_LOAD (vectp_c.13_68, 32B, vec_mask_and_70);
  mask__22.16_73 = vect__7.15_71 == { 0, ... };
  mask__34.17_75 = vec_mask_and_70 & mask__22.16_73;
  vect_iftmp.20_78 = .MASK_LOAD (vectp_d.18_76, 32B, mask__34.17_75);
  vect__61.21_79 = vect__4.10_64 | vect__7.15_71;
  mask__35.22_81 = vect__61.21_79 != { 0, ... };
  vec_mask_and_84 = mask__35.22_81 & loop_mask_63;
  vect_iftmp.25_85 = .MASK_LOAD (vectp_b.23_82, 32B, vec_mask_and_84);
  _86 = mask__34.17_75 & loop_mask_63;
  vect_iftmp.26_87 = VEC_COND_EXPR <_86, vect_iftmp.20_78, vect_iftmp.25_85>;
  .MASK_STORE (vectp_res.27_88, 32B, loop_mask_63, vect_iftmp.26_87);

it looks like what's missing is that the mask tracking doesn't know that other
masked operations naturally perform an AND when combined.  We do some of this
in the backend but I feel like it may be better to do it in the vectorizer.

In this case, the second load is conditional on the first load mask,  which
means it's already done an AND.
And crucially inverting it means you also inverted both conditions.

So there are some superfluous masking operations happening.  But I guess that's
a separate bug.  Shall I just add some tests here and close it and open a new
PR?

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [Bug tree-optimization/115629] Inefficient if-convert of masked conditionals
  2024-06-25  5:44 [Bug tree-optimization/115629] New: Inefficient if-convert of masked conditionals tnfchris at gcc dot gnu.org
                   ` (4 preceding siblings ...)
  2024-07-01 10:10 ` tnfchris at gcc dot gnu.org
@ 2024-07-01 12:27 ` rguenther at suse dot de
  2024-07-01 19:33 ` tnfchris at gcc dot gnu.org
                   ` (3 subsequent siblings)
  9 siblings, 0 replies; 11+ messages in thread
From: rguenther at suse dot de @ 2024-07-01 12:27 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115629

--- Comment #5 from rguenther at suse dot de <rguenther at suse dot de> ---
> Am 01.07.2024 um 12:10 schrieb tnfchris at gcc dot gnu.org <gcc-bugzilla@gcc.gnu.org>:
> 
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115629
> 
> --- Comment #4 from Tamar Christina <tnfchris at gcc dot gnu.org> ---
> (In reply to Richard Biener from comment #3)
>> So we now tail-merge the two b[i] loading blocks.  Can you check SVE
>> code-gen with this?  If that fixes the PR consider adding a SVE testcase.
> 
> Thanks, the codegen is much better now, but shows some other missing mask
> tracking in the vectorizer.
> 
> Atm we generate:
> 
> .L3:
>        ld1w    z31.s, p6/z, [x0, x6, lsl 2] <-- load a
>        cmpeq   p7.s, p6/z, z31.s, #0        <-- a == 0, !a
>        ld1w    z0.s, p7/z, [x2, x6, lsl 2]  <-- load c conditionally on !a
>        cmpeq   p7.s, p7/z, z0.s, #0         <-- !a && !c
>        orr     z0.d, z31.d, z0.d            <-- a || c
>        ld1w    z29.s, p7/z, [x3, x6, lsl 2] <--- load d where !a && !c
>        cmpne   p5.s, p6/z, z0.s, #0         <--- (a || c) & loop_mask
>        and     p7.b, p6/z, p7.b, p7.b       <--- ((!a && !c) && (!a && !c)) &
> loop_mask
>        ld1w    z30.s, p5/z, [x1, x6, lsl 2] <-- load b conditionally on (a ||
> c)
>        sel     z30.s, p7, z29.s, z30.s      <-- select (!a && !c, d, b)
>        st1w    z30.s, p6, [x4, x6, lsl 2]
>        add     x6, x6, x7
>        whilelo p6.s, w6, w5
>        b.any   .L3
> 
> which corresponds to:
> 
>  # loop_mask_63 = PHI <next_mask_95(10), max_mask_94(20)>
>  vect__4.10_64 = .MASK_LOAD (vectp_a.8_53, 32B, loop_mask_63);
>  mask__31.11_66 = vect__4.10_64 != { 0, ... };
>  mask__56.12_67 = ~mask__31.11_66;
>  vec_mask_and_70 = mask__56.12_67 & loop_mask_63;
>  vect__7.15_71 = .MASK_LOAD (vectp_c.13_68, 32B, vec_mask_and_70);
>  mask__22.16_73 = vect__7.15_71 == { 0, ... };
>  mask__34.17_75 = vec_mask_and_70 & mask__22.16_73;
>  vect_iftmp.20_78 = .MASK_LOAD (vectp_d.18_76, 32B, mask__34.17_75);
>  vect__61.21_79 = vect__4.10_64 | vect__7.15_71;
>  mask__35.22_81 = vect__61.21_79 != { 0, ... };
>  vec_mask_and_84 = mask__35.22_81 & loop_mask_63;
>  vect_iftmp.25_85 = .MASK_LOAD (vectp_b.23_82, 32B, vec_mask_and_84);
>  _86 = mask__34.17_75 & loop_mask_63;
>  vect_iftmp.26_87 = VEC_COND_EXPR <_86, vect_iftmp.20_78, vect_iftmp.25_85>;
>  .MASK_STORE (vectp_res.27_88, 32B, loop_mask_63, vect_iftmp.26_87);
> 
> it looks like what's missing is that the mask tracking doesn't know that other
> masked operations naturally perform an AND when combined.  We do some of this
> in the backend but I feel like it may be better to do it in the vectorizer.
> 
> In this case, the second load is conditional on the first load mask,  which
> means it's already done an AND.
> And crucially inverting it means you also inverted both conditions.
> 
> So there are some superfluous masking operations happening.  But I guess that's
> a separate bug.  Shall I just add some tests here and close it and open a new
> PR?

Not sure if that helps - do we fully understand this is a separate issue and
not related to how we if-convert?

Adding a testcase is nevertheless OK of course.

> --
> You are receiving this mail because:
> You are on the CC list for the bug.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [Bug tree-optimization/115629] Inefficient if-convert of masked conditionals
  2024-06-25  5:44 [Bug tree-optimization/115629] New: Inefficient if-convert of masked conditionals tnfchris at gcc dot gnu.org
                   ` (5 preceding siblings ...)
  2024-07-01 12:27 ` rguenther at suse dot de
@ 2024-07-01 19:33 ` tnfchris at gcc dot gnu.org
  2024-07-02  7:03 ` rguenth at gcc dot gnu.org
                   ` (2 subsequent siblings)
  9 siblings, 0 replies; 11+ messages in thread
From: tnfchris at gcc dot gnu.org @ 2024-07-01 19:33 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115629

--- Comment #6 from Tamar Christina <tnfchris at gcc dot gnu.org> ---
(In reply to rguenther@suse.de from comment #5)
> > In this case, the second load is conditional on the first load mask,  which
> > means it's already done an AND.
> > And crucially inverting it means you also inverted both conditions.
> > 
> > So there are some superflous masking operations happening.  But I guess that's
> > a separate bug.  Shall I just add some tests here and close it and open a new
> > PR?
> 
> Not sure if that helps - do we fully understand this is a separate issue and
> not related to how we if-convert?
> 

if-convert looks ok to me:

  <bb 3> [local count: 955630226]:
  # i_28 = PHI <i_25(10), 0(21)>
  _1 = (long unsigned int) i_28;
  _2 = _1 * 4;
  _3 = a_16(D) + _2;
  _4 = *_3;
  _31 = _4 != 0;
  _55 = _54 + _2;
  _6 = (int *) _55;
  _56 = ~_31;
  _7 = .MASK_LOAD (_6, 32B, _56);
  _22 = _7 == 0;
  _34 = _22 & _56;
  _58 = _57 + _2;
  _9 = (int *) _58;
  iftmp.0_19 = .MASK_LOAD (_9, 32B, _34);
  _61 = _4 | _7;
  _35 = _61 != 0;
  _60 = _59 + _2;
  _8 = (int *) _60;
  iftmp.0_21 = .MASK_LOAD (_8, 32B, _35);
  iftmp.0_12 = _34 ? iftmp.0_19 : iftmp.0_21;
  _10 = res_23(D) + _2;
  *_10 = iftmp.0_12;
  i_25 = i_28 + 1;
  if (n_15(D) > i_25)
    goto <bb 10>; [89.00%]
  else
    goto <bb 11>; [11.00%]

I think what's missing here is that

  _7 = .MASK_LOAD (_6, 32B, _56);
  _22 = _7 == 0;
  _34 = _22 & _56;
  iftmp.0_19 = .MASK_LOAD (_9, 32B, _34);

in itself produces an AND. namely (_7 && _34) && _56 where _56 is the loop
mask.

my (probably poor) understanding is that the mask tracking in the vectorizer
attempts to avoid keeping masks and their inverse live at the same time.

but that this code in this case doesn't track masks introduced by nested
MASK_LOADS.  at least, that's my naive interpretation.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [Bug tree-optimization/115629] Inefficient if-convert of masked conditionals
  2024-06-25  5:44 [Bug tree-optimization/115629] New: Inefficient if-convert of masked conditionals tnfchris at gcc dot gnu.org
                   ` (6 preceding siblings ...)
  2024-07-01 19:33 ` tnfchris at gcc dot gnu.org
@ 2024-07-02  7:03 ` rguenth at gcc dot gnu.org
  2024-07-02 19:33 ` rsandifo at gcc dot gnu.org
  2024-07-03  7:27 ` rguenth at gcc dot gnu.org
  9 siblings, 0 replies; 11+ messages in thread
From: rguenth at gcc dot gnu.org @ 2024-07-02  7:03 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115629

--- Comment #7 from Richard Biener <rguenth at gcc dot gnu.org> ---
(In reply to Tamar Christina from comment #6)
> (In reply to rguenther@suse.de from comment #5)
> > > In this case, the second load is conditional on the first load mask,  which
> > > means it's already done an AND.
> > > And crucially inverting it means you also inverted both conditions.
> > > 
> > > So there are some superflous masking operations happening.  But I guess that's
> > > a separate bug.  Shall I just add some tests here and close it and open a new
> > > PR?
> > 
> > Not sure if that helps - do we fully understand this is a separate issue and
> > not related to how we if-convert?
> > 
> 
> if-convert looks ok to me:
> 
>   <bb 3> [local count: 955630226]:
>   # i_28 = PHI <i_25(10), 0(21)>
>   _1 = (long unsigned int) i_28;
>   _2 = _1 * 4;
>   _3 = a_16(D) + _2;
>   _4 = *_3;
>   _31 = _4 != 0;
>   _55 = _54 + _2;
>   _6 = (int *) _55;
>   _56 = ~_31;
>   _7 = .MASK_LOAD (_6, 32B, _56);
>   _22 = _7 == 0;
>   _34 = _22 & _56;
>   _58 = _57 + _2;
>   _9 = (int *) _58;
>   iftmp.0_19 = .MASK_LOAD (_9, 32B, _34);
>   _61 = _4 | _7;
>   _35 = _61 != 0;
>   _60 = _59 + _2;
>   _8 = (int *) _60;
>   iftmp.0_21 = .MASK_LOAD (_8, 32B, _35);
>   iftmp.0_12 = _34 ? iftmp.0_19 : iftmp.0_21;
>   _10 = res_23(D) + _2;
>   *_10 = iftmp.0_12;
>   i_25 = i_28 + 1;
>   if (n_15(D) > i_25)
>     goto <bb 10>; [89.00%]
>   else
>     goto <bb 11>; [11.00%]
> 
> I think what's missing here is that
> 
>   _7 = .MASK_LOAD (_6, 32B, _56);
>   _22 = _7 == 0;
>   _34 = _22 & _56;
>   iftmp.0_19 = .MASK_LOAD (_9, 32B, _34);
> 
> in itself produces an AND. namely (_7 && _34) && _56 where _56 is the loop
> mask.
> 
> my (probably poor understanding) is that the mask tracking in the vectorizer
> attempts to prevent to keep masks and their inverse live at the same time.
> 
> but that this code in this case doesn't track masks introduced by nested
> MASK_LOADS.  at least, that's my naive interpretation.

Hmm, without carrying out the math it seems like for

  iftmp.0_19 = .MASK_LOAD (_9, 32B, _34);
..
  iftmp.0_21 = .MASK_LOAD (_8, 32B, _35);
  iftmp.0_12 = _34 ? iftmp.0_19 : iftmp.0_21;

the _34 conditional and the _34 and _35 masks would make it so this
could be written with sth like

  iftmp.0_12 = iftmp.0_19 | iftmp.0_21;

but a detail might have escaped me ;)  Is that what you're trying to say
as well?  That of course still leaves maybe partly redundant mask
computations.  In the ifcvt dump it doesn't help that != 0 is sometimes
elided and sometimes not ...

IIRC the vectorizer at some point tries to optimize loop mask and mask
mask?  Maybe we need to teach that mechanism to track how mask masks are
combined from other masks via & and | to improve the masks used?

This could be also a separate mask optimization phase on the SLP
representation.  While it doesn't represent loop masking at least
it does for .MASK_LOAD and .MASK_STORE (and .COND_*).

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [Bug tree-optimization/115629] Inefficient if-convert of masked conditionals
  2024-06-25  5:44 [Bug tree-optimization/115629] New: Inefficient if-convert of masked conditionals tnfchris at gcc dot gnu.org
                   ` (7 preceding siblings ...)
  2024-07-02  7:03 ` rguenth at gcc dot gnu.org
@ 2024-07-02 19:33 ` rsandifo at gcc dot gnu.org
  2024-07-03  7:27 ` rguenth at gcc dot gnu.org
  9 siblings, 0 replies; 11+ messages in thread
From: rsandifo at gcc dot gnu.org @ 2024-07-02 19:33 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115629

Richard Sandiford <rsandifo at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |rsandifo at gcc dot gnu.org

--- Comment #8 from Richard Sandiford <rsandifo at gcc dot gnu.org> ---
Perhaps I'm missing the point, but I think one of the issues here is that we
(still) don't model that MASK_LOAD sets inactive elements to zero.  Inactive
elements are currently undefined instead.  (I think Robin mentioned that
assuming zero is problematic for RVV, so we might need an explicit MASK_LOAD
argument for inactive elements, like for COND_ADD etc.)

So quoting the IL in comment 4:

  # loop_mask_63 = PHI <next_mask_95(10), max_mask_94(20)>
  vect__4.10_64 = .MASK_LOAD (vectp_a.8_53, 32B, loop_mask_63);
  mask__31.11_66 = vect__4.10_64 != { 0, ... };
  mask__56.12_67 = ~mask__31.11_66;
  vec_mask_and_70 = mask__56.12_67 & loop_mask_63;
  vect__7.15_71 = .MASK_LOAD (vectp_c.13_68, 32B, vec_mask_and_70);
  mask__22.16_73 = vect__7.15_71 == { 0, ... };
  mask__34.17_75 = vec_mask_and_70 & mask__22.16_73;

I think this and...

  vect_iftmp.20_78 = .MASK_LOAD (vectp_d.18_76, 32B, mask__34.17_75);
  vect__61.21_79 = vect__4.10_64 | vect__7.15_71;
  mask__35.22_81 = vect__61.21_79 != { 0, ... };
  vec_mask_and_84 = mask__35.22_81 & loop_mask_63;

...this have to be kept until we model inactive elements.

  vect_iftmp.25_85 = .MASK_LOAD (vectp_b.23_82, 32B, vec_mask_and_84);
  _86 = mask__34.17_75 & loop_mask_63;

This one is really curious though :)  Why does the code think that the loop
mask is needed here?  Does the code think the mask is needed for correctness,
or is the scalar_cond_masked_set optimisation misfiring?

  vect_iftmp.26_87 = VEC_COND_EXPR <_86, vect_iftmp.20_78, vect_iftmp.25_85>;
  .MASK_STORE (vectp_res.27_88, 32B, loop_mask_63, vect_iftmp.26_87);

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [Bug tree-optimization/115629] Inefficient if-convert of masked conditionals
  2024-06-25  5:44 [Bug tree-optimization/115629] New: Inefficient if-convert of masked conditionals tnfchris at gcc dot gnu.org
                   ` (8 preceding siblings ...)
  2024-07-02 19:33 ` rsandifo at gcc dot gnu.org
@ 2024-07-03  7:27 ` rguenth at gcc dot gnu.org
  9 siblings, 0 replies; 11+ messages in thread
From: rguenth at gcc dot gnu.org @ 2024-07-03  7:27 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115629

--- Comment #9 from Richard Biener <rguenth at gcc dot gnu.org> ---
I think we sort-of agreed to have .MASK_LOAD to have inactive lanes zeroed but
we never got around to formalizing that.  Note AVX512 supports zero-masking for
.MASK_STORE IIRC, not just merge.  Same for .MASK_LOAD where it supports
merging as well.  How would we ask the target whether it can do 'merge'?
zero and undefined could be done via predicates but merge?

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2024-07-03  7:27 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-06-25  5:44 [Bug tree-optimization/115629] New: Inefficient if-convert of masked conditionals tnfchris at gcc dot gnu.org
2024-06-25  5:47 ` [Bug tree-optimization/115629] " pinskia at gcc dot gnu.org
2024-06-25  9:41 ` rguenth at gcc dot gnu.org
2024-06-26 16:53 ` cvs-commit at gcc dot gnu.org
2024-06-26 16:55 ` rguenth at gcc dot gnu.org
2024-07-01 10:10 ` tnfchris at gcc dot gnu.org
2024-07-01 12:27 ` rguenther at suse dot de
2024-07-01 19:33 ` tnfchris at gcc dot gnu.org
2024-07-02  7:03 ` rguenth at gcc dot gnu.org
2024-07-02 19:33 ` rsandifo at gcc dot gnu.org
2024-07-03  7:27 ` rguenth at gcc dot gnu.org

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).