public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug tree-optimization/115629] New: Inefficient if-convert of masked conditionals
@ 2024-06-25  5:44 tnfchris at gcc dot gnu.org
  2024-06-25  5:47 ` [Bug tree-optimization/115629] " pinskia at gcc dot gnu.org
                   ` (8 more replies)
  0 siblings, 9 replies; 10+ messages in thread
From: tnfchris at gcc dot gnu.org @ 2024-06-25  5:44 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115629

            Bug ID: 115629
           Summary: Inefficient if-convert of masked conditionals
           Product: gcc
           Version: 15.0
            Status: UNCONFIRMED
          Keywords: missed-optimization
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: tnfchris at gcc dot gnu.org
  Target Milestone: ---

With cases such as:

void foo1 (int *restrict a, int *restrict b, int *restrict c,
          int *restrict d, int *restrict res, int n)
{
    for (int i = 0; i < n; i++)
      res[i] = a[i] ? b[i] : (c[i] ? b[i] : d[i]);
}

we generate:

foo1:
        cmp     w5, 0
        ble     .L1
        mov     x6, 0
        whilelo p7.s, wzr, w5
        ptrue   p3.b, all
.L3:
        ld1w    z31.s, p7/z, [x0, x6, lsl 2]
        cmpeq   p6.s, p7/z, z31.s, #0
        cmpne   p5.s, p7/z, z31.s, #0
        ld1w    z0.s, p6/z, [x2, x6, lsl 2]
        ld1w    z30.s, p5/z, [x1, x6, lsl 2]
        cmpne   p15.s, p3/z, z0.s, #0
        orr     z0.d, z31.d, z0.d
        and     p6.b, p15/z, p6.b, p6.b
        cmpeq   p4.s, p7/z, z0.s, #0
        ld1w    z28.s, p6/z, [x1, x6, lsl 2]
        ld1w    z29.s, p4/z, [x3, x6, lsl 2]
        sel     z29.s, p15, z28.s, z29.s
        sel     z29.s, p5, z30.s, z29.s
        st1w    z29.s, p7, [x4, x6, lsl 2]
        incw    x6
        whilelo p7.s, w6, w5
        b.any   .L3

where b is loaded twice, once with the mask a[i] != 0, and once a[i] == 0 &&
c[i] != 0.
clearly we don't need the second compare nor the second load.  This loop can be
optimally handled as:

foo1:
        cmp     w5, 0
        ble     .L1
        mov     x6, 0
        cntw    x7
        whilelo p7.s, wzr, w5
        .p2align 5,,15
.L3:
        ld1w    z1.s, p7/z, [x0, x6, lsl 2]
        ld1w    z0.s, p7/z, [x2, x6, lsl 2]
        orr     z0.d, z1.d, z0.d
        cmpne   p6.s, p7/z, z0.s, #0
        cmpeq   p5.s, p7/z, z0.s, #0
        ld1w    z31.s, p6/z, [x1, x6, lsl 2]
        ld1w    z30.s, p5/z, [x3, x6, lsl 2]
        sel     z30.s, p6, z31.s, z30.s
        st1w    z30.s, p7, [x4, x6, lsl 2]
        add     x6, x6, x7
        whilelo p7.s, w6, w5
        b.any   .L3
.L1:
        ret

i.e. transform a ? b : (c ? b : d) into (a || c) ? b : d.

This transform is actually also beneficial for scalar, but that's not the case
when one of the conditions have to be inverted. i.e. cases 2 to 4 are
beneficial for vector masked operations but not scalar:

/* Convert a ? b : (c ? b : d) into (a || c) ? b : d.  */
void foo1 (int *restrict a, int *restrict b, int *restrict c,
          int *restrict d, int *restrict res, int n)
{
    for (int i = 0; i < n; i++)
      res[i] = a[i] ? b[i] : (c[i] ? b[i] : d[i]);
}

/* Convert a ? (c ? b : d) : b into (-a || c) ? b : d.  */
void foo2 (int *restrict a, int *restrict b, int *restrict c,
          int *restrict d, int *restrict res, int n)
{
    for (int i = 0; i < n; i++)
      res[i] = a[i] ? (c[i] ? b[i] : d[i]) : b[i];
}

/* Convert a ? (c ? d :b) : b into (-a || -c) ? b : d.  */
void foo3 (int *restrict a, int *restrict b, int *restrict c,
          int *restrict d, int *restrict res, int n)
{
    for (int i = 0; i < n; i++)
      res[i] = a[i] ? (c[i] ? d[i] : b[i]) : b[i];
}

/* Convert a ? b : (c ? d : b) into (a || -c) ? b : d.  */
void foo4 (int *restrict a, int *restrict b, int *restrict c,
          int *restrict d, int *restrict res, int n)
{
    for (int i = 0; i < n; i++)
      res[i] = a[i] ? b[i] : (c[i] ? d[i] : b[i]);
}

for instance case 3 is currently vectorized as:

foo3:
        cmp     w5, 0
        ble     .L10
        mov     x6, 0
        whilelo p7.s, wzr, w5
        ptrue   p3.b, all
.L12:
        ld1w    z1.s, p7/z, [x0, x6, lsl 2]
        cmpeq   p5.s, p7/z, z1.s, #0
        cmpne   p6.s, p7/z, z1.s, #0
        ld1w    z29.s, p5/z, [x1, x6, lsl 2]
        ld1w    z0.s, p6/z, [x2, x6, lsl 2]
        cmpne   p15.s, p3/z, z0.s, #0
        cmpeq   p4.s, p6/z, z0.s, #0
        and     p5.b, p15/z, p6.b, p6.b
        ld1w    z30.s, p4/z, [x1, x6, lsl 2]
        ld1w    z31.s, p5/z, [x3, x6, lsl 2]
        sel     z30.s, p15, z31.s, z30.s
        sel     z30.s, p6, z30.s, z29.s
        st1w    z30.s, p7, [x4, x6, lsl 2]
        incw    x6
        whilelo p7.s, w6, w5
        b.any   .L12

but can be

foo3:
        cmp     w5, 0
        ble     .L10
        mov     x6, 0
        cntw    x7
        whilelo p6.s, wzr, w5
        ptrue   p5.b, all
        .p2align 5,,15
.L12:
        ld1w    z29.s, p6/z, [x0, x6, lsl 2]
        ld1w    z28.s, p6/z, [x2, x6, lsl 2]
        cmpeq   p15.s, p5/z, z29.s, #0
        cmpeq   p14.s, p5/z, z28.s, #0
        orr     p15.b, p5/z, p15.b, p14.b
        and     p4.b, p6/z, p15.b, p15.b
        bic     p7.b, p5/z, p6.b, p15.b
        ld1w    z31.s, p4/z, [x1, x6, lsl 2]
        ld1w    z30.s, p7/z, [x3, x6, lsl 2]
        sel     z30.s, p4, z31.s, z30.s
        st1w    z30.s, p6, [x4, x6, lsl 2]
        add     x6, x6, x7
        whilelo p6.s, w6, w5
        b.any   .L12

with those match.pd rule, however this likely needs to be done in if-convert
since not all the transforms are beneficial for scalar because the negation
adds an extra instruction before the branch.

I think this would have to be done before conditional load lowering, because
we'd have to transform:

  _32 = _4 != 0;
  iftmp.0_22 = .MASK_LOAD (_5, 32B, _32);
  _61 = _4 == 0;
  _7 = .MASK_LOAD (_6, 32B, _61);
  _26 = _7 != 0;
  _13 = _26 & _61;
  iftmp.0_21 = .MASK_LOAD (_5, 32B, _13);
  _66 = _4 | _7;
  _36 = _66 == 0;
  iftmp.0_19 = .MASK_LOAD (_9, 32B, _36);
  _ifc__56 = _26 ? iftmp.0_21 : iftmp.0_19;
  iftmp.0_12 = _32 ? iftmp.0_22 : _ifc__56;

into

  _4 = *_3;
  _6 = *_5;
  _7 = _4 | _6;
  _35 = _7 != 0;
  iftmp.0_21 = .MASK_LOAD (_8, 32B, _35);
  _51 = _7 == 0;
  iftmp.0_19 = .MASK_LOAD (_9, 32B, _51);
  iftmp.0_12 = _35 ? iftmp.0_21 : iftmp.0_19;

which would be quite hard.

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2024-07-02 19:33 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-06-25  5:44 [Bug tree-optimization/115629] New: Inefficient if-convert of masked conditionals tnfchris at gcc dot gnu.org
2024-06-25  5:47 ` [Bug tree-optimization/115629] " pinskia at gcc dot gnu.org
2024-06-25  9:41 ` rguenth at gcc dot gnu.org
2024-06-26 16:53 ` cvs-commit at gcc dot gnu.org
2024-06-26 16:55 ` rguenth at gcc dot gnu.org
2024-07-01 10:10 ` tnfchris at gcc dot gnu.org
2024-07-01 12:27 ` rguenther at suse dot de
2024-07-01 19:33 ` tnfchris at gcc dot gnu.org
2024-07-02  7:03 ` rguenth at gcc dot gnu.org
2024-07-02 19:33 ` rsandifo at gcc dot gnu.org

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).