public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug tree-optimization/111793] New: OpenMP SIMD inbranch clones for AVX are highly sub-optimal
@ 2023-10-13  7:28 rguenth at gcc dot gnu.org
  2023-10-13  7:33 ` [Bug tree-optimization/111793] " rguenth at gcc dot gnu.org
                   ` (5 more replies)
  0 siblings, 6 replies; 7+ messages in thread
From: rguenth at gcc dot gnu.org @ 2023-10-13  7:28 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111793

            Bug ID: 111793
           Summary: OpenMP SIMD inbranch clones for AVX are highly
                    sub-optimal
           Product: gcc
           Version: 14.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: rguenth at gcc dot gnu.org
  Target Milestone: ---

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [Bug tree-optimization/111793] OpenMP SIMD inbranch clones for AVX are highly sub-optimal
  2023-10-13  7:28 [Bug tree-optimization/111793] New: OpenMP SIMD inbranch clones for AVX are highly sub-optimal rguenth at gcc dot gnu.org
@ 2023-10-13  7:33 ` rguenth at gcc dot gnu.org
  2023-10-13  7:43 ` [Bug tree-optimization/111793] OpenMP SIMD inbranch clones for AVX512 " rguenth at gcc dot gnu.org
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: rguenth at gcc dot gnu.org @ 2023-10-13  7:33 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111793

Richard Biener <rguenth at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Blocks|                            |53947
             Target|                            |x86_64-*-*
           Keywords|                            |missed-optimization

--- Comment #1 from Richard Biener <rguenth at gcc dot gnu.org> ---
#pragma omp declare simd simdlen(16) inbranch
__attribute__((noinline)) int
foo (int a, int b)
{
  return a + b;
}

we present if-conversion with the following which I think is good:

  <bb 3> [local count: 1073741824]:
  # iter.49_5 = PHI <0(2), iter.49_6(7)>
  # ivtmp_19 = PHI <16(2), ivtmp_18(7)>
  b_2 = b.45[iter.49_5];
  a_1 = a.44[iter.49_5];
  _8 = mask.48_7(D) >> iter.49_5;
  _9 = _8 & 1;
  if (_9 == 0)
    goto <bb 8>; [20.00%]
  else
    goto <bb 4>; [80.00%]

  <bb 8> [local count: 214748360]:
  goto <bb 5>; [100.00%]

  <bb 4> [local count: 858993464]:
  _3 = a_1 + b_2;
  retval.43[iter.49_5] = _3;

  <bb 5> [local count: 1073741824]:
  iter.49_6 = iter.49_5 + 1;
  ivtmp_18 = ivtmp_19 - 1;
  if (ivtmp_18 != 0)
    goto <bb 7>; [100.00%]
  else
    goto <bb 6>; [0.00%]

and we if-convert the body to

  <bb 3> [local count: 1073741824]:
  # iter.49_5 = PHI <iter.49_6(7), 0(15)>
  # ivtmp_19 = PHI <ivtmp_18(7), 16(15)>
  b_2 = b.45[iter.49_5];
  a_1 = a.44[iter.49_5];
  _8 = mask.48_7(D) >> iter.49_5;
  _9 = _8 & 1; 
  _22 = _9 == 0;
  _36 = (unsigned int) a_1;
  _37 = (unsigned int) b_2;
  _38 = _36 + _37;
  _3 = (int) _38;
  _39 = ~_22;
  _40 = &retval.43[iter.49_5];
  .MASK_STORE (_40, 32B, _39, _3);
  iter.49_6 = iter.49_5 + 1;
  ivtmp_18 = ivtmp_19 - 1;
  if (ivtmp_18 != 0)
    goto <bb 7>; [100.00%]
  else
    goto <bb 6>; [0.00%]

but rather than recovering the mask in its original form we vectorize this as

  vect_cst__50 = {mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D),
mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D),
mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D),
mask.48_7(D), mask.48_7(D)};

  <bb 3> [local count: 10631108]:
  # vect_vec_iv_.125_42 = PHI <_43(7), { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15 }(2)>
  # vectp_b.126_44 = PHI <vectp_b.126_45(7), &b.45(2)>
  # vectp_a.129_47 = PHI <vectp_a.129_48(7), &a.44(2)>
  # vectp_retval.140_61 = PHI <vectp_retval.140_62(7), &retval.43(2)>
  # ivtmp_64 = PHI <ivtmp_65(7), 0(2)>
  _43 = vect_vec_iv_.125_42 + { 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
16, 16, 16, 16 };
  vect_b_2.128_46 = MEM <vector(16) int> [(int *)vectp_b.126_44];
  vect_a_1.131_49 = MEM <vector(16) int> [(int *)vectp_a.129_47];
  vect__8.132_51 = vect_cst__50 >> vect_vec_iv_.125_42;
  vect__9.133_53 = vect__8.132_51 & { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1 };
  mask__22.134_55 = vect__9.133_53 == { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0 };
  vect__36.135_56 = VIEW_CONVERT_EXPR<vector(16) unsigned
int>(vect_a_1.131_49);
  vect__37.136_57 = VIEW_CONVERT_EXPR<vector(16) unsigned
int>(vect_b_2.128_46);
  vect__38.137_58 = vect__36.135_56 + vect__37.136_57;
  vect__3.138_59 = VIEW_CONVERT_EXPR<vector(16) int>(vect__38.137_58);
  mask__39.139_60 = ~mask__22.134_55;
  if (mask__39.139_60 == { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 })
    goto <bb 20>; [20.00%]
  else
    goto <bb 21>; [80.00%]

  <bb 21> [local count: 8504886]:
  .MASK_STORE (vectp_retval.140_61, 512B, mask__39.139_60, vect__3.138_59);

  <bb 20> [local count: 10631108]:
  vectp_b.126_45 = vectp_b.126_44 + 64;
  vectp_a.129_48 = vectp_a.129_47 + 64;
  vectp_retval.140_62 = vectp_retval.140_61 + 64;
  ivtmp_65 = ivtmp_64 + 1;
  if (ivtmp_65 < 1)
    goto <bb 7>; [0.00%]
  else
    goto <bb 17>; [100.00%]

in the end leaving us with

  <bb 2> [local count: 1073741824]:
  vect_cst__50 = {mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D),
mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D),
mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D),
mask.48_7(D), mask.48_7(D)};
  vect__8.132_51 = vect_cst__50 >> { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15 };
  vect__9.133_53 = vect__8.132_51 & { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1 };
  vect__36.135_56 = VIEW_CONVERT_EXPR<vector(16) unsigned int>(simd.46_13(D));
  vect__37.136_57 = VIEW_CONVERT_EXPR<vector(16) unsigned int>(simd.47_15(D));
  vect__38.137_58 = vect__36.135_56 + vect__37.136_57;
  vect__3.138_59 = VIEW_CONVERT_EXPR<vector(16) int>(vect__38.137_58);
  mask__39.139_60 = vect__9.133_53 != { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0 };
  if (mask__39.139_60 == { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 })
    goto <bb 4>; [20.00%]
  else
    goto <bb 3>; [80.00%]

  <bb 3> [local count: 858993419]:
  .MASK_STORE (&retval.43, 512B, mask__39.139_60, vect__3.138_59);

  <bb 4> [local count: 1073741824]:
  _10 = VIEW_CONVERT_EXPR<vector(16) int>(retval.43);
  return _10;

and

_ZGVeM16vv_foo:
.LFB4:
        .cfi_startproc
        movl    $1, %eax
        pushq   %rbp
        .cfi_def_cfa_offset 16 
        .cfi_offset 6, -16
        vpbroadcastd    %edi, %zmm2
        vpaddd  %zmm1, %zmm0, %zmm0
        vpbroadcastd    %eax, %zmm3
        movq    %rsp, %rbp
        .cfi_def_cfa_register 6
        andq    $-64, %rsp
        vpsrlvd .LC0(%rip), %zmm2, %zmm2
        vpxor   %xmm1, %xmm1, %xmm1
        vpandd  %zmm3, %zmm2, %zmm2
        vpcmpd  $4, %zmm1, %zmm2, %k1
        kortestw        %k1, %k1
        je      .L25
        vmovdqa32       %zmm0, -64(%rsp){%k1}
.L25:
        vmovdqa32       -64(%rsp), %zmm0
        leave
        .cfi_def_cfa 7, 8
        ret


Referenced Bugs:

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53947
[Bug 53947] [meta-bug] vectorizer missed-optimizations

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [Bug tree-optimization/111793] OpenMP SIMD inbranch clones for AVX512 are highly sub-optimal
  2023-10-13  7:28 [Bug tree-optimization/111793] New: OpenMP SIMD inbranch clones for AVX are highly sub-optimal rguenth at gcc dot gnu.org
  2023-10-13  7:33 ` [Bug tree-optimization/111793] " rguenth at gcc dot gnu.org
@ 2023-10-13  7:43 ` rguenth at gcc dot gnu.org
  2023-10-13  9:08 ` jakub at gcc dot gnu.org
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: rguenth at gcc dot gnu.org @ 2023-10-13  7:43 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111793

Richard Biener <rguenth at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|UNCONFIRMED                 |NEW
   Last reconfirmed|                            |2023-10-13
     Ever confirmed|0                           |1

--- Comment #2 from Richard Biener <rguenth at gcc dot gnu.org> ---
For AVX2 and simdlen(8) we get

  <bb 2> [local count: 1073741824]:
  vect__35.98_51 = VIEW_CONVERT_EXPR<vector(8) unsigned int>(simd.25_11(D));
  vect__36.99_52 = VIEW_CONVERT_EXPR<vector(8) unsigned int>(simd.26_13(D));
  vect__37.100_53 = vect__35.98_51 + vect__36.99_52;
  vect__3.101_54 = VIEW_CONVERT_EXPR<vector(8) int>(vect__37.100_53);
  mask__38.102_55 = mask.27_15(D) != { 0, 0, 0, 0, 0, 0, 0, 0 };
  if (mask__38.102_55 == { 0, 0, 0, 0, 0, 0, 0, 0 })
    goto <bb 4>; [20.00%]
  else
    goto <bb 3>; [80.00%]

  <bb 3> [local count: 858993419]:
  .MASK_STORE (&retval.21, 256B, mask__38.102_55, vect__3.101_54);

  <bb 4> [local count: 1073741824]:
  _8 = VIEW_CONVERT_EXPR<vector(8) int>(retval.21);
  return _8;

which is much more reasonable - I'm not sure whether the compare against 0
is required or if the ABI guarantees either 0 or -1 in the elements.  That
we end up with memory due to the use of a .MASK_STORE unfortunately persists
to the final assembly:

_ZGVdM8vv_foo:  
.LFB3:
        .cfi_startproc
        vpaddd  %ymm1, %ymm0, %ymm0
        vpxor   %xmm1, %xmm1, %xmm1
        pushq   %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset 6, -16
        vpcmpeqd        %ymm1, %ymm2, %ymm2
        movq    %rsp, %rbp
        .cfi_def_cfa_register 6
        andq    $-32, %rsp
        vpcmpeqd        %ymm1, %ymm2, %ymm2
        vptest  %ymm2, %ymm2
        je      .L12
        vpmaskmovd      %ymm0, %ymm2, -32(%rsp)
.L12:
        vmovdqa -32(%rsp), %ymm0
        leave
        .cfi_def_cfa 7, 8
        ret

there's the opportunity to maybe rewrite .MASK_STORE to a VEC_COND_EXPR
and rewriting retval.21 to SSA.  Maybe if-conversion can see this already
given retval.21 is local automatic and not address taken.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [Bug tree-optimization/111793] OpenMP SIMD inbranch clones for AVX512 are highly sub-optimal
  2023-10-13  7:28 [Bug tree-optimization/111793] New: OpenMP SIMD inbranch clones for AVX are highly sub-optimal rguenth at gcc dot gnu.org
  2023-10-13  7:33 ` [Bug tree-optimization/111793] " rguenth at gcc dot gnu.org
  2023-10-13  7:43 ` [Bug tree-optimization/111793] OpenMP SIMD inbranch clones for AVX512 " rguenth at gcc dot gnu.org
@ 2023-10-13  9:08 ` jakub at gcc dot gnu.org
  2023-10-13  9:13 ` jakub at gcc dot gnu.org
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: jakub at gcc dot gnu.org @ 2023-10-13  9:08 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111793

Jakub Jelinek <jakub at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |jakub at gcc dot gnu.org

--- Comment #3 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
(In reply to Richard Biener from comment #2)
> which is much more reasonable - I'm not sure whether the compare against 0
> is required or if the ABI guarantees either 0 or -1 in the elements.

Yes, it does.  All ones elements that is, the element type can be unsigned
integral or floating point as well...

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [Bug tree-optimization/111793] OpenMP SIMD inbranch clones for AVX512 are highly sub-optimal
  2023-10-13  7:28 [Bug tree-optimization/111793] New: OpenMP SIMD inbranch clones for AVX are highly sub-optimal rguenth at gcc dot gnu.org
                   ` (2 preceding siblings ...)
  2023-10-13  9:08 ` jakub at gcc dot gnu.org
@ 2023-10-13  9:13 ` jakub at gcc dot gnu.org
  2023-10-13  9:39 ` rguenth at gcc dot gnu.org
  2024-06-14 13:26 ` rguenth at gcc dot gnu.org
  5 siblings, 0 replies; 7+ messages in thread
From: jakub at gcc dot gnu.org @ 2023-10-13  9:13 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111793

--- Comment #4 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
So, shouldn't we match.pd (or something else) pattern match
  vect_cst__50 = {mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D),
mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D),
mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D),
mask.48_7(D), mask.48_7(D)};
  vect__8.132_51 = vect_cst__50 >> { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15 };
  vect__9.133_53 = vect__8.132_51 & { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1 };
  mask__39.139_60 = vect__9.133_53 != { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0 };
back into mask__39.139_60 = mask.48_7(D); ?

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [Bug tree-optimization/111793] OpenMP SIMD inbranch clones for AVX512 are highly sub-optimal
  2023-10-13  7:28 [Bug tree-optimization/111793] New: OpenMP SIMD inbranch clones for AVX are highly sub-optimal rguenth at gcc dot gnu.org
                   ` (3 preceding siblings ...)
  2023-10-13  9:13 ` jakub at gcc dot gnu.org
@ 2023-10-13  9:39 ` rguenth at gcc dot gnu.org
  2024-06-14 13:26 ` rguenth at gcc dot gnu.org
  5 siblings, 0 replies; 7+ messages in thread
From: rguenth at gcc dot gnu.org @ 2023-10-13  9:39 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111793

--- Comment #5 from Richard Biener <rguenth at gcc dot gnu.org> ---
(In reply to Jakub Jelinek from comment #4)
> So, shouldn't we match.pd (or something else) pattern match
>   vect_cst__50 = {mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D),
> mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D),
> mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D), mask.48_7(D),
> mask.48_7(D), mask.48_7(D)};
>   vect__8.132_51 = vect_cst__50 >> { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
> 12, 13, 14, 15 };
>   vect__9.133_53 = vect__8.132_51 & { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
> 1, 1, 1 };
>   mask__39.139_60 = vect__9.133_53 != { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
> 0, 0, 0, 0 };
> back into mask__39.139_60 = mask.48_7(D); ?

Yes, that's a possibility.  I wonder if it's possible to arrange things in the
vectorizer itself so that costing gets more accurate (probably not that
important for OMP SIMD though).

Maybe it works a bit better if we did mask & (1 << iv), but I guess we
canonicalize that back.

I've opened this for tracking for now, working on PR111795 first.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [Bug tree-optimization/111793] OpenMP SIMD inbranch clones for AVX512 are highly sub-optimal
  2023-10-13  7:28 [Bug tree-optimization/111793] New: OpenMP SIMD inbranch clones for AVX are highly sub-optimal rguenth at gcc dot gnu.org
                   ` (4 preceding siblings ...)
  2023-10-13  9:39 ` rguenth at gcc dot gnu.org
@ 2024-06-14 13:26 ` rguenth at gcc dot gnu.org
  5 siblings, 0 replies; 7+ messages in thread
From: rguenth at gcc dot gnu.org @ 2024-06-14 13:26 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111793

--- Comment #6 from Richard Biener <rguenth at gcc dot gnu.org> ---
non-openmp testcase for the AVX512 mask inefficiency:

unsigned foo (unsigned *a, unsigned short mask)
{
  unsigned sum = 0;
  for (int i = 0; i < 16; ++i)
    if ((mask >> i) & 1)
      sum += a[i];
  return sum;
}

I think the AVX2 one is settled as being as good as it can get.

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2024-06-14 13:26 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-10-13  7:28 [Bug tree-optimization/111793] New: OpenMP SIMD inbranch clones for AVX are highly sub-optimal rguenth at gcc dot gnu.org
2023-10-13  7:33 ` [Bug tree-optimization/111793] " rguenth at gcc dot gnu.org
2023-10-13  7:43 ` [Bug tree-optimization/111793] OpenMP SIMD inbranch clones for AVX512 " rguenth at gcc dot gnu.org
2023-10-13  9:08 ` jakub at gcc dot gnu.org
2023-10-13  9:13 ` jakub at gcc dot gnu.org
2023-10-13  9:39 ` rguenth at gcc dot gnu.org
2024-06-14 13:26 ` rguenth at gcc dot gnu.org

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).