public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug target/111829] New: Redudant register moves inside the loop
@ 2023-10-16  5:54 crazylht at gmail dot com
  2023-10-16  6:03 ` [Bug target/111829] " crazylht at gmail dot com
                   ` (4 more replies)
  0 siblings, 5 replies; 6+ messages in thread
From: crazylht at gmail dot com @ 2023-10-16  5:54 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111829

            Bug ID: 111829
           Summary: Redudant register moves inside the loop
           Product: gcc
           Version: 14.0
            Status: UNCONFIRMED
          Keywords: missed-optimization
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: crazylht at gmail dot com
  Target Milestone: ---
            Target: x86_64-*-* i?86-*-*

#include<immintrin.h>
/* Reproducer for the missed optimization: accumulates _mm_dpbusd_epi32
   over 100000 iterations into vsum, stores the vector result to *pc and
   returns the sum of its four lanes.  The first loop reads pa[0..99999]
   and b[0..99999]; the parameter n is not used.  */
int
foo (__m128i* __restrict pa, int* b,
 __m128i* __restrict pc, int n)
{
    __m128i vsum = _mm_setzero_si128();
    for (int i = 0; i != 100000; i++)
    {
        /* vsum is both the accumulator input and the result, so it is the
           loop-carried value; b[i] is broadcast to every lane before use.  */
        vsum = _mm_dpbusd_epi32 (vsum, pa[i], _mm_set1_epi32 (b[i]));
    }
    *pc = vsum;
    /* Horizontal sum: view vsum as a __v4si and add up elements 0..3.  */
    int ssum = 0;
    for (int i = 0; i != 4; i++)
      ssum += ((__v4si)vsum)[i];
    return ssum;
}

gcc -O2 -mavxvnni

foo(long long __vector(2)*, int*, long long __vector(2)*, int):
        leaq    400000(%rsi), %rax
        vpxor   %xmm0, %xmm0, %xmm0
.L2:
        vmovdqa (%rdi), %xmm2
        vmovdqa %xmm0, %xmm1 ---- redundant
        addq    $4, %rsi
        addq    $16, %rdi
        vpbroadcastd    -4(%rsi), %xmm3
        {vex} vpdpbusd  %xmm3, %xmm2, %xmm1
        vmovdqa %xmm1, %xmm0 --- redundant
        cmpq    %rax, %rsi
        jne     .L2
        vmovdqa %xmm1, (%rdx)
        leaq    -24(%rsp), %rax
        leaq    -8(%rsp), %rcx
        xorl    %edx, %edx
.L3:
        vmovdqa %xmm0, -24(%rsp)
        addq    $4, %rax
        addl    -4(%rax), %edx
        cmpq    %rax, %rcx
        jne     .L3
        movl    %edx, %eax
        ret


It could be better with:


foo(long long __vector(2)*, int*, long long __vector(2)*, int):
        leaq    400000(%rsi), %rax
        vpxor   %xmm0, %xmm0, %xmm0
.L2:
        vmovdqa (%rdi), %xmm2

        addq    $4, %rsi
        addq    $16, %rdi
        vpbroadcastd    -4(%rsi), %xmm3
        {vex} vpdpbusd  %xmm3, %xmm2, %xmm0
        cmpq    %rax, %rsi
        jne     .L2
        vmovdqa %xmm0, (%rdx)
        leaq    -24(%rsp), %rax
        leaq    -8(%rsp), %rcx
        xorl    %edx, %edx
.L3:
        vmovdqa %xmm0, -24(%rsp)
        addq    $4, %rax
        addl    -4(%rax), %edx
        cmpq    %rax, %rcx
        jne     .L3
        movl    %edx, %eax
        ret

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [Bug target/111829] Redudant register moves inside the loop
  2023-10-16  5:54 [Bug target/111829] New: Redudant register moves inside the loop crazylht at gmail dot com
@ 2023-10-16  6:03 ` crazylht at gmail dot com
  2023-10-16  7:27 ` rguenth at gcc dot gnu.org
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: crazylht at gmail dot com @ 2023-10-16  6:03 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111829

--- Comment #1 from Hongtao.liu <crazylht at gmail dot com> ---
  ivtmp.23_31 = (unsigned long) b_24(D);
  ivtmp.24_46 = (unsigned long) pa_26(D);
  _50 = ivtmp.23_31 + 400000;

  <bb 3> [local count: 1063004408]:
  # vsum_35 = PHI <vsum_28(3), { 0, 0 }(2)>
  # ivtmp.23_14 = PHI <ivtmp.23_15(3), ivtmp.23_31(2)>
  # ivtmp.24_30 = PHI <ivtmp.24_45(3), ivtmp.24_46(2)>
  _47 = (void *) ivtmp.23_14;
  _4 = MEM[(int *)_47];
  _25 = {_4, _4, _4, _4};
  _48 = (void *) ivtmp.24_30;
  _7 = MEM[(__m128i * {ref-all})_48];
  _8 = VIEW_CONVERT_EXPR<__v4si>(_7);
  _9 = VIEW_CONVERT_EXPR<__v4si>(vsum_35);
  _27 = __builtin_ia32_vpdpbusd_v4si (_9, _8, _25);
  vsum_28 = VIEW_CONVERT_EXPR<__m128i>(_27);
  ivtmp.23_15 = ivtmp.23_14 + 4;
  ivtmp.24_45 = ivtmp.24_30 + 16;
  if (ivtmp.23_15 != _50)
    goto <bb 3>; [98.99%]
  else
    goto <bb 4>; [1.01%]

  <bb 4> [local count: 10737416]:
  *pc_19(D) = vsum_28;
  ivtmp.15_34 = (unsigned long) &vsum.0;
  _13 = ivtmp.15_34 + 16;

  <bb 5> [local count: 42949663]:
  # ssum_38 = PHI <ssum_22(5), 0(4)>
  # ivtmp.15_33 = PHI <ivtmp.15_32(5), ivtmp.15_34(4)>

I'm curious if we can "move" VIEW_CONVERT_EXPR outside of the loop as below

  <bb 3> [local count: 1063004408]:
-  # vsum_35 = PHI <vsum_28(3), { 0, 0 }(2)>
+  # _9 = PHI <_27(3), { 0, 0, 0, 0}(2)>
  # ivtmp.23_14 = PHI <ivtmp.23_15(3), ivtmp.23_31(2)>
  # ivtmp.24_30 = PHI <ivtmp.24_45(3), ivtmp.24_46(2)>
  _47 = (void *) ivtmp.23_14;
  _4 = MEM[(int *)_47];
  _25 = {_4, _4, _4, _4};
  _48 = (void *) ivtmp.24_30;
  _7 = MEM[(__m128i * {ref-all})_48];
  _8 = VIEW_CONVERT_EXPR<__v4si>(_7);
-  _9 = VIEW_CONVERT_EXPR<__v4si>(vsum_35);
  _27 = __builtin_ia32_vpdpbusd_v4si (_9, _8, _25);
-  vsum_28 = VIEW_CONVERT_EXPR<__m128i>(_27);
  ivtmp.23_15 = ivtmp.23_14 + 4;
  ivtmp.24_45 = ivtmp.24_30 + 16;
  if (ivtmp.23_15 != _50)
    goto <bb 3>; [98.99%]
  else
    goto <bb 4>; [1.01%]

  <bb 4> [local count: 10737416]:
+  vsum_28 = VIEW_CONVERT_EXPR<__m128i>(_27);
  *pc_19(D) = vsum_28;
  ivtmp.15_34 = (unsigned long) &vsum.0;
  _13 = ivtmp.15_34 + 16;

  <bb 5> [local count: 42949663]:
  # ssum_38 = PHI <ssum_22(5), 0(4)>
  # ivtmp.15_33 = PHI <ivtmp.15_32(5), ivtmp.15_34(4)>


It looks like a lazy code motion optimization, but it is currently not handled by
PRE.

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [Bug target/111829] Redudant register moves inside the loop
  2023-10-16  5:54 [Bug target/111829] New: Redudant register moves inside the loop crazylht at gmail dot com
  2023-10-16  6:03 ` [Bug target/111829] " crazylht at gmail dot com
@ 2023-10-16  7:27 ` rguenth at gcc dot gnu.org
  2023-10-16  8:01 ` crazylht at gmail dot com
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: rguenth at gcc dot gnu.org @ 2023-10-16  7:27 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111829

Richard Biener <rguenth at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|UNCONFIRMED                 |NEW
     Ever confirmed|0                           |1
   Last reconfirmed|                            |2023-10-16

--- Comment #2 from Richard Biener <rguenth at gcc dot gnu.org> ---
You sink the conversion, so it would be PRE on the reverse graph.  The
transform doesn't really fit a particular pass I think.

Why does the problem persist in RTL?

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [Bug target/111829] Redudant register moves inside the loop
  2023-10-16  5:54 [Bug target/111829] New: Redudant register moves inside the loop crazylht at gmail dot com
  2023-10-16  6:03 ` [Bug target/111829] " crazylht at gmail dot com
  2023-10-16  7:27 ` rguenth at gcc dot gnu.org
@ 2023-10-16  8:01 ` crazylht at gmail dot com
  2023-10-16  8:03 ` crazylht at gmail dot com
  2023-10-16 17:19 ` pinskia at gcc dot gnu.org
  4 siblings, 0 replies; 6+ messages in thread
From: crazylht at gmail dot com @ 2023-10-16  8:01 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111829

--- Comment #3 from Hongtao.liu <crazylht at gmail dot com> ---
(In reply to Richard Biener from comment #2)
> You sink the conversion, so it would be PRE on the reverse graph.  The
> transform doesn't really fit a particular pass I think.
> 
> Why does the problem persist in RTL?
Normally, combine will eliminate the redundant move by combining the subreg into
the pattern, like:

1004(insn 19 17 21 3 (set (subreg:V4SI (reg/v:V2DI 103 [ vsum ]) 0)
1005        (unspec:V4SI [
1006                (subreg:V4SI (reg/v:V2DI 103 [ vsum ]) 0)
1007                (reg:V4SI 123 [ MEM[(__m128i * {ref-all})_52] ])
1008                (reg:V4SI 124)
1009            ] UNSPEC_VPDPBUSD)) "test.c":9:16 discrim 1 9182
{vpdpbusd_v4si}

But in this case, before combine, cse1/fwprop propagates the subreg (insn 21)
from inside the loop to outside it (insn 28). Since there is still a use of
(reg:V4SI 121), combine fails to eliminate the redundant move of the subreg.

------loop_begin----------
...
(insn 19 18 20 3 (set (reg:V4SI 121)
393        (unspec:V4SI [
394                (reg:V4SI 122 [ vsum ])
395                (reg:V4SI 123 [ MEM[(__m128i * {ref-all})_52] ])
396                (reg:V4SI 124)
397            ] UNSPEC_VPDPBUSD)) "test.c":9:16 discrim 1 9182 {vpdpbusd_v4si}
398     (expr_list:REG_DEAD (reg:V4SI 125)
399        (expr_list:REG_DEAD (reg:V4SI 123 [ MEM[(__m128i * {ref-all})_52] ])
400            (expr_list:REG_DEAD (reg:V4SI 122 [ vsum ])
401                (nil)))))
402(insn 20 19 21 3 (set (reg:V4SI 102 [ _11 ])
403        (reg:V4SI 121)) "test.c":9:16 discrim 1 1906 {movv4si_internal}
404     (expr_list:REG_DEAD (reg:V4SI 121)
405        (nil)))
406(insn 21 20 22 3 (set (reg/v:V2DI 103 [ vsum ])
407        (subreg:V2DI (reg:V4SI 121) 0)) "test.c":9:16 discrim 2 1909
{movv2di_internal}
408     (nil))
...
---------loop_end---------

453(note 27 26 28 4 [bb 4] NOTE_INSN_BASIC_BLOCK)
454(insn 28 27 29 4 (set (mem:V2DI (reg/v/f:DI 119 [ pc ]) [0 *pc_22(D)+0 S16
A128])
455        (subreg:V2DI (reg:V4SI 121) 0)) "test.c":11:9 1909
{movv2di_internal}
456     (expr_list:REG_DEAD (reg/v/f:DI 119 [ pc ]) --- propagated from insn 21
457        (expr_list:REG_DEAD (reg/v:V2DI 103 [ vsum ])
458            (nil))))

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [Bug target/111829] Redudant register moves inside the loop
  2023-10-16  5:54 [Bug target/111829] New: Redudant register moves inside the loop crazylht at gmail dot com
                   ` (2 preceding siblings ...)
  2023-10-16  8:01 ` crazylht at gmail dot com
@ 2023-10-16  8:03 ` crazylht at gmail dot com
  2023-10-16 17:19 ` pinskia at gcc dot gnu.org
  4 siblings, 0 replies; 6+ messages in thread
From: crazylht at gmail dot com @ 2023-10-16  8:03 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111829

--- Comment #4 from Hongtao.liu <crazylht at gmail dot com> ---
(In reply to Richard Biener from comment #2)
> You sink the conversion, so it would be PRE on the reverse graph.  The
> transform doesn't really fit a particular pass I think.
The conversions also need to be hoisted if the initial value is not the
constant v2di{0, 0}/v4si{0, 0, 0, 0}.

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [Bug target/111829] Redudant register moves inside the loop
  2023-10-16  5:54 [Bug target/111829] New: Redudant register moves inside the loop crazylht at gmail dot com
                   ` (3 preceding siblings ...)
  2023-10-16  8:03 ` crazylht at gmail dot com
@ 2023-10-16 17:19 ` pinskia at gcc dot gnu.org
  4 siblings, 0 replies; 6+ messages in thread
From: pinskia at gcc dot gnu.org @ 2023-10-16 17:19 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111829

--- Comment #5 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
I am 99% sure it is a dup of bug 94663 (and others).

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2023-10-16 17:19 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-10-16  5:54 [Bug target/111829] New: Redudant register moves inside the loop crazylht at gmail dot com
2023-10-16  6:03 ` [Bug target/111829] " crazylht at gmail dot com
2023-10-16  7:27 ` rguenth at gcc dot gnu.org
2023-10-16  8:01 ` crazylht at gmail dot com
2023-10-16  8:03 ` crazylht at gmail dot com
2023-10-16 17:19 ` pinskia at gcc dot gnu.org

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).