public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug target/111829] New: Redudant register moves inside the loop
@ 2023-10-16 5:54 crazylht at gmail dot com
2023-10-16 6:03 ` [Bug target/111829] " crazylht at gmail dot com
` (4 more replies)
0 siblings, 5 replies; 6+ messages in thread
From: crazylht at gmail dot com @ 2023-10-16 5:54 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111829
Bug ID: 111829
Summary: Redudant register moves inside the loop
Product: gcc
Version: 14.0
Status: UNCONFIRMED
Keywords: missed-optimization
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: crazylht at gmail dot com
Target Milestone: ---
Target: x86_64-*-* i?86-*-*
#include<immintrin.h>
/*
 * Reproducer for this report: accumulates _mm_dpbusd_epi32 results over a
 * fixed 100000 iterations, stores the vector accumulator through pc, and
 * returns the sum of its four __v4si elements.  The parameter n is never
 * read.
 */
int
foo (__m128i* __restrict pa, int* b,
__m128i* __restrict pc, int n)
{
__m128i vsum = _mm_setzero_si128();
/* Loop-carried accumulator: vsum is both an input and the result of each step. */
for (int i = 0; i != 100000; i++)
{
vsum = _mm_dpbusd_epi32 (vsum, pa[i], _mm_set1_epi32 (b[i]));
}
/* vsum stays live after the loop: stored whole here, then read element by element. */
*pc = vsum;
int ssum = 0;
for (int i = 0; i != 4; i++)
ssum += ((__v4si)vsum)[i];
return ssum;
}
gcc -O2 -mavxvnni
foo(long long __vector(2)*, int*, long long __vector(2)*, int):
leaq 400000(%rsi), %rax
vpxor %xmm0, %xmm0, %xmm0
.L2:
vmovdqa (%rdi), %xmm2
vmovdqa %xmm0, %xmm1 ---- redundant
addq $4, %rsi
addq $16, %rdi
vpbroadcastd -4(%rsi), %xmm3
{vex} vpdpbusd %xmm3, %xmm2, %xmm1
vmovdqa %xmm1, %xmm0 --- redundant
cmpq %rax, %rsi
jne .L2
vmovdqa %xmm1, (%rdx)
leaq -24(%rsp), %rax
leaq -8(%rsp), %rcx
xorl %edx, %edx
.L3:
vmovdqa %xmm0, -24(%rsp)
addq $4, %rax
addl -4(%rax), %edx
cmpq %rax, %rcx
jne .L3
movl %edx, %eax
ret
It could be better, for example:
foo(long long __vector(2)*, int*, long long __vector(2)*, int):
leaq 400000(%rsi), %rax
vpxor %xmm0, %xmm0, %xmm0
.L2:
vmovdqa (%rdi), %xmm2
addq $4, %rsi
addq $16, %rdi
vpbroadcastd -4(%rsi), %xmm3
{vex} vpdpbusd %xmm3, %xmm2, %xmm0
cmpq %rax, %rsi
jne .L2
vmovdqa %xmm0, (%rdx)
leaq -24(%rsp), %rax
leaq -8(%rsp), %rcx
xorl %edx, %edx
.L3:
vmovdqa %xmm0, -24(%rsp)
addq $4, %rax
addl -4(%rax), %edx
cmpq %rax, %rcx
jne .L3
movl %edx, %eax
ret
^ permalink raw reply [flat|nested] 6+ messages in thread
* [Bug target/111829] Redudant register moves inside the loop
2023-10-16 5:54 [Bug target/111829] New: Redudant register moves inside the loop crazylht at gmail dot com
@ 2023-10-16 6:03 ` crazylht at gmail dot com
2023-10-16 7:27 ` rguenth at gcc dot gnu.org
` (3 subsequent siblings)
4 siblings, 0 replies; 6+ messages in thread
From: crazylht at gmail dot com @ 2023-10-16 6:03 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111829
--- Comment #1 from Hongtao.liu <crazylht at gmail dot com> ---
ivtmp.23_31 = (unsigned long) b_24(D);
ivtmp.24_46 = (unsigned long) pa_26(D);
_50 = ivtmp.23_31 + 400000;
<bb 3> [local count: 1063004408]:
# vsum_35 = PHI <vsum_28(3), { 0, 0 }(2)>
# ivtmp.23_14 = PHI <ivtmp.23_15(3), ivtmp.23_31(2)>
# ivtmp.24_30 = PHI <ivtmp.24_45(3), ivtmp.24_46(2)>
_47 = (void *) ivtmp.23_14;
_4 = MEM[(int *)_47];
_25 = {_4, _4, _4, _4};
_48 = (void *) ivtmp.24_30;
_7 = MEM[(__m128i * {ref-all})_48];
_8 = VIEW_CONVERT_EXPR<__v4si>(_7);
_9 = VIEW_CONVERT_EXPR<__v4si>(vsum_35);
_27 = __builtin_ia32_vpdpbusd_v4si (_9, _8, _25);
vsum_28 = VIEW_CONVERT_EXPR<__m128i>(_27);
ivtmp.23_15 = ivtmp.23_14 + 4;
ivtmp.24_45 = ivtmp.24_30 + 16;
if (ivtmp.23_15 != _50)
goto <bb 3>; [98.99%]
else
goto <bb 4>; [1.01%]
<bb 4> [local count: 10737416]:
*pc_19(D) = vsum_28;
ivtmp.15_34 = (unsigned long) &vsum.0;
_13 = ivtmp.15_34 + 16;
<bb 5> [local count: 42949663]:
# ssum_38 = PHI <ssum_22(5), 0(4)>
# ivtmp.15_33 = PHI <ivtmp.15_32(5), ivtmp.15_34(4)>
I'm curious whether we can "move" the VIEW_CONVERT_EXPR outside of the loop, as below:
<bb 3> [local count: 1063004408]:
- # vsum_35 = PHI <vsum_28(3), { 0, 0 }(2)>
+ # _9 = PHI <_27(3), { 0, 0, 0, 0}(2)>
# ivtmp.23_14 = PHI <ivtmp.23_15(3), ivtmp.23_31(2)>
# ivtmp.24_30 = PHI <ivtmp.24_45(3), ivtmp.24_46(2)>
_47 = (void *) ivtmp.23_14;
_4 = MEM[(int *)_47];
_25 = {_4, _4, _4, _4};
_48 = (void *) ivtmp.24_30;
_7 = MEM[(__m128i * {ref-all})_48];
_8 = VIEW_CONVERT_EXPR<__v4si>(_7);
- _9 = VIEW_CONVERT_EXPR<__v4si>(vsum_35);
_27 = __builtin_ia32_vpdpbusd_v4si (_9, _8, _25);
- vsum_28 = VIEW_CONVERT_EXPR<__m128i>(_27);
ivtmp.23_15 = ivtmp.23_14 + 4;
ivtmp.24_45 = ivtmp.24_30 + 16;
if (ivtmp.23_15 != _50)
goto <bb 3>; [98.99%]
else
goto <bb 4>; [1.01%]
<bb 4> [local count: 10737416]:
+ vsum_28 = VIEW_CONVERT_EXPR<__m128i>(_27);
*pc_19(D) = vsum_28;
ivtmp.15_34 = (unsigned long) &vsum.0;
_13 = ivtmp.15_34 + 16;
<bb 5> [local count: 42949663]:
# ssum_38 = PHI <ssum_22(5), 0(4)>
# ivtmp.15_33 = PHI <ivtmp.15_32(5), ivtmp.15_34(4)>
It looks like a lazy code motion optimization, but it is currently not handled
by PRE.
^ permalink raw reply [flat|nested] 6+ messages in thread
* [Bug target/111829] Redudant register moves inside the loop
2023-10-16 5:54 [Bug target/111829] New: Redudant register moves inside the loop crazylht at gmail dot com
2023-10-16 6:03 ` [Bug target/111829] " crazylht at gmail dot com
@ 2023-10-16 7:27 ` rguenth at gcc dot gnu.org
2023-10-16 8:01 ` crazylht at gmail dot com
` (2 subsequent siblings)
4 siblings, 0 replies; 6+ messages in thread
From: rguenth at gcc dot gnu.org @ 2023-10-16 7:27 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111829
Richard Biener <rguenth at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Status|UNCONFIRMED |NEW
Ever confirmed|0 |1
Last reconfirmed| |2023-10-16
--- Comment #2 from Richard Biener <rguenth at gcc dot gnu.org> ---
You sink the conversion, so it would be PRE on the reverse graph. The
transform doesn't really fit a particular pass I think.
Why does the problem persist in RTL?
^ permalink raw reply [flat|nested] 6+ messages in thread
* [Bug target/111829] Redudant register moves inside the loop
2023-10-16 5:54 [Bug target/111829] New: Redudant register moves inside the loop crazylht at gmail dot com
2023-10-16 6:03 ` [Bug target/111829] " crazylht at gmail dot com
2023-10-16 7:27 ` rguenth at gcc dot gnu.org
@ 2023-10-16 8:01 ` crazylht at gmail dot com
2023-10-16 8:03 ` crazylht at gmail dot com
2023-10-16 17:19 ` pinskia at gcc dot gnu.org
4 siblings, 0 replies; 6+ messages in thread
From: crazylht at gmail dot com @ 2023-10-16 8:01 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111829
--- Comment #3 from Hongtao.liu <crazylht at gmail dot com> ---
(In reply to Richard Biener from comment #2)
> You sink the conversion, so it would be PRE on the reverse graph. The
> transform doesn't really fit a particular pass I think.
>
> Why does the problem persist in RTL?
Normally, combine will eliminate the redundant move by combining the subreg into
the pattern, like this:
1004(insn 19 17 21 3 (set (subreg:V4SI (reg/v:V2DI 103 [ vsum ]) 0)
1005 (unspec:V4SI [
1006 (subreg:V4SI (reg/v:V2DI 103 [ vsum ]) 0)
1007 (reg:V4SI 123 [ MEM[(__m128i * {ref-all})_52] ])
1008 (reg:V4SI 124)
1009 ] UNSPEC_VPDPBUSD)) "test.c":9:16 discrim 1 9182
{vpdpbusd_v4si}
But in this case, before combine, cse1/fwprop propagates the subreg (insn 21)
from inside the loop to outside it (insn 28). Since there is then another use of
(reg:V4SI 121), combine fails to eliminate the redundant move of the subreg.
------loop_begin----------
...
(insn 19 18 20 3 (set (reg:V4SI 121)
393 (unspec:V4SI [
394 (reg:V4SI 122 [ vsum ])
395 (reg:V4SI 123 [ MEM[(__m128i * {ref-all})_52] ])
396 (reg:V4SI 124)
397 ] UNSPEC_VPDPBUSD)) "test.c":9:16 discrim 1 9182 {vpdpbusd_v4si}
398 (expr_list:REG_DEAD (reg:V4SI 125)
399 (expr_list:REG_DEAD (reg:V4SI 123 [ MEM[(__m128i * {ref-all})_52] ])
400 (expr_list:REG_DEAD (reg:V4SI 122 [ vsum ])
401 (nil)))))
402(insn 20 19 21 3 (set (reg:V4SI 102 [ _11 ])
403 (reg:V4SI 121)) "test.c":9:16 discrim 1 1906 {movv4si_internal}
404 (expr_list:REG_DEAD (reg:V4SI 121)
405 (nil)))
406(insn 21 20 22 3 (set (reg/v:V2DI 103 [ vsum ])
407 (subreg:V2DI (reg:V4SI 121) 0)) "test.c":9:16 discrim 2 1909
{movv2di_internal}
408 (nil))
...
---------loop_end---------
453(note 27 26 28 4 [bb 4] NOTE_INSN_BASIC_BLOCK)
454(insn 28 27 29 4 (set (mem:V2DI (reg/v/f:DI 119 [ pc ]) [0 *pc_22(D)+0 S16
A128])
455 (subreg:V2DI (reg:V4SI 121) 0)) "test.c":11:9 1909
{movv2di_internal}
456 (expr_list:REG_DEAD (reg/v/f:DI 119 [ pc ]) --- propagated from insn 21
457 (expr_list:REG_DEAD (reg/v:V2DI 103 [ vsum ])
458 (nil))))
^ permalink raw reply [flat|nested] 6+ messages in thread
* [Bug target/111829] Redudant register moves inside the loop
2023-10-16 5:54 [Bug target/111829] New: Redudant register moves inside the loop crazylht at gmail dot com
` (2 preceding siblings ...)
2023-10-16 8:01 ` crazylht at gmail dot com
@ 2023-10-16 8:03 ` crazylht at gmail dot com
2023-10-16 17:19 ` pinskia at gcc dot gnu.org
4 siblings, 0 replies; 6+ messages in thread
From: crazylht at gmail dot com @ 2023-10-16 8:03 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111829
--- Comment #4 from Hongtao.liu <crazylht at gmail dot com> ---
(In reply to Richard Biener from comment #2)
> You sink the conversion, so it would be PRE on the reverse graph. The
> transform doesn't really fit a particular pass I think.
The conversions also need to be hoisted if the initial value is not the
constant v2di{0, 0}/v4si{0, 0, 0, 0}.
^ permalink raw reply [flat|nested] 6+ messages in thread
* [Bug target/111829] Redudant register moves inside the loop
2023-10-16 5:54 [Bug target/111829] New: Redudant register moves inside the loop crazylht at gmail dot com
` (3 preceding siblings ...)
2023-10-16 8:03 ` crazylht at gmail dot com
@ 2023-10-16 17:19 ` pinskia at gcc dot gnu.org
4 siblings, 0 replies; 6+ messages in thread
From: pinskia at gcc dot gnu.org @ 2023-10-16 17:19 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111829
--- Comment #5 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
I am 99% sure it is a dup of bug 94663 (and others).
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2023-10-16 17:19 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-10-16 5:54 [Bug target/111829] New: Redudant register moves inside the loop crazylht at gmail dot com
2023-10-16 6:03 ` [Bug target/111829] " crazylht at gmail dot com
2023-10-16 7:27 ` rguenth at gcc dot gnu.org
2023-10-16 8:01 ` crazylht at gmail dot com
2023-10-16 8:03 ` crazylht at gmail dot com
2023-10-16 17:19 ` pinskia at gcc dot gnu.org
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).