public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug target/114570] New: GCC doesn't perform good loop invariant code motion for very long vector operations.
@ 2024-04-03 7:59 liuhongt at gcc dot gnu.org
2024-04-03 11:35 ` [Bug middle-end/114570] " rguenth at gcc dot gnu.org
2024-04-03 16:52 ` pinskia at gcc dot gnu.org
0 siblings, 2 replies; 3+ messages in thread
From: liuhongt at gcc dot gnu.org @ 2024-04-03 7:59 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114570
Bug ID: 114570
Summary: GCC doesn't perform good loop invariant code motion
for very long vector operations.
Product: gcc
Version: 14.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: liuhongt at gcc dot gnu.org
Target Milestone: ---
typedef float v128_32 __attribute__((vector_size (128 * 4), aligned(2048)));
v128_32
foo (v128_32 a, v128_32 b, v128_32 c, int n)
{
for (int i = 0; i != 2048; i++)
{
a = a / c;
a = a / b;
}
return a;
}
<bb 3> [local count: 1063004408]:
# a_13 = PHI <a_9(3), a_3(D)(2)>
# ivtmp_2 = PHI <ivtmp_1(3), 2048(2)>
# DEBUG i => NULL
# DEBUG a => NULL
# DEBUG BEGIN_STMT
_14 = BIT_FIELD_REF <a_13, 256, 0>;
_15 = BIT_FIELD_REF <c_6(D), 256, 0>;
_10 = _14 / _15;
_11 = BIT_FIELD_REF <a_13, 256, 256>;
_12 = BIT_FIELD_REF <c_6(D), 256, 256>;
_16 = _11 / _12;
_17 = BIT_FIELD_REF <a_13, 256, 512>;
_18 = BIT_FIELD_REF <c_6(D), 256, 512>;
_19 = _17 / _18;
_20 = BIT_FIELD_REF <a_13, 256, 768>;
_21 = BIT_FIELD_REF <c_6(D), 256, 768>;
_22 = _20 / _21;
_23 = BIT_FIELD_REF <a_13, 256, 1024>;
_24 = BIT_FIELD_REF <c_6(D), 256, 1024>;
_25 = _23 / _24;
_26 = BIT_FIELD_REF <a_13, 256, 1280>;
_27 = BIT_FIELD_REF <c_6(D), 256, 1280>;
_28 = _26 / _27;
_29 = BIT_FIELD_REF <a_13, 256, 1536>;
_30 = BIT_FIELD_REF <c_6(D), 256, 1536>;
_31 = _29 / _30;
_32 = BIT_FIELD_REF <a_13, 256, 1792>;
_33 = BIT_FIELD_REF <c_6(D), 256, 1792>;
_34 = _32 / _33;
_35 = BIT_FIELD_REF <a_13, 256, 2048>;
_36 = BIT_FIELD_REF <c_6(D), 256, 2048>;
_37 = _35 / _36;
_38 = BIT_FIELD_REF <a_13, 256, 2304>;
_39 = BIT_FIELD_REF <c_6(D), 256, 2304>;
_40 = _38 / _39;
_41 = BIT_FIELD_REF <a_13, 256, 2560>;
_42 = BIT_FIELD_REF <c_6(D), 256, 2560>;
_43 = _41 / _42;
_44 = BIT_FIELD_REF <a_13, 256, 2816>;
_45 = BIT_FIELD_REF <c_6(D), 256, 2816>;
_46 = _44 / _45;
_47 = BIT_FIELD_REF <a_13, 256, 3072>;
_48 = BIT_FIELD_REF <c_6(D), 256, 3072>;
_49 = _47 / _48;
_50 = BIT_FIELD_REF <a_13, 256, 3328>;
_51 = BIT_FIELD_REF <c_6(D), 256, 3328>;
_52 = _50 / _51;
_53 = BIT_FIELD_REF <a_13, 256, 3584>;
_54 = BIT_FIELD_REF <c_6(D), 256, 3584>;
_55 = _53 / _54;
_56 = BIT_FIELD_REF <a_13, 256, 3840>;
_57 = BIT_FIELD_REF <c_6(D), 256, 3840>;
_58 = _56 / _57;
# DEBUG a => {_10, _16, _19, _22, _25, _28, _31, _34, _37, _40, _43, _46,
_49, _52, _55, _58}
# DEBUG BEGIN_STMT
_59 = BIT_FIELD_REF <b_8(D), 256, 0>;
_60 = _10 / _59;
_61 = BIT_FIELD_REF <b_8(D), 256, 256>;
_62 = _16 / _61;
_63 = BIT_FIELD_REF <b_8(D), 256, 512>;
_64 = _19 / _63;
_65 = BIT_FIELD_REF <b_8(D), 256, 768>;
_66 = _22 / _65;
_67 = BIT_FIELD_REF <b_8(D), 256, 1024>;
_68 = _25 / _67;
_69 = BIT_FIELD_REF <b_8(D), 256, 1280>;
_70 = _28 / _69;
_71 = BIT_FIELD_REF <b_8(D), 256, 1536>;
_72 = _31 / _71;
_73 = BIT_FIELD_REF <b_8(D), 256, 1792>;
_74 = _34 / _73;
_75 = BIT_FIELD_REF <b_8(D), 256, 2048>;
_76 = _37 / _75;
_77 = BIT_FIELD_REF <b_8(D), 256, 2304>;
_78 = _40 / _77;
_79 = BIT_FIELD_REF <b_8(D), 256, 2560>;
_80 = _43 / _79;
_81 = BIT_FIELD_REF <b_8(D), 256, 2816>;
_82 = _46 / _81;
_83 = BIT_FIELD_REF <b_8(D), 256, 3072>;
_84 = _49 / _83;
_85 = BIT_FIELD_REF <b_8(D), 256, 3328>;
_86 = _52 / _85;
_87 = BIT_FIELD_REF <b_8(D), 256, 3584>;
_88 = _55 / _87;
_89 = BIT_FIELD_REF <b_8(D), 256, 3840>;
_90 = _58 / _89;
a_9 = {_60, _62, _64, _66, _68, _70, _72, _74, _76, _78, _80, _82, _84, _86,
_88, _90};
# DEBUG a => a_9
# DEBUG BEGIN_STMT
# DEBUG i => NULL
# DEBUG a => a_9
# DEBUG BEGIN_STMT
ivtmp_1 = ivtmp_2 + 4294967295;
if (ivtmp_1 != 0)
goto <bb 3>; [98.99%]
else
goto <bb 4>; [1.01%]
Ideally, those BIT_FIELD_REFs could be hoisted out, and
# a_13 = PHI <a_9(3), a_3(D)(2)> could be optimized with those 256-bit vectors.
We finally generate
foo:
pushq %rbp
movq %rdi, %rax
movl $2048, %edx
movq %rsp, %rbp
subq $408, %rsp
leaq -120(%rsp), %r8
.L2:
vmovaps 16(%rbp), %ymm15
vmovaps 48(%rbp), %ymm14
movq %r8, %rsi
vdivps 1040(%rbp), %ymm15, %ymm15
vmovaps 80(%rbp), %ymm13
vmovaps 112(%rbp), %ymm12
vdivps 528(%rbp), %ymm15, %ymm15
vdivps 1072(%rbp), %ymm14, %ymm14
vmovaps 144(%rbp), %ymm11
vmovaps 176(%rbp), %ymm10
vdivps 560(%rbp), %ymm14, %ymm14
vdivps 1104(%rbp), %ymm13, %ymm13
vmovaps 208(%rbp), %ymm9
vmovaps 240(%rbp), %ymm8
vdivps 592(%rbp), %ymm13, %ymm13
vdivps 1136(%rbp), %ymm12, %ymm12
vmovaps 272(%rbp), %ymm7
vmovaps 304(%rbp), %ymm6
vdivps 624(%rbp), %ymm12, %ymm12
vdivps 1168(%rbp), %ymm11, %ymm11
vmovaps 336(%rbp), %ymm5
vdivps 656(%rbp), %ymm11, %ymm11
vdivps 1200(%rbp), %ymm10, %ymm10
vdivps 1232(%rbp), %ymm9, %ymm9
vdivps 688(%rbp), %ymm10, %ymm10
vdivps 720(%rbp), %ymm9, %ymm9
vdivps 1264(%rbp), %ymm8, %ymm8
vdivps 1296(%rbp), %ymm7, %ymm7
vdivps 752(%rbp), %ymm8, %ymm8
vdivps 784(%rbp), %ymm7, %ymm7
vdivps 1328(%rbp), %ymm6, %ymm6
movl $64, %ecx
vdivps 816(%rbp), %ymm6, %ymm6
leaq 16(%rbp), %rdi
vdivps 1360(%rbp), %ymm5, %ymm5
vdivps 848(%rbp), %ymm5, %ymm5
vmovaps 368(%rbp), %ymm4
vmovaps 400(%rbp), %ymm3
vdivps 1392(%rbp), %ymm4, %ymm4
vdivps 1424(%rbp), %ymm3, %ymm3
vmovaps 432(%rbp), %ymm2
vmovaps 464(%rbp), %ymm1
vdivps 880(%rbp), %ymm4, %ymm4
vdivps 912(%rbp), %ymm3, %ymm3
vmovaps 496(%rbp), %ymm0
vdivps 1456(%rbp), %ymm2, %ymm2
vdivps 1488(%rbp), %ymm1, %ymm1
vdivps 944(%rbp), %ymm2, %ymm2
vdivps 976(%rbp), %ymm1, %ymm1
vdivps 1520(%rbp), %ymm0, %ymm0
vmovaps %ymm15, -120(%rsp)
vdivps 1008(%rbp), %ymm0, %ymm0
vmovaps %ymm14, -88(%rsp)
vmovaps %ymm13, -56(%rsp)
vmovaps %ymm12, -24(%rsp)
vmovaps %ymm11, 8(%rsp)
vmovaps %ymm10, 40(%rsp)
vmovaps %ymm9, 72(%rsp)
vmovaps %ymm8, 104(%rsp)
vmovaps %ymm7, 136(%rsp)
vmovaps %ymm6, 168(%rsp)
vmovaps %ymm5, 200(%rsp)
vmovaps %ymm4, 232(%rsp)
vmovaps %ymm3, 264(%rsp)
vmovaps %ymm2, 296(%rsp)
vmovaps %ymm1, 328(%rsp)
vmovaps %ymm0, 360(%rsp)
rep movsq
subl $1, %edx
jne .L2
leaq 16(%rbp), %rsi
movl $64, %ecx
movq %rax, %rdi
rep movsq
vzeroupper
leave
ret
But it can be better with just
foo: # @foo
pushq %rbp
movq %rsp, %rbp
andq $-512, %rsp # imm = 0xFE00
subq $1536, %rsp # imm = 0x600
movq %rdi, %rax
vmovaps 496(%rbp), %ymm14
vmovaps 464(%rbp), %ymm13
vmovaps 432(%rbp), %ymm12
vmovaps 400(%rbp), %ymm11
vmovaps 368(%rbp), %ymm10
vmovaps 336(%rbp), %ymm9
vmovaps 304(%rbp), %ymm8
vmovaps 272(%rbp), %ymm7
vmovaps 240(%rbp), %ymm6
vmovaps 208(%rbp), %ymm5
vmovaps 176(%rbp), %ymm4
vmovaps 144(%rbp), %ymm3
vmovaps 16(%rbp), %ymm0
vmovaps %ymm0, 416(%rsp) # 32-byte Spill
vmovaps 48(%rbp), %ymm2
vmovaps 80(%rbp), %ymm15
vmovaps 112(%rbp), %ymm0
vmovaps %ymm0, 448(%rsp) # 32-byte Spill
movl $2048, %ecx # imm = 0x800
vmovaps 1008(%rbp), %ymm0
vmovaps %ymm0, 1472(%rsp) # 32-byte Spill
vmovaps 976(%rbp), %ymm1
vmovaps %ymm1, 1440(%rsp) # 32-byte Spill
vmovaps %ymm2, %ymm1
vmovaps 944(%rbp), %ymm2
vmovaps %ymm2, 1408(%rsp) # 32-byte Spill
vmovaps %ymm3, %ymm2
vmovaps 912(%rbp), %ymm3
vmovaps %ymm3, 1376(%rsp) # 32-byte Spill
vmovaps %ymm4, %ymm3
vmovaps 880(%rbp), %ymm4
vmovaps %ymm4, 1344(%rsp) # 32-byte Spill
vmovaps %ymm5, %ymm4
vmovaps 848(%rbp), %ymm5
vmovaps %ymm5, 1312(%rsp) # 32-byte Spill
vmovaps %ymm6, %ymm5
vmovaps 816(%rbp), %ymm6
vmovaps %ymm6, 1280(%rsp) # 32-byte Spill
vmovaps %ymm7, %ymm6
vmovaps 784(%rbp), %ymm7
vmovaps %ymm7, 1248(%rsp) # 32-byte Spill
vmovaps %ymm8, %ymm7
vmovaps 752(%rbp), %ymm8
vmovaps %ymm8, 1216(%rsp) # 32-byte Spill
vmovaps %ymm9, %ymm8
vmovaps 720(%rbp), %ymm9
vmovaps %ymm9, 1184(%rsp) # 32-byte Spill
vmovaps %ymm10, %ymm9
vmovaps 688(%rbp), %ymm10
vmovaps %ymm10, 1152(%rsp) # 32-byte Spill
vmovaps %ymm11, %ymm10
vmovaps 656(%rbp), %ymm11
vmovaps %ymm11, 1120(%rsp) # 32-byte Spill
vmovaps %ymm12, %ymm11
vmovaps 528(%rbp), %ymm12
vmovaps %ymm12, 1088(%rsp) # 32-byte Spill
vmovaps %ymm13, %ymm12
vmovaps 560(%rbp), %ymm13
vmovaps %ymm13, 1056(%rsp) # 32-byte Spill
vmovaps %ymm14, %ymm13
vmovaps 592(%rbp), %ymm14
vmovaps %ymm14, 1024(%rsp) # 32-byte Spill
vmovaps %ymm15, %ymm14
vmovaps 624(%rbp), %ymm15
vmovaps %ymm15, 992(%rsp) # 32-byte Spill
vmovaps 448(%rsp), %ymm15 # 32-byte Reload
vmovaps 1520(%rbp), %ymm0
vmovaps %ymm0, 960(%rsp) # 32-byte Spill
vmovaps 1488(%rbp), %ymm0
vmovaps %ymm0, 928(%rsp) # 32-byte Spill
vmovaps 1456(%rbp), %ymm0
vmovaps %ymm0, 896(%rsp) # 32-byte Spill
vmovaps 1424(%rbp), %ymm0
vmovaps %ymm0, 864(%rsp) # 32-byte Spill
vmovaps 1392(%rbp), %ymm0
vmovaps %ymm0, 832(%rsp) # 32-byte Spill
vmovaps 1360(%rbp), %ymm0
vmovaps %ymm0, 800(%rsp) # 32-byte Spill
vmovaps 1328(%rbp), %ymm0
vmovaps %ymm0, 768(%rsp) # 32-byte Spill
vmovaps 1296(%rbp), %ymm0
vmovaps %ymm0, 736(%rsp) # 32-byte Spill
vmovaps 1264(%rbp), %ymm0
vmovaps %ymm0, 704(%rsp) # 32-byte Spill
vmovaps 1232(%rbp), %ymm0
vmovaps %ymm0, 672(%rsp) # 32-byte Spill
vmovaps 1200(%rbp), %ymm0
vmovaps %ymm0, 640(%rsp) # 32-byte Spill
vmovaps 1168(%rbp), %ymm0
vmovaps %ymm0, 608(%rsp) # 32-byte Spill
vmovaps 1040(%rbp), %ymm0
vmovaps %ymm0, 576(%rsp) # 32-byte Spill
vmovaps 1072(%rbp), %ymm0
vmovaps %ymm0, 544(%rsp) # 32-byte Spill
vmovaps 1104(%rbp), %ymm0
vmovaps %ymm0, 512(%rsp) # 32-byte Spill
vmovaps 1136(%rbp), %ymm0
vmovaps %ymm0, 480(%rsp) # 32-byte Spill
.LBB0_1: # =>This Inner Loop Header: Depth=1
vdivps 960(%rsp), %ymm13, %ymm13 # 32-byte Folded Reload
vdivps 928(%rsp), %ymm12, %ymm12 # 32-byte Folded Reload
vdivps 896(%rsp), %ymm11, %ymm11 # 32-byte Folded Reload
vdivps 864(%rsp), %ymm10, %ymm10 # 32-byte Folded Reload
vdivps 832(%rsp), %ymm9, %ymm9 # 32-byte Folded Reload
vdivps 800(%rsp), %ymm8, %ymm8 # 32-byte Folded Reload
vdivps 768(%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
vdivps 736(%rsp), %ymm6, %ymm6 # 32-byte Folded Reload
vdivps 704(%rsp), %ymm5, %ymm5 # 32-byte Folded Reload
vdivps 672(%rsp), %ymm4, %ymm4 # 32-byte Folded Reload
vdivps 640(%rsp), %ymm3, %ymm3 # 32-byte Folded Reload
vdivps 608(%rsp), %ymm2, %ymm2 # 32-byte Folded Reload
vdivps 480(%rsp), %ymm15, %ymm15 # 32-byte Folded Reload
vdivps 512(%rsp), %ymm14, %ymm14 # 32-byte Folded Reload
vdivps 544(%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
vmovaps 416(%rsp), %ymm0 # 32-byte Reload
vdivps 576(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
vdivps 1088(%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
vmovaps %ymm0, 416(%rsp) # 32-byte Spill
vdivps 1056(%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
vdivps 1024(%rsp), %ymm14, %ymm14 # 32-byte Folded Reload
vdivps 992(%rsp), %ymm15, %ymm15 # 32-byte Folded Reload
vdivps 1120(%rsp), %ymm2, %ymm2 # 32-byte Folded Reload
vdivps 1152(%rsp), %ymm3, %ymm3 # 32-byte Folded Reload
vdivps 1184(%rsp), %ymm4, %ymm4 # 32-byte Folded Reload
vdivps 1216(%rsp), %ymm5, %ymm5 # 32-byte Folded Reload
vdivps 1248(%rsp), %ymm6, %ymm6 # 32-byte Folded Reload
vdivps 1280(%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
vdivps 1312(%rsp), %ymm8, %ymm8 # 32-byte Folded Reload
vdivps 1344(%rsp), %ymm9, %ymm9 # 32-byte Folded Reload
vdivps 1376(%rsp), %ymm10, %ymm10 # 32-byte Folded Reload
vdivps 1408(%rsp), %ymm11, %ymm11 # 32-byte Folded Reload
vdivps 1440(%rsp), %ymm12, %ymm12 # 32-byte Folded Reload
vdivps 1472(%rsp), %ymm13, %ymm13 # 32-byte Folded Reload
decl %ecx
jne .LBB0_1
vmovaps 416(%rsp), %ymm0 # 32-byte Reload
vmovaps %ymm0, (%rax)
vmovaps %ymm1, 32(%rax)
vmovaps %ymm14, 64(%rax)
vmovaps %ymm15, 96(%rax)
vmovaps %ymm2, 128(%rax)
vmovaps %ymm3, 160(%rax)
vmovaps %ymm4, 192(%rax)
vmovaps %ymm5, 224(%rax)
vmovaps %ymm6, 256(%rax)
vmovaps %ymm7, 288(%rax)
vmovaps %ymm8, 320(%rax)
vmovaps %ymm9, 352(%rax)
vmovaps %ymm10, 384(%rax)
vmovaps %ymm11, 416(%rax)
vmovaps %ymm12, 448(%rax)
vmovaps %ymm13, 480(%rax)
movq %rbp, %rsp
popq %rbp
vzeroupper
retq
^ permalink raw reply [flat|nested] 3+ messages in thread
* [Bug middle-end/114570] GCC doesn't perform good loop invariant code motion for very long vector operations.
2024-04-03 7:59 [Bug target/114570] New: GCC doesn't perform good loop invariant code motion for very long vector operations liuhongt at gcc dot gnu.org
@ 2024-04-03 11:35 ` rguenth at gcc dot gnu.org
2024-04-03 16:52 ` pinskia at gcc dot gnu.org
1 sibling, 0 replies; 3+ messages in thread
From: rguenth at gcc dot gnu.org @ 2024-04-03 11:35 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114570
Richard Biener <rguenth at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Status|UNCONFIRMED |NEW
Ever confirmed|0 |1
Last reconfirmed| |2024-04-03
--- Comment #1 from Richard Biener <rguenth at gcc dot gnu.org> ---
There's no (gimple) invariant motion after vector operation lowering. RTL
invariant motion should see this though but it might have a prohibiting cost
model (or doesn't handle [stack] memory?).
There are other reasons we want to move vector lowering earlier, which might then
also catch invariant motion opportunities.
^ permalink raw reply [flat|nested] 3+ messages in thread
* [Bug middle-end/114570] GCC doesn't perform good loop invariant code motion for very long vector operations.
2024-04-03 7:59 [Bug target/114570] New: GCC doesn't perform good loop invariant code motion for very long vector operations liuhongt at gcc dot gnu.org
2024-04-03 11:35 ` [Bug middle-end/114570] " rguenth at gcc dot gnu.org
@ 2024-04-03 16:52 ` pinskia at gcc dot gnu.org
1 sibling, 0 replies; 3+ messages in thread
From: pinskia at gcc dot gnu.org @ 2024-04-03 16:52 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114570
Andrew Pinski <pinskia at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Status|NEW |RESOLVED
Resolution|--- |DUPLICATE
--- Comment #2 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
Dup. (there might be another too).
*** This bug has been marked as a duplicate of bug 107916 ***
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2024-04-03 16:52 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-04-03 7:59 [Bug target/114570] New: GCC doesn't perform good loop invariant code motion for very long vector operations liuhongt at gcc dot gnu.org
2024-04-03 11:35 ` [Bug middle-end/114570] " rguenth at gcc dot gnu.org
2024-04-03 16:52 ` pinskia at gcc dot gnu.org
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).