https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114570

            Bug ID: 114570
           Summary: GCC doesn't perform good loop invariant code motion
                    for very long vector operations.
           Product: gcc
           Version: 14.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: liuhongt at gcc dot gnu.org
  Target Milestone: ---

typedef float v128_32 __attribute__((vector_size (128 * 4), aligned(2048)));

v128_32
foo (v128_32 a, v128_32 b, v128_32 c, int n)
{
  for (int i = 0; i != 2048; i++)
    {
      a = a / c;
      a = a / b;
    }
  return a;
}
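For reference, the optimized-GIMPLE dump below looks like the output of
-fdump-tree-optimized with debug statements enabled, so presumably something
close to

    gcc -O2 -mavx -g -fdump-tree-optimized test.c

reproduces it (the file name and the -mavx flag are my assumptions; the ISA
flag is inferred from the 256-bit %ymm code shown further down).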
  <bb ...> [local count: 1063004408]:
  # a_13 = PHI <...>
  # ivtmp_2 = PHI <...>
  # DEBUG i => NULL
  # DEBUG a => NULL
  # DEBUG BEGIN_STMT
  _14 = BIT_FIELD_REF <...>;
  _15 = BIT_FIELD_REF <...>;
  _10 = _14 / _15;
  _11 = BIT_FIELD_REF <...>;
  _12 = BIT_FIELD_REF <...>;
  _16 = _11 / _12;
  _17 = BIT_FIELD_REF <...>;
  _18 = BIT_FIELD_REF <...>;
  _19 = _17 / _18;
  _20 = BIT_FIELD_REF <...>;
  _21 = BIT_FIELD_REF <...>;
  _22 = _20 / _21;
  _23 = BIT_FIELD_REF <...>;
  _24 = BIT_FIELD_REF <...>;
  _25 = _23 / _24;
  _26 = BIT_FIELD_REF <...>;
  _27 = BIT_FIELD_REF <...>;
  _28 = _26 / _27;
  _29 = BIT_FIELD_REF <...>;
  _30 = BIT_FIELD_REF <...>;
  _31 = _29 / _30;
  _32 = BIT_FIELD_REF <...>;
  _33 = BIT_FIELD_REF <...>;
  _34 = _32 / _33;
  _35 = BIT_FIELD_REF <...>;
  _36 = BIT_FIELD_REF <...>;
  _37 = _35 / _36;
  _38 = BIT_FIELD_REF <...>;
  _39 = BIT_FIELD_REF <...>;
  _40 = _38 / _39;
  _41 = BIT_FIELD_REF <...>;
  _42 = BIT_FIELD_REF <...>;
  _43 = _41 / _42;
  _44 = BIT_FIELD_REF <...>;
  _45 = BIT_FIELD_REF <...>;
  _46 = _44 / _45;
  _47 = BIT_FIELD_REF <...>;
  _48 = BIT_FIELD_REF <...>;
  _49 = _47 / _48;
  _50 = BIT_FIELD_REF <...>;
  _51 = BIT_FIELD_REF <...>;
  _52 = _50 / _51;
  _53 = BIT_FIELD_REF <...>;
  _54 = BIT_FIELD_REF <...>;
  _55 = _53 / _54;
  _56 = BIT_FIELD_REF <...>;
  _57 = BIT_FIELD_REF <...>;
  _58 = _56 / _57;
  # DEBUG a => {_10, _16, _19, _22, _25, _28, _31, _34, _37, _40, _43, _46, _49, _52, _55, _58}
  # DEBUG BEGIN_STMT
  _59 = BIT_FIELD_REF <...>;
  _60 = _10 / _59;
  _61 = BIT_FIELD_REF <...>;
  _62 = _16 / _61;
  _63 = BIT_FIELD_REF <...>;
  _64 = _19 / _63;
  _65 = BIT_FIELD_REF <...>;
  _66 = _22 / _65;
  _67 = BIT_FIELD_REF <...>;
  _68 = _25 / _67;
  _69 = BIT_FIELD_REF <...>;
  _70 = _28 / _69;
  _71 = BIT_FIELD_REF <...>;
  _72 = _31 / _71;
  _73 = BIT_FIELD_REF <...>;
  _74 = _34 / _73;
  _75 = BIT_FIELD_REF <...>;
  _76 = _37 / _75;
  _77 = BIT_FIELD_REF <...>;
  _78 = _40 / _77;
  _79 = BIT_FIELD_REF <...>;
  _80 = _43 / _79;
  _81 = BIT_FIELD_REF <...>;
  _82 = _46 / _81;
  _83 = BIT_FIELD_REF <...>;
  _84 = _49 / _83;
  _85 = BIT_FIELD_REF <...>;
  _86 = _52 / _85;
  _87 = BIT_FIELD_REF <...>;
  _88 = _55 / _87;
  _89 = BIT_FIELD_REF <...>;
  _90 = _58 / _89;
  a_9 = {_60, _62, _64, _66, _68, _70, _72, _74, _76, _78, _80, _82, _84, _86, _88, _90};
  # DEBUG a => a_9
  # DEBUG BEGIN_STMT
  # DEBUG i => NULL
  # DEBUG a => a_9
  # DEBUG BEGIN_STMT
  ivtmp_1 = ivtmp_2 + 4294967295;
  if (ivtmp_1 != 0)
    goto <bb ...>; [98.99%]
  else
    goto <bb ...>; [1.01%]

Ideally, those BIT_FIELD_REFs can be hoisted out of the loop, and the
# a_13 = PHI <...> can then be optimized to work on those 256-bit vectors.
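As a rough source-level sketch of that hoisting (the v8sf chunk type, the
foo_hoisted name, and the memcpy-based decomposition are illustrative
choices of mine, not what the compiler would actually emit), the idea is to
split the 4096-bit vectors into native 256-bit pieces once, outside the loop:

    typedef float v8sf __attribute__((vector_size (32)));

    v128_32
    foo_hoisted (v128_32 a, v128_32 b, v128_32 c, int n)
    {
      /* Hoisted: decompose each 4096-bit vector into sixteen 256-bit
         chunks once, before the loop.  */
      v8sf av[16], bv[16], cv[16];
      for (int j = 0; j != 16; j++)
        {
          __builtin_memcpy (&av[j], (char *) &a + 32 * j, 32);
          __builtin_memcpy (&bv[j], (char *) &b + 32 * j, 32);
          __builtin_memcpy (&cv[j], (char *) &c + 32 * j, 32);
        }

      /* The loop body now only divides native-width chunks; no
         per-iteration extraction or recomposition is needed.  */
      for (int i = 0; i != 2048; i++)
        for (int j = 0; j != 16; j++)
          {
            av[j] = av[j] / cv[j];
            av[j] = av[j] / bv[j];
          }

      /* Recompose the full-width result once, after the loop.  */
      for (int j = 0; j != 16; j++)
        __builtin_memcpy ((char *) &a + 32 * j, &av[j], 32);
      return a;
    }

The better code sequence at the end of this report has essentially this
shape: all operands are loaded into %ymm registers and stack slots once,
the loop body is pure vdivps, and the result is stored once after the loop.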
We finally generate:

foo:
        pushq   %rbp
        movq    %rdi, %rax
        movl    $2048, %edx
        movq    %rsp, %rbp
        subq    $408, %rsp
        leaq    -120(%rsp), %r8
.L2:
        vmovaps 16(%rbp), %ymm15
        vmovaps 48(%rbp), %ymm14
        movq    %r8, %rsi
        vdivps  1040(%rbp), %ymm15, %ymm15
        vmovaps 80(%rbp), %ymm13
        vmovaps 112(%rbp), %ymm12
        vdivps  528(%rbp), %ymm15, %ymm15
        vdivps  1072(%rbp), %ymm14, %ymm14
        vmovaps 144(%rbp), %ymm11
        vmovaps 176(%rbp), %ymm10
        vdivps  560(%rbp), %ymm14, %ymm14
        vdivps  1104(%rbp), %ymm13, %ymm13
        vmovaps 208(%rbp), %ymm9
        vmovaps 240(%rbp), %ymm8
        vdivps  592(%rbp), %ymm13, %ymm13
        vdivps  1136(%rbp), %ymm12, %ymm12
        vmovaps 272(%rbp), %ymm7
        vmovaps 304(%rbp), %ymm6
        vdivps  624(%rbp), %ymm12, %ymm12
        vdivps  1168(%rbp), %ymm11, %ymm11
        vmovaps 336(%rbp), %ymm5
        vdivps  656(%rbp), %ymm11, %ymm11
        vdivps  1200(%rbp), %ymm10, %ymm10
        vdivps  1232(%rbp), %ymm9, %ymm9
        vdivps  688(%rbp), %ymm10, %ymm10
        vdivps  720(%rbp), %ymm9, %ymm9
        vdivps  1264(%rbp), %ymm8, %ymm8
        vdivps  1296(%rbp), %ymm7, %ymm7
        vdivps  752(%rbp), %ymm8, %ymm8
        vdivps  784(%rbp), %ymm7, %ymm7
        vdivps  1328(%rbp), %ymm6, %ymm6
        movl    $64, %ecx
        vdivps  816(%rbp), %ymm6, %ymm6
        leaq    16(%rbp), %rdi
        vdivps  1360(%rbp), %ymm5, %ymm5
        vdivps  848(%rbp), %ymm5, %ymm5
        vmovaps 368(%rbp), %ymm4
        vmovaps 400(%rbp), %ymm3
        vdivps  1392(%rbp), %ymm4, %ymm4
        vdivps  1424(%rbp), %ymm3, %ymm3
        vmovaps 432(%rbp), %ymm2
        vmovaps 464(%rbp), %ymm1
        vdivps  880(%rbp), %ymm4, %ymm4
        vdivps  912(%rbp), %ymm3, %ymm3
        vmovaps 496(%rbp), %ymm0
        vdivps  1456(%rbp), %ymm2, %ymm2
        vdivps  1488(%rbp), %ymm1, %ymm1
        vdivps  944(%rbp), %ymm2, %ymm2
        vdivps  976(%rbp), %ymm1, %ymm1
        vdivps  1520(%rbp), %ymm0, %ymm0
        vmovaps %ymm15, -120(%rsp)
        vdivps  1008(%rbp), %ymm0, %ymm0
        vmovaps %ymm14, -88(%rsp)
        vmovaps %ymm13, -56(%rsp)
        vmovaps %ymm12, -24(%rsp)
        vmovaps %ymm11, 8(%rsp)
        vmovaps %ymm10, 40(%rsp)
        vmovaps %ymm9, 72(%rsp)
        vmovaps %ymm8, 104(%rsp)
        vmovaps %ymm7, 136(%rsp)
        vmovaps %ymm6, 168(%rsp)
        vmovaps %ymm5, 200(%rsp)
        vmovaps %ymm4, 232(%rsp)
        vmovaps %ymm3, 264(%rsp)
        vmovaps %ymm2, 296(%rsp)
        vmovaps %ymm1, 328(%rsp)
        vmovaps %ymm0, 360(%rsp)
        rep movsq
        subl    $1, %edx
        jne     .L2
        leaq    16(%rbp), %rsi
        movl    $64, %ecx
        movq    %rax, %rdi
        rep movsq
        vzeroupper
        leave
        ret

But it can be better, with just:

foo:                                    # @foo
        pushq   %rbp
        movq    %rsp, %rbp
        andq    $-512, %rsp             # imm = 0xFE00
        subq    $1536, %rsp             # imm = 0x600
        movq    %rdi, %rax
        vmovaps 496(%rbp), %ymm14
        vmovaps 464(%rbp), %ymm13
        vmovaps 432(%rbp), %ymm12
        vmovaps 400(%rbp), %ymm11
        vmovaps 368(%rbp), %ymm10
        vmovaps 336(%rbp), %ymm9
        vmovaps 304(%rbp), %ymm8
        vmovaps 272(%rbp), %ymm7
        vmovaps 240(%rbp), %ymm6
        vmovaps 208(%rbp), %ymm5
        vmovaps 176(%rbp), %ymm4
        vmovaps 144(%rbp), %ymm3
        vmovaps 16(%rbp), %ymm0
        vmovaps %ymm0, 416(%rsp)        # 32-byte Spill
        vmovaps 48(%rbp), %ymm2
        vmovaps 80(%rbp), %ymm15
        vmovaps 112(%rbp), %ymm0
        vmovaps %ymm0, 448(%rsp)        # 32-byte Spill
        movl    $2048, %ecx             # imm = 0x800
        vmovaps 1008(%rbp), %ymm0
        vmovaps %ymm0, 1472(%rsp)       # 32-byte Spill
        vmovaps 976(%rbp), %ymm1
        vmovaps %ymm1, 1440(%rsp)       # 32-byte Spill
        vmovaps %ymm2, %ymm1
        vmovaps 944(%rbp), %ymm2
        vmovaps %ymm2, 1408(%rsp)       # 32-byte Spill
        vmovaps %ymm3, %ymm2
        vmovaps 912(%rbp), %ymm3
        vmovaps %ymm3, 1376(%rsp)       # 32-byte Spill
        vmovaps %ymm4, %ymm3
        vmovaps 880(%rbp), %ymm4
        vmovaps %ymm4, 1344(%rsp)       # 32-byte Spill
        vmovaps %ymm5, %ymm4
        vmovaps 848(%rbp), %ymm5
        vmovaps %ymm5, 1312(%rsp)       # 32-byte Spill
        vmovaps %ymm6, %ymm5
        vmovaps 816(%rbp), %ymm6
        vmovaps %ymm6, 1280(%rsp)       # 32-byte Spill
        vmovaps %ymm7, %ymm6
        vmovaps 784(%rbp), %ymm7
        vmovaps %ymm7, 1248(%rsp)       # 32-byte Spill
        vmovaps %ymm8, %ymm7
        vmovaps 752(%rbp), %ymm8
        vmovaps %ymm8, 1216(%rsp)       # 32-byte Spill
        vmovaps %ymm9, %ymm8
        vmovaps 720(%rbp), %ymm9
        vmovaps %ymm9, 1184(%rsp)       # 32-byte Spill
        vmovaps %ymm10, %ymm9
        vmovaps 688(%rbp), %ymm10
        vmovaps %ymm10, 1152(%rsp)      # 32-byte Spill
        vmovaps %ymm11, %ymm10
        vmovaps 656(%rbp), %ymm11
        vmovaps %ymm11, 1120(%rsp)      # 32-byte Spill
        vmovaps %ymm12, %ymm11
        vmovaps 528(%rbp), %ymm12
        vmovaps %ymm12, 1088(%rsp)      # 32-byte Spill
        vmovaps %ymm13, %ymm12
        vmovaps 560(%rbp), %ymm13
        vmovaps %ymm13, 1056(%rsp)      # 32-byte Spill
        vmovaps %ymm14, %ymm13
        vmovaps 592(%rbp), %ymm14
        vmovaps %ymm14, 1024(%rsp)      # 32-byte Spill
        vmovaps %ymm15, %ymm14
        vmovaps 624(%rbp), %ymm15
        vmovaps %ymm15, 992(%rsp)       # 32-byte Spill
        vmovaps 448(%rsp), %ymm15       # 32-byte Reload
        vmovaps 1520(%rbp), %ymm0
        vmovaps %ymm0, 960(%rsp)        # 32-byte Spill
        vmovaps 1488(%rbp), %ymm0
        vmovaps %ymm0, 928(%rsp)        # 32-byte Spill
        vmovaps 1456(%rbp), %ymm0
        vmovaps %ymm0, 896(%rsp)        # 32-byte Spill
        vmovaps 1424(%rbp), %ymm0
        vmovaps %ymm0, 864(%rsp)        # 32-byte Spill
        vmovaps 1392(%rbp), %ymm0
        vmovaps %ymm0, 832(%rsp)        # 32-byte Spill
        vmovaps 1360(%rbp), %ymm0
        vmovaps %ymm0, 800(%rsp)        # 32-byte Spill
        vmovaps 1328(%rbp), %ymm0
        vmovaps %ymm0, 768(%rsp)        # 32-byte Spill
        vmovaps 1296(%rbp), %ymm0
        vmovaps %ymm0, 736(%rsp)        # 32-byte Spill
        vmovaps 1264(%rbp), %ymm0
        vmovaps %ymm0, 704(%rsp)        # 32-byte Spill
        vmovaps 1232(%rbp), %ymm0
        vmovaps %ymm0, 672(%rsp)        # 32-byte Spill
        vmovaps 1200(%rbp), %ymm0
        vmovaps %ymm0, 640(%rsp)        # 32-byte Spill
        vmovaps 1168(%rbp), %ymm0
        vmovaps %ymm0, 608(%rsp)        # 32-byte Spill
        vmovaps 1040(%rbp), %ymm0
        vmovaps %ymm0, 576(%rsp)        # 32-byte Spill
        vmovaps 1072(%rbp), %ymm0
        vmovaps %ymm0, 544(%rsp)        # 32-byte Spill
        vmovaps 1104(%rbp), %ymm0
        vmovaps %ymm0, 512(%rsp)        # 32-byte Spill
        vmovaps 1136(%rbp), %ymm0
        vmovaps %ymm0, 480(%rsp)        # 32-byte Spill
.LBB0_1:                                # =>This Inner Loop Header: Depth=1
        vdivps  960(%rsp), %ymm13, %ymm13   # 32-byte Folded Reload
        vdivps  928(%rsp), %ymm12, %ymm12   # 32-byte Folded Reload
        vdivps  896(%rsp), %ymm11, %ymm11   # 32-byte Folded Reload
        vdivps  864(%rsp), %ymm10, %ymm10   # 32-byte Folded Reload
        vdivps  832(%rsp), %ymm9, %ymm9     # 32-byte Folded Reload
        vdivps  800(%rsp), %ymm8, %ymm8     # 32-byte Folded Reload
        vdivps  768(%rsp), %ymm7, %ymm7     # 32-byte Folded Reload
        vdivps  736(%rsp), %ymm6, %ymm6     # 32-byte Folded Reload
        vdivps  704(%rsp), %ymm5, %ymm5     # 32-byte Folded Reload
        vdivps  672(%rsp), %ymm4, %ymm4     # 32-byte Folded Reload
        vdivps  640(%rsp), %ymm3, %ymm3     # 32-byte Folded Reload
        vdivps  608(%rsp), %ymm2, %ymm2     # 32-byte Folded Reload
        vdivps  480(%rsp), %ymm15, %ymm15   # 32-byte Folded Reload
        vdivps  512(%rsp), %ymm14, %ymm14   # 32-byte Folded Reload
        vdivps  544(%rsp), %ymm1, %ymm1     # 32-byte Folded Reload
        vmovaps 416(%rsp), %ymm0            # 32-byte Reload
        vdivps  576(%rsp), %ymm0, %ymm0     # 32-byte Folded Reload
        vdivps  1088(%rsp), %ymm0, %ymm0    # 32-byte Folded Reload
        vmovaps %ymm0, 416(%rsp)            # 32-byte Spill
        vdivps  1056(%rsp), %ymm1, %ymm1    # 32-byte Folded Reload
        vdivps  1024(%rsp), %ymm14, %ymm14  # 32-byte Folded Reload
        vdivps  992(%rsp), %ymm15, %ymm15   # 32-byte Folded Reload
        vdivps  1120(%rsp), %ymm2, %ymm2    # 32-byte Folded Reload
        vdivps  1152(%rsp), %ymm3, %ymm3    # 32-byte Folded Reload
        vdivps  1184(%rsp), %ymm4, %ymm4    # 32-byte Folded Reload
        vdivps  1216(%rsp), %ymm5, %ymm5    # 32-byte Folded Reload
        vdivps  1248(%rsp), %ymm6, %ymm6    # 32-byte Folded Reload
        vdivps  1280(%rsp), %ymm7, %ymm7    # 32-byte Folded Reload
        vdivps  1312(%rsp), %ymm8, %ymm8    # 32-byte Folded Reload
        vdivps  1344(%rsp), %ymm9, %ymm9    # 32-byte Folded Reload
        vdivps  1376(%rsp), %ymm10, %ymm10  # 32-byte Folded Reload
        vdivps  1408(%rsp), %ymm11, %ymm11  # 32-byte Folded Reload
        vdivps  1440(%rsp), %ymm12, %ymm12  # 32-byte Folded Reload
        vdivps  1472(%rsp), %ymm13, %ymm13  # 32-byte Folded Reload
        decl    %ecx
        jne     .LBB0_1
        vmovaps 416(%rsp), %ymm0            # 32-byte Reload
        vmovaps %ymm0, (%rax)
        vmovaps %ymm1, 32(%rax)
        vmovaps %ymm14, 64(%rax)
        vmovaps %ymm15, 96(%rax)
        vmovaps %ymm2, 128(%rax)
        vmovaps %ymm3, 160(%rax)
        vmovaps %ymm4, 192(%rax)
        vmovaps %ymm5, 224(%rax)
        vmovaps %ymm6, 256(%rax)
        vmovaps %ymm7, 288(%rax)
        vmovaps %ymm8, 320(%rax)
        vmovaps %ymm9, 352(%rax)
        vmovaps %ymm10, 384(%rax)
        vmovaps %ymm11, 416(%rax)
        vmovaps %ymm12, 448(%rax)
        vmovaps %ymm13, 480(%rax)
        movq    %rbp, %rsp
        popq    %rbp
        vzeroupper
        retq