public inbox for gcc-bugs@sourceware.org
* [Bug tree-optimization/68030] New: Redundant address calculations in vectorized loop
@ 2015-10-20 13:37 kyukhin at gcc dot gnu.org
From: kyukhin at gcc dot gnu.org @ 2015-10-20 13:37 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=68030

            Bug ID: 68030
           Summary: Redundant address calculations in vectorized loop
           Product: gcc
           Version: 6.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: kyukhin at gcc dot gnu.org
  Target Milestone: ---

Created attachment 36548
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=36548&action=edit
Reproducer

Testcase attached.
Compiled with: -S -Ofast -march=haswell 1.c
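For reference, the kernel looks roughly like this (a sketch only, not the attached reproducer: the names global_Input and local_Filter come from the dumps below, while the array sizes and the 5x5 filter are assumptions inferred from the constant offsets):

/* Sketch of the kernel's shape; NOT the attached testcase.  The
   dimensions and the 5x5 filter size are guesses.  */
#define H 512
#define W 512

float global_Input[H + 4][W + 4];
float global_Output[H][W];
float local_Filter[5][5];

void
convolve (void)
{
  for (int iy = 2; iy < H + 2; iy++)
    for (int ix = 2; ix < W + 2; ix++)
      {
        float sum = 0.0f;
        /* With -Ofast the two filter loops are fully unrolled and the
           ix loop is vectorized, so each tap becomes one vector load
           from global_Input at a fixed distance from the others.  */
        for (int fy = 0; fy < 5; fy++)
          for (int fx = 0; fx < 5; fx++)
            sum += local_Filter[fy][fx]
                   * global_Input[iy + fy - 2][ix + fx - 2];
        global_Output[iy - 2][ix - 2] = sum;
      }
}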

The code in the main loop is as follows ("<- Fill" marks reloads of pointer bases that were spilled to the stack):
.L7:
        movq    -176(%rbp), %rdx
        vmovaps -272(%rbp), %ymm1
        addl    $1, %r9d
        vmulps  (%rdx,%rax), %ymm11, %ymm0
        movq    -184(%rbp), %rdx ; <- Fill
        vfmadd231ps     (%rdx,%rax), %ymm12, %ymm0
        movq    -192(%rbp), %rdx ; <- Fill
        vmovaps %ymm0, %ymm15
        vmulps  (%rdx,%rax), %ymm13, %ymm0
        movq    -112(%rbp), %rdx ; <- Fill
        vfmadd231ps     (%rdx,%rax), %ymm14, %ymm0
        movq    -160(%rbp), %rdx ; <- Fill
        vaddps  %ymm0, %ymm15, %ymm0
        vmulps  (%rdx,%rax), %ymm9, %ymm15
        movq    -168(%rbp), %rdx
        ...

Those loads all access the same array (global_Input in the source)
at fixed offsets from a common base.
Unfortunately, this is not reflected at the GIMPLE level (optimized dump):
  # ratio_mult_vf.11_1609 = PHI <504(8), 512(5)>
  # bnd.10_1618 = PHI <63(8), 64(5)>
  # niters.9_1610 = PHI <niters.9_339(8), 512(5)>
  # prolog_loop_adjusted_niters.8_1611 = PHI <prolog_loop_adjusted_niters.8_340(8), 0(5)>
  # ivtmp_1615 = PHI <_401(8), 512(5)>
  # ix_1592 = PHI <_456(8), 2(5)>
  _999 = ivtmp.345_510 + prolog_loop_adjusted_niters.8_1611;
  _998 = _999 * 4;
  _995 = _998 + 18446744073709547488;
  vectp.15_1 = pretmp_889 + _995; <-- addr is (base + _999*4 + OFFSET1)
  _986 = *local_Filter_12;
  vect_cst__984 = {_986, _986, _986, _986, _986, _986, _986, _986};
  _975 = _998 + 18446744073709547492;
  vectp.22_982 = pretmp_889 + _975; <-- addr is (base + _999*4 + OFFSET11)
  _965 = MEM[(float *)local_Filter_12 + 4B];
  vect_cst__964 = {_965, _965, _965, _965, _965, _965, _965, _965};
  _956 = _998 + 18446744073709547496;
  vectp.30_961 = pretmp_889 + _956; <-- addr is (base + _999*4 + OFFSET2)
  _948 = MEM[(float *)local_Filter_12 + 8B];
  vect_cst__947 = {_948, _948, _948, _948, _948, _948, _948, _948};
  _940 = _998 + 18446744073709547500;
  vectp.37_945 = pretmp_889 + _940; <-- addr is (base + _999*4 + OFFSET3)
  _932 = MEM[(float *)local_Filter_12 + 12B];
  vect_cst__931 = {_932, _932, _932, _932, _932, _932, _932, _932};
  _924 = _998 + 18446744073709547504;
  vectp.44_929 = pretmp_889 + _924; <-- addr is (base + _999*4 + OFFSET4)
  _916 = MEM[(float *)local_Filter_12 + 16B];
  vect_cst__915 = {_916, _916, _916, _916, _916, _916, _916, _916};
  _903 = _998 + 18446744073709549552;
  vectp.53_911 = pretmp_889 + _903; <-- addr is (base + _999*4 + OFFSET5)
  _895 = MEM[(float *)local_Filter_12 + 20B];
  vect_cst__894 = {_895, _895, _895, _895, _895, _895, _895, _895};
  _155 = _998 + 18446744073709549556;
  vectp.60_892 = pretmp_889 + _155; <-- addr is (base + _999*4 + OFFSET6)
  _500 = MEM[(float *)local_Filter_12 + 24B];
  vect_cst__37 = {_500, _500, _500, _500, _500, _500, _500, _500};
  _1070 = _998 + 18446744073709549560;
  vectp.68_907 = pretmp_889 + _1070; <-- addr is (base + _999*4 + OFFSET7)
  _1078 = MEM[(float *)local_Filter_12 + 28B];
  vect_cst__1079 = {_1078, _1078, _1078, _1078, _1078, _1078, _1078, _1078};
  _1087 = _998 + 18446744073709549564;
  vectp.76_1082 = pretmp_889 + _1087; <-- addr is (base + _999*4 + OFFSET8)
  _1095 = MEM[(float *)local_Filter_12 + 32B];
  vect_cst__1096 = {_1095, _1095, _1095, _1095, _1095, _1095, _1095, _1095};
  _1103 = _998 + 18446744073709549568;
  vectp.83_1098 = pretmp_889 + _1103; <-- addr is (base + _999*4 + OFFSET9)
  _1111 = MEM[(float *)local_Filter_12 + 36B];
...
  <bb 10>:
  # ivtmp.250_79 = PHI <ivtmp.250_56(10), 0(9)>
  # ivtmp.253_329 = PHI <ivtmp.253_330(10), 0(9)>
  vect__161.16_992 = MEM[base: vectp.15_1, index: ivtmp.253_329, offset: 0B]; // load @ (base + _999*4 + OFFSET1 + IV + 0)
  vect__177.23_972 = MEM[base: vectp.22_982, index: ivtmp.253_329, offset: 0B]; // load @ (base + _999*4 + OFFSET11 + IV + 0)
  vect__182.27_963 = vect_cst__964 * vect__177.23_972;
  _1256 = vect_cst__984 * vect__161.16_992 + vect__182.27_963;
  vect__193.31_953 = MEM[base: vectp.30_961, index: ivtmp.253_329, offset: 0B]; // load @ (base + _999*4 + OFFSET2 + IV + 0)
  vect__209.38_937 = MEM[base: vectp.37_945, index: ivtmp.253_329, offset: 0B]; // load @ (base + _999*4 + OFFSET3 + IV + 0)
  vect__214.42_930 = vect_cst__931 * vect__209.38_937;
  _1235 = vect_cst__947 * vect__193.31_953 + vect__214.42_930;
  _1307 = _1235 + _1256;
  vect__225.45_921 = MEM[base: vectp.44_929, index: ivtmp.253_329, offset: 0B]; // load @ (base + _999*4 + OFFSET4 + IV + 0)
  vect__247.54_900 = MEM[base: vectp.53_911, index: ivtmp.253_329, offset: 0B]; // load @ (base + _999*4 + OFFSET5 + IV + 0)
  vect__252.58_893 = vect_cst__894 * vect__247.54_900;
  _1291 = vect_cst__915 * vect__225.45_921 + vect__252.58_893;
  _341 = _1291 + _1307;
  vect__263.61_242 = MEM[base: vectp.60_892, index: ivtmp.253_329, offset: 0B]; // load @ (base + _999*4 + OFFSET6 + IV + 0)
  vect__279.69_1073 = MEM[base: vectp.68_907, index: ivtmp.253_329, offset: 0B]; // load @ (base + _999*4 + OFFSET7 + IV + 0)
...

You can see that all the loads differ only by a constant:
every one of the vector loads above could use a common base address plus a constant offset.
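In scalar source terms the difference is roughly the following (a hand-written analogy with made-up names and offsets, not compiler output):

/* What GCC effectively does now: one full pointer per tap, each kept
   live across the loop; this is what forces the %rdx fills in the
   asm above.  */
float
taps_redundant (const float *pretmp, long i)
{
  const float *vectp1 = pretmp + i + 0;    /* base + i*4 + OFFSET1 */
  const float *vectp2 = pretmp + i + 516;  /* base + i*4 + OFFSET2 */
  return *vectp1 + *vectp2;
}

/* What is wanted: a single base pointer, with each constant
   difference left for the addressing mode to absorb.  */
float
taps_common_base (const float *pretmp, long i)
{
  const float *vectp = pretmp + i;  /* base + i*4, computed once */
  return vectp[0] + vectp[516];     /* constants folded into the access */
}

The second form needs just one live register for the base, because x86 can encode the constant directly as a displacement, e.g. 2064(%rdx,%rax,4).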

Which pass is responsible for such an optimization?

I'd like to see something like this:
  _999 = ivtmp.345_510 + prolog_loop_adjusted_niters.8_1611;
  _998 = _999 * 4;
  vectp.15_1 = pretmp_889 + _998; <-- addr is (base + _999*4)
...
  <bb 10>:
  # ivtmp.250_79 = PHI <ivtmp.250_56(10), 0(9)>
  # ivtmp.253_329 = PHI <ivtmp.253_330(10), 0(9)>
  vect__161.16_992 = MEM[base: vectp.15_1, index: ivtmp.253_329, offset: OFFSET1]; // load @ (base + _999*4 + IV + OFFSET1)
  vect__177.23_972 = MEM[base: vectp.15_1, index: ivtmp.253_329, offset: OFFSET11]; // load @ (base + _999*4 + IV + OFFSET11)
  vect__182.27_963 = vect_cst__964 * vect__177.23_972;
  _1256 = vect_cst__984 * vect__161.16_992 + vect__182.27_963;
  vect__193.31_953 = MEM[base: vectp.15_1, index: ivtmp.253_329, offset: OFFSET2]; // load @ (base + _999*4 + IV + OFFSET2)
  vect__209.38_937 = MEM[base: vectp.15_1, index: ivtmp.253_329, offset: OFFSET3]; // load @ (base + _999*4 + IV + OFFSET3)
  vect__214.42_930 = vect_cst__931 * vect__209.38_937;
  _1235 = vect_cst__947 * vect__193.31_953 + vect__214.42_930;
  _1307 = _1235 + _1256;
  vect__225.45_921 = MEM[base: vectp.15_1, index: ivtmp.253_329, offset: OFFSET4]; // load @ (base + _999*4 + IV + OFFSET4)
  vect__247.54_900 = MEM[base: vectp.15_1, index: ivtmp.253_329, offset: OFFSET5]; // load @ (base + _999*4 + IV + OFFSET5)
  vect__252.58_893 = vect_cst__894 * vect__247.54_900;
  _1291 = vect_cst__915 * vect__225.45_921 + vect__252.58_893;
  _341 = _1291 + _1307;
  vect__263.61_242 = MEM[base: vectp.15_1, index: ivtmp.253_329, offset: OFFSET6]; // load @ (base + _999*4 + IV + OFFSET6)
  vect__279.69_1073 = MEM[base: vectp.15_1, index: ivtmp.253_329, offset: OFFSET7]; // load @ (base + _999*4 + IV + OFFSET7)


ICC exploits this successfully: it keeps a single base address and folds each constant difference into the displacement of the memory operand:
  402636:       c4 01 64 59 a4 8a 20    vmulps 0x820(%r10,%r9,4),%ymm3,%ymm12
  40263d:       08 00 00
  402640:       c4 01 14 59 b4 8a 30    vmulps 0x1030(%r10,%r9,4),%ymm13,%ymm14
  402647:       10 00 00
  40264a:       c4 02 5d b8 a4 8a 18    vfmadd231ps 0x818(%r10,%r9,4),%ymm4,%ymm12
  402651:       08 00 00
  402654:       c4 02 2d b8 b4 8a 28    vfmadd231ps 0x1028(%r10,%r9,4),%ymm10,%ymm14
  40265b:       10 00 00
  40265e:       c4 41 24 57 db          vxorps %ymm11,%ymm11,%ymm11
  402663:       c4 02 7d b8 9c 8a 2c    vfmadd231ps 0x102c(%r10,%r9,4),%ymm0,%ymm11
  40266a:       10 00 00
  40266d:       c4 02 75 b8 9c 8a 24    vfmadd231ps 0x1024(%r10,%r9,4),%ymm1,%ymm11
  402674:       10 00 00
  402677:       c4 41 24 58 de          vaddps %ymm14,%ymm11,%ymm11
  40267c:       c4 01 6c 59 b4 8a 20    vmulps 0x1020(%r10,%r9,4),%ymm2,%ymm14

This causes GCC to be ~50% slower than ICC on this kernel.



* [Bug tree-optimization/68030] Redundant address calculations in vectorized loop
From: rguenth at gcc dot gnu.org @ 2015-10-20 14:06 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=68030

Richard Biener <rguenth at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|UNCONFIRMED                 |NEW
   Last reconfirmed|                            |2015-10-20
                 CC|                            |rguenth at gcc dot gnu.org
     Ever confirmed|0                           |1

--- Comment #1 from Richard Biener <rguenth at gcc dot gnu.org> ---
Induction variable optimization is responsible here but it needs some help from
a CSE.  I proposed adding a late FRE for that some time ago.  The issue is that
the vectorizer creates some redundancies when creating address IVs for the
vectorized accesses.
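For example, a loop of this shape (just an illustration, not the testcase from this PR) already gets one address IV per load out of the vectorizer, even though the three addresses differ only by a constant; that is exactly the redundancy a late FRE could remove:

/* Each of the three loads from a[] receives its own vect pointer
   when the loop is vectorized; nothing running after the vectorizer
   currently value-numbers them back together.  */
void
stencil (float *restrict out, const float *restrict a, int n)
{
  for (int i = 0; i < n; i++)
    out[i] = a[i] + a[i + 1] + a[i + 2];
}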


