https://gcc.gnu.org/bugzilla/show_bug.cgi?id=68030

            Bug ID: 68030
           Summary: Redundant address calculations in vectorized loop
           Product: gcc
           Version: 6.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: kyukhin at gcc dot gnu.org
  Target Milestone: ---

Created attachment 36548
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=36548&action=edit
Reproducer

Attached testcase; compiled with: -S -Ofast -march=haswell 1.c

The code in the main loop is:

.L7:
        movq    -176(%rbp), %rdx
        vmovaps -272(%rbp), %ymm1
        addl    $1, %r9d
        vmulps  (%rdx,%rax), %ymm11, %ymm0
        movq    -184(%rbp), %rdx        ; <- Fill
        vfmadd231ps (%rdx,%rax), %ymm12, %ymm0
        movq    -192(%rbp), %rdx        ; <- Fill
        vmovaps %ymm0, %ymm15
        vmulps  (%rdx,%rax), %ymm13, %ymm0
        movq    -112(%rbp), %rdx        ; <- Fill
        vfmadd231ps (%rdx,%rax), %ymm14, %ymm0
        movq    -160(%rbp), %rdx        ; <- Fill
        vaddps  %ymm0, %ymm15, %ymm0
        vmulps  (%rdx,%rax), %ymm9, %ymm15
        movq    -168(%rbp), %rdx
        ...

Those loads all reference the same array (global_Input in the source) at
fixed offsets from a common base; the movq instructions marked "<- Fill"
reload spilled pointers that differ from one another only by a compile-time
constant.
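The attachment itself is not inlined in this report, so as context here is a
rough, assumption-laden sketch of the kind of kernel that would produce this
pattern. This is NOT the actual testcase: the names global_Input and
local_Filter come from the report, the 516-float row stride is implied by the
2064-byte address deltas in the dump below (2064 / 4 = 516), and the image
height and filter size are guesses.

/* Hypothetical reconstruction of the reproducer's shape: a small 2D
   convolution.  With -Ofast the two inner loops are fully unrolled, so the
   vectorizer emits one broadcast of each filter tap and one vector load of
   global_Input per tap, each at a fixed offset from a common base. */
#define H 520            /* assumed image height */
#define W 516            /* implied by the 2064-byte row deltas below */
#define K 5              /* assumed filter size */

float global_Input[H][W];
float global_Output[H][W];

void kernel(const float *local_Filter)
{
    for (int iy = 2; iy < H - 2; iy++)
        for (int ix = 2; ix < W - 2; ix++) {
            float acc = 0.0f;
            for (int ky = 0; ky < K; ky++)
                for (int kx = 0; kx < K; kx++)
                    acc += local_Filter[ky * K + kx]
                         * global_Input[iy + ky - 2][ix + kx - 2];
            global_Output[iy][ix] = acc;
        }
}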
Unfortunately, this is not recognized at the GIMPLE level (the "optimized"
dump):

  # ratio_mult_vf.11_1609 = PHI <504(8), 512(5)>
  # bnd.10_1618 = PHI <63(8), 64(5)>
  # niters.9_1610 = PHI
  # prolog_loop_adjusted_niters.8_1611 = PHI
  # ivtmp_1615 = PHI <_401(8), 512(5)>
  # ix_1592 = PHI <_456(8), 2(5)>
  _999 = ivtmp.345_510 + prolog_loop_adjusted_niters.8_1611;
  _998 = _999 * 4;
  _995 = _998 + 18446744073709547488;
  vectp.15_1 = pretmp_889 + _995;       <-- addr is (base + _999*4 + OFFSET1)
  _986 = *local_Filter_12;
  vect_cst__984 = {_986, _986, _986, _986, _986, _986, _986, _986};
  _975 = _998 + 18446744073709547492;
  vectp.22_982 = pretmp_889 + _975;     <-- addr is (base + _999*4 + OFFSET11)
  _965 = MEM[(float *)local_Filter_12 + 4B];
  vect_cst__964 = {_965, _965, _965, _965, _965, _965, _965, _965};
  _956 = _998 + 18446744073709547496;
  vectp.30_961 = pretmp_889 + _956;     <-- addr is (base + _999*4 + OFFSET2)
  _948 = MEM[(float *)local_Filter_12 + 8B];
  vect_cst__947 = {_948, _948, _948, _948, _948, _948, _948, _948};
  _940 = _998 + 18446744073709547500;
  vectp.37_945 = pretmp_889 + _940;     <-- addr is (base + _999*4 + OFFSET3)
  _932 = MEM[(float *)local_Filter_12 + 12B];
  vect_cst__931 = {_932, _932, _932, _932, _932, _932, _932, _932};
  _924 = _998 + 18446744073709547504;
  vectp.44_929 = pretmp_889 + _924;     <-- addr is (base + _999*4 + OFFSET4)
  _916 = MEM[(float *)local_Filter_12 + 16B];
  vect_cst__915 = {_916, _916, _916, _916, _916, _916, _916, _916};
  _903 = _998 + 18446744073709549552;
  vectp.53_911 = pretmp_889 + _903;     <-- addr is (base + _999*4 + OFFSET5)
  _895 = MEM[(float *)local_Filter_12 + 20B];
  vect_cst__894 = {_895, _895, _895, _895, _895, _895, _895, _895};
  _155 = _998 + 18446744073709549556;
  vectp.60_892 = pretmp_889 + _155;     <-- addr is (base + _999*4 + OFFSET6)
  _500 = MEM[(float *)local_Filter_12 + 24B];
  vect_cst__37 = {_500, _500, _500, _500, _500, _500, _500, _500};
  _1070 = _998 + 18446744073709549560;
  vectp.68_907 = pretmp_889 + _1070;    <-- addr is (base + _999*4 + OFFSET7)
  _1078 = MEM[(float *)local_Filter_12 + 28B];
  vect_cst__1079 = {_1078, _1078, _1078, _1078, _1078, _1078, _1078, _1078};
  _1087 = _998 + 18446744073709549564;
  vectp.76_1082 = pretmp_889 + _1087;   <-- addr is (base + _999*4 + OFFSET8)
  _1095 = MEM[(float *)local_Filter_12 + 32B];
  vect_cst__1096 = {_1095, _1095, _1095, _1095, _1095, _1095, _1095, _1095};
  _1103 = _998 + 18446744073709549568;
  vectp.83_1098 = pretmp_889 + _1103;   <-- addr is (base + _999*4 + OFFSET9)
  _1111 = MEM[(float *)local_Filter_12 + 36B];
  ...
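A readability note, not part of the dump: the huge unsigned constants above
are small negative byte offsets printed as unsigned 64-bit values (OFFSET1 =
-4128, OFFSET11 = -4124, ..., OFFSET5 = -2064, ..., OFFSET9 = -2048). A
minimal standalone check:

/* Prints the dump's offset constants as signed values: -4128 -4124 -2064.
   (The unsigned-to-signed conversion is implementation-defined in C, but is
   the usual two's-complement wrap-around on GCC targets.) */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint64_t raw[] = { 18446744073709547488ULL,   /* OFFSET1  */
                       18446744073709547492ULL,   /* OFFSET11 */
                       18446744073709549552ULL }; /* OFFSET5  */
    for (int i = 0; i < 3; i++)
        printf("%lld\n", (long long)raw[i]);
    return 0;
}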
The vectorized loop body then consumes these pointers:

:
  # ivtmp.250_79 = PHI
  # ivtmp.253_329 = PHI
  vect__161.16_992 = MEM[base: vectp.15_1, index: ivtmp.253_329, offset: 0B];   // load @ (base + _999*4 + OFFSET1 + IV + 0)
  vect__177.23_972 = MEM[base: vectp.22_982, index: ivtmp.253_329, offset: 0B]; // load @ (base + _999*4 + OFFSET11 + IV + 0)
  vect__182.27_963 = vect_cst__964 * vect__177.23_972;
  _1256 = vect_cst__984 * vect__161.16_992 + vect__182.27_963;
  vect__193.31_953 = MEM[base: vectp.30_961, index: ivtmp.253_329, offset: 0B]; // load @ (base + _999*4 + OFFSET2 + IV + 0)
  vect__209.38_937 = MEM[base: vectp.37_945, index: ivtmp.253_329, offset: 0B]; // load @ (base + _999*4 + OFFSET3 + IV + 0)
  vect__214.42_930 = vect_cst__931 * vect__209.38_937;
  _1235 = vect_cst__947 * vect__193.31_953 + vect__214.42_930;
  _1307 = _1235 + _1256;
  vect__225.45_921 = MEM[base: vectp.44_929, index: ivtmp.253_329, offset: 0B]; // load @ (base + _999*4 + OFFSET4 + IV + 0)
  vect__247.54_900 = MEM[base: vectp.53_911, index: ivtmp.253_329, offset: 0B]; // load @ (base + _999*4 + OFFSET5 + IV + 0)
  vect__252.58_893 = vect_cst__894 * vect__247.54_900;
  _1291 = vect_cst__915 * vect__225.45_921 + vect__252.58_893;
  _341 = _1291 + _1307;
  vect__263.61_242 = MEM[base: vectp.60_892, index: ivtmp.253_329, offset: 0B]; // load @ (base + _999*4 + OFFSET6 + IV + 0)
  vect__279.69_1073 = MEM[base: vectp.68_907, index: ivtmp.253_329, offset: 0B]; // load @ (base + _999*4 + OFFSET7 + IV + 0)
  ...

You can see that all the loads differ only by a compile-time constant: every
vector load above could use a common base address plus a constant offset.
Which pass is responsible for such an optimization? (A C-level sketch of the
requested reassociation follows at the end of this report.)

I'd like to see something like this:

  _999 = ivtmp.345_510 + prolog_loop_adjusted_niters.8_1611;
  _998 = _999 * 4;
  vectp.15_1 = pretmp_889 + _998;       <-- addr is (base + _999*4)
  ...
:
  # ivtmp.250_79 = PHI
  # ivtmp.253_329 = PHI
  vect__161.16_992 = MEM[base: vectp.15_1, index: ivtmp.253_329, offset: OFFSET1];  // load @ (base + _999*4 + IV + OFFSET1)
  vect__177.23_972 = MEM[base: vectp.15_1, index: ivtmp.253_329, offset: OFFSET11]; // load @ (base + _999*4 + IV + OFFSET11)
  vect__182.27_963 = vect_cst__964 * vect__177.23_972;
  _1256 = vect_cst__984 * vect__161.16_992 + vect__182.27_963;
  vect__193.31_953 = MEM[base: vectp.15_1, index: ivtmp.253_329, offset: OFFSET2];  // load @ (base + _999*4 + IV + OFFSET2)
  vect__209.38_937 = MEM[base: vectp.15_1, index: ivtmp.253_329, offset: OFFSET3];  // load @ (base + _999*4 + IV + OFFSET3)
  vect__214.42_930 = vect_cst__931 * vect__209.38_937;
  _1235 = vect_cst__947 * vect__193.31_953 + vect__214.42_930;
  _1307 = _1235 + _1256;
  vect__225.45_921 = MEM[base: vectp.15_1, index: ivtmp.253_329, offset: OFFSET4];  // load @ (base + _999*4 + IV + OFFSET4)
  vect__247.54_900 = MEM[base: vectp.15_1, index: ivtmp.253_329, offset: OFFSET5];  // load @ (base + _999*4 + IV + OFFSET5)
  vect__252.58_893 = vect_cst__894 * vect__247.54_900;
  _1291 = vect_cst__915 * vect__225.45_921 + vect__252.58_893;
  _341 = _1291 + _1307;
  vect__263.61_242 = MEM[base: vectp.15_1, index: ivtmp.253_329, offset: OFFSET6];  // load @ (base + _999*4 + IV + OFFSET6)
  vect__279.69_1073 = MEM[base: vectp.15_1, index: ivtmp.253_329, offset: OFFSET7]; // load @ (base + _999*4 + IV + OFFSET7)

ICC exploits this successfully; it puts each load's difference into the
constant displacement of the addressing mode:

  402636:  c4 01 64 59 a4 8a 20    vmulps 0x820(%r10,%r9,4),%ymm3,%ymm12
  40263d:  08 00 00
  402640:  c4 01 14 59 b4 8a 30    vmulps 0x1030(%r10,%r9,4),%ymm13,%ymm14
  402647:  10 00 00
  40264a:  c4 02 5d b8 a4 8a 18    vfmadd231ps 0x818(%r10,%r9,4),%ymm4,%ymm12
  402651:  08 00 00
  402654:  c4 02 2d b8 b4 8a 28    vfmadd231ps 0x1028(%r10,%r9,4),%ymm10,%ymm14
  40265b:  10 00 00
  40265e:  c4 41 24 57 db          vxorps %ymm11,%ymm11,%ymm11
  402663:  c4 02 7d b8 9c 8a 2c    vfmadd231ps 0x102c(%r10,%r9,4),%ymm0,%ymm11
  40266a:  10 00 00
  40266d:  c4 02 75 b8 9c 8a 24    vfmadd231ps 0x1024(%r10,%r9,4),%ymm1,%ymm11
  402674:  10 00 00
  402677:  c4 41 24 58 de          vaddps %ymm14,%ymm11,%ymm11
  40267c:  c4 01 6c 59 b4 8a 20    vmulps 0x1020(%r10,%r9,4),%ymm2,%ymm14

This causes GCC to be ~50% slower than ICC on this kernel.
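For illustration only (the identifiers and offsets below are invented for
this sketch, not taken from the attached testcase): at the source level, the
requested reassociation amounts to keeping a single live base pointer and
folding each tap's constant delta into the load's displacement, instead of
materializing one pointer per tap.

/* One pointer per tap: every p<i> must stay live across the loop body,
   which is what appears to force the spill/fill traffic in the GCC code
   above once the tap count exceeds the register count. */
float taps_split(const float *base, long i, const float *w)
{
    const float *p0 = base + i - 1032;
    const float *p1 = base + i - 1031;
    const float *p2 = base + i - 516;
    return w[0] * p0[0] + w[1] * p1[0] + w[2] * p2[0];
}

/* One shared base: each delta becomes a constant displacement in the
   x86 addressing mode, disp(%base,%index,4), as in the ICC code above. */
float taps_shared(const float *base, long i, const float *w)
{
    const float *p = base + i;
    return w[0] * p[-1032] + w[1] * p[-1031] + w[2] * p[-516];
}

Three taps fit in registers either way; the difference only shows once there
are more live pointers than registers, as with the many taps in this kernel.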