From mboxrd@z Thu Jan 1 00:00:00 1970
From: "rguenth at gcc dot gnu.org"
To: gcc-bugs@gcc.gnu.org
Subject: [Bug rtl-optimization/53533] [4.7/4.8 regression] vectorization
 causes loop unrolling test slowdown as measured by Adobe's C++Benchmark
Date: Tue, 12 Jun 2012 09:54:00 -0000

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=53533

Richard Guenther changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Target|                            |x86_64-*-*
             Status|WAITING                     |NEW
      Known to work|                            |4.6.3
           Keywords|                            |missed-optimization
          Component|middle-end                  |rtl-optimization
                 CC|                            |jakub at gcc dot gnu.org,
                   |                            |uros at gcc dot gnu.org
            Summary|[4.7 regression] loop       |[4.7/4.8 regression]
                   |unrolling as measured by    |vectorization causes loop
                   |Adobe's C++Benchmark is     |unrolling test slowdown as
                   |twice as slow versus        |measured by Adobe's
                   |4.4-4.6                     |C++Benchmark
      Known to fail|                            |4.7.1, 4.8.0
           Severity|major                       |normal

--- Comment #6 from Richard Guenther 2012-06-12 09:54:02 UTC ---
Ok, it seems to me that this testcase implements its loop unrolling via
template metaprogramming.
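For context, unrolling via template metaprogramming usually follows the
pattern sketched below. This is a hypothetical reconstruction, not the
benchmark's actual code: the names (kernel, unrolled, sum8) are made up,
and only the constants are taken verbatim from the GIMPLE dumps that
follow.

  // Per-element computation; constants match the dumps below.
  inline int kernel(int x) {
      x = (x + 12345) * 914237;
      x = (x + 12332) * 914237;
      x = (x + 12332) * 914237;
      return x - 13;
  }

  // Recursive template: instantiating unrolled<8>::step pastes eight
  // kernel applications into the caller at compile time.
  template <int N>
  struct unrolled {
      static void step(int &result, const int *p) {
          result += kernel(*p);
          unrolled<N - 1>::step(result, p + 1);
      }
  };

  template <>
  struct unrolled<0> {                    // terminates the recursion
      static void step(int &, const int *) {}
  };

  int sum8(const int *first, const int *last) {
      int result = 0;
      for (; first != last; first += 8)   // advance by the unroll factor
          unrolled<8>::step(result, first);
      return result;
  }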
With GCC 4.7 we unroll and vectorize all loops; for example, unroll factor 8
looks like:

  # vect_var_.941_3474 = PHI
  # vect_var_.941_3473 = PHI
  # ivtmp.1325_970 = PHI
  D.9934_819 = (void *) ivtmp.1325_970;
  vect_var_.918_323 = MEM[base: D.9934_819, offset: 0B];
  vect_var_.919_325 = MEM[base: D.9934_819, offset: 16B];
  vect_var_.920_328 = vect_var_.918_323 + { 12345, 12345, 12345, 12345 };
  vect_var_.920_330 = vect_var_.919_325 + { 12345, 12345, 12345, 12345 };
  vect_var_.923_480 = vect_var_.920_328 * { 914237, 914237, 914237, 914237 };
  vect_var_.923_895 = vect_var_.920_330 * { 914237, 914237, 914237, 914237 };
  vect_var_.926_231 = vect_var_.923_480 + { 12332, 12332, 12332, 12332 };
  vect_var_.926_232 = vect_var_.923_895 + { 12332, 12332, 12332, 12332 };
  vect_var_.929_235 = vect_var_.926_231 * { 914237, 914237, 914237, 914237 };
  vect_var_.929_236 = vect_var_.926_232 * { 914237, 914237, 914237, 914237 };
  vect_var_.932_239 = vect_var_.929_235 + { 12332, 12332, 12332, 12332 };
  vect_var_.932_240 = vect_var_.929_236 + { 12332, 12332, 12332, 12332 };
  vect_var_.935_113 = vect_var_.932_239 * { 914237, 914237, 914237, 914237 };
  vect_var_.935_247 = vect_var_.932_240 * { 914237, 914237, 914237, 914237 };
  vect_var_.938_582 = vect_var_.935_113 + { -13, -13, -13, -13 };
  vect_var_.938_839 = vect_var_.935_247 + { -13, -13, -13, -13 };
  vect_var_.941_3472 = vect_var_.938_582 + vect_var_.941_3474;
  vect_var_.941_3471 = vect_var_.938_839 + vect_var_.941_3473;
  ivtmp.1325_812 = ivtmp.1325_970 + 32;
  if (ivtmp.1325_812 != D.9937_388)
    goto ;
  else
    goto ;

  :
  # vect_var_.941_3468 = PHI
  # vect_var_.941_3467 = PHI
  vect_var_.945_3466 = vect_var_.941_3468 + vect_var_.941_3467;
  vect_var_.946_3465 = vect_var_.945_3466 v>> 64;
  vect_var_.946_3464 = vect_var_.946_3465 + vect_var_.945_3466;
  vect_var_.946_3463 = vect_var_.946_3464 v>> 32;
  vect_var_.946_3462 = vect_var_.946_3463 + vect_var_.946_3464;
  stmp_var_.944_3461 = BIT_FIELD_REF ;
  init_value.7_795 = init_value;
  D.8606_796 = (int) init_value.7_795;
  D.8600_797 = D.8606_796 + 12345;
  D.8599_798 = D.8600_797 * 914237;
  D.8602_799 = D.8599_798 + 12332;
  D.8601_800 = D.8602_799 * 914237;
  D.8604_801 = D.8601_800 + 12332;
  D.8603_802 = D.8604_801 * 914237;
  D.8605_803 = D.8603_802 + -13;
  temp_804 = D.8605_803 * 8000;
  if (temp_804 != stmp_var_.944_3461)
    goto ;
  else
    goto ;
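For what it's worth, the compare at the end is the benchmark verifying its
result against a closed form: the computed sum is checked against the kernel
applied once to init_value and scaled by the element count. The sketch below
mirrors that exit block in C++; check_sum is a hypothetical name, and the
assumption (consistent with the * 8000 here and the data32+32000 bound in
the assembly further down) is that the array holds 8000 ints all equal to
init_value.

  // Mirrors temp_804 = D.8605_803 * 8000 and the final compare above;
  // kernel() is the per-element function from the earlier sketch.
  inline bool check_sum(int result, int init_value) {
      int expected = kernel(init_value) * 8000;  // one kernel step, scaled
      return result == expected;
  }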
With GCC 4.6, OTOH, the above loop is not vectorized (only the slow,
non-unrolled loop is):

  :
  # result_622 = PHI
  # ivtmp.852_1026 = PHI
  D.9283_3302 = (void *) ivtmp.852_1026;
  temp_801 = MEM[base: D.9283_3302, offset: 0B];
  D.8366_802 = temp_801 + 12345;
  D.8365_803 = D.8366_802 * 914237;
  D.8368_804 = D.8365_803 + 12332;
  D.8367_805 = D.8368_804 * 914237;
  D.8370_806 = D.8367_805 + 12332;
  D.8369_807 = D.8370_806 * 914237;
  temp_808 = D.8369_807 + -13;
  result_810 = temp_808 + result_622;
  temp_815 = MEM[base: D.9283_3302, offset: 4B];
  D.8381_816 = temp_815 + 12345;
  D.8382_817 = D.8381_816 * 914237;
  D.8378_818 = D.8382_817 + 12332;
  D.8379_819 = D.8378_818 * 914237;
  D.8376_820 = D.8379_819 + 12332;
  D.8377_821 = D.8376_820 * 914237;
  temp_822 = D.8377_821 + -13;
  result_824 = result_810 + temp_822;
  temp_788 = MEM[base: D.9283_3302, offset: 8B];
  D.8351_789 = temp_788 + 12345;
  D.8352_790 = D.8351_789 * 914237;
  D.8348_791 = D.8352_790 + 12332;
  D.8349_792 = D.8348_791 * 914237;
  D.8346_793 = D.8349_792 + 12332;
  D.8347_794 = D.8346_793 * 914237;
  temp_795 = D.8347_794 + -13;
  result_797 = temp_795 + result_824;
  temp_774 = MEM[base: D.9283_3302, offset: 12B];
  D.8333_775 = temp_774 + 12345;
  D.8334_776 = D.8333_775 * 914237;
  D.8330_777 = D.8334_776 + 12332;
  D.8331_778 = D.8330_777 * 914237;
  D.8328_779 = D.8331_778 + 12332;
  D.8329_780 = D.8328_779 * 914237;
  temp_781 = D.8329_780 + -13;
  result_783 = temp_781 + result_797;
  temp_760 = MEM[base: D.9283_3302, offset: 16B];
  D.8315_761 = temp_760 + 12345;
  D.8316_762 = D.8315_761 * 914237;
  D.8312_763 = D.8316_762 + 12332;
  D.8313_764 = D.8312_763 * 914237;
  D.8310_765 = D.8313_764 + 12332;
  D.8311_766 = D.8310_765 * 914237;
  temp_767 = D.8311_766 + -13;
  result_769 = temp_767 + result_783;
  temp_746 = MEM[base: D.9283_3302, offset: 20B];
  D.8297_747 = temp_746 + 12345;
  D.8298_748 = D.8297_747 * 914237;
  D.8294_749 = D.8298_748 + 12332;
  D.8295_750 = D.8294_749 * 914237;
  D.8292_751 = D.8295_750 + 12332;
  D.8293_752 = D.8292_751 * 914237;
  temp_753 = D.8293_752 + -13;
  result_755 = temp_753 + result_769;
  temp_732 = MEM[base: D.9283_3302, offset: 24B];
  D.8279_733 = temp_732 + 12345;
  D.8280_734 = D.8279_733 * 914237;
  D.8276_735 = D.8280_734 + 12332;
  D.8277_736 = D.8276_735 * 914237;
  D.8274_737 = D.8277_736 + 12332;
  D.8275_738 = D.8274_737 * 914237;
  temp_739 = D.8275_738 + -13;
  result_741 = temp_739 + result_755;
  temp_695 = MEM[base: D.9283_3302, offset: 28B];
  D.8246_696 = temp_695 + 12345;
  D.8245_697 = D.8246_696 * 914237;
  D.8248_698 = D.8245_697 + 12332;
  D.8247_699 = D.8248_698 * 914237;
  D.8250_700 = D.8247_699 + 12332;
  D.8249_701 = D.8250_700 * 914237;
  temp_702 = D.8249_701 + -13;
  result_704 = temp_702 + result_741;
  ivtmp.852_842 = ivtmp.852_1026 + 32;
  if (ivtmp.852_842 != D.9292_3369)
    goto ;
  else
    goto ;

  :
  # result_3198 = PHI
  init_value.7_825 = init_value;
  D.8393_826 = (int) init_value.7_825;
  D.8387_827 = D.8393_826 + 12345;
  D.8386_828 = D.8387_827 * 914237;
  D.8389_829 = D.8386_828 + 12332;
  D.8388_830 = D.8389_829 * 914237;
  D.8391_831 = D.8388_830 + 12332;
  D.8390_832 = D.8391_831 * 914237;
  D.8392_833 = D.8390_832 + -13;
  temp_834 = D.8392_833 * 8000;
  if (temp_834 != result_3198)
    goto ;
  else
    goto ;

With -fno-tree-vectorize the performance is the same; it seems that
vectorization is not profitable here for some reason. The same behavior can
be observed with GCC 4.8. I used the preprocessed source for 4.7 from the
ZIP file.
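For reference, the comparison that isolates the vectorizer is just a rebuild
with the flag; loop_unroll.ii here is a placeholder for the preprocessed
source from the ZIP file:

  g++ -O3 -o unroll-vec loop_unroll.ii
  g++ -O3 -fno-tree-vectorize -o unroll-novec loop_unroll.ii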
The code generated is odd, to say the least; the inner loop looks like:

        movdqa  .LC6(%rip), %xmm3
        xorl    %ebx, %ebx
        movdqa  .LC7(%rip), %xmm0
        movdqa  .LC8(%rip), %xmm1
        movdqa  .LC9(%rip), %xmm2
        .p2align 4,,10
        .p2align 3
.L51:
        pxor    %xmm6, %xmm6
        movl    $data32, %eax
        movdqa  %xmm6, %xmm7
        .p2align 4,,10
        .p2align 3
.L53:
        movdqa  (%rax), %xmm4
        movdqa  %xmm0, %xmm8
        paddd   %xmm3, %xmm4
        movdqa  %xmm4, %xmm5
        psrldq  $4, %xmm4
        psrldq  $4, %xmm8
        pmuludq %xmm8, %xmm4
        pshufd  $8, %xmm4, %xmm4
        pmuludq %xmm0, %xmm5
        pshufd  $8, %xmm5, %xmm5
        movdqa  %xmm0, %xmm8
        psrldq  $4, %xmm8
        punpckldq %xmm4, %xmm5
        paddd   %xmm1, %xmm5
        movdqa  %xmm5, %xmm4
        psrldq  $4, %xmm5
        pmuludq %xmm8, %xmm5
        pshufd  $8, %xmm5, %xmm5
        pmuludq %xmm0, %xmm4
        pshufd  $8, %xmm4, %xmm4
        punpckldq %xmm5, %xmm4
        movdqa  %xmm0, %xmm5
        paddd   %xmm1, %xmm4
        movdqa  %xmm4, %xmm8
        psrldq  $4, %xmm5
        psrldq  $4, %xmm4
        pmuludq %xmm4, %xmm5
        pshufd  $8, %xmm5, %xmm5
        pmuludq %xmm0, %xmm8
        pshufd  $8, %xmm8, %xmm4
        movdqa  %xmm0, %xmm8
        psrldq  $4, %xmm8
        punpckldq %xmm5, %xmm4
        paddd   %xmm2, %xmm4
        paddd   %xmm4, %xmm7
        movdqa  16(%rax), %xmm4
        addq    $32, %rax
        paddd   %xmm3, %xmm4
        movdqa  %xmm4, %xmm5
        psrldq  $4, %xmm4
        pmuludq %xmm8, %xmm4
        pshufd  $8, %xmm4, %xmm4
        movdqa  %xmm0, %xmm8
        pmuludq %xmm0, %xmm5
        pshufd  $8, %xmm5, %xmm5
        cmpq    $data32+32000, %rax
        psrldq  $4, %xmm8
        punpckldq %xmm4, %xmm5
        paddd   %xmm1, %xmm5
        movdqa  %xmm5, %xmm4
        psrldq  $4, %xmm5
        pmuludq %xmm8, %xmm5
        pshufd  $8, %xmm5, %xmm5
        pmuludq %xmm0, %xmm4
        pshufd  $8, %xmm4, %xmm4
        punpckldq %xmm5, %xmm4
        movdqa  %xmm0, %xmm5
        paddd   %xmm1, %xmm4
        movdqa  %xmm4, %xmm8
        psrldq  $4, %xmm5
        psrldq  $4, %xmm4
        pmuludq %xmm4, %xmm5
        pshufd  $8, %xmm5, %xmm5
        pmuludq %xmm0, %xmm8
        pshufd  $8, %xmm8, %xmm4
        punpckldq %xmm5, %xmm4
        paddd   %xmm2, %xmm4
        paddd   %xmm4, %xmm6
        jne     .L53

which means we expand the multiplications with the constants in an odd way.
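Concretely, that expansion is the generic SSE2 lowering of a packed 32-bit
multiply: pmulld only exists from SSE4.1 on, so each vector multiply by
914237 is synthesized from two pmuludq (32x32->64 on the even lanes) plus
psrldq/pshufd/punpckldq to cover the odd lanes and repack the low halves.
A minimal intrinsics rendering of the pattern, as a sketch rather than the
compiler's literal output:

  #include <emmintrin.h>

  // SSE2 emulation of a 4 x 32-bit multiply, mirroring the
  // pmuludq/pshufd/punpckldq sequence in the loop above.
  static __m128i mul_epi32_sse2(__m128i a, __m128i b) {
      __m128i even = _mm_mul_epu32(a, b);                 // a0*b0, a2*b2 (64-bit)
      __m128i odd  = _mm_mul_epu32(_mm_srli_si128(a, 4),  // a1*b1, a3*b3
                                   _mm_srli_si128(b, 4));
      even = _mm_shuffle_epi32(even, _MM_SHUFFLE(0, 0, 2, 0)); // keep low dwords
      odd  = _mm_shuffle_epi32(odd,  _MM_SHUFFLE(0, 0, 2, 0));
      return _mm_unpacklo_epi32(even, odd);               // lanes 0..3 in order
  }

Each of the three multiplies in the kernel pays for those extra shuffles and
moves, which presumably is what makes the vectorized loop lose to the scalar
one here.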