From mboxrd@z Thu Jan 1 00:00:00 1970
From: "rguenth at gcc dot gnu.org"
To: gcc-bugs@gcc.gnu.org
Subject: [Bug rtl-optimization/53533] [4.7/4.8 regression] vectorization
 causes loop unrolling test slowdown as measured by Adobe's C++Benchmark
Date: Tue, 12 Jun 2012 09:54:00 -0000

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=53533

Richard Guenther changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Target|                            |x86_64-*-*
             Status|WAITING                     |NEW
      Known to work|                            |4.6.3
           Keywords|                            |missed-optimization
          Component|middle-end                  |rtl-optimization
                 CC|                            |jakub at gcc dot gnu.org,
                   |                            |uros at gcc dot gnu.org
            Summary|[4.7 regression] loop       |[4.7/4.8 regression]
                   |unrolling as measured by    |vectorization causes loop
                   |Adobe's C++Benchmark is     |unrolling test slowdown as
                   |twice as slow versus        |measured by Adobe's
                   |4.4-4.6                     |C++Benchmark
      Known to fail|                            |4.7.1, 4.8.0
           Severity|major                       |normal

--- Comment #6 from Richard Guenther 2012-06-12 09:54:02 UTC ---
Ok, it seems to me that this testcase implements its loop unrolling via
template metaprogramming.
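For context, unrolling via template metaprogramming usually follows the
pattern sketched below. This is a hypothetical reconstruction, not the
benchmark's actual code: the names (kernel, unrolled, sum8) are made up,
and only the constants are taken verbatim from the GIMPLE dumps that
follow.

  // Per-element computation; constants match the dumps below.
  inline int kernel(int x) {
      x = (x + 12345) * 914237;
      x = (x + 12332) * 914237;
      x = (x + 12332) * 914237;
      return x - 13;
  }

  // Recursive template: instantiating unrolled<8>::step pastes eight
  // kernel applications into the caller at compile time.
  template <int N>
  struct unrolled {
      static void step(int &result, const int *p) {
          result += kernel(*p);
          unrolled<N - 1>::step(result, p + 1);
      }
  };

  template <>
  struct unrolled<0> {                    // terminates the recursion
      static void step(int &, const int *) {}
  };

  int sum8(const int *first, const int *last) {
      int result = 0;
      for (; first != last; first += 8)   // advance by the unroll factor
          unrolled<8>::step(result, first);
      return result;
  }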
With GCC 4.7 we unroll and vectorize all loops; for example, unroll factor 8
looks like:

  # vect_var_.941_3474 = PHI
  # vect_var_.941_3473 = PHI
  # ivtmp.1325_970 = PHI
  D.9934_819 = (void *) ivtmp.1325_970;
  vect_var_.918_323 = MEM[base: D.9934_819, offset: 0B];
  vect_var_.919_325 = MEM[base: D.9934_819, offset: 16B];
  vect_var_.920_328 = vect_var_.918_323 + { 12345, 12345, 12345, 12345 };
  vect_var_.920_330 = vect_var_.919_325 + { 12345, 12345, 12345, 12345 };
  vect_var_.923_480 = vect_var_.920_328 * { 914237, 914237, 914237, 914237 };
  vect_var_.923_895 = vect_var_.920_330 * { 914237, 914237, 914237, 914237 };
  vect_var_.926_231 = vect_var_.923_480 + { 12332, 12332, 12332, 12332 };
  vect_var_.926_232 = vect_var_.923_895 + { 12332, 12332, 12332, 12332 };
  vect_var_.929_235 = vect_var_.926_231 * { 914237, 914237, 914237, 914237 };
  vect_var_.929_236 = vect_var_.926_232 * { 914237, 914237, 914237, 914237 };
  vect_var_.932_239 = vect_var_.929_235 + { 12332, 12332, 12332, 12332 };
  vect_var_.932_240 = vect_var_.929_236 + { 12332, 12332, 12332, 12332 };
  vect_var_.935_113 = vect_var_.932_239 * { 914237, 914237, 914237, 914237 };
  vect_var_.935_247 = vect_var_.932_240 * { 914237, 914237, 914237, 914237 };
  vect_var_.938_582 = vect_var_.935_113 + { -13, -13, -13, -13 };
  vect_var_.938_839 = vect_var_.935_247 + { -13, -13, -13, -13 };
  vect_var_.941_3472 = vect_var_.938_582 + vect_var_.941_3474;
  vect_var_.941_3471 = vect_var_.938_839 + vect_var_.941_3473;
  ivtmp.1325_812 = ivtmp.1325_970 + 32;
  if (ivtmp.1325_812 != D.9937_388)
    goto ;
  else
    goto ;

  :
  # vect_var_.941_3468 = PHI
  # vect_var_.941_3467 = PHI
  vect_var_.945_3466 = vect_var_.941_3468 + vect_var_.941_3467;
  vect_var_.946_3465 = vect_var_.945_3466 v>> 64;
  vect_var_.946_3464 = vect_var_.946_3465 + vect_var_.945_3466;
  vect_var_.946_3463 = vect_var_.946_3464 v>> 32;
  vect_var_.946_3462 = vect_var_.946_3463 + vect_var_.946_3464;
  stmp_var_.944_3461 = BIT_FIELD_REF ;
  init_value.7_795 = init_value;
  D.8606_796 = (int) init_value.7_795;
  D.8600_797 = D.8606_796 + 12345;
  D.8599_798 = D.8600_797 * 914237;
  D.8602_799 = D.8599_798 + 12332;
  D.8601_800 = D.8602_799 * 914237;
  D.8604_801 = D.8601_800 + 12332;
  D.8603_802 = D.8604_801 * 914237;
  D.8605_803 = D.8603_802 + -13;
  temp_804 = D.8605_803 * 8000;
  if (temp_804 != stmp_var_.944_3461)
    goto ;
  else
    goto ;
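For what it's worth, the compare at the end is the benchmark verifying its
result against a closed form: the computed sum is checked against the kernel
applied once to init_value and scaled by the element count. The sketch below
mirrors that exit block in C++; check_sum is a hypothetical name, and the
assumption (consistent with the * 8000 here and the data32+32000 bound in
the assembly further down) is that the array holds 8000 ints all equal to
init_value.

  // Mirrors temp_804 = D.8605_803 * 8000 and the final compare above;
  // kernel() is the per-element function from the earlier sketch.
  inline bool check_sum(int result, int init_value) {
      int expected = kernel(init_value) * 8000;  // one kernel step, scaled
      return result == expected;
  }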
With GCC 4.6, OTOH, the above loop is not vectorized (only the slow,
non-unrolled loop is):

  :
  # result_622 = PHI
  # ivtmp.852_1026 = PHI
  D.9283_3302 = (void *) ivtmp.852_1026;
  temp_801 = MEM[base: D.9283_3302, offset: 0B];
  D.8366_802 = temp_801 + 12345;
  D.8365_803 = D.8366_802 * 914237;
  D.8368_804 = D.8365_803 + 12332;
  D.8367_805 = D.8368_804 * 914237;
  D.8370_806 = D.8367_805 + 12332;
  D.8369_807 = D.8370_806 * 914237;
  temp_808 = D.8369_807 + -13;
  result_810 = temp_808 + result_622;
  temp_815 = MEM[base: D.9283_3302, offset: 4B];
  D.8381_816 = temp_815 + 12345;
  D.8382_817 = D.8381_816 * 914237;
  D.8378_818 = D.8382_817 + 12332;
  D.8379_819 = D.8378_818 * 914237;
  D.8376_820 = D.8379_819 + 12332;
  D.8377_821 = D.8376_820 * 914237;
  temp_822 = D.8377_821 + -13;
  result_824 = result_810 + temp_822;
  temp_788 = MEM[base: D.9283_3302, offset: 8B];
  D.8351_789 = temp_788 + 12345;
  D.8352_790 = D.8351_789 * 914237;
  D.8348_791 = D.8352_790 + 12332;
  D.8349_792 = D.8348_791 * 914237;
  D.8346_793 = D.8349_792 + 12332;
  D.8347_794 = D.8346_793 * 914237;
  temp_795 = D.8347_794 + -13;
  result_797 = temp_795 + result_824;
  temp_774 = MEM[base: D.9283_3302, offset: 12B];
  D.8333_775 = temp_774 + 12345;
  D.8334_776 = D.8333_775 * 914237;
  D.8330_777 = D.8334_776 + 12332;
  D.8331_778 = D.8330_777 * 914237;
  D.8328_779 = D.8331_778 + 12332;
  D.8329_780 = D.8328_779 * 914237;
  temp_781 = D.8329_780 + -13;
  result_783 = temp_781 + result_797;
  temp_760 = MEM[base: D.9283_3302, offset: 16B];
  D.8315_761 = temp_760 + 12345;
  D.8316_762 = D.8315_761 * 914237;
  D.8312_763 = D.8316_762 + 12332;
  D.8313_764 = D.8312_763 * 914237;
  D.8310_765 = D.8313_764 + 12332;
  D.8311_766 = D.8310_765 * 914237;
  temp_767 = D.8311_766 + -13;
  result_769 = temp_767 + result_783;
  temp_746 = MEM[base: D.9283_3302, offset: 20B];
  D.8297_747 = temp_746 + 12345;
  D.8298_748 = D.8297_747 * 914237;
  D.8294_749 = D.8298_748 + 12332;
  D.8295_750 = D.8294_749 * 914237;
  D.8292_751 = D.8295_750 + 12332;
  D.8293_752 = D.8292_751 * 914237;
  temp_753 = D.8293_752 + -13;
  result_755 = temp_753 + result_769;
  temp_732 = MEM[base: D.9283_3302, offset: 24B];
  D.8279_733 = temp_732 + 12345;
  D.8280_734 = D.8279_733 * 914237;
  D.8276_735 = D.8280_734 + 12332;
  D.8277_736 = D.8276_735 * 914237;
  D.8274_737 = D.8277_736 + 12332;
  D.8275_738 = D.8274_737 * 914237;
  temp_739 = D.8275_738 + -13;
  result_741 = temp_739 + result_755;
  temp_695 = MEM[base: D.9283_3302, offset: 28B];
  D.8246_696 = temp_695 + 12345;
  D.8245_697 = D.8246_696 * 914237;
  D.8248_698 = D.8245_697 + 12332;
  D.8247_699 = D.8248_698 * 914237;
  D.8250_700 = D.8247_699 + 12332;
  D.8249_701 = D.8250_700 * 914237;
  temp_702 = D.8249_701 + -13;
  result_704 = temp_702 + result_741;
  ivtmp.852_842 = ivtmp.852_1026 + 32;
  if (ivtmp.852_842 != D.9292_3369)
    goto ;
  else
    goto ;

  :
  # result_3198 = PHI
  init_value.7_825 = init_value;
  D.8393_826 = (int) init_value.7_825;
  D.8387_827 = D.8393_826 + 12345;
  D.8386_828 = D.8387_827 * 914237;
  D.8389_829 = D.8386_828 + 12332;
  D.8388_830 = D.8389_829 * 914237;
  D.8391_831 = D.8388_830 + 12332;
  D.8390_832 = D.8391_831 * 914237;
  D.8392_833 = D.8390_832 + -13;
  temp_834 = D.8392_833 * 8000;
  if (temp_834 != result_3198)
    goto ;
  else
    goto ;

With -fno-tree-vectorize the performance is the same; it seems that
vectorization is not profitable here for some reason. The same behavior can
be observed with GCC 4.8. I used the preprocessed source for 4.7 from the
ZIP file.
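For reference, the comparison that isolates the vectorizer is just a rebuild
with the flag; loop_unroll.ii here is a placeholder for the preprocessed
source from the ZIP file:

  g++ -O3 -o unroll-vec loop_unroll.ii
  g++ -O3 -fno-tree-vectorize -o unroll-novec loop_unroll.ii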
The code generated is odd, to say the least; the inner loop looks like:

        movdqa  .LC6(%rip), %xmm3
        xorl    %ebx, %ebx
        movdqa  .LC7(%rip), %xmm0
        movdqa  .LC8(%rip), %xmm1
        movdqa  .LC9(%rip), %xmm2
        .p2align 4,,10
        .p2align 3
.L51:
        pxor    %xmm6, %xmm6
        movl    $data32, %eax
        movdqa  %xmm6, %xmm7
        .p2align 4,,10
        .p2align 3
.L53:
        movdqa  (%rax), %xmm4
        movdqa  %xmm0, %xmm8
        paddd   %xmm3, %xmm4
        movdqa  %xmm4, %xmm5
        psrldq  $4, %xmm4
        psrldq  $4, %xmm8
        pmuludq %xmm8, %xmm4
        pshufd  $8, %xmm4, %xmm4
        pmuludq %xmm0, %xmm5
        pshufd  $8, %xmm5, %xmm5
        movdqa  %xmm0, %xmm8
        psrldq  $4, %xmm8
        punpckldq %xmm4, %xmm5
        paddd   %xmm1, %xmm5
        movdqa  %xmm5, %xmm4
        psrldq  $4, %xmm5
        pmuludq %xmm8, %xmm5
        pshufd  $8, %xmm5, %xmm5
        pmuludq %xmm0, %xmm4
        pshufd  $8, %xmm4, %xmm4
        punpckldq %xmm5, %xmm4
        movdqa  %xmm0, %xmm5
        paddd   %xmm1, %xmm4
        movdqa  %xmm4, %xmm8
        psrldq  $4, %xmm5
        psrldq  $4, %xmm4
        pmuludq %xmm4, %xmm5
        pshufd  $8, %xmm5, %xmm5
        pmuludq %xmm0, %xmm8
        pshufd  $8, %xmm8, %xmm4
        movdqa  %xmm0, %xmm8
        psrldq  $4, %xmm8
        punpckldq %xmm5, %xmm4
        paddd   %xmm2, %xmm4
        paddd   %xmm4, %xmm7
        movdqa  16(%rax), %xmm4
        addq    $32, %rax
        paddd   %xmm3, %xmm4
        movdqa  %xmm4, %xmm5
        psrldq  $4, %xmm4
        pmuludq %xmm8, %xmm4
        pshufd  $8, %xmm4, %xmm4
        movdqa  %xmm0, %xmm8
        pmuludq %xmm0, %xmm5
        pshufd  $8, %xmm5, %xmm5
        cmpq    $data32+32000, %rax
        psrldq  $4, %xmm8
        punpckldq %xmm4, %xmm5
        paddd   %xmm1, %xmm5
        movdqa  %xmm5, %xmm4
        psrldq  $4, %xmm5
        pmuludq %xmm8, %xmm5
        pshufd  $8, %xmm5, %xmm5
        pmuludq %xmm0, %xmm4
        pshufd  $8, %xmm4, %xmm4
        punpckldq %xmm5, %xmm4
        movdqa  %xmm0, %xmm5
        paddd   %xmm1, %xmm4
        movdqa  %xmm4, %xmm8
        psrldq  $4, %xmm5
        psrldq  $4, %xmm4
        pmuludq %xmm4, %xmm5
        pshufd  $8, %xmm5, %xmm5
        pmuludq %xmm0, %xmm8
        pshufd  $8, %xmm8, %xmm4
        punpckldq %xmm5, %xmm4
        paddd   %xmm2, %xmm4
        paddd   %xmm4, %xmm6
        jne     .L53

which means we expand the multiplications with the constants in an odd way.
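Concretely, that expansion is the generic SSE2 lowering of a packed 32-bit
multiply: pmulld only exists from SSE4.1 on, so each vector multiply by
914237 is synthesized from two pmuludq (32x32->64 on the even lanes) plus
psrldq/pshufd/punpckldq to cover the odd lanes and repack the low halves.
A minimal intrinsics rendering of the pattern, as a sketch rather than the
compiler's literal output:

  #include <emmintrin.h>

  // SSE2 emulation of a 4 x 32-bit multiply, mirroring the
  // pmuludq/pshufd/punpckldq sequence in the loop above.
  static __m128i mul_epi32_sse2(__m128i a, __m128i b) {
      __m128i even = _mm_mul_epu32(a, b);                 // a0*b0, a2*b2 (64-bit)
      __m128i odd  = _mm_mul_epu32(_mm_srli_si128(a, 4),  // a1*b1, a3*b3
                                   _mm_srli_si128(b, 4));
      even = _mm_shuffle_epi32(even, _MM_SHUFFLE(0, 0, 2, 0)); // keep low dwords
      odd  = _mm_shuffle_epi32(odd,  _MM_SHUFFLE(0, 0, 2, 0));
      return _mm_unpacklo_epi32(even, odd);               // lanes 0..3 in order
  }

Each of the three multiplies in the kernel pays for those extra shuffles and
moves, which presumably is what makes the vectorized loop lose to the scalar
one here.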