Date: Wed, 25 Feb 2009 13:53:00 -0000
From: "matz at gcc dot gnu dot org"
Reply-To: gcc-bugzilla@gcc.gnu.org
To: gcc-bugs@gcc.gnu.org
Subject: [Bug tree-optimization/39300] vectorizer confused by predictive commoning
Message-ID: <20090225135321.2637.qmail@sourceware.org>

------- Comment #1 from matz at gcc dot gnu dot org  2009-02-25 13:53 -------
For reference, Intel Fortran (11.0) produces three loops, one of which uses
predictive commoning (that version is taken when there are only a few
iterations):

..B1.7:                         # Preds ..B1.6
        movsd     8(%r8), %xmm1                                 #13.52
        movsd     (%r8), %xmm0                                  #13.52
                                # LOE rax rdx rcx rbx rbp rsi rdi r8 r9 r12 r13 r14 r15 xmm0 xmm1
..B1.8:                         # Preds ..B1.8 ..B1.7
        movaps    %xmm1, %xmm2                                  #13.33
        movsd     16(%r8,%rdi,8), %xmm3                         #13.52
        addsd     %xmm3, %xmm2                                  #13.33
        addsd     %xmm0, %xmm2                                  #13.41
        movaps    %xmm1, %xmm0                                  #14.7
        movaps    %xmm3, %xmm1                                  #14.7
        addsd     8(%rdx,%rdi,8), %xmm2                         #13.9
        movsd     %xmm2, 8(%rcx,%rdi,8)                         #13.9
        incq      %rdi                                          #14.7
        cmpq      %rax, %rdi                                    #14.7
        jl        ..B1.8        # Prob 82%                      #14.7

The other two loops are vectorized (and unrolled four times, i.e. eight
elements per iteration), but do _not_ use anything like predictive commoning
(i.e. no cross-iteration values). They are just two versions of the same
loop, one for aligned destinations and the other for unaligned ones. The
aligned variant is this:

..B1.15:                        # Preds ..B1.10 ..B1.15
        movsd     8(%rdx,%rax,8), %xmm1                         #13.18
        movhpd    16(%rdx,%rax,8), %xmm1                        #13.18
        movsd     8(%r8,%rax,8), %xmm0                          #13.34
        movhpd    16(%r8,%rax,8), %xmm0                         #13.34
        movsd     24(%rdx,%rax,8), %xmm4                        #13.18
        movhpd    32(%rdx,%rax,8), %xmm4                        #13.18
        movsd     24(%r8,%rax,8), %xmm2                         #13.34
        movhpd    32(%r8,%rax,8), %xmm2                         #13.34
        movsd     40(%rdx,%rax,8), %xmm7                        #13.18
        movhpd    48(%rdx,%rax,8), %xmm7                        #13.18
        movsd     40(%r8,%rax,8), %xmm5                         #13.34
        movhpd    48(%r8,%rax,8), %xmm5                         #13.34
        movsd     56(%rdx,%rax,8), %xmm10                       #13.18
        movhpd    64(%rdx,%rax,8), %xmm10                       #13.18
        movsd     56(%r8,%rax,8), %xmm8                         #13.34
        movhpd    64(%r8,%rax,8), %xmm8                         #13.34
        addpd     %xmm0, %xmm1                                  #13.33
        addpd     (%r8,%rax,8), %xmm1                           #13.41
        addpd     %xmm2, %xmm4                                  #13.33
        addpd     %xmm5, %xmm7                                  #13.33
        addpd     %xmm8, %xmm10                                 #13.33
        movaps    16(%r8,%rax,8), %xmm3                         #13.52
        addpd     %xmm3, %xmm1                                  #13.9
        movaps    32(%r8,%rax,8), %xmm6                         #13.52
        movaps    48(%r8,%rax,8), %xmm9                         #13.52
        movaps    %xmm1, 8(%rcx,%rax,8)                         #13.9
        addpd     %xmm3, %xmm4                                  #13.41
        addpd     %xmm6, %xmm4                                  #13.9
        movaps    %xmm4, 24(%rcx,%rax,8)                        #13.9
        addpd     %xmm6, %xmm7                                  #13.41
        addpd     %xmm9, %xmm7                                  #13.9
        movaps    %xmm7, 40(%rcx,%rax,8)                        #13.9
        addpd     %xmm9, %xmm10                                 #13.41
        addpd     64(%r8,%rax,8), %xmm10                        #13.9
        movaps    %xmm10, 56(%rcx,%rax,8)                       #13.9
        addq      $8, %rax                                      #14.7
        cmpq      %r9, %rax                                     #14.7
        jl        ..B1.15       # Prob 82%                      #14.7

Not optimal, since it does not use cross-iteration values to save two loads
per iteration, but still much better than what GCC generates.

--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=39300
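For illustration, here is a minimal sketch of the loop shape the assembly
above suggests: a three-point stencil over a plus a second input array c.
The subroutine names, argument order, and exact expression are assumptions
reconstructed from the #13.x source annotations, not taken from the PR's
testcase. The second subroutine spells out by hand what predictive
commoning does to the first:

      ! Hypothetical loop of the shape suggested by the assembly above.
      subroutine stencil(b, a, c, n)
        implicit none
        integer, intent(in) :: n
        double precision, intent(in)  :: a(n), c(n)
        double precision, intent(out) :: b(n)
        integer :: i
        do i = 2, n - 1
           ! Three loads from a per iteration, two of them redundant
           ! with values already loaded in earlier iterations.
           b(i) = c(i) + a(i-1) + a(i) + a(i+1)
        end do
      end subroutine stencil

      ! The same loop after predictive commoning, written by hand:
      ! a(i-1) and a(i) are carried across iterations in rotating
      ! scalars, so each iteration issues only one load from a.
      subroutine stencil_pcom(b, a, c, n)
        implicit none
        integer, intent(in) :: n
        double precision, intent(in)  :: a(n), c(n)
        double precision, intent(out) :: b(n)
        double precision :: a0, a1, a2
        integer :: i
        if (n < 3) return
        a0 = a(1)                     ! a(i-1) for i = 2
        a1 = a(2)                     ! a(i)   for i = 2
        do i = 2, n - 1
           a2 = a(i+1)                ! the only load from a
           b(i) = c(i) + a0 + a1 + a2
           a0 = a1                    ! rotate the carried values
           a1 = a2
        end do
      end subroutine stencil_pcom

The two movaps register copies in ..B1.8 (%xmm1 -> %xmm0 and %xmm3 ->
%xmm1) are exactly this a0/a1 rotation, and the header block ..B1.7 is
the initialization of a0/a1 before the loop.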