[Bug tree-optimization/63599] New: "wrong" branch optimization with Ofast in a loop

public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed

* [Bug tree-optimization/63599] New: "wrong" branch optimization with Ofast in a loop
@ 2014-10-20  7:35 vincenzo.innocente at cern dot ch
  2014-10-20  7:39 ` [Bug target/63599] " pinskia at gcc dot gnu.org
                   ` (4 more replies)
  0 siblings, 5 replies; 6+ messages in thread
From: vincenzo.innocente at cern dot ch @ 2014-10-20  7:35 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63599

            Bug ID: 63599
           Summary: "wrong" branch optimization with Ofast in a loop
           Product: gcc
           Version: 5.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: vincenzo.innocente at cern dot ch

given this code

#include <x86intrin.h>

typedef float __attribute__( ( vector_size( 16 ) ) ) float32x4_t;

inline
float32x4_t atan(float32x4_t t) {
  constexpr float PIO4F = 0.7853981633974483096f;
  float32x4_t high = t > 0.4142135623730950f;
  auto z = t;
  float32x4_t ret={0.f,0.f,0.f,0.f};
    // if all low no need to blend
  if ( _mm_movemask_ps(high) != 0) {
    z   = ( t > 0.4142135623730950f ) ? (t-1.0f)/(t+1.0f) : t;
    ret = ( t > 0.4142135623730950f ) ? ret+PIO4F : ret;
  }
  /* polynomial removed */
  return  ret += z;
}


float32x4_t doAtan(float32x4_t z) { return atan(z);}

float32x4_t va[1024];
float32x4_t vb[1024];

void computeV() {
  for (int i=0;i!=1024;++i)
    vb[i]=atan(va[i]);
}


compiled with -Ofast
c++ -S -std=c++1y -Ofast bugmvmk.cc -march=nehalem; cat bugmvmk.s
produces the following code where the "movmskps    %xmm8, %edx"
does not protect the code in the if block...

__Z8computeVv:
LFB2512:
    movaps    LC0(%rip), %xmm4
    xorl    %eax, %eax
    movaps    LC1(%rip), %xmm7
    leaq    _va(%rip), %rcx
    movaps    LC2(%rip), %xmm6
    movaps    LC3(%rip), %xmm5
    .align 4,0x90
L10:
    movaps    (%rcx,%rax), %xmm2
    movaps    %xmm4, %xmm8
    movaps    %xmm2, %xmm3
    cmpltps    %xmm2, %xmm8
    movaps    %xmm2, %xmm1
    addps    %xmm6, %xmm3
    addps    %xmm7, %xmm1
    movmskps    %xmm8, %edx
    andps    %xmm5, %xmm8
    rcpps    %xmm3, %xmm0
    mulps    %xmm0, %xmm3
    mulps    %xmm0, %xmm3
    addps    %xmm0, %xmm0
    subps    %xmm3, %xmm0
    mulps    %xmm0, %xmm1
    movaps    %xmm2, %xmm0
    cmpleps    %xmm4, %xmm0
    blendvps    %xmm0, %xmm2, %xmm1
    pxor    %xmm0, %xmm0
    testl    %edx, %edx
    je    L7
    movaps    %xmm8, %xmm0
L7:
    testl    %edx, %edx
    je    L9
    movaps    %xmm1, %xmm2
L9:
    addps    %xmm0, %xmm2
    leaq    _vb(%rip), %rdx
    movaps    %xmm2, (%rdx,%rax)
    addq    $16, %rax
    cmpq    $16384, %rax
    jne    L10
    ret

while with O2 it is ok
__Z8computeVv:
LFB2512:
    movaps    LC0(%rip), %xmm4
    xorl    %eax, %eax
    movaps    LC1(%rip), %xmm7
    leaq    _va(%rip), %rsi
    movaps    LC2(%rip), %xmm6
    leaq    _vb(%rip), %rcx
    movaps    LC3(%rip), %xmm5
    .align 4,0x90
L7:
    movaps    (%rsi,%rax), %xmm1
    movaps    %xmm4, %xmm0
    pxor    %xmm2, %xmm2
    cmpltps    %xmm1, %xmm0
    movmskps    %xmm0, %edx
    testl    %edx, %edx
    je    L6
    movaps    %xmm1, %xmm3
    movaps    %xmm1, %xmm2
    addps    %xmm6, %xmm2
    addps    %xmm7, %xmm3
    divps    %xmm2, %xmm3
    movaps    %xmm0, %xmm2
    andps    %xmm5, %xmm2
    blendvps    %xmm0, %xmm3, %xmm1
L6:
    addps    %xmm2, %xmm1
    movaps    %xmm1, (%rcx,%rax)
    addq    $16, %rax
    cmpq    $16384, %rax
    jne    L7
    ret

note that the function not in the loop (doAtan) is ok with both O2 and Ofast


^ permalink raw reply	[flat|nested] 6+ messages in thread

* [Bug target/63599] "wrong" branch optimization with Ofast in a loop
  2014-10-20  7:35 [Bug tree-optimization/63599] New: "wrong" branch optimization with Ofast in a loop vincenzo.innocente at cern dot ch
@ 2014-10-20  7:39 ` pinskia at gcc dot gnu.org
  2014-10-20  9:17 ` vincenzo.innocente at cern dot ch
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: pinskia at gcc dot gnu.org @ 2014-10-20  7:39 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63599

--- Comment #1 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
The tree level looks like this:
  t_13 = VEC_COND_EXPR <t_4 <= { 4.142135679721832275390625e-1,
4.142135679721832275390625e-1, 4.142135679721832275390625e-1,
4.142135679721832275390625e-1 }, t_4, _12>;
  ret_14 = VEC_COND_EXPR <t_4 > { 4.142135679721832275390625e-1,
4.142135679721832275390625e-1, 4.142135679721832275390625e-1,
4.142135679721832275390625e-1 }, { 7.85398185253143310546875e-1,
7.85398185253143310546875e-1, 7.85398185253143310546875e-1,
7.85398185253143310546875e-1 }, { 0.0, 0.0, 0.0, 0.0 }>;
  t_16 = _9 != 0 ? t_13 : t_4;
  ret_15 = _9 != 0 ? ret_14 : { 0.0, 0.0, 0.0, 0.0 };


>"movmskps	%xmm8, %edx"
> does not protect the code in the if block...
Yes it does just not the way you think it does.

Notice the last two statements are conditional expressions.

And that gets translated into the following:
    testl    %edx, %edx
    jne    .L9
    movaps    %xmm3, %xmm1
    pxor    %xmm2, %xmm2
.L9:

So if anything it is a missed optimization dealing with conditional moves with
vectors without a vector comparison.
>From gcc-bugs-return-464480-listarch-gcc-bugs=gcc.gnu.org@gcc.gnu.org Mon Oct 20 07:39:48 2014
Return-Path: <gcc-bugs-return-464480-listarch-gcc-bugs=gcc.gnu.org@gcc.gnu.org>
Delivered-To: listarch-gcc-bugs@gcc.gnu.org
Received: (qmail 21144 invoked by alias); 20 Oct 2014 07:39:48 -0000
Mailing-List: contact gcc-bugs-help@gcc.gnu.org; run by ezmlm
Precedence: bulk
List-Id: <gcc-bugs.gcc.gnu.org>
List-Archive: <http://gcc.gnu.org/ml/gcc-bugs/>
List-Post: <mailto:gcc-bugs@gcc.gnu.org>
List-Help: <mailto:gcc-bugs-help@gcc.gnu.org>
Sender: gcc-bugs-owner@gcc.gnu.org
Delivered-To: mailing list gcc-bugs@gcc.gnu.org
Received: (qmail 20850 invoked by uid 55); 20 Oct 2014 07:39:44 -0000
From: "rguenther at suse dot de" <gcc-bugzilla@gcc.gnu.org>
To: gcc-bugs@gcc.gnu.org
Subject: [Bug tree-optimization/54488] tree loop invariant motion uses an excessive amount of memory
Date: Mon, 20 Oct 2014 07:44:00 -0000
X-Bugzilla-Reason: CC
X-Bugzilla-Type: changed
X-Bugzilla-Watch-Reason: None
X-Bugzilla-Product: gcc
X-Bugzilla-Component: tree-optimization
X-Bugzilla-Version: 4.8.0
X-Bugzilla-Keywords: memory-hog
X-Bugzilla-Severity: normal
X-Bugzilla-Who: rguenther at suse dot de
X-Bugzilla-Status: UNCONFIRMED
X-Bugzilla-Priority: P3
X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org
X-Bugzilla-Target-Milestone: ---
X-Bugzilla-Flags:
X-Bugzilla-Changed-Fields:
Message-ID: <bug-54488-4-RbaZEXwv7V@http.gcc.gnu.org/bugzilla/>
In-Reply-To: <bug-54488-4@http.gcc.gnu.org/bugzilla/>
References: <bug-54488-4@http.gcc.gnu.org/bugzilla/>
Content-Type: text/plain; charset="UTF-8"
Content-Transfer-Encoding: 7bit
X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/
Auto-Submitted: auto-generated
MIME-Version: 1.0
X-SW-Source: 2014-10/txt/msg01501.txt.bz2
Content-length: 638

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54488

--- Comment #6 from rguenther at suse dot de <rguenther at suse dot de> ---
On Sun, 19 Oct 2014, evgeniya.maenkova at gmail dot com wrote:

> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54488
>
> --- Comment #5 from Evgeniya Maenkova <evgeniya.maenkova at gmail dot com> ---
> Also, I collect massif data and see no tree-ssa-lim in it (i mean in top
> contributors).
>
> So what do you think?
>
> (How did you measure 1,8Gb caused by lim? - this is for me to understand
> whether this bug is actual or not)

I basically watched 'top' with breakpoints at the start and end of LIM.


^ permalink raw reply	[flat|nested] 6+ messages in thread

* [Bug target/63599] "wrong" branch optimization with Ofast in a loop
  2014-10-20  7:35 [Bug tree-optimization/63599] New: "wrong" branch optimization with Ofast in a loop vincenzo.innocente at cern dot ch
  2014-10-20  7:39 ` [Bug target/63599] " pinskia at gcc dot gnu.org
@ 2014-10-20  9:17 ` vincenzo.innocente at cern dot ch
  2014-10-20 10:07 ` glisse at gcc dot gnu.org
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: vincenzo.innocente at cern dot ch @ 2014-10-20  9:17 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63599

--- Comment #2 from vincenzo Innocente <vincenzo.innocente at cern dot ch> ---
I agree that the code produces correct results, but it looks sub-optimal to me.
I understand that with Ofast the sequence below will be always executed

    andps    %xmm5, %xmm8
    rcpps    %xmm3, %xmm0
    mulps    %xmm0, %xmm3
    mulps    %xmm0, %xmm3
    addps    %xmm0, %xmm0
    subps    %xmm3, %xmm0
    mulps    %xmm0, %xmm1
    movaps    %xmm2, %xmm0
    cmpleps    %xmm4, %xmm0
    blendvps    %xmm0, %xmm2, %xmm1

while with O2 it will not.
This generates a performance penalty for samples where the test is often
false.
( I tried to add __builtin_expect(x, false) with no effect. )


^ permalink raw reply	[flat|nested] 6+ messages in thread

* [Bug target/63599] "wrong" branch optimization with Ofast in a loop
  2014-10-20  7:35 [Bug tree-optimization/63599] New: "wrong" branch optimization with Ofast in a loop vincenzo.innocente at cern dot ch
  2014-10-20  7:39 ` [Bug target/63599] " pinskia at gcc dot gnu.org
  2014-10-20  9:17 ` vincenzo.innocente at cern dot ch
@ 2014-10-20 10:07 ` glisse at gcc dot gnu.org
  2014-10-20 17:29 ` jakub at gcc dot gnu.org
  2024-03-12 23:06 ` pinskia at gcc dot gnu.org
  4 siblings, 0 replies; 6+ messages in thread
From: glisse at gcc dot gnu.org @ 2014-10-20 10:07 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63599

--- Comment #3 from Marc Glisse <glisse at gcc dot gnu.org> ---
ifcvt making a transformation that doesn't help vectorization and ends up
pessimizing the code... not really the first time this happens. I believe Jakub
had a big patch for that, but it never got in. Maybe vectors could be
special-cased if we never vectorize them anyway.


^ permalink raw reply	[flat|nested] 6+ messages in thread

* [Bug target/63599] "wrong" branch optimization with Ofast in a loop
  2014-10-20  7:35 [Bug tree-optimization/63599] New: "wrong" branch optimization with Ofast in a loop vincenzo.innocente at cern dot ch
                   ` (2 preceding siblings ...)
  2014-10-20 10:07 ` glisse at gcc dot gnu.org
@ 2014-10-20 17:29 ` jakub at gcc dot gnu.org
  2024-03-12 23:06 ` pinskia at gcc dot gnu.org
  4 siblings, 0 replies; 6+ messages in thread
From: jakub at gcc dot gnu.org @ 2014-10-20 17:29 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63599

Jakub Jelinek <jakub at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |jakub at gcc dot gnu.org

--- Comment #4 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
The big patch got committed in, but generally turning off tree if-conversion
didn't turn out to be a win, so what ended up being committed is: only if there
are any masked loads/stores, if-conversion applies only to the vectorized loop
and nothing else.


^ permalink raw reply	[flat|nested] 6+ messages in thread

* [Bug target/63599] "wrong" branch optimization with Ofast in a loop
  2014-10-20  7:35 [Bug tree-optimization/63599] New: "wrong" branch optimization with Ofast in a loop vincenzo.innocente at cern dot ch
                   ` (3 preceding siblings ...)
  2014-10-20 17:29 ` jakub at gcc dot gnu.org
@ 2024-03-12 23:06 ` pinskia at gcc dot gnu.org
  4 siblings, 0 replies; 6+ messages in thread
From: pinskia at gcc dot gnu.org @ 2024-03-12 23:06 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63599

Andrew Pinski <pinskia at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
      Known to fail|                            |5.1.0, 6.1.0
           Keywords|wrong-code                  |missed-optimization
             Status|UNCONFIRMED                 |RESOLVED
      Known to work|                            |7.1.0
         Resolution|---                         |FIXED
   Target Milestone|---                         |7.0

--- Comment #5 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
Fixed for GCC 7.

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2024-03-12 23:06 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-10-20  7:35 [Bug tree-optimization/63599] New: "wrong" branch optimization with Ofast in a loop vincenzo.innocente at cern dot ch
2014-10-20  7:39 ` [Bug target/63599] " pinskia at gcc dot gnu.org
2014-10-20  9:17 ` vincenzo.innocente at cern dot ch
2014-10-20 10:07 ` glisse at gcc dot gnu.org
2014-10-20 17:29 ` jakub at gcc dot gnu.org
2024-03-12 23:06 ` pinskia at gcc dot gnu.org

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).