public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug tree-optimization/63599] New: "wrong" branch optimization with Ofast in a loop
@ 2014-10-20 7:35 vincenzo.innocente at cern dot ch
2014-10-20 7:39 ` [Bug target/63599] " pinskia at gcc dot gnu.org
` (4 more replies)
0 siblings, 5 replies; 6+ messages in thread
From: vincenzo.innocente at cern dot ch @ 2014-10-20 7:35 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63599
Bug ID: 63599
Summary: "wrong" branch optimization with Ofast in a loop
Product: gcc
Version: 5.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: vincenzo.innocente at cern dot ch
given this code
#include <x86intrin.h>
typedef float __attribute__( ( vector_size( 16 ) ) ) float32x4_t;
inline
float32x4_t atan(float32x4_t t) {
constexpr float PIO4F = 0.7853981633974483096f;
float32x4_t high = t > 0.4142135623730950f;
auto z = t;
float32x4_t ret={0.f,0.f,0.f,0.f};
// if all low no need to blend
if ( _mm_movemask_ps(high) != 0) {
z = ( t > 0.4142135623730950f ) ? (t-1.0f)/(t+1.0f) : t;
ret = ( t > 0.4142135623730950f ) ? ret+PIO4F : ret;
}
/* polynomial removed */
return ret += z;
}
float32x4_t doAtan(float32x4_t z) { return atan(z);}
float32x4_t va[1024];
float32x4_t vb[1024];
void computeV() {
for (int i=0;i!=1024;++i)
vb[i]=atan(va[i]);
}
compiled with -Ofast
c++ -S -std=c++1y -Ofast bugmvmk.cc -march=nehalem; cat bugmvmk.s
produces the following code where the "movmskps %xmm8, %edx"
does not protect the code in the if block...
__Z8computeVv:
LFB2512:
movaps LC0(%rip), %xmm4
xorl %eax, %eax
movaps LC1(%rip), %xmm7
leaq _va(%rip), %rcx
movaps LC2(%rip), %xmm6
movaps LC3(%rip), %xmm5
.align 4,0x90
L10:
movaps (%rcx,%rax), %xmm2
movaps %xmm4, %xmm8
movaps %xmm2, %xmm3
cmpltps %xmm2, %xmm8
movaps %xmm2, %xmm1
addps %xmm6, %xmm3
addps %xmm7, %xmm1
movmskps %xmm8, %edx
andps %xmm5, %xmm8
rcpps %xmm3, %xmm0
mulps %xmm0, %xmm3
mulps %xmm0, %xmm3
addps %xmm0, %xmm0
subps %xmm3, %xmm0
mulps %xmm0, %xmm1
movaps %xmm2, %xmm0
cmpleps %xmm4, %xmm0
blendvps %xmm0, %xmm2, %xmm1
pxor %xmm0, %xmm0
testl %edx, %edx
je L7
movaps %xmm8, %xmm0
L7:
testl %edx, %edx
je L9
movaps %xmm1, %xmm2
L9:
addps %xmm0, %xmm2
leaq _vb(%rip), %rdx
movaps %xmm2, (%rdx,%rax)
addq $16, %rax
cmpq $16384, %rax
jne L10
ret
while with O2 is ok
__Z8computeVv:
LFB2512:
movaps LC0(%rip), %xmm4
xorl %eax, %eax
movaps LC1(%rip), %xmm7
leaq _va(%rip), %rsi
movaps LC2(%rip), %xmm6
leaq _vb(%rip), %rcx
movaps LC3(%rip), %xmm5
.align 4,0x90
L7:
movaps (%rsi,%rax), %xmm1
movaps %xmm4, %xmm0
pxor %xmm2, %xmm2
cmpltps %xmm1, %xmm0
movmskps %xmm0, %edx
testl %edx, %edx
je L6
movaps %xmm1, %xmm3
movaps %xmm1, %xmm2
addps %xmm6, %xmm2
addps %xmm7, %xmm3
divps %xmm2, %xmm3
movaps %xmm0, %xmm2
andps %xmm5, %xmm2
blendvps %xmm0, %xmm3, %xmm1
L6:
addps %xmm2, %xmm1
movaps %xmm1, (%rcx,%rax)
addq $16, %rax
cmpq $16384, %rax
jne L7
ret
note that the function not in the loop (doAtan) is ok with both O2 and Ofast
^ permalink raw reply [flat|nested] 6+ messages in thread
* [Bug target/63599] "wrong" branch optimization with Ofast in a loop
2014-10-20 7:35 [Bug tree-optimization/63599] New: "wrong" branch optimization with Ofast in a loop vincenzo.innocente at cern dot ch
@ 2014-10-20 7:39 ` pinskia at gcc dot gnu.org
2014-10-20 9:17 ` vincenzo.innocente at cern dot ch
` (3 subsequent siblings)
4 siblings, 0 replies; 6+ messages in thread
From: pinskia at gcc dot gnu.org @ 2014-10-20 7:39 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63599
--- Comment #1 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
The tree level looks like this:
t_13 = VEC_COND_EXPR <t_4 <= { 4.142135679721832275390625e-1,
4.142135679721832275390625e-1, 4.142135679721832275390625e-1,
4.142135679721832275390625e-1 }, t_4, _12>;
ret_14 = VEC_COND_EXPR <t_4 > { 4.142135679721832275390625e-1,
4.142135679721832275390625e-1, 4.142135679721832275390625e-1,
4.142135679721832275390625e-1 }, { 7.85398185253143310546875e-1,
7.85398185253143310546875e-1, 7.85398185253143310546875e-1,
7.85398185253143310546875e-1 }, { 0.0, 0.0, 0.0, 0.0 }>;
t_16 = _9 != 0 ? t_13 : t_4;
ret_15 = _9 != 0 ? ret_14 : { 0.0, 0.0, 0.0, 0.0 };
>"movmskps %xmm8, %edx"
> does not protect the code in the if block...
Yes it does just not the way you think it does.
Notice the last two statements are conditional expressions.
And that gets translated into the following:
testl %edx, %edx
jne .L9
movaps %xmm3, %xmm1
pxor %xmm2, %xmm2
.L9:
So if anything it is a missed optimization dealing with conditional moves with
vectors without a vector comparison.
>From gcc-bugs-return-464480-listarch-gcc-bugs=gcc.gnu.org@gcc.gnu.org Mon Oct 20 07:39:48 2014
Return-Path: <gcc-bugs-return-464480-listarch-gcc-bugs=gcc.gnu.org@gcc.gnu.org>
Delivered-To: listarch-gcc-bugs@gcc.gnu.org
Received: (qmail 21144 invoked by alias); 20 Oct 2014 07:39:48 -0000
Mailing-List: contact gcc-bugs-help@gcc.gnu.org; run by ezmlm
Precedence: bulk
List-Id: <gcc-bugs.gcc.gnu.org>
List-Archive: <http://gcc.gnu.org/ml/gcc-bugs/>
List-Post: <mailto:gcc-bugs@gcc.gnu.org>
List-Help: <mailto:gcc-bugs-help@gcc.gnu.org>
Sender: gcc-bugs-owner@gcc.gnu.org
Delivered-To: mailing list gcc-bugs@gcc.gnu.org
Received: (qmail 20850 invoked by uid 55); 20 Oct 2014 07:39:44 -0000
From: "rguenther at suse dot de" <gcc-bugzilla@gcc.gnu.org>
To: gcc-bugs@gcc.gnu.org
Subject: [Bug tree-optimization/54488] tree loop invariant motion uses an excessive amount of memory
Date: Mon, 20 Oct 2014 07:44:00 -0000
X-Bugzilla-Reason: CC
X-Bugzilla-Type: changed
X-Bugzilla-Watch-Reason: None
X-Bugzilla-Product: gcc
X-Bugzilla-Component: tree-optimization
X-Bugzilla-Version: 4.8.0
X-Bugzilla-Keywords: memory-hog
X-Bugzilla-Severity: normal
X-Bugzilla-Who: rguenther at suse dot de
X-Bugzilla-Status: UNCONFIRMED
X-Bugzilla-Priority: P3
X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org
X-Bugzilla-Target-Milestone: ---
X-Bugzilla-Flags:
X-Bugzilla-Changed-Fields:
Message-ID: <bug-54488-4-RbaZEXwv7V@http.gcc.gnu.org/bugzilla/>
In-Reply-To: <bug-54488-4@http.gcc.gnu.org/bugzilla/>
References: <bug-54488-4@http.gcc.gnu.org/bugzilla/>
Content-Type: text/plain; charset="UTF-8"
Content-Transfer-Encoding: 7bit
X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/
Auto-Submitted: auto-generated
MIME-Version: 1.0
X-SW-Source: 2014-10/txt/msg01501.txt.bz2
Content-length: 638
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54488
--- Comment #6 from rguenther at suse dot de <rguenther at suse dot de> ---
On Sun, 19 Oct 2014, evgeniya.maenkova at gmail dot com wrote:
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54488
>
> --- Comment #5 from Evgeniya Maenkova <evgeniya.maenkova at gmail dot com> ---
> Also, I collect massif data and see no tree-ssa-lim in it (i mean in top
> contributors).
>
> So what do you think?
>
> (How did you measure 1,8Gb caused by lim? - this is for me to understand
> whether this bug is actual or not)
I basically watched 'top' with breakpoints at the start and end of LIM.
^ permalink raw reply [flat|nested] 6+ messages in thread
* [Bug target/63599] "wrong" branch optimization with Ofast in a loop
2014-10-20 7:35 [Bug tree-optimization/63599] New: "wrong" branch optimization with Ofast in a loop vincenzo.innocente at cern dot ch
2014-10-20 7:39 ` [Bug target/63599] " pinskia at gcc dot gnu.org
@ 2014-10-20 9:17 ` vincenzo.innocente at cern dot ch
2014-10-20 10:07 ` glisse at gcc dot gnu.org
` (2 subsequent siblings)
4 siblings, 0 replies; 6+ messages in thread
From: vincenzo.innocente at cern dot ch @ 2014-10-20 9:17 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63599
--- Comment #2 from vincenzo Innocente <vincenzo.innocente at cern dot ch> ---
I agree that the code produces correct results. It looks to me sub-optimal.
I understand that with Ofast the sequence below will be always executed
andps %xmm5, %xmm8
rcpps %xmm3, %xmm0
mulps %xmm0, %xmm3
mulps %xmm0, %xmm3
addps %xmm0, %xmm0
subps %xmm3, %xmm0
mulps %xmm0, %xmm1
movaps %xmm2, %xmm0
cmpleps %xmm4, %xmm0
blendvps %xmm0, %xmm2, %xmm1
while with O2 it will not.
and this generates a performance penalty for samples where the test is often
false.
( I tried to add __builtin_expect(x, false) with no effect. )
^ permalink raw reply [flat|nested] 6+ messages in thread
* [Bug target/63599] "wrong" branch optimization with Ofast in a loop
2014-10-20 7:35 [Bug tree-optimization/63599] New: "wrong" branch optimization with Ofast in a loop vincenzo.innocente at cern dot ch
2014-10-20 7:39 ` [Bug target/63599] " pinskia at gcc dot gnu.org
2014-10-20 9:17 ` vincenzo.innocente at cern dot ch
@ 2014-10-20 10:07 ` glisse at gcc dot gnu.org
2014-10-20 17:29 ` jakub at gcc dot gnu.org
2024-03-12 23:06 ` pinskia at gcc dot gnu.org
4 siblings, 0 replies; 6+ messages in thread
From: glisse at gcc dot gnu.org @ 2014-10-20 10:07 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63599
--- Comment #3 from Marc Glisse <glisse at gcc dot gnu.org> ---
ifcvt making a transformation that doesn't help vectorization and ends up
pessimizing the code... not really the first time this happens. I believe Jakub
had a big patch for that, but it never got in. Maybe vectors could be
special-cased if we never vectorize them anyway.
^ permalink raw reply [flat|nested] 6+ messages in thread
* [Bug target/63599] "wrong" branch optimization with Ofast in a loop
2014-10-20 7:35 [Bug tree-optimization/63599] New: "wrong" branch optimization with Ofast in a loop vincenzo.innocente at cern dot ch
` (2 preceding siblings ...)
2014-10-20 10:07 ` glisse at gcc dot gnu.org
@ 2014-10-20 17:29 ` jakub at gcc dot gnu.org
2024-03-12 23:06 ` pinskia at gcc dot gnu.org
4 siblings, 0 replies; 6+ messages in thread
From: jakub at gcc dot gnu.org @ 2014-10-20 17:29 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63599
Jakub Jelinek <jakub at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
CC| |jakub at gcc dot gnu.org
--- Comment #4 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
The big patch got committed in, but generally turning off tree if-conversion
didn't turn out to be a win, so what ended up being committed is only if there are
any masked loads/stores, if-conversion applies only to vectorized loop and
nothing else.
^ permalink raw reply [flat|nested] 6+ messages in thread
* [Bug target/63599] "wrong" branch optimization with Ofast in a loop
2014-10-20 7:35 [Bug tree-optimization/63599] New: "wrong" branch optimization with Ofast in a loop vincenzo.innocente at cern dot ch
` (3 preceding siblings ...)
2014-10-20 17:29 ` jakub at gcc dot gnu.org
@ 2024-03-12 23:06 ` pinskia at gcc dot gnu.org
4 siblings, 0 replies; 6+ messages in thread
From: pinskia at gcc dot gnu.org @ 2024-03-12 23:06 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63599
Andrew Pinski <pinskia at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Known to fail| |5.1.0, 6.1.0
Keywords|wrong-code |missed-optimization
Status|UNCONFIRMED |RESOLVED
Known to work| |7.1.0
Resolution|--- |FIXED
Target Milestone|--- |7.0
--- Comment #5 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
Fixed for GCC 7.
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2024-03-12 23:06 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-10-20 7:35 [Bug tree-optimization/63599] New: "wrong" branch optimization with Ofast in a loop vincenzo.innocente at cern dot ch
2014-10-20 7:39 ` [Bug target/63599] " pinskia at gcc dot gnu.org
2014-10-20 9:17 ` vincenzo.innocente at cern dot ch
2014-10-20 10:07 ` glisse at gcc dot gnu.org
2014-10-20 17:29 ` jakub at gcc dot gnu.org
2024-03-12 23:06 ` pinskia at gcc dot gnu.org
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).