From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 17656 invoked by alias); 20 Oct 2014 07:05:52 -0000 Mailing-List: contact gcc-bugs-help@gcc.gnu.org; run by ezmlm Precedence: bulk List-Id: List-Archive: List-Post: List-Help: Sender: gcc-bugs-owner@gcc.gnu.org Received: (qmail 17628 invoked by uid 48); 20 Oct 2014 07:05:43 -0000 From: "vincenzo.innocente at cern dot ch" To: gcc-bugs@gcc.gnu.org Subject: [Bug tree-optimization/63599] New: "wrong" branch optimization with Ofast in a loop Date: Mon, 20 Oct 2014 07:35:00 -0000 X-Bugzilla-Reason: CC X-Bugzilla-Type: new X-Bugzilla-Watch-Reason: None X-Bugzilla-Product: gcc X-Bugzilla-Component: tree-optimization X-Bugzilla-Version: 5.0 X-Bugzilla-Keywords: X-Bugzilla-Severity: normal X-Bugzilla-Who: vincenzo.innocente at cern dot ch X-Bugzilla-Status: UNCONFIRMED X-Bugzilla-Priority: P3 X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org X-Bugzilla-Target-Milestone: --- X-Bugzilla-Flags: X-Bugzilla-Changed-Fields: bug_id short_desc product version bug_status bug_severity priority component assigned_to reporter Message-ID: Content-Type: text/plain; charset="UTF-8" Content-Transfer-Encoding: 7bit X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/ Auto-Submitted: auto-generated MIME-Version: 1.0 X-SW-Source: 2014-10/txt/msg01499.txt.bz2 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63599 Bug ID: 63599 Summary: "wrong" branch optimization with Ofast in a loop Product: gcc Version: 5.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: tree-optimization Assignee: unassigned at gcc dot gnu.org Reporter: vincenzo.innocente at cern dot ch given this code #include <x86intrin.h> typedef float __attribute__( ( vector_size( 16 ) ) ) float32x4_t; inline float32x4_t atan(float32x4_t t) { constexpr float PIO4F = 0.7853981633974483096f; float32x4_t high = t > 0.4142135623730950f; auto z = t; float32x4_t ret={0.f,0.f,0.f,0.f}; // if all low no need to blend if ( _mm_movemask_ps(high) != 0) { z = ( t > 0.4142135623730950f ) ? 
(t-1.0f)/(t+1.0f) : t; ret = ( t > 0.4142135623730950f ) ? ret+PIO4F : ret; } /* polynomial removed */ return ret += z; } float32x4_t doAtan(float32x4_t z) { return atan(z);} float32x4_t va[1024]; float32x4_t vb[1024]; void computeV() { for (int i=0;i!=1024;++i) vb[i]=atan(va[i]); } compiled with -Ofast c++ -S -std=c++1y -Ofast bugmvmk.cc -march=nehalem; cat bugmvmk.s produces the following code where the "movmskps %xmm8, %edx" does not protect the code in the if block... __Z8computeVv: LFB2512: movaps LC0(%rip), %xmm4 xorl %eax, %eax movaps LC1(%rip), %xmm7 leaq _va(%rip), %rcx movaps LC2(%rip), %xmm6 movaps LC3(%rip), %xmm5 .align 4,0x90 L10: movaps (%rcx,%rax), %xmm2 movaps %xmm4, %xmm8 movaps %xmm2, %xmm3 cmpltps %xmm2, %xmm8 movaps %xmm2, %xmm1 addps %xmm6, %xmm3 addps %xmm7, %xmm1 movmskps %xmm8, %edx andps %xmm5, %xmm8 rcpps %xmm3, %xmm0 mulps %xmm0, %xmm3 mulps %xmm0, %xmm3 addps %xmm0, %xmm0 subps %xmm3, %xmm0 mulps %xmm0, %xmm1 movaps %xmm2, %xmm0 cmpleps %xmm4, %xmm0 blendvps %xmm0, %xmm2, %xmm1 pxor %xmm0, %xmm0 testl %edx, %edx je L7 movaps %xmm8, %xmm0 L7: testl %edx, %edx je L9 movaps %xmm1, %xmm2 L9: addps %xmm0, %xmm2 leaq _vb(%rip), %rdx movaps %xmm2, (%rdx,%rax) addq $16, %rax cmpq $16384, %rax jne L10 ret while with -O2 it is OK: __Z8computeVv: LFB2512: movaps LC0(%rip), %xmm4 xorl %eax, %eax movaps LC1(%rip), %xmm7 leaq _va(%rip), %rsi movaps LC2(%rip), %xmm6 leaq _vb(%rip), %rcx movaps LC3(%rip), %xmm5 .align 4,0x90 L7: movaps (%rsi,%rax), %xmm1 movaps %xmm4, %xmm0 pxor %xmm2, %xmm2 cmpltps %xmm1, %xmm0 movmskps %xmm0, %edx testl %edx, %edx je L6 movaps %xmm1, %xmm3 movaps %xmm1, %xmm2 addps %xmm6, %xmm2 addps %xmm7, %xmm3 divps %xmm2, %xmm3 movaps %xmm0, %xmm2 andps %xmm5, %xmm2 blendvps %xmm0, %xmm3, %xmm1 L6: addps %xmm2, %xmm1 movaps %xmm1, (%rcx,%rax) addq $16, %rax cmpq $16384, %rax jne L7 ret Note that the function not in the loop (doAtan) is OK with both -O2 and -Ofast.