From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 20313 invoked by alias); 27 Apr 2009 08:16:38 -0000 Received: (qmail 19947 invoked by uid 48); 27 Apr 2009 08:16:21 -0000 Date: Mon, 27 Apr 2009 08:16:00 -0000 Message-ID: <20090427081621.19946.qmail@sourceware.org> X-Bugzilla-Reason: CC References: Subject: [Bug regression/39914] 96% performance regression in floating point code; part of the problem started 2009/03/12-13 In-Reply-To: Reply-To: gcc-bugzilla@gcc.gnu.org To: gcc-bugs@gcc.gnu.org From: "ubizjak at gmail dot com" Mailing-List: contact gcc-bugs-help@gcc.gnu.org; run by ezmlm Precedence: bulk List-Id: List-Archive: List-Post: List-Help: Sender: gcc-bugs-owner@gcc.gnu.org X-SW-Source: 2009-04/txt/msg02559.txt.bz2 ------- Comment #2 from ubizjak at gmail dot com 2009-04-27 08:16 ------- (In reply to comment #0) > (same .i file, same instructions for reproducing, same compiler options, same > everything) I guess that this is direct.i compiled with -O1? Trunk, revision: 146825 -O1 on x86_64 linux gives: .L27: leaq 4(%rbx), %rbp movq %rbx, %rdx addq (%r11), %rdx movq %rdx, (%rsi) addq $4, %rdx movq %rdx, (%r10) movq (%r11), %rdx addq (%rsi), %rdx movq %rdx, (%rcx) addq $4, %rdx movq %rdx, (%r9) movq (%r11), %r12 addq (%rcx), %r12 movq %r12, (%r8) addq $4, %r12 movq %r12, (%r15) movq (%rax), %rdx addq $7, %rdx movsd (%rdx,%r12,2), %xmm2 movsd %xmm2, -96(%rsp) movq (%r8), %r12 movsd (%rdx,%r12,2), %xmm2 movsd %xmm2, -64(%rsp) movq (%r9), %r12 movsd (%rdx,%r12,2), %xmm2 movsd %xmm2, -56(%rsp) movq (%rcx), %r12 movsd (%rdx,%r12,2), %xmm2 movsd %xmm2, -48(%rsp) movq (%r10), %r12 movsd (%rdx,%r12,2), %xmm2 movsd %xmm2, -104(%rsp) movq (%rsi), %r12 movsd (%rdx,%r12,2), %xmm2 movsd %xmm2, -88(%rsp) leaq (%rbp,%rbp), %r12 movsd (%r12,%rdx), %xmm2 movsd %xmm2, -80(%rsp) leaq (%rdx,%rbx,2), %r14 movq 24(%rdi), %rdx movsd 31(%rdx), %xmm2 movsd %xmm2, -32(%rsp) movsd 23(%rdx), %xmm2 movsd %xmm2, -40(%rsp) movsd 15(%rdx), %xmm2 movsd %xmm2, -120(%rsp) 
movsd 7(%rdx), %xmm2 movsd %xmm2, -112(%rsp) movapd %xmm2, %xmm3 mulsd -96(%rsp), %xmm3 movsd -120(%rsp), %xmm2 mulsd -64(%rsp), %xmm2 addsd %xmm2, %xmm3 movsd %xmm3, -24(%rsp) movsd -112(%rsp), %xmm3 mulsd -64(%rsp), %xmm3 movsd -120(%rsp), %xmm2 mulsd -96(%rsp), %xmm2 subsd %xmm2, %xmm3 movsd %xmm3, -96(%rsp) movsd -112(%rsp), %xmm3 mulsd -56(%rsp), %xmm3 movsd -120(%rsp), %xmm2 mulsd -48(%rsp), %xmm2 addsd %xmm2, %xmm3 movsd %xmm3, -64(%rsp) movsd -112(%rsp), %xmm3 mulsd -48(%rsp), %xmm3 movsd -120(%rsp), %xmm2 mulsd -56(%rsp), %xmm2 subsd %xmm2, %xmm3 movsd %xmm3, -120(%rsp) movsd -104(%rsp), %xmm2 subsd -24(%rsp), %xmm2 movsd %xmm2, -112(%rsp) movsd -88(%rsp), %xmm2 subsd -96(%rsp), %xmm2 movsd %xmm2, -56(%rsp) movsd -80(%rsp), %xmm2 subsd -64(%rsp), %xmm2 movsd %xmm2, -48(%rsp) movsd (%r14), %xmm2 subsd %xmm3, %xmm2 movsd %xmm2, -16(%rsp) movsd -104(%rsp), %xmm2 addsd -24(%rsp), %xmm2 movsd %xmm2, -104(%rsp) movsd -88(%rsp), %xmm2 addsd -96(%rsp), %xmm2 movsd %xmm2, -88(%rsp) movsd -80(%rsp), %xmm2 addsd -64(%rsp), %xmm2 movsd %xmm2, -80(%rsp) movsd (%r14), %xmm2 addsd %xmm3, %xmm2 movsd %xmm2, -72(%rsp) movsd -32(%rsp), %xmm2 xorpd %xmm0, %xmm2 movsd %xmm2, -120(%rsp) movapd %xmm2, %xmm3 mulsd -112(%rsp), %xmm3 movsd -40(%rsp), %xmm2 mulsd -56(%rsp), %xmm2 addsd %xmm2, %xmm3 movsd %xmm3, -96(%rsp) movsd -120(%rsp), %xmm3 mulsd -56(%rsp), %xmm3 movsd -40(%rsp), %xmm2 mulsd -112(%rsp), %xmm2 subsd %xmm2, %xmm3 movsd %xmm3, -120(%rsp) movsd -40(%rsp), %xmm3 mulsd -104(%rsp), %xmm3 movsd -32(%rsp), %xmm2 mulsd -88(%rsp), %xmm2 addsd %xmm2, %xmm3 movsd %xmm3, -112(%rsp) movsd -40(%rsp), %xmm3 mulsd -88(%rsp), %xmm3 movsd -32(%rsp), %xmm2 mulsd -104(%rsp), %xmm2 subsd %xmm2, %xmm3 movsd %xmm3, -104(%rsp) movsd -72(%rsp), %xmm2 addsd %xmm3, %xmm2 movsd %xmm2, (%r14) movq (%rax), %rdx movsd -80(%rsp), %xmm2 addsd -112(%rsp), %xmm2 movsd %xmm2, 7(%r12,%rdx) movq (%rsi), %r12 movq (%rax), %rdx movsd -72(%rsp), %xmm2 subsd -104(%rsp), %xmm2 movsd %xmm2, 7(%rdx,%r12,2) 
movq (%r10), %r12 movq (%rax), %rdx movsd -80(%rsp), %xmm2 subsd -112(%rsp), %xmm2 movsd %xmm2, 7(%rdx,%r12,2) movq (%rcx), %r12 movq (%rax), %rdx movsd -16(%rsp), %xmm2 addsd -120(%rsp), %xmm2 movsd %xmm2, 7(%rdx,%r12,2) movq (%r9), %r12 movq (%rax), %rdx movsd -48(%rsp), %xmm2 addsd -96(%rsp), %xmm2 movsd %xmm2, 7(%rdx,%r12,2) movq (%r8), %r12 movq (%rax), %rdx movsd -16(%rsp), %xmm2 subsd -120(%rsp), %xmm2 movsd %xmm2, 7(%rdx,%r12,2) movq (%r15), %r12 movq (%rax), %rdx movsd -48(%rsp), %xmm2 subsd -96(%rsp), %xmm2 movsd %xmm2, 7(%rdx,%r12,2) addq $8, %rbx cmpq %rbx, -8(%rsp) jg .L27 The code above looks similar to your gcc version 4.4.0 20090313 code. Using -O2, I get: .L27: movq -96(%rsp), %r14 leaq (%rax,%rcx,2), %rdi leaq -8(%rax,%rcx,2), %rbp leaq (%rax,%rsi,2), %r8 leaq -8(%rax,%rsi,2), %r9 leaq 8(%rax,%rdx,2), %r12 movsd (%rdi), %xmm2 leaq 8(%rax,%rbx,2), %r10 movsd (%r14), %xmm4 movq -88(%rsp), %r14 movsd (%rbp), %xmm6 leaq (%rax,%rbx,2), %r11 movsd (%r8), %xmm9 leaq (%rax,%rdx,2), %r13 movsd (%r14), %xmm1 movq -120(%rsp), %r14 movsd (%r9), %xmm10 movq %rcx, -80(%rsp) movapd %xmm1, %xmm14 addq $8, %rdx movsd (%r14), %xmm5 addq $8, %rcx mulsd %xmm6, %xmm14 addq $8, %rsi addq $8, %rbx movapd %xmm5, %xmm7 mulsd %xmm5, %xmm6 movsd (%r12), %xmm11 cmpq %rdx, -112(%rsp) mulsd %xmm2, %xmm7 mulsd %xmm1, %xmm2 movsd (%r15), %xmm8 movsd (%r11), %xmm3 addsd %xmm14, %xmm7 movapd %xmm1, %xmm14 subsd %xmm2, %xmm6 movapd %xmm5, %xmm2 mulsd %xmm10, %xmm14 mulsd %xmm9, %xmm2 mulsd %xmm9, %xmm1 movapd %xmm11, %xmm9 mulsd %xmm10, %xmm5 movsd (%r10), %xmm15 addsd %xmm14, %xmm2 movsd (%r13), %xmm0 movapd %xmm15, %xmm14 subsd %xmm1, %xmm5 movapd %xmm3, %xmm1 subsd %xmm7, %xmm14 movapd %xmm0, %xmm10 subsd %xmm2, %xmm9 addsd %xmm2, %xmm11 movapd %xmm8, %xmm2 subsd %xmm6, %xmm1 xorpd %xmm12, %xmm2 subsd %xmm5, %xmm10 addsd %xmm3, %xmm6 movapd %xmm4, %xmm3 addsd %xmm0, %xmm5 movapd %xmm2, %xmm0 mulsd %xmm1, %xmm3 addsd %xmm15, %xmm7 mulsd %xmm2, %xmm1 mulsd %xmm14, %xmm0 movapd 
%xmm4, %xmm2 mulsd %xmm4, %xmm14 mulsd %xmm7, %xmm2 addsd %xmm3, %xmm0 movapd %xmm8, %xmm3 mulsd %xmm8, %xmm7 subsd %xmm14, %xmm1 mulsd %xmm6, %xmm3 addsd %xmm3, %xmm2 movapd %xmm4, %xmm3 movapd %xmm5, %xmm4 mulsd %xmm6, %xmm3 subsd %xmm7, %xmm3 addsd %xmm3, %xmm4 subsd %xmm3, %xmm5 movsd %xmm4, (%r13) movapd %xmm11, %xmm4 subsd %xmm2, %xmm11 addsd %xmm2, %xmm4 movapd %xmm10, %xmm2 subsd %xmm1, %xmm10 addsd %xmm1, %xmm2 movsd %xmm4, (%r12) movsd %xmm5, (%r11) movsd %xmm11, (%r10) movsd %xmm2, (%r9) movapd %xmm9, %xmm2 subsd %xmm0, %xmm9 addsd %xmm0, %xmm2 movsd %xmm2, (%r8) movsd %xmm10, (%rbp) movsd %xmm9, (%rdi) jg .L27 It is not clear from your report whether the -O1 flag is the problematic one. The -O2 code looks good to me. -- http://gcc.gnu.org/bugzilla/show_bug.cgi?id=39914