From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 29692 invoked by alias); 1 Dec 2009 17:03:50 -0000 Received: (qmail 26980 invoked by uid 48); 1 Dec 2009 17:03:32 -0000 Date: Tue, 01 Dec 2009 17:03:00 -0000 Message-ID: <20091201170332.26979.qmail@sourceware.org> X-Bugzilla-Reason: CC References: Subject: [Bug tree-optimization/42216] [4.5 Regression] rev 154688 regress 464.h264ref peak 20% In-Reply-To: Reply-To: gcc-bugzilla@gcc.gnu.org To: gcc-bugs@gcc.gnu.org From: "rguenth at gcc dot gnu dot org" Mailing-List: contact gcc-bugs-help@gcc.gnu.org; run by ezmlm Precedence: bulk List-Id: List-Archive: List-Post: List-Help: Sender: gcc-bugs-owner@gcc.gnu.org X-SW-Source: 2009-12/txt/msg00064.txt.bz2 ------- Comment #8 from rguenth at gcc dot gnu dot org 2009-12-01 17:03 ------- The hot loop is mv-search.c:SetupFastFullPelSearch for (pos = 0; pos < max_pos; pos++) { abs_y = offset_y + spiral_search_y[pos]; abs_x = offset_x + spiral_search_x[pos]; if (range_partly_outside) { if (abs_y >= 0 && abs_y <= max_height && abs_x >= 0 && abs_x <= max_width ) { PelYline_11 = FastLine16Y_11; } else { PelYline_11 = UMVLine16Y_11; } } orgptr = orig_blocks; bindex = 0; for (blky = 0; blky < 4; blky++) { LineSadBlk0 = LineSadBlk1 = LineSadBlk2 = LineSadBlk3 = 0; for (y = 0; y < 4; y++) { refptr = PelYline_11 (ref_pic, abs_y++, abs_x, img_height, img_width); LineSadBlk0 += byte_abs [*refptr++ - *orgptr++]; LineSadBlk0 += byte_abs [*refptr++ - *orgptr++]; LineSadBlk0 += byte_abs [*refptr++ - *orgptr++]; LineSadBlk0 += byte_abs [*refptr++ - *orgptr++]; LineSadBlk1 += byte_abs [*refptr++ - *orgptr++]; LineSadBlk1 += byte_abs [*refptr++ - *orgptr++]; LineSadBlk1 += byte_abs [*refptr++ - *orgptr++]; LineSadBlk1 += byte_abs [*refptr++ - *orgptr++]; LineSadBlk2 += byte_abs [*refptr++ - *orgptr++]; LineSadBlk2 += byte_abs [*refptr++ - *orgptr++]; LineSadBlk2 += byte_abs [*refptr++ - *orgptr++]; LineSadBlk2 += byte_abs [*refptr++ - *orgptr++]; LineSadBlk3 += byte_abs 
[*refptr++ - *orgptr++]; LineSadBlk3 += byte_abs [*refptr++ - *orgptr++]; LineSadBlk3 += byte_abs [*refptr++ - *orgptr++]; LineSadBlk3 += byte_abs [*refptr++ - *orgptr++]; } block_sad[bindex++][pos] = LineSadBlk0; block_sad[bindex++][pos] = LineSadBlk1; block_sad[bindex++][pos] = LineSadBlk2; block_sad[bindex++][pos] = LineSadBlk3; } } good assembly of the innermost loop: .L1422: leal 1(%rsi), %r9d movl 64(%rsp), %r8d movl 68(%rsp), %ecx movl 52(%rsp), %edx movq 72(%rsp), %rdi movl %r9d, 32(%rsp) call *%rax movzwl (%rbx), %ecx movzwl (%rax), %edx movzwl 2(%rbx), %r10d movzwl 2(%rax), %r11d movq byte_abs(%rip), %r9 movzwl 4(%rbx), %esi subl %ecx, %edx movzwl 4(%rax), %ecx movslq %edx, %r8 subl %r10d, %r11d movzwl 6(%rax), %r10d addl (%r9,%r8,4), %r14d movzwl 6(%rbx), %r8d movslq %r11d, %rdi addl (%r9,%rdi,4), %r14d movzwl 8(%rbx), %edi subl %esi, %ecx movslq %ecx, %rdx movzwl 8(%rax), %ecx subl %r8d, %r10d addl (%r9,%rdx,4), %r14d movzwl 10(%rax), %r8d movzwl 10(%rbx), %edx movslq %r10d, %r11 addl (%r9,%r11,4), %r14d movzwl 12(%rbx), %r11d subl %edi, %ecx movzwl 12(%rax), %edi movslq %ecx, %rsi subl %edx, %r8d addl (%r9,%rsi,4), %r13d movzwl 14(%rax), %edx movzwl 14(%rbx), %esi movslq %r8d, %r10 subl %r11d, %edi addl (%r9,%r10,4), %r13d movzwl 16(%rax), %r11d movzwl 16(%rbx), %r10d movslq %edi, %rcx addl (%r9,%rcx,4), %r13d movzwl 18(%rbx), %ecx subl %esi, %edx movslq %edx, %r8 movzwl 18(%rax), %edx subl %r10d, %r11d addl (%r9,%r8,4), %r13d movzwl 20(%rax), %r10d movzwl 20(%rbx), %r8d movslq %r11d, %rdi addl (%r9,%rdi,4), %ebp movzwl 22(%rbx), %edi subl %ecx, %edx movzwl 22(%rax), %ecx movslq %edx, %rsi subl %r8d, %r10d addl (%r9,%rsi,4), %ebp movzwl 24(%rax), %r8d movzwl 24(%rbx), %esi movslq %r10d, %r11 subl %edi, %ecx addl (%r9,%r11,4), %ebp movzwl 26(%rax), %edi movslq %ecx, %rdx movzwl 26(%rbx), %r11d addl (%r9,%rdx,4), %ebp movzwl 28(%rbx), %edx subl %esi, %r8d movslq %r8d, %r10 movzwl 28(%rax), %r8d movzwl 30(%rax), %eax addl (%r9,%r10,4), %r12d movzwl 
30(%rbx), %r10d subl %r11d, %edi movslq %edi, %rcx addl (%r9,%rcx,4), %r12d subl %edx, %r8d subl %r10d, %eax movslq %r8d, %rsi addl (%r9,%rsi,4), %r12d cltq addl (%r9,%rax,4), %r12d addq $32, %rbx addq $32, %r15 cmpq $128, %r15 je .L1600 movq PelYline_11(%rip), %rax movl 32(%rsp), %esi jmp .L1422 bad assembly: .L1422: leal 1(%rsi), %edx movl 68(%rsp), %ecx movl 64(%rsp), %r8d movq 72(%rsp), %rdi movl %edx, 32(%rsp) movl 52(%rsp), %edx call *%rax movzwl (%rbx), %esi movzwl (%rax), %ecx movq byte_abs(%rip), %rdx subl %esi, %ecx movzwl 2(%rbx), %esi movslq %ecx, %rcx addl (%rdx,%rcx,4), %r14d movzwl 2(%rax), %ecx subl %esi, %ecx movzwl 4(%rbx), %esi movslq %ecx, %rcx addl (%rdx,%rcx,4), %r14d movzwl 4(%rax), %ecx subl %esi, %ecx movzwl 6(%rbx), %esi movslq %ecx, %rcx addl (%rdx,%rcx,4), %r14d movzwl 6(%rax), %ecx subl %esi, %ecx movzwl 8(%rbx), %esi movslq %ecx, %rcx addl (%rdx,%rcx,4), %r14d movzwl 8(%rax), %ecx subl %esi, %ecx movzwl 10(%rbx), %esi movslq %ecx, %rcx addl (%rdx,%rcx,4), %r13d movzwl 10(%rax), %ecx subl %esi, %ecx movzwl 12(%rbx), %esi movslq %ecx, %rcx addl (%rdx,%rcx,4), %r13d movzwl 12(%rax), %ecx subl %esi, %ecx movzwl 14(%rbx), %esi movslq %ecx, %rcx addl (%rdx,%rcx,4), %r13d movzwl 14(%rax), %ecx subl %esi, %ecx movzwl 16(%rbx), %esi movslq %ecx, %rcx addl (%rdx,%rcx,4), %r13d movzwl 16(%rax), %ecx subl %esi, %ecx movzwl 18(%rbx), %esi movslq %ecx, %rcx addl (%rdx,%rcx,4), %ebp movzwl 18(%rax), %ecx subl %esi, %ecx movzwl 20(%rbx), %esi movslq %ecx, %rcx addl (%rdx,%rcx,4), %ebp movzwl 20(%rax), %ecx subl %esi, %ecx movzwl 22(%rbx), %esi movslq %ecx, %rcx addl (%rdx,%rcx,4), %ebp movzwl 22(%rax), %ecx subl %esi, %ecx movzwl 24(%rbx), %esi movslq %ecx, %rcx addl (%rdx,%rcx,4), %ebp movzwl 24(%rax), %ecx subl %esi, %ecx movzwl 26(%rbx), %esi movslq %ecx, %rcx addl (%rdx,%rcx,4), %r12d movzwl 26(%rax), %ecx subl %esi, %ecx movzwl 28(%rbx), %esi movslq %ecx, %rcx addl (%rdx,%rcx,4), %r12d movzwl 28(%rax), %ecx movzwl 30(%rax), %eax subl %esi, %ecx 
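movslq %ecx, %rcx addl (%rdx,%rcx,4), %r12d movzwl 30(%rbx), %ecx subl %ecx, %eax cltq addl (%rdx,%rax,4), %r12d addq $32, %rbx addq $32, %r15 cmpq $128, %r15 je .L1600 movq PelYline_11(%rip), %rax movl 32(%rsp), %esi jmp .L1422 The two versions seem to differ only in instruction scheduling: the good code interleaves the loads and subtractions of neighbouring elements across many scratch registers, whereas the bad code funnels every load/subtract/table-lookup sequence through %esi, %ecx and %rcx one element at a time. -- http://gcc.gnu.org/bugzilla/show_bug.cgi?id=42216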