public inbox for gcc-bugs@sourceware.org
* [Bug c/16962] New: loop unrolling with x86-64 asm not efficient
@ 2004-08-10 13:42 tomstdenis at iahu dot ca
  2004-08-10 14:09 ` [Bug tree-optimization/16962] " falk at debian dot org
                   ` (5 more replies)
  0 siblings, 6 replies; 7+ messages in thread
From: tomstdenis at iahu dot ca @ 2004-08-10 13:42 UTC (permalink / raw)
  To: gcc-bugs

[Note: this bug was previously reported as part of bug #16961, which currently covers the inefficient 128-bit additions.]
 
I have the following demo program 
 
typedef unsigned long long mp_digit; 
typedef unsigned long      mp_word __attribute__ ((mode(TI))); 
mp_word a, b; 
 
// demo slow 128-bit add 
void test(void) 
{ 
   a += b; 
} 
 
// this unrolls right (but is otherwise inefficient because of the 128-bit add) 
void test2(mp_word *out, mp_digit x, mp_digit *y, int n) 
{ 
  int z; 
  for (z = 0; z < n; z++) out[z] += ((mp_word)x) * ((mp_word)*y++); 
} 
 
// this unrolls poorly 
void test3(mp_word *out, mp_digit x, mp_digit *y, int n) 
{ 
  int z; 
  for (z = 0; z < n; z++) { 
    asm("movq %0,%%rax\n" 
        "mulq (%1)\n" 
        "addq %%rax,(%2)\n" 
        "adcq %%rdx,8(%2)\n" 
        ::"r"(x), "r"(y), "r"(out) : "%rax", "%rdx"); 
    ++out; 
    ++y; 
  } 
} 
 
 
It produces inefficient loop unrolling when asm blocks are used... 
 
I used "gcc -O3 -fomit-frame-pointer -funroll-loops -march=k8 -m64 -S test.c" 
 
	.file	"test.c" 
	.text 
	.p2align 4,,15 
.globl test 
	.type	test, @function 
test: 
.LFB2: 
	movq	a(%rip), %r10 
	movq	b(%rip), %r8 
	xorl	%ecx, %ecx 
	movq	a+8(%rip), %rdi 
	movq	b+8(%rip), %r9 
	leaq	(%r10,%r8), %rax 
	leaq	(%rdi,%r9), %rsi 
	cmpq	%r10, %rax 
	movq	%rax, a(%rip) 
	setb	%cl 
	leaq	(%rcx,%rsi), %rdx 
	movq	%rdx, a+8(%rip) 
	ret 
.LFE2: 
	.size	test, .-test 
	.p2align 4,,15 
.globl test2 
	.type	test2, @function 
test2: 
.LFB3: 
	movq	%r13, -24(%rsp) 
.LCFI0: 
	movq	%r14, -16(%rsp) 
.LCFI1: 
	movq	%rdi, %r11 
	movq	%r15, -8(%rsp) 
.LCFI2: 
	movq	%rbx, -48(%rsp) 
.LCFI3: 
	movq	%rsi, %r13 
	movq	%rbp, -40(%rsp) 
.LCFI4: 
	movq	%r12, -32(%rsp) 
.LCFI5: 
	subq	$64, %rsp 
.LCFI6: 
	testl	%ecx, %ecx 
	movq	%rdx, %r14 
	movl	%ecx, %r15d 
	jle	.L8 
	movq	%rsi, %rax 
	movq	(%rdi), %r12 
	movq	8(%rdi), %rdi 
	mulq	(%rdx) 
	leal	-1(%r15), %r10d 
	xorl	%ecx, %ecx 
	leaq	8(%r14), %rbp 
	movl	%r10d, %ebx 
	andl	$3, %ebx 
	movq	%rdx, %r9 
	leaq	(%r12,%rax), %rdx 
	leaq	(%rdi,%r9), %rsi 
	cmpq	%r12, %rdx 
	movq	%rdx, -8(%rsp) 
	movq	-8(%rsp), %rax 
	setb	%cl 
	movq	%rsi, (%rsp) 
	addq	%rcx, (%rsp) 
	movq	(%rsp), %rdx 
	movl	%r10d, %r12d 
	movl	$16, %r10d 
	testl	%r12d, %r12d 
	movq	%rax, (%r11) 
	movq	%rdx, 8(%r11) 
	je	.L8 
	testl	%ebx, %ebx 
	je	.L6 
	cmpl	$1, %ebx 
	je	.L23 
	cmpl	$2, %ebx 
	.p2align 4,,5 
	je	.L24 
	movq	%r13, %rax 
	movq	16(%r11), %rsi 
	movq	24(%r11), %rdi 
	mulq	8(%r14) 
	leaq	16(%r14), %rbp 
	movb	$32, %r10b 
	leaq	(%rsi,%rax), %r12 
	leaq	(%rdi,%rdx), %rcx 
	xorl	%eax, %eax 
	cmpq	%rsi, %r12 
	movq	%rcx, -80(%rsp) 
	movq	%r12, -88(%rsp) 
	setb	%al 
	addq	%rax, -80(%rsp) 
	movq	-88(%rsp), %r14 
	movq	-80(%rsp), %rbx 
	leal	-2(%r15), %r12d 
	movq	%r14, 16(%r11) 
	movq	%rbx, 24(%r11) 
.L24: 
	movq	%r13, %rax 
	movq	(%r10,%r11), %rcx 
	xorl	%r8d, %r8d 
	mulq	(%rbp) 
	addq	$8, %rbp 
	movq	%rax, %rdi 
	movq	8(%r10,%r11), %rax 
	leaq	(%rcx,%rdi), %r9 
	leaq	(%rax,%rdx), %rbx 
	cmpq	%rcx, %r9 
	movq	%r9, -104(%rsp) 
	setb	%r8b 
	movq	-104(%rsp), %rdx 
	decl	%r12d 
	movq	%rbx, -96(%rsp) 
	addq	%r8, -96(%rsp) 
	movq	-96(%rsp), %r15 
	movq	%rdx, (%r10,%r11) 
	movq	%r15, 8(%r10,%r11) 
	addq	$16, %r10 
.L23: 
	movq	%r13, %rax 
	movq	8(%r10,%r11), %r14 
	xorl	%r8d, %r8d 
	mulq	(%rbp) 
	addq	$8, %rbp 
	movq	%rax, %r9 
	movq	(%r10,%r11), %rax 
	leaq	(%r14,%rdx), %rdx 
	movq	%rdx, -112(%rsp) 
	leaq	(%rax,%r9), %rcx 
	cmpq	%rax, %rcx 
	movq	%rcx, -120(%rsp) 
	movq	-120(%rsp), %r15 
	setb	%r8b 
	addq	%r8, -112(%rsp) 
	movq	-112(%rsp), %rsi 
	movq	%r15, (%r10,%r11) 
	movq	%rsi, 8(%r10,%r11) 
	addq	$16, %r10 
	decl	%r12d 
	je	.L8 
	.p2align 4,,7 
.L6: 
	movq	%r13, %rax 
	movq	(%r10,%r11), %rbx 
	movq	(%r10,%r11), %r15 
	mulq	(%rbp) 
	movq	8(%r10,%r11), %rsi 
	xorl	%r9d, %r9d 
	movq	16(%r10,%r11), %r8 
	movq	32(%r10,%r11), %r14 
	addq	%rax, %rbx 
	movq	%r13, %rax 
	cmpq	%r15, %rbx 
	movq	24(%r10,%r11), %r15 
	movq	%rbx, -24(%rsp) 
	setb	%r9b 
	addq	%rdx, %rsi 
	movq	-24(%rsp), %rcx 
	movq	%rsi, -16(%rsp) 
	addq	%r9, -16(%rsp) 
	xorl	%esi, %esi 
	movq	-16(%rsp), %rdx 
	movq	%rcx, (%r10,%r11) 
	movq	%rdx, 8(%r10,%r11) 
	mulq	8(%rbp) 
	addq	%rax, %r8 
	movq	16(%r10,%r11), %rax 
	movq	%r8, -40(%rsp) 
	movq	-40(%rsp), %rdi 
	cmpq	%rax, %r8 
	movq	%r13, %rax 
	setb	%sil 
	addq	%rdx, %r15 
	movq	%rdi, 16(%r10,%r11) 
	mulq	16(%rbp) 
	movq	%r15, -32(%rsp) 
	movq	40(%r10,%r11), %r15 
	addq	%rsi, -32(%rsp) 
	movq	-32(%rsp), %r9 
	movq	%r9, 24(%r10,%r11) 
	movq	%rdx, %rbx 
	movq	32(%r10,%r11), %rdx 
	movq	%rax, %rcx 
	addq	%rcx, %rdx 
	cmpq	%r14, %rdx 
	movq	%rdx, -56(%rsp) 
	movq	-56(%rsp), %rdi 
	setb	%r8b 
	addq	%rbx, %r15 
	movl	%r8d, %eax 
	movq	%r15, -48(%rsp) 
	xorl	%r15d, %r15d 
	movzbl	%al, %esi  
	addq	%rsi, -48(%rsp) 
	movq	%r13, %rax 
	movq	-48(%rsp), %r9 
	movq	%rdi, 32(%r10,%r11) 
	mulq	24(%rbp) 
	movq	56(%r10,%r11), %r14 
	addq	$32, %rbp 
	movq	%r9, 40(%r10,%r11) 
	movq	%rax, %rcx 
	movq	48(%r10,%r11), %rax 
	movq	%rdx, %rbx 
	leaq	(%r14,%rbx), %r8 
	leaq	(%rax,%rcx), %rdx 
	movq	%r8, -64(%rsp) 
	cmpq	%rax, %rdx 
	movq	%rdx, -72(%rsp) 
	movq	-72(%rsp), %rsi 
	setb	%r15b 
	addq	%r15, -64(%rsp) 
	movq	-64(%rsp), %rdi 
	movq	%rsi, 48(%r10,%r11) 
	movq	%rdi, 56(%r10,%r11) 
	addq	$64, %r10 
	subl	$4, %r12d 
	jne	.L6 
	.p2align 4,,7 
.L8: 
	movq	16(%rsp), %rbx 
	movq	24(%rsp), %rbp 
	movq	32(%rsp), %r12 
	movq	40(%rsp), %r13 
	movq	48(%rsp), %r14 
	movq	56(%rsp), %r15 
	addq	$64, %rsp 
	ret 
.LFE3: 
	.size	test2, .-test2 
	.p2align 4,,15 
.globl test3 
	.type	test3, @function 
test3: 
.LFB4: 
	pushq	%rbp 
.LCFI7: 
	testl	%ecx, %ecx 
	movq	%rsi, %r10 
	movl	%ecx, %ebp 
	pushq	%rbx 
.LCFI8: 
	movq	%rdi, %rbx 
	movq	%rdx, %rdi 
	jle	.L33 
	leal	-1(%rbp), %ecx 
	movl	%ecx, %esi 
	andl	$7, %esi 
#APP 
	movq %r10,%rax 
mulq (%rdi) 
addq %rax,(%rbx) 
adcq %rdx,8(%rbx) 
 
#NO_APP 
	testl	%ecx, %ecx 
	leaq	16(%rbx), %r9 
	leaq	8(%rdi), %r8 
	movl	%ecx, %r11d 
	je	.L33 
	testl	%esi, %esi 
	je	.L31 
	cmpl	$1, %esi 
	je	.L61 
	cmpl	$2, %esi 
	.p2align 4,,5 
	je	.L62 
	cmpl	$3, %esi 
	.p2align 4,,5 
	je	.L63 
	cmpl	$4, %esi 
	.p2align 4,,5 
	je	.L64 
	cmpl	$5, %esi 
	.p2align 4,,5 
	je	.L65 
	cmpl	$6, %esi 
	.p2align 4,,5 
	je	.L66 
#APP 
	movq %r10,%rax 
mulq (%r8) 
addq %rax,(%r9) 
adcq %rdx,8(%r9) 
 
#NO_APP 
	leaq	32(%rbx), %r9 
	leaq	16(%rdi), %r8 
	leal	-2(%rbp), %r11d 
.L66: 
#APP 
	movq %r10,%rax 
mulq (%r8) 
addq %rax,(%r9) 
adcq %rdx,8(%r9) 
 
#NO_APP 
	addq	$16, %r9 
	addq	$8, %r8 
	decl	%r11d 
.L65: 
#APP 
	movq %r10,%rax 
mulq (%r8) 
addq %rax,(%r9) 
adcq %rdx,8(%r9) 
 
#NO_APP 
	addq	$16, %r9 
	addq	$8, %r8 
	decl	%r11d 
.L64: 
#APP 
	movq %r10,%rax 
mulq (%r8) 
addq %rax,(%r9) 
adcq %rdx,8(%r9) 
 
#NO_APP 
	addq	$16, %r9 
	addq	$8, %r8 
	decl	%r11d 
.L63: 
#APP 
	movq %r10,%rax 
mulq (%r8) 
addq %rax,(%r9) 
adcq %rdx,8(%r9) 
 
#NO_APP 
	addq	$16, %r9 
	addq	$8, %r8 
	decl	%r11d 
.L62: 
#APP 
	movq %r10,%rax 
mulq (%r8) 
addq %rax,(%r9) 
adcq %rdx,8(%r9) 
 
#NO_APP 
	addq	$16, %r9 
	addq	$8, %r8 
	decl	%r11d 
.L61: 
#APP 
	movq %r10,%rax 
mulq (%r8) 
addq %rax,(%r9) 
adcq %rdx,8(%r9) 
 
#NO_APP 
	addq	$16, %r9 
	addq	$8, %r8 
	decl	%r11d 
	je	.L33 
.L31: 
#APP 
	movq %r10,%rax 
mulq (%r8) 
addq %rax,(%r9) 
adcq %rdx,8(%r9) 
 
#NO_APP 
	leaq	16(%r9), %rsi 
	leaq	8(%r8), %rbp 
#APP 
	movq %r10,%rax 
mulq (%rbp) 
addq %rax,(%rsi) 
adcq %rdx,8(%rsi) 
 
#NO_APP 
	leaq	32(%r9), %rdi 
	leaq	16(%r8), %rbx 
#APP 
	movq %r10,%rax 
mulq (%rbx) 
addq %rax,(%rdi) 
adcq %rdx,8(%rdi) 
 
#NO_APP 
	leaq	48(%r9), %rcx 
	leaq	24(%r8), %rbp 
#APP 
	movq %r10,%rax 
mulq (%rbp) 
addq %rax,(%rcx) 
adcq %rdx,8(%rcx) 
 
#NO_APP 
	leaq	64(%r9), %rsi 
	leaq	32(%r8), %rdi 
#APP 
	movq %r10,%rax 
mulq (%rdi) 
addq %rax,(%rsi) 
adcq %rdx,8(%rsi) 
 
#NO_APP 
	leaq	80(%r9), %rbx 
	leaq	40(%r8), %rcx 
#APP 
	movq %r10,%rax 
mulq (%rcx) 
addq %rax,(%rbx) 
adcq %rdx,8(%rbx) 
 
#NO_APP 
	leaq	96(%r9), %rbp 
	leaq	48(%r8), %rdi 
#APP 
	movq %r10,%rax 
mulq (%rdi) 
addq %rax,(%rbp) 
adcq %rdx,8(%rbp) 
 
#NO_APP 
	leaq	112(%r9), %rsi 
	leaq	56(%r8), %rbx 
#APP 
	movq %r10,%rax 
mulq (%rbx) 
addq %rax,(%rsi) 
adcq %rdx,8(%rsi) 
 
#NO_APP 
	subq	$-128, %r9 
	addq	$64, %r8 
	subl	$8, %r11d 
	jne	.L31 
.L33: 
	popq	%rbx 
	popq	%rbp 
	ret 
.LFE4: 
	.size	test3, .-test3 
	.comm	a,16,16 
	.comm	b,16,16 
	.section	.eh_frame,"a",@progbits 
.Lframe1: 
	.long	.LECIE1-.LSCIE1 
.LSCIE1: 
	.long	0x0 
	.byte	0x1 
	.string	"" 
	.uleb128 0x1 
	.sleb128 -8 
	.byte	0x10 
	.byte	0xc 
	.uleb128 0x7 
	.uleb128 0x8 
	.byte	0x90 
	.uleb128 0x1 
	.align 8 
.LECIE1: 
.LSFDE1: 
	.long	.LEFDE1-.LASFDE1 
.LASFDE1: 
	.long	.LASFDE1-.Lframe1 
	.quad	.LFB2 
	.quad	.LFE2-.LFB2 
	.align 8 
.LEFDE1: 
.LSFDE3: 
	.long	.LEFDE3-.LASFDE3 
.LASFDE3: 
	.long	.LASFDE3-.Lframe1 
	.quad	.LFB3 
	.quad	.LFE3-.LFB3 
	.byte	0x4 
	.long	.LCFI3-.LFB3 
	.byte	0x83 
	.uleb128 0x7 
	.byte	0x8f 
	.uleb128 0x2 
	.byte	0x8e 
	.uleb128 0x3 
	.byte	0x8d 
	.uleb128 0x4 
	.byte	0x4 
	.long	.LCFI6-.LCFI3 
	.byte	0xe 
	.uleb128 0x48 
	.byte	0x8c 
	.uleb128 0x5 
	.byte	0x86 
	.uleb128 0x6 
	.align 8 
.LEFDE3: 
.LSFDE5: 
	.long	.LEFDE5-.LASFDE5 
.LASFDE5: 
	.long	.LASFDE5-.Lframe1 
	.quad	.LFB4 
	.quad	.LFE4-.LFB4 
	.byte	0x4 
	.long	.LCFI7-.LFB4 
	.byte	0xe 
	.uleb128 0x10 
	.byte	0x86 
	.uleb128 0x2 
	.byte	0x4 
	.long	.LCFI8-.LCFI7 
	.byte	0xe 
	.uleb128 0x18 
	.byte	0x83 
	.uleb128 0x3 
	.align 8 
.LEFDE5: 
	.section	.note.GNU-stack,"",@progbits 
	.ident	"GCC: (GNU) 3.4.1  (Gentoo Linux 3.4.1, ssp-3.4-2, 
pie-8.7.6.3)"

-- 
           Summary: loop unrolling with x86-64 asm not efficient
           Product: gcc
           Version: 3.4.1
            Status: UNCONFIRMED
          Severity: normal
          Priority: P2
         Component: c
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: tomstdenis at iahu dot ca
                CC: gcc-bugs at gcc dot gnu dot org
 GCC build triplet: gcc version 3.4.1  (Gentoo Linux 3.4.1, ssp-3.4-2, pie-
                    8.7.6.3)
  GCC host triplet: Linux timmy 2.6.7-gentoo-r11 #1 Thu Aug 5 01:49:49 UTC
                    2004 x86_
GCC target triplet: gcc version 3.4.1  (Gentoo Linux 3.4.1, ssp-3.4-2, pie-
                    8.7.6.3)


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16962



* [Bug tree-optimization/16962] loop unrolling with x86-64 asm not efficient
  2004-08-10 13:42 [Bug c/16962] New: loop unrolling with x86-64 asm not efficient tomstdenis at iahu dot ca
@ 2004-08-10 14:09 ` falk at debian dot org
  2004-08-10 14:10 ` falk at debian dot org
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: falk at debian dot org @ 2004-08-10 14:09 UTC (permalink / raw)
  To: gcc-bugs


------- Additional Comments From falk at debian dot org  2004-08-10 14:08 -------
Please use the attachment function for large files in the future. Note also
we don't want assembly output anyway (as stated on http://gcc.gnu.org/bugs.html).

Loop work is currently only being done at the lno-branch. It would be nice
if you could test it there.

Also, this way of unrolling loops doesn't seem fundamentally wrong to me.
Can you provide performance numbers that show that it is worse?


-- 
           What    |Removed                     |Added
----------------------------------------------------------------------------
          Component|c                           |tree-optimization
  GCC build triplet|gcc version 3.4.1  (Gentoo  |x86_86-linux
                   |Linux 3.4.1, ssp-3.4-2, pie-|
                   |8.7.6.3)                    |
   GCC host triplet|Linux timmy 2.6.7-gentoo-r11|x86_86-linux
                   |#1 Thu Aug 5 01:49:49 UTC   |
                   |2004 x86_                   |
 GCC target triplet|gcc version 3.4.1  (Gentoo  |x86_86-linux
                   |Linux 3.4.1, ssp-3.4-2, pie-|
                   |8.7.6.3)                    |
           Keywords|                            |missed-optimization


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16962



* [Bug tree-optimization/16962] loop unrolling with x86-64 asm not efficient
  2004-08-10 13:42 [Bug c/16962] New: loop unrolling with x86-64 asm not efficient tomstdenis at iahu dot ca
  2004-08-10 14:09 ` [Bug tree-optimization/16962] " falk at debian dot org
@ 2004-08-10 14:10 ` falk at debian dot org
  2004-08-10 14:13 ` tomstdenis at iahu dot ca
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: falk at debian dot org @ 2004-08-10 14:10 UTC (permalink / raw)
  To: gcc-bugs


------- Additional Comments From falk at debian dot org  2004-08-10 14:10 -------
Whoops.


-- 
           What    |Removed                     |Added
----------------------------------------------------------------------------
  GCC build triplet|x86_86-linux                |x86_64-linux
   GCC host triplet|x86_86-linux                |x86_64-linux
 GCC target triplet|x86_86-linux                |x86_64-linux


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16962



* [Bug tree-optimization/16962] loop unrolling with x86-64 asm not efficient
  2004-08-10 13:42 [Bug c/16962] New: loop unrolling with x86-64 asm not efficient tomstdenis at iahu dot ca
  2004-08-10 14:09 ` [Bug tree-optimization/16962] " falk at debian dot org
  2004-08-10 14:10 ` falk at debian dot org
@ 2004-08-10 14:13 ` tomstdenis at iahu dot ca
  2004-08-10 14:23 ` pinskia at gcc dot gnu dot org
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: tomstdenis at iahu dot ca @ 2004-08-10 14:13 UTC (permalink / raw)
  To: gcc-bugs


------- Additional Comments From tomstdenis at iahu dot ca  2004-08-10 14:13 -------
(In reply to comment #1) 
> Please use the attachment function for large files in the future. Note also 
> we don't want assembly output anyway (as stated on 
http://gcc.gnu.org/bugs.html). 
 
Oops, sorry. 
 
> Loop work is currently only being done at the lno-branch. It would be nice 
> if you could test it there. 
 
I'll have to ask.  I'm using someone else's box at the moment. 
  
> Also, this way of unrolling loops doesn't seem fundamentally wrong to me 
> Can you provide performance numbers that show that it is worse? 
 
No, because I can't get it to compile the other way ;-( That's the point! 
 
From what I can see, the bug [maybe in my code?] is that I do things like  
 
mulq (%1) 
 
So GCC doesn't realize it can modify that and do  
 
mulq 0(%1) 
...next iteration 
mulq 8(%1) 
...next iteration 
mulq 16(%1) 
 
So instead it does 
 
mulq 0(%1) 
... 
lea 8(%some_register),%some_other 
mulq (%some_other) 
... 
 
Is my asm code just wrong (in the sense that I'm not making the best use of it), or 
is this a legitimate chance for GCC to optimize better? 
 

-- 
           What    |Removed                     |Added
----------------------------------------------------------------------------
  GCC build triplet|x86_64-linux                |x86_86-linux
   GCC host triplet|x86_64-linux                |x86_86-linux
 GCC target triplet|x86_64-linux                |x86_86-linux


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16962



* [Bug tree-optimization/16962] loop unrolling with x86-64 asm not efficient
  2004-08-10 13:42 [Bug c/16962] New: loop unrolling with x86-64 asm not efficient tomstdenis at iahu dot ca
                   ` (2 preceding siblings ...)
  2004-08-10 14:13 ` tomstdenis at iahu dot ca
@ 2004-08-10 14:23 ` pinskia at gcc dot gnu dot org
  2004-08-10 14:55 ` falk at debian dot org
  2004-08-24 21:06 ` falk at debian dot org
  5 siblings, 0 replies; 7+ messages in thread
From: pinskia at gcc dot gnu dot org @ 2004-08-10 14:23 UTC (permalink / raw)
  To: gcc-bugs


------- Additional Comments From pinskia at gcc dot gnu dot org  2004-08-10 14:23 -------
Invalid as there is no way we can schedule instructions inside an asm block.

-- 
           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|UNCONFIRMED                 |RESOLVED
         Resolution|                            |INVALID


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16962



* [Bug tree-optimization/16962] loop unrolling with x86-64 asm not efficient
  2004-08-10 13:42 [Bug c/16962] New: loop unrolling with x86-64 asm not efficient tomstdenis at iahu dot ca
                   ` (3 preceding siblings ...)
  2004-08-10 14:23 ` pinskia at gcc dot gnu dot org
@ 2004-08-10 14:55 ` falk at debian dot org
  2004-08-24 21:06 ` falk at debian dot org
  5 siblings, 0 replies; 7+ messages in thread
From: falk at debian dot org @ 2004-08-10 14:55 UTC (permalink / raw)
  To: gcc-bugs


------- Additional Comments From falk at debian dot org  2004-08-10 14:55 -------
Well, the fundamental problem is independent of assembly. Here's a much simpler
example:

void f(int *p, int l) {
    int i;
    for (i = 0; i < l; ++i)
        p[i] = 0;
}

With 3.4 and also with an oldish lno, on Alpha I get

$L36:
        stq $31,0($3)
        lda $5,1($5)
        lda $3,8($3)
$L35:
        stq $31,0($3)
        lda $5,1($5)
        lda $3,8($3)
...

while the offsets could easily be constant folded if there were no jumps into
the loop.
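
For illustration, here is a sketch (mine, not compiler output) of what a folded
two-iteration body could look like, assuming the same register assignments as
the listing above:

$L36:
        stq $31,0($3)   # p[i] = 0
        stq $31,8($3)   # p[i+1] = 0, offset folded into the addressing mode
        lda $5,2($5)    # i += 2
        lda $3,16($3)   # p += 2
...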

-- 
           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|RESOLVED                    |UNCONFIRMED
         Resolution|INVALID                     |


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16962



* [Bug tree-optimization/16962] loop unrolling with x86-64 asm not efficient
  2004-08-10 13:42 [Bug c/16962] New: loop unrolling with x86-64 asm not efficient tomstdenis at iahu dot ca
                   ` (4 preceding siblings ...)
  2004-08-10 14:55 ` falk at debian dot org
@ 2004-08-24 21:06 ` falk at debian dot org
  5 siblings, 0 replies; 7+ messages in thread
From: falk at debian dot org @ 2004-08-24 21:06 UTC (permalink / raw)
  To: gcc-bugs


------- Additional Comments From falk at debian dot org  2004-08-24 21:06 -------
Looking more closely, my Alpha example actually generates reasonable code
now. As to your assembly: I can't really read i386 assembly, but it seems it
modifies something without telling gcc, so it is invalid. To improve
performance, you should avoid (%x) and use a "m" constraint, and not clobber
hard regs but use "=r" on temp vars. Something like:

asm("..." : "=m"(*out) :"r"(x), "m"(*y));

With this, I get entirely reasonable code on the lno-branch, so closing.
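
Spelled out, a minimal sketch of test3 rewritten along these lines (my reading
of the suggestion, not code from the thread; splitting the 128-bit destination
into two 64-bit memory operands is an assumption):

void test3(mp_word *out, mp_digit x, mp_digit *y, int n)
{
  int z;
  for (z = 0; z < n; z++) {
    mp_digit *o = (mp_digit *)out;  /* low/high halves of *out */
    asm("movq %2, %%rax\n\t"
        "mulq %3\n\t"               /* rdx:rax = x * *y */
        "addq %%rax, %0\n\t"        /* add low half */
        "adcq %%rdx, %1"            /* carry into high half */
        : "+m"(o[0]), "+m"(o[1])    /* gcc now knows exactly what is written */
        : "r"(x), "m"(*y)
        : "%rax", "%rdx", "cc");
    ++out;
    ++y;
  }
}

Because the operands are "m" rather than hand-written (%reg) references, the
unroller is free to fold offsets into the addressing modes, e.g. mulq 8(%rsi),
instead of bumping a pointer every iteration.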


-- 
           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|UNCONFIRMED                 |RESOLVED
         Resolution|                            |INVALID


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16962



