public inbox for gcc-bugs@sourceware.org help / color / mirror / Atom feed
From: "tomstdenis at iahu dot ca" <gcc-bugzilla@gcc.gnu.org> To: gcc-bugs@gcc.gnu.org Subject: [Bug c/16962] New: loop unrolling with x86-64 asm not efficient Date: Tue, 10 Aug 2004 13:42:00 -0000 [thread overview] Message-ID: <20040810134244.16962.tomstdenis@iahu.ca> (raw) [note this bug was previously reported as part of bug #16961 which currently reports inefficient 128-bit additions]. I have the following demo program typedef unsigned long long mp_digit; typedef unsigned long mp_word __attribute__ ((mode(TI))); mp_word a, b; // demo slow 128-bit add void test(void) { a += b; } // this unrolls right (but is otherwise inefficient cuz of the 128-bit add) void test2(mp_word *out, mp_digit x, mp_digit *y, int n) { int z; for (z = 0; z < n; z++) out[z] += ((mp_word)x) * ((mp_word)*y++); } // this unrolls poorly void test3(mp_word *out, mp_digit x, mp_digit *y, int n) { int z; for (z = 0; z < n; z++) { asm("movq %0,%%rax\n" "mulq (%1)\n" "addq %%rax,(%2)\n" "adcq %%rdx,8(%2)\n" ::"r"(x), "r"(y), "r"(out) : "%rax", "%rdx"); ++out; ++y; } } And it produces inefficient loop unrolling when ASM blocks are used... 
I used "gcc -O3 -fomit-frame-pointer -funroll-loops -march=k8 -m64 -S test.c" .file "test.c" .text .p2align 4,,15 .globl test .type test, @function test: .LFB2: movq a(%rip), %r10 movq b(%rip), %r8 xorl %ecx, %ecx movq a+8(%rip), %rdi movq b+8(%rip), %r9 leaq (%r10,%r8), %rax leaq (%rdi,%r9), %rsi cmpq %r10, %rax movq %rax, a(%rip) setb %cl leaq (%rcx,%rsi), %rdx movq %rdx, a+8(%rip) ret .LFE2: .size test, .-test .p2align 4,,15 .globl test2 .type test2, @function test2: .LFB3: movq %r13, -24(%rsp) .LCFI0: movq %r14, -16(%rsp) .LCFI1: movq %rdi, %r11 movq %r15, -8(%rsp) .LCFI2: movq %rbx, -48(%rsp) .LCFI3: movq %rsi, %r13 movq %rbp, -40(%rsp) .LCFI4: movq %r12, -32(%rsp) .LCFI5: subq $64, %rsp .LCFI6: testl %ecx, %ecx movq %rdx, %r14 movl %ecx, %r15d jle .L8 movq %rsi, %rax movq (%rdi), %r12 movq 8(%rdi), %rdi mulq (%rdx) leal -1(%r15), %r10d xorl %ecx, %ecx leaq 8(%r14), %rbp movl %r10d, %ebx andl $3, %ebx movq %rdx, %r9 leaq (%r12,%rax), %rdx leaq (%rdi,%r9), %rsi cmpq %r12, %rdx movq %rdx, -8(%rsp) movq -8(%rsp), %rax setb %cl movq %rsi, (%rsp) addq %rcx, (%rsp) movq (%rsp), %rdx movl %r10d, %r12d movl $16, %r10d testl %r12d, %r12d movq %rax, (%r11) movq %rdx, 8(%r11) je .L8 testl %ebx, %ebx je .L6 cmpl $1, %ebx je .L23 cmpl $2, %ebx .p2align 4,,5 je .L24 movq %r13, %rax movq 16(%r11), %rsi movq 24(%r11), %rdi mulq 8(%r14) leaq 16(%r14), %rbp movb $32, %r10b leaq (%rsi,%rax), %r12 leaq (%rdi,%rdx), %rcx xorl %eax, %eax cmpq %rsi, %r12 movq %rcx, -80(%rsp) movq %r12, -88(%rsp) setb %al addq %rax, -80(%rsp) movq -88(%rsp), %r14 movq -80(%rsp), %rbx leal -2(%r15), %r12d movq %r14, 16(%r11) movq %rbx, 24(%r11) .L24: movq %r13, %rax movq (%r10,%r11), %rcx xorl %r8d, %r8d mulq (%rbp) addq $8, %rbp movq %rax, %rdi movq 8(%r10,%r11), %rax leaq (%rcx,%rdi), %r9 leaq (%rax,%rdx), %rbx cmpq %rcx, %r9 movq %r9, -104(%rsp) setb %r8b movq -104(%rsp), %rdx decl %r12d movq %rbx, -96(%rsp) addq %r8, -96(%rsp) movq -96(%rsp), %r15 movq %rdx, (%r10,%r11) movq %r15, 8(%r10,%r11) 
addq $16, %r10 .L23: movq %r13, %rax movq 8(%r10,%r11), %r14 xorl %r8d, %r8d mulq (%rbp) addq $8, %rbp movq %rax, %r9 movq (%r10,%r11), %rax leaq (%r14,%rdx), %rdx movq %rdx, -112(%rsp) leaq (%rax,%r9), %rcx cmpq %rax, %rcx movq %rcx, -120(%rsp) movq -120(%rsp), %r15 setb %r8b addq %r8, -112(%rsp) movq -112(%rsp), %rsi movq %r15, (%r10,%r11) movq %rsi, 8(%r10,%r11) addq $16, %r10 decl %r12d je .L8 .p2align 4,,7 .L6: movq %r13, %rax movq (%r10,%r11), %rbx movq (%r10,%r11), %r15 mulq (%rbp) movq 8(%r10,%r11), %rsi xorl %r9d, %r9d movq 16(%r10,%r11), %r8 movq 32(%r10,%r11), %r14 addq %rax, %rbx movq %r13, %rax cmpq %r15, %rbx movq 24(%r10,%r11), %r15 movq %rbx, -24(%rsp) setb %r9b addq %rdx, %rsi movq -24(%rsp), %rcx movq %rsi, -16(%rsp) addq %r9, -16(%rsp) xorl %esi, %esi movq -16(%rsp), %rdx movq %rcx, (%r10,%r11) movq %rdx, 8(%r10,%r11) mulq 8(%rbp) addq %rax, %r8 movq 16(%r10,%r11), %rax movq %r8, -40(%rsp) movq -40(%rsp), %rdi cmpq %rax, %r8 movq %r13, %rax setb %sil addq %rdx, %r15 movq %rdi, 16(%r10,%r11) mulq 16(%rbp) movq %r15, -32(%rsp) movq 40(%r10,%r11), %r15 addq %rsi, -32(%rsp) movq -32(%rsp), %r9 movq %r9, 24(%r10,%r11) movq %rdx, %rbx movq 32(%r10,%r11), %rdx movq %rax, %rcx addq %rcx, %rdx cmpq %r14, %rdx movq %rdx, -56(%rsp) movq -56(%rsp), %rdi setb %r8b addq %rbx, %r15 movl %r8d, %eax movq %r15, -48(%rsp) xorl %r15d, %r15d movzbl %al, %esi addq %rsi, -48(%rsp) movq %r13, %rax movq -48(%rsp), %r9 movq %rdi, 32(%r10,%r11) mulq 24(%rbp) movq 56(%r10,%r11), %r14 addq $32, %rbp movq %r9, 40(%r10,%r11) movq %rax, %rcx movq 48(%r10,%r11), %rax movq %rdx, %rbx leaq (%r14,%rbx), %r8 leaq (%rax,%rcx), %rdx movq %r8, -64(%rsp) cmpq %rax, %rdx movq %rdx, -72(%rsp) movq -72(%rsp), %rsi setb %r15b addq %r15, -64(%rsp) movq -64(%rsp), %rdi movq %rsi, 48(%r10,%r11) movq %rdi, 56(%r10,%r11) addq $64, %r10 subl $4, %r12d jne .L6 .p2align 4,,7 .L8: movq 16(%rsp), %rbx movq 24(%rsp), %rbp movq 32(%rsp), %r12 movq 40(%rsp), %r13 movq 48(%rsp), %r14 movq 56(%rsp), %r15 
addq $64, %rsp ret .LFE3: .size test2, .-test2 .p2align 4,,15 .globl test3 .type test3, @function test3: .LFB4: pushq %rbp .LCFI7: testl %ecx, %ecx movq %rsi, %r10 movl %ecx, %ebp pushq %rbx .LCFI8: movq %rdi, %rbx movq %rdx, %rdi jle .L33 leal -1(%rbp), %ecx movl %ecx, %esi andl $7, %esi #APP movq %r10,%rax mulq (%rdi) addq %rax,(%rbx) adcq %rdx,8(%rbx) #NO_APP testl %ecx, %ecx leaq 16(%rbx), %r9 leaq 8(%rdi), %r8 movl %ecx, %r11d je .L33 testl %esi, %esi je .L31 cmpl $1, %esi je .L61 cmpl $2, %esi .p2align 4,,5 je .L62 cmpl $3, %esi .p2align 4,,5 je .L63 cmpl $4, %esi .p2align 4,,5 je .L64 cmpl $5, %esi .p2align 4,,5 je .L65 cmpl $6, %esi .p2align 4,,5 je .L66 #APP movq %r10,%rax mulq (%r8) addq %rax,(%r9) adcq %rdx,8(%r9) #NO_APP leaq 32(%rbx), %r9 leaq 16(%rdi), %r8 leal -2(%rbp), %r11d .L66: #APP movq %r10,%rax mulq (%r8) addq %rax,(%r9) adcq %rdx,8(%r9) #NO_APP addq $16, %r9 addq $8, %r8 decl %r11d .L65: #APP movq %r10,%rax mulq (%r8) addq %rax,(%r9) adcq %rdx,8(%r9) #NO_APP addq $16, %r9 addq $8, %r8 decl %r11d .L64: #APP movq %r10,%rax mulq (%r8) addq %rax,(%r9) adcq %rdx,8(%r9) #NO_APP addq $16, %r9 addq $8, %r8 decl %r11d .L63: #APP movq %r10,%rax mulq (%r8) addq %rax,(%r9) adcq %rdx,8(%r9) #NO_APP addq $16, %r9 addq $8, %r8 decl %r11d .L62: #APP movq %r10,%rax mulq (%r8) addq %rax,(%r9) adcq %rdx,8(%r9) #NO_APP addq $16, %r9 addq $8, %r8 decl %r11d .L61: #APP movq %r10,%rax mulq (%r8) addq %rax,(%r9) adcq %rdx,8(%r9) #NO_APP addq $16, %r9 addq $8, %r8 decl %r11d je .L33 .L31: #APP movq %r10,%rax mulq (%r8) addq %rax,(%r9) adcq %rdx,8(%r9) #NO_APP leaq 16(%r9), %rsi leaq 8(%r8), %rbp #APP movq %r10,%rax mulq (%rbp) addq %rax,(%rsi) adcq %rdx,8(%rsi) #NO_APP leaq 32(%r9), %rdi leaq 16(%r8), %rbx #APP movq %r10,%rax mulq (%rbx) addq %rax,(%rdi) adcq %rdx,8(%rdi) #NO_APP leaq 48(%r9), %rcx leaq 24(%r8), %rbp #APP movq %r10,%rax mulq (%rbp) addq %rax,(%rcx) adcq %rdx,8(%rcx) #NO_APP leaq 64(%r9), %rsi leaq 32(%r8), %rdi #APP movq %r10,%rax mulq (%rdi) addq 
%rax,(%rsi) adcq %rdx,8(%rsi) #NO_APP leaq 80(%r9), %rbx leaq 40(%r8), %rcx #APP movq %r10,%rax mulq (%rcx) addq %rax,(%rbx) adcq %rdx,8(%rbx) #NO_APP leaq 96(%r9), %rbp leaq 48(%r8), %rdi #APP movq %r10,%rax mulq (%rdi) addq %rax,(%rbp) adcq %rdx,8(%rbp) #NO_APP leaq 112(%r9), %rsi leaq 56(%r8), %rbx #APP movq %r10,%rax mulq (%rbx) addq %rax,(%rsi) adcq %rdx,8(%rsi) #NO_APP subq $-128, %r9 addq $64, %r8 subl $8, %r11d jne .L31 .L33: popq %rbx popq %rbp ret .LFE4: .size test3, .-test3 .comm a,16,16 .comm b,16,16 .section .eh_frame,"a",@progbits .Lframe1: .long .LECIE1-.LSCIE1 .LSCIE1: .long 0x0 .byte 0x1 .string "" .uleb128 0x1 .sleb128 -8 .byte 0x10 .byte 0xc .uleb128 0x7 .uleb128 0x8 .byte 0x90 .uleb128 0x1 .align 8 .LECIE1: .LSFDE1: .long .LEFDE1-.LASFDE1 .LASFDE1: .long .LASFDE1-.Lframe1 .quad .LFB2 .quad .LFE2-.LFB2 .align 8 .LEFDE1: .LSFDE3: .long .LEFDE3-.LASFDE3 .LASFDE3: .long .LASFDE3-.Lframe1 .quad .LFB3 .quad .LFE3-.LFB3 .byte 0x4 .long .LCFI3-.LFB3 .byte 0x83 .uleb128 0x7 .byte 0x8f .uleb128 0x2 .byte 0x8e .uleb128 0x3 .byte 0x8d .uleb128 0x4 .byte 0x4 .long .LCFI6-.LCFI3 .byte 0xe .uleb128 0x48 .byte 0x8c .uleb128 0x5 .byte 0x86 .uleb128 0x6 .align 8 .LEFDE3: .LSFDE5: .long .LEFDE5-.LASFDE5 .LASFDE5: .long .LASFDE5-.Lframe1 .quad .LFB4 .quad .LFE4-.LFB4 .byte 0x4 .long .LCFI7-.LFB4 .byte 0xe .uleb128 0x10 .byte 0x86 .uleb128 0x2 .byte 0x4 .long .LCFI8-.LCFI7 .byte 0xe .uleb128 0x18 .byte 0x83 .uleb128 0x3 .align 8 .LEFDE5: .section .note.GNU-stack,"",@progbits .ident "GCC: (GNU) 3.4.1 (Gentoo Linux 3.4.1, ssp-3.4-2, pie-8.7.6.3)" -- Summary: loop unrolling with x86-64 asm not efficient Product: gcc Version: 3.4.1 Status: UNCONFIRMED Severity: normal Priority: P2 Component: c AssignedTo: unassigned at gcc dot gnu dot org ReportedBy: tomstdenis at iahu dot ca CC: gcc-bugs at gcc dot gnu dot org GCC build triplet: gcc version 3.4.1 (Gentoo Linux 3.4.1, ssp-3.4-2, pie- 8.7.6.3) GCC host triplet: Linux timmy 2.6.7-gentoo-r11 #1 Thu Aug 5 01:49:49 UTC 2004 
x86_ GCC target triplet: gcc version 3.4.1 (Gentoo Linux 3.4.1, ssp-3.4-2, pie- 8.7.6.3) http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16962
next reply other threads:[~2004-08-10 13:42 UTC|newest] Thread overview: 7+ messages / expand[flat|nested] mbox.gz Atom feed top 2004-08-10 13:42 tomstdenis at iahu dot ca [this message] 2004-08-10 14:09 ` [Bug tree-optimization/16962] " falk at debian dot org 2004-08-10 14:10 ` falk at debian dot org 2004-08-10 14:13 ` tomstdenis at iahu dot ca 2004-08-10 14:23 ` pinskia at gcc dot gnu dot org 2004-08-10 14:55 ` falk at debian dot org 2004-08-24 21:06 ` falk at debian dot org
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=20040810134244.16962.tomstdenis@iahu.ca \ --to=gcc-bugzilla@gcc.gnu.org \ --cc=gcc-bugs@gcc.gnu.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).