public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
From: "hewillk at gmail dot com" <gcc-bugzilla@gcc.gnu.org>
To: gcc-bugs@gcc.gnu.org
Subject: [Bug middle-end/100104] std::transform is 1.5 times faster than std::copy with -O3
Date: Fri, 16 Apr 2021 01:43:26 +0000	[thread overview]
Message-ID: <bug-100104-4-FYDJ0Pwfkq@http.gcc.gnu.org/bugzilla/> (raw)
In-Reply-To: <bug-100104-4@http.gcc.gnu.org/bugzilla/>

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100104

--- Comment #3 from 康桓瑋 <hewillk at gmail dot com> ---
Build "copy" with -O2 on x86-64 (https://godbolt.org/z/Gja6xrq9G):

.LC0:
        .string "vector::_M_realloc_insert"
copy(std::vector<double, std::allocator<double> > const&):
        push    r15
        pxor    xmm0, xmm0
        push    r14
        push    r13
        push    r12
        mov     r12, rdi
        push    rbp
        push    rbx
        sub     rsp, 40
        mov     r13, QWORD PTR [rsi+8]
        mov     rbx, QWORD PTR [rsi]
        movups  XMMWORD PTR [rdi], xmm0
        mov     QWORD PTR [rdi+16], 0
        cmp     rbx, r13
        je      .L1
        xor     r8d, r8d
        xor     ecx, ecx
        jmp     .L13
.L31:
        mov     DWORD PTR [rcx], ebp
        add     rbx, 8
        add     rcx, 4
        mov     QWORD PTR [r12+8], rcx
        cmp     r13, rbx
        je      .L1
.L13:
        cvttsd2si       ebp, QWORD PTR [rbx]
        cmp     rcx, r8
        jne     .L31
        movabs  rax, 2305843009213693951
        mov     r15, QWORD PTR [r12]
        sub     rcx, r15
        mov     rdx, rcx
        mov     r14, rcx
        sar     rdx, 2
        cmp     rdx, rax
        je      .L32
        test    rdx, rdx
        mov     eax, 1
        cmovne  rax, rdx
        add     rax, rdx
        jc      .L7
        test    rax, rax
        jne     .L33
        xor     r8d, r8d
        xor     edi, edi
.L9:
        lea     rcx, [rdi+4+r14]
        movq    xmm0, rdi
        mov     DWORD PTR [rdi+r14], ebp
        movq    xmm1, rcx
        punpcklqdq      xmm0, xmm1
        test    r14, r14
        jg      .L34
        test    r15, r15
        jne     .L35
.L12:
        add     rbx, 8
        mov     QWORD PTR [r12+16], r8
        movups  XMMWORD PTR [r12], xmm0
        cmp     r13, rbx
        jne     .L13
.L1:
        add     rsp, 40
        mov     rax, r12
        pop     rbx
        pop     rbp
        pop     r12
        pop     r13
        pop     r14
        pop     r15
        ret
.L34:
        mov     rsi, r15
        mov     rdx, r14
        mov     QWORD PTR [rsp+8], r8
        mov     QWORD PTR [rsp], rcx
        movaps  XMMWORD PTR [rsp+16], xmm0
        call    memmove
        mov     rsi, QWORD PTR [r12+16]
        mov     rcx, QWORD PTR [rsp]
        mov     r8, QWORD PTR [rsp+8]
        movdqa  xmm0, XMMWORD PTR [rsp+16]
        sub     rsi, r15
.L11:
        mov     rdi, r15
        mov     QWORD PTR [rsp+8], r8
        mov     QWORD PTR [rsp], rcx
        movaps  XMMWORD PTR [rsp+16], xmm0
        call    operator delete(void*, unsigned long)
        movdqa  xmm0, XMMWORD PTR [rsp+16]
        mov     r8, QWORD PTR [rsp+8]
        mov     rcx, QWORD PTR [rsp]
        jmp     .L12
.L35:
        mov     rsi, QWORD PTR [r12+16]
        sub     rsi, r15
        jmp     .L11
.L33:
        movabs  rdx, 2305843009213693951
        cmp     rax, rdx
        cmova   rax, rdx
        sal     rax, 2
        mov     QWORD PTR [rsp], rax
        mov     rdi, rax
.L8:
        call    operator new(unsigned long)
        mov     r8, QWORD PTR [rsp]
        mov     rdi, rax
        add     r8, rax
        jmp     .L9
.L7:
        movabs  rax, 9223372036854775804
        mov     QWORD PTR [rsp], rax
        mov     rdi, rax
        jmp     .L8
.L32:
        mov     edi, OFFSET FLAT:.LC0
        call    std::__throw_length_error(char const*)
        mov     rbp, rax
        jmp     .L15
copy(std::vector<double, std::allocator<double> > const&) [clone .cold]:


===========================================================================


Build "copy" with -O3 on x86-64:

.LC0:
        .string "vector::_M_realloc_insert"
copy(std::vector<double, std::allocator<double> > const&):
        push    r15
        pxor    xmm0, xmm0
        push    r14
        push    r13
        push    r12
        mov     r12, rdi
        push    rbp
        push    rbx
        sub     rsp, 40
        mov     r13, QWORD PTR [rsi+8]
        mov     rbx, QWORD PTR [rsi]
        movups  XMMWORD PTR [rdi], xmm0
        mov     QWORD PTR [rdi+16], 0
        cmp     rbx, r13
        je      .L1
        xor     r8d, r8d
        xor     ecx, ecx
        jmp     .L13
.L31:
        mov     DWORD PTR [rcx], ebp
        add     rbx, 8
        add     rcx, 4
        mov     QWORD PTR [r12+8], rcx
        cmp     r13, rbx
        je      .L1
.L13:
        cvttsd2si       ebp, QWORD PTR [rbx]
        cmp     rcx, r8
        jne     .L31
        movabs  rax, 2305843009213693951
        mov     r15, QWORD PTR [r12]
        sub     rcx, r15
        mov     rdx, rcx
        mov     r14, rcx
        sar     rdx, 2
        cmp     rdx, rax
        je      .L32
        test    rdx, rdx
        mov     eax, 1
        cmovne  rax, rdx
        add     rax, rdx
        jc      .L7
        test    rax, rax
        jne     .L33
        xor     r8d, r8d
        xor     edi, edi
.L9:
        lea     rcx, [rdi+4+r14]
        movq    xmm0, rdi
        mov     DWORD PTR [rdi+r14], ebp
        movq    xmm1, rcx
        punpcklqdq      xmm0, xmm1
        test    r14, r14
        jg      .L34
        test    r15, r15
        jne     .L35
.L12:
        add     rbx, 8
        mov     QWORD PTR [r12+16], r8
        movups  XMMWORD PTR [r12], xmm0
        cmp     r13, rbx
        jne     .L13
.L1:
        add     rsp, 40
        mov     rax, r12
        pop     rbx
        pop     rbp
        pop     r12
        pop     r13
        pop     r14
        pop     r15
        ret
.L34:
        mov     rsi, r15
        mov     rdx, r14
        mov     QWORD PTR [rsp+8], r8
        mov     QWORD PTR [rsp], rcx
        movaps  XMMWORD PTR [rsp+16], xmm0
        call    memmove
        mov     rsi, QWORD PTR [r12+16]
        mov     rcx, QWORD PTR [rsp]
        mov     r8, QWORD PTR [rsp+8]
        movdqa  xmm0, XMMWORD PTR [rsp+16]
        sub     rsi, r15
.L11:
        mov     rdi, r15
        mov     QWORD PTR [rsp+8], r8
        mov     QWORD PTR [rsp], rcx
        movaps  XMMWORD PTR [rsp+16], xmm0
        call    operator delete(void*, unsigned long)
        movdqa  xmm0, XMMWORD PTR [rsp+16]
        mov     r8, QWORD PTR [rsp+8]
        mov     rcx, QWORD PTR [rsp]
        jmp     .L12
.L35:
        mov     rsi, QWORD PTR [r12+16]
        sub     rsi, r15
        jmp     .L11
.L33:
        movabs  rdx, 2305843009213693951
        cmp     rax, rdx
        cmova   rax, rdx
        sal     rax, 2
        mov     QWORD PTR [rsp], rax
        mov     rdi, rax
.L8:
        call    operator new(unsigned long)
        mov     r8, QWORD PTR [rsp]
        mov     rdi, rax
        add     r8, rax
        jmp     .L9
.L7:
        movabs  rax, 9223372036854775804
        mov     QWORD PTR [rsp], rax
        mov     rdi, rax
        jmp     .L8
.L32:
        mov     edi, OFFSET FLAT:.LC0
        call    std::__throw_length_error(char const*)
        mov     rbp, rax
        jmp     .L15
copy(std::vector<double, std::allocator<double> > const&) [clone .cold]:


===========================================================================


Build "transform" with -O2 on x86-64 (https://godbolt.org/z/YTEfWEbcq):

.LC0:
        .string "vector::_M_realloc_insert"
transform(std::vector<double, std::allocator<double> > const&):
        push    r12
        mov     r12, rdi
        push    rbp
        push    rbx
        sub     rsp, 16
        mov     rbp, QWORD PTR [rsi+8]
        mov     rbx, QWORD PTR [rsi]
        mov     QWORD PTR [rdi], 0
        mov     QWORD PTR [rdi+8], 0
        mov     QWORD PTR [rdi+16], 0
        cmp     rbx, rbp
        je      .L19
        xor     edx, edx
        xor     esi, esi
        jmp     .L23
.L31:
        mov     DWORD PTR [rsi], eax
        add     rbx, 8
        add     rsi, 4
        mov     QWORD PTR [r12+8], rsi
        cmp     rbp, rbx
        je      .L19
.L32:
        mov     rsi, QWORD PTR [r12+8]
        mov     rdx, QWORD PTR [r12+16]
.L23:
        cvttsd2si       eax, QWORD PTR [rbx]
        mov     DWORD PTR [rsp+12], eax
        cmp     rsi, rdx
        jne     .L31
        lea     rdx, [rsp+12]
        mov     rdi, r12
        call    void std::vector<int, std::allocator<int> >::_M_realloc_insert<int>(__gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int> > >, int&&)
        add     rbx, 8
        cmp     rbp, rbx
        jne     .L32
.L19:
        add     rsp, 16
        mov     rax, r12
        pop     rbx
        pop     rbp
        pop     r12
        ret
        mov     rbp, rax
        jmp     .L24
transform(std::vector<double, std::allocator<double> > const&) [clone .cold]:


===========================================================================


Build "transform" with -O3 on x86-64:

.LC0:
        .string "vector::_M_realloc_insert"
transform(std::vector<double, std::allocator<double> > const&):
        push    r15
        pxor    xmm0, xmm0
        push    r14
        push    r13
        push    r12
        mov     r12, rdi
        push    rbp
        push    rbx
        sub     rsp, 40
        mov     r13, QWORD PTR [rsi+8]
        mov     rbx, QWORD PTR [rsi]
        movups  XMMWORD PTR [rdi], xmm0
        mov     QWORD PTR [rdi+16], 0
        cmp     rbx, r13
        je      .L1
        xor     r8d, r8d
        xor     ecx, ecx
        jmp     .L13
.L31:
        mov     DWORD PTR [rcx], ebp
        add     rbx, 8
        add     rcx, 4
        mov     QWORD PTR [r12+8], rcx
        cmp     r13, rbx
        je      .L1
.L13:
        cvttsd2si       ebp, QWORD PTR [rbx]
        cmp     rcx, r8
        jne     .L31
        movabs  rax, 2305843009213693951
        mov     r15, QWORD PTR [r12]
        sub     rcx, r15
        mov     rdx, rcx
        mov     r14, rcx
        sar     rdx, 2
        cmp     rdx, rax
        je      .L32
        test    rdx, rdx
        mov     eax, 1
        cmovne  rax, rdx
        add     rax, rdx
        jc      .L7
        test    rax, rax
        jne     .L33
        xor     r8d, r8d
        xor     edi, edi
.L9:
        lea     rcx, [rdi+4+r14]
        movq    xmm0, rdi
        mov     DWORD PTR [rdi+r14], ebp
        movq    xmm1, rcx
        punpcklqdq      xmm0, xmm1
        test    r14, r14
        jg      .L34
        test    r15, r15
        jne     .L35
.L12:
        add     rbx, 8
        mov     QWORD PTR [r12+16], r8
        movups  XMMWORD PTR [r12], xmm0
        cmp     r13, rbx
        jne     .L13
.L1:
        add     rsp, 40
        mov     rax, r12
        pop     rbx
        pop     rbp
        pop     r12
        pop     r13
        pop     r14
        pop     r15
        ret
.L34:
        mov     rsi, r15
        mov     rdx, r14
        mov     QWORD PTR [rsp+8], r8
        mov     QWORD PTR [rsp], rcx
        movaps  XMMWORD PTR [rsp+16], xmm0
        call    memmove
        mov     rsi, QWORD PTR [r12+16]
        mov     rcx, QWORD PTR [rsp]
        mov     r8, QWORD PTR [rsp+8]
        movdqa  xmm0, XMMWORD PTR [rsp+16]
        sub     rsi, r15
.L11:
        mov     rdi, r15
        mov     QWORD PTR [rsp+8], r8
        mov     QWORD PTR [rsp], rcx
        movaps  XMMWORD PTR [rsp+16], xmm0
        call    operator delete(void*, unsigned long)
        movdqa  xmm0, XMMWORD PTR [rsp+16]
        mov     r8, QWORD PTR [rsp+8]
        mov     rcx, QWORD PTR [rsp]
        jmp     .L12
.L35:
        mov     rsi, QWORD PTR [r12+16]
        sub     rsi, r15
        jmp     .L11
.L33:
        movabs  rdx, 2305843009213693951
        cmp     rax, rdx
        cmova   rax, rdx
        sal     rax, 2
        mov     QWORD PTR [rsp], rax
        mov     rdi, rax
.L8:
        call    operator new(unsigned long)
        mov     r8, QWORD PTR [rsp]
        mov     rdi, rax
        add     r8, rax
        jmp     .L9
.L7:
        movabs  rax, 9223372036854775804
        mov     QWORD PTR [rsp], rax
        mov     rdi, rax
        jmp     .L8
.L32:
        mov     edi, OFFSET FLAT:.LC0
        call    std::__throw_length_error(char const*)
        mov     rbp, rax
        jmp     .L15
transform(std::vector<double, std::allocator<double> > const&) [clone .cold]:

  parent reply	other threads:[~2021-04-16  1:43 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-04-15 18:06 [Bug c++/100104] New: " hewillk at gmail dot com
2021-04-15 19:30 ` [Bug middle-end/100104] " pinskia at gcc dot gnu.org
2021-04-16  1:42 ` hewillk at gmail dot com
2021-04-16  1:43 ` hewillk at gmail dot com [this message]
2021-04-16  1:51 ` hewillk at gmail dot com
2021-04-18 17:30 ` hewillk at gmail dot com

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=bug-100104-4-FYDJ0Pwfkq@http.gcc.gnu.org/bugzilla/ \
    --to=gcc-bugzilla@gcc.gnu.org \
    --cc=gcc-bugs@gcc.gnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).