From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 48) id 423E93860C3F; Fri, 16 Apr 2021 01:42:40 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 423E93860C3F From: "hewillk at gmail dot com" To: gcc-bugs@gcc.gnu.org Subject: [Bug middle-end/100104] std::transform is 1.5 times faster than std::copy with -O3 Date: Fri, 16 Apr 2021 01:42:40 +0000 X-Bugzilla-Reason: CC X-Bugzilla-Type: changed X-Bugzilla-Watch-Reason: None X-Bugzilla-Product: gcc X-Bugzilla-Component: middle-end X-Bugzilla-Version: 11.0 X-Bugzilla-Keywords: missed-optimization X-Bugzilla-Severity: normal X-Bugzilla-Who: hewillk at gmail dot com X-Bugzilla-Status: UNCONFIRMED X-Bugzilla-Resolution: X-Bugzilla-Priority: P3 X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org X-Bugzilla-Target-Milestone: --- X-Bugzilla-Flags: X-Bugzilla-Changed-Fields: Message-ID: In-Reply-To: References: Content-Type: text/plain; charset="UTF-8" Content-Transfer-Encoding: quoted-printable X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/ Auto-Submitted: auto-generated MIME-Version: 1.0 X-BeenThere: gcc-bugs@gcc.gnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Gcc-bugs mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Fri, 16 Apr 2021 01:42:40 -0000 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100104 --- Comment #2 from 康桓瑋 --- Build "copy" with -O2 on x86-64 (https://godbolt.org/z/Gja6xrq9G): .LC0: .string "vector::_M_realloc_insert" copy(std::vector > const&): push r15 pxor xmm0, xmm0 push r14 push r13 push r12 mov r12, rdi push rbp push rbx sub rsp, 40 mov r13, QWORD PTR [rsi+8] mov rbx, QWORD PTR [rsi] movups XMMWORD PTR [rdi], xmm0 mov QWORD PTR [rdi+16], 0 cmp rbx, r13 je .L1 xor r8d, r8d xor ecx, ecx jmp .L13 .L31: mov DWORD PTR [rcx], ebp add rbx, 8 add rcx, 4 mov QWORD PTR [r12+8], rcx cmp r13, rbx je .L1 .L13: cvttsd2si ebp, QWORD PTR [rbx] cmp rcx, r8 
jne .L31 movabs rax, 2305843009213693951 mov r15, QWORD PTR [r12] sub rcx, r15 mov rdx, rcx mov r14, rcx sar rdx, 2 cmp rdx, rax je .L32 test rdx, rdx mov eax, 1 cmovne rax, rdx add rax, rdx jc .L7 test rax, rax jne .L33 xor r8d, r8d xor edi, edi .L9: lea rcx, [rdi+4+r14] movq xmm0, rdi mov DWORD PTR [rdi+r14], ebp movq xmm1, rcx punpcklqdq xmm0, xmm1 test r14, r14 jg .L34 test r15, r15 jne .L35 .L12: add rbx, 8 mov QWORD PTR [r12+16], r8 movups XMMWORD PTR [r12], xmm0 cmp r13, rbx jne .L13 .L1: add rsp, 40 mov rax, r12 pop rbx pop rbp pop r12 pop r13 pop r14 pop r15 ret .L34: mov rsi, r15 mov rdx, r14 mov QWORD PTR [rsp+8], r8 mov QWORD PTR [rsp], rcx movaps XMMWORD PTR [rsp+16], xmm0 call memmove mov rsi, QWORD PTR [r12+16] mov rcx, QWORD PTR [rsp] mov r8, QWORD PTR [rsp+8] movdqa xmm0, XMMWORD PTR [rsp+16] sub rsi, r15 .L11: mov rdi, r15 mov QWORD PTR [rsp+8], r8 mov QWORD PTR [rsp], rcx movaps XMMWORD PTR [rsp+16], xmm0 call operator delete(void*, unsigned long) movdqa xmm0, XMMWORD PTR [rsp+16] mov r8, QWORD PTR [rsp+8] mov rcx, QWORD PTR [rsp] jmp .L12 .L35: mov rsi, QWORD PTR [r12+16] sub rsi, r15 jmp .L11 .L33: movabs rdx, 2305843009213693951 cmp rax, rdx cmova rax, rdx sal rax, 2 mov QWORD PTR [rsp], rax mov rdi, rax .L8: call operator new(unsigned long) mov r8, QWORD PTR [rsp] mov rdi, rax add r8, rax jmp .L9 .L7: movabs rax, 9223372036854775804 mov QWORD PTR [rsp], rax mov rdi, rax jmp .L8 .L32: mov edi, OFFSET FLAT:.LC0 call std::__throw_length_error(char const*) mov rbp, rax jmp .L15 copy(std::vector > const&) [clone .cold]: =========================================================================== with -O3: .LC0: .string "vector::_M_realloc_insert" copy(std::vector > const&): push r15 pxor xmm0, xmm0 push r14 push r13 push r12 mov r12, rdi push rbp push rbx sub rsp, 40 mov r13, QWORD PTR [rsi+8] mov 
rbx, QWORD PTR [rsi] movups XMMWORD PTR [rdi], xmm0 mov QWORD PTR [rdi+16], 0 cmp rbx, r13 je .L1 xor r8d, r8d xor ecx, ecx jmp .L13 .L31: mov DWORD PTR [rcx], ebp add rbx, 8 add rcx, 4 mov QWORD PTR [r12+8], rcx cmp r13, rbx je .L1 .L13: cvttsd2si ebp, QWORD PTR [rbx] cmp rcx, r8 jne .L31 movabs rax, 2305843009213693951 mov r15, QWORD PTR [r12] sub rcx, r15 mov rdx, rcx mov r14, rcx sar rdx, 2 cmp rdx, rax je .L32 test rdx, rdx mov eax, 1 cmovne rax, rdx add rax, rdx jc .L7 test rax, rax jne .L33 xor r8d, r8d xor edi, edi .L9: lea rcx, [rdi+4+r14] movq xmm0, rdi mov DWORD PTR [rdi+r14], ebp movq xmm1, rcx punpcklqdq xmm0, xmm1 test r14, r14 jg .L34 test r15, r15 jne .L35 .L12: add rbx, 8 mov QWORD PTR [r12+16], r8 movups XMMWORD PTR [r12], xmm0 cmp r13, rbx jne .L13 .L1: add rsp, 40 mov rax, r12 pop rbx pop rbp pop r12 pop r13 pop r14 pop r15 ret .L34: mov rsi, r15 mov rdx, r14 mov QWORD PTR [rsp+8], r8 mov QWORD PTR [rsp], rcx movaps XMMWORD PTR [rsp+16], xmm0 call memmove mov rsi, QWORD PTR [r12+16] mov rcx, QWORD PTR [rsp] mov r8, QWORD PTR [rsp+8] movdqa xmm0, XMMWORD PTR [rsp+16] sub rsi, r15 .L11: mov rdi, r15 mov QWORD PTR [rsp+8], r8 mov QWORD PTR [rsp], rcx movaps XMMWORD PTR [rsp+16], xmm0 call operator delete(void*, unsigned long) movdqa xmm0, XMMWORD PTR [rsp+16] mov r8, QWORD PTR [rsp+8] mov rcx, QWORD PTR [rsp] jmp .L12 .L35: mov rsi, QWORD PTR [r12+16] sub rsi, r15 jmp .L11 .L33: movabs rdx, 2305843009213693951 cmp rax, rdx cmova rax, rdx sal rax, 2 mov QWORD PTR [rsp], rax mov rdi, rax .L8: call operator new(unsigned long) mov r8, QWORD PTR [rsp] mov rdi, rax add r8, rax jmp .L9 .L7: movabs rax, 9223372036854775804 mov QWORD PTR [rsp], rax mov rdi, rax jmp .L8 .L32: mov edi, OFFSET FLAT:.LC0 call std::__throw_length_error(char const*) mov rbp, rax jmp .L15 copy(std::vector > const&) [clone .cold]: ==================================================
========================= Build "transform" with -O2 on x86-64 (https://godbolt.org/z/YTEfWEbcq): .LC0: .string "vector::_M_realloc_insert" transform(std::vector > const&): push r12 mov r12, rdi push rbp push rbx sub rsp, 16 mov rbp, QWORD PTR [rsi+8] mov rbx, QWORD PTR [rsi] mov QWORD PTR [rdi], 0 mov QWORD PTR [rdi+8], 0 mov QWORD PTR [rdi+16], 0 cmp rbx, rbp je .L19 xor edx, edx xor esi, esi jmp .L23 .L31: mov DWORD PTR [rsi], eax add rbx, 8 add rsi, 4 mov QWORD PTR [r12+8], rsi cmp rbp, rbx je .L19 .L32: mov rsi, QWORD PTR [r12+8] mov rdx, QWORD PTR [r12+16] .L23: cvttsd2si eax, QWORD PTR [rbx] mov DWORD PTR [rsp+12], eax cmp rsi, rdx jne .L31 lea rdx, [rsp+12] mov rdi, r12 call void std::vector >::_M_realloc_insert(__gnu_cxx::__normal_iterator > >, int&&) add rbx, 8 cmp rbp, rbx jne .L32 .L19: add rsp, 16 mov rax, r12 pop rbx pop rbp pop r12 ret mov rbp, rax jmp .L24 transform(std::vector > const&) [clone .cold]: =========================================================================== and with -O3: .LC0: .string "vector::_M_realloc_insert" transform(std::vector > const&): push r15 pxor xmm0, xmm0 push r14 push r13 push r12 mov r12, rdi push rbp push rbx sub rsp, 40 mov r13, QWORD PTR [rsi+8] mov rbx, QWORD PTR [rsi] movups XMMWORD PTR [rdi], xmm0 mov QWORD PTR [rdi+16], 0 cmp rbx, r13 je .L1 xor r8d, r8d xor ecx, ecx jmp .L13 .L31: mov DWORD PTR [rcx], ebp add rbx, 8 add rcx, 4 mov QWORD PTR [r12+8], rcx cmp r13, rbx je .L1 .L13: cvttsd2si ebp, QWORD PTR [rbx] cmp rcx, r8 jne .L31 movabs rax, 2305843009213693951 mov r15, QWORD PTR [r12] sub rcx, r15 mov rdx, rcx mov r14, rcx sar rdx, 2 cmp rdx, rax je .L32 test rdx, rdx mov eax, 1 cmovne rax, rdx add rax, rdx jc .L7 test rax, rax jne .L33 xor r8d, r8d xor edi, edi .L9: lea rcx, [rdi+4+r14] movq xmm0, rdi 
mov DWORD PTR [rdi+r14], ebp movq xmm1, rcx punpcklqdq xmm0, xmm1 test r14, r14 jg .L34 test r15, r15 jne .L35 .L12: add rbx, 8 mov QWORD PTR [r12+16], r8 movups XMMWORD PTR [r12], xmm0 cmp r13, rbx jne .L13 .L1: add rsp, 40 mov rax, r12 pop rbx pop rbp pop r12 pop r13 pop r14 pop r15 ret .L34: mov rsi, r15 mov rdx, r14 mov QWORD PTR [rsp+8], r8 mov QWORD PTR [rsp], rcx movaps XMMWORD PTR [rsp+16], xmm0 call memmove mov rsi, QWORD PTR [r12+16] mov rcx, QWORD PTR [rsp] mov r8, QWORD PTR [rsp+8] movdqa xmm0, XMMWORD PTR [rsp+16] sub rsi, r15 .L11: mov rdi, r15 mov QWORD PTR [rsp+8], r8 mov QWORD PTR [rsp], rcx movaps XMMWORD PTR [rsp+16], xmm0 call operator delete(void*, unsigned long) movdqa xmm0, XMMWORD PTR [rsp+16] mov r8, QWORD PTR [rsp+8] mov rcx, QWORD PTR [rsp] jmp .L12 .L35: mov rsi, QWORD PTR [r12+16] sub rsi, r15 jmp .L11 .L33: movabs rdx, 2305843009213693951 cmp rax, rdx cmova rax, rdx sal rax, 2 mov QWORD PTR [rsp], rax mov rdi, rax .L8: call operator new(unsigned long) mov r8, QWORD PTR [rsp] mov rdi, rax add r8, rax jmp .L9 .L7: movabs rax, 9223372036854775804 mov QWORD PTR [rsp], rax mov rdi, rax jmp .L8 .L32: mov edi, OFFSET FLAT:.LC0 call std::__throw_length_error(char const*) mov rbp, rax jmp .L15 transform(std::vector > const&) [clone .cold]: