public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug c++/100104] New: std::transform is 1.5 times faster than std::copy with -O3
@ 2021-04-15 18:06 hewillk at gmail dot com
2021-04-15 19:30 ` [Bug middle-end/100104] " pinskia at gcc dot gnu.org
` (4 more replies)
0 siblings, 5 replies; 6+ messages in thread
From: hewillk at gmail dot com @ 2021-04-15 18:06 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100104
Bug ID: 100104
Summary: std::transform is 1.5 times faster than std::copy with
-O3
Product: gcc
Version: 11.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: c++
Assignee: unassigned at gcc dot gnu.org
Reporter: hewillk at gmail dot com
Target Milestone: ---
Consider:
std::vector<double> v1(100, 0);
// copy using copy
std::vector<int> v2;
std::copy(v1.begin(), v1.end(), std::back_inserter(v2));
// copy using transform
std::vector<int> v2;
std::transform(v1.begin(), v1.end(), std::back_inserter(v2), [](auto x) {
return x; });
Those two generate similar assembly code under -O2, but very different code
under -O3/-Ofast, where transform is 1.5 times faster than copy. I don’t
know if this is a bug, since the two represent the same thing; correct me
if I am wrong.
quick-bench with -O2:
https://quick-bench.com/q/uKT8QEmPkS1wr153s3P-DRt90eY
quick-bench with -O3:
https://quick-bench.com/q/syuBCQYVtCoVwT2MRtLT25P-MQI
godbolt:
https://godbolt.org/z/7ee77cs8W
^ permalink raw reply [flat|nested] 6+ messages in thread
* [Bug middle-end/100104] std::transform is 1.5 times faster than std::copy with -O3
2021-04-15 18:06 [Bug c++/100104] New: std::transform is 1.5 times faster than std::copy with -O3 hewillk at gmail dot com
@ 2021-04-15 19:30 ` pinskia at gcc dot gnu.org
2021-04-16 1:42 ` hewillk at gmail dot com
` (3 subsequent siblings)
4 siblings, 0 replies; 6+ messages in thread
From: pinskia at gcc dot gnu.org @ 2021-04-15 19:30 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100104
Andrew Pinski <pinskia at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Keywords| |missed-optimization
Component|c++ |middle-end
--- Comment #1 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
What target is this on?
^ permalink raw reply [flat|nested] 6+ messages in thread
* [Bug middle-end/100104] std::transform is 1.5 times faster than std::copy with -O3
2021-04-15 18:06 [Bug c++/100104] New: std::transform is 1.5 times faster than std::copy with -O3 hewillk at gmail dot com
2021-04-15 19:30 ` [Bug middle-end/100104] " pinskia at gcc dot gnu.org
@ 2021-04-16 1:42 ` hewillk at gmail dot com
2021-04-16 1:43 ` hewillk at gmail dot com
` (2 subsequent siblings)
4 siblings, 0 replies; 6+ messages in thread
From: hewillk at gmail dot com @ 2021-04-16 1:42 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100104
--- Comment #2 from 康桓瑋 <hewillk at gmail dot com> ---
Build "copy" with -O2 on x86-64 (https://godbolt.org/z/Gja6xrq9G):
.LC0:
.string "vector::_M_realloc_insert"
copy(std::vector<double, std::allocator<double> > const&):
push r15
pxor xmm0, xmm0
push r14
push r13
push r12
mov r12, rdi
push rbp
push rbx
sub rsp, 40
mov r13, QWORD PTR [rsi+8]
mov rbx, QWORD PTR [rsi]
movups XMMWORD PTR [rdi], xmm0
mov QWORD PTR [rdi+16], 0
cmp rbx, r13
je .L1
xor r8d, r8d
xor ecx, ecx
jmp .L13
.L31:
mov DWORD PTR [rcx], ebp
add rbx, 8
add rcx, 4
mov QWORD PTR [r12+8], rcx
cmp r13, rbx
je .L1
.L13:
cvttsd2si ebp, QWORD PTR [rbx]
cmp rcx, r8
jne .L31
movabs rax, 2305843009213693951
mov r15, QWORD PTR [r12]
sub rcx, r15
mov rdx, rcx
mov r14, rcx
sar rdx, 2
cmp rdx, rax
je .L32
test rdx, rdx
mov eax, 1
cmovne rax, rdx
add rax, rdx
jc .L7
test rax, rax
jne .L33
xor r8d, r8d
xor edi, edi
.L9:
lea rcx, [rdi+4+r14]
movq xmm0, rdi
mov DWORD PTR [rdi+r14], ebp
movq xmm1, rcx
punpcklqdq xmm0, xmm1
test r14, r14
jg .L34
test r15, r15
jne .L35
.L12:
add rbx, 8
mov QWORD PTR [r12+16], r8
movups XMMWORD PTR [r12], xmm0
cmp r13, rbx
jne .L13
.L1:
add rsp, 40
mov rax, r12
pop rbx
pop rbp
pop r12
pop r13
pop r14
pop r15
ret
.L34:
mov rsi, r15
mov rdx, r14
mov QWORD PTR [rsp+8], r8
mov QWORD PTR [rsp], rcx
movaps XMMWORD PTR [rsp+16], xmm0
call memmove
mov rsi, QWORD PTR [r12+16]
mov rcx, QWORD PTR [rsp]
mov r8, QWORD PTR [rsp+8]
movdqa xmm0, XMMWORD PTR [rsp+16]
sub rsi, r15
.L11:
mov rdi, r15
mov QWORD PTR [rsp+8], r8
mov QWORD PTR [rsp], rcx
movaps XMMWORD PTR [rsp+16], xmm0
call operator delete(void*, unsigned long)
movdqa xmm0, XMMWORD PTR [rsp+16]
mov r8, QWORD PTR [rsp+8]
mov rcx, QWORD PTR [rsp]
jmp .L12
.L35:
mov rsi, QWORD PTR [r12+16]
sub rsi, r15
jmp .L11
.L33:
movabs rdx, 2305843009213693951
cmp rax, rdx
cmova rax, rdx
sal rax, 2
mov QWORD PTR [rsp], rax
mov rdi, rax
.L8:
call operator new(unsigned long)
mov r8, QWORD PTR [rsp]
mov rdi, rax
add r8, rax
jmp .L9
.L7:
movabs rax, 9223372036854775804
mov QWORD PTR [rsp], rax
mov rdi, rax
jmp .L8
.L32:
mov edi, OFFSET FLAT:.LC0
call std::__throw_length_error(char const*)
mov rbp, rax
jmp .L15
copy(std::vector<double, std::allocator<double> > const&) [clone .cold]:
===========================================================================
with -O3:
.LC0:
.string "vector::_M_realloc_insert"
copy(std::vector<double, std::allocator<double> > const&):
push r15
pxor xmm0, xmm0
push r14
push r13
push r12
mov r12, rdi
push rbp
push rbx
sub rsp, 40
mov r13, QWORD PTR [rsi+8]
mov rbx, QWORD PTR [rsi]
movups XMMWORD PTR [rdi], xmm0
mov QWORD PTR [rdi+16], 0
cmp rbx, r13
je .L1
xor r8d, r8d
xor ecx, ecx
jmp .L13
.L31:
mov DWORD PTR [rcx], ebp
add rbx, 8
add rcx, 4
mov QWORD PTR [r12+8], rcx
cmp r13, rbx
je .L1
.L13:
cvttsd2si ebp, QWORD PTR [rbx]
cmp rcx, r8
jne .L31
movabs rax, 2305843009213693951
mov r15, QWORD PTR [r12]
sub rcx, r15
mov rdx, rcx
mov r14, rcx
sar rdx, 2
cmp rdx, rax
je .L32
test rdx, rdx
mov eax, 1
cmovne rax, rdx
add rax, rdx
jc .L7
test rax, rax
jne .L33
xor r8d, r8d
xor edi, edi
.L9:
lea rcx, [rdi+4+r14]
movq xmm0, rdi
mov DWORD PTR [rdi+r14], ebp
movq xmm1, rcx
punpcklqdq xmm0, xmm1
test r14, r14
jg .L34
test r15, r15
jne .L35
.L12:
add rbx, 8
mov QWORD PTR [r12+16], r8
movups XMMWORD PTR [r12], xmm0
cmp r13, rbx
jne .L13
.L1:
add rsp, 40
mov rax, r12
pop rbx
pop rbp
pop r12
pop r13
pop r14
pop r15
ret
.L34:
mov rsi, r15
mov rdx, r14
mov QWORD PTR [rsp+8], r8
mov QWORD PTR [rsp], rcx
movaps XMMWORD PTR [rsp+16], xmm0
call memmove
mov rsi, QWORD PTR [r12+16]
mov rcx, QWORD PTR [rsp]
mov r8, QWORD PTR [rsp+8]
movdqa xmm0, XMMWORD PTR [rsp+16]
sub rsi, r15
.L11:
mov rdi, r15
mov QWORD PTR [rsp+8], r8
mov QWORD PTR [rsp], rcx
movaps XMMWORD PTR [rsp+16], xmm0
call operator delete(void*, unsigned long)
movdqa xmm0, XMMWORD PTR [rsp+16]
mov r8, QWORD PTR [rsp+8]
mov rcx, QWORD PTR [rsp]
jmp .L12
.L35:
mov rsi, QWORD PTR [r12+16]
sub rsi, r15
jmp .L11
.L33:
movabs rdx, 2305843009213693951
cmp rax, rdx
cmova rax, rdx
sal rax, 2
mov QWORD PTR [rsp], rax
mov rdi, rax
.L8:
call operator new(unsigned long)
mov r8, QWORD PTR [rsp]
mov rdi, rax
add r8, rax
jmp .L9
.L7:
movabs rax, 9223372036854775804
mov QWORD PTR [rsp], rax
mov rdi, rax
jmp .L8
.L32:
mov edi, OFFSET FLAT:.LC0
call std::__throw_length_error(char const*)
mov rbp, rax
jmp .L15
copy(std::vector<double, std::allocator<double> > const&) [clone .cold]:
===========================================================================
Build "transform" with -O2 on x86-64 (https://godbolt.org/z/YTEfWEbcq):
.LC0:
.string "vector::_M_realloc_insert"
transform(std::vector<double, std::allocator<double> > const&):
push r12
mov r12, rdi
push rbp
push rbx
sub rsp, 16
mov rbp, QWORD PTR [rsi+8]
mov rbx, QWORD PTR [rsi]
mov QWORD PTR [rdi], 0
mov QWORD PTR [rdi+8], 0
mov QWORD PTR [rdi+16], 0
cmp rbx, rbp
je .L19
xor edx, edx
xor esi, esi
jmp .L23
.L31:
mov DWORD PTR [rsi], eax
add rbx, 8
add rsi, 4
mov QWORD PTR [r12+8], rsi
cmp rbp, rbx
je .L19
.L32:
mov rsi, QWORD PTR [r12+8]
mov rdx, QWORD PTR [r12+16]
.L23:
cvttsd2si eax, QWORD PTR [rbx]
mov DWORD PTR [rsp+12], eax
cmp rsi, rdx
jne .L31
lea rdx, [rsp+12]
mov rdi, r12
call void std::vector<int, std::allocator<int>
>::_M_realloc_insert<int>(__gnu_cxx::__normal_iterator<int*, std::vector<int,
std::allocator<int> > >, int&&)
add rbx, 8
cmp rbp, rbx
jne .L32
.L19:
add rsp, 16
mov rax, r12
pop rbx
pop rbp
pop r12
ret
mov rbp, rax
jmp .L24
transform(std::vector<double, std::allocator<double> > const&) [clone .cold]:
===========================================================================
and with -O3:
.LC0:
.string "vector::_M_realloc_insert"
transform(std::vector<double, std::allocator<double> > const&):
push r15
pxor xmm0, xmm0
push r14
push r13
push r12
mov r12, rdi
push rbp
push rbx
sub rsp, 40
mov r13, QWORD PTR [rsi+8]
mov rbx, QWORD PTR [rsi]
movups XMMWORD PTR [rdi], xmm0
mov QWORD PTR [rdi+16], 0
cmp rbx, r13
je .L1
xor r8d, r8d
xor ecx, ecx
jmp .L13
.L31:
mov DWORD PTR [rcx], ebp
add rbx, 8
add rcx, 4
mov QWORD PTR [r12+8], rcx
cmp r13, rbx
je .L1
.L13:
cvttsd2si ebp, QWORD PTR [rbx]
cmp rcx, r8
jne .L31
movabs rax, 2305843009213693951
mov r15, QWORD PTR [r12]
sub rcx, r15
mov rdx, rcx
mov r14, rcx
sar rdx, 2
cmp rdx, rax
je .L32
test rdx, rdx
mov eax, 1
cmovne rax, rdx
add rax, rdx
jc .L7
test rax, rax
jne .L33
xor r8d, r8d
xor edi, edi
.L9:
lea rcx, [rdi+4+r14]
movq xmm0, rdi
mov DWORD PTR [rdi+r14], ebp
movq xmm1, rcx
punpcklqdq xmm0, xmm1
test r14, r14
jg .L34
test r15, r15
jne .L35
.L12:
add rbx, 8
mov QWORD PTR [r12+16], r8
movups XMMWORD PTR [r12], xmm0
cmp r13, rbx
jne .L13
.L1:
add rsp, 40
mov rax, r12
pop rbx
pop rbp
pop r12
pop r13
pop r14
pop r15
ret
.L34:
mov rsi, r15
mov rdx, r14
mov QWORD PTR [rsp+8], r8
mov QWORD PTR [rsp], rcx
movaps XMMWORD PTR [rsp+16], xmm0
call memmove
mov rsi, QWORD PTR [r12+16]
mov rcx, QWORD PTR [rsp]
mov r8, QWORD PTR [rsp+8]
movdqa xmm0, XMMWORD PTR [rsp+16]
sub rsi, r15
.L11:
mov rdi, r15
mov QWORD PTR [rsp+8], r8
mov QWORD PTR [rsp], rcx
movaps XMMWORD PTR [rsp+16], xmm0
call operator delete(void*, unsigned long)
movdqa xmm0, XMMWORD PTR [rsp+16]
mov r8, QWORD PTR [rsp+8]
mov rcx, QWORD PTR [rsp]
jmp .L12
.L35:
mov rsi, QWORD PTR [r12+16]
sub rsi, r15
jmp .L11
.L33:
movabs rdx, 2305843009213693951
cmp rax, rdx
cmova rax, rdx
sal rax, 2
mov QWORD PTR [rsp], rax
mov rdi, rax
.L8:
call operator new(unsigned long)
mov r8, QWORD PTR [rsp]
mov rdi, rax
add r8, rax
jmp .L9
.L7:
movabs rax, 9223372036854775804
mov QWORD PTR [rsp], rax
mov rdi, rax
jmp .L8
.L32:
mov edi, OFFSET FLAT:.LC0
call std::__throw_length_error(char const*)
mov rbp, rax
jmp .L15
transform(std::vector<double, std::allocator<double> > const&) [clone .cold]:
^ permalink raw reply [flat|nested] 6+ messages in thread
* [Bug middle-end/100104] std::transform is 1.5 times faster than std::copy with -O3
2021-04-15 18:06 [Bug c++/100104] New: std::transform is 1.5 times faster than std::copy with -O3 hewillk at gmail dot com
2021-04-15 19:30 ` [Bug middle-end/100104] " pinskia at gcc dot gnu.org
2021-04-16 1:42 ` hewillk at gmail dot com
@ 2021-04-16 1:43 ` hewillk at gmail dot com
2021-04-16 1:51 ` hewillk at gmail dot com
2021-04-18 17:30 ` hewillk at gmail dot com
4 siblings, 0 replies; 6+ messages in thread
From: hewillk at gmail dot com @ 2021-04-16 1:43 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100104
--- Comment #3 from 康桓瑋 <hewillk at gmail dot com> ---
Build "copy" with -O2 on x86-64 (https://godbolt.org/z/Gja6xrq9G):
.LC0:
.string "vector::_M_realloc_insert"
copy(std::vector<double, std::allocator<double> > const&):
push r15
pxor xmm0, xmm0
push r14
push r13
push r12
mov r12, rdi
push rbp
push rbx
sub rsp, 40
mov r13, QWORD PTR [rsi+8]
mov rbx, QWORD PTR [rsi]
movups XMMWORD PTR [rdi], xmm0
mov QWORD PTR [rdi+16], 0
cmp rbx, r13
je .L1
xor r8d, r8d
xor ecx, ecx
jmp .L13
.L31:
mov DWORD PTR [rcx], ebp
add rbx, 8
add rcx, 4
mov QWORD PTR [r12+8], rcx
cmp r13, rbx
je .L1
.L13:
cvttsd2si ebp, QWORD PTR [rbx]
cmp rcx, r8
jne .L31
movabs rax, 2305843009213693951
mov r15, QWORD PTR [r12]
sub rcx, r15
mov rdx, rcx
mov r14, rcx
sar rdx, 2
cmp rdx, rax
je .L32
test rdx, rdx
mov eax, 1
cmovne rax, rdx
add rax, rdx
jc .L7
test rax, rax
jne .L33
xor r8d, r8d
xor edi, edi
.L9:
lea rcx, [rdi+4+r14]
movq xmm0, rdi
mov DWORD PTR [rdi+r14], ebp
movq xmm1, rcx
punpcklqdq xmm0, xmm1
test r14, r14
jg .L34
test r15, r15
jne .L35
.L12:
add rbx, 8
mov QWORD PTR [r12+16], r8
movups XMMWORD PTR [r12], xmm0
cmp r13, rbx
jne .L13
.L1:
add rsp, 40
mov rax, r12
pop rbx
pop rbp
pop r12
pop r13
pop r14
pop r15
ret
.L34:
mov rsi, r15
mov rdx, r14
mov QWORD PTR [rsp+8], r8
mov QWORD PTR [rsp], rcx
movaps XMMWORD PTR [rsp+16], xmm0
call memmove
mov rsi, QWORD PTR [r12+16]
mov rcx, QWORD PTR [rsp]
mov r8, QWORD PTR [rsp+8]
movdqa xmm0, XMMWORD PTR [rsp+16]
sub rsi, r15
.L11:
mov rdi, r15
mov QWORD PTR [rsp+8], r8
mov QWORD PTR [rsp], rcx
movaps XMMWORD PTR [rsp+16], xmm0
call operator delete(void*, unsigned long)
movdqa xmm0, XMMWORD PTR [rsp+16]
mov r8, QWORD PTR [rsp+8]
mov rcx, QWORD PTR [rsp]
jmp .L12
.L35:
mov rsi, QWORD PTR [r12+16]
sub rsi, r15
jmp .L11
.L33:
movabs rdx, 2305843009213693951
cmp rax, rdx
cmova rax, rdx
sal rax, 2
mov QWORD PTR [rsp], rax
mov rdi, rax
.L8:
call operator new(unsigned long)
mov r8, QWORD PTR [rsp]
mov rdi, rax
add r8, rax
jmp .L9
.L7:
movabs rax, 9223372036854775804
mov QWORD PTR [rsp], rax
mov rdi, rax
jmp .L8
.L32:
mov edi, OFFSET FLAT:.LC0
call std::__throw_length_error(char const*)
mov rbp, rax
jmp .L15
copy(std::vector<double, std::allocator<double> > const&) [clone .cold]:
===========================================================================
with -O3:
.LC0:
.string "vector::_M_realloc_insert"
copy(std::vector<double, std::allocator<double> > const&):
push r15
pxor xmm0, xmm0
push r14
push r13
push r12
mov r12, rdi
push rbp
push rbx
sub rsp, 40
mov r13, QWORD PTR [rsi+8]
mov rbx, QWORD PTR [rsi]
movups XMMWORD PTR [rdi], xmm0
mov QWORD PTR [rdi+16], 0
cmp rbx, r13
je .L1
xor r8d, r8d
xor ecx, ecx
jmp .L13
.L31:
mov DWORD PTR [rcx], ebp
add rbx, 8
add rcx, 4
mov QWORD PTR [r12+8], rcx
cmp r13, rbx
je .L1
.L13:
cvttsd2si ebp, QWORD PTR [rbx]
cmp rcx, r8
jne .L31
movabs rax, 2305843009213693951
mov r15, QWORD PTR [r12]
sub rcx, r15
mov rdx, rcx
mov r14, rcx
sar rdx, 2
cmp rdx, rax
je .L32
test rdx, rdx
mov eax, 1
cmovne rax, rdx
add rax, rdx
jc .L7
test rax, rax
jne .L33
xor r8d, r8d
xor edi, edi
.L9:
lea rcx, [rdi+4+r14]
movq xmm0, rdi
mov DWORD PTR [rdi+r14], ebp
movq xmm1, rcx
punpcklqdq xmm0, xmm1
test r14, r14
jg .L34
test r15, r15
jne .L35
.L12:
add rbx, 8
mov QWORD PTR [r12+16], r8
movups XMMWORD PTR [r12], xmm0
cmp r13, rbx
jne .L13
.L1:
add rsp, 40
mov rax, r12
pop rbx
pop rbp
pop r12
pop r13
pop r14
pop r15
ret
.L34:
mov rsi, r15
mov rdx, r14
mov QWORD PTR [rsp+8], r8
mov QWORD PTR [rsp], rcx
movaps XMMWORD PTR [rsp+16], xmm0
call memmove
mov rsi, QWORD PTR [r12+16]
mov rcx, QWORD PTR [rsp]
mov r8, QWORD PTR [rsp+8]
movdqa xmm0, XMMWORD PTR [rsp+16]
sub rsi, r15
.L11:
mov rdi, r15
mov QWORD PTR [rsp+8], r8
mov QWORD PTR [rsp], rcx
movaps XMMWORD PTR [rsp+16], xmm0
call operator delete(void*, unsigned long)
movdqa xmm0, XMMWORD PTR [rsp+16]
mov r8, QWORD PTR [rsp+8]
mov rcx, QWORD PTR [rsp]
jmp .L12
.L35:
mov rsi, QWORD PTR [r12+16]
sub rsi, r15
jmp .L11
.L33:
movabs rdx, 2305843009213693951
cmp rax, rdx
cmova rax, rdx
sal rax, 2
mov QWORD PTR [rsp], rax
mov rdi, rax
.L8:
call operator new(unsigned long)
mov r8, QWORD PTR [rsp]
mov rdi, rax
add r8, rax
jmp .L9
.L7:
movabs rax, 9223372036854775804
mov QWORD PTR [rsp], rax
mov rdi, rax
jmp .L8
.L32:
mov edi, OFFSET FLAT:.LC0
call std::__throw_length_error(char const*)
mov rbp, rax
jmp .L15
copy(std::vector<double, std::allocator<double> > const&) [clone .cold]:
===========================================================================
Build "transform" with -O2 on x86-64 (https://godbolt.org/z/YTEfWEbcq):
.LC0:
.string "vector::_M_realloc_insert"
transform(std::vector<double, std::allocator<double> > const&):
push r12
mov r12, rdi
push rbp
push rbx
sub rsp, 16
mov rbp, QWORD PTR [rsi+8]
mov rbx, QWORD PTR [rsi]
mov QWORD PTR [rdi], 0
mov QWORD PTR [rdi+8], 0
mov QWORD PTR [rdi+16], 0
cmp rbx, rbp
je .L19
xor edx, edx
xor esi, esi
jmp .L23
.L31:
mov DWORD PTR [rsi], eax
add rbx, 8
add rsi, 4
mov QWORD PTR [r12+8], rsi
cmp rbp, rbx
je .L19
.L32:
mov rsi, QWORD PTR [r12+8]
mov rdx, QWORD PTR [r12+16]
.L23:
cvttsd2si eax, QWORD PTR [rbx]
mov DWORD PTR [rsp+12], eax
cmp rsi, rdx
jne .L31
lea rdx, [rsp+12]
mov rdi, r12
call void std::vector<int, std::allocator<int>
>::_M_realloc_insert<int>(__gnu_cxx::__normal_iterator<int*, std::vector<int,
std::allocator<int> > >, int&&)
add rbx, 8
cmp rbp, rbx
jne .L32
.L19:
add rsp, 16
mov rax, r12
pop rbx
pop rbp
pop r12
ret
mov rbp, rax
jmp .L24
transform(std::vector<double, std::allocator<double> > const&) [clone .cold]:
===========================================================================
and with -O3:
.LC0:
.string "vector::_M_realloc_insert"
transform(std::vector<double, std::allocator<double> > const&):
push r15
pxor xmm0, xmm0
push r14
push r13
push r12
mov r12, rdi
push rbp
push rbx
sub rsp, 40
mov r13, QWORD PTR [rsi+8]
mov rbx, QWORD PTR [rsi]
movups XMMWORD PTR [rdi], xmm0
mov QWORD PTR [rdi+16], 0
cmp rbx, r13
je .L1
xor r8d, r8d
xor ecx, ecx
jmp .L13
.L31:
mov DWORD PTR [rcx], ebp
add rbx, 8
add rcx, 4
mov QWORD PTR [r12+8], rcx
cmp r13, rbx
je .L1
.L13:
cvttsd2si ebp, QWORD PTR [rbx]
cmp rcx, r8
jne .L31
movabs rax, 2305843009213693951
mov r15, QWORD PTR [r12]
sub rcx, r15
mov rdx, rcx
mov r14, rcx
sar rdx, 2
cmp rdx, rax
je .L32
test rdx, rdx
mov eax, 1
cmovne rax, rdx
add rax, rdx
jc .L7
test rax, rax
jne .L33
xor r8d, r8d
xor edi, edi
.L9:
lea rcx, [rdi+4+r14]
movq xmm0, rdi
mov DWORD PTR [rdi+r14], ebp
movq xmm1, rcx
punpcklqdq xmm0, xmm1
test r14, r14
jg .L34
test r15, r15
jne .L35
.L12:
add rbx, 8
mov QWORD PTR [r12+16], r8
movups XMMWORD PTR [r12], xmm0
cmp r13, rbx
jne .L13
.L1:
add rsp, 40
mov rax, r12
pop rbx
pop rbp
pop r12
pop r13
pop r14
pop r15
ret
.L34:
mov rsi, r15
mov rdx, r14
mov QWORD PTR [rsp+8], r8
mov QWORD PTR [rsp], rcx
movaps XMMWORD PTR [rsp+16], xmm0
call memmove
mov rsi, QWORD PTR [r12+16]
mov rcx, QWORD PTR [rsp]
mov r8, QWORD PTR [rsp+8]
movdqa xmm0, XMMWORD PTR [rsp+16]
sub rsi, r15
.L11:
mov rdi, r15
mov QWORD PTR [rsp+8], r8
mov QWORD PTR [rsp], rcx
movaps XMMWORD PTR [rsp+16], xmm0
call operator delete(void*, unsigned long)
movdqa xmm0, XMMWORD PTR [rsp+16]
mov r8, QWORD PTR [rsp+8]
mov rcx, QWORD PTR [rsp]
jmp .L12
.L35:
mov rsi, QWORD PTR [r12+16]
sub rsi, r15
jmp .L11
.L33:
movabs rdx, 2305843009213693951
cmp rax, rdx
cmova rax, rdx
sal rax, 2
mov QWORD PTR [rsp], rax
mov rdi, rax
.L8:
call operator new(unsigned long)
mov r8, QWORD PTR [rsp]
mov rdi, rax
add r8, rax
jmp .L9
.L7:
movabs rax, 9223372036854775804
mov QWORD PTR [rsp], rax
mov rdi, rax
jmp .L8
.L32:
mov edi, OFFSET FLAT:.LC0
call std::__throw_length_error(char const*)
mov rbp, rax
jmp .L15
transform(std::vector<double, std::allocator<double> > const&) [clone .cold]:
^ permalink raw reply [flat|nested] 6+ messages in thread
* [Bug middle-end/100104] std::transform is 1.5 times faster than std::copy with -O3
2021-04-15 18:06 [Bug c++/100104] New: std::transform is 1.5 times faster than std::copy with -O3 hewillk at gmail dot com
` (2 preceding siblings ...)
2021-04-16 1:43 ` hewillk at gmail dot com
@ 2021-04-16 1:51 ` hewillk at gmail dot com
2021-04-18 17:30 ` hewillk at gmail dot com
4 siblings, 0 replies; 6+ messages in thread
From: hewillk at gmail dot com @ 2021-04-16 1:51 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100104
--- Comment #4 from 康桓瑋 <hewillk at gmail dot com> ---
And building "copy" with -O2 on ARM64 produces code identical to that with -O3
(https://godbolt.org/z/5hjKGbrTd):
.LC0:
.string "vector::_M_realloc_insert"
transform(std::vector<double, std::allocator<double> > const&):
stp x29, x30, [sp, -64]!
mov x29, sp
stp x19, x20, [sp, 16]
mov x19, x8
ldp x20, x0, [x0]
stp xzr, xzr, [x8]
str xzr, [x8, 16]
sub x0, x0, x20
cmp x0, 0
ble .L19
mov x1, 0
str x21, [sp, 32]
asr x21, x0, 3
mov x0, 0
b .L23
.L35:
str w2, [x0], 4
add x20, x20, 8
subs x21, x21, #1
str x0, [x19, 8]
beq .L34
.L36:
ldp x0, x1, [x19, 8]
.L23:
ldr d0, [x20]
fcvtzs w2, d0
str w2, [sp, 60]
cmp x1, x0
bne .L35
add x2, sp, 60
mov x0, x19
bl void std::vector<int, std::allocator<int>
>::_M_realloc_insert<int>(__gnu_cxx::__normal_iterator<int*, std::vector<int,
std::allocator<int> > >, int&&)
add x20, x20, 8
subs x21, x21, #1
bne .L36
.L34:
ldr x21, [sp, 32]
.L19:
mov x0, x19
ldp x19, x20, [sp, 16]
ldp x29, x30, [sp], 64
ret
ldr x2, [x19]
ldr x1, [x19, 16]
mov x19, x0
sub x1, x1, x2
cbz x2, .L25
mov x0, x2
bl operator delete(void*, unsigned long)
.L25:
mov x0, x19
bl _Unwind_Resume
DW.ref.__gxx_personality_v0:
.xword __gxx_personality_v0
===========================================================================
However, "transform" has been further optimized under -O3
(https://godbolt.org/z/5hjKGbrTd):
.LC0:
.string "vector::_M_realloc_insert"
transform(std::vector<double, std::allocator<double> > const&):
stp x29, x30, [sp, -96]!
mov x29, sp
stp x19, x20, [sp, 16]
stp x21, x22, [sp, 32]
mov x21, x8
ldp x20, x22, [x0]
stp xzr, xzr, [x8]
str xzr, [x8, 16]
cmp x22, x20
beq .L1
mov x19, 0
stp x23, x24, [sp, 48]
stp x25, x26, [sp, 64]
mov x25, 2305843009213693951
stp x27, x28, [sp, 80]
mov x28, 0
b .L13
.L32:
str w27, [x19], 4
add x20, x20, 8
str x19, [x21, 8]
cmp x22, x20
beq .L31
.L13:
ldr d0, [x20]
fcvtzs w27, d0
cmp x19, x28
bne .L32
ldr x24, [x21]
sub x23, x19, x24
asr x1, x23, 2
cmp x1, x25
beq .L33
cmp x1, 0
csinc x0, x1, xzr, ne
adds x0, x0, x1
bcs .L7
cbnz x0, .L34
mov x28, 0
mov x26, 0
.L9:
add x19, x23, 4
str w27, [x26, x23]
add x19, x26, x19
cmp x23, 0
bgt .L35
cbnz x24, .L36
.L12:
add x20, x20, 8
stp x26, x19, [x21]
str x28, [x21, 16]
cmp x22, x20
bne .L13
.L31:
ldp x23, x24, [sp, 48]
ldp x25, x26, [sp, 64]
ldp x27, x28, [sp, 80]
.L1:
mov x0, x21
ldp x19, x20, [sp, 16]
ldp x21, x22, [sp, 32]
ldp x29, x30, [sp], 96
ret
.L35:
mov x1, x24
mov x2, x23
mov x0, x26
bl memmove
ldr x1, [x21, 16]
sub x1, x1, x24
.L11:
mov x0, x24
bl operator delete(void*, unsigned long)
b .L12
.L36:
ldr x1, [x21, 16]
sub x1, x1, x24
b .L11
.L34:
cmp x0, x25
csel x0, x0, x25, ls
lsl x28, x0, 2
.L8:
mov x0, x28
bl operator new(unsigned long)
mov x26, x0
add x28, x0, x28
b .L9
.L7:
mov x28, 9223372036854775804
b .L8
.L33:
adrp x0, .LC0
add x0, x0, :lo12:.LC0
bl std::__throw_length_error(char const*)
ldr x2, [x21]
mov x19, x0
ldr x1, [x21, 16]
sub x1, x1, x2
cbz x2, .L16
mov x0, x2
bl operator delete(void*, unsigned long)
.L16:
mov x0, x19
bl _Unwind_Resume
DW.ref.__gxx_personality_v0:
.xword __gxx_personality_v0
^ permalink raw reply [flat|nested] 6+ messages in thread
* [Bug middle-end/100104] std::transform is 1.5 times faster than std::copy with -O3
2021-04-15 18:06 [Bug c++/100104] New: std::transform is 1.5 times faster than std::copy with -O3 hewillk at gmail dot com
` (3 preceding siblings ...)
2021-04-16 1:51 ` hewillk at gmail dot com
@ 2021-04-18 17:30 ` hewillk at gmail dot com
4 siblings, 0 replies; 6+ messages in thread
From: hewillk at gmail dot com @ 2021-04-18 17:30 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100104
康桓瑋 <hewillk at gmail dot com> changed:
What |Removed |Added
----------------------------------------------------------------------------
Resolution|--- |INVALID
Status|UNCONFIRMED |RESOLVED
--- Comment #5 from 康桓瑋 <hewillk at gmail dot com> ---
After actually executing the same code on my local and remote servers, I could
not reproduce this result, so I think it is only an issue with the benchmark
environment. Thank you for your time.
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2021-04-18 17:30 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-04-15 18:06 [Bug c++/100104] New: std::transform is 1.5 times faster than std::copy with -O3 hewillk at gmail dot com
2021-04-15 19:30 ` [Bug middle-end/100104] " pinskia at gcc dot gnu.org
2021-04-16 1:42 ` hewillk at gmail dot com
2021-04-16 1:43 ` hewillk at gmail dot com
2021-04-16 1:51 ` hewillk at gmail dot com
2021-04-18 17:30 ` hewillk at gmail dot com
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).