public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug tree-optimization/109441] New: missed optimization when all elements of vector are known
@ 2023-04-06 18:31 hiraditya at msn dot com
2023-04-06 18:36 ` [Bug tree-optimization/109441] " pinskia at gcc dot gnu.org
` (4 more replies)
0 siblings, 5 replies; 6+ messages in thread
From: hiraditya at msn dot com @ 2023-04-06 18:31 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109441
Bug ID: 109441
Summary: missed optimization when all elements of vector are
known
Product: gcc
Version: unknown
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: hiraditya at msn dot com
Target Milestone: ---
Reference: https://godbolt.org/z/af4x6zhz9
When all elements of vector are 0, then the compiler should be able to remove
the loop and just return 0.
Testcase:
#include<vector>
using namespace std;
using T = int;
T v() {
T s;
std::vector<T> v;
v.resize(1000, 0);
for (auto i = 0; i < v.size(); ++i) {
s += v[i];
}
return s;
}
$ g++ -O3 -std=c++17
.LC0:
.string "vector::_M_fill_insert"
v():
push rbx
pxor xmm0, xmm0
mov edx, 1000
xor esi, esi
sub rsp, 48
lea rcx, [rsp+12]
lea rdi, [rsp+16]
mov QWORD PTR [rsp+32], 0
mov DWORD PTR [rsp+12], 0
movaps XMMWORD PTR [rsp+16], xmm0
call std::vector<int, std::allocator<int>
>::_M_fill_insert(__gnu_cxx::__normal_iterator<int*, std::vector<int,
std::allocator<int> > >, unsigned long, int const&)
mov rdx, QWORD PTR [rsp+24]
mov rdi, QWORD PTR [rsp+16]
mov rax, rdx
sub rax, rdi
mov rsi, rax
sar rsi, 2
cmp rdx, rdi
je .L99
test rax, rax
mov ecx, 1
cmovne rcx, rsi
cmp rax, 12
jbe .L107
mov rdx, rcx
pxor xmm0, xmm0
mov rax, rdi
shr rdx, 2
sal rdx, 4
add rdx, rdi
.L101:
movdqu xmm2, XMMWORD PTR [rax]
add rax, 16
paddd xmm0, xmm2
cmp rdx, rax
jne .L101
movdqa xmm1, xmm0
psrldq xmm1, 8
paddd xmm0, xmm1
movdqa xmm1, xmm0
psrldq xmm1, 4
paddd xmm0, xmm1
movd ebx, xmm0
test cl, 3
je .L99
and rcx, -4
mov eax, ecx
.L100:
lea edx, [rax+1]
add ebx, DWORD PTR [rdi+rcx*4]
movsx rdx, edx
cmp rdx, rsi
jnb .L99
add eax, 2
lea rcx, [0+rdx*4]
add ebx, DWORD PTR [rdi+rdx*4]
cdqe
cmp rax, rsi
jnb .L99
add ebx, DWORD PTR [rdi+4+rcx]
.L99:
test rdi, rdi
je .L98
mov rsi, QWORD PTR [rsp+32]
sub rsi, rdi
call operator delete(void*, unsigned long)
.L98:
add rsp, 48
mov eax, ebx
pop rbx
ret
.L107:
xor eax, eax
xor ecx, ecx
jmp .L100
mov rbx, rax
jmp .L105
v() [clone .cold]:
^ permalink raw reply [flat|nested] 6+ messages in thread
* [Bug tree-optimization/109441] missed optimization when all elements of vector are known
2023-04-06 18:31 [Bug tree-optimization/109441] New: missed optimization when all elements of vector are known hiraditya at msn dot com
@ 2023-04-06 18:36 ` pinskia at gcc dot gnu.org
2023-04-06 18:40 ` hiraditya at msn dot com
` (3 subsequent siblings)
4 siblings, 0 replies; 6+ messages in thread
From: pinskia at gcc dot gnu.org @ 2023-04-06 18:36 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109441
Andrew Pinski <pinskia at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Keywords| |missed-optimization
Severity|normal |enhancement
^ permalink raw reply [flat|nested] 6+ messages in thread
* [Bug tree-optimization/109441] missed optimization when all elements of vector are known
2023-04-06 18:31 [Bug tree-optimization/109441] New: missed optimization when all elements of vector are known hiraditya at msn dot com
2023-04-06 18:36 ` [Bug tree-optimization/109441] " pinskia at gcc dot gnu.org
@ 2023-04-06 18:40 ` hiraditya at msn dot com
2023-04-11 13:17 ` rguenth at gcc dot gnu.org
` (2 subsequent siblings)
4 siblings, 0 replies; 6+ messages in thread
From: hiraditya at msn dot com @ 2023-04-06 18:40 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109441
--- Comment #1 from AK <hiraditya at msn dot com> ---
I guess a better test case is this:
#include<vector>
using namespace std;
using T = int;
T v(std::vector<T> v) {
T s;
std::fill(v.begin(), v.end(), T());
for (auto i = 0; i < v.size(); ++i) {
s += v[i];
}
return s;
}
which has similar effect.
$ g++ -O3 -std=c++17
v(std::vector<int, std::allocator<int> >):
push rbp
push rbx
sub rsp, 8
mov rbp, QWORD PTR [rdi+8]
mov rcx, QWORD PTR [rdi]
cmp rcx, rbp
je .L7
sub rbp, rcx
mov rdi, rcx
xor esi, esi
mov rbx, rcx
mov rdx, rbp
call memset
mov rdi, rbp
mov edx, 1
mov rcx, rbx
sar rdi, 2
test rbp, rbp
cmovne rdx, rdi
cmp rbp, 12
jbe .L8
mov rax, rdx
pxor xmm0, xmm0
shr rax, 2
sal rax, 4
add rax, rbx
.L4:
movdqu xmm2, XMMWORD PTR [rbx]
add rbx, 16
paddd xmm0, xmm2
cmp rbx, rax
jne .L4
movdqa xmm1, xmm0
psrldq xmm1, 8
paddd xmm0, xmm1
movdqa xmm1, xmm0
psrldq xmm1, 4
paddd xmm0, xmm1
movd eax, xmm0
test dl, 3
je .L1
and rdx, -4
mov esi, edx
.L3:
add eax, DWORD PTR [rcx+rdx*4]
lea edx, [rsi+1]
movsx rdx, edx
cmp rdx, rdi
jnb .L1
add esi, 2
lea r8, [0+rdx*4]
add eax, DWORD PTR [rcx+rdx*4]
movsx rsi, esi
cmp rsi, rdi
jnb .L1
add eax, DWORD PTR [rcx+4+r8]
.L1:
add rsp, 8
pop rbx
pop rbp
ret
.L7:
add rsp, 8
xor eax, eax
pop rbx
pop rbp
ret
.L8:
xor eax, eax
xor esi, esi
xor edx, edx
jmp .L3
^ permalink raw reply [flat|nested] 6+ messages in thread
* [Bug tree-optimization/109441] missed optimization when all elements of vector are known
2023-04-06 18:31 [Bug tree-optimization/109441] New: missed optimization when all elements of vector are known hiraditya at msn dot com
2023-04-06 18:36 ` [Bug tree-optimization/109441] " pinskia at gcc dot gnu.org
2023-04-06 18:40 ` hiraditya at msn dot com
@ 2023-04-11 13:17 ` rguenth at gcc dot gnu.org
2023-05-17 17:23 ` hiraditya at msn dot com
2023-05-18 5:34 ` rguenth at gcc dot gnu.org
4 siblings, 0 replies; 6+ messages in thread
From: rguenth at gcc dot gnu.org @ 2023-04-11 13:17 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109441
Richard Biener <rguenth at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Last reconfirmed| |2023-04-11
Status|UNCONFIRMED |NEW
Ever confirmed|0 |1
Version|unknown |13.0
CC| |rguenth at gcc dot gnu.org
--- Comment #2 from Richard Biener <rguenth at gcc dot gnu.org> ---
Well, it doesn't need std::vector for this, a C testcase with a large enough
array (> 16 elements, the unroll limit) suffers from the same "problem".
But IMHO it's academic, right?
GCC only transforms loopy code like you suggest by complete peeling of the
loop, it doesn't attempt to "simulate" IL (and if, then it would only up
to some complexity limit).
^ permalink raw reply [flat|nested] 6+ messages in thread
* [Bug tree-optimization/109441] missed optimization when all elements of vector are known
2023-04-06 18:31 [Bug tree-optimization/109441] New: missed optimization when all elements of vector are known hiraditya at msn dot com
` (2 preceding siblings ...)
2023-04-11 13:17 ` rguenth at gcc dot gnu.org
@ 2023-05-17 17:23 ` hiraditya at msn dot com
2023-05-18 5:34 ` rguenth at gcc dot gnu.org
4 siblings, 0 replies; 6+ messages in thread
From: hiraditya at msn dot com @ 2023-05-17 17:23 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109441
--- Comment #3 from AK <hiraditya at msn dot com> ---
> But IMHO it's academic, right?
yes. i was just messing with vector codegen. But in case all the elements of a
vector/array are same, maybe the loop can be replaced with equivalent
computation?
^ permalink raw reply [flat|nested] 6+ messages in thread
* [Bug tree-optimization/109441] missed optimization when all elements of vector are known
2023-04-06 18:31 [Bug tree-optimization/109441] New: missed optimization when all elements of vector are known hiraditya at msn dot com
` (3 preceding siblings ...)
2023-05-17 17:23 ` hiraditya at msn dot com
@ 2023-05-18 5:34 ` rguenth at gcc dot gnu.org
4 siblings, 0 replies; 6+ messages in thread
From: rguenth at gcc dot gnu.org @ 2023-05-18 5:34 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109441
--- Comment #4 from Richard Biener <rguenth at gcc dot gnu.org> ---
(In reply to AK from comment #3)
> > But IMHO it's academic, right?
>
> yes. i was just messing with vector codegen. But in case all the elements of
> a vector/array are same, maybe the loop can be replaced with equivalent
> computation?
Yes. GCC doesn't currently have the ability to constant propagate or
value-number defs defined by cycles [that actually iterate]. In general
doing that is computationally expensive and only in very few cases you
can short-cut simulating all iterations (final value replacement does this,
but the case in this PR is already too complicated because it involves
a memory load).
For the case in this PR when simplified to not require removal of all
the C++ abstraction via inlining we'd need to handle a loopy memory
definition and a loopy memory use. The loopy memory def we can probably
pattern match to a memset and the loopy memory use could be (but isn't
currently) identified to be always zero. Pass ordering still stands in the
way then though.
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2023-05-18 5:34 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-04-06 18:31 [Bug tree-optimization/109441] New: missed optimization when all elements of vector are known hiraditya at msn dot com
2023-04-06 18:36 ` [Bug tree-optimization/109441] " pinskia at gcc dot gnu.org
2023-04-06 18:40 ` hiraditya at msn dot com
2023-04-11 13:17 ` rguenth at gcc dot gnu.org
2023-05-17 17:23 ` hiraditya at msn dot com
2023-05-18 5:34 ` rguenth at gcc dot gnu.org
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).