public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug c/53355] New: Autovectorization of a simple loop could be improved.
@ 2012-05-15 1:55 svfuerst at gmail dot com
2012-05-15 9:44 ` [Bug tree-optimization/53355] " rguenth at gcc dot gnu.org
` (5 more replies)
0 siblings, 6 replies; 7+ messages in thread
From: svfuerst at gmail dot com @ 2012-05-15 1:55 UTC (permalink / raw)
To: gcc-bugs
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=53355
Bug #: 53355
Summary: Autovectorization of a simple loop could be improved.
Classification: Unclassified
Product: gcc
Version: 4.7.0
Status: UNCONFIRMED
Severity: enhancement
Priority: P3
Component: c
AssignedTo: unassigned@gcc.gnu.org
ReportedBy: svfuerst@gmail.com
The following simple function gets autovectorized by gcc-4.7 -O3
void foo(double *x)
{
int i;
for (i = 0; i < 100000; i++)
{
x[i] += 2.0;
}
}
However, the generated code is rather poor:
foo:
.LFB0:
.cfi_startproc
movq %rdi, %rax
salq $60, %rax
sarq $63, %rax
movq %rax, %rdx
andl $1, %edx
testq %rdx, %rdx
movl %edx, %ecx
je .L7
movsd .LC0(%rip), %xmm0
movl $99999, %r9d
movl $1, %r10d
addsd (%rdi), %xmm0
movsd %xmm0, (%rdi)
.L2:
movl $100000, %r8d
andl $1, %eax
subl %ecx, %r8d
movl %r8d, %esi
shrl %esi
movl %esi, %r11d
addl %r11d, %r11d
je .L3
movapd .LC1(%rip), %xmm1
leaq (%rdi,%rax,8), %rcx
xorl %edx, %edx
xorl %eax, %eax
.p2align 4,,10
.p2align 3
.L4:
movapd (%rcx,%rax), %xmm0
addl $1, %edx
addpd %xmm1, %xmm0
movapd %xmm0, (%rcx,%rax)
addq $16, %rax
cmpl %esi, %edx
jb .L4
addl %r11d, %r10d
subl %r11d, %r9d
cmpl %r11d, %r8d
je .L1
.L3:
leal -1(%r9), %edx
movslq %r10d, %r10
leaq (%rdi,%r10,8), %rax
movsd .LC0(%rip), %xmm1
addq %rdx, %r10
leaq 8(%rdi,%r10,8), %rdx
.p2align 4,,10
.p2align 3
.L6:
movsd (%rax), %xmm0
addsd %xmm1, %xmm0
movsd %xmm0, (%rax)
addq $8, %rax
cmpq %rdx, %rax
jne .L6
.L1:
rep
ret
.L7:
movl $100000, %r9d
xorl %r10d, %r10d
jmp .L2
The first thing wrong is that the alignment test is rather clunky. Instead, we
can just do a "testq $8, %rdi" followed by a jump to the miss-aligned case.
The second thing wrong is that the code instead of falling-through to the
aligned case, jumps down to L7 instead. Forward jumps are predicted as not
taken.
The inner loop is also longer than it should be. It is possible to have one
less increment statement inside.
Finally, after the main loop, the code again tests for miss-alignment. This
pessimizes the fast path. Instead, it possible to move the slow miss-aligned
case completely out with very little size penalty.
Doing the above optimizations yields:
movapd 2_s, %xmm1
testq $8, %rdi
jne 2f
lea 800000(%rdi), %rax
# The "Hot" inner loop
1: movapd (%rdi), %xmm0
addq $16, %rdi
addpd %xmm1, %xmm0
movapd %xmm0, (%rdi)
cmpq %rdi, %rax
jne 1b
rep ret
# The slow miss-aligned case
2: movsd (%rdi), %xmm0
addsd %xmm1, %xmm0
movsd %xmm0, (%rdi)
leaq (800000 - 16) (%rdi), %rax
addq $8, %rdi
3: movapd (%rdi), %xmm0
addq $16, %rdi
addpd %xmm1, %xmm0
movapd %xmm0, (%rdi)
cmpq %rdi, %rax
jne 3b
movsd (%rdi), %xmm0
addsd %xmm1, %xmm0
movsd %xmm0, (%rdi)
ret
^ permalink raw reply [flat|nested] 7+ messages in thread
* [Bug tree-optimization/53355] Autovectorization of a simple loop could be improved.
2012-05-15 1:55 [Bug c/53355] New: Autovectorization of a simple loop could be improved svfuerst at gmail dot com
@ 2012-05-15 9:44 ` rguenth at gcc dot gnu.org
2012-05-15 13:19 ` rguenth at gcc dot gnu.org
` (4 subsequent siblings)
5 siblings, 0 replies; 7+ messages in thread
From: rguenth at gcc dot gnu.org @ 2012-05-15 9:44 UTC (permalink / raw)
To: gcc-bugs
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=53355
Richard Guenther <rguenth at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Status|UNCONFIRMED |ASSIGNED
Last reconfirmed| |2012-05-15
Component|middle-end |tree-optimization
AssignedTo|unassigned at gcc dot |rguenth at gcc dot gnu.org
|gnu.org |
Ever Confirmed|0 |1
--- Comment #1 from Richard Guenther <rguenth at gcc dot gnu.org> 2012-05-15 09:40:45 UTC ---
Confirmed. Btw, it does not check for misalignment again but for whether
there is a remaining scalar iteration to be done. On trunk (for GCC 4.8)
we already improve somewhat - but the odd induction variable choices
remain.
foo:
.LFB0:
.cfi_startproc
movq %rdi, %rax
salq $60, %rax
shrq $63, %rax
testl %eax, %eax
je .L2
movsd (%rdi), %xmm0
movl $100000, %r8d
movsd .LC0(%rip), %xmm1
subl %eax, %r8d
movl %r8d, %esi
movl $99999, %r11d
addsd %xmm1, %xmm0
shrl %esi
movl %esi, %r9d
addl %r9d, %r9d
movsd %xmm0, (%rdi)
je .L9
movl $1, %r10d
.L8:
movapd .LC1(%rip), %xmm1
leaq (%rdi,%rax,8), %rcx
xorl %edx, %edx
xorl %eax, %eax
.p2align 4,,10
.p2align 3
.L7:
movapd (%rcx,%rax), %xmm0
addl $1, %edx
addpd %xmm1, %xmm0
movapd %xmm0, (%rcx,%rax)
addq $16, %rax
cmpl %esi, %edx
jb .L7
subl %r9d, %r11d
cmpl %r9d, %r8d
leal (%r10,%r9), %edx
je .L1
movsd .LC0(%rip), %xmm1
.L3:
leal -1(%r11), %ecx
movslq %edx, %rdx
leaq 0(,%rdx,8), %rax
leaq 1(%rdx,%rcx), %rdx
salq $3, %rdx
.p2align 4,,10
.p2align 3
.L6:
movsd (%rdi,%rax), %xmm0
addsd %xmm1, %xmm0
movsd %xmm0, (%rdi,%rax)
addq $8, %rax
cmpq %rdx, %rax
jne .L6
rep
ret
.L1:
ret
.L2:
movl $100000, %r8d
movl $50000, %esi
movl $100000, %r9d
movl $100000, %r11d
xorl %r10d, %r10d
jmp .L8
.L9:
movl $1, %edx
jmp .L3
what we also fail to optimize are the entry checks of the vectorized loops
after the prologue loop:
# BLOCK 2 loop_depth:0 freq:100
# PRED: ENTRY [100.0%] (fallthru,exec)
D.1738_23 = (unsigned long) x_2(D);
D.1739_24 = D.1738_23 & 15;
D.1740_25 = D.1739_24 >> 3;
D.1741_26 = -D.1740_25;
D.1742_27 = (unsigned int) D.1741_26;
D.1802_18 = D.1741_26 & 1;
prolog_loop_niters.16_28 = (unsigned int) D.1802_18;
if (prolog_loop_niters.16_28 == 0)
goto <bb 12>;
else
goto <bb 3>;
# SUCC: 3 [100.0%] (false,exec) 12 (true,exec)
# BLOCK 3 loop_depth:0 freq:100
# PRED: 2 [100.0%] (false,exec)
x_65 = x_2(D);
D.1718_37 = MEM[base: x_65, offset: 0B];
D.1719_38 = D.1718_37 + 2.0e+0;
MEM[base: x_65, offset: 0B] = D.1719_38;
prolog_loop_adjusted_niters.18_52 = D.1741_26 & 1;
niters.19_53 = 100000 - prolog_loop_niters.16_28;
bnd.20_54 = niters.19_53 >> 1;
ratio_mult_vf.21_55 = bnd.20_54 << 1;
if (ratio_mult_vf.21_55 == 0)
goto <bb 7>;
else
goto <bb 4>;
# SUCC: 4 [100.0%] (false,exec) 7 (true,exec)
we fail to see that ratio_mult_vf.21_55 is never zero. Of course that is
because of the awkward representation of the checks.
I will look into this.
^ permalink raw reply [flat|nested] 7+ messages in thread
* [Bug tree-optimization/53355] Autovectorization of a simple loop could be improved.
2012-05-15 1:55 [Bug c/53355] New: Autovectorization of a simple loop could be improved svfuerst at gmail dot com
2012-05-15 9:44 ` [Bug tree-optimization/53355] " rguenth at gcc dot gnu.org
@ 2012-05-15 13:19 ` rguenth at gcc dot gnu.org
2012-05-15 13:46 ` rguenth at gcc dot gnu.org
` (3 subsequent siblings)
5 siblings, 0 replies; 7+ messages in thread
From: rguenth at gcc dot gnu.org @ 2012-05-15 13:19 UTC (permalink / raw)
To: gcc-bugs
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=53355
--- Comment #2 from Richard Guenther <rguenth at gcc dot gnu.org> 2012-05-15 13:18:42 UTC ---
Author: rguenth
Date: Tue May 15 13:18:32 2012
New Revision: 187535
URL: http://gcc.gnu.org/viewcvs?root=gcc&view=rev&rev=187535
Log:
2012-05-15 Richard Guenther <rguenther@suse.de>
PR tree-optimization/53355
* tree-vrp.c (extract_range_from_binary_expr_1): Handle LSHIFT_EXPRs
by constants.
* gcc.dg/tree-ssa/vrp67.c: New testcase.
Added:
trunk/gcc/testsuite/gcc.dg/tree-ssa/vrp67.c
Modified:
trunk/gcc/ChangeLog
trunk/gcc/testsuite/ChangeLog
trunk/gcc/tree-vrp.c
^ permalink raw reply [flat|nested] 7+ messages in thread
* [Bug tree-optimization/53355] Autovectorization of a simple loop could be improved.
2012-05-15 1:55 [Bug c/53355] New: Autovectorization of a simple loop could be improved svfuerst at gmail dot com
2012-05-15 9:44 ` [Bug tree-optimization/53355] " rguenth at gcc dot gnu.org
2012-05-15 13:19 ` rguenth at gcc dot gnu.org
@ 2012-05-15 13:46 ` rguenth at gcc dot gnu.org
2012-05-16 15:25 ` rguenth at gcc dot gnu.org
` (2 subsequent siblings)
5 siblings, 0 replies; 7+ messages in thread
From: rguenth at gcc dot gnu.org @ 2012-05-15 13:46 UTC (permalink / raw)
To: gcc-bugs
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=53355
--- Comment #3 from Richard Guenther <rguenth at gcc dot gnu.org> 2012-05-15 13:19:06 UTC ---
With the patch we now emit
foo:
.LFB0:
.cfi_startproc
movq %rdi, %rax
salq $60, %rax
sarq $63, %rax
movq %rax, %rdx
andl $1, %edx
testl %edx, %edx
je .L7
movsd .LC0(%rip), %xmm0
movl $99999, %r10d
movl $1, %r11d
addsd (%rdi), %xmm0
movsd %xmm0, (%rdi)
.L2:
movl $100000, %r8d
andl $1, %eax
subl %edx, %r8d
movapd .LC1(%rip), %xmm1
movl %r8d, %esi
leaq (%rdi,%rax,8), %rcx
xorl %edx, %edx
shrl %esi
xorl %eax, %eax
leal (%rsi,%rsi), %r9d
.p2align 4,,10
.p2align 3
.L6:
movapd (%rcx,%rax), %xmm0
addl $1, %edx
addpd %xmm1, %xmm0
movapd %xmm0, (%rcx,%rax)
addq $16, %rax
cmpl %esi, %edx
jb .L6
subl %r9d, %r10d
cmpl %r9d, %r8d
leal (%r11,%r9), %edx
je .L1
leal -1(%r10), %ecx
movslq %edx, %rdx
leaq 0(,%rdx,8), %rax
movsd .LC0(%rip), %xmm1
leaq 1(%rdx,%rcx), %rdx
salq $3, %rdx
.p2align 4,,10
.p2align 3
.L5:
movsd (%rdi,%rax), %xmm0
addsd %xmm1, %xmm0
movsd %xmm0, (%rdi,%rax)
addq $8, %rax
cmpq %rdx, %rax
jne .L5
.L1:
rep
ret
.L7:
movl $100000, %r10d
xorl %r11d, %r11d
jmp .L2
^ permalink raw reply [flat|nested] 7+ messages in thread
* [Bug tree-optimization/53355] Autovectorization of a simple loop could be improved.
2012-05-15 1:55 [Bug c/53355] New: Autovectorization of a simple loop could be improved svfuerst at gmail dot com
` (2 preceding siblings ...)
2012-05-15 13:46 ` rguenth at gcc dot gnu.org
@ 2012-05-16 15:25 ` rguenth at gcc dot gnu.org
2012-05-18 14:21 ` rguenth at gcc dot gnu.org
2012-07-13 13:54 ` rguenth at gcc dot gnu.org
5 siblings, 0 replies; 7+ messages in thread
From: rguenth at gcc dot gnu.org @ 2012-05-16 15:25 UTC (permalink / raw)
To: gcc-bugs
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=53355
--- Comment #4 from Richard Guenther <rguenth at gcc dot gnu.org> 2012-05-16 14:46:33 UTC ---
One major remaining issue is that the entry checks for the versioned loops
all base on the number of iterations of said loop instead of on the
feature (in the case of peeling for alignment we don't check for aligned but
we check for does the align-loop run zero times). That causes unnecessary
code to be executed in the path that does not need it.
Another remaining issue is that we allow for
if (unaligned)
for (;;)
if (any-vectorized-iterations)
for (;;)
if (any-remaining-iterations)
for (;;)
thus any-vectorized-iterations may be false when we leave the alignment
loop. That shounds bogus - this case should drop into a single purely
scalar loop instead, avoiding the check and making all cases faster.
In the testcase this gets optimized away (because we have a known number
of overall iterations), but with a variable upper bound cost considerations
should not allow this case.
In general the scalar loop version, the epilogue and only then the prologue
loop should be considered as fallback for the non profitable case (in this
order). Not the prologue loop first.
^ permalink raw reply [flat|nested] 7+ messages in thread
* [Bug tree-optimization/53355] Autovectorization of a simple loop could be improved.
2012-05-15 1:55 [Bug c/53355] New: Autovectorization of a simple loop could be improved svfuerst at gmail dot com
` (3 preceding siblings ...)
2012-05-16 15:25 ` rguenth at gcc dot gnu.org
@ 2012-05-18 14:21 ` rguenth at gcc dot gnu.org
2012-07-13 13:54 ` rguenth at gcc dot gnu.org
5 siblings, 0 replies; 7+ messages in thread
From: rguenth at gcc dot gnu.org @ 2012-05-18 14:21 UTC (permalink / raw)
To: gcc-bugs
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=53355
--- Comment #5 from Richard Guenther <rguenth at gcc dot gnu.org> 2012-05-18 12:32:11 UTC ---
The following testcase (extracted from PR53346, which stores zero though) shows
another case of slowness
void foo (int *p, int n)
{
int i;
for (i = 0; i < n; ++i)
p[i] = 1;
}
^ permalink raw reply [flat|nested] 7+ messages in thread
* [Bug tree-optimization/53355] Autovectorization of a simple loop could be improved.
2012-05-15 1:55 [Bug c/53355] New: Autovectorization of a simple loop could be improved svfuerst at gmail dot com
` (4 preceding siblings ...)
2012-05-18 14:21 ` rguenth at gcc dot gnu.org
@ 2012-07-13 13:54 ` rguenth at gcc dot gnu.org
5 siblings, 0 replies; 7+ messages in thread
From: rguenth at gcc dot gnu.org @ 2012-07-13 13:54 UTC (permalink / raw)
To: gcc-bugs
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=53355
Richard Guenther <rguenth at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
CC| |falk at debian dot org
--- Comment #6 from Richard Guenther <rguenth at gcc dot gnu.org> 2012-07-13 13:52:25 UTC ---
*** Bug 18557 has been marked as a duplicate of this bug. ***
^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2012-07-13 13:54 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-05-15 1:55 [Bug c/53355] New: Autovectorization of a simple loop could be improved svfuerst at gmail dot com
2012-05-15 9:44 ` [Bug tree-optimization/53355] " rguenth at gcc dot gnu.org
2012-05-15 13:19 ` rguenth at gcc dot gnu.org
2012-05-15 13:46 ` rguenth at gcc dot gnu.org
2012-05-16 15:25 ` rguenth at gcc dot gnu.org
2012-05-18 14:21 ` rguenth at gcc dot gnu.org
2012-07-13 13:54 ` rguenth at gcc dot gnu.org
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).