public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug c/51838] New: Inefficient add of 128 bit quantity represented as 64 bit tuple to 128 bit integer.
@ 2012-01-12 19:29 svfuerst at gmail dot com
2021-08-28 16:27 ` [Bug target/51838] " pinskia at gcc dot gnu.org
` (2 more replies)
0 siblings, 3 replies; 4+ messages in thread
From: svfuerst at gmail dot com @ 2012-01-12 19:29 UTC (permalink / raw)
To: gcc-bugs
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=51838
Bug #: 51838
Summary: Inefficient add of 128 bit quantity represented as 64
bit tuple to 128 bit integer.
Classification: Unclassified
Product: gcc
Version: 4.7.0
Status: UNCONFIRMED
Severity: enhancement
Priority: P3
Component: c
AssignedTo: unassigned@gcc.gnu.org
ReportedBy: svfuerst@gmail.com
void foo(__uint128_t *x, unsigned long long y, unsigned long long z)
{
*x += y + ((__uint128_t) z << 64);
}
Compiles into:
mov %rdx,%r8
mov %rsi,%rax
xor %edx,%edx
add (%rdi),%rax
mov %rdi,%rcx
adc 0x8(%rdi),%rdx
xor %esi,%esi
add %rsi,%rax
adc %r8,%rdx
mov %rax,(%rcx)
mov %rdx,0x8(%rcx)
retq
The above can be optimized into:
add %rsi, (%rdi)
adc %rdx, 8(%rdi)
retq
^ permalink raw reply [flat|nested] 4+ messages in thread
* [Bug target/51838] Inefficient add of 128 bit quantity represented as 64 bit tuple to 128 bit integer.
2012-01-12 19:29 [Bug c/51838] New: Inefficient add of 128 bit quantity represented as 64 bit tuple to 128 bit integer svfuerst at gmail dot com
@ 2021-08-28 16:27 ` pinskia at gcc dot gnu.org
2021-08-30 4:58 ` crazylht at gmail dot com
2021-08-30 6:33 ` ubizjak at gmail dot com
2 siblings, 0 replies; 4+ messages in thread
From: pinskia at gcc dot gnu.org @ 2021-08-28 16:27 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51838
Andrew Pinski <pinskia at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Component|middle-end |target
Status|UNCONFIRMED |NEW
Last reconfirmed| |2021-08-28
Ever confirmed|0 |1
Keywords| |missed-optimization
--- Comment #1 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
We do get slightly better now:
xorl %eax, %eax
movq %rdi, %r8
xorl %edi, %edi
addq %rsi, %rax
adcq %rdi, %rdx
addq %rax, (%r8)
adcq %rdx, 8(%r8)
ret
Note on aarch64 we do get good code:
ldp x3, x4, [x0]
adds x3, x3, x1
adc x4, x4, x2
stp x3, x4, [x0]
ret
^ permalink raw reply [flat|nested] 4+ messages in thread
* [Bug target/51838] Inefficient add of 128 bit quantity represented as 64 bit tuple to 128 bit integer.
2012-01-12 19:29 [Bug c/51838] New: Inefficient add of 128 bit quantity represented as 64 bit tuple to 128 bit integer svfuerst at gmail dot com
2021-08-28 16:27 ` [Bug target/51838] " pinskia at gcc dot gnu.org
@ 2021-08-30 4:58 ` crazylht at gmail dot com
2021-08-30 6:33 ` ubizjak at gmail dot com
2 siblings, 0 replies; 4+ messages in thread
From: crazylht at gmail dot com @ 2021-08-30 4:58 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51838
--- Comment #2 from Hongtao.liu <crazylht at gmail dot com> ---
(In reply to Andrew Pinski from comment #1)
> We do get slightly better now:
> xorl %eax, %eax
> movq %rdi, %r8
> xorl %edi, %edi
> addq %rsi, %rax
> adcq %rdi, %rdx
> addq %rax, (%r8)
> adcq %rdx, 8(%r8)
> ret
>
> Note on arch64 we do get good code:
> ldp x3, x4, [x0]
> adds x3, x3, x1
> adc x4, x4, x2
> stp x3, x4, [x0]
> ret
The interesting thing is when I remove addti3 and ashlti3 from i386.md, GCC
generates optimal code.
void foo(__uint128_t *x, unsigned long long y, unsigned long long z)
{
*x += y + ((__uint128_t) z << 64);
}
void foo1(__uint128_t *x, unsigned long long y, unsigned long long z)
{
*x += (__uint128_t) z << 64;
}
void foo2(__uint128_t *x, unsigned long long y, unsigned long long z)
{
*x += (__uint128_t) z << 3;
}
void foo3(__uint128_t *x, __uint128_t *y)
{
*x += *y;
}
diff --git a/origin.s b/test.s
index 08274ba..764241a 100644
--- a/origin.s
+++ b/test.s
@@ -6,13 +6,8 @@
foo:
.LFB0:
.cfi_startproc
- xorl %eax, %eax
- movq %rdi, %r8
- xorl %edi, %edi
- addq %rsi, %rax
- adcq %rdi, %rdx
- addq %rax, (%r8)
- adcq %rdx, 8(%r8)
+ addq %rsi, (%rdi)
+ adcq %rdx, 8(%rdi)
ret
.cfi_endproc
.LFE0:
@@ -23,9 +18,7 @@ foo:
foo1:
.LFB1:
.cfi_startproc
- xorl %eax, %eax
- addq %rax, (%rdi)
- adcq %rdx, 8(%rdi)
+ addq %rdx, 8(%rdi)
ret
.cfi_endproc
.LFE1:
@@ -36,13 +29,13 @@ foo1:
foo2:
.LFB2:
.cfi_startproc
- movq %rdx, %rax
- movq %rdx, %r8
- salq $3, %rax
- xorl %edx, %edx
- shldq $3, %r8, %rdx
- addq %rax, (%rdi)
- adcq %rdx, 8(%rdi)
+ movq (%rdi), %rcx
+ movq %rdx, %rsi
+ shrq $61, %rsi
+ leaq (%rcx,%rdx,8), %rax
+ cmpq %rcx, %rax
+ movq %rax, (%rdi)
+ adcq %rsi, 8(%rdi)
ret
.cfi_endproc
.LFE2:
@@ -53,9 +46,10 @@ foo2:
foo3:
.LFB3:
.cfi_startproc
- movq (%rsi), %rax
+ movq (%rdi), %rax
+ addq (%rsi), %rax
movq 8(%rsi), %rdx
- addq %rax, (%rdi)
+ movq %rax, (%rdi)
adcq %rdx, 8(%rdi)
ret
.cfi_endproc
(END)
^ permalink raw reply [flat|nested] 4+ messages in thread
* [Bug target/51838] Inefficient add of 128 bit quantity represented as 64 bit tuple to 128 bit integer.
2012-01-12 19:29 [Bug c/51838] New: Inefficient add of 128 bit quantity represented as 64 bit tuple to 128 bit integer svfuerst at gmail dot com
2021-08-28 16:27 ` [Bug target/51838] " pinskia at gcc dot gnu.org
2021-08-30 4:58 ` crazylht at gmail dot com
@ 2021-08-30 6:33 ` ubizjak at gmail dot com
2 siblings, 0 replies; 4+ messages in thread
From: ubizjak at gmail dot com @ 2021-08-30 6:33 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51838
--- Comment #3 from Uroš Bizjak <ubizjak at gmail dot com> ---
(In reply to Hongtao.liu from comment #2)
> The interest thing is when i remove addti3 and ashlti3 from i386.md, GCC
> generates optimal code.
Yes, we had this situation with <logic>_doubleword instructions, and it looks like
the same cure can also be applied to arithmetic and shift doubleword patterns.
In the past, the middle-end was not able to emit operations with carry, so we
were forced to implement doubleword arithmetic/shifts as it is currently
done. If this limitation was lifted in the meantime, then removing interfering
patterns is the sure way to go.
There is a small complication with DImode patterns, which have to be present
for i686 to perform STV conversion, and we have to check for special values
(see for example "*anddi3_doubleword") and explicitly handle them.
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2021-08-30 6:33 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-01-12 19:29 [Bug c/51838] New: Inefficient add of 128 bit quantity represented as 64 bit tuple to 128 bit integer svfuerst at gmail dot com
2021-08-28 16:27 ` [Bug target/51838] " pinskia at gcc dot gnu.org
2021-08-30 4:58 ` crazylht at gmail dot com
2021-08-30 6:33 ` ubizjak at gmail dot com
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).