* [Bug target/51838] Inefficient add of 128 bit quantity represented as 64 bit tuple to 128 bit integer.
2012-01-12 19:29 [Bug c/51838] New: Inefficient add of 128 bit quantity represented as 64 bit tuple to 128 bit integer svfuerst at gmail dot com
@ 2021-08-28 16:27 ` pinskia at gcc dot gnu.org
2021-08-30 4:58 ` crazylht at gmail dot com
2021-08-30 6:33 ` ubizjak at gmail dot com
2 siblings, 0 replies; 4+ messages in thread
From: pinskia at gcc dot gnu.org @ 2021-08-28 16:27 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51838
Andrew Pinski <pinskia at gcc dot gnu.org> changed:
            What|Removed             |Added
----------------------------------------------------------------------------
       Component|middle-end          |target
          Status|UNCONFIRMED         |NEW
Last reconfirmed|                    |2021-08-28
  Ever confirmed|0                   |1
        Keywords|                    |missed-optimization
--- Comment #1 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
We do generate slightly better code now:

        xorl    %eax, %eax
        movq    %rdi, %r8
        xorl    %edi, %edi
        addq    %rsi, %rax
        adcq    %rdi, %rdx
        addq    %rax, (%r8)
        adcq    %rdx, 8(%r8)
        ret

Note that on aarch64 we do get good code:

        ldp     x3, x4, [x0]
        adds    x3, x3, x1
        adc     x4, x4, x2
        stp     x3, x4, [x0]
        ret
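
[Editorial sketch, not part of the original comment: the x86-64 sequence above matches what comment #2 below shows for foo(), i.e. *x += y + ((__uint128_t) z << 64). One way to express the same 128-bit accumulate with the carry between the two 64-bit halves made explicit at the source level is shown here; the helper name add128 and the use of __builtin_add_overflow are purely illustrative.]

#include <stdint.h>

/* Sketch only: add the 64-bit pair (hi:lo) to the 128-bit value *x,
   making the carry between the two halves explicit.
   __builtin_add_overflow returns the carry out of the low-word add. */
static void add128(uint64_t x[2], uint64_t lo, uint64_t hi)
{
  uint64_t new_lo;
  unsigned carry = __builtin_add_overflow(x[0], lo, &new_lo);
  x[0] = new_lo;
  x[1] += hi + carry;
}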
* [Bug target/51838] Inefficient add of 128 bit quantity represented as 64 bit tuple to 128 bit integer.
2012-01-12 19:29 [Bug c/51838] New: Inefficient add of 128 bit quantity represented as 64 bit tuple to 128 bit integer svfuerst at gmail dot com
2021-08-28 16:27 ` [Bug target/51838] " pinskia at gcc dot gnu.org
@ 2021-08-30 4:58 ` crazylht at gmail dot com
2021-08-30 6:33 ` ubizjak at gmail dot com
2 siblings, 0 replies; 4+ messages in thread
From: crazylht at gmail dot com @ 2021-08-30 4:58 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51838
--- Comment #2 from Hongtao.liu <crazylht at gmail dot com> ---
(In reply to Andrew Pinski from comment #1)
> We do generate slightly better code now:
>
>         xorl    %eax, %eax
>         movq    %rdi, %r8
>         xorl    %edi, %edi
>         addq    %rsi, %rax
>         adcq    %rdi, %rdx
>         addq    %rax, (%r8)
>         adcq    %rdx, 8(%r8)
>         ret
>
> Note that on aarch64 we do get good code:
>
>         ldp     x3, x4, [x0]
>         adds    x3, x3, x1
>         adc     x4, x4, x2
>         stp     x3, x4, [x0]
>         ret
The interesting thing is that when I remove addti3 and ashlti3 from i386.md, GCC
generates optimal code:
void foo(__uint128_t *x, unsigned long long y, unsigned long long z)
{
  *x += y + ((__uint128_t) z << 64);
}

void foo1(__uint128_t *x, unsigned long long y, unsigned long long z)
{
  *x += (__uint128_t) z << 64;
}

void foo2(__uint128_t *x, unsigned long long y, unsigned long long z)
{
  *x += (__uint128_t) z << 3;
}

void foo3(__uint128_t *x, __uint128_t *y)
{
  *x += *y;
}
diff --git a/origin.s b/test.s
index 08274ba..764241a 100644
--- a/origin.s
+++ b/test.s
@@ -6,13 +6,8 @@
foo:
.LFB0:
.cfi_startproc
- xorl %eax, %eax
- movq %rdi, %r8
- xorl %edi, %edi
- addq %rsi, %rax
- adcq %rdi, %rdx
- addq %rax, (%r8)
- adcq %rdx, 8(%r8)
+ addq %rsi, (%rdi)
+ adcq %rdx, 8(%rdi)
ret
.cfi_endproc
.LFE0:
@@ -23,9 +18,7 @@ foo:
foo1:
.LFB1:
.cfi_startproc
- xorl %eax, %eax
- addq %rax, (%rdi)
- adcq %rdx, 8(%rdi)
+ addq %rdx, 8(%rdi)
ret
.cfi_endproc
.LFE1:
@@ -36,13 +29,13 @@ foo1:
foo2:
.LFB2:
.cfi_startproc
- movq %rdx, %rax
- movq %rdx, %r8
- salq $3, %rax
- xorl %edx, %edx
- shldq $3, %r8, %rdx
- addq %rax, (%rdi)
- adcq %rdx, 8(%rdi)
+ movq (%rdi), %rcx
+ movq %rdx, %rsi
+ shrq $61, %rsi
+ leaq (%rcx,%rdx,8), %rax
+ cmpq %rcx, %rax
+ movq %rax, (%rdi)
+ adcq %rsi, 8(%rdi)
ret
.cfi_endproc
.LFE2:
@@ -53,9 +46,10 @@ foo2:
foo3:
.LFB3:
.cfi_startproc
- movq (%rsi), %rax
+ movq (%rdi), %rax
+ addq (%rsi), %rax
movq 8(%rsi), %rdx
- addq %rax, (%rdi)
+ movq %rax, (%rdi)
adcq %rdx, 8(%rdi)
ret
.cfi_endproc
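
[Editorial sketch, not from the original comment: a C-level view of what the improved foo2 sequence computes. The leaq forms the new low word, the cmpq against the old low word recovers the carry, and the adcq folds it into the high word. The helper name foo2_sketch is purely illustrative.]

#include <stdint.h>

/* Sketch of the improved foo2: *x += (__uint128_t) z << 3. */
static void foo2_sketch(uint64_t x[2], uint64_t z)
{
  uint64_t spill  = z >> 61;            /* bits of z shifted into the high word */
  uint64_t old_lo = x[0];
  uint64_t new_lo = old_lo + (z << 3);  /* leaq (%rcx,%rdx,8), %rax */
  unsigned carry  = new_lo < old_lo;    /* cmpq %rcx, %rax sets CF on wraparound */
  x[0] = new_lo;
  x[1] += spill + carry;                /* adcq %rsi, 8(%rdi) */
}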