* Optimiser failure for ternary foo == 0L ? NULL : bar;
@ 2021-07-17 18:54 Stefan Kanthak
2021-07-17 19:31 ` Richard Biener
0 siblings, 1 reply; 2+ messages in thread
From: Stefan Kanthak @ 2021-07-17 18:54 UTC (permalink / raw)
To: gcc
Hi,
GCC 10.2.0 (and GCC 8.3; other versions and targets except i386 and
amd64 not tested) generate rather bad code for the following ternary
expression:
--- repro.c ---
#define NULL (char *) 0
char *dummy(char *string, long count) {
return count == 0 ? NULL : string + 1;
}
--- EOF ---
$ gcc -m64 -o- -O3 -S repro.c
dummy:
addq $1, %rdi
movl $0, %eax
testq %rsi, %rsi
cmovne %rdi, %rax
ret
JFTR: why does GCC NOT generate the shorter "XOR %eax, %eax" here?
$ gcc -m64 -O3 -c dummy.c
$ objdump -D dummy.o
0000000000000000 <dummy>:
0: 48 83 c7 01 add $0x1,%rdi
4: b8 00 00 00 00 mov $0x0,%eax
9: 48 85 f6 test %rsi,%rsi
c: 48 0f 45 c7 cmovne %rdi,%rax
10: c3 retq
i386 and AMD64 use the ILP32 and LP64 data models, respectively, where a "long" and
a "pointer" have the same size, and 0L and the null pointer have the
same binary representation, so the contents of RSI should be used to
load RAX with 0 conditionally:
dummy:
leaq 1(%rdi), %rax
testq %rsi, %rsi
cmoveq %rsi, %rax # count == 0: RSI is 0, i.e. the null pointer
ret
$ gcc -m32 -o- -O3 -S dummy.c
_dummy:
movl 8(%esp), %edx
movl 4(%esp), %eax
addl $1, %eax
testl %edx, %edx
movl $0, %edx
cmove %edx, %eax # OUCH: if this executes, EDX was 0 before,
ret # so the MOV is really a NOP!
$ gcc -m32 -O3 -c dummy.c
$ objdump -D dummy.o
00000000 <_dummy>:
0: 8b 54 24 08 mov 0x8(%esp),%edx
4: 8b 44 24 04 mov 0x4(%esp),%eax
8: 83 c0 01 add $0x1,%eax
b: 85 d2 test %edx,%edx
d: ba 00 00 00 00 mov $0x0,%edx
12: 0f 44 c2 cmove %edx,%eax
15: c3 ret
Here's what GCC should generate, but doesn't:
00000000 <_dummy>:
0: 8b 44 24 04 mov 0x4(%esp),%eax
4: 8b 4c 24 08 mov 0x8(%esp),%ecx
8: 40 inc %eax
9: f7 d9 neg %ecx
b: 19 c9 sbb %ecx,%ecx
d: 21 c8 and %ecx,%eax
f: c3 ret
For (pre)historic processors which don't support CMOVcc the
following code is generated:
$ gcc -m32 -mtune=i386 -o- -S dummy.c
_dummy:
movl 8(%esp), %eax
testl %eax, %eax
je L3
movl 4(%esp), %eax
incl %eax
ret
.p2align 2
L3: # OUCH: EAX is already 0 here!
xorl %eax, %eax
ret
00000000 <dummy>:
0: 8b 44 24 08 mov 0x8(%esp),%eax
4: 85 c0 test %eax,%eax
6: 74 08 je 10 <dummy+0x10>
8: 8b 44 24 04 mov 0x4(%esp),%eax
c: 40 inc %eax
d: c3 ret
e: 66 90 xchg %ax,%ax
10: 31 c0 xor %eax,%eax
12: c3 ret
not amused
Stefan Kanthak
^ permalink raw reply [flat|nested] 2+ messages in thread
* Re: Optimiser failure for ternary foo == 0L ? NULL : bar;
2021-07-17 18:54 Optimiser failure for ternary foo == 0L ? NULL : bar; Stefan Kanthak
@ 2021-07-17 19:31 ` Richard Biener
0 siblings, 0 replies; 2+ messages in thread
From: Richard Biener @ 2021-07-17 19:31 UTC (permalink / raw)
To: gcc, Stefan Kanthak
On July 17, 2021 8:54:38 PM GMT+02:00, Stefan Kanthak <stefan.kanthak@nexgo.de> wrote:
>Hi,
>
>GCC 10.2.0 (and GCC 8.3; other versions and targets except i386 and
>amd64 not tested) generate rather bad code for the following ternary
>expression:
>
>--- repro.c ---
>#define NULL (char *) 0
>
>char *dummy(char *string, long count) {
> return count == 0 ? NULL : string + 1;
>}
>--- EOF ---
>
>$ gcc -m64 -o- -O3 -S repro.c
>
>dummy:
> addq $1, %rdi
> movl $0, %eax
> testq %rsi, %rsi
> cmovne %rdi, %rax
> ret
>
>JFTR: why does GCC NOT generate the shorter "XOR %eax, %eax" here?
>
>$ gcc -m64 -O3 -c dummy.c
>$ objdump -D dummy.o
>
>0000000000000000 <dummy>:
> 0: 48 83 c7 01 add $0x1,%rdi
> 4: b8 00 00 00 00 mov $0x0,%eax
> 9: 48 85 f6 test %rsi,%rsi
> c: 48 0f 45 c7 cmovne %rdi,%rax
> 10: c3 retq
>
>
>i386 and AMD64 use the ILP32 and LP64 data models, respectively, where a "long" and
>a "pointer" have the same size, and 0L and the null pointer have the
>same binary representation, so the contents of RSI should be used to
>load RAX with 0 conditionally:
>
>dummy:
> leaq 1(%rdi), %rax
> testq %rsi, %rsi
> cmoveq %rsi, %rax # count == 0: RSI is 0, i.e. the null pointer
> ret
>
>$ gcc -m32 -o- -O3 -S dummy.c
>
>_dummy:
> movl 8(%esp), %edx
> movl 4(%esp), %eax
> addl $1, %eax
> testl %edx, %edx
> movl $0, %edx
> cmove %edx, %eax # OUCH: if this executes, EDX was 0 before,
> ret # so the MOV is really a NOP!
>
>
>$ gcc -m32 -O3 -c dummy.c
>$ objdump -D dummy.o
>
>00000000 <_dummy>:
> 0: 8b 54 24 08 mov 0x8(%esp),%edx
> 4: 8b 44 24 04 mov 0x4(%esp),%eax
> 8: 83 c0 01 add $0x1,%eax
> b: 85 d2 test %edx,%edx
> d: ba 00 00 00 00 mov $0x0,%edx
> 12: 0f 44 c2 cmove %edx,%eax
> 15: c3 ret
>
>Here's what GCC should generate, but doesn't:
>
>00000000 <_dummy>:
> 0: 8b 44 24 04 mov 0x4(%esp),%eax
> 4: 8b 4c 24 08 mov 0x8(%esp),%ecx
> 8: 40 inc %eax
> 9: f7 d9 neg %ecx
> b: 19 c9 sbb %ecx,%ecx
> d: 21 c8 and %ecx,%eax
> f: c3 ret
>
>
>For (pre)historic processors which don't support CMOVcc the
>following code is generated:
>
>$ gcc -m32 -mtune=i386 -o- -S dummy.c
>
>_dummy:
> movl 8(%esp), %eax
> testl %eax, %eax
> je L3
> movl 4(%esp), %eax
> incl %eax
> ret
> .p2align 2
>L3: # OUCH: EAX is already 0 here!
> xorl %eax, %eax
> ret
>
>00000000 <dummy>:
> 0: 8b 44 24 08 mov 0x8(%esp),%eax
> 4: 85 c0 test %eax,%eax
> 6: 74 08 je 10 <dummy+0x10>
> 8: 8b 44 24 04 mov 0x4(%esp),%eax
> c: 40 inc %eax
> d: c3 ret
> e: 66 90 xchg %ax,%ax
> 10: 31 c0 xor %eax,%eax
> 12: c3 ret
>
>
>not amused
Patches welcome. You might want to file a Bugzilla report, which has a higher chance of being found after a while.
Richard.
>Stefan Kanthak
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2021-07-17 19:31 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-07-17 18:54 Optimiser failure for ternary foo == 0L ? NULL : bar; Stefan Kanthak
2021-07-17 19:31 ` Richard Biener
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).