public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug middle-end/63273] New: atomic operations lead to inefficient code
@ 2014-09-16 1:23 dvyukov at google dot com
2014-09-16 8:59 ` [Bug middle-end/63273] " rguenth at gcc dot gnu.org
` (5 more replies)
0 siblings, 6 replies; 7+ messages in thread
From: dvyukov at google dot com @ 2014-09-16 1:23 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63273
Bug ID: 63273
Summary: atomic operations lead to inefficient code
Product: gcc
Version: 5.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: middle-end
Assignee: unassigned at gcc dot gnu.org
Reporter: dvyukov at google dot com
Created attachment 33498
--> https://gcc.gnu.org/bugzilla/attachment.cgi?id=33498&action=edit
reproducer
gcc version 5.0.0 20140830 (experimental) (GCC)
Build the attached reproducer as:
$ g++ atomic.cc -std=c++11 -O3
Then disassemble the output. I would expect that all of test_nonatomic,
test_atomicops (hand rolled implementation of atomic operations), test_std and
test_intrinsic lead to generation of the same code. However, test_std and
test_intrinsic lead to generation of vastly inefficient code with lots of
unnecessary spills.
00000000004008a0 <_Z14test_nonatomicv>:
4008a0: mov 0x200b82(%rip),%eax # 601428 <a>
4008a6: addl $0x1,0x200b2b(%rip) # 6013d8 <x>
4008ad: add $0x1,%eax
4008b0: mov %eax,0x200b72(%rip) # 601428 <a>
4008b6: mov 0x200b68(%rip),%eax # 601424 <b>
4008bc: add $0x1,%eax
4008bf: mov %eax,0x200b5f(%rip) # 601424 <b>
4008c5: mov 0x200b55(%rip),%eax # 601420 <c>
4008cb: add $0x1,%eax
4008ce: mov %eax,0x200b4c(%rip) # 601420 <c>
4008d4: mov 0x200b42(%rip),%eax # 60141c <d>
4008da: add $0x1,%eax
4008dd: mov %eax,0x200b39(%rip) # 60141c <d>
4008e3: mov 0x200b2f(%rip),%eax # 601418 <e>
4008e9: add $0x1,%eax
4008ec: mov %eax,0x200b26(%rip) # 601418 <e>
4008f2: mov 0x200b1c(%rip),%eax # 601414 <f>
4008f8: add $0x1,%eax
4008fb: mov %eax,0x200b13(%rip) # 601414 <f>
400901: mov 0x200b09(%rip),%eax # 601410 <g>
400907: add $0x1,%eax
40090a: mov %eax,0x200b00(%rip) # 601410 <g>
400910: mov 0x200af6(%rip),%eax # 60140c <h>
400916: add $0x1,%eax
400919: mov %eax,0x200aed(%rip) # 60140c <h>
40091f: retq
0000000000400920 <_Z14test_atomicopsv>:
400920: mov 0x200ab2(%rip),%eax # 6013d8 <x>
400926: add $0x1,%eax
400929: mov %eax,0x200aa9(%rip) # 6013d8 <x>
40092f: mov 0x200af3(%rip),%eax # 601428 <a>
400935: add $0x1,%eax
400938: mov %eax,0x200aea(%rip) # 601428 <a>
40093e: mov 0x200ae0(%rip),%eax # 601424 <b>
400944: add $0x1,%eax
400947: mov %eax,0x200ad7(%rip) # 601424 <b>
40094d: mov 0x200acd(%rip),%eax # 601420 <c>
400953: add $0x1,%eax
400956: mov %eax,0x200ac4(%rip) # 601420 <c>
40095c: mov 0x200aba(%rip),%eax # 60141c <d>
400962: add $0x1,%eax
400965: mov %eax,0x200ab1(%rip) # 60141c <d>
40096b: mov 0x200aa7(%rip),%eax # 601418 <e>
400971: add $0x1,%eax
400974: mov %eax,0x200a9e(%rip) # 601418 <e>
40097a: mov 0x200a94(%rip),%eax # 601414 <f>
400980: add $0x1,%eax
400983: mov %eax,0x200a8b(%rip) # 601414 <f>
400989: mov 0x200a81(%rip),%eax # 601410 <g>
40098f: add $0x1,%eax
400992: mov %eax,0x200a78(%rip) # 601410 <g>
400998: mov 0x200a6e(%rip),%eax # 60140c <h>
40099e: add $0x1,%eax
4009a1: mov %eax,0x200a65(%rip) # 60140c <h>
4009a7: retq
4009a8: nopl 0x0(%rax,%rax,1)
00000000004009b0 <_Z8test_stdv>:
4009b0: push %r15
4009b2: mov 0x200a57(%rip),%r8d # 601410 <g>
4009b9: push %r14
4009bb: mov 0x200a4a(%rip),%r9d # 60140c <h>
4009c2: push %r13
4009c4: mov 0x200a25(%rip),%r10d # 6013f0 <g1>
4009cb: push %r12
4009cd: mov 0x200a18(%rip),%r11d # 6013ec <h1>
4009d4: push %rbp
4009d5: mov 0x200a39(%rip),%edi # 601414 <f>
4009db: push %rbx
4009dc: mov 0x200a36(%rip),%ebx # 601418 <e>
4009e2: mov 0x200a1f(%rip),%r15d # 601408 <a1>
4009e9: mov 0x200a14(%rip),%r14d # 601404 <b1>
4009f0: mov %r8d,-0x10(%rsp)
4009f5: mov %r9d,-0xc(%rsp)
4009fa: mov %ebx,-0x18(%rsp)
4009fe: mov %r10d,-0x8(%rsp)
400a03: mov %r11d,-0x4(%rsp)
400a08: mov %edi,-0x14(%rsp)
400a0c: mov 0x2009ed(%rip),%r13d # 601400 <c1>
400a13: mov 0x2009e2(%rip),%r12d # 6013fc <d1>
400a1a: mov 0x2009d8(%rip),%ebp # 6013f8 <e1>
400a20: mov 0x2009ce(%rip),%ebx # 6013f4 <f1>
400a26: mov 0x2009fc(%rip),%esi # 601428 <a>
400a2c: mov 0x2009f2(%rip),%ecx # 601424 <b>
400a32: mov 0x2009e8(%rip),%edx # 601420 <c>
400a38: mov 0x2009de(%rip),%eax # 60141c <d>
400a3e: mov 0x2009a3(%rip),%r11d # 6013e8 <a2>
400a45: mov 0x200998(%rip),%r10d # 6013e4 <b2>
400a4c: mov 0x20098d(%rip),%r9d # 6013e0 <c2>
400a53: mov 0x200982(%rip),%r8d # 6013dc <d2>
400a5a: add $0x1,%esi
400a5d: mov 0x200971(%rip),%edi # 6013d4 <std_x>
400a63: add $0x1,%eax
400a66: add $0x1,%ecx
400a69: add $0x1,%edx
400a6c: add $0x1,%edi
400a6f: mov %edi,0x20095f(%rip) # 6013d4 <std_x>
400a75: mov %eax,0x2009a1(%rip) # 60141c <d>
400a7b: mov -0x18(%rsp),%eax
400a7f: mov -0x14(%rsp),%edi
400a83: mov %r15d,0x20097e(%rip) # 601408 <a1>
400a8a: mov %r14d,0x200973(%rip) # 601404 <b1>
400a91: mov %r13d,0x200968(%rip) # 601400 <c1>
400a98: add $0x1,%eax
400a9b: mov %r12d,0x20095a(%rip) # 6013fc <d1>
400aa2: mov %ebp,0x200950(%rip) # 6013f8 <e1>
400aa8: mov %eax,0x20096a(%rip) # 601418 <e>
400aae: lea 0x1(%rdi),%eax
400ab1: mov %ebx,0x20093d(%rip) # 6013f4 <f1>
400ab7: mov %esi,0x20096b(%rip) # 601428 <a>
400abd: mov %ecx,0x200961(%rip) # 601424 <b>
400ac3: mov %eax,0x20094b(%rip) # 601414 <f>
400ac9: mov -0x10(%rsp),%eax
400acd: mov %edx,0x20094d(%rip) # 601420 <c>
400ad3: mov %r11d,0x20090e(%rip) # 6013e8 <a2>
400ada: mov %r10d,0x200903(%rip) # 6013e4 <b2>
400ae1: mov %r9d,0x2008f8(%rip) # 6013e0 <c2>
400ae8: add $0x1,%eax
400aeb: mov %r8d,0x2008ea(%rip) # 6013dc <d2>
400af2: mov %eax,0x200918(%rip) # 601410 <g>
400af8: mov -0xc(%rsp),%eax
400afc: add $0x1,%eax
400aff: mov %eax,0x200907(%rip) # 60140c <h>
400b05: mov -0x8(%rsp),%eax
400b09: mov %eax,0x2008e1(%rip) # 6013f0 <g1>
400b0f: mov -0x4(%rsp),%eax
400b13: pop %rbx
400b14: pop %rbp
400b15: mov %eax,0x2008d1(%rip) # 6013ec <h1>
400b1b: pop %r12
400b1d: pop %r13
400b1f: pop %r14
400b21: pop %r15
400b23: retq
400b24: data32 data32 nopw %cs:0x0(%rax,%rax,1)
0000000000400b30 <_Z14test_intrinsicv>:
400b30: push %r15
400b32: mov 0x2008d7(%rip),%r8d # 601410 <g>
400b39: push %r14
400b3b: mov 0x2008ca(%rip),%r9d # 60140c <h>
400b42: push %r13
400b44: mov 0x2008a5(%rip),%r10d # 6013f0 <g1>
400b4b: push %r12
400b4d: mov 0x200898(%rip),%r11d # 6013ec <h1>
400b54: push %rbp
400b55: mov 0x2008b9(%rip),%edi # 601414 <f>
400b5b: push %rbx
400b5c: mov 0x2008b6(%rip),%ebx # 601418 <e>
400b62: mov 0x20089f(%rip),%r15d # 601408 <a1>
400b69: mov 0x200894(%rip),%r14d # 601404 <b1>
400b70: mov %r8d,-0x10(%rsp)
400b75: mov %r9d,-0xc(%rsp)
400b7a: mov %ebx,-0x18(%rsp)
400b7e: mov %r10d,-0x8(%rsp)
400b83: mov %r11d,-0x4(%rsp)
400b88: mov %edi,-0x14(%rsp)
400b8c: mov 0x20086d(%rip),%r13d # 601400 <c1>
400b93: mov 0x200862(%rip),%r12d # 6013fc <d1>
400b9a: mov 0x200858(%rip),%ebp # 6013f8 <e1>
400ba0: mov 0x20084e(%rip),%ebx # 6013f4 <f1>
400ba6: mov 0x20087c(%rip),%esi # 601428 <a>
400bac: mov 0x200872(%rip),%ecx # 601424 <b>
400bb2: mov 0x200868(%rip),%edx # 601420 <c>
400bb8: mov 0x20085e(%rip),%eax # 60141c <d>
400bbe: mov 0x200823(%rip),%r11d # 6013e8 <a2>
400bc5: mov 0x200818(%rip),%r10d # 6013e4 <b2>
400bcc: mov 0x20080d(%rip),%r9d # 6013e0 <c2>
400bd3: mov 0x200802(%rip),%r8d # 6013dc <d2>
400bda: add $0x1,%esi
400bdd: mov 0x2007f5(%rip),%edi # 6013d8 <x>
400be3: add $0x1,%eax
400be6: add $0x1,%ecx
400be9: add $0x1,%edx
400bec: add $0x1,%edi
400bef: mov %edi,0x2007e3(%rip) # 6013d8 <x>
400bf5: mov %eax,0x200821(%rip) # 60141c <d>
400bfb: mov -0x18(%rsp),%eax
400bff: mov -0x14(%rsp),%edi
400c03: mov %r15d,0x2007fe(%rip) # 601408 <a1>
400c0a: mov %r14d,0x2007f3(%rip) # 601404 <b1>
400c11: mov %r13d,0x2007e8(%rip) # 601400 <c1>
400c18: add $0x1,%eax
400c1b: mov %r12d,0x2007da(%rip) # 6013fc <d1>
400c22: mov %ebp,0x2007d0(%rip) # 6013f8 <e1>
400c28: mov %eax,0x2007ea(%rip) # 601418 <e>
400c2e: lea 0x1(%rdi),%eax
400c31: mov %ebx,0x2007bd(%rip) # 6013f4 <f1>
400c37: mov %esi,0x2007eb(%rip) # 601428 <a>
400c3d: mov %ecx,0x2007e1(%rip) # 601424 <b>
400c43: mov %eax,0x2007cb(%rip) # 601414 <f>
400c49: mov -0x10(%rsp),%eax
400c4d: mov %edx,0x2007cd(%rip) # 601420 <c>
400c53: mov %r11d,0x20078e(%rip) # 6013e8 <a2>
400c5a: mov %r10d,0x200783(%rip) # 6013e4 <b2>
400c61: mov %r9d,0x200778(%rip) # 6013e0 <c2>
400c68: add $0x1,%eax
400c6b: mov %r8d,0x20076a(%rip) # 6013dc <d2>
400c72: mov %eax,0x200798(%rip) # 601410 <g>
400c78: mov -0xc(%rsp),%eax
400c7c: add $0x1,%eax
400c7f: mov %eax,0x200787(%rip) # 60140c <h>
400c85: mov -0x8(%rsp),%eax
400c89: mov %eax,0x200761(%rip) # 6013f0 <g1>
400c8f: mov -0x4(%rsp),%eax
400c93: pop %rbx
400c94: pop %rbp
400c95: mov %eax,0x200751(%rip) # 6013ec <h1>
400c9b: pop %r12
400c9d: pop %r13
400c9f: pop %r14
400ca1: pop %r15
400ca3: retq
^ permalink raw reply [flat|nested] 7+ messages in thread
* [Bug middle-end/63273] atomic operations lead to inefficient code
2014-09-16 1:23 [Bug middle-end/63273] New: atomic operations lead to inefficient code dvyukov at google dot com
@ 2014-09-16 8:59 ` rguenth at gcc dot gnu.org
2014-09-16 17:09 ` dvyukov at google dot com
` (4 subsequent siblings)
5 siblings, 0 replies; 7+ messages in thread
From: rguenth at gcc dot gnu.org @ 2014-09-16 8:59 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63273
Richard Biener <rguenth at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Status|UNCONFIRMED |NEW
Last reconfirmed| |2014-09-16
Ever confirmed|0 |1
--- Comment #1 from Richard Biener <rguenth at gcc dot gnu.org> ---
That's simply because the middle-end (alias analysis) doesn't know anything
about
the atomic builtins.
Which is because to me it's not exactly clear as for what other operations an
atomic load/store is a barrier for. [and for the middle-end "barrier" means
possibly clobbering/using]
Note that we expand the loads/stores to MEMs using ALIAS_SET_MEMORY_BARRIER
which means it conflicts with any other load/store on RTL (which matches
the behavior seen on GIMPLE).
In addition to that on GIMPLE the ATOMIC builtins make the input pointers
escape (that's surely not required), but they are usually globals anyway.
;; _52 = __atomic_load_4 (&MEM[(const struct __atomic_base *)&std_x]._M_i, 0);
(insn 25 24 0 (set (reg:SI 112 [ D.37281 ])
(mem/v:SI (symbol_ref:DI ("std_x") [flags 0x2] <var_decl
0x7ffff4c5ad10 std_x>) [-1 S4 A32]))
/usr/include/c++/4.9/bits/atomic_base.h:500 -1
(nil))
Confirmed. But eventually that's by design to make locking primitives work
(where we may not move any loads/stores into/out of the protected region).
^ permalink raw reply [flat|nested] 7+ messages in thread
* [Bug middle-end/63273] atomic operations lead to inefficient code
2014-09-16 1:23 [Bug middle-end/63273] New: atomic operations lead to inefficient code dvyukov at google dot com
2014-09-16 8:59 ` [Bug middle-end/63273] " rguenth at gcc dot gnu.org
@ 2014-09-16 17:09 ` dvyukov at google dot com
2014-09-16 17:59 ` amacleod at redhat dot com
` (3 subsequent siblings)
5 siblings, 0 replies; 7+ messages in thread
From: dvyukov at google dot com @ 2014-09-16 17:09 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63273
--- Comment #2 from Dmitry Vyukov <dvyukov at google dot com> ---
> Which is because to me it's not exactly clear as for what other operations an
atomic load/store is a barrier for.
That's trivial to answer -- memory_order_relaxed is a barrier for nothing.
> But eventually that's by design to make locking primitives work
(where we may not move any loads/stores into/out of the protected region).
That's a very bad design for a production compiler. Locking primitives will
necessarily include memory_order_acquire/release atomics, and they will prevent
unwanted code movement. Relaxed atomics must not prevent any code movement and
also must not clobber anything. Similarly, memory_order_release should generally
not prevent hoisting of loads above it.
For the record, this is just fixed in clang compiler (did not test yet,
though):
http://llvm.org/bugs/show_bug.cgi?id=17281
^ permalink raw reply [flat|nested] 7+ messages in thread
* [Bug middle-end/63273] atomic operations lead to inefficient code
2014-09-16 1:23 [Bug middle-end/63273] New: atomic operations lead to inefficient code dvyukov at google dot com
2014-09-16 8:59 ` [Bug middle-end/63273] " rguenth at gcc dot gnu.org
2014-09-16 17:09 ` dvyukov at google dot com
@ 2014-09-16 17:59 ` amacleod at redhat dot com
2014-09-16 18:12 ` dvyukov at google dot com
` (2 subsequent siblings)
5 siblings, 0 replies; 7+ messages in thread
From: amacleod at redhat dot com @ 2014-09-16 17:59 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63273
Andrew Macleod <amacleod at redhat dot com> changed:
What |Removed |Added
----------------------------------------------------------------------------
CC| |amacleod at redhat dot com
--- Comment #3 from Andrew Macleod <amacleod at redhat dot com> ---
GCC atomic implementation is intended to be correct first, and expose efficient
implementations for each of the memory models for the builtins. No one has
had time to venture into the optimization of these atomic operations, although
I drew up plans once upon a time.
(https://gcc.gnu.org/wiki/Atomic/GCCMM/Optimizations)
It's simply not clear, given the nature of how atomics are normally used, that it's
worth the time to heavily optimize in and around them for real world test
cases. It's on the back-burner until such time that it is deemed to be
worthwhile.
Anyone who wishes to work on this is welcome to have at it...
^ permalink raw reply [flat|nested] 7+ messages in thread
* [Bug middle-end/63273] atomic operations lead to inefficient code
2014-09-16 1:23 [Bug middle-end/63273] New: atomic operations lead to inefficient code dvyukov at google dot com
` (2 preceding siblings ...)
2014-09-16 17:59 ` amacleod at redhat dot com
@ 2014-09-16 18:12 ` dvyukov at google dot com
2014-09-16 19:22 ` amacleod at redhat dot com
2014-09-16 19:57 ` dvyukov at google dot com
5 siblings, 0 replies; 7+ messages in thread
From: dvyukov at google dot com @ 2014-09-16 18:12 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63273
--- Comment #4 from Dmitry Vyukov <dvyukov at google dot com> ---
For the record, this bug is the result of my attempt to use std atomic
operations in ThreadSanitizer runtime. We do a bunch of relaxed loads and
stores during processing of every memory access in the target program. Usage of
std atomics instead of hand-rolled atomics has decreased end-to-end performance
by 30%; as a result we had to reject usage of std atomics and keep our own
implementation.
^ permalink raw reply [flat|nested] 7+ messages in thread
* [Bug middle-end/63273] atomic operations lead to inefficient code
2014-09-16 1:23 [Bug middle-end/63273] New: atomic operations lead to inefficient code dvyukov at google dot com
` (3 preceding siblings ...)
2014-09-16 18:12 ` dvyukov at google dot com
@ 2014-09-16 19:22 ` amacleod at redhat dot com
2014-09-16 19:57 ` dvyukov at google dot com
5 siblings, 0 replies; 7+ messages in thread
From: amacleod at redhat dot com @ 2014-09-16 19:22 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63273
--- Comment #5 from Andrew Macleod <amacleod at redhat dot com> ---
Do you have a test case which shows what specifically is the issue? I suspect
it's different from the included test case and it would be interesting to see
the real world situation. It may identify something easier to fix than
optimization in general.
^ permalink raw reply [flat|nested] 7+ messages in thread
* [Bug middle-end/63273] atomic operations lead to inefficient code
2014-09-16 1:23 [Bug middle-end/63273] New: atomic operations lead to inefficient code dvyukov at google dot com
` (4 preceding siblings ...)
2014-09-16 19:22 ` amacleod at redhat dot com
@ 2014-09-16 19:57 ` dvyukov at google dot com
5 siblings, 0 replies; 7+ messages in thread
From: dvyukov at google dot com @ 2014-09-16 19:57 UTC (permalink / raw)
To: gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63273
--- Comment #6 from Dmitry Vyukov <dvyukov at google dot com> ---
The real world situation is:
replace atomic_load/store implementation with relaxed atomic builtins here:
https://gcc.gnu.org/viewcvs/gcc/trunk/libsanitizer/sanitizer_common/sanitizer_atomic_clang_x86.h?view=markup
then build tsan runtime and compare generated code for __tsan_read8 before and
after the change. They should be roughly identical.
^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2014-09-16 19:57 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-09-16 1:23 [Bug middle-end/63273] New: atomic operations lead to inefficient code dvyukov at google dot com
2014-09-16 8:59 ` [Bug middle-end/63273] " rguenth at gcc dot gnu.org
2014-09-16 17:09 ` dvyukov at google dot com
2014-09-16 17:59 ` amacleod at redhat dot com
2014-09-16 18:12 ` dvyukov at google dot com
2014-09-16 19:22 ` amacleod at redhat dot com
2014-09-16 19:57 ` dvyukov at google dot com
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).