public inbox for gcc@gcc.gnu.org
 help / color / mirror / Atom feed
* [4.3] Invalid code or invalid optimisation?
@ 2009-06-04  1:39 Dave Korn
  2009-06-04  8:27 ` Andrew Haley
  0 siblings, 1 reply; 5+ messages in thread
From: Dave Korn @ 2009-06-04  1:39 UTC (permalink / raw)
  To: gcc

[-- Attachment #1: Type: text/plain, Size: 4048 bytes --]


    Good morning everyone,

  I have an interesting situation.  In this bit of code below, extracted from
a simple testcase, I have a singly-linked list:

------------------------------<snip!>------------------------------
template <class list_node> class List
{
 public:
  List() : head(__null)
  {
  }
  void insert (list_node *node)
  {
    List_insert (head, node);
  }
  list_node *head;
};

class pthread_mutex: public verifyable_object
{
public:
        [ ... data members ... ]
  pthread_mutex (pthread_mutexattr * = __null);
  ~pthread_mutex ();

  class pthread_mutex * next;

private:
  static List<pthread_mutex> mutexes;
};

List<pthread_mutex> pthread_mutex::mutexes;

pthread_mutex::pthread_mutex (pthread_mutexattr *attr) :
        [ ... member initialisers ... ]
{
        [ ... code ... ]
  mutexes.insert (this);
}
------------------------------<snip!>------------------------------

  I am getting unexpected results in the inlined List.insert operation in the
pthread_mutex constructor.  The critical part of the code uses an inlined
interlocked compare-and-exchange asm, that looks like this:

------------------------------<snip!>------------------------------
extern __inline__ long
ilockcmpexch (volatile long *t, long v, long c)
{
  return ({
		__typeof (*t) ret;
		__asm __volatile ("lock cmpxchgl %2, %1"
			: "=a" (ret), "=m" (*t)
			: "r" (v), "m" (*t), "0" (c));
		ret;
	});
}

template <class list_node> inline void
List_insert (list_node *&head, list_node *node)
{
  if (!node)
    return;
  do
    node->next = head;
  while ((PVOID)ilockcmpexch((LONG volatile
*)(&head),(LONG)(node),(LONG)(node->next)) != node->next);
}
------------------------------<snip!>------------------------------

  To my surprise, GCCs 4.3.2 and 4.3.3 at -O2 both sink the store to
node->next after the call to ilockcmpexch:

------------------------------<snip!>------------------------------
__ZN13pthread_mutexC1EP17pthread_mutexattr:
        [ ... code ... ]
L15:
	movl	__ZN13pthread_mutex7mutexesE, %edx	 # mutexes.head, D.1991
	movl	%edx, %eax	 # D.1991, tmp69
/APP
 # 35 "mxfull.cpp" 1
	lock cmpxchgl %esi, __ZN13pthread_mutex7mutexesE	 # this,
 # 0 "" 2
/NO_APP
	movl	%eax, -12(%ebp)	 # tmp69, ret
	movl	-12(%ebp), %eax	 # ret, D.1988
	cmpl	%eax, %edx	 # D.1988, D.1991
	jne	L15	 #,
	movl	%edx, 36(%esi)	 # D.1991, <variable>.next
------------------------------<snip!>------------------------------

  This is obviously bad news for the consistency of the list; the value of
'head' is cached in %edx and not written to node->next until after the
ilockcmpexch inline, meaning an incompletely-constructed node gets linked on
the front of the chain for a window of several instructions.  By contrast, a
recent build from head does what I want: it writes to node->next in front of
the ilockcmpexch, and only tests its value afterward:

------------------------------<snip!>------------------------------
__ZN13pthread_mutexC2EP17pthread_mutexattr:
        [ ... code ... ]
L9:
	movl	__ZN13pthread_mutex7mutexesE, %eax	 # mutexes.head, D.2119
	movl	%eax, 36(%ebx)	 # D.2119, <variable>.next
/APP
 # 35 "mxfull.cpp" 1
	lock cmpxchgl %ebx, __ZN13pthread_mutex7mutexesE	 # this,
 # 0 "" 2
/NO_APP
	movl	%eax, -12(%ebp)	 # tmp79, ret
	movl	-12(%ebp), %eax	 # ret, D.2120
	cmpl	%eax, 36(%ebx)	 # D.2120, <variable>.next
	jne	L9	 #,
------------------------------<snip!>------------------------------

  Adding a "memory" clobber to the inline asm works around the problem,
causing 4.3 series to generate the same assembly as head, but I think it's a
sledgehammer approach.  Am I asking too much of GCC to not sink the store, or
is 4.3 doing something wrong?  I /think/ that the fact that there's a volatile
store in ilockcmpexch means the earlier store shouldn't be moved past it, and
that GCC is perhaps missing that the asm's output operand effectively
represents a volatile write through *t, but I could be misunderstanding the
rules about volatile.  Anyone got their language lawyer's hat on at the moment?

    cheers,
      DaveK


[-- Attachment #2: mxfull.cpp --]
[-- Type: text/plain, Size: 2369 bytes --]


// g++ -c mxfull.cpp -o mxfull.o --save-temps -O2 -fverbose-asm

typedef long LONG;
typedef void *HANDLE;
typedef void *PVOID;
typedef char *LPCSTR;

typedef class pthread_mutex *pthread_mutex_t;
typedef class pthread_mutexattr *pthread_mutexattr_t;
typedef class pthread *pthread_t;

struct SECURITY_ATTRIBUTES;
typedef struct SECURITY_ATTRIBUTES *LPSECURITY_ATTRIBUTES;
extern struct SECURITY_ATTRIBUTES sec_none_nih;

HANDLE __attribute__((__stdcall__)) CreateSemaphoreA(LPSECURITY_ATTRIBUTES,LONG,LONG,LPCSTR);

class verifyable_object
{
public:
  long magic;

  verifyable_object (long);
  virtual ~verifyable_object ();
};

extern __inline__ long
ilockcmpexch (volatile long *t, long v, long c)
{
  return ({
		__typeof (*t) ret;
		__asm __volatile ("lock cmpxchgl %2, %1"
			: "=a" (ret), "=m" (*t)
			: "r" (v), "m" (*t), "0" (c));
		ret;
	});
}

class pthread_mutexattr: public verifyable_object
{
public:
  int pshared;
  int mutextype;
  pthread_mutexattr ();
  ~pthread_mutexattr ();
};

template <class list_node> inline void
List_insert (list_node *&head, list_node *node)
{
  if (!node)
    return;
  do
    node->next = head;
  while ((PVOID)ilockcmpexch((LONG volatile *)(&head),(LONG)(node),(LONG)(node->next)) != node->next);
}

template <class list_node> class List
{
 public:
  List() : head(__null)
  {
  }

  ~List()
  {
  }

  void insert (list_node *node)
  {
    List_insert (head, node);
  }

  list_node *head;

};

class pthread_mutex: public verifyable_object
{
public:

  unsigned long lock_counter;
  HANDLE win32_obj_id;
  unsigned int recursion_counter;
  LONG condwaits;
  pthread_t owner;



  int type;
  int pshared;


  pthread_mutex (pthread_mutexattr * = __null);
  ~pthread_mutex ();

  class pthread_mutex * next;

private:
  static List<pthread_mutex> mutexes;
};

List<pthread_mutex> pthread_mutex::mutexes;

pthread_mutex::pthread_mutex (pthread_mutexattr *attr) :
  verifyable_object (0xdf0df045 +1),
  lock_counter (0),
  win32_obj_id (__null), recursion_counter (0),
  condwaits (0), owner (__null),



  type (1),
  pshared (0)
{
  win32_obj_id = ::CreateSemaphoreA (&sec_none_nih, 0, 2147483647L, __null);
  if (!win32_obj_id)
    {
      magic = 0;
      return;
    }

  if (attr)
    {
      if (attr->pshared == 1)
 {

   magic = 0;
   return;
 }

      type = attr->mutextype;
    }

  mutexes.insert (this);
}


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [4.3] Invalid code or invalid optimisation?
  2009-06-04  1:39 [4.3] Invalid code or invalid optimisation? Dave Korn
@ 2009-06-04  8:27 ` Andrew Haley
  2009-06-04  9:22   ` Dave Korn
  0 siblings, 1 reply; 5+ messages in thread
From: Andrew Haley @ 2009-06-04  8:27 UTC (permalink / raw)
  To: Dave Korn; +Cc: gcc

Dave Korn wrote:

>   Adding a "memory" clobber to the inline asm works around the problem,
> causing 4.3 series to generate the same assembly as head, but I think it's a
> sledgehammer approach.  Am I asking too much of GCC to not sink the store, or
> is 4.3 doing something wrong?  I /think/ that the fact that there's a volatile
> store in ilockcmpexch means the earlier store shouldn't be moved past it, and
> that GCC is perhaps missing that the asm's output operand effectively
> represents a volatile write through *t, but I could be misunderstanding the
> rules about volatile.  Anyone got their language lawyer's hat on at the moment?

You could just look at the standard, y'know.  Volatile stores only block other
volatile stores: they don't block *all* stores.  If you really want a complete
memory barrier, which in a mutex you surely do, then you're going to have to
clobber memory.

Andrew.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [4.3] Invalid code or invalid optimisation?
  2009-06-04  8:27 ` Andrew Haley
@ 2009-06-04  9:22   ` Dave Korn
  2009-06-04  9:48     ` Dave Korn
  2009-06-04  9:57     ` Andrew Haley
  0 siblings, 2 replies; 5+ messages in thread
From: Dave Korn @ 2009-06-04  9:22 UTC (permalink / raw)
  To: Andrew Haley; +Cc: Dave Korn, gcc

Andrew Haley wrote:
> Dave Korn wrote:
> 
>>   Adding a "memory" clobber to the inline asm works around the problem,
>> causing 4.3 series to generate the same assembly as head, but I think it's a
>> sledgehammer approach.  Am I asking too much of GCC to not sink the store, or
>> is 4.3 doing something wrong?  I /think/ that the fact that there's a volatile
>> store in ilockcmpexch means the earlier store shouldn't be moved past it, and
>> that GCC is perhaps missing that the asm's output operand effectively
>> represents a volatile write through *t, but I could be misunderstanding the
>> rules about volatile.  Anyone got their language lawyer's hat on at the moment?
> 
> You could just look at the standard, y'know.

  I did but I get as far as all that stuff about "The party of the third part
shall be known herein as the party of the third part" and find it very
difficult to know whether what it looks like it says actually is what it is
trying to say.

>  Volatile stores only block other
> volatile stores: they don't block *all* stores.  If you really want a complete
> memory barrier, which in a mutex you surely do, then you're going to have to
> clobber memory.

  Ah.  That suggests that HEAD is in fact _missing_ an optimisation that 4.3
gets right.  Maybe I should file a PR after all.

    cheers,
      DaveK

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [4.3] Invalid code or invalid optimisation?
  2009-06-04  9:22   ` Dave Korn
@ 2009-06-04  9:48     ` Dave Korn
  2009-06-04  9:57     ` Andrew Haley
  1 sibling, 0 replies; 5+ messages in thread
From: Dave Korn @ 2009-06-04  9:48 UTC (permalink / raw)
  To: Dave Korn; +Cc: Andrew Haley, gcc

Dave Korn wrote:
> Andrew Haley wrote:

>>  Volatile stores only block other
>> volatile stores: they don't block *all* stores.  If you really want a complete
>> memory barrier, which in a mutex you surely do, then you're going to have to
>> clobber memory.
> 
>   Ah.  That suggests that HEAD is in fact _missing_ an optimisation that 4.3
> gets right.  Maybe I should file a PR after all.

  It also suggests that the register motion is fairly gratuitous, I think.

	movl	%eax, -12(%ebp)	 # tmp79, ret
	movl	-12(%ebp), %eax	 # ret, D.2120

  Given that ret is a local variable that goes immediately out of scope, I
can't see any reason to update the stack slot.  I can prevent this happening
by declaring the temporary as 'register __typeof (*t) ret __asm ("%eax");' but
doesn't this mean we're missing a trick here between some combination of
regalloc, copyprop and dead-store elimination?

    cheers,
      DaveK

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [4.3] Invalid code or invalid optimisation?
  2009-06-04  9:22   ` Dave Korn
  2009-06-04  9:48     ` Dave Korn
@ 2009-06-04  9:57     ` Andrew Haley
  1 sibling, 0 replies; 5+ messages in thread
From: Andrew Haley @ 2009-06-04  9:57 UTC (permalink / raw)
  To: Dave Korn; +Cc: gcc

Dave Korn wrote:
> Andrew Haley wrote:
>> Dave Korn wrote:
>>
>>>   Adding a "memory" clobber to the inline asm works around the problem,
>>> causing 4.3 series to generate the same assembly as head, but I think it's a
>>> sledgehammer approach.  Am I asking too much of GCC to not sink the store, or
>>> is 4.3 doing something wrong?  I /think/ that the fact that there's a volatile
>>> store in ilockcmpexch means the earlier store shouldn't be moved past it, and
>>> that GCC is perhaps missing that the asm's output operand effectively
>>> represents a volatile write through *t, but I could be misunderstanding the
>>> rules about volatile.  Anyone got their language lawyer's hat on at the moment?
>> You could just look at the standard, y'know.
>
>   I did but I get as far as all that stuff about "The party of the third part
> shall be known herein as the party of the third part" and find it very
> difficult to know whether what it looks like it says actually is what it is
> trying to say.

It's just a matter of learning the language in which it's written, and
that's like learning a programming language.  If you look at 6.7.3
Para 6 it says

"... An object that has volatile-qualified type may be modified in ways
unknown to the implementation or have other unknown side effects.
Therefore any expression referring to such an object shall be
evaluated strictly according to the rules of the abstract machine, as
described in 5.1.2.3.  Furthermore, at every sequence point the value
last stored in the object shall agree with that prescribed by the
abstract machine, except as modified by the unknown factors mentioned
previously..."

Now, this is admittedly gobbledygook, but it's important to note that
it only says that volatile stores may not be moved across sequence
points.  It does *not* say that other stores may not be moved across
volatile stores, which is what you were hoping for.

We already discussed this subject before: see
http://gcc.gnu.org/ml/gcc/2007-10/msg00477.html
and the very long thread that followed.

>> Volatile stores only block other volatile stores: they don't block
>> *all* stores.  If you really want a complete memory barrier, which
>> in a mutex you surely do, then you're going to have to clobber
>> memory.
>
>   Ah.  That suggests that HEAD is in fact _missing_ an optimisation
> that 4.3 gets right.  Maybe I should file a PR after all.

Maybe.

Andrew.

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2009-06-04  9:57 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-06-04  1:39 [4.3] Invalid code or invalid optimisation? Dave Korn
2009-06-04  8:27 ` Andrew Haley
2009-06-04  9:22   ` Dave Korn
2009-06-04  9:48     ` Dave Korn
2009-06-04  9:57     ` Andrew Haley

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).