public inbox for gcc@gcc.gnu.org
 help / color / mirror / Atom feed
* Re: C++ suffers on another inner loop
       [not found] <m0ySVsy-00008lC@mill>
@ 1998-04-23 21:12 ` Iain McClatchie
  0 siblings, 0 replies; 3+ messages in thread
From: Iain McClatchie @ 1998-04-23 21:12 UTC (permalink / raw)
  To: Nathan Myers; +Cc: egcs

Nathan> Try coding it like this:
Nathan> str_hte_t*
Nathan> probe_str_hte2( const char* str, str_hash* hashp )
Nathan> {
Nathan>   str_key_t key;
Nathan>   key.str = str;
Nathan>   str_hash_int* hash = (str_hash_int*) hashp;
Nathan>   return hash->probe_hte( key );
Nathan> }

That's the C wrapper... and the main problem I see is in the inner loop.
Still, I gave it a shot, and got exactly the same results as my posted
code (actually, two instructions were interchanged, with no effect on
speed).

The problem is that seperating the string comparison test into a
seperate inlined function causes egcs to explicitly represent the 
boolean return value in a register, rather than just fold it into
control
flow.  Beyond that, I'm not really clear on what's going on.

-Iain

^ permalink raw reply	[flat|nested] 3+ messages in thread

* C++ suffers on another inner loop
@ 1998-04-24 16:23 Iain McClatchie
  0 siblings, 0 replies; 3+ messages in thread
From: Iain McClatchie @ 1998-04-24 16:23 UTC (permalink / raw)
  To: egcs; +Cc: Nathan Myers

Ah.  I'm a blockhead.  Excuse me, Nathan, for being too square to
realize you were joking.

I did a little more poking around and reduced the test case
considerably.  As it turns out, the problem has nothing to do with
C++, templates, or anything like that.  It's just when the compiler
inlines a function, it turns off some low-level optimizations that
it would have done had the function been inlined by hand.

The actual problem that one sees depends on the boolean expression
inlined.  Here's a much simpler example:

typedef struct hte_t {
  struct hte_t  *next;
  const char    *str;
} hte_t;

hte_t *                                   lookup1__FP5hte_tPCc:       
lookup1( hte_t *list, const char *str )	          pushl %ebp          
{					          movl %esp,%ebp      
  for( ; list; list = list->next ) {	          movl 8(%ebp),%eax   
    if( list->str == str )		          movl 12(%ebp),%edx  
      return list;			          testl %eax,%eax     
  }					          je .L8              
  return list;				          .align 4            
}					  .L5:                        
					          cmpl %edx,4(%eax)   
					          je .L8              
					          movl (%eax),%eax    
					          testl %eax,%eax     
					          jne .L5             
					  .L8:                        
					          movl %ebp,%esp      
					          popl %ebp           
					          ret                 
					                            
inline int
match( const char *str1, const char *str2 )
{
  return ( str1 == str2 );
}

hte_t *                                   lookup2__FP5hte_tPCc:     
lookup2( hte_t *list, const char *str )	          pushl %ebp        
{					          movl %esp,%ebp    
  for( ; list; list = list->next ) {	          movl 8(%ebp),%eax 
    if( match( str, list->str ))	          movl 12(%ebp),%edx
      return list;			          .align 4          
  }					  .L11:                     
  return list;				          testl %eax,%eax   
}					          je .L17           
					          cmpl %edx,4(%eax) 
					          je .L17           
In lookup2, egcs should combine the               movl (%eax),%eax  
end-of-loop test with the branch backward.        jmp .L11          
It succeeds in lookup1 but fails here,            .align 4          
presumably because the match function	  .L17:                     
is inlined.  I think Jason Merrill checked        movl %ebp,%esp    
in a fix for this sort of thing in	          popl %ebp         
February.			          	  ret               

It seems that any simple inlined conditional will show the same
symptom.  If the inlined conditional is a compound conditional, things
get worse: the code builds up a boolean value only to branch on it:

hte_t *                                      lookup1__FP5hte_tPCc:    
lookup1( hte_t *list, const char *str )	             pushl %ebp       
{					             movl %esp,%ebp   
  for( ; list; list = list->next ) {	             movl 8(%ebp),%edx
    if(( list->str == str ) ||		             movl 12(%ebp),%ec
       ( list->next == (hte_t *) str ))	             testl %edx,%edx  
      return list;			             je .L3           
  }					             .align 4         
  return list;				     .L5:                     
}					             cmpl %ecx,4(%edx)
					             je .L3           
					             movl (%edx),%eax 
					             cmpl %ecx,%eax   
					             je .L3           
					             movl %eax,%edx   
					             testl %edx,%edx  
					             jne .L5          
					     .L3:                     
					             movl %edx,%eax   
					             movl %ebp,%esp   
					             popl %ebp        
					             ret              
					                              


inline int                                   lookup2__FP5hte_tPCc:    
match( hte_t *list, const char *str )	             pushl %ebp       
{					             movl %esp,%ebp   
  return(( list->str == str ) ||	             movl 8(%ebp),%eax
	 ( list->next == (hte_t *) str ));           movl 12(%ebp),%ec
}					             .align 4         
					     .L14:                    
hte_t *					             testl %eax,%eax  
lookup2( hte_t *list, const char *str )	             je .L22          
{					             xorl %edx,%edx   
  for( ; list; list = list->next ) {	             cmpl %ecx,4(%eax)
    if( match( list, str ))		             je .L19          
      return list;			             cmpl %ecx,(%eax) 
  }					             jne .L20         
  return list;				     .L19:                    
}					             movl $1,%edx     
					     .L20:                    
					             testl %edx,%edx  
					             jne .L22         
					             movl (%eax),%eax 
					             jmp .L14         
					             .align 4         
					     .L22:                    
					             movl %ebp,%esp   
					             popl %ebp        
					             ret              

-Iain McClatchie

^ permalink raw reply	[flat|nested] 3+ messages in thread

* C++ suffers on another inner loop
@ 1998-04-22  0:53 Iain McClatchie
  0 siblings, 0 replies; 3+ messages in thread
From: Iain McClatchie @ 1998-04-22  0:53 UTC (permalink / raw)
  To: egcs

First off, I'd like to thank you all once again for a great compiler.
I'm working on an EDA program which does all sorts of manipulations on
Binary Decision DAGs.  That means my life is filled with endless
variations on basically the same operation cache code, over and over
and over.

Last night, after realizing I was going to have to implement yet
another cache, I decided to try templatizing these things.  Normally,
I'd be quite reticent to try such a thing, since I already have
working C code which is fast, but I thought I'd compare a C++
implementation against my existing C implementation to see which comes
out ahead.

Short summary: EGCS correctly inlines and optimizes away nearly all
the abstraction garbage that I put in, and gives me code which runs
almost exactly as fast as my original implementation.  Now I've got
far fewer lines of source to deal with, and one place to change things
when I find a bug.  Thanks guys.

What follows is a short but detailed look at part of the C and C++
implementations, along with the assembly from a representative
function.  I've found one place where egcs compiling the C++ version
misses a helpful simplification in an inner loop, and I wonder if one
of you folks could tell me why.

C implementation:

inline unsigned long
str_hash_fn( const char *str )
{
  int              len = 0;
  unsigned long    acc = 0;

  for( ; *str != 0; str++, len++ ) {
    acc = ((acc << 3) | (acc >> 29)) + *str + len;
  }
  return acc * 0x9DD68AB5;
}

static inline str_hte_t *
lookup_str( const str_hash *hash, unsigned long hkey, const char *str )
{
  str_hte_t         *look = hash->table[ hkey >> hash->shift ];

//-----------------------------------
// Here's the interesting inner loop
  for( ; look != NULL; look = look->next ) {
    if(( look->str == str ) || ( strcmp( look->str, str ) == 0 )) {
      return look;
    }
  }
// We get reasonable code in C.
//-----------------------------------
  return NULL;
}

str_hte_t *
probe_str_hte( const char *str, const str_hash *hash )
{
  unsigned long  hkey = str_hash_fn( str );

  return lookup_str( hash, hkey, str );
}

probe_str_hte__FPCcP8str_hash:
	pushl %ebp
	movl %esp,%ebp
	pushl %edi
	pushl %esi
	pushl %ebx
	movl 8(%ebp),%esi
	movl 12(%ebp),%edi
	movl %esi,%ecx
	xorl %ebx,%ebx
	xorl %eax,%eax
	cmpb $0,(%esi)
	je .L205
	.align 4
.L207:
	roll $3,%eax
	movsbl (%ecx),%edx
	addl %edx,%eax
	addl %ebx,%eax
	incl %ecx
	incl %ebx
	cmpb $0,(%ecx)
	jne .L207
.L205:
	imull $-1646884171,%eax,%eax

	movl 16(%edi),%ecx
	shrl %cl,%eax
	movl 4(%edi),%edx
	movl (%edx,%eax,4),%ebx
	testl %ebx,%ebx
	je .L212
	.align 4
.L214:
	movl 4(%ebx),%eax        # Here's the code for that loop
	cmpl %esi,%eax
	je .L216
	pushl %esi
	pushl %eax
	call strcmp
	addl $8,%esp
	testl %eax,%eax
	jne .L213                # The sense of this jump should be
.L216:                           # switched so that we get one taken
	movl %ebx,%eax           # branch per loop iteration.  But
	jmp .L210                # this is pretty good code.
	.align 4
.L213:
	movl (%ebx),%ebx
	testl %ebx,%ebx
	jne .L214
.L212:
	xorl %eax,%eax
.L210:
	leal -12(%ebp),%esp
	popl %ebx
	popl %esi
	popl %edi
	movl %ebp,%esp
	popl %ebp
	ret

class str_key_t {
public:
  const char            *str;
  inline unsigned long   hash_fn( void ) const;
};

typedef hash<str_key_t, str_pair_t>      str_hash_t;

inline
unsigned long
str_key_t::hash_fn( void ) const
{
  const char      *s = str;
  int              len = 0;
  unsigned long    acc = 0;

  for( ; *s != 0; s++, len++ ) {
    acc = ((acc << 3) | (acc >> 29)) + *s + len;
  }
  return acc * 0x9DD68AB5;
}

static inline
int
match( const str_pair_t *pair, const str_key_t &key )
{
  return ( pair->str == key.str ) ||
    ( strcmp( pair->str, key.str ) == 0 );
}

template <class Key, class Pair>
inline hash<Key, Pair>::hte_t *
hash<Key, Pair>::lookup( unsigned long hkey, const Key &key )
{
  hash<Key,Pair>::hte_t   *look = table[ hkey >> shift ];

//-----------------------------------
// Here's the interesting inner loop
  for( ; look != NULL; look = look->next ) {
    if( match( look, key )) return look;
  }
// The code that we get from C++ doesn't do as well
//-----------------------------------
  return NULL;
}

template <class Key, class Pair>
inline Pair *
hash<Key,Pair>::probe_hte( const Key &key )
{
  unsigned long     hkey = key.hash_fn();

  return lookup( hkey, key );
}

str_hte_t *
probe_str_hte2( const char *str, str_hash *hashp )
{
  str_hash_int   *hash = (str_hash_int *) hashp;
  str_key_t       key;

  key.str = str;
  return hash->probe_hte( key );
}

Egcs-1.0.2 generates:

probe_str_hte:
	pushl %ebp
	movl %esp,%ebp
	subl $4,%esp             # extra inst versus C
	pushl %esi
	xorl %edx,%edx
	pushl %ebx
	movl 8(%ebp),%eax
	movl 12(%ebp),%esi
	movl %eax,-4(%ebp)       # egcs can't eliminate this dead write
	movl %eax,%ecx
	movl %edx,%ebx
	cmpb $0,(%ecx)
	je .L198
	.align 4
.L196:
	roll $3,%edx
	movsbl (%ecx),%eax
	addl %edx,%eax
	leal (%ebx,%eax),%edx
	incl %ecx
	incl %ebx
	cmpb $0,(%ecx)
	jne .L196
.L198:
	imull $-1646884171,%edx,%edx

	movl 16(%esi),%ecx
	shrl %cl,%edx
	movl 20(%esi),%eax
	movl (%eax,%edx,4),%ebx
	testl %ebx,%ebx
	je .L211
	.align 4
.L203:                         # here is the inner loop.
	xorl %esi,%esi         # %esi is going to be used to hold the
	movl -4(%ebp),%eax     # boolean value returned from the
	cmpl %eax,(%ebx)       # inlined match() function call.
	je .L204
	pushl %eax
	movl (%ebx),%eax
	pushl %eax
	call strcmp
	addl $8,%esp
	testl %eax,%eax        # here egcs uses a test-and-jump
	jne .L205              # sequence...
.L204:
	movl $1,%esi           # ..to generate a boolean value...
.L205:
	testl %esi,%esi        # ..which it then test and jumps!
	je .L208               # boolean is dead outside this test
	movl %ebx,%eax
	jmp .L209
	.align 4
.L208:
	movl 8(%ebx),%ebx
	testl %ebx,%ebx
	jne .L203
.L211:
	xorl %eax,%eax
.L209:
	leal -12(%ebp),%esp
	popl %ebx
	popl %esi
	movl %ebp,%esp
	popl %ebp
	ret

-Iain McClatchie

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~1998-04-24 16:23 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <m0ySVsy-00008lC@mill>
1998-04-23 21:12 ` C++ suffers on another inner loop Iain McClatchie
1998-04-24 16:23 Iain McClatchie
  -- strict thread matches above, loose matches on Subject: below --
1998-04-22  0:53 Iain McClatchie

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).