From mboxrd@z Thu Jan  1 00:00:00 1970
From: Iain McClatchie <iainmcc@ix.netcom.com>
To: egcs@cygnus.com
Subject: C++ suffers on another inner loop
Date: Wed, 22 Apr 1998 00:53:00 -0000
Message-id: <353D2DA1.389C6B67@ix.netcom.com>
X-SW-Source: 1998-04/msg00894.html

First off, I'd like to thank you all once again for a great compiler.
I'm working on an EDA program which does all sorts of manipulations on
Binary Decision DAGs.  That means my life is filled with endless
variations on basically the same operation cache code, over and over
and over.

Last night, after realizing I was going to have to implement yet
another cache, I decided to try templatizing these things.  Normally,
I'd be quite reticent to try such a thing, since I already have
working C code which is fast, but I thought I'd compare a C++
implementation against my existing C implementation to see which comes
out ahead.

Short summary: EGCS correctly inlines and optimizes away nearly all
the abstraction garbage that I put in, and gives me code which runs
almost exactly as fast as my original implementation.  Now I've got
far fewer lines of source to deal with, and one place to change things
when I find a bug.  Thanks guys.

What follows is a short but detailed look at part of the C and C++
implementations, along with the assembly from a representative
function.  I've found one place where egcs compiling the C++ version
misses a helpful simplification in an inner loop, and I wonder if one
of you folks could tell me why.

C implementation:

inline unsigned long
str_hash_fn( const char *str )
{
  int              len = 0;
  unsigned long    acc = 0;

  for( ; *str != 0; str++, len++ ) {
    acc = ((acc << 3) | (acc >> 29)) + *str + len;
  }
  return acc * 0x9DD68AB5;
}

static inline str_hte_t *
lookup_str( const str_hash *hash, unsigned long hkey, const char *str )
{
  str_hte_t         *look = hash->table[ hkey >> hash->shift ];

//-----------------------------------
// Here's the interesting inner loop
  for( ; look != NULL; look = look->next ) {
    if(( look->str == str ) || ( strcmp( look->str, str ) == 0 )) {
      return look;
    }
  }
// We get reasonable code in C.
//-----------------------------------
  return NULL;
}

str_hte_t *
probe_str_hte( const char *str, const str_hash *hash )
{
  unsigned long  hkey = str_hash_fn( str );

  return lookup_str( hash, hkey, str );
}

probe_str_hte__FPCcP8str_hash:
	pushl %ebp
	movl %esp,%ebp
	pushl %edi
	pushl %esi
	pushl %ebx
	movl 8(%ebp),%esi
	movl 12(%ebp),%edi
	movl %esi,%ecx
	xorl %ebx,%ebx
	xorl %eax,%eax
	cmpb $0,(%esi)
	je .L205
	.align 4
.L207:
	roll $3,%eax
	movsbl (%ecx),%edx
	addl %edx,%eax
	addl %ebx,%eax
	incl %ecx
	incl %ebx
	cmpb $0,(%ecx)
	jne .L207
.L205:
	imull $-1646884171,%eax,%eax

	movl 16(%edi),%ecx
	shrl %cl,%eax
	movl 4(%edi),%edx
	movl (%edx,%eax,4),%ebx
	testl %ebx,%ebx
	je .L212
	.align 4
.L214:
	movl 4(%ebx),%eax        # Here's the code for that loop
	cmpl %esi,%eax
	je .L216
	pushl %esi
	pushl %eax
	call strcmp
	addl $8,%esp
	testl %eax,%eax
	jne .L213                # The sense of this jump should be
.L216:                           # switched so that we get one taken
	movl %ebx,%eax           # branch per loop iteration.  But
	jmp .L210                # this is pretty good code.
	.align 4
.L213:
	movl (%ebx),%ebx
	testl %ebx,%ebx
	jne .L214
.L212:
	xorl %eax,%eax
.L210:
	leal -12(%ebp),%esp
	popl %ebx
	popl %esi
	popl %edi
	movl %ebp,%esp
	popl %ebp
	ret

class str_key_t {
public:
  const char            *str;
  inline unsigned long   hash_fn( void ) const;
};

typedef hash<str_key_t, str_pair_t>      str_hash_t;

inline
unsigned long
str_key_t::hash_fn( void ) const
{
  const char      *s = str;
  int              len = 0;
  unsigned long    acc = 0;

  for( ; *s != 0; s++, len++ ) {
    acc = ((acc << 3) | (acc >> 29)) + *s + len;
  }
  return acc * 0x9DD68AB5;
}

static inline
int
match( const str_pair_t *pair, const str_key_t &key )
{
  return ( pair->str == key.str ) ||
    ( strcmp( pair->str, key.str ) == 0 );
}

template <class Key, class Pair>
inline hash<Key, Pair>::hte_t *
hash<Key, Pair>::lookup( unsigned long hkey, const Key &key )
{
  hash<Key,Pair>::hte_t   *look = table[ hkey >> shift ];

//-----------------------------------
// Here's the interesting inner loop
  for( ; look != NULL; look = look->next ) {
    if( match( look, key )) return look;
  }
// The code that we get from C++ doesn't do as well
//-----------------------------------
  return NULL;
}

template <class Key, class Pair>
inline Pair *
hash<Key,Pair>::probe_hte( const Key &key )
{
  unsigned long     hkey = key.hash_fn();

  return lookup( hkey, key );
}

str_hte_t *
probe_str_hte2( const char *str, str_hash *hashp )
{
  str_hash_int   *hash = (str_hash_int *) hashp;
  str_key_t       key;

  key.str = str;
  return hash->probe_hte( key );
}

Egcs-1.0.2 generates:

probe_str_hte:
	pushl %ebp
	movl %esp,%ebp
	subl $4,%esp             # extra inst versus C
	pushl %esi
	xorl %edx,%edx
	pushl %ebx
	movl 8(%ebp),%eax
	movl 12(%ebp),%esi
	movl %eax,-4(%ebp)       # egcs can't eliminate this dead write
	movl %eax,%ecx
	movl %edx,%ebx
	cmpb $0,(%ecx)
	je .L198
	.align 4
.L196:
	roll $3,%edx
	movsbl (%ecx),%eax
	addl %edx,%eax
	leal (%ebx,%eax),%edx
	incl %ecx
	incl %ebx
	cmpb $0,(%ecx)
	jne .L196
.L198:
	imull $-1646884171,%edx,%edx

	movl 16(%esi),%ecx
	shrl %cl,%edx
	movl 20(%esi),%eax
	movl (%eax,%edx,4),%ebx
	testl %ebx,%ebx
	je .L211
	.align 4
.L203:                         # here is the inner loop.
	xorl %esi,%esi         # %esi is going to be used to hold the
	movl -4(%ebp),%eax     # boolean value returned from the
	cmpl %eax,(%ebx)       # inlined match() function call.
	je .L204
	pushl %eax
	movl (%ebx),%eax
	pushl %eax
	call strcmp
	addl $8,%esp
	testl %eax,%eax        # here egcs uses a test-and-jump
	jne .L205              # sequence...
.L204:
	movl $1,%esi           # ..to generate a boolean value...
.L205:
	testl %esi,%esi        # ..which it then test and jumps!
	je .L208               # boolean is dead outside this test
	movl %ebx,%eax
	jmp .L209
	.align 4
.L208:
	movl 8(%ebx),%ebx
	testl %ebx,%ebx
	jne .L203
.L211:
	xorl %eax,%eax
.L209:
	leal -12(%ebp),%esp
	popl %ebx
	popl %esi
	movl %ebp,%esp
	popl %ebp
	ret

-Iain McClatchie