From mboxrd@z Thu Jan 1 00:00:00 1970 From: Iain McClatchie To: egcs@cygnus.com Subject: C++ suffers on another inner loop Date: Wed, 22 Apr 1998 00:53:00 -0000 Message-id: <353D2DA1.389C6B67@ix.netcom.com> X-SW-Source: 1998-04/msg00894.html First off, I'd like to thank you all once again for a great compiler. I'm working on an EDA program which does all sorts of manipulations on Binary Decision DAGs. That means my life is filled with endless variations on basically the same operation cache code, over and over and over. Last night, after realizing I was going to have to implement yet another cache, I decided to try templatizing these things. Normally, I'd be quite reticent to try such a thing, since I already have working C code which is fast, but I thought I'd compare a C++ implementation against my existing C implementation to see which comes out ahead. Short summary: EGCS correctly inlines and optimizes away nearly all the abstraction garbage that I put in, and gives me code which runs almost exactly as fast as my original implementation. Now I've got far fewer lines of source to deal with, and one place to change things when I find a bug. Thanks guys. What follows is a short but detailed look at part of the C and C++ implementations, along with the assembly from a representative function. I've found one place where egcs compiling the C++ version misses a helpful simplification in an inner loop, and I wonder if one of you folks could tell me why. C implementation: inline unsigned long str_hash_fn( const char *str ) { int len = 0; unsigned long acc = 0; for( ; *str != 0; str++, len++ ) { acc = ((acc << 3) | (acc >> 29)) + *str + len; } return acc * 0x9DD68AB5; } static inline str_hte_t * lookup_str( const str_hash *hash, unsigned long hkey, const char *str ) { str_hte_t *look = hash->table[ hkey >> hash->shift ]; //----------------------------------- // Here's the interesting inner loop for( ; look != NULL; look = look->next ) { if(( look->str == str ) || ( strcmp( look->str, str ) == 0 )) { return look; } } // We get reasonable code in C. //----------------------------------- return NULL; } str_hte_t * probe_str_hte( const char *str, const str_hash *hash ) { unsigned long hkey = str_hash_fn( str ); return lookup_str( hash, hkey, str ); } probe_str_hte__FPCcP8str_hash: pushl %ebp movl %esp,%ebp pushl %edi pushl %esi pushl %ebx movl 8(%ebp),%esi movl 12(%ebp),%edi movl %esi,%ecx xorl %ebx,%ebx xorl %eax,%eax cmpb $0,(%esi) je .L205 .align 4 .L207: roll $3,%eax movsbl (%ecx),%edx addl %edx,%eax addl %ebx,%eax incl %ecx incl %ebx cmpb $0,(%ecx) jne .L207 .L205: imull $-1646884171,%eax,%eax movl 16(%edi),%ecx shrl %cl,%eax movl 4(%edi),%edx movl (%edx,%eax,4),%ebx testl %ebx,%ebx je .L212 .align 4 .L214: movl 4(%ebx),%eax # Here's the code for that loop cmpl %esi,%eax je .L216 pushl %esi pushl %eax call strcmp addl $8,%esp testl %eax,%eax jne .L213 # The sense of this jump should be .L216: # switched so that we get one taken movl %ebx,%eax # branch per loop iteration. But jmp .L210 # this is pretty good code. .align 4 .L213: movl (%ebx),%ebx testl %ebx,%ebx jne .L214 .L212: xorl %eax,%eax .L210: leal -12(%ebp),%esp popl %ebx popl %esi popl %edi movl %ebp,%esp popl %ebp ret class str_key_t { public: const char *str; inline unsigned long hash_fn( void ) const; }; typedef hash str_hash_t; inline unsigned long str_key_t::hash_fn( void ) const { const char *s = str; int len = 0; unsigned long acc = 0; for( ; *s != 0; s++, len++ ) { acc = ((acc << 3) | (acc >> 29)) + *s + len; } return acc * 0x9DD68AB5; } static inline int match( const str_pair_t *pair, const str_key_t &key ) { return ( pair->str == key.str ) || ( strcmp( pair->str, key.str ) == 0 ); } template inline hash::hte_t * hash::lookup( unsigned long hkey, const Key &key ) { hash::hte_t *look = table[ hkey >> shift ]; //----------------------------------- // Here's the interesting inner loop for( ; look != NULL; look = look->next ) { if( match( look, key )) return look; } // The code that we get from C++ doesn't do as well //----------------------------------- return NULL; } template inline Pair * hash::probe_hte( const Key &key ) { unsigned long hkey = key.hash_fn(); return lookup( hkey, key ); } str_hte_t * probe_str_hte2( const char *str, str_hash *hashp ) { str_hash_int *hash = (str_hash_int *) hashp; str_key_t key; key.str = str; return hash->probe_hte( key ); } Egcs-1.0.2 generates: probe_str_hte: pushl %ebp movl %esp,%ebp subl $4,%esp # extra inst versus C pushl %esi xorl %edx,%edx pushl %ebx movl 8(%ebp),%eax movl 12(%ebp),%esi movl %eax,-4(%ebp) # egcs can't eliminate this dead write movl %eax,%ecx movl %edx,%ebx cmpb $0,(%ecx) je .L198 .align 4 .L196: roll $3,%edx movsbl (%ecx),%eax addl %edx,%eax leal (%ebx,%eax),%edx incl %ecx incl %ebx cmpb $0,(%ecx) jne .L196 .L198: imull $-1646884171,%edx,%edx movl 16(%esi),%ecx shrl %cl,%edx movl 20(%esi),%eax movl (%eax,%edx,4),%ebx testl %ebx,%ebx je .L211 .align 4 .L203: # here is the inner loop. xorl %esi,%esi # %esi is going to be used to hold the movl -4(%ebp),%eax # boolean value returned from the cmpl %eax,(%ebx) # inlined match() function call. je .L204 pushl %eax movl (%ebx),%eax pushl %eax call strcmp addl $8,%esp testl %eax,%eax # here egcs uses a test-and-jump jne .L205 # sequence... .L204: movl $1,%esi # ..to generate a boolean value... .L205: testl %esi,%esi # ..which it then test and jumps! je .L208 # boolean is dead outside this test movl %ebx,%eax jmp .L209 .align 4 .L208: movl 8(%ebx),%ebx testl %ebx,%ebx jne .L203 .L211: xorl %eax,%eax .L209: leal -12(%ebp),%esp popl %ebx popl %esi movl %ebp,%esp popl %ebp ret -Iain McClatchie