I was doing some tests with memcmp and it seems that gcc 3.2.2 always uses cmps, even for aligned strings. I did some tests and discovered that using cmps was rather slow compared to a simple loop followed by a bswap and subtract at the end. My method is the following:

  int cmpa2(unsigned int * x, unsigned int * y, size_t size)
  {
    size_t i = 0;
    size /= 4;
    while (i < size && x[i] == y[i]) ++i;
    if (i == size) return 0;
    asm("bswap %0" : "+r"(x[i]));
    asm("bswap %0" : "+r"(y[i]));
    return x[i] - y[i];
  }

as opposed to:

  int cmpa(unsigned int * x, unsigned int * y, size_t size)
  {
    return __builtin_memcmp(x, y, size);
  }

Even though it takes integer pointers, it really is comparing them as if they were byte strings. I used integer pointers to avoid having to cast and to let the compiler know that the strings are 4-byte aligned. This method is 3 to 5 times as fast, and the longer the string, the faster my method is.

I also wrote a specialized compare for comparing two ints AS IF they were 4-byte strings:

  int cmpi2(unsigned int x, unsigned int y)
  {
    asm("bswap %0" : "+r"(x));
    asm("bswap %0" : "+r"(y));
    return x - y;
  }

as opposed to:

  int cmpi(unsigned int x, unsigned int y)
  {
    return __builtin_memcmp(&x, &y, 4);
  }

which was considerably faster: 4 to 6 times. However, I believe the function call overhead is significant here, as -fomit-frame-pointer increased the ratio (from 4 to 6), and when I tried to account for the cost of the function call the results varied anywhere from 10 to 30 times faster.

If a better memcmp is not already implemented in the yet-to-be-released gcc, I hope someone will seriously consider rewriting the memcmp implementation. Even though I only tested it on a Pentium 3, I am sure the results will also hold for a Pentium Classic, based on the instruction timings. I believe the bswap instruction was introduced with the 486, so only generic 386 code will have to use something other than bswap. The test program is attached.
Here are the results when compiled with:

  gcc -fomit-frame-pointer -O3 -march=pentium3 cmps.c

(attribute noinline was used on the test functions)

  Memory compare int:        2170000    350000   Speed up: 6.200000
  Memory compare 16 bytes:   4100000   1240000   Speed up: 3.306452
  Memory compare 64 bytes:  10930000   2990000   Speed up: 3.655518
  Memory compare 256 bytes: 37510000   7700000   Speed up: 4.871429

Here is the relevant assembly code:

cmpi:   // integer as byte string compare using memcmp
        cld
        subl $8, %esp
        movl $4, %ecx
        movl %esi, (%esp)
        leal 12(%esp), %esi
        movl %edi, 4(%esp)
        leal 16(%esp), %edi
        repz cmpsb
        movl 4(%esp), %edi
        movl (%esp), %esi
        seta %dl
        setb %cl
        addl $8, %esp
        subb %cl, %dl
        movsbl %dl, %eax
        ret

cmpi2:  // integer as byte string compare: my method
        movl 4(%esp), %eax
        movl 8(%esp), %ecx
        bswap %eax
        bswap %ecx
        subl %ecx, %eax
        ret

cmpa:   // aligned byte compare using memcmp
        cld
        subl $8, %esp
        movl 20(%esp), %ecx
        movl %esi, (%esp)
        movl 12(%esp), %esi
        cmpl %ecx, %ecx
        movl %edi, 4(%esp)
        movl 16(%esp), %edi
        repz cmpsb
        movl 4(%esp), %edi
        movl (%esp), %esi
        seta %dl
        setb %cl
        addl $8, %esp
        subb %cl, %dl
        movsbl %dl, %eax
        ret

cmpa2:  // aligned byte compare: my method
        subl $12, %esp
        movl 24(%esp), %ecx
        xorl %edx, %edx
        movl %esi, 4(%esp)
        movl 16(%esp), %esi
        shrl $2, %ecx
        movl %edi, 8(%esp)
        cmpl %ecx, %edx
        movl 20(%esp), %edi
        movl %ebx, (%esp)
        jae .L6
        movl (%edi), %eax
        cmpl %eax, (%esi)
        je .L9
.L6:
        xorl %ebx, %ebx
        cmpl %ecx, %edx
        je .L4
        movl (%esi,%edx,4), %eax
        bswap %eax
        movl %eax, (%esi,%edx,4)
        movl (%edi,%edx,4), %eax
        bswap %eax
        movl %eax, (%edi,%edx,4)
        movl (%esi,%edx,4), %ebx
        subl %eax, %ebx
.L4:
        movl %ebx, %eax
        movl 4(%esp), %esi
        movl (%esp), %ebx
        movl 8(%esp), %edi
        addl $12, %esp
        ret
        .p2align 4,,7
.L9:
        incl %edx
        cmpl %ecx, %edx
        jae .L6
        movl (%edi,%edx,4), %eax
        cmpl %eax, (%esi,%edx,4)
        je .L9
        jmp .L6

---
http://kevin.atkinson.dhs.org