* Re: [RFC] i386 memcmp implementation and gcc builtin
[not found] ` <20021028232849.R3451@sunsite.ms.mff.cuni.cz>
@ 2002-10-29 19:45 ` Bonzini
0 siblings, 0 replies; only message in thread
From: Bonzini @ 2002-10-29 19:45 UTC (permalink / raw)
To: libc-alpha; +Cc: gcc
[-- Attachment #1: Type: text/plain, Size: 1411 bytes --]
Here is a revised version; thank everybody for the quick feedback!
I am now using EAX to hold the decreasing loop counter, and checking if <16
before checking =0 as Jakub suggested. It could be interesting to check if
working on -EAX and using indexed addressing modes is worth (I'm not sure,
because you'd lose the ability to decrease the loop counter).
I performed the tests with 100 repetitions, since on machines only slightly
newer than mine a single repetition is way too small to observe anything;
the number is customizable. A single program does testing and benchmarking
(to avoid clashes my implementation is now called my_memcmp); compile it
with -fno-builtin of course! As Roger Sayle suggested, I'm taking user time
instead of wall time (though my machine is so lightly loaded that I cannot
see the difference).
Note I'm using -mcpu=i686 -march=i686 (Andreas Jaeger suggested that, but
since I compiled my GCC on my own it is already the default).
Results:
utente@engineer:~/esperimenti$ gcc -O3 -fno-builtin test.c memcmp.S
utente@engineer:~/esperimenti$ ./a.out
100 repetitions
Testing custom memcmp ..... done, 8.130 seconds
Testing libc memcmp ....... done, 10.250 seconds
Testing builtin memcmp .... done, 9.620 seconds
Testing loop overhead ..... done, 5.580 seconds
(CCed to gcc people because of the dismaying __builtin_memcmp performance on
the PII).
Paolo
[-- Attachment #2: memcmp.S --]
[-- Type: application/octet-stream, Size: 3953 bytes --]
.globl my_memcmp
my_memcmp:
pushl %ebp # Save some registers
movl %esp,%ebp
pushl %ebx
movl 0x8(%ebp),%ebx # Load S1
movl 0xc(%ebp),%edx # Load S2
movl 0x10(%ebp),%eax # Load length
cmpl $0x10,%eax # For small strings
jb 7f # we cannot afford startup overheads
testb $3,%bl # Check if EBX is already aligned
jz 2f
# Align EBX
.align 16
1:
movb (%ebx),%cl # If not, compare byte-by-byte
cmpb (%edx),%cl
jne 9f # Until we have a mismatch
incl %ebx
incl %edx
decl %eax
testb $3,%bl # Or EBX is aligned
jnz 1b
# EBX is aligned, check if EDX is
.align 4
2:
push %edi # Save more callee-save regs
push %esi
testb $3,%dl # If EDX is aligned too
jnz 4f # use simpler and faster code
subl $4,%eax # Save a `cmp $4,%eax' below
# Loop for aligned EBX and EDX
.align 16
3:
movl (%edx),%esi # Compare a DWORD
cmpl %esi,(%ebx)
jne 8f
addl $4,%ebx # Go on with the next one
addl $4,%edx # if they match
subl $4,%eax
ja 3b
addl $4,%eax # Restore the loop counter
popl %esi # Beginning of epilog
popl %edi
jmp 6f # And compare byte-by-byte
# Set up loop for aligned EBX and unaligned EDX
.align 4
4:
movl %edx,%ecx # Load the low bits of S2 into ECX
andl $3,%ecx
andl $~3,%edx # And align EDX to lower dword
subl $4,%eax # Save a `cmp $4,%eax' below
movl %ecx,%ebp # Save lower bits
shll $3,%ecx # Byte offset --> bit offset
movl (%edx),%esi # Load two DWORDs
movl 0x4(%edx),%edi
# Loop for aligned EBX and unaligned EDX
#
# Example: EDX was unaligned by 1, hence CL = 8
#
# ESI EDI
# ,-----.-----.-----.-----. ,-----.-----.-----.-----.
# | 2 | 1 | 0 | -1 | | 6 | 5 | 4 | 3 |
# `-----'-----'-----'-----' `-----'-----'-----'-----'
# ,-.
# | |
# _| |_ after
# \ / shrdl $8, %edi, %esi
# `v'
# ESI EDI
# ,-----.-----.-----.-----. ,-----.-----.-----.-----.
# | 3 | 2 | 1 | 0 | | 6 | 5 | 4 | 3 |
# `-----'-----'-----'-----' `-----'-----'-----'-----'
# compared against (%ebx) used for the next iteration
#
# ESI EDI
# ,-----.-----.-----.-----. ,-----.-----.-----.-----.
# | 6 | 5 | 4 | 3 | | 10 | 9 | 8 | 7 |
# `-----'-----'-----'-----' `-----'-----'-----'-----'
#
# etc.
.align 16
5:
shrdl %cl,%edi,%esi # Compute an unaligned DWORD
cmpl %esi,(%ebx) # Compare it with the aligned EBX
jne 8f
movl %edi,%esi
movl 0x8(%edx),%edi # Load another DWORD if they matched
addl $4,%edx
addl $4,%ebx
subl $4,%eax
ja 5b
orl %ebp,%edx # When few bytes remain, work again
# on the unaligned EDX
addl $4,%eax # Restore the loop counter
popl %esi # Beginning of epilog
popl %edi
# Byte-by-byte loop
.align 16
6:
movb (%ebx),%cl # Compare byte-by-byte
cmpb (%edx),%cl
jne 9f
incl %ebx # Go on if they matched
incl %edx
decl %eax
jne 6b
popl %ebx # Epilog
popl %ebp
retl # Return EAX = 0
# Handle LEN < 16
.align 16
7:
orl %eax,%eax # If no characters to be compared
jnz 6b # Exit immediately
popl %ebx # Epilog
popl %ebp
retl # Return EAX = 0
# Compute return value for mismatching (%EBX) and %ESI
.align 16
8:
movl (%ebx),%edi # Load the two mismatching DWORDs
bswapl %esi # Make them big-endian
bswapl %edi
cmpl %esi,%edi # So that we can compare them
popl %esi # Beginning of epilog
popl %edi
# Compute return value from flags
.align 4
9:
sbbl %eax,%eax # -1 if <, 0 if >
orl $1,%eax # -1 if <, 1 if >
popl %ebx # Epilog
popl %ebp
retl
[-- Attachment #3: test.c --]
[-- Type: text/plain, Size: 3030 bytes --]
#define __NO_STRING_INLINES
#include <sys/resource.h>
#include <sys/time.h>
#include <stdio.h>
#include <assert.h>
#include <stdlib.h>
char s1[] = "abcdefghijklmnopqrstuvwxyz";
void
test_my_memcmp (char *s1, int i, int test, int len)
{
char s2[2 * len + 1];
int j, k;
for (k = i; k <= len + 1; k++)
for (j = 0; j < len; j++)
{
memcpy (s2 + j, s1, len + 1);
s2[i+j] += test;
assert (my_memcmp (s2 + j, s1, k) == (k != i ? test : 0));
}
}
void
speed_libc (char *s1, int i, int test, int len)
{
char s2[2 * len + 1];
int j, k;
for (k = i; k <= len + 1; k++)
for (j = 0; j < len; j++)
{
memcpy (s2 + j, s1, len + 1);
s2[i+j] += test;
s2[0] = memcmp (s2 + j, s1, k);
}
}
void
speed_builtin (char *s1, int i, int test, int len)
{
char s2[2 * len + 1];
int j, k;
for (k = i; k <= len + 1; k++)
for (j = 0; j < len; j++)
{
memcpy (s2 + j, s1, len + 1);
s2[i+j] += test;
s2[0] = __builtin_memcmp (s2 + j, s1, k);
}
}
void
speed_memcmp (char *s1, int i, int test, int len)
{
char s2[2 * len + 1];
int j, k;
for (k = i; k <= len + 1; k++)
for (j = 0; j < len; j++)
{
memcpy (s2 + j, s1, len + 1);
s2[i+j] += test;
s2[0] = my_memcmp (s2 + j, s1, k);
}
}
void
speed_loop (char *s1, int i, int test, int len)
{
char s2[2 * len + 1];
int j, k;
for (k = i; k <= len + 1; k++)
for (j = 0; j < len; j++)
{
memcpy (s2 + j, s1, len + 1);
s2[i+j] += test;
s2[0] = 0;
}
}
double
do_test (int repetitions, void (*test_function) (char *, int, int, int))
{
int len = strlen(s1);
int i, h;
struct rusage r1, r2;
getrusage(RUSAGE_SELF, &r1);
while (repetitions--)
for (h = 0; h < len; h += 5)
for (i = h; i < len; i++)
{
test_function (s1 + h, i - h, -1, len);
test_function (s1 + h, i - h, 0, len);
test_function (s1 + h, i - h, 1, len);
}
getrusage(RUSAGE_SELF, &r2);
return (r2.ru_utime.tv_sec - r1.ru_utime.tv_sec) +
(r2.ru_utime.tv_usec - r1.ru_utime.tv_usec) / 1000000.0;
}
main(int argc, char **argv)
{
int repetitions = (argc == 1) ? 100 : atoi(argv[1]);
do_test (1, test_my_memcmp);
printf ("%d repetitions\n\n", repetitions);
printf ("Testing custom memcmp .....");
fflush (stdout);
printf (" done, %10.3f seconds\n", do_test (repetitions, speed_memcmp));
printf ("Testing libc memcmp .......");
fflush (stdout);
printf (" done, %10.3f seconds\n", do_test (repetitions, speed_libc));
printf ("Testing builtin memcmp ....");
fflush (stdout);
printf (" done, %10.3f seconds\n", do_test (repetitions, speed_builtin));
printf ("Testing loop overhead .....");
fflush (stdout);
printf (" done, %10.3f seconds\n", do_test (repetitions, speed_loop));
exit (0);
}
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2002-10-29 20:58 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
[not found] <002801c27ea7$36cda440$0f181897@bonz>
[not found] ` <20021028232849.R3451@sunsite.ms.mff.cuni.cz>
2002-10-29 19:45 ` [RFC] i386 memcmp implementation and gcc builtin Bonzini
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).