Re: [RFC] i386 memcmp implementation and gcc builtin

public inbox for gcc@gcc.gnu.org
 help / color / mirror / Atom feed

* Re: [RFC] i386 memcmp implementation and gcc builtin
       [not found] ` <20021028232849.R3451@sunsite.ms.mff.cuni.cz>
@ 2002-10-29 19:45   ` Bonzini
  0 siblings, 0 replies; only message in thread
From: Bonzini @ 2002-10-29 19:45 UTC (permalink / raw)
  To: libc-alpha; +Cc: gcc

[-- Attachment #1: Type: text/plain, Size: 1411 bytes --]

Here is a revised version; thank everybody for the quick feedback!

I am now using EAX to hold the decreasing loop counter, and checking if <16
before checking =0 as Jakub suggested.  It could be interesting to check if
working on -EAX and using indexed addressing modes is worth (I'm not sure,
because you'd lose the ability to decrease the loop counter).

I performed the tests with 100 repetitions, since on machines only slightly
newer than mine a single repetition is way too small to observe anything;
the number is customizable.  A single program does testing and benchmarking
(to avoid clashes my implementation is now called my_memcmp); compile it
with -fno-builtin of course!  As Roger Sayle suggested, I'm taking user time
instead of wall time (though my machine is so lightly loaded that I cannot
see the difference).

Note I'm using -mcpu=i686 -march=i686 (Andreas Jaeger suggested that, but
since I compiled my GCC on my own it is already the default).

Results:

utente@engineer:~/esperimenti$ gcc -O3 -fno-builtin test.c memcmp.S
utente@engineer:~/esperimenti$  ./a.out
100 repetitions

Testing custom memcmp ..... done,      8.130 seconds
Testing libc memcmp ....... done,     10.250 seconds
Testing builtin memcmp .... done,      9.620 seconds
Testing loop overhead ..... done,      5.580 seconds

(CCed to gcc people because of the dismaying __builtin_memcmp performance on
the PII).

Paolo


[-- Attachment #2: memcmp.S --]
[-- Type: application/octet-stream, Size: 3953 bytes --]

	.globl my_memcmp

my_memcmp:
	pushl  %ebp		# Save some registers
	movl   %esp,%ebp
	pushl  %ebx
	movl   0x8(%ebp),%ebx	# Load S1
	movl   0xc(%ebp),%edx	# Load S2
	movl   0x10(%ebp),%eax	# Load length

	cmpl   $0x10,%eax	# For small strings
	jb     7f		# we cannot afford startup overheads

	testb  $3,%bl		# Check if EBX is already aligned
	jz     2f
	
	# Align EBX

	.align 16
1:
	movb   (%ebx),%cl	# If not, compare byte-by-byte
	cmpb   (%edx),%cl
	jne    9f		# Until we have a mismatch
	incl   %ebx
	incl   %edx
	decl   %eax
	testb  $3,%bl		# Or EBX is aligned
	jnz    1b

	# EBX is aligned, check if EDX is

	.align 4
2:
	push   %edi		# Save more callee-save regs
	push   %esi
	testb  $3,%dl		# If EDX is aligned too
	jnz    4f		# use simpler and faster code
	subl   $4,%eax		# Save a `cmp $4,%eax' below

	# Loop for aligned EBX and EDX

	.align 16
3:
	movl   (%edx),%esi	# Compare a DWORD
	cmpl   %esi,(%ebx)
	jne    8f
	addl   $4,%ebx		# Go on with the next one
	addl   $4,%edx		# if they match
	subl   $4,%eax
	ja     3b

	addl   $4,%eax		# Restore the loop counter
	popl   %esi		# Beginning of epilog
	popl   %edi
	jmp    6f		# And compare byte-by-byte
	
	# Set up loop for aligned EBX and unaligned EDX

	.align 4
4:
	movl   %edx,%ecx	# Load the low bits of S2 into ECX
	andl   $3,%ecx
	andl   $~3,%edx		# And align EDX to lower dword
	subl   $4,%eax		# Save a `cmp $4,%eax' below
	movl   %ecx,%ebp	# Save lower bits
	shll   $3,%ecx		# Byte offset --> bit offset
	movl   (%edx),%esi	# Load two DWORDs
	movl   0x4(%edx),%edi
	
	# Loop for aligned EBX and unaligned EDX
	#
	# Example: EDX was unaligned by 1, hence CL = 8
	#
	#          ESI                        EDI
	# ,-----.-----.-----.-----. ,-----.-----.-----.-----.
	# |  2  |  1  |  0  |  -1 | |  6  |  5  |  4  |  3  |
	# `-----'-----'-----'-----' `-----'-----'-----'-----'
	#                         ,-.
	#                         | |
	#                        _| |_		after
	#			 \   /		shrdl $8, %edi, %esi
	#			  `v'
	#          ESI                        EDI
	# ,-----.-----.-----.-----. ,-----.-----.-----.-----.
	# |  3  |  2  |  1  |  0  | |  6  |  5  |  4  |  3  |
	# `-----'-----'-----'-----' `-----'-----'-----'-----'
	# compared against (%ebx)   used for the next iteration
	#
	#          ESI                        EDI
	# ,-----.-----.-----.-----. ,-----.-----.-----.-----.
	# |  6  |  5  |  4  |  3  | | 10  |  9  |  8  |  7  |
	# `-----'-----'-----'-----' `-----'-----'-----'-----'
	#
	# etc.

	.align 16
5:
	shrdl  %cl,%edi,%esi	# Compute an unaligned DWORD
	cmpl   %esi,(%ebx)	# Compare it with the aligned EBX
	jne    8f
	movl   %edi,%esi
	movl   0x8(%edx),%edi	# Load another DWORD if they matched
	addl   $4,%edx
	addl   $4,%ebx
	subl   $4,%eax
	ja     5b

	orl    %ebp,%edx	# When few bytes remain, work again
				# on the unaligned EDX
	addl   $4,%eax		# Restore the loop counter
	popl   %esi		# Beginning of epilog
	popl   %edi
	
	# Byte-by-byte loop

	.align 16
6:
	movb   (%ebx),%cl	# Compare byte-by-byte
	cmpb   (%edx),%cl
	jne    9f
	incl   %ebx		# Go on if they matched
	incl   %edx
	decl   %eax
	jne    6b
	popl   %ebx		# Epilog
	popl   %ebp
	retl   			# Return EAX = 0
	
	# Handle LEN < 16

	.align 16
7:
	orl    %eax,%eax	# If no characters to be compared
	jnz    6b		# Exit immediately

	popl   %ebx		# Epilog
	popl   %ebp
	retl   			# Return EAX = 0
	
	# Compute return value for mismatching (%EBX) and %ESI

	.align 16
8:
	movl   (%ebx),%edi	# Load the two mismatching DWORDs
	bswapl %esi		# Make them big-endian
	bswapl %edi
	cmpl   %esi,%edi	# So that we can compare them
	popl   %esi		# Beginning of epilog
	popl   %edi
	
	# Compute return value from flags

	.align 4
9:
	sbbl   %eax,%eax	# -1 if <, 0 if >
	orl    $1,%eax		# -1 if <, 1 if >
	popl   %ebx		# Epilog
	popl   %ebp
	retl   


[-- Attachment #3: test.c --]
[-- Type: text/plain, Size: 3030 bytes --]

#define __NO_STRING_INLINES

#include <sys/resource.h>
#include <sys/time.h>
#include <stdio.h>
#include <assert.h>
#include <stdlib.h>

char s1[] = "abcdefghijklmnopqrstuvwxyz";

void
test_my_memcmp (char *s1, int i, int test, int len)
{
  char s2[2 * len + 1];
  int j, k;

  for (k = i; k <= len + 1; k++)
    for (j = 0; j < len; j++)
      {
        memcpy (s2 + j, s1, len + 1);
        s2[i+j] += test;
        assert (my_memcmp (s2 + j, s1, k) == (k != i ? test : 0));
      }
}

void
speed_libc (char *s1, int i, int test, int len)
{
  char s2[2 * len + 1];
  int j, k;

  for (k = i; k <= len + 1; k++)
    for (j = 0; j < len; j++)
      {
        memcpy (s2 + j, s1, len + 1);
        s2[i+j] += test;
        s2[0] = memcmp (s2 + j, s1, k);
      }
}

void
speed_builtin (char *s1, int i, int test, int len)
{
  char s2[2 * len + 1];
  int j, k;

  for (k = i; k <= len + 1; k++)
    for (j = 0; j < len; j++)
      {
        memcpy (s2 + j, s1, len + 1);
        s2[i+j] += test;
        s2[0] = __builtin_memcmp (s2 + j, s1, k);
      }
}

void
speed_memcmp (char *s1, int i, int test, int len)
{
  char s2[2 * len + 1];
  int j, k;

  for (k = i; k <= len + 1; k++)
    for (j = 0; j < len; j++)
      {
        memcpy (s2 + j, s1, len + 1);
        s2[i+j] += test;
        s2[0] = my_memcmp (s2 + j, s1, k);
      }
}

void
speed_loop (char *s1, int i, int test, int len)
{
  char s2[2 * len + 1];
  int j, k;

  for (k = i; k <= len + 1; k++)
    for (j = 0; j < len; j++)
      {
        memcpy (s2 + j, s1, len + 1);
        s2[i+j] += test;
        s2[0] = 0;
      }
}

double
do_test (int repetitions, void (*test_function) (char *, int, int, int))
{
  int len = strlen(s1);
  int i, h;
  struct rusage r1, r2;

  getrusage(RUSAGE_SELF, &r1);

  while (repetitions--)
    for (h = 0; h < len; h += 5)
      for (i = h; i < len; i++)
        {
          test_function (s1 + h, i - h, -1, len);
          test_function (s1 + h, i - h, 0, len);
          test_function (s1 + h, i - h, 1, len);
        }

  getrusage(RUSAGE_SELF, &r2);
  return (r2.ru_utime.tv_sec - r1.ru_utime.tv_sec) +
	 (r2.ru_utime.tv_usec - r1.ru_utime.tv_usec) / 1000000.0;
}

main(int argc, char **argv)
{
  int repetitions = (argc == 1) ? 100 : atoi(argv[1]);

  do_test (1, test_my_memcmp);

  printf ("%d repetitions\n\n", repetitions);
  printf ("Testing custom memcmp .....");
  fflush (stdout);
  printf (" done, %10.3f seconds\n", do_test (repetitions, speed_memcmp));

  printf ("Testing libc memcmp .......");
  fflush (stdout);
  printf (" done, %10.3f seconds\n", do_test (repetitions, speed_libc));

  printf ("Testing builtin memcmp ....");
  fflush (stdout);
  printf (" done, %10.3f seconds\n", do_test (repetitions, speed_builtin));

  printf ("Testing loop overhead .....");
  fflush (stdout);
  printf (" done, %10.3f seconds\n", do_test (repetitions, speed_loop));

  exit (0);
}


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2002-10-29 20:58 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <002801c27ea7$36cda440$0f181897@bonz>
     [not found] ` <20021028232849.R3451@sunsite.ms.mff.cuni.cz>
2002-10-29 19:45   ` [RFC] i386 memcmp implementation and gcc builtin Bonzini

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).