Re: Slow memcmp for aligned strings on Pentium 3

public inbox for gcc@gcc.gnu.org
 help / color / mirror / Atom feed

From: Kevin Atkinson <kevina@gnu.org>
To: Roger Sayle <roger@www.eyesopen.com>
Cc: gcc@gcc.gnu.org
Subject: Re: Slow memcmp for aligned strings on Pentium 3
Date: Fri, 04 Apr 2003 17:08:00 -0000	[thread overview]
Message-ID: <Pine.LNX.4.44.0304040923350.912-300000@kevin-pc.atkinson.dhs.org> (raw)
In-Reply-To: <Pine.LNX.4.44.0304032110020.7684-100000@www.eyesopen.com>

[-- Attachment #1: Type: TEXT/PLAIN, Size: 1831 bytes --]

On Thu, 3 Apr 2003, Roger Sayle wrote:

> I suspect it should be possible to fix your code to handle these
> termination conditions correctly, and a comparison of your
> routine's performance with these fixes vs. __builtin_memcmp
> would be of interest.

Here you go.  Still assume the memory is alligned:

This is what I did:

int cmpa2(const unsigned int * x, const unsigned int * y, size_t size)
{
  int i = 0;
  size_t s = size / 4;
  while (i < s && x[i] == y[i]) ++i;
  size -= i * 4;
  if (size == 0) return 0;
  // hopefully if this is inline expanded when size is known
  // the compiler can eliminate many of these conditionals
  else if (size >= 4) { // if original size % 4 == 0 this should
                        // always be the case
    unsigned int xx = x[i], yy = y[i];
    asm("bswap %0" : "+r"(xx));
    asm("bswap %0" : "+r"(yy));
    return xx - yy;
  } else {
    const unsigned char * xb = (const unsigned char *)(x + i);
    const unsigned char * yb = (const unsigned char *)(y + i);
    // if size is known at compile time then the compiler should be
    // able to select the correct choice at compile time
    switch (size) {
    case 1:
      return *xb - *yb;
    case 2:
      return ((xb[0] - yb[0]) <<  8) + (xb[1] - yb[1]);
    case 3:
      return ((xb[0] - yb[0]) << 16) + ((xb[1] - yb[1]) << 8)
        + xb[2] - yb[2];}
  }
}

compiled with gcc -fomit-frame-pointer -O3 -march=pentium3 cmps.c

Memory compare int:
  2160000
  360000
  Speed up: 6.000000
Memory compare 15 bytes:
  4120000
  1550000
  Speed up: 2.658065
Memory compare 16 bytes:
  4150000
  1310000
  Speed up: 3.167939
Memory compare 64 bytes:
  11240000
  3470000
  Speed up: 3.239193
Memory compare 256 bytes:
  37930000
  10050000
  Speed up: 3.774129

Code and assembly output attached.

-- 
http://kevin.atkinson.dhs.org

[-- Attachment #2: Type: TEXT/PLAIN, Size: 3315 bytes --]

#include <stdio.h>
#include <time.h>
#include <stdlib.h>

__attribute__((noinline))
int cmpi(unsigned int x, unsigned int y) 
{
  return __builtin_memcmp(&x,&y,4);
}

__attribute__((noinline))
int cmpi2(unsigned int x, unsigned int y)
{
  asm("bswap %0" : "+r"(x));
  asm("bswap %0" : "+r"(y));
  return x - y;
}

__attribute__((noinline))
int cmpa(const unsigned int * x, const unsigned int * y, size_t size)
{
  return __builtin_memcmp(x,y,size);
}

__attribute__((noinline))
int cmpa2(const unsigned int * x, const unsigned int * y, size_t size)
{
  int i = 0;
  size_t s = size / 4;
  while (i < s && x[i] == y[i]) ++i;
  size -= i * 4;
  if (size == 0) return 0;
  // hopefully if this is inline expanded when size is known
  // the compiler can eliminate many of these conditionals
  else if (size >= 4) { // if original size % 4 == 0 this should
                        // always be the case
    unsigned int xx = x[i], yy = y[i];
    asm("bswap %0" : "+r"(xx));
    asm("bswap %0" : "+r"(yy));
    return xx - yy;
  } else {
    const unsigned char * xb = (const unsigned char *)(x + i);
    const unsigned char * yb = (const unsigned char *)(y + i);
    // if size is known at compile time then the compiler should be
    // able to select the correct choice at compile time
    switch (size) {
    case 1:
      return *xb - *yb;
    case 2:
      return ((xb[0] - yb[0]) <<  8) + (xb[1] - yb[1]);
    case 3:
      return ((xb[0] - yb[0]) << 16) + ((xb[1] - yb[1]) << 8)
        + xb[2] - yb[2];}
  }
}

int main()
{
  unsigned int x,y,i;
  int xa[256] = {}, ya[256] = {};
  int s,f;
  clock_t t;
  x = *(unsigned int *)("a  b");
  y = *(unsigned int *)("b  a");

  printf("Memory compare int:\n");
  t = clock();
  for (i = 0; i != 0x1000000; ++i)
    cmpi(x,y);
  s = clock() - t;
  printf("  %d\n", s);
  t = clock();
  for (i = 0; i != 0x1000000; ++i)
    cmpi2(x,y);
  f = clock() - t;
  printf("  %d\n",f);
  printf("  Speed up: %f\n", (double)s/f);


  printf("Memory compare 15 bytes:\n");
  xa[3] = 6;
  t = clock();
  for (i = 0; i != 0x1000000; ++i)
    cmpa(xa, ya, 15);
  s = clock() - t;
  printf("  %d\n", s);
  t = clock();
  for (i = 0; i != 0x1000000; ++i)
    cmpa2(xa, ya, 15);
  f = clock() - t;
  printf("  %d\n",f);
  printf("  Speed up: %f\n", (double)s/f);
  xa[3] = 0;

  printf("Memory compare 16 bytes:\n");
  xa[3] = 6;
  t = clock();
  for (i = 0; i != 0x1000000; ++i)
    cmpa(xa, ya, 16);
  s = clock() - t;
  printf("  %d\n", s);
  t = clock();
  for (i = 0; i != 0x1000000; ++i)
    cmpa2(xa, ya, 16);
  f = clock() - t;
  printf("  %d\n",f);
  printf("  Speed up: %f\n", (double)s/f);
  xa[3] = 0;

  printf("Memory compare 64 bytes:\n");
  xa[15] = 6;
  t = clock();
  for (i = 0; i != 0x1000000; ++i)
    cmpa(xa, ya, 64);
  s = clock() - t;
  printf("  %d\n", s);
  t = clock();
  for (i = 0; i != 0x1000000; ++i)
    cmpa2(xa, ya, 64);
  f = clock() - t;
  printf("  %d\n",f);
  printf("  Speed up: %f\n", (double)s/f);
  xa[15] = 0;

  printf("Memory compare 256 bytes:\n");
  xa[63] = 6;
  t = clock();
  for (i = 0; i != 0x1000000; ++i)
    cmpa(xa, ya, 256);
  s = clock() - t;
  printf("  %d\n", s);
  t = clock();
  for (i = 0; i != 0x1000000; ++i)
    cmpa2(xa, ya, 256);
  f = clock() - t;
  printf("  %d\n",f);
  printf("  Speed up: %f\n", (double)s/f);
}

[-- Attachment #3: Type: TEXT/PLAIN, Size: 7710 bytes --]

	.file	"cmps.c"
	.text
	.align 2
	.p2align 4,,15
.globl cmpi
	.type	cmpi,@function
cmpi:
	cld
	subl	$8, %esp
	movl	$4, %ecx
	movl	%esi, (%esp)
	leal	12(%esp), %esi
	movl	%edi, 4(%esp)
	leal	16(%esp), %edi
	repz
	cmpsb
	movl	4(%esp), %edi
	movl	(%esp), %esi
	seta	%dl
	setb	%cl
	addl	$8, %esp
	subb	%cl, %dl
	movsbl	%dl,%eax
	ret
.Lfe1:
	.size	cmpi,.Lfe1-cmpi
	.align 2
	.p2align 4,,15
.globl cmpi2
	.type	cmpi2,@function
cmpi2:
	movl	4(%esp), %eax
	movl	8(%esp), %ecx
#APP
	bswap %eax
	bswap %ecx
#NO_APP
	subl	%ecx, %eax
	ret
.Lfe2:
	.size	cmpi2,.Lfe2-cmpi2
	.align 2
	.p2align 4,,15
.globl cmpa
	.type	cmpa,@function
cmpa:
	cld
	subl	$8, %esp
	movl	20(%esp), %ecx
	movl	%esi, (%esp)
	movl	12(%esp), %esi
	cmpl	%ecx, %ecx
	movl	%edi, 4(%esp)
	movl	16(%esp), %edi
	repz
	cmpsb
	movl	4(%esp), %edi
	movl	(%esp), %esi
	seta	%dl
	setb	%cl
	addl	$8, %esp
	subb	%cl, %dl
	movsbl	%dl,%eax
	ret
.Lfe3:
	.size	cmpa,.Lfe3-cmpa
	.align 2
	.p2align 4,,15
.globl cmpa2
	.type	cmpa2,@function
cmpa2:
	pushl	%edi
	xorl	%edx, %edx
	pushl	%esi
	pushl	%ebx
	movl	24(%esp), %ecx
	movl	16(%esp), %esi
	movl	20(%esp), %edi
	movl	%ecx, %ebx
	shrl	$2, %ebx
	cmpl	%ebx, %edx
	jae	.L6
	movl	(%edi), %eax
	cmpl	%eax, (%esi)
	je	.L9
.L6:
	leal	0(,%edx,4), %eax
	subl	%eax, %ecx
	jne	.L10
	xorl	%ecx, %ecx
.L4:
	popl	%ebx
	movl	%ecx, %eax
	popl	%esi
	popl	%edi
	ret
	.p2align 4,,7
.L10:
	cmpl	$3, %ecx
	jbe	.L12
	movl	(%esi,%edx,4), %ecx
	movl	(%edi,%edx,4), %eax
#APP
	bswap %ecx
	bswap %eax
#NO_APP
.L22:
	subl	%eax, %ecx
	jmp	.L4
	.p2align 4,,7
.L12:
	cmpl	$2, %ecx
	leal	(%eax,%esi), %ebx
	leal	(%eax,%edi), %esi
	je	.L16
	cmpl	$2, %ecx
	ja	.L20
	cmpl	$1, %ecx
	jne	.L4
	movzbl	(%ebx), %ecx
	movzbl	(%esi), %eax
	jmp	.L22
.L20:
	cmpl	$3, %ecx
	jne	.L4
	movzbl	1(%esi), %eax
	movzbl	(%ebx), %ecx
	movzbl	(%esi), %edx
	movzbl	1(%ebx), %edi
	subl	%edx, %ecx
	subl	%eax, %edi
	movzbl	2(%esi), %eax
	sall	$8, %edi
	sall	$16, %ecx
	addl	%edi, %ecx
	movzbl	2(%ebx), %edi
	addl	%edi, %ecx
	jmp	.L22
.L16:
	movzbl	(%ebx), %ecx
	movzbl	(%esi), %edx
	movzbl	1(%ebx), %edi
	movzbl	1(%esi), %eax
	subl	%edx, %ecx
	sall	$8, %ecx
	subl	%eax, %edi
	addl	%edi, %ecx
	jmp	.L4
	.p2align 4,,7
.L9:
	incl	%edx
	cmpl	%ebx, %edx
	jae	.L6
	movl	(%edi,%edx,4), %eax
	cmpl	%eax, (%esi,%edx,4)
	je	.L9
	jmp	.L6
.Lfe4:
	.size	cmpa2,.Lfe4-cmpa2
	.section	.rodata.str1.1,"aMS",@progbits,1
.LC0:
	.string	"a  b"
.LC2:
	.string	"Memory compare int:"
.LC1:
	.string	"b  a"
.LC3:
	.string	"  %d\n"
.LC4:
	.string	"  Speed up: %f\n"
.LC5:
	.string	"Memory compare 15 bytes:"
.LC6:
	.string	"Memory compare 16 bytes:"
.LC7:
	.string	"Memory compare 64 bytes:"
.LC8:
	.string	"Memory compare 256 bytes:"
	.text
	.align 2
	.p2align 4,,15
.globl main
	.type	main,@function
main:
	pushl	%ebp
	xorl	%eax, %eax
	movl	%esp, %ebp
	pushl	%edi
	movl	$256, %ecx
	leal	-1048(%ebp), %edi
	pushl	%esi
	pushl	%ebx
	subl	$2092, %esp
	andl	$-16, %esp
	cld
	rep
	stosl
	movl	$256, %ecx
	leal	-2072(%ebp), %edi
	rep
	stosl
	xorl	%edi, %edi
	movl	.LC0, %esi
	movl	$.LC2, (%esp)
	movl	.LC1, %ebx
	call	puts
	call	clock
	movl	%eax, -2084(%ebp)
	.p2align 4,,15
.L28:
	movl	%esi, (%esp)
	incl	%edi
	movl	%ebx, 4(%esp)
	call	cmpi
	cmpl	$16777216, %edi
	jne	.L28
	call	clock
	movl	%eax, -2080(%ebp)
	xorl	%edi, %edi
	movl	-2084(%ebp), %eax
	subl	%eax, -2080(%ebp)
	movl	$.LC3, (%esp)
	movl	-2080(%ebp), %eax
	movl	%eax, 4(%esp)
	call	printf
	call	clock
	movl	%eax, -2084(%ebp)
.L33:
	movl	%esi, (%esp)
	incl	%edi
	movl	%ebx, 4(%esp)
	call	cmpi2
	cmpl	$16777216, %edi
	jne	.L33
	call	clock
	movl	$.LC3, (%esp)
	movl	%eax, %esi
	movl	-2084(%ebp), %eax
	xorl	%edi, %edi
	subl	%eax, %esi
	movl	%esi, 4(%esp)
	call	printf
	fildl	-2080(%ebp)
	movl	$.LC4, (%esp)
	pushl	%esi
	fildl	(%esp)
	addl	$4, %esp
	fdivrp	%st, %st(1)
	fstpl	4(%esp)
	call	printf
	movl	$.LC5, (%esp)
	call	puts
	movl	$6, %eax
	movl	%eax, -1036(%ebp)
	call	clock
	movl	%eax, -2084(%ebp)
.L38:
	movl	$15, 8(%esp)
	leal	-1048(%ebp), %ecx
	incl	%edi
	movl	%ecx, (%esp)
	leal	-2072(%ebp), %edx
	movl	%edx, 4(%esp)
	call	cmpa
	cmpl	$16777216, %edi
	jne	.L38
	call	clock
	movl	%eax, -2080(%ebp)
	xorl	%edi, %edi
	movl	-2084(%ebp), %eax
	subl	%eax, -2080(%ebp)
	movl	$.LC3, (%esp)
	movl	-2080(%ebp), %ebx
	movl	%ebx, 4(%esp)
	call	printf
	call	clock
	movl	%eax, -2084(%ebp)
.L43:
	movl	$15, 8(%esp)
	leal	-1048(%ebp), %edx
	incl	%edi
	movl	%edx, (%esp)
	leal	-2072(%ebp), %esi
	movl	%esi, 4(%esp)
	call	cmpa2
	cmpl	$16777216, %edi
	jne	.L43
	call	clock
	movl	$.LC3, (%esp)
	movl	%eax, %esi
	movl	-2084(%ebp), %eax
	xorl	%edi, %edi
	subl	%eax, %esi
	movl	%esi, 4(%esp)
	call	printf
	fildl	-2080(%ebp)
	movl	$.LC4, (%esp)
	pushl	%esi
	fildl	(%esp)
	addl	$4, %esp
	fdivrp	%st, %st(1)
	fstpl	4(%esp)
	call	printf
	movl	%edi, -1036(%ebp)
	xorl	%edi, %edi
	movl	$.LC6, (%esp)
	call	puts
	movl	$6, %eax
	movl	%eax, -1036(%ebp)
	call	clock
	movl	%eax, -2084(%ebp)
.L48:
	movl	$16, 8(%esp)
	leal	-1048(%ebp), %ecx
	incl	%edi
	movl	%ecx, (%esp)
	leal	-2072(%ebp), %eax
	movl	%eax, 4(%esp)
	call	cmpa
	cmpl	$16777216, %edi
	jne	.L48
	call	clock
	movl	%eax, -2080(%ebp)
	xorl	%edi, %edi
	movl	-2084(%ebp), %eax
	subl	%eax, -2080(%ebp)
	movl	$.LC3, (%esp)
	movl	-2080(%ebp), %ebx
	movl	%ebx, 4(%esp)
	call	printf
	call	clock
	movl	%eax, -2084(%ebp)
.L53:
	movl	$16, 8(%esp)
	leal	-2072(%ebp), %edx
	incl	%edi
	movl	%edx, 4(%esp)
	leal	-1048(%ebp), %esi
	movl	%esi, (%esp)
	call	cmpa2
	cmpl	$16777216, %edi
	jne	.L53
	call	clock
	movl	$.LC3, (%esp)
	movl	%eax, %esi
	movl	-2084(%ebp), %eax
	xorl	%edi, %edi
	subl	%eax, %esi
	movl	%esi, 4(%esp)
	call	printf
	fildl	-2080(%ebp)
	movl	$.LC4, (%esp)
	pushl	%esi
	fildl	(%esp)
	addl	$4, %esp
	fdivrp	%st, %st(1)
	fstpl	4(%esp)
	call	printf
	movl	%edi, -1036(%ebp)
	xorl	%edi, %edi
	movl	$.LC7, (%esp)
	call	puts
	movl	$6, %eax
	movl	%eax, -988(%ebp)
	call	clock
	movl	%eax, -2084(%ebp)
.L58:
	movl	$64, 8(%esp)
	leal	-1048(%ebp), %ecx
	incl	%edi
	movl	%ecx, (%esp)
	leal	-2072(%ebp), %ebx
	movl	%ebx, 4(%esp)
	call	cmpa
	cmpl	$16777216, %edi
	jne	.L58
	call	clock
	movl	%eax, -2080(%ebp)
	xorl	%edi, %edi
	movl	-2084(%ebp), %eax
	subl	%eax, -2080(%ebp)
	movl	$.LC3, (%esp)
	movl	-2080(%ebp), %edx
	movl	%edx, 4(%esp)
	call	printf
	call	clock
	movl	%eax, -2084(%ebp)
.L63:
	movl	$64, 8(%esp)
	leal	-2072(%ebp), %eax
	incl	%edi
	movl	%eax, 4(%esp)
	leal	-1048(%ebp), %esi
	movl	%esi, (%esp)
	call	cmpa2
	cmpl	$16777216, %edi
	jne	.L63
	call	clock
	movl	$.LC3, (%esp)
	movl	-2084(%ebp), %ecx
	movl	%eax, %edi
	xorl	%ebx, %ebx
	subl	%ecx, %edi
	movl	%edi, 4(%esp)
	call	printf
	fildl	-2080(%ebp)
	movl	$.LC4, (%esp)
	pushl	%edi
	movl	$6, %edi
	fildl	(%esp)
	addl	$4, %esp
	fdivrp	%st, %st(1)
	fstpl	4(%esp)
	call	printf
	movl	%ebx, -988(%ebp)
	movl	$.LC8, (%esp)
	call	puts
	movl	%edi, -796(%ebp)
	xorl	%edi, %edi
	call	clock
	movl	%eax, -2084(%ebp)
.L68:
	movl	$256, 8(%esp)
	leal	-2072(%ebp), %edx
	incl	%edi
	movl	%edx, 4(%esp)
	leal	-1048(%ebp), %esi
	movl	%esi, (%esp)
	call	cmpa
	cmpl	$16777216, %edi
	jne	.L68
	call	clock
	movl	%eax, -2080(%ebp)
	xorl	%edi, %edi
	movl	-2084(%ebp), %eax
	subl	%eax, -2080(%ebp)
	movl	$.LC3, (%esp)
	movl	-2080(%ebp), %eax
	movl	%eax, 4(%esp)
	call	printf
	call	clock
	movl	%eax, -2084(%ebp)
.L73:
	movl	$256, 8(%esp)
	leal	-1048(%ebp), %ecx
	incl	%edi
	movl	%ecx, (%esp)
	leal	-2072(%ebp), %ebx
	movl	%ebx, 4(%esp)
	call	cmpa2
	cmpl	$16777216, %edi
	jne	.L73
	call	clock
	movl	$.LC3, (%esp)
	movl	%eax, %edi
	movl	-2084(%ebp), %eax
	subl	%eax, %edi
	movl	%edi, 4(%esp)
	call	printf
	fildl	-2080(%ebp)
	movl	$.LC4, (%esp)
	pushl	%edi
	fildl	(%esp)
	addl	$4, %esp
	fdivrp	%st, %st(1)
	fstpl	4(%esp)
	call	printf
	leal	-12(%ebp), %esp
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret
.Lfe5:
	.size	main,.Lfe5-main
	.ident	"GCC: (GNU) 3.2 20020903 (Red Hat Linux 8.0 3.2-7)"

next prev parent reply	other threads:[~2003-04-04 14:33 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2003-04-04 13:18 Roger Sayle
2003-04-04 14:51 ` Kevin Atkinson
2003-04-04 17:08 ` Kevin Atkinson [this message]
2003-04-04 18:58   ` Marcel Cox
2003-04-04 19:27     ` Kevin Atkinson
  -- strict thread matches above, loose matches on Subject: below --
2003-04-04 17:13 Jerry Quinn
2003-04-04 17:29 ` Kevin Atkinson
2003-04-06  5:54   ` Jerry Quinn
2003-04-06 22:17     ` Kevin Atkinson
2003-04-07 17:30       ` Jerry Quinn
2003-04-04 14:33 Bonzini
2003-04-04  6:22 Kevin Atkinson
2003-04-04  8:52 ` Zack Weinberg
2003-04-04 15:16   ` Kevin Atkinson

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=Pine.LNX.4.44.0304040923350.912-300000@kevin-pc.atkinson.dhs.org \
    --to=kevina@gnu.org \
    --cc=gcc@gcc.gnu.org \
    --cc=roger@www.eyesopen.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).