From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 1107 invoked by alias); 4 Apr 2003 02:13:35 -0000 Mailing-List: contact gcc-help@gcc.gnu.org; run by ezmlm Precedence: bulk List-Archive: List-Post: List-Help: Sender: gcc-owner@gcc.gnu.org Received: (qmail 1096 invoked from network); 4 Apr 2003 02:13:31 -0000 Received: from unknown (HELO smtp-out.comcast.net) (24.153.64.109) by sources.redhat.com with SMTP; 4 Apr 2003 02:13:31 -0000 Received: from master.atkinson.dhs.org (pcp219109pcs.elkrdg01.md.comcast.net [68.55.220.142]) by mtaout11.icomcast.net (iPlanet Messaging Server 5.2 HotFix 1.14 (built Mar 18 2003)) with ESMTP id <0HCS0023ZREHJY@mtaout11.icomcast.net> for gcc@gcc.gnu.org; Thu, 03 Apr 2003 21:11:06 -0500 (EST) Received: from kevin-pc.atkinson.dhs.org (kevin-pc.atkinson.dhs.org [192.168.1.3]) by master.atkinson.dhs.org (Postfix) with ESMTP id 69992B84C for ; Thu, 03 Apr 2003 21:11:05 -0500 (EST) Date: Fri, 04 Apr 2003 06:22:00 -0000 From: Kevin Atkinson Subject: Slow memcmp for aligned strings on Pentium 3 X-X-Sender: kevina@kevin-pc.atkinson.dhs.org To: gcc@gcc.gnu.org Message-id: MIME-version: 1.0 Content-type: multipart/mixed; boundary="Boundary_(ID_Xg3uWjAzDwNb8LkvCa6Pww)" X-SW-Source: 2003-04/txt/msg00170.txt.bz2 This message is in MIME format. The first part should be readable text, while the remaining parts are likely unreadable without MIME-aware tools. Send mail to mime@docserver.cac.washington.edu for more info. --Boundary_(ID_Xg3uWjAzDwNb8LkvCa6Pww) Content-type: TEXT/PLAIN; charset=US-ASCII Content-transfer-encoding: 7BIT Content-length: 4774 I was doing some tests with with memcmp and it seams that gcc 3.2.2 always uses cmps, even for aligned strings. I did some tests and discovered that using cmps was rather slow, compared to a simple loop and then a bswap and subtract at the end. My method was is the following: int cmpa2(unsigned int * x, unsigned int * y, size_t size) { int i = 0; size /= 4; while (i < size && x[i] == y[i]) ++i; if (i == size) return 0; asm("bswap %0" : "+r"(x[i])); asm("bswap %0" : "+r"(y[i])); return x[i] - y[i]; } as oppose to: int cmpa(unsigned int * x, unsigned int * y, size_t size) { return __builtin_memcmp(x,y,size); } Even though It takes integer pointers it really is comparing them as if they were byte strings. I used integer points to avoid having to cast and to let the compiler know that the strings are 4 byte aligned. This method is 3-5 times as fast. The longer the string the faster my method is. I also wrote on specialized compare for comparing two ints AS IF they were a 4 byte string: int cmpi2(unsigned int x, unsigned int y) { asm("bswap %0" : "+r"(x)); asm("bswap %0" : "+r"(y)); return x - y; } as oppose to: int cmpi(unsigned int x, unsigned int y) { return __builtin_memcmp(&x,&y,4); } Which was several order of magnitude faster. It is 4 - 6 times faster. However I believe the function call is significant here as -fomit-frame-pointer increased the ratio (from 4 to 6). And when I tried to account for the cost of the function call the results varies form any where to 10 to 30 times faster. If a better memcmp is not already implemented in yet-to-be-released gcc I hope someone will seriously consider rewriting the memcpy implementation. Even though I only tested in on a Pentium 3 I am sure the results will also hold for a Pentium Classic based on the instruction timings. I believe th bswap instruction was implemented in the 486. So only generic 386 code will have to use something other than bswap. The test program is attached. Here are the results when compiled with: gcc -fomit-frame-pointer -O3 -march=pentium3 cmps.c (attribute noinline was used on test functions) Memory compare int: 2170000 350000 Speed up: 6.200000 Memory compare 16 bytes: 4100000 1240000 Speed up: 3.306452 Memory compare 64 bytes: 10930000 2990000 Speed up: 3.655518 Memory compare 256 bytes: 37510000 7700000 Speed up: 4.871429 Here is the relevant assembly code: cmpi: // integer as byte string compare using memcmp cld subl $8, %esp movl $4, %ecx movl %esi, (%esp) leal 12(%esp), %esi movl %edi, 4(%esp) leal 16(%esp), %edi repz cmpsb movl 4(%esp), %edi movl (%esp), %esi seta %dl setb %cl addl $8, %esp subb %cl, %dl movsbl %dl,%eax ret cmpi: // interger as byte string compare my method movl 4(%esp), %eax movl 8(%esp), %ecx bswap %eax bswap %ecx subl %ecx, %eax ret cmpa: // aligned byte compare using memcmp cld subl $8, %esp movl 20(%esp), %ecx movl %esi, (%esp) movl 12(%esp), %esi cmpl %ecx, %ecx movl %edi, 4(%esp) movl 16(%esp), %edi repz cmpsb movl 4(%esp), %edi movl (%esp), %esi seta %dl setb %cl addl $8, %esp subb %cl, %dl movsbl %dl,%eax ret cmpa2: // aligned byte compare: my method subl $12, %esp movl 24(%esp), %ecx xorl %edx, %edx movl %esi, 4(%esp) movl 16(%esp), %esi shrl $2, %ecx movl %edi, 8(%esp) cmpl %ecx, %edx movl 20(%esp), %edi movl %ebx, (%esp) jae .L6 movl (%edi), %eax cmpl %eax, (%esi) je .L9 .L6: xorl %ebx, %ebx cmpl %ecx, %edx je .L4 movl (%esi,%edx,4), %eax bswap %eax movl %eax, (%esi,%edx,4) movl (%edi,%edx,4), %eax bswap %eax movl %eax, (%edi,%edx,4) movl (%esi,%edx,4), %ebx subl %eax, %ebx .L4: movl %ebx, %eax movl 4(%esp), %esi movl (%esp), %ebx movl 8(%esp), %edi addl $12, %esp ret .p2align 4,,7 .L9: incl %edx cmpl %ecx, %edx jae .L6 movl (%edi,%edx,4), %eax cmpl %eax, (%esi,%edx,4) je .L9 jmp .L6 --- http://kevin.atkinson.dhs.org --Boundary_(ID_Xg3uWjAzDwNb8LkvCa6Pww) Content-id: Content-type: TEXT/PLAIN; charset=US-ASCII; name=cmps.c Content-transfer-encoding: 7BIT Content-disposition: attachment; filename=cmps.c Content-description: Content-length: 2172 #include #include #include __attribute__((noinline)) int cmpi(unsigned int x, unsigned int y) { return __builtin_memcmp(&x,&y,4); } __attribute__((noinline)) int cmpi2(unsigned int x, unsigned int y) { asm("bswap %0" : "+r"(x)); asm("bswap %0" : "+r"(y)); return x - y; } __attribute__((noinline)) int cmpa(unsigned int * x, unsigned int * y, size_t size) { return __builtin_memcmp(x,y,size); } __attribute__((noinline)) int cmpa2(unsigned int * x, unsigned int * y, size_t size) { int i = 0; size /= 4; while (i < size && x[i] == y[i]) ++i; if (i == size) return 0; asm("bswap %0" : "+r"(x[i])); asm("bswap %0" : "+r"(y[i])); return x[i] - y[i]; } int main() { unsigned int x,y,i; int xa[256] = {}, ya[256] = {}; int s,f; clock_t t; x = *(unsigned int *)("a b"); y = *(unsigned int *)("b a"); printf("Memory compare int:\n"); t = clock(); for (i = 0; i != 0x1000000; ++i) cmpi(x,y); s = clock() - t; printf(" %d\n", s); t = clock(); for (i = 0; i != 0x1000000; ++i) cmpi2(x,y); f = clock() - t; printf(" %d\n",f); printf(" Speed up: %f\n", (double)s/f); printf("Memory compare 16 bytes:\n"); xa[3] = 6; t = clock(); for (i = 0; i != 0x1000000; ++i) cmpa(xa, ya, 16); s = clock() - t; printf(" %d\n", s); t = clock(); for (i = 0; i != 0x1000000; ++i) cmpa2(xa, ya, 16); f = clock() - t; printf(" %d\n",f); printf(" Speed up: %f\n", (double)s/f); xa[3] = 0; xa[15] = 6; printf("Memory compare 64 bytes:\n"); t = clock(); for (i = 0; i != 0x1000000; ++i) cmpa(xa, ya, 64); s = clock() - t; printf(" %d\n", s); t = clock(); for (i = 0; i != 0x1000000; ++i) cmpa2(xa, ya, 64); f = clock() - t; printf(" %d\n",f); printf(" Speed up: %f\n", (double)s/f); xa[15] = 0; xa[255] = 6; printf("Memory compare 256 bytes:\n"); t = clock(); for (i = 0; i != 0x1000000; ++i) cmpa(xa, ya, 256); s = clock() - t; printf(" %d\n", s); t = clock(); for (i = 0; i != 0x1000000; ++i) cmpa2(xa, ya, 256); f = clock() - t; printf(" %d\n",f); printf(" Speed up: %f\n", (double)s/f); } --Boundary_(ID_Xg3uWjAzDwNb8LkvCa6Pww)--