From: Kevin Atkinson <kevina@gnu.org>
To: Roger Sayle <roger@www.eyesopen.com>
Cc: gcc@gcc.gnu.org
Subject: Re: Slow memcmp for aligned strings on Pentium 3
Date: Fri, 04 Apr 2003 17:08:00 -0000 [thread overview]
Message-ID: <Pine.LNX.4.44.0304040923350.912-300000@kevin-pc.atkinson.dhs.org> (raw)
In-Reply-To: <Pine.LNX.4.44.0304032110020.7684-100000@www.eyesopen.com>
[-- Attachment #1: Type: TEXT/PLAIN, Size: 1831 bytes --]
On Thu, 3 Apr 2003, Roger Sayle wrote:
> I suspect it should be possible to fix your code to handle these
> termination conditions correctly, and a comparison of your
> routine's performance with these fixes vs. __builtin_memcmp
> would be of interest.
Here you go. Still assume the memory is alligned:
This is what I did:
int cmpa2(const unsigned int * x, const unsigned int * y, size_t size)
{
int i = 0;
size_t s = size / 4;
while (i < s && x[i] == y[i]) ++i;
size -= i * 4;
if (size == 0) return 0;
// hopefully if this is inline expanded when size is known
// the compiler can eliminate many of these conditionals
else if (size >= 4) { // if original size % 4 == 0 this should
// always be the case
unsigned int xx = x[i], yy = y[i];
asm("bswap %0" : "+r"(xx));
asm("bswap %0" : "+r"(yy));
return xx - yy;
} else {
const unsigned char * xb = (const unsigned char *)(x + i);
const unsigned char * yb = (const unsigned char *)(y + i);
// if size is known at compile time then the compiler should be
// able to select the correct choice at compile time
switch (size) {
case 1:
return *xb - *yb;
case 2:
return ((xb[0] - yb[0]) << 8) + (xb[1] - yb[1]);
case 3:
return ((xb[0] - yb[0]) << 16) + ((xb[1] - yb[1]) << 8)
+ xb[2] - yb[2];}
}
}
compiled with gcc -fomit-frame-pointer -O3 -march=pentium3 cmps.c
Memory compare int:
2160000
360000
Speed up: 6.000000
Memory compare 15 bytes:
4120000
1550000
Speed up: 2.658065
Memory compare 16 bytes:
4150000
1310000
Speed up: 3.167939
Memory compare 64 bytes:
11240000
3470000
Speed up: 3.239193
Memory compare 256 bytes:
37930000
10050000
Speed up: 3.774129
Code and assembly output attached.
--
http://kevin.atkinson.dhs.org
[-- Attachment #2: Type: TEXT/PLAIN, Size: 3315 bytes --]
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
__attribute__((noinline))
int cmpi(unsigned int x, unsigned int y)
{
return __builtin_memcmp(&x,&y,4);
}
__attribute__((noinline))
int cmpi2(unsigned int x, unsigned int y)
{
asm("bswap %0" : "+r"(x));
asm("bswap %0" : "+r"(y));
return x - y;
}
__attribute__((noinline))
int cmpa(const unsigned int * x, const unsigned int * y, size_t size)
{
return __builtin_memcmp(x,y,size);
}
__attribute__((noinline))
int cmpa2(const unsigned int * x, const unsigned int * y, size_t size)
{
int i = 0;
size_t s = size / 4;
while (i < s && x[i] == y[i]) ++i;
size -= i * 4;
if (size == 0) return 0;
// hopefully if this is inline expanded when size is known
// the compiler can eliminate many of these conditionals
else if (size >= 4) { // if original size % 4 == 0 this should
// always be the case
unsigned int xx = x[i], yy = y[i];
asm("bswap %0" : "+r"(xx));
asm("bswap %0" : "+r"(yy));
return xx - yy;
} else {
const unsigned char * xb = (const unsigned char *)(x + i);
const unsigned char * yb = (const unsigned char *)(y + i);
// if size is known at compile time then the compiler should be
// able to select the correct choice at compile time
switch (size) {
case 1:
return *xb - *yb;
case 2:
return ((xb[0] - yb[0]) << 8) + (xb[1] - yb[1]);
case 3:
return ((xb[0] - yb[0]) << 16) + ((xb[1] - yb[1]) << 8)
+ xb[2] - yb[2];}
}
}
int main()
{
unsigned int x,y,i;
int xa[256] = {}, ya[256] = {};
int s,f;
clock_t t;
x = *(unsigned int *)("a b");
y = *(unsigned int *)("b a");
printf("Memory compare int:\n");
t = clock();
for (i = 0; i != 0x1000000; ++i)
cmpi(x,y);
s = clock() - t;
printf(" %d\n", s);
t = clock();
for (i = 0; i != 0x1000000; ++i)
cmpi2(x,y);
f = clock() - t;
printf(" %d\n",f);
printf(" Speed up: %f\n", (double)s/f);
printf("Memory compare 15 bytes:\n");
xa[3] = 6;
t = clock();
for (i = 0; i != 0x1000000; ++i)
cmpa(xa, ya, 15);
s = clock() - t;
printf(" %d\n", s);
t = clock();
for (i = 0; i != 0x1000000; ++i)
cmpa2(xa, ya, 15);
f = clock() - t;
printf(" %d\n",f);
printf(" Speed up: %f\n", (double)s/f);
xa[3] = 0;
printf("Memory compare 16 bytes:\n");
xa[3] = 6;
t = clock();
for (i = 0; i != 0x1000000; ++i)
cmpa(xa, ya, 16);
s = clock() - t;
printf(" %d\n", s);
t = clock();
for (i = 0; i != 0x1000000; ++i)
cmpa2(xa, ya, 16);
f = clock() - t;
printf(" %d\n",f);
printf(" Speed up: %f\n", (double)s/f);
xa[3] = 0;
printf("Memory compare 64 bytes:\n");
xa[15] = 6;
t = clock();
for (i = 0; i != 0x1000000; ++i)
cmpa(xa, ya, 64);
s = clock() - t;
printf(" %d\n", s);
t = clock();
for (i = 0; i != 0x1000000; ++i)
cmpa2(xa, ya, 64);
f = clock() - t;
printf(" %d\n",f);
printf(" Speed up: %f\n", (double)s/f);
xa[15] = 0;
printf("Memory compare 256 bytes:\n");
xa[63] = 6;
t = clock();
for (i = 0; i != 0x1000000; ++i)
cmpa(xa, ya, 256);
s = clock() - t;
printf(" %d\n", s);
t = clock();
for (i = 0; i != 0x1000000; ++i)
cmpa2(xa, ya, 256);
f = clock() - t;
printf(" %d\n",f);
printf(" Speed up: %f\n", (double)s/f);
}
[-- Attachment #3: Type: TEXT/PLAIN, Size: 7710 bytes --]
.file "cmps.c"
.text
.align 2
.p2align 4,,15
.globl cmpi
.type cmpi,@function
cmpi:
cld
subl $8, %esp
movl $4, %ecx
movl %esi, (%esp)
leal 12(%esp), %esi
movl %edi, 4(%esp)
leal 16(%esp), %edi
repz
cmpsb
movl 4(%esp), %edi
movl (%esp), %esi
seta %dl
setb %cl
addl $8, %esp
subb %cl, %dl
movsbl %dl,%eax
ret
.Lfe1:
.size cmpi,.Lfe1-cmpi
.align 2
.p2align 4,,15
.globl cmpi2
.type cmpi2,@function
cmpi2:
movl 4(%esp), %eax
movl 8(%esp), %ecx
#APP
bswap %eax
bswap %ecx
#NO_APP
subl %ecx, %eax
ret
.Lfe2:
.size cmpi2,.Lfe2-cmpi2
.align 2
.p2align 4,,15
.globl cmpa
.type cmpa,@function
cmpa:
cld
subl $8, %esp
movl 20(%esp), %ecx
movl %esi, (%esp)
movl 12(%esp), %esi
cmpl %ecx, %ecx
movl %edi, 4(%esp)
movl 16(%esp), %edi
repz
cmpsb
movl 4(%esp), %edi
movl (%esp), %esi
seta %dl
setb %cl
addl $8, %esp
subb %cl, %dl
movsbl %dl,%eax
ret
.Lfe3:
.size cmpa,.Lfe3-cmpa
.align 2
.p2align 4,,15
.globl cmpa2
.type cmpa2,@function
cmpa2:
pushl %edi
xorl %edx, %edx
pushl %esi
pushl %ebx
movl 24(%esp), %ecx
movl 16(%esp), %esi
movl 20(%esp), %edi
movl %ecx, %ebx
shrl $2, %ebx
cmpl %ebx, %edx
jae .L6
movl (%edi), %eax
cmpl %eax, (%esi)
je .L9
.L6:
leal 0(,%edx,4), %eax
subl %eax, %ecx
jne .L10
xorl %ecx, %ecx
.L4:
popl %ebx
movl %ecx, %eax
popl %esi
popl %edi
ret
.p2align 4,,7
.L10:
cmpl $3, %ecx
jbe .L12
movl (%esi,%edx,4), %ecx
movl (%edi,%edx,4), %eax
#APP
bswap %ecx
bswap %eax
#NO_APP
.L22:
subl %eax, %ecx
jmp .L4
.p2align 4,,7
.L12:
cmpl $2, %ecx
leal (%eax,%esi), %ebx
leal (%eax,%edi), %esi
je .L16
cmpl $2, %ecx
ja .L20
cmpl $1, %ecx
jne .L4
movzbl (%ebx), %ecx
movzbl (%esi), %eax
jmp .L22
.L20:
cmpl $3, %ecx
jne .L4
movzbl 1(%esi), %eax
movzbl (%ebx), %ecx
movzbl (%esi), %edx
movzbl 1(%ebx), %edi
subl %edx, %ecx
subl %eax, %edi
movzbl 2(%esi), %eax
sall $8, %edi
sall $16, %ecx
addl %edi, %ecx
movzbl 2(%ebx), %edi
addl %edi, %ecx
jmp .L22
.L16:
movzbl (%ebx), %ecx
movzbl (%esi), %edx
movzbl 1(%ebx), %edi
movzbl 1(%esi), %eax
subl %edx, %ecx
sall $8, %ecx
subl %eax, %edi
addl %edi, %ecx
jmp .L4
.p2align 4,,7
.L9:
incl %edx
cmpl %ebx, %edx
jae .L6
movl (%edi,%edx,4), %eax
cmpl %eax, (%esi,%edx,4)
je .L9
jmp .L6
.Lfe4:
.size cmpa2,.Lfe4-cmpa2
.section .rodata.str1.1,"aMS",@progbits,1
.LC0:
.string "a b"
.LC2:
.string "Memory compare int:"
.LC1:
.string "b a"
.LC3:
.string " %d\n"
.LC4:
.string " Speed up: %f\n"
.LC5:
.string "Memory compare 15 bytes:"
.LC6:
.string "Memory compare 16 bytes:"
.LC7:
.string "Memory compare 64 bytes:"
.LC8:
.string "Memory compare 256 bytes:"
.text
.align 2
.p2align 4,,15
.globl main
.type main,@function
main:
pushl %ebp
xorl %eax, %eax
movl %esp, %ebp
pushl %edi
movl $256, %ecx
leal -1048(%ebp), %edi
pushl %esi
pushl %ebx
subl $2092, %esp
andl $-16, %esp
cld
rep
stosl
movl $256, %ecx
leal -2072(%ebp), %edi
rep
stosl
xorl %edi, %edi
movl .LC0, %esi
movl $.LC2, (%esp)
movl .LC1, %ebx
call puts
call clock
movl %eax, -2084(%ebp)
.p2align 4,,15
.L28:
movl %esi, (%esp)
incl %edi
movl %ebx, 4(%esp)
call cmpi
cmpl $16777216, %edi
jne .L28
call clock
movl %eax, -2080(%ebp)
xorl %edi, %edi
movl -2084(%ebp), %eax
subl %eax, -2080(%ebp)
movl $.LC3, (%esp)
movl -2080(%ebp), %eax
movl %eax, 4(%esp)
call printf
call clock
movl %eax, -2084(%ebp)
.L33:
movl %esi, (%esp)
incl %edi
movl %ebx, 4(%esp)
call cmpi2
cmpl $16777216, %edi
jne .L33
call clock
movl $.LC3, (%esp)
movl %eax, %esi
movl -2084(%ebp), %eax
xorl %edi, %edi
subl %eax, %esi
movl %esi, 4(%esp)
call printf
fildl -2080(%ebp)
movl $.LC4, (%esp)
pushl %esi
fildl (%esp)
addl $4, %esp
fdivrp %st, %st(1)
fstpl 4(%esp)
call printf
movl $.LC5, (%esp)
call puts
movl $6, %eax
movl %eax, -1036(%ebp)
call clock
movl %eax, -2084(%ebp)
.L38:
movl $15, 8(%esp)
leal -1048(%ebp), %ecx
incl %edi
movl %ecx, (%esp)
leal -2072(%ebp), %edx
movl %edx, 4(%esp)
call cmpa
cmpl $16777216, %edi
jne .L38
call clock
movl %eax, -2080(%ebp)
xorl %edi, %edi
movl -2084(%ebp), %eax
subl %eax, -2080(%ebp)
movl $.LC3, (%esp)
movl -2080(%ebp), %ebx
movl %ebx, 4(%esp)
call printf
call clock
movl %eax, -2084(%ebp)
.L43:
movl $15, 8(%esp)
leal -1048(%ebp), %edx
incl %edi
movl %edx, (%esp)
leal -2072(%ebp), %esi
movl %esi, 4(%esp)
call cmpa2
cmpl $16777216, %edi
jne .L43
call clock
movl $.LC3, (%esp)
movl %eax, %esi
movl -2084(%ebp), %eax
xorl %edi, %edi
subl %eax, %esi
movl %esi, 4(%esp)
call printf
fildl -2080(%ebp)
movl $.LC4, (%esp)
pushl %esi
fildl (%esp)
addl $4, %esp
fdivrp %st, %st(1)
fstpl 4(%esp)
call printf
movl %edi, -1036(%ebp)
xorl %edi, %edi
movl $.LC6, (%esp)
call puts
movl $6, %eax
movl %eax, -1036(%ebp)
call clock
movl %eax, -2084(%ebp)
.L48:
movl $16, 8(%esp)
leal -1048(%ebp), %ecx
incl %edi
movl %ecx, (%esp)
leal -2072(%ebp), %eax
movl %eax, 4(%esp)
call cmpa
cmpl $16777216, %edi
jne .L48
call clock
movl %eax, -2080(%ebp)
xorl %edi, %edi
movl -2084(%ebp), %eax
subl %eax, -2080(%ebp)
movl $.LC3, (%esp)
movl -2080(%ebp), %ebx
movl %ebx, 4(%esp)
call printf
call clock
movl %eax, -2084(%ebp)
.L53:
movl $16, 8(%esp)
leal -2072(%ebp), %edx
incl %edi
movl %edx, 4(%esp)
leal -1048(%ebp), %esi
movl %esi, (%esp)
call cmpa2
cmpl $16777216, %edi
jne .L53
call clock
movl $.LC3, (%esp)
movl %eax, %esi
movl -2084(%ebp), %eax
xorl %edi, %edi
subl %eax, %esi
movl %esi, 4(%esp)
call printf
fildl -2080(%ebp)
movl $.LC4, (%esp)
pushl %esi
fildl (%esp)
addl $4, %esp
fdivrp %st, %st(1)
fstpl 4(%esp)
call printf
movl %edi, -1036(%ebp)
xorl %edi, %edi
movl $.LC7, (%esp)
call puts
movl $6, %eax
movl %eax, -988(%ebp)
call clock
movl %eax, -2084(%ebp)
.L58:
movl $64, 8(%esp)
leal -1048(%ebp), %ecx
incl %edi
movl %ecx, (%esp)
leal -2072(%ebp), %ebx
movl %ebx, 4(%esp)
call cmpa
cmpl $16777216, %edi
jne .L58
call clock
movl %eax, -2080(%ebp)
xorl %edi, %edi
movl -2084(%ebp), %eax
subl %eax, -2080(%ebp)
movl $.LC3, (%esp)
movl -2080(%ebp), %edx
movl %edx, 4(%esp)
call printf
call clock
movl %eax, -2084(%ebp)
.L63:
movl $64, 8(%esp)
leal -2072(%ebp), %eax
incl %edi
movl %eax, 4(%esp)
leal -1048(%ebp), %esi
movl %esi, (%esp)
call cmpa2
cmpl $16777216, %edi
jne .L63
call clock
movl $.LC3, (%esp)
movl -2084(%ebp), %ecx
movl %eax, %edi
xorl %ebx, %ebx
subl %ecx, %edi
movl %edi, 4(%esp)
call printf
fildl -2080(%ebp)
movl $.LC4, (%esp)
pushl %edi
movl $6, %edi
fildl (%esp)
addl $4, %esp
fdivrp %st, %st(1)
fstpl 4(%esp)
call printf
movl %ebx, -988(%ebp)
movl $.LC8, (%esp)
call puts
movl %edi, -796(%ebp)
xorl %edi, %edi
call clock
movl %eax, -2084(%ebp)
.L68:
movl $256, 8(%esp)
leal -2072(%ebp), %edx
incl %edi
movl %edx, 4(%esp)
leal -1048(%ebp), %esi
movl %esi, (%esp)
call cmpa
cmpl $16777216, %edi
jne .L68
call clock
movl %eax, -2080(%ebp)
xorl %edi, %edi
movl -2084(%ebp), %eax
subl %eax, -2080(%ebp)
movl $.LC3, (%esp)
movl -2080(%ebp), %eax
movl %eax, 4(%esp)
call printf
call clock
movl %eax, -2084(%ebp)
.L73:
movl $256, 8(%esp)
leal -1048(%ebp), %ecx
incl %edi
movl %ecx, (%esp)
leal -2072(%ebp), %ebx
movl %ebx, 4(%esp)
call cmpa2
cmpl $16777216, %edi
jne .L73
call clock
movl $.LC3, (%esp)
movl %eax, %edi
movl -2084(%ebp), %eax
subl %eax, %edi
movl %edi, 4(%esp)
call printf
fildl -2080(%ebp)
movl $.LC4, (%esp)
pushl %edi
fildl (%esp)
addl $4, %esp
fdivrp %st, %st(1)
fstpl 4(%esp)
call printf
leal -12(%ebp), %esp
popl %ebx
popl %esi
popl %edi
popl %ebp
ret
.Lfe5:
.size main,.Lfe5-main
.ident "GCC: (GNU) 3.2 20020903 (Red Hat Linux 8.0 3.2-7)"
next prev parent reply other threads:[~2003-04-04 14:33 UTC|newest]
Thread overview: 14+ messages / expand[flat|nested] mbox.gz Atom feed top
2003-04-04 13:18 Roger Sayle
2003-04-04 14:51 ` Kevin Atkinson
2003-04-04 17:08 ` Kevin Atkinson [this message]
2003-04-04 18:58 ` Marcel Cox
2003-04-04 19:27 ` Kevin Atkinson
-- strict thread matches above, loose matches on Subject: below --
2003-04-04 17:13 Jerry Quinn
2003-04-04 17:29 ` Kevin Atkinson
2003-04-06 5:54 ` Jerry Quinn
2003-04-06 22:17 ` Kevin Atkinson
2003-04-07 17:30 ` Jerry Quinn
2003-04-04 14:33 Bonzini
2003-04-04 6:22 Kevin Atkinson
2003-04-04 8:52 ` Zack Weinberg
2003-04-04 15:16 ` Kevin Atkinson
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=Pine.LNX.4.44.0304040923350.912-300000@kevin-pc.atkinson.dhs.org \
--to=kevina@gnu.org \
--cc=gcc@gcc.gnu.org \
--cc=roger@www.eyesopen.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).