public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed
* [PATCH] x86_64: Remove 9 REX bytes from memchr.S
@ 2017-05-20 14:50 H.J. Lu
  2017-05-20 14:59 ` Zack Weinberg
  0 siblings, 1 reply; 5+ messages in thread
From: H.J. Lu @ 2017-05-20 14:50 UTC (permalink / raw)
  To: GNU C Library

There is no need to use 64-bit registers when only the lower 32 bits
are non-zero.

Tested on x86-64.  OK for master?

H.J.
---
	* sysdeps/x86_64/memchr.S (MEMCHR): Use 32-bit registers for
	the lower 32 bits.
---
 sysdeps/x86_64/memchr.S | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
index 8242f2d..f1dad9e 100644
--- a/sysdeps/x86_64/memchr.S
+++ b/sysdeps/x86_64/memchr.S
@@ -44,10 +44,10 @@ ENTRY(MEMCHR)
 	punpcklbw %xmm1, %xmm1
 #endif
 
-	and	$63, %rcx
+	and	$63, %ecx
 	pshufd	$0, %xmm1, %xmm1
 
-	cmp	$48, %rcx
+	cmp	$48, %ecx
 	ja	L(crosscache)
 
 	movdqu	(%rdi), %xmm0
@@ -59,7 +59,7 @@ ENTRY(MEMCHR)
 	sub	$16, %rdx
 	jbe	L(return_null)
 	add	$16, %rdi
-	and	$15, %rcx
+	and	$15, %ecx
 	and	$-16, %rdi
 	add	%rcx, %rdx
 	sub	$64, %rdx
@@ -68,7 +68,7 @@ ENTRY(MEMCHR)
 
 	.p2align 4
 L(crosscache):
-	and	$15, %rcx
+	and	$15, %ecx
 	and	$-16, %rdi
 	movdqa	(%rdi), %xmm0
 
@@ -162,7 +162,7 @@ L(loop_prolog):
 
 	mov	%rdi, %rcx
 	and	$-64, %rdi
-	and	$63, %rcx
+	and	$63, %ecx
 	add	%rcx, %rdx
 
 	.p2align 4
@@ -214,7 +214,7 @@ L(align64_loop):
 
 	.p2align 4
 L(exit_loop):
-	add	$32, %rdx
+	add	$32, %edx
 	jle	L(exit_loop_32)
 
 	movdqa	(%rdi), %xmm0
@@ -234,7 +234,7 @@ L(exit_loop):
 	pmovmskb %xmm3, %eax
 	test	%eax, %eax
 	jnz	L(matches32_1)
-	sub	$16, %rdx
+	sub	$16, %edx
 	jle	L(return_null)
 
 	PCMPEQ	48(%rdi), %xmm1
@@ -246,13 +246,13 @@ L(exit_loop):
 
 	.p2align 4
 L(exit_loop_32):
-	add	$32, %rdx
+	add	$32, %edx
 	movdqa	(%rdi), %xmm0
 	PCMPEQ	%xmm1, %xmm0
 	pmovmskb %xmm0, %eax
 	test	%eax, %eax
 	jnz	L(matches_1)
-	sub	$16, %rdx
+	sub	$16, %edx
 	jbe	L(return_null)
 
 	PCMPEQ	16(%rdi), %xmm1
-- 
2.9.4

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] x86_64: Remove 9 REX bytes from memchr.S
  2017-05-20 14:50 [PATCH] x86_64: Remove 9 REX bytes from memchr.S H.J. Lu
@ 2017-05-20 14:59 ` Zack Weinberg
  2017-05-20 19:58   ` H.J. Lu
  0 siblings, 1 reply; 5+ messages in thread
From: Zack Weinberg @ 2017-05-20 14:59 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library

On Sat, May 20, 2017 at 10:50 AM, H.J. Lu <hongjiu.lu@intel.com> wrote:
> There is no need to use 64-bit registers when only the lower 32 bits
> are non-zero.

This code is used generically for x86-64, not for a specific
microarchitecture. Is there a reason why this will never cause partial
register stalls, now or in the future?

zw

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] x86_64: Remove 9 REX bytes from memchr.S
  2017-05-20 14:59 ` Zack Weinberg
@ 2017-05-20 19:58   ` H.J. Lu
  2017-05-24 14:55     ` H.J. Lu
  0 siblings, 1 reply; 5+ messages in thread
From: H.J. Lu @ 2017-05-20 19:58 UTC (permalink / raw)
  To: Zack Weinberg; +Cc: GNU C Library

On Sat, May 20, 2017 at 7:59 AM, Zack Weinberg <zackw@panix.com> wrote:
> On Sat, May 20, 2017 at 10:50 AM, H.J. Lu <hongjiu.lu@intel.com> wrote:
>> There is no need to use 64-bit registers when only the lower 32 bits
>> are non-zero.
>
> This code is used generically for x86-64, not for a specific
> microarchitecture. Is there a reason why this will never cause partial
> register stalls, now or in the future?

By x86-64 specification, writes to 32-bit destination registers in these
instructions are zero-extended to 64 bits, so there is no partial-register
stall at all.


-- 
H.J.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] x86_64: Remove 9 REX bytes from memchr.S
  2017-05-20 19:58   ` H.J. Lu
@ 2017-05-24 14:55     ` H.J. Lu
  2017-05-30 19:28       ` H.J. Lu
  0 siblings, 1 reply; 5+ messages in thread
From: H.J. Lu @ 2017-05-24 14:55 UTC (permalink / raw)
  To: Zack Weinberg; +Cc: GNU C Library

[-- Attachment #1: Type: text/plain, Size: 734 bytes --]

On Sat, May 20, 2017 at 12:58 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Sat, May 20, 2017 at 7:59 AM, Zack Weinberg <zackw@panix.com> wrote:
>> On Sat, May 20, 2017 at 10:50 AM, H.J. Lu <hongjiu.lu@intel.com> wrote:
>>> There is no need to use 64-bit registers when only the lower 32 bits
>>> are non-zero.
>>
>> This code is used generically for x86-64, not for a specific
>> microarchitecture. Is there a reason why this will never cause partial
>> register stalls, now or in the future?
>
> By x86-64 specification, writes to 32-bit destination registers in these
> instructions are zero-extended to 64 bits, so there is no partial-register
> stall at all.
>

Here is the updated patch with one more REX byte removed.

Any other comments?


-- 
H.J.

[-- Attachment #2: 0001-x86_64-Remove-redundant-REX-bytes-from-memchr.S.patch --]
[-- Type: text/x-patch, Size: 2126 bytes --]

From 2db36b54338f65a803080ce8085977299a4f52a0 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Thu, 18 May 2017 12:22:31 -0700
Subject: [PATCH] x86_64: Remove redundant REX bytes from memchr.S

By x86-64 specification, 32-bit destination registers are zero-extended
to 64 bits.  There is no need to use 64-bit registers when only the lower
32 bits are non-zero.

	* sysdeps/x86_64/memchr.S (MEMCHR): Use 32-bit registers for
	the lower 32 bits.
---
 sysdeps/x86_64/memchr.S | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
index 8242f2d..77a71ae 100644
--- a/sysdeps/x86_64/memchr.S
+++ b/sysdeps/x86_64/memchr.S
@@ -31,7 +31,7 @@
 	.text
 ENTRY(MEMCHR)
 	movd	%esi, %xmm1
-	mov	%rdi, %rcx
+	mov	%edi, %ecx
 
 #ifdef USE_AS_WMEMCHR
 	test	%rdx, %rdx
@@ -44,10 +44,10 @@ ENTRY(MEMCHR)
 	punpcklbw %xmm1, %xmm1
 #endif
 
-	and	$63, %rcx
+	and	$63, %ecx
 	pshufd	$0, %xmm1, %xmm1
 
-	cmp	$48, %rcx
+	cmp	$48, %ecx
 	ja	L(crosscache)
 
 	movdqu	(%rdi), %xmm0
@@ -59,7 +59,7 @@ ENTRY(MEMCHR)
 	sub	$16, %rdx
 	jbe	L(return_null)
 	add	$16, %rdi
-	and	$15, %rcx
+	and	$15, %ecx
 	and	$-16, %rdi
 	add	%rcx, %rdx
 	sub	$64, %rdx
@@ -68,7 +68,7 @@ ENTRY(MEMCHR)
 
 	.p2align 4
 L(crosscache):
-	and	$15, %rcx
+	and	$15, %ecx
 	and	$-16, %rdi
 	movdqa	(%rdi), %xmm0
 
@@ -162,7 +162,7 @@ L(loop_prolog):
 
 	mov	%rdi, %rcx
 	and	$-64, %rdi
-	and	$63, %rcx
+	and	$63, %ecx
 	add	%rcx, %rdx
 
 	.p2align 4
@@ -214,7 +214,7 @@ L(align64_loop):
 
 	.p2align 4
 L(exit_loop):
-	add	$32, %rdx
+	add	$32, %edx
 	jle	L(exit_loop_32)
 
 	movdqa	(%rdi), %xmm0
@@ -234,7 +234,7 @@ L(exit_loop):
 	pmovmskb %xmm3, %eax
 	test	%eax, %eax
 	jnz	L(matches32_1)
-	sub	$16, %rdx
+	sub	$16, %edx
 	jle	L(return_null)
 
 	PCMPEQ	48(%rdi), %xmm1
@@ -246,13 +246,13 @@ L(exit_loop):
 
 	.p2align 4
 L(exit_loop_32):
-	add	$32, %rdx
+	add	$32, %edx
 	movdqa	(%rdi), %xmm0
 	PCMPEQ	%xmm1, %xmm0
 	pmovmskb %xmm0, %eax
 	test	%eax, %eax
 	jnz	L(matches_1)
-	sub	$16, %rdx
+	sub	$16, %edx
 	jbe	L(return_null)
 
 	PCMPEQ	16(%rdi), %xmm1
-- 
2.9.4


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] x86_64: Remove 9 REX bytes from memchr.S
  2017-05-24 14:55     ` H.J. Lu
@ 2017-05-30 19:28       ` H.J. Lu
  0 siblings, 0 replies; 5+ messages in thread
From: H.J. Lu @ 2017-05-30 19:28 UTC (permalink / raw)
  To: Zack Weinberg; +Cc: GNU C Library

[-- Attachment #1: Type: text/plain, Size: 862 bytes --]

On Wed, May 24, 2017 at 7:55 AM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Sat, May 20, 2017 at 12:58 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
>> On Sat, May 20, 2017 at 7:59 AM, Zack Weinberg <zackw@panix.com> wrote:
>>> On Sat, May 20, 2017 at 10:50 AM, H.J. Lu <hongjiu.lu@intel.com> wrote:
>>>> There is no need to use 64-bit registers when only the lower 32 bits
>>>> are non-zero.
>>>
>>> This code is used generically for x86-64, not for a specific
>>> microarchitecture. Is there a reason why this will never cause partial
>>> register stalls, now or in the future?
>>
>> By x86-64 specification, writes to 32-bit destination registers in these
>> instructions are zero-extended to 64 bits, so there is no partial-register
>> stall at all.
>>
>
> Here is the updated patch with one more REX byte removed.
>
> Any other comments?
>

This is the patch I am checking in.

-- 
H.J.

[-- Attachment #2: 0001-x86_64-Remove-redundant-REX-bytes-from-memchr.S.patch --]
[-- Type: text/x-patch, Size: 2440 bytes --]

From 0b4aae9e15eeb63419dc1df2578b3df50aae7edf Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Thu, 18 May 2017 12:22:31 -0700
Subject: [PATCH] x86_64: Remove redundant REX bytes from memchr.S

By x86-64 specification, 32-bit destination registers are zero-extended
to 64 bits.  There is no need to use 64-bit registers when only the lower
32 bits are non-zero.

	* sysdeps/x86_64/memchr.S (MEMCHR): Use 32-bit registers for
	the lower 32 bits.
---
 sysdeps/x86_64/memchr.S | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
index 8242f2d..3167cd8 100644
--- a/sysdeps/x86_64/memchr.S
+++ b/sysdeps/x86_64/memchr.S
@@ -31,7 +31,7 @@
 	.text
 ENTRY(MEMCHR)
 	movd	%esi, %xmm1
-	mov	%rdi, %rcx
+	mov	%edi, %ecx
 
 #ifdef USE_AS_WMEMCHR
 	test	%rdx, %rdx
@@ -44,10 +44,10 @@ ENTRY(MEMCHR)
 	punpcklbw %xmm1, %xmm1
 #endif
 
-	and	$63, %rcx
+	and	$63, %ecx
 	pshufd	$0, %xmm1, %xmm1
 
-	cmp	$48, %rcx
+	cmp	$48, %ecx
 	ja	L(crosscache)
 
 	movdqu	(%rdi), %xmm0
@@ -59,7 +59,7 @@ ENTRY(MEMCHR)
 	sub	$16, %rdx
 	jbe	L(return_null)
 	add	$16, %rdi
-	and	$15, %rcx
+	and	$15, %ecx
 	and	$-16, %rdi
 	add	%rcx, %rdx
 	sub	$64, %rdx
@@ -68,7 +68,7 @@ ENTRY(MEMCHR)
 
 	.p2align 4
 L(crosscache):
-	and	$15, %rcx
+	and	$15, %ecx
 	and	$-16, %rdi
 	movdqa	(%rdi), %xmm0
 
@@ -162,7 +162,7 @@ L(loop_prolog):
 
 	mov	%rdi, %rcx
 	and	$-64, %rdi
-	and	$63, %rcx
+	and	$63, %ecx
 	add	%rcx, %rdx
 
 	.p2align 4
@@ -214,7 +214,7 @@ L(align64_loop):
 
 	.p2align 4
 L(exit_loop):
-	add	$32, %rdx
+	add	$32, %edx
 	jle	L(exit_loop_32)
 
 	movdqa	(%rdi), %xmm0
@@ -234,32 +234,32 @@ L(exit_loop):
 	pmovmskb %xmm3, %eax
 	test	%eax, %eax
 	jnz	L(matches32_1)
-	sub	$16, %rdx
+	sub	$16, %edx
 	jle	L(return_null)
 
 	PCMPEQ	48(%rdi), %xmm1
 	pmovmskb %xmm1, %eax
 	test	%eax, %eax
 	jnz	L(matches48_1)
-	xor	%rax, %rax
+	xor	%eax, %eax
 	ret
 
 	.p2align 4
 L(exit_loop_32):
-	add	$32, %rdx
+	add	$32, %edx
 	movdqa	(%rdi), %xmm0
 	PCMPEQ	%xmm1, %xmm0
 	pmovmskb %xmm0, %eax
 	test	%eax, %eax
 	jnz	L(matches_1)
-	sub	$16, %rdx
+	sub	$16, %edx
 	jbe	L(return_null)
 
 	PCMPEQ	16(%rdi), %xmm1
 	pmovmskb %xmm1, %eax
 	test	%eax, %eax
 	jnz	L(matches16_1)
-	xor	%rax, %rax
+	xor	%eax, %eax
 	ret
 
 	.p2align 4
@@ -320,7 +320,7 @@ L(matches48_1):
 
 	.p2align 4
 L(return_null):
-	xor	%rax, %rax
+	xor	%eax, %eax
 	ret
 END(MEMCHR)
 
-- 
2.9.4


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2017-05-30 19:28 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-05-20 14:50 [PATCH] x86_64: Remove 9 REX bytes from memchr.S H.J. Lu
2017-05-20 14:59 ` Zack Weinberg
2017-05-20 19:58   ` H.J. Lu
2017-05-24 14:55     ` H.J. Lu
2017-05-30 19:28       ` H.J. Lu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).