public inbox for libc-alpha@sourceware.org
* [PATCH, MPX] MPX-specific changes in dl_runtime routines
@ 2015-07-02 14:04 Zamyatin, Igor
  2015-07-02 14:33 ` H.J. Lu
  0 siblings, 1 reply; 27+ messages in thread
From: Zamyatin, Igor @ 2015-07-02 14:04 UTC (permalink / raw)
  To: libc-alpha

[-- Attachment #1: Type: text/plain, Size: 526 bytes --]

Hi!

This patch adds the changes necessary for the dl_runtime routines to work properly in MPX mode, for both 32-bit and 64-bit.
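
To illustrate the failure mode (a hypothetical example, not part of the
patch): with GCC's MPX instrumentation (-mmpx -fcheck-pointer-bounds),
pointer bounds travel in %bnd0-%bnd3 across calls, and any call, jmp or
ret executed without the BND (0xf2) prefix resets the bound registers
to [0, MAX].  The lazy-binding trampolines run on the first call
through the PLT, so without this patch they silently drop the bounds:

  #include <string.h>

  /* The bounds of 'buf' are passed to memchr in %bnd0.  In a shared
     build the first call typically goes through _dl_runtime_resolve
     (or _dl_runtime_profile under LD_AUDIT), which therefore must
     preserve them.  */
  char *find_nul (char *buf, size_t n)
  {
    return memchr (buf, '\0', n);
  }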

Is it ok for trunk?


Thanks,
Igor


2015-07-02  Igor Zamyatin  <igor.zamyatin@intel.com>
 
	* sysdeps/i386/dl-trampoline.S (_dl_runtime_profile): Save
	and restore Intel MPX return bound registers.
	* sysdeps/x86_64/dl-trampoline.h: Add PRESERVE_BND_REGS_PREFIX to
	call, jump and ret instructions to not lose bounds.
	* sysdeps/x86/bits/link.h (La_i86_retval): Add lrv_bnd0 and
	lrv_bnd1.

[-- Attachment #2: mpx_glibc_dl_runtime_all.patch --]
[-- Type: application/octet-stream, Size: 5359 bytes --]

diff --git a/ChangeLog b/ChangeLog
index 8a6c668..ea010a6 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,12 @@
+2015-07-01  Igor Zamyatin  <igor.zamyatin@intel.com>
+
+	* sysdeps/i386/dl-trampoline.S (_dl_runtime_profile): Save
+	and restore Intel MPX return bound registers.
+	* sysdeps/x86_64/dl-trampoline.h: Add PRESERVE_BND_REGS_PREFIX to
+	call, jump and ret instructions to not lose bounds.
+	* sysdeps/x86/bits/link.h (La_i86_retval): Add lrv_bnd0 and
+	lrv_bnd1.
+
 2015-06-30  Torvald Riegel  <triegel@redhat.com>
 
 	* nptl/DESIGN-systemtap-probes.txt: Remove lll_lock_wait,
diff --git a/sysdeps/i386/dl-trampoline.S b/sysdeps/i386/dl-trampoline.S
index f11972c..1c8e3d7 100644
--- a/sysdeps/i386/dl-trampoline.S
+++ b/sysdeps/i386/dl-trampoline.S
@@ -18,6 +18,12 @@
 
 #include <sysdep.h>
 
+#ifdef HAVE_MPX_SUPPORT
+# define PRESERVE_BND_REGS_PREFIX bnd
+#else
+# define PRESERVE_BND_REGS_PREFIX .byte 0xf2
+#endif
+
 	.text
 	.globl _dl_runtime_resolve
 	.type _dl_runtime_resolve, @function
@@ -161,24 +167,39 @@ _dl_runtime_profile:
 	    +4      free
 	   %esp     free
 	*/
-	subl $20, %esp
-	cfi_adjust_cfa_offset (20)
-	movl %eax, (%esp)
-	movl %edx, 4(%esp)
-	fstpt 8(%esp)
-	fstpt 20(%esp)
+	subl $36, %esp
+	cfi_adjust_cfa_offset (36)
+#ifdef HAVE_MPX_SUPPORT
+	bndmov %bnd0, (%esp)
+	bndmov %bnd1, 8(%esp)
+#else
+	.byte 0x66,0x0f,0x1b,0x04,0x24
+	.byte 0x66,0x0f,0x1b,0x4c,0x24,0x08
+#endif
+	movl %eax, 16(%esp)
+	movl %edx, 20(%esp)
+	fstpt 24(%esp)
+	fstpt 36(%esp)
 	pushl %esp
 	cfi_adjust_cfa_offset (4)
-	leal 36(%esp), %ecx
-	movl 56(%esp), %eax
-	movl 60(%esp), %edx
+	leal 52(%esp), %ecx
+	movl 72(%esp), %eax
+	movl 76(%esp), %edx
 	call _dl_call_pltexit
-	movl (%esp), %eax
-	movl 4(%esp), %edx
-	fldt 20(%esp)
-	fldt 8(%esp)
-	addl $60, %esp
-	cfi_adjust_cfa_offset (-60)
+#ifdef HAVE_MPX_SUPPORT
+	bndmov (%esp), %bnd0
+	bndmov 8(%esp), %bnd1
+#else
+	.byte 0x66,0x0f,0x1a,0x04,0x24
+	.byte 0x66,0x0f,0x1a,0x4c,0x24,0x08
+#endif
+	movl 16(%esp), %eax
+	movl 20(%esp), %edx
+	fldt 36(%esp)
+	fldt 24(%esp)
+	addl $76, %esp
+	cfi_adjust_cfa_offset (-76)
+	PRESERVE_BND_REGS_PREFIX
 	ret
 	cfi_endproc
 	.size _dl_runtime_profile, .-_dl_runtime_profile
diff --git a/sysdeps/x86/bits/link.h b/sysdeps/x86/bits/link.h
index 3f559c9..0bf9b9a 100644
--- a/sysdeps/x86/bits/link.h
+++ b/sysdeps/x86/bits/link.h
@@ -38,6 +38,8 @@ typedef struct La_i86_retval
   uint32_t lrv_edx;
   long double lrv_st0;
   long double lrv_st1;
+  uint64_t lrv_bnd0;
+  uint64_t lrv_bnd1;
 } La_i86_retval;
 
 
diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h
index 0e5a6fb..d542428 100644
--- a/sysdeps/x86_64/dl-trampoline.h
+++ b/sysdeps/x86_64/dl-trampoline.h
@@ -63,20 +63,6 @@
 	movaps (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6
 	movaps (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7
 
-#ifndef __ILP32__
-# ifdef HAVE_MPX_SUPPORT
-	bndmov 		    (LR_BND_OFFSET)(%rsp), %bnd0  # Restore bound
-	bndmov (LR_BND_OFFSET +   BND_SIZE)(%rsp), %bnd1  # registers.
-	bndmov (LR_BND_OFFSET + BND_SIZE*2)(%rsp), %bnd2
-	bndmov (LR_BND_OFFSET + BND_SIZE*3)(%rsp), %bnd3
-# else
-	.byte 0x66,0x0f,0x1a,0x84,0x24;.long (LR_BND_OFFSET)
-	.byte 0x66,0x0f,0x1a,0x8c,0x24;.long (LR_BND_OFFSET + BND_SIZE)
-	.byte 0x66,0x0f,0x1a,0x94,0x24;.long (LR_BND_OFFSET + BND_SIZE*2)
-	.byte 0x66,0x0f,0x1a,0x9c,0x24;.long (LR_BND_OFFSET + BND_SIZE*3)
-# endif
-#endif
-
 #ifdef RESTORE_AVX
 	/* Check if any xmm0-xmm7 registers are changed by audit
 	   module.  */
@@ -154,8 +140,24 @@
 
 1:
 #endif
+
+#ifndef __ILP32__
+# ifdef HAVE_MPX_SUPPORT
+	bndmov              (LR_BND_OFFSET)(%rsp), %bnd0  # Restore bound
+	bndmov (LR_BND_OFFSET +   BND_SIZE)(%rsp), %bnd1  # registers.
+	bndmov (LR_BND_OFFSET + BND_SIZE*2)(%rsp), %bnd2
+	bndmov (LR_BND_OFFSET + BND_SIZE*3)(%rsp), %bnd3
+# else
+	.byte 0x66,0x0f,0x1a,0x84,0x24;.long (LR_BND_OFFSET)
+	.byte 0x66,0x0f,0x1a,0x8c,0x24;.long (LR_BND_OFFSET + BND_SIZE)
+	.byte 0x66,0x0f,0x1a,0x94,0x24;.long (LR_BND_OFFSET + BND_SIZE*2)
+	.byte 0x66,0x0f,0x1a,0x9c,0x24;.long (LR_BND_OFFSET + BND_SIZE*3)
+# endif
+#endif
+
 	mov  16(%rbx), %R10_LP	# Anything in framesize?
 	test %R10_LP, %R10_LP
+	PRESERVE_BND_REGS_PREFIX
 	jns 3f
 
 	/* There's nothing in the frame size, so there
@@ -174,6 +176,7 @@
 	addq $48, %rsp		# Adjust the stack to the return value
 				# (eats the reloc index and link_map)
 	cfi_adjust_cfa_offset(-48)
+	PRESERVE_BND_REGS_PREFIX
 	jmp *%r11		# Jump to function address.
 
 3:
@@ -200,6 +203,7 @@
 	movq 32(%rdi), %rsi
 	movq 40(%rdi), %rdi
 
+	PRESERVE_BND_REGS_PREFIX
 	call *%r11
 
 	mov 24(%rbx), %rsp	# Drop the copied stack content
@@ -280,11 +284,11 @@
 
 #ifndef __ILP32__
 # ifdef HAVE_MPX_SUPPORT
-	bndmov LRV_BND0_OFFSET(%rcx), %bnd0  # Restore bound registers.
-	bndmov LRV_BND1_OFFSET(%rcx), %bnd1
+	bndmov LRV_BND0_OFFSET(%rsp), %bnd0  # Restore bound registers.
+	bndmov LRV_BND1_OFFSET(%rsp), %bnd1
 # else
-	.byte  0x66,0x0f,0x1a,0x81;.long (LRV_BND0_OFFSET)
-	.byte  0x66,0x0f,0x1a,0x89;.long (LRV_BND1_OFFSET)
+	.byte  0x66,0x0f,0x1a,0x84,0x24;.long (LRV_BND0_OFFSET)
+	.byte  0x66,0x0f,0x1a,0x8c,0x24;.long (LRV_BND1_OFFSET)
 # endif
 #endif
 
@@ -299,6 +303,7 @@
 	addq $48, %rsp		# Adjust the stack to the return value
 				# (eats the reloc index and link_map)
 	cfi_adjust_cfa_offset(-48)
+	PRESERVE_BND_REGS_PREFIX
 	retq
 
 #ifdef MORE_CODE


* Re: [PATCH, MPX] MPX-specific changes in dl_runtime routines
  2015-07-02 14:04 [PATCH, MPX] MPX-specific changes in dl_runtime routines Zamyatin, Igor
@ 2015-07-02 14:33 ` H.J. Lu
  2015-07-02 23:41   ` H.J. Lu
  0 siblings, 1 reply; 27+ messages in thread
From: H.J. Lu @ 2015-07-02 14:33 UTC (permalink / raw)
  To: Zamyatin, Igor; +Cc: libc-alpha

On Thu, Jul 2, 2015 at 7:03 AM, Zamyatin, Igor <igor.zamyatin@intel.com> wrote:
> Hi!
>
> This patch adds the changes necessary for the dl_runtime routines to work properly in MPX mode, for both 32-bit and 64-bit.
>
> Is it ok for trunk?
>
>
> Thanks,
> Igor
>
>
> 2015-07-02  Igor Zamyatin  <igor.zamyatin@intel.com>
>
>         * sysdeps/i386/dl-trampoline.S (_dl_runtime_profile): Save
>         and restore Intel MPX return bound registers.
>         * sysdeps/x86_64/dl-trampoline.h: Add PRESERVE_BND_REGS_PREFIX to
>         call, jump and ret instructions to not lose bounds.
>         * sysdeps/x86/bits/link.h (La_i86_retval): Add lrv_bnd0 and
>         lrv_bnd1.

Should we add link-defines.sym, similar to x86-64, to avoid
those magic numbers when accessing the fields in La_i86_regs
and  La_i86_retval?
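
(For illustration, with values I would expect on i386 rather than ones
taken from an actual build: the gen-as-const machinery evaluates each
entry of such a .sym file at compile time and emits a plain header,
roughly

  #define LONG_DOUBLE_SIZE 12
  #define LRV_SIZE 32
  #define LRV_EAX_OFFSET 0
  #define LRV_EDX_OFFSET 4
  #define LRV_ST0_OFFSET 8
  #define LRV_ST1_OFFSET 20

which the assembly then picks up via #include <link-defines.h>.)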

Please add "[BZ #18134]" to ChangeLog and add 18134 to NEWS
when this is checked in.

Thanks.


-- 
H.J.


* Re: [PATCH, MPX] MPX-specific changes in dl_runtime routines
  2015-07-02 14:33 ` H.J. Lu
@ 2015-07-02 23:41   ` H.J. Lu
  2015-07-03  2:38     ` H.J. Lu
                       ` (2 more replies)
  0 siblings, 3 replies; 27+ messages in thread
From: H.J. Lu @ 2015-07-02 23:41 UTC (permalink / raw)
  To: Zamyatin, Igor; +Cc: libc-alpha

[-- Attachment #1: Type: text/plain, Size: 1251 bytes --]

On Thu, Jul 2, 2015 at 7:33 AM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Thu, Jul 2, 2015 at 7:03 AM, Zamyatin, Igor <igor.zamyatin@intel.com> wrote:
>> Hi!
>>
>> This patch adds the changes necessary for the dl_runtime routines to work properly in MPX mode, for both 32-bit and 64-bit.
>>
>> Is it ok for trunk?
>>
>>
>> Thanks,
>> Igor
>>
>>
>> 2015-07-02  Igor Zamyatin  <igor.zamyatin@intel.com>
>>
>>         * sysdeps/i386/dl-trampoline.S (_dl_runtime_profile): Save
>>         and restore Intel MPX return bound registers.
>>         * sysdeps/x86_64/dl-trampoline.h: Add PRESERVE_BND_REGS_PREFIX to
>>         call, jump and ret instructions to not lose bounds.
>>         * sysdeps/x86/bits/link.h (La_i86_retval): Add lrv_bnd0 and
>>         lrv_bnd1.
>
> Should we add link-defines.sym, similar to x86-64, to avoid
> those magic numbers when accessing the fields in La_i86_regs
> and  La_i86_retval?
>

Here is a patch to add sysdeps/i386/link-defines.sym.

I think the i386 MPX change will store values into La_i86_retval
in the wrong order.  You need to store bnd0/bnd1 after lrv_st1, not
before lrv_eax.

Please add an i386 audit testcase to verify that pltexit gets correct
La_i86_regs and La_i86_retval pointers and fix the i386 MPX patch.
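
To spell out the layout point, here is a sketch of the intended struct,
with offsets as I would expect them on i386 (4-byte alignment, 12-byte
long double):

  typedef struct La_i86_retval
  {
    uint32_t lrv_eax;      /* offset  0 */
    uint32_t lrv_edx;      /* offset  4 */
    long double lrv_st0;   /* offset  8 */
    long double lrv_st1;   /* offset 20 */
    uint64_t lrv_bnd0;     /* offset 32: the new fields belong here, */
    uint64_t lrv_bnd1;     /* offset 40: appended after lrv_st1.  */
  } La_i86_retval;

Storing bnd0/bnd1 at 0(%esp) and 8(%esp), on top of lrv_eax, lrv_edx
and lrv_st0, means the pltexit hook of an auditor would read clobbered
values.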


-- 
H.J.

[-- Attachment #2: 0001-Add-and-use-sysdeps-i386-link-defines.sym.patch --]
[-- Type: text/x-patch, Size: 3085 bytes --]

From 4d2115cf349fbc235f4ab8c9a117bd1b20b44f2d Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Thu, 2 Jul 2015 16:33:03 -0700
Subject: [PATCH] Add and use sysdeps/i386/link-defines.sym

---
 sysdeps/i386/Makefile         |  1 +
 sysdeps/i386/dl-trampoline.S  | 39 ++++++++++++++++++++++++---------------
 sysdeps/i386/link-defines.sym | 11 +++++++++++
 3 files changed, 36 insertions(+), 15 deletions(-)
 create mode 100644 sysdeps/i386/link-defines.sym

diff --git a/sysdeps/i386/Makefile b/sysdeps/i386/Makefile
index c8af591..87b5c6f 100644
--- a/sysdeps/i386/Makefile
+++ b/sysdeps/i386/Makefile
@@ -33,6 +33,7 @@ sysdep-CFLAGS += -mpreferred-stack-boundary=4
 else
 ifeq ($(subdir),csu)
 sysdep-CFLAGS += -mpreferred-stack-boundary=4
+gen-as-const-headers += link-defines.sym
 else
 # Likewise, any function which calls user callbacks
 uses-callbacks += -mpreferred-stack-boundary=4
diff --git a/sysdeps/i386/dl-trampoline.S b/sysdeps/i386/dl-trampoline.S
index f11972c..b4372c9 100644
--- a/sysdeps/i386/dl-trampoline.S
+++ b/sysdeps/i386/dl-trampoline.S
@@ -17,6 +17,7 @@
    <http://www.gnu.org/licenses/>.  */
 
 #include <sysdep.h>
+#include <link-defines.h>
 
 	.text
 	.globl _dl_runtime_resolve
@@ -161,24 +162,32 @@ _dl_runtime_profile:
 	    +4      free
 	   %esp     free
 	*/
-	subl $20, %esp
-	cfi_adjust_cfa_offset (20)
-	movl %eax, (%esp)
-	movl %edx, 4(%esp)
-	fstpt 8(%esp)
-	fstpt 20(%esp)
+#if LONG_DOUBLE_SIZE != 12
+# error "long double size must be 12 bytes"
+#endif
+	# Allocate space for La_i86_retval and subtract 12 free bytes.
+	subl $(LRV_SIZE - 12), %esp
+	cfi_adjust_cfa_offset (LRV_SIZE - 12)
+	movl %eax, LRV_EAX_OFFSET(%esp)
+	movl %edx, LRV_EDX_OFFSET(%esp)
+	fstpt LRV_ST0_OFFSET(%esp)
+	fstpt LRV_ST1_OFFSET(%esp)
 	pushl %esp
 	cfi_adjust_cfa_offset (4)
-	leal 36(%esp), %ecx
-	movl 56(%esp), %eax
-	movl 60(%esp), %edx
+	# Address of La_i86_regs area.
+	leal (LRV_SIZE - 12 + 4 + 12)(%esp), %ecx
+	# PLT2
+	movl (LRV_SIZE - 12 + 4 + 32)(%esp), %eax
+	# PLT1
+	movl (LRV_SIZE - 12 + 4 + 36)(%esp), %edx
 	call _dl_call_pltexit
-	movl (%esp), %eax
-	movl 4(%esp), %edx
-	fldt 20(%esp)
-	fldt 8(%esp)
-	addl $60, %esp
-	cfi_adjust_cfa_offset (-60)
+	movl LRV_EAX_OFFSET(%esp), %eax
+	movl LRV_EDX_OFFSET(%esp), %edx
+	fldt LRV_ST1_OFFSET(%esp)
+	fldt LRV_ST0_OFFSET(%esp)
+	# Restore stack before return.
+	addl $(LRV_SIZE - 12 + 4 + 36), %esp
+	cfi_adjust_cfa_offset (-(LRV_SIZE - 12 + 4 + 36))
 	ret
 	cfi_endproc
 	.size _dl_runtime_profile, .-_dl_runtime_profile
diff --git a/sysdeps/i386/link-defines.sym b/sysdeps/i386/link-defines.sym
new file mode 100644
index 0000000..532a916
--- /dev/null
+++ b/sysdeps/i386/link-defines.sym
@@ -0,0 +1,11 @@
+#include "link.h"
+#include <stddef.h>
+
+--
+LONG_DOUBLE_SIZE	sizeof (long double)
+
+LRV_SIZE		sizeof (struct La_i86_retval)
+LRV_EAX_OFFSET		offsetof (struct La_i86_retval, lrv_eax)
+LRV_EDX_OFFSET		offsetof (struct La_i86_retval, lrv_edx)
+LRV_ST0_OFFSET		offsetof (struct La_i86_retval, lrv_st0)
+LRV_ST1_OFFSET		offsetof (struct La_i86_retval, lrv_st1)
-- 
2.4.3



* Re: [PATCH, MPX] MPX-specific changes in dl_runtime routines
  2015-07-02 23:41   ` H.J. Lu
@ 2015-07-03  2:38     ` H.J. Lu
  2015-07-07 12:39     ` H.J. Lu
  2015-07-08 15:25     ` Zamyatin, Igor
  2 siblings, 0 replies; 27+ messages in thread
From: H.J. Lu @ 2015-07-03  2:38 UTC (permalink / raw)
  To: Zamyatin, Igor; +Cc: libc-alpha

On Thu, Jul 2, 2015 at 4:41 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Thu, Jul 2, 2015 at 7:33 AM, H.J. Lu <hjl.tools@gmail.com> wrote:
>> On Thu, Jul 2, 2015 at 7:03 AM, Zamyatin, Igor <igor.zamyatin@intel.com> wrote:
>>> Hi!
>>>
>>> This patch adds the changes necessary for the dl_runtime routines to work properly in MPX mode, for both 32-bit and 64-bit.
>>>
>>> Is it ok for trunk?
>>>
>>>
>>> Thanks,
>>> Igor
>>>
>>>
>>> 2015-07-02  Igor Zamyatin  <igor.zamyatin@intel.com>
>>>
>>>         * sysdeps/i386/dl-trampoline.S (_dl_runtime_profile): Save
>>>         and restore Intel MPX return bound registers.
>>>         * sysdeps/x86_64/dl-trampoline.h: Add PRESERVE_BND_REGS_PREFIX to
>>>         call, jump and ret instructions to not lose bounds.
>>>         * sysdeps/x86/bits/link.h (La_i86_retval): Add lrv_bnd0 and
>>>         lrv_bnd1.
>>
>> Should we add link-defines.sym, similar to x86-64, to avoid
>> those magic numbers when accessing the fields in La_i86_regs
>> and  La_i86_retval?
>>
>
> Here is a patch to add sysdeps/i386/link-defines.sym.
>
> I think the i386 MPX change will store values into La_i86_retval
> in the wrong order.  You need to store bnd0/bnd1 after lrv_st1, not
> before lrv_eax.
>
> Please add an i386 audit testcase to verify that pltexit gets correct
> La_i86_regs and La_i86_retval pointers and fix the i386 MPX patch.
>
>

I updated my patch and pushed it to the hjl/audit branch.

-- 
H.J.


* Re: [PATCH, MPX] MPX-specific changes in dl_runtime routines
  2015-07-02 23:41   ` H.J. Lu
  2015-07-03  2:38     ` H.J. Lu
@ 2015-07-07 12:39     ` H.J. Lu
  2015-07-08 15:25     ` Zamyatin, Igor
  2 siblings, 0 replies; 27+ messages in thread
From: H.J. Lu @ 2015-07-07 12:39 UTC (permalink / raw)
  To: Zamyatin, Igor; +Cc: libc-alpha

On Thu, Jul 2, 2015 at 4:41 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Thu, Jul 2, 2015 at 7:33 AM, H.J. Lu <hjl.tools@gmail.com> wrote:
>> On Thu, Jul 2, 2015 at 7:03 AM, Zamyatin, Igor <igor.zamyatin@intel.com> wrote:
>>> Hi!
>>>
>>> This patch adds the changes necessary for the dl_runtime routines to work properly in MPX mode, for both 32-bit and 64-bit.
>>>
>>> Is it ok for trunk?
>>>
>>>
>>> Thanks,
>>> Igor
>>>
>>>
>>> 2015-07-02  Igor Zamyatin  <igor.zamyatin@intel.com>
>>>
>>>         * sysdeps/i386/dl-trampoline.S (_dl_runtime_profile): Save
>>>         and restore Intel MPX return bound registers.
>>>         * sysdeps/x86_64/dl-trampoline.h: Add PRESERVE_BND_REGS_PREFIX to
>>>         call, jump and ret instructions to not lose bounds.
>>>         * sysdeps/x86/bits/link.h (La_i86_retval): Add lrv_bnd0 and
>>>         lrv_bnd1.
>>
>> Should we add link-defines.sym, similar to x86-64, to avoid
>> those magic numbers when accessing the fields in La_i86_regs
>> and  La_i86_retval?
>>
>
> Here is a patch to add sysdeps/i386/link-defines.sym.
>
> I think the i386 MPX change will store values into La_i86_retval
> in the wrong order.  You need to store bnd0/bnd1 after lrv_st1, not
> before lrv_eax.
>
> Please add an i386 audit testcase to verify that pltexit gets correct
> La_i86_regs and La_i86_retval pointers and fix the i386 MPX patch.
>
>

Hi Igor,

I checked in my i386 audit update and test.  Please update and
resubmit the i386 MPX change.

Thanks.

-- 
H.J.


* RE: [PATCH, MPX] MPX-specific changes in dl_runtime routines
  2015-07-02 23:41   ` H.J. Lu
  2015-07-03  2:38     ` H.J. Lu
  2015-07-07 12:39     ` H.J. Lu
@ 2015-07-08 15:25     ` Zamyatin, Igor
  2015-07-08 15:34       ` H.J. Lu
  2 siblings, 1 reply; 27+ messages in thread
From: Zamyatin, Igor @ 2015-07-08 15:25 UTC (permalink / raw)
  To: H.J. Lu; +Cc: libc-alpha

[-- Attachment #1: Type: text/plain, Size: 2168 bytes --]

Hi! 

Please see updated patch (attached)

ChangeLog:

2015-07-08  Igor Zamyatin  <igor.zamyatin@intel.com>

	[BZ #18134]
	* sysdeps/i386/dl-trampoline.S (_dl_runtime_profile): Save
	and restore Intel MPX return bound registers.
	* sysdeps/x86_64/dl-trampoline.h: Add PRESERVE_BND_REGS_PREFIX to
	call, jump and ret instructions to not lose bounds.
	* sysdeps/x86/bits/link.h (La_i86_retval): Add lrv_bnd0 and
	lrv_bnd1.
	* sysdeps/i386/link-defines.sym: Add definitions for LRV_BND0_OFFSET
	and LRV_BND1_OFFSET.

Thanks,
Igor

> -----Original Message-----
> From: H.J. Lu [mailto:hjl.tools@gmail.com]
> Sent: Friday, July 3, 2015 2:42 AM
> To: Zamyatin, Igor
> Cc: libc-alpha@sourceware.org
> Subject: Re: [PATCH, MPX] MPX-specific changes in dl_runtime routines
> 
> On Thu, Jul 2, 2015 at 7:33 AM, H.J. Lu <hjl.tools@gmail.com> wrote:
> > On Thu, Jul 2, 2015 at 7:03 AM, Zamyatin, Igor <igor.zamyatin@intel.com>
> wrote:
> >> Hi!
> >>
> >> This patch adds the changes necessary for the dl_runtime routines to
> work properly in MPX mode, for both 32-bit and 64-bit.
> >>
> >> Is it ok for trunk?
> >>
> >>
> >> Thanks,
> >> Igor
> >>
> >>
> >> 2015-07-02  Igor Zamyatin  <igor.zamyatin@intel.com>
> >>
> >>         * sysdeps/i386/dl-trampoline.S (_dl_runtime_profile): Save
> >>         and restore Intel MPX return bound registers
> >>         * sysdeps/x86_64/dl-trampoline.h: Add
> PRESERVE_BND_REGS_PREFIX to
> >>         call, jump and ret instructions to not loose bounds.
> >>         * sysdeps/x86/bits/link.h (La_i86_retval): Add lrv_bnd0 and
> >>         lrv_bnd1.
> >
> > Should we add link-defines.sym, similar to x86-64, to avoid those
> > magic numbers when accessing the fields in La_i86_regs and
> > La_i86_retval?
> >
> 
> Here is a patch to add sysdeps/i386/link-defines.sym.
> 
> I think the i386 MPX change will store values into La_i86_retval in the
> wrong order.  You need to store bnd0/bnd1 after lrv_st1, not before lrv_eax.
> 
> Please add an i386 audit testcase to verify that pltexit gets correct
> La_i86_regs and La_i86_retval pointers and fix the i386 MPX patch.
> 
> 
> --
> H.J.

[-- Attachment #2: mpx_glibc_dl_runtime_profile_all.patch --]
[-- Type: application/octet-stream, Size: 5806 bytes --]

diff --git a/ChangeLog b/ChangeLog
index 07dc773..1323e18 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,15 @@
+2015-07-08  Igor Zamyatin  <igor.zamyatin@intel.com>
+
+	[BZ #18134]
+	* sysdeps/i386/dl-trampoline.S (_dl_runtime_profile): Save
+	and restore Intel MPX return bound registers.
+	* sysdeps/x86_64/dl-trampoline.h: Add PRESERVE_BND_REGS_PREFIX to
+	call, jump and ret instructions to not lose bounds.
+	* sysdeps/x86/bits/link.h (La_i86_retval): Add lrv_bnd0 and
+	lrv_bnd1.
+	* sysdeps/i386/link-defines.sym: Add definitions for LRV_BND0_OFFSET
+	and LRV_BND1_OFFSET.
+
 2015-07-08  Feng Gao  <gfree.wind@gmail.com>
 
 	* libio/fileops.c: Use "|" instead of "+" when combine _IO_LINE_BUF
diff --git a/sysdeps/i386/dl-trampoline.S b/sysdeps/i386/dl-trampoline.S
index 7c72b03..622af36 100644
--- a/sysdeps/i386/dl-trampoline.S
+++ b/sysdeps/i386/dl-trampoline.S
@@ -19,6 +19,12 @@
 #include <sysdep.h>
 #include <link-defines.h>
 
+#ifdef HAVE_MPX_SUPPORT
+# define PRESERVE_BND_REGS_PREFIX bnd
+#else
+# define PRESERVE_BND_REGS_PREFIX .byte 0xf2
+#endif
+
 	.text
 	.globl _dl_runtime_resolve
 	.type _dl_runtime_resolve, @function
@@ -172,6 +178,13 @@ _dl_runtime_profile:
 	movl %edx, LRV_EDX_OFFSET(%esp)
 	fstpt LRV_ST0_OFFSET(%esp)
 	fstpt LRV_ST1_OFFSET(%esp)
+#ifdef HAVE_MPX_SUPPORT
+	bndmov %bnd0, LRV_BND0_OFFSET(%esp)
+	bndmov %bnd1, LRV_BND1_OFFSET(%esp)
+#else
+	.byte 0x66,0x0f,0x1b,0x04,0x24
+	.byte 0x66,0x0f,0x1b,0x4c,0x24,0x08
+#endif
 	pushl %esp
 	cfi_adjust_cfa_offset (4)
 	# Address of La_i86_regs area.
@@ -185,9 +198,17 @@ _dl_runtime_profile:
 	movl LRV_EDX_OFFSET(%esp), %edx
 	fldt LRV_ST1_OFFSET(%esp)
 	fldt LRV_ST0_OFFSET(%esp)
+#ifdef HAVE_MPX_SUPPORT
+	bndmov LRV_BND0_OFFSET(%esp), %bnd0
+	bndmov LRV_BND1_OFFSET(%esp), %bnd1
+#else
+	.byte 0x66,0x0f,0x1a,0x04,0x24
+	.byte 0x66,0x0f,0x1a,0x4c,0x24,0x08
+#endif
 	# Restore stack before return.
 	addl $(LRV_SIZE + 4 + LR_SIZE + 4), %esp
 	cfi_adjust_cfa_offset (-(LRV_SIZE + 4 + LR_SIZE + 4))
+	PRESERVE_BND_REGS_PREFIX
 	ret
 	cfi_endproc
 	.size _dl_runtime_profile, .-_dl_runtime_profile
diff --git a/sysdeps/i386/link-defines.sym b/sysdeps/i386/link-defines.sym
index a63dcb9..0995adb 100644
--- a/sysdeps/i386/link-defines.sym
+++ b/sysdeps/i386/link-defines.sym
@@ -16,3 +16,5 @@ LRV_EAX_OFFSET		offsetof (struct La_i86_retval, lrv_eax)
 LRV_EDX_OFFSET		offsetof (struct La_i86_retval, lrv_edx)
 LRV_ST0_OFFSET		offsetof (struct La_i86_retval, lrv_st0)
 LRV_ST1_OFFSET		offsetof (struct La_i86_retval, lrv_st1)
+LRV_BND0_OFFSET		offsetof (struct La_i86_retval, lrv_bnd0)
+LRV_BND1_OFFSET		offsetof (struct La_i86_retval, lrv_bnd1)
diff --git a/sysdeps/x86/bits/link.h b/sysdeps/x86/bits/link.h
index 3f559c9..0bf9b9a 100644
--- a/sysdeps/x86/bits/link.h
+++ b/sysdeps/x86/bits/link.h
@@ -38,6 +38,8 @@ typedef struct La_i86_retval
   uint32_t lrv_edx;
   long double lrv_st0;
   long double lrv_st1;
+  uint64_t lrv_bnd0;
+  uint64_t lrv_bnd1;
 } La_i86_retval;
 
 
diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h
index 0e5a6fb..d542428 100644
--- a/sysdeps/x86_64/dl-trampoline.h
+++ b/sysdeps/x86_64/dl-trampoline.h
@@ -63,20 +63,6 @@
 	movaps (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6
 	movaps (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7
 
-#ifndef __ILP32__
-# ifdef HAVE_MPX_SUPPORT
-	bndmov 		    (LR_BND_OFFSET)(%rsp), %bnd0  # Restore bound
-	bndmov (LR_BND_OFFSET +   BND_SIZE)(%rsp), %bnd1  # registers.
-	bndmov (LR_BND_OFFSET + BND_SIZE*2)(%rsp), %bnd2
-	bndmov (LR_BND_OFFSET + BND_SIZE*3)(%rsp), %bnd3
-# else
-	.byte 0x66,0x0f,0x1a,0x84,0x24;.long (LR_BND_OFFSET)
-	.byte 0x66,0x0f,0x1a,0x8c,0x24;.long (LR_BND_OFFSET + BND_SIZE)
-	.byte 0x66,0x0f,0x1a,0x94,0x24;.long (LR_BND_OFFSET + BND_SIZE*2)
-	.byte 0x66,0x0f,0x1a,0x9c,0x24;.long (LR_BND_OFFSET + BND_SIZE*3)
-# endif
-#endif
-
 #ifdef RESTORE_AVX
 	/* Check if any xmm0-xmm7 registers are changed by audit
 	   module.  */
@@ -154,8 +140,24 @@
 
 1:
 #endif
+
+#ifndef __ILP32__
+# ifdef HAVE_MPX_SUPPORT
+	bndmov              (LR_BND_OFFSET)(%rsp), %bnd0  # Restore bound
+	bndmov (LR_BND_OFFSET +   BND_SIZE)(%rsp), %bnd1  # registers.
+	bndmov (LR_BND_OFFSET + BND_SIZE*2)(%rsp), %bnd2
+	bndmov (LR_BND_OFFSET + BND_SIZE*3)(%rsp), %bnd3
+# else
+	.byte 0x66,0x0f,0x1a,0x84,0x24;.long (LR_BND_OFFSET)
+	.byte 0x66,0x0f,0x1a,0x8c,0x24;.long (LR_BND_OFFSET + BND_SIZE)
+	.byte 0x66,0x0f,0x1a,0x94,0x24;.long (LR_BND_OFFSET + BND_SIZE*2)
+	.byte 0x66,0x0f,0x1a,0x9c,0x24;.long (LR_BND_OFFSET + BND_SIZE*3)
+# endif
+#endif
+
 	mov  16(%rbx), %R10_LP	# Anything in framesize?
 	test %R10_LP, %R10_LP
+	PRESERVE_BND_REGS_PREFIX
 	jns 3f
 
 	/* There's nothing in the frame size, so there
@@ -174,6 +176,7 @@
 	addq $48, %rsp		# Adjust the stack to the return value
 				# (eats the reloc index and link_map)
 	cfi_adjust_cfa_offset(-48)
+	PRESERVE_BND_REGS_PREFIX
 	jmp *%r11		# Jump to function address.
 
 3:
@@ -200,6 +203,7 @@
 	movq 32(%rdi), %rsi
 	movq 40(%rdi), %rdi
 
+	PRESERVE_BND_REGS_PREFIX
 	call *%r11
 
 	mov 24(%rbx), %rsp	# Drop the copied stack content
@@ -280,11 +284,11 @@
 
 #ifndef __ILP32__
 # ifdef HAVE_MPX_SUPPORT
-	bndmov LRV_BND0_OFFSET(%rcx), %bnd0  # Restore bound registers.
-	bndmov LRV_BND1_OFFSET(%rcx), %bnd1
+	bndmov LRV_BND0_OFFSET(%rsp), %bnd0  # Restore bound registers.
+	bndmov LRV_BND1_OFFSET(%rsp), %bnd1
 # else
-	.byte  0x66,0x0f,0x1a,0x81;.long (LRV_BND0_OFFSET)
-	.byte  0x66,0x0f,0x1a,0x89;.long (LRV_BND1_OFFSET)
+	.byte  0x66,0x0f,0x1a,0x84,0x24;.long (LRV_BND0_OFFSET)
+	.byte  0x66,0x0f,0x1a,0x8c,0x24;.long (LRV_BND1_OFFSET)
 # endif
 #endif
 
@@ -299,6 +303,7 @@
 	addq $48, %rsp		# Adjust the stack to the return value
 				# (eats the reloc index and link_map)
 	cfi_adjust_cfa_offset(-48)
+	PRESERVE_BND_REGS_PREFIX
 	retq
 
 #ifdef MORE_CODE


* Re: [PATCH, MPX] MPX-specific changes in dl_runtime routines
  2015-07-08 15:25     ` Zamyatin, Igor
@ 2015-07-08 15:34       ` H.J. Lu
  2015-07-08 15:56         ` Zamyatin, Igor
  0 siblings, 1 reply; 27+ messages in thread
From: H.J. Lu @ 2015-07-08 15:34 UTC (permalink / raw)
  To: Zamyatin, Igor; +Cc: libc-alpha

On Wed, Jul 8, 2015 at 8:25 AM, Zamyatin, Igor <igor.zamyatin@intel.com> wrote:
> Hi!
>
> Please see updated patch (attached)
>
> ChangeLog:
>
> 2015-07-08  Igor Zamyatin  <igor.zamyatin@intel.com>
>
>         [BZ #18134]
>         * sysdeps/i386/dl-trampoline.S (_dl_runtime_profile): Save
>         and restore Intel MPX return bound registers.
>         * sysdeps/x86_64/dl-trampoline.h: Add PRESERVE_BND_REGS_PREFIX to
>         call, jump and ret instructions to not lose bounds.
>         * sysdeps/x86/bits/link.h (La_i86_retval): Add lrv_bnd0 and
>         lrv_bnd1.
>         * sysdeps/i386/link-defines.sym: Add definitions for LRV_BND0_OFFSET
>         and LRV_BND1_OFFSET.
>

+#else
+ .byte 0x66,0x0f,0x1b,0x04,0x24
+ .byte 0x66,0x0f,0x1b,0x4c,0x24,0x08
+#endif
...
+#else
+ .byte 0x66,0x0f,0x1a,0x04,0x24
+ .byte 0x66,0x0f,0x1a,0x4c,0x24,0x08
+#endif

Please use LRV_BND0_OFFSET and LRV_BND1_OFFSET.


-- 
H.J.


* RE: [PATCH, MPX] MPX-specific changes in dl_runtime routines
  2015-07-08 15:34       ` H.J. Lu
@ 2015-07-08 15:56         ` Zamyatin, Igor
  2015-07-08 18:49           ` H.J. Lu
  0 siblings, 1 reply; 27+ messages in thread
From: Zamyatin, Igor @ 2015-07-08 15:56 UTC (permalink / raw)
  To: H.J. Lu; +Cc: libc-alpha

[-- Attachment #1: Type: text/plain, Size: 1337 bytes --]

Fixed in the attached patch


Thanks,
Igor

> -----Original Message-----
> From: H.J. Lu [mailto:hjl.tools@gmail.com]
> Sent: Wednesday, July 8, 2015 6:35 PM
> To: Zamyatin, Igor
> Cc: libc-alpha@sourceware.org
> Subject: Re: [PATCH, MPX] MPX-specific changes in dl_runtime routines
> 
> On Wed, Jul 8, 2015 at 8:25 AM, Zamyatin, Igor <igor.zamyatin@intel.com>
> wrote:
> > Hi!
> >
> > Please see updated patch (attached)
> >
> > ChangeLog:
> >
> > 2015-07-08  Igor Zamyatin  <igor.zamyatin@intel.com>
> >
> >         [BZ #18134]
> >         * sysdeps/i386/dl-trampoline.S (_dl_runtime_profile): Save
> >         and restore Intel MPX return bound registers.
> >         * sysdeps/x86_64/dl-trampoline.h: Add PRESERVE_BND_REGS_PREFIX to
> >         call, jump and ret instructions to not lose bounds.
> >         * sysdeps/x86/bits/link.h (La_i86_retval): Add lrv_bnd0 and
> >         lrv_bnd1.
> >         * sysdeps/i386/link-defines.sym: Add definitions for LRV_BND0_OFFSET
> >         and LRV_BND1_OFFSET.
> >
> 
> +#else
> + .byte 0x66,0x0f,0x1b,0x04,0x24
> + .byte 0x66,0x0f,0x1b,0x4c,0x24,0x08
> +#endif
> ...
> +#else
> + .byte 0x66,0x0f,0x1a,0x04,0x24
> + .byte 0x66,0x0f,0x1a,0x4c,0x24,0x08
> +#endif
> 
> Please use LRV_BND0_OFFSET and LRV_BND1_OFFSET.
> 
> 
> --
> H.J.

[-- Attachment #2: mpx_glibc_dl_runtime_profile_all.patch --]
[-- Type: application/octet-stream, Size: 5860 bytes --]

diff --git a/ChangeLog b/ChangeLog
index 07dc773..1323e18 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,15 @@
+2015-07-08  Igor Zamyatin  <igor.zamyatin@intel.com>
+
+	[BZ #18134]
+	* sysdeps/i386/dl-trampoline.S (_dl_runtime_profile): Save
+	and restore Intel MPX return bound registers.
+	* sysdeps/x86_64/dl-trampoline.h: Add PRESERVE_BND_REGS_PREFIX to
+	call, jump and ret instructions to not lose bounds.
+	* sysdeps/x86/bits/link.h (La_i86_retval): Add lrv_bnd0 and
+	lrv_bnd1.
+	* sysdeps/i386/link-defines.sym: Add definitions for LRV_BND0_OFFSET
+	and LRV_BND1_OFFSET.
+
 2015-07-08  Feng Gao  <gfree.wind@gmail.com>
 
 	* libio/fileops.c: Use "|" instead of "+" when combine _IO_LINE_BUF
diff --git a/sysdeps/i386/dl-trampoline.S b/sysdeps/i386/dl-trampoline.S
index 7c72b03..d03bc8c 100644
--- a/sysdeps/i386/dl-trampoline.S
+++ b/sysdeps/i386/dl-trampoline.S
@@ -19,6 +19,12 @@
 #include <sysdep.h>
 #include <link-defines.h>
 
+#ifdef HAVE_MPX_SUPPORT
+# define PRESERVE_BND_REGS_PREFIX bnd
+#else
+# define PRESERVE_BND_REGS_PREFIX .byte 0xf2
+#endif
+
 	.text
 	.globl _dl_runtime_resolve
 	.type _dl_runtime_resolve, @function
@@ -172,6 +178,13 @@ _dl_runtime_profile:
 	movl %edx, LRV_EDX_OFFSET(%esp)
 	fstpt LRV_ST0_OFFSET(%esp)
 	fstpt LRV_ST1_OFFSET(%esp)
+#ifdef HAVE_MPX_SUPPORT
+	bndmov %bnd0, LRV_BND0_OFFSET(%esp)
+	bndmov %bnd1, LRV_BND1_OFFSET(%esp)
+#else
+	.byte 0x66,0x0f,0x1b,0x04,0x24,LRV_BND0_OFFSET
+	.byte 0x66,0x0f,0x1b,0x4c,0x24,LRV_BND1_OFFSET
+#endif
 	pushl %esp
 	cfi_adjust_cfa_offset (4)
 	# Address of La_i86_regs area.
@@ -185,9 +198,17 @@ _dl_runtime_profile:
 	movl LRV_EDX_OFFSET(%esp), %edx
 	fldt LRV_ST1_OFFSET(%esp)
 	fldt LRV_ST0_OFFSET(%esp)
+#ifdef HAVE_MPX_SUPPORT
+	bndmov LRV_BND0_OFFSET(%esp), %bnd0
+	bndmov LRV_BND1_OFFSET(%esp), %bnd1
+#else
+	.byte 0x66,0x0f,0x1a,0x04,0x24,LRV_BND0_OFFSET
+	.byte 0x66,0x0f,0x1a,0x4c,0x24,LRV_BND1_OFFSET
+#endif
 	# Restore stack before return.
 	addl $(LRV_SIZE + 4 + LR_SIZE + 4), %esp
 	cfi_adjust_cfa_offset (-(LRV_SIZE + 4 + LR_SIZE + 4))
+	PRESERVE_BND_REGS_PREFIX
 	ret
 	cfi_endproc
 	.size _dl_runtime_profile, .-_dl_runtime_profile
diff --git a/sysdeps/i386/link-defines.sym b/sysdeps/i386/link-defines.sym
index a63dcb9..0995adb 100644
--- a/sysdeps/i386/link-defines.sym
+++ b/sysdeps/i386/link-defines.sym
@@ -16,3 +16,5 @@ LRV_EAX_OFFSET		offsetof (struct La_i86_retval, lrv_eax)
 LRV_EDX_OFFSET		offsetof (struct La_i86_retval, lrv_edx)
 LRV_ST0_OFFSET		offsetof (struct La_i86_retval, lrv_st0)
 LRV_ST1_OFFSET		offsetof (struct La_i86_retval, lrv_st1)
+LRV_BND0_OFFSET		offsetof (struct La_i86_retval, lrv_bnd0)
+LRV_BND1_OFFSET		offsetof (struct La_i86_retval, lrv_bnd1)
diff --git a/sysdeps/x86/bits/link.h b/sysdeps/x86/bits/link.h
index 3f559c9..0bf9b9a 100644
--- a/sysdeps/x86/bits/link.h
+++ b/sysdeps/x86/bits/link.h
@@ -38,6 +38,8 @@ typedef struct La_i86_retval
   uint32_t lrv_edx;
   long double lrv_st0;
   long double lrv_st1;
+  uint64_t lrv_bnd0;
+  uint64_t lrv_bnd1;
 } La_i86_retval;
 
 
diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h
index 0e5a6fb..d542428 100644
--- a/sysdeps/x86_64/dl-trampoline.h
+++ b/sysdeps/x86_64/dl-trampoline.h
@@ -63,20 +63,6 @@
 	movaps (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6
 	movaps (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7
 
-#ifndef __ILP32__
-# ifdef HAVE_MPX_SUPPORT
-	bndmov 		    (LR_BND_OFFSET)(%rsp), %bnd0  # Restore bound
-	bndmov (LR_BND_OFFSET +   BND_SIZE)(%rsp), %bnd1  # registers.
-	bndmov (LR_BND_OFFSET + BND_SIZE*2)(%rsp), %bnd2
-	bndmov (LR_BND_OFFSET + BND_SIZE*3)(%rsp), %bnd3
-# else
-	.byte 0x66,0x0f,0x1a,0x84,0x24;.long (LR_BND_OFFSET)
-	.byte 0x66,0x0f,0x1a,0x8c,0x24;.long (LR_BND_OFFSET + BND_SIZE)
-	.byte 0x66,0x0f,0x1a,0x94,0x24;.long (LR_BND_OFFSET + BND_SIZE*2)
-	.byte 0x66,0x0f,0x1a,0x9c,0x24;.long (LR_BND_OFFSET + BND_SIZE*3)
-# endif
-#endif
-
 #ifdef RESTORE_AVX
 	/* Check if any xmm0-xmm7 registers are changed by audit
 	   module.  */
@@ -154,8 +140,24 @@
 
 1:
 #endif
+
+#ifndef __ILP32__
+# ifdef HAVE_MPX_SUPPORT
+	bndmov              (LR_BND_OFFSET)(%rsp), %bnd0  # Restore bound
+	bndmov (LR_BND_OFFSET +   BND_SIZE)(%rsp), %bnd1  # registers.
+	bndmov (LR_BND_OFFSET + BND_SIZE*2)(%rsp), %bnd2
+	bndmov (LR_BND_OFFSET + BND_SIZE*3)(%rsp), %bnd3
+# else
+	.byte 0x66,0x0f,0x1a,0x84,0x24;.long (LR_BND_OFFSET)
+	.byte 0x66,0x0f,0x1a,0x8c,0x24;.long (LR_BND_OFFSET + BND_SIZE)
+	.byte 0x66,0x0f,0x1a,0x94,0x24;.long (LR_BND_OFFSET + BND_SIZE*2)
+	.byte 0x66,0x0f,0x1a,0x9c,0x24;.long (LR_BND_OFFSET + BND_SIZE*3)
+# endif
+#endif
+
 	mov  16(%rbx), %R10_LP	# Anything in framesize?
 	test %R10_LP, %R10_LP
+	PRESERVE_BND_REGS_PREFIX
 	jns 3f
 
 	/* There's nothing in the frame size, so there
@@ -174,6 +176,7 @@
 	addq $48, %rsp		# Adjust the stack to the return value
 				# (eats the reloc index and link_map)
 	cfi_adjust_cfa_offset(-48)
+	PRESERVE_BND_REGS_PREFIX
 	jmp *%r11		# Jump to function address.
 
 3:
@@ -200,6 +203,7 @@
 	movq 32(%rdi), %rsi
 	movq 40(%rdi), %rdi
 
+	PRESERVE_BND_REGS_PREFIX
 	call *%r11
 
 	mov 24(%rbx), %rsp	# Drop the copied stack content
@@ -280,11 +284,11 @@
 
 #ifndef __ILP32__
 # ifdef HAVE_MPX_SUPPORT
-	bndmov LRV_BND0_OFFSET(%rcx), %bnd0  # Restore bound registers.
-	bndmov LRV_BND1_OFFSET(%rcx), %bnd1
+	bndmov LRV_BND0_OFFSET(%rsp), %bnd0  # Restore bound registers.
+	bndmov LRV_BND1_OFFSET(%rsp), %bnd1
 # else
-	.byte  0x66,0x0f,0x1a,0x81;.long (LRV_BND0_OFFSET)
-	.byte  0x66,0x0f,0x1a,0x89;.long (LRV_BND1_OFFSET)
+	.byte  0x66,0x0f,0x1a,0x84,0x24;.long (LRV_BND0_OFFSET)
+	.byte  0x66,0x0f,0x1a,0x8c,0x24;.long (LRV_BND1_OFFSET)
 # endif
 #endif
 
@@ -299,6 +303,7 @@
 	addq $48, %rsp		# Adjust the stack to the return value
 				# (eats the reloc index and link_map)
 	cfi_adjust_cfa_offset(-48)
+	PRESERVE_BND_REGS_PREFIX
 	retq
 
 #ifdef MORE_CODE


* Re: [PATCH, MPX] MPX-specific changes in dl_runtime routines
  2015-07-08 15:56         ` Zamyatin, Igor
@ 2015-07-08 18:49           ` H.J. Lu
  2015-07-09 13:37             ` Zamyatin, Igor
  0 siblings, 1 reply; 27+ messages in thread
From: H.J. Lu @ 2015-07-08 18:49 UTC (permalink / raw)
  To: Zamyatin, Igor; +Cc: libc-alpha

[-- Attachment #1: Type: text/plain, Size: 273 bytes --]

On Wed, Jul 8, 2015 at 8:56 AM, Zamyatin, Igor <igor.zamyatin@intel.com> wrote:
> Fixed in the attached patch
>

I fixed some typos and updated sysdeps/i386/configure for
HAVE_MPX_SUPPORT.  Please verify both with HAVE_MPX_SUPPORT
and without on i386 and x86-64.
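
(One way to exercise the _dl_runtime_profile path; the commands are
illustrative, not a committed test.  Build a do-nothing audit module
and run an MPX-instrumented binary under it:

  $ cat audit.c
  #include <link.h>
  /* la_version is the one hook an audit module must provide.  */
  unsigned int la_version (unsigned int version) { return version; }
  $ gcc -shared -fPIC audit.c -o audit.so
  $ LD_AUDIT=./audit.so ./mpx-test

With LD_AUDIT set, PLT calls go through _dl_runtime_profile instead of
_dl_runtime_resolve, which is why that trampoline needs the same
bound-register handling.)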

-- 
H.J.

[-- Attachment #2: 0001-Preserve-bound-registers-for-pointer-pass-return.patch --]
[-- Type: text/x-patch, Size: 9512 bytes --]

From bed2c05d4d05462c155baada8402516f103a79c4 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Wed, 8 Jul 2015 11:08:44 -0700
Subject: [PATCH] Preserve bound registers for pointer pass/return

We need to save/restore bound registers and add a BND prefix before
branches in _dl_runtime_profile so that bound registers for pointer
pass and return are preserved when LD_AUDIT is used.

	[BZ #18134]
	* sysdeps/i386/configure.ac: Set HAVE_MPX_SUPPORT.
	* sysdeps/i386/configure: Regenerated.
	* sysdeps/i386/dl-trampoline.S (PRESERVE_BND_REGS_PREFIX): New.
	(_dl_runtime_profile): Save and restore Intel MPX return bound
	registers when calling _dl_call_pltexit.  Add
	PRESERVE_BND_REGS_PREFIX before return.
	* sysdeps/i386/link-defines.sym (LRV_BND0_OFFSET): New.
	(LRV_BND1_OFFSET): Likewise.
	* sysdeps/x86/bits/link.h (La_i86_retval): Add lrv_bnd0 and
	lrv_bnd1.
	* sysdeps/x86_64/dl-trampoline.S (_dl_runtime_profile): Fix
	typo in bndmov encoding.
	* sysdeps/x86_64/dl-trampoline.h: Properly save and restore
	Intel MPX bound registers.  Add PRESERVE_BND_REGS_PREFIX before
	branch instructions to preserve bounds.
---
 sysdeps/i386/configure         | 27 +++++++++++++++++++++++++++
 sysdeps/i386/configure.ac      | 15 +++++++++++++++
 sysdeps/i386/dl-trampoline.S   | 21 +++++++++++++++++++++
 sysdeps/i386/link-defines.sym  |  2 ++
 sysdeps/x86/bits/link.h        |  2 ++
 sysdeps/x86_64/dl-trampoline.S |  4 ++--
 sysdeps/x86_64/dl-trampoline.h | 41 +++++++++++++++++++++++------------------
 7 files changed, 92 insertions(+), 20 deletions(-)

diff --git a/sysdeps/i386/configure b/sysdeps/i386/configure
index 6e89b59..ab66c08 100644
--- a/sysdeps/i386/configure
+++ b/sysdeps/i386/configure
@@ -240,6 +240,33 @@ $as_echo "$libc_cv_cc_novzeroupper" >&6; }
 config_vars="$config_vars
 config-cflags-novzeroupper = $libc_cv_cc_novzeroupper"
 
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for Intel MPX support" >&5
+$as_echo_n "checking for Intel MPX support... " >&6; }
+if ${libc_cv_asm_mpx+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat > conftest.s <<\EOF
+        bndmov %bnd0,(%esp)
+EOF
+if { ac_try='${CC-cc} -c $ASFLAGS conftest.s 1>&5'
+  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; }; then
+  libc_cv_asm_mpx=yes
+else
+  libc_cv_asm_mpx=no
+fi
+rm -f conftest*
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_asm_mpx" >&5
+$as_echo "$libc_cv_asm_mpx" >&6; }
+if test $libc_cv_asm_mpx == yes; then
+  $as_echo "#define HAVE_MPX_SUPPORT 1" >>confdefs.h
+
+fi
+
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for AVX2 support" >&5
 $as_echo_n "checking for AVX2 support... " >&6; }
 if ${libc_cv_cc_avx2+:} false; then :
diff --git a/sysdeps/i386/configure.ac b/sysdeps/i386/configure.ac
index 35c4522..a3f3067 100644
--- a/sysdeps/i386/configure.ac
+++ b/sysdeps/i386/configure.ac
@@ -88,6 +88,21 @@ LIBC_TRY_CC_OPTION([-mno-vzeroupper],
 ])
 LIBC_CONFIG_VAR([config-cflags-novzeroupper], [$libc_cv_cc_novzeroupper])
 
+dnl Check whether asm supports Intel MPX
+AC_CACHE_CHECK(for Intel MPX support, libc_cv_asm_mpx, [dnl
+cat > conftest.s <<\EOF
+        bndmov %bnd0,(%esp)
+EOF
+if AC_TRY_COMMAND(${CC-cc} -c $ASFLAGS conftest.s 1>&AS_MESSAGE_LOG_FD); then
+  libc_cv_asm_mpx=yes
+else
+  libc_cv_asm_mpx=no
+fi
+rm -f conftest*])
+if test $libc_cv_asm_mpx == yes; then
+  AC_DEFINE(HAVE_MPX_SUPPORT)
+fi
+
 dnl Check if -mavx2 works.
 AC_CACHE_CHECK(for AVX2 support, libc_cv_cc_avx2, [dnl
 LIBC_TRY_CC_OPTION([-mavx2], [libc_cv_cc_avx2=yes], [libc_cv_cc_avx2=no])
diff --git a/sysdeps/i386/dl-trampoline.S b/sysdeps/i386/dl-trampoline.S
index 7c72b03..8a2fd8d 100644
--- a/sysdeps/i386/dl-trampoline.S
+++ b/sysdeps/i386/dl-trampoline.S
@@ -19,6 +19,12 @@
 #include <sysdep.h>
 #include <link-defines.h>
 
+#ifdef HAVE_MPX_SUPPORT
+# define PRESERVE_BND_REGS_PREFIX bnd
+#else
+# define PRESERVE_BND_REGS_PREFIX .byte 0xf2
+#endif
+
 	.text
 	.globl _dl_runtime_resolve
 	.type _dl_runtime_resolve, @function
@@ -172,6 +178,13 @@ _dl_runtime_profile:
 	movl %edx, LRV_EDX_OFFSET(%esp)
 	fstpt LRV_ST0_OFFSET(%esp)
 	fstpt LRV_ST1_OFFSET(%esp)
+#ifdef HAVE_MPX_SUPPORT
+	bndmov %bnd0, LRV_BND0_OFFSET(%esp)
+	bndmov %bnd1, LRV_BND1_OFFSET(%esp)
+#else
+	.byte 0x66,0x0f,0x1b,0x44,0x24,LRV_BND0_OFFSET
+	.byte 0x66,0x0f,0x1b,0x4c,0x24,LRV_BND1_OFFSET
+#endif
 	pushl %esp
 	cfi_adjust_cfa_offset (4)
 	# Address of La_i86_regs area.
@@ -185,9 +198,17 @@ _dl_runtime_profile:
 	movl LRV_EDX_OFFSET(%esp), %edx
 	fldt LRV_ST1_OFFSET(%esp)
 	fldt LRV_ST0_OFFSET(%esp)
+#ifdef HAVE_MPX_SUPPORT
+	bndmov LRV_BND0_OFFSET(%esp), %bnd0
+	bndmov LRV_BND1_OFFSET(%esp), %bnd1
+#else
+	.byte 0x66,0x0f,0x1a,0x44,0x24,LRV_BND0_OFFSET
+	.byte 0x66,0x0f,0x1a,0x4c,0x24,LRV_BND1_OFFSET
+#endif
 	# Restore stack before return.
 	addl $(LRV_SIZE + 4 + LR_SIZE + 4), %esp
 	cfi_adjust_cfa_offset (-(LRV_SIZE + 4 + LR_SIZE + 4))
+	PRESERVE_BND_REGS_PREFIX
 	ret
 	cfi_endproc
 	.size _dl_runtime_profile, .-_dl_runtime_profile
diff --git a/sysdeps/i386/link-defines.sym b/sysdeps/i386/link-defines.sym
index a63dcb9..0995adb 100644
--- a/sysdeps/i386/link-defines.sym
+++ b/sysdeps/i386/link-defines.sym
@@ -16,3 +16,5 @@ LRV_EAX_OFFSET		offsetof (struct La_i86_retval, lrv_eax)
 LRV_EDX_OFFSET		offsetof (struct La_i86_retval, lrv_edx)
 LRV_ST0_OFFSET		offsetof (struct La_i86_retval, lrv_st0)
 LRV_ST1_OFFSET		offsetof (struct La_i86_retval, lrv_st1)
+LRV_BND0_OFFSET		offsetof (struct La_i86_retval, lrv_bnd0)
+LRV_BND1_OFFSET		offsetof (struct La_i86_retval, lrv_bnd1)
diff --git a/sysdeps/x86/bits/link.h b/sysdeps/x86/bits/link.h
index 3f559c9..0bf9b9a 100644
--- a/sysdeps/x86/bits/link.h
+++ b/sysdeps/x86/bits/link.h
@@ -38,6 +38,8 @@ typedef struct La_i86_retval
   uint32_t lrv_edx;
   long double lrv_st0;
   long double lrv_st1;
+  uint64_t lrv_bnd0;
+  uint64_t lrv_bnd1;
 } La_i86_retval;
 
 
diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
index 5f9b35d..b151d35 100644
--- a/sysdeps/x86_64/dl-trampoline.S
+++ b/sysdeps/x86_64/dl-trampoline.S
@@ -206,8 +206,8 @@ _dl_runtime_profile:
 #  else
 	.byte 0x66,0x0f,0x1b,0x84,0x24;.long (LR_BND_OFFSET)
 	.byte 0x66,0x0f,0x1b,0x8c,0x24;.long (LR_BND_OFFSET + BND_SIZE)
-	.byte 0x66,0x0f,0x1b,0x84,0x24;.long (LR_BND_OFFSET + BND_SIZE*2)
-	.byte 0x66,0x0f,0x1b,0x8c,0x24;.long (LR_BND_OFFSET + BND_SIZE*3)
+	.byte 0x66,0x0f,0x1b,0x94,0x24;.long (LR_BND_OFFSET + BND_SIZE*2)
+	.byte 0x66,0x0f,0x1b,0x9c,0x24;.long (LR_BND_OFFSET + BND_SIZE*3)
 #  endif
 # endif
 
diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h
index 0e5a6fb..d542428 100644
--- a/sysdeps/x86_64/dl-trampoline.h
+++ b/sysdeps/x86_64/dl-trampoline.h
@@ -63,20 +63,6 @@
 	movaps (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6
 	movaps (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7
 
-#ifndef __ILP32__
-# ifdef HAVE_MPX_SUPPORT
-	bndmov 		    (LR_BND_OFFSET)(%rsp), %bnd0  # Restore bound
-	bndmov (LR_BND_OFFSET +   BND_SIZE)(%rsp), %bnd1  # registers.
-	bndmov (LR_BND_OFFSET + BND_SIZE*2)(%rsp), %bnd2
-	bndmov (LR_BND_OFFSET + BND_SIZE*3)(%rsp), %bnd3
-# else
-	.byte 0x66,0x0f,0x1a,0x84,0x24;.long (LR_BND_OFFSET)
-	.byte 0x66,0x0f,0x1a,0x8c,0x24;.long (LR_BND_OFFSET + BND_SIZE)
-	.byte 0x66,0x0f,0x1a,0x94,0x24;.long (LR_BND_OFFSET + BND_SIZE*2)
-	.byte 0x66,0x0f,0x1a,0x9c,0x24;.long (LR_BND_OFFSET + BND_SIZE*3)
-# endif
-#endif
-
 #ifdef RESTORE_AVX
 	/* Check if any xmm0-xmm7 registers are changed by audit
 	   module.  */
@@ -154,8 +140,24 @@
 
 1:
 #endif
+
+#ifndef __ILP32__
+# ifdef HAVE_MPX_SUPPORT
+	bndmov              (LR_BND_OFFSET)(%rsp), %bnd0  # Restore bound
+	bndmov (LR_BND_OFFSET +   BND_SIZE)(%rsp), %bnd1  # registers.
+	bndmov (LR_BND_OFFSET + BND_SIZE*2)(%rsp), %bnd2
+	bndmov (LR_BND_OFFSET + BND_SIZE*3)(%rsp), %bnd3
+# else
+	.byte 0x66,0x0f,0x1a,0x84,0x24;.long (LR_BND_OFFSET)
+	.byte 0x66,0x0f,0x1a,0x8c,0x24;.long (LR_BND_OFFSET + BND_SIZE)
+	.byte 0x66,0x0f,0x1a,0x94,0x24;.long (LR_BND_OFFSET + BND_SIZE*2)
+	.byte 0x66,0x0f,0x1a,0x9c,0x24;.long (LR_BND_OFFSET + BND_SIZE*3)
+# endif
+#endif
+
 	mov  16(%rbx), %R10_LP	# Anything in framesize?
 	test %R10_LP, %R10_LP
+	PRESERVE_BND_REGS_PREFIX
 	jns 3f
 
 	/* There's nothing in the frame size, so there
@@ -174,6 +176,7 @@
 	addq $48, %rsp		# Adjust the stack to the return value
 				# (eats the reloc index and link_map)
 	cfi_adjust_cfa_offset(-48)
+	PRESERVE_BND_REGS_PREFIX
 	jmp *%r11		# Jump to function address.
 
 3:
@@ -200,6 +203,7 @@
 	movq 32(%rdi), %rsi
 	movq 40(%rdi), %rdi
 
+	PRESERVE_BND_REGS_PREFIX
 	call *%r11
 
 	mov 24(%rbx), %rsp	# Drop the copied stack content
@@ -280,11 +284,11 @@
 
 #ifndef __ILP32__
 # ifdef HAVE_MPX_SUPPORT
-	bndmov LRV_BND0_OFFSET(%rcx), %bnd0  # Restore bound registers.
-	bndmov LRV_BND1_OFFSET(%rcx), %bnd1
+	bndmov LRV_BND0_OFFSET(%rsp), %bnd0  # Restore bound registers.
+	bndmov LRV_BND1_OFFSET(%rsp), %bnd1
 # else
-	.byte  0x66,0x0f,0x1a,0x81;.long (LRV_BND0_OFFSET)
-	.byte  0x66,0x0f,0x1a,0x89;.long (LRV_BND1_OFFSET)
+	.byte  0x66,0x0f,0x1a,0x84,0x24;.long (LRV_BND0_OFFSET)
+	.byte  0x66,0x0f,0x1a,0x8c,0x24;.long (LRV_BND1_OFFSET)
 # endif
 #endif
 
@@ -299,6 +303,7 @@
 	addq $48, %rsp		# Adjust the stack to the return value
 				# (eats the reloc index and link_map)
 	cfi_adjust_cfa_offset(-48)
+	PRESERVE_BND_REGS_PREFIX
 	retq
 
 #ifdef MORE_CODE
-- 
2.4.3



* RE: [PATCH, MPX] MPX-specific changes in dl_runtime routines
  2015-07-08 18:49           ` H.J. Lu
@ 2015-07-09 13:37             ` Zamyatin, Igor
  2015-07-09 14:12               ` H.J. Lu
  0 siblings, 1 reply; 27+ messages in thread
From: Zamyatin, Igor @ 2015-07-09 13:37 UTC (permalink / raw)
  To: H.J. Lu; +Cc: libc-alpha

> On Wed, Jul 8, 2015 at 8:56 AM, Zamyatin, Igor <igor.zamyatin@intel.com>
> wrote:
> > Fixed in the attached patch
> >
> 
> I fixed some typos and updated sysdeps/i386/configure for
> HAVE_MPX_SUPPORT.  Please verify both with HAVE_MPX_SUPPORT and
> without on i386 and x86-64.

Done, all works fine

Thanks,
Igor

> 
> --
> H.J.


* Re: [PATCH, MPX] MPX-specific changes in dl_runtime routines
  2015-07-09 13:37             ` Zamyatin, Igor
@ 2015-07-09 14:12               ` H.J. Lu
  2015-07-09 14:29                 ` Ondřej Bílka
  0 siblings, 1 reply; 27+ messages in thread
From: H.J. Lu @ 2015-07-09 14:12 UTC (permalink / raw)
  To: Zamyatin, Igor; +Cc: libc-alpha

On Thu, Jul 9, 2015 at 6:37 AM, Zamyatin, Igor <igor.zamyatin@intel.com> wrote:
>> On Wed, Jul 8, 2015 at 8:56 AM, Zamyatin, Igor <igor.zamyatin@intel.com>
>> wrote:
>> > Fixed in the attached patch
>> >
>>
>> I fixed some typos and updated sysdeps/i386/configure for
>> HAVE_MPX_SUPPORT.  Please verify both with HAVE_MPX_SUPPORT and
>> without on i386 and x86-64.
>
> Done, all works fine
>

I checked it in for you.

Thanks.


-- 
H.J.


* Re: [PATCH, MPX] MPX-specific changes in dl_runtime routines
  2015-07-09 14:12               ` H.J. Lu
@ 2015-07-09 14:29                 ` Ondřej Bílka
  2015-07-09 16:07                   ` H.J. Lu
  0 siblings, 1 reply; 27+ messages in thread
From: Ondřej Bílka @ 2015-07-09 14:29 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Zamyatin, Igor, libc-alpha

On Thu, Jul 09, 2015 at 07:12:24AM -0700, H.J. Lu wrote:
> On Thu, Jul 9, 2015 at 6:37 AM, Zamyatin, Igor <igor.zamyatin@intel.com> wrote:
> >> On Wed, Jul 8, 2015 at 8:56 AM, Zamyatin, Igor <igor.zamyatin@intel.com>
> >> wrote:
> >> > Fixed in the attached patch
> >> >
> >>
> >> I fixed some typos and updated sysdeps/i386/configure for
> >> HAVE_MPX_SUPPORT.  Please verify both with HAVE_MPX_SUPPORT and
> >> without on i386 and x86-64.
> >
> > Done, all works fine
> >
> 
> I checked it in for you.
> 
These are nice, but you could have the same problem with lazy TLS
allocation.  I wrote a patch to merge the trampolines, which now
conflicts.  Could you write a similar patch to solve that?  The
original purpose was to always save the xmm registers so we could use
SSE2 routines, which speeds up lookup time.
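
(Background, as I understand it: because _dl_runtime_resolve currently
preserves no vector registers, everything reachable from it, including
the strcmp on symbol names in elf/dl-lookup.c, must avoid SSE, so the
dynamic linker is stuck with the generic byte-at-a-time comparison.
Saving xmm0-xmm7 in the trampoline would lift that restriction.)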


* Re: [PATCH, MPX] MPX-specific changes in dl_runtime routines
  2015-07-09 14:29                 ` Ondřej Bílka
@ 2015-07-09 16:07                   ` H.J. Lu
  2015-07-11 10:47                     ` Ondřej Bílka
  0 siblings, 1 reply; 27+ messages in thread
From: H.J. Lu @ 2015-07-09 16:07 UTC (permalink / raw)
  To: Ondřej Bílka; +Cc: Zamyatin, Igor, libc-alpha

On Thu, Jul 9, 2015 at 7:28 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
> On Thu, Jul 09, 2015 at 07:12:24AM -0700, H.J. Lu wrote:
>> On Thu, Jul 9, 2015 at 6:37 AM, Zamyatin, Igor <igor.zamyatin@intel.com> wrote:
>> >> On Wed, Jul 8, 2015 at 8:56 AM, Zamyatin, Igor <igor.zamyatin@intel.com>
>> >> wrote:
>> >> > Fixed in the attached patch
>> >> >
>> >>
>> >> I fixed some typos and updated sysdeps/i386/configure for
>> >> HAVE_MPX_SUPPORT.  Please verify both with HAVE_MPX_SUPPORT and
>> >> without on i386 and x86-64.
>> >
>> > Done, all works fine
>> >
>>
>> I checked it in for you.
>>
> These are nice, but you could have the same problem with lazy TLS
> allocation.  I wrote a patch to merge the trampolines, which now
> conflicts.  Could you write a similar patch to solve that?  The
> original purpose was to always save the xmm registers so we could use
> SSE2 routines, which speeds up lookup time.

So we will preserve only xmm0 to xmm7 in _dl_runtime_resolve? How
much gain will it give us?

-- 
H.J.


* Re: [PATCH, MPX] MPX-specific changes in dl_runtime routines
  2015-07-09 16:07                   ` H.J. Lu
@ 2015-07-11 10:47                     ` Ondřej Bílka
  2015-07-11 20:27                       ` H.J. Lu
  0 siblings, 1 reply; 27+ messages in thread
From: Ondřej Bílka @ 2015-07-11 10:47 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Zamyatin, Igor, libc-alpha

On Thu, Jul 09, 2015 at 09:07:24AM -0700, H.J. Lu wrote:
> On Thu, Jul 9, 2015 at 7:28 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
> > On Thu, Jul 09, 2015 at 07:12:24AM -0700, H.J. Lu wrote:
> >> On Thu, Jul 9, 2015 at 6:37 AM, Zamyatin, Igor <igor.zamyatin@intel.com> wrote:
> >> >> On Wed, Jul 8, 2015 at 8:56 AM, Zamyatin, Igor <igor.zamyatin@intel.com>
> >> >> wrote:
> >> >> > Fixed in the attached patch
> >> >> >
> >> >>
> >> >> I fixed some typos and updated sysdeps/i386/configure for
> >> >> HAVE_MPX_SUPPORT.  Please verify both with HAVE_MPX_SUPPORT and
> >> >> without on i386 and x86-64.
> >> >
> >> > Done, all works fine
> >> >
> >>
> >> I checked it in for you.
> >>
> > These are nice, but you could have the same problem with lazy TLS
> > allocation.  I wrote a patch to merge the trampolines, which now
> > conflicts.  Could you write a similar patch to solve that?  The
> > original purpose was to always save the xmm registers so we could
> > use SSE2 routines, which speeds up lookup time.
> 
> So we will preserve only xmm0 to xmm7 in _dl_runtime_resolve? How
> much gain will it give us?
>
I couldn't measure that without the patch.  The gain now would be big,
as we currently use a byte-by-byte loop to check symbol names, which is
slow, especially with C++ name mangling.  Would the following benchmark
be good to measure the speedup, or do I need to measure startup time,
which is a bit harder?

#define _GNU_SOURCE
#include <dlfcn.h>

int main (void)
{
  int p = 0;
  for (int i = 0; i < 10000000; i++)
    /* Accumulate the result so the compiler cannot optimize the
       lookups away; each call does a full scope-wide symbol search.  */
    p += (int) (long) dlsym (RTLD_DEFAULT, "strlen");
  return p;
}
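
(Assumed usage, not stated above: build with something like
"gcc -O2 -std=gnu99 bench.c -ldl" and compare wall-clock time before
and after the patch.  Each dlsym (RTLD_DEFAULT, ...) call performs a
full scope-wide symbol lookup, so the loop is dominated by the
name-comparison cost being discussed.)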


* Re: [PATCH, MPX] MPX-specific changes in dl_runtime routines
  2015-07-11 10:47                     ` Ondřej Bílka
@ 2015-07-11 20:27                       ` H.J. Lu
  2015-07-11 21:00                         ` H.J. Lu
  2015-07-11 23:50                         ` [PATCH] Save and restore xmm0-xmm7 in _dl_runtime_resolve H.J. Lu
  0 siblings, 2 replies; 27+ messages in thread
From: H.J. Lu @ 2015-07-11 20:27 UTC (permalink / raw)
  To: Ondřej Bílka; +Cc: libc-alpha

On Sat, Jul 11, 2015 at 12:46:54PM +0200, Ondřej Bílka wrote:
> On Thu, Jul 09, 2015 at 09:07:24AM -0700, H.J. Lu wrote:
> > On Thu, Jul 9, 2015 at 7:28 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
> > > On Thu, Jul 09, 2015 at 07:12:24AM -0700, H.J. Lu wrote:
> > >> On Thu, Jul 9, 2015 at 6:37 AM, Zamyatin, Igor <igor.zamyatin@intel.com> wrote:
> > >> >> On Wed, Jul 8, 2015 at 8:56 AM, Zamyatin, Igor <igor.zamyatin@intel.com>
> > >> >> wrote:
> > >> >> > Fixed in the attached patch
> > >> >> >
> > >> >>
> > >> >> I fixed some typos and updated sysdeps/i386/configure for
> > >> >> HAVE_MPX_SUPPORT.  Please verify both with HAVE_MPX_SUPPORT and
> > >> >> without on i386 and x86-64.
> > >> >
> > >> > Done, all works fine
> > >> >
> > >>
> > >> I checked it in for you.
> > >>
> > > These are nice, but you could have the same problem with lazy TLS
> > > allocation.  I wrote a patch to merge the trampolines, which now
> > > conflicts.  Could you write a similar patch to solve that?  The
> > > original purpose was to always save the xmm registers so we could
> > > use SSE2 routines, which speeds up lookup time.
> > 
> > So we will preserve only xmm0 to xmm7 in _dl_runtime_resolve? How
> > much gain will it give us?
> >
> I couldn't measure that without the patch.  The gain now would be big,
> as we currently use a byte-by-byte loop to check symbol names, which is
> slow, especially with C++ name mangling.  Would the following benchmark
> be good to measure the speedup, or do I need to measure startup time,
> which is a bit harder?
> 

Please try this.


H.J.
---
 sysdeps/x86_64/dl-trampoline.S | 42 +++++++++++++++++++++++++++++++++++-------
 1 file changed, 35 insertions(+), 7 deletions(-)

diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
index 678c57f..ea81d7d 100644
--- a/sysdeps/x86_64/dl-trampoline.S
+++ b/sysdeps/x86_64/dl-trampoline.S
@@ -27,26 +27,36 @@
 /* Area on stack to save and restore registers used for parameter
    passing when calling _dl_fixup.  */
 #ifdef __ILP32__
-/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX.  */
-# define REGISTER_SAVE_AREA	(8 * 7)
-# define REGISTER_SAVE_RAX	0
+/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as XMM0 to
+   XMM7.  */
+# define REGISTER_SAVE_AREA	(8 * 7 + 16 * 8)
+/* Align XMM register save area to 16 bytes.  */
+# define REGISTER_SAVE_XMM0	0
 # define PRESERVE_BND_REGS_PREFIX
 #else
-/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as BND0,
-   BND1, BND2, BND3.  */
-# define REGISTER_SAVE_AREA	(8 * 7 + 16 * 4)
+/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as
+   BND0, BND1, BND2, BND3 and XMM0 to XMM7. */
+# define REGISTER_SAVE_AREA	(8 * 7 + 16 * 4 + 16 * 8)
 /* Align bound register save area to 16 bytes.  */
 # define REGISTER_SAVE_BND0	0
 # define REGISTER_SAVE_BND1	(REGISTER_SAVE_BND0 + 16)
 # define REGISTER_SAVE_BND2	(REGISTER_SAVE_BND1 + 16)
 # define REGISTER_SAVE_BND3	(REGISTER_SAVE_BND2 + 16)
-# define REGISTER_SAVE_RAX	(REGISTER_SAVE_BND3 + 16)
+# define REGISTER_SAVE_XMM0	(REGISTER_SAVE_BND3 + 16)
 # ifdef HAVE_MPX_SUPPORT
 #  define PRESERVE_BND_REGS_PREFIX bnd
 # else
 #  define PRESERVE_BND_REGS_PREFIX .byte 0xf2
 # endif
 #endif
+#define REGISTER_SAVE_XMM1	(REGISTER_SAVE_XMM0 + 16)
+#define REGISTER_SAVE_XMM2	(REGISTER_SAVE_XMM1 + 16)
+#define REGISTER_SAVE_XMM3	(REGISTER_SAVE_XMM2 + 16)
+#define REGISTER_SAVE_XMM4	(REGISTER_SAVE_XMM3 + 16)
+#define REGISTER_SAVE_XMM5	(REGISTER_SAVE_XMM4 + 16)
+#define REGISTER_SAVE_XMM6	(REGISTER_SAVE_XMM5 + 16)
+#define REGISTER_SAVE_XMM7	(REGISTER_SAVE_XMM6 + 16)
+#define REGISTER_SAVE_RAX	(REGISTER_SAVE_XMM7 + 16)
 #define REGISTER_SAVE_RCX	(REGISTER_SAVE_RAX + 8)
 #define REGISTER_SAVE_RDX	(REGISTER_SAVE_RCX + 8)
 #define REGISTER_SAVE_RSI	(REGISTER_SAVE_RDX + 8)
@@ -71,6 +81,15 @@ _dl_runtime_resolve:
 	movq %rdi, REGISTER_SAVE_RDI(%rsp)
 	movq %r8, REGISTER_SAVE_R8(%rsp)
 	movq %r9, REGISTER_SAVE_R9(%rsp)
+	# movaps is 1-byte shorter.
+	movaps %xmm0, REGISTER_SAVE_XMM0(%rsp)
+	movaps %xmm1, REGISTER_SAVE_XMM1(%rsp)
+	movaps %xmm2, REGISTER_SAVE_XMM2(%rsp)
+	movaps %xmm3, REGISTER_SAVE_XMM3(%rsp)
+	movaps %xmm4, REGISTER_SAVE_XMM4(%rsp)
+	movaps %xmm5, REGISTER_SAVE_XMM5(%rsp)
+	movaps %xmm6, REGISTER_SAVE_XMM6(%rsp)
+	movaps %xmm7, REGISTER_SAVE_XMM7(%rsp)
 #ifndef __ILP32__
 	# We also have to preserve bound registers.  These are nops if
 	# Intel MPX isn't available or disabled.
@@ -123,6 +142,15 @@ _dl_runtime_resolve:
 	movq REGISTER_SAVE_RDX(%rsp), %rdx
 	movq REGISTER_SAVE_RCX(%rsp), %rcx
 	movq REGISTER_SAVE_RAX(%rsp), %rax
+	# movaps is 1-byte shorter.
+	movaps REGISTER_SAVE_XMM0(%rsp), %xmm0
+	movaps REGISTER_SAVE_XMM1(%rsp), %xmm1
+	movaps REGISTER_SAVE_XMM2(%rsp), %xmm2
+	movaps REGISTER_SAVE_XMM3(%rsp), %xmm3
+	movaps REGISTER_SAVE_XMM4(%rsp), %xmm4
+	movaps REGISTER_SAVE_XMM5(%rsp), %xmm5
+	movaps REGISTER_SAVE_XMM6(%rsp), %xmm6
+	movaps REGISTER_SAVE_XMM7(%rsp), %xmm7
 	# Adjust stack(PLT did 2 pushes)
 	addq $(REGISTER_SAVE_AREA + 16), %rsp
 	cfi_adjust_cfa_offset(-(REGISTER_SAVE_AREA + 16))
-- 
2.4.3


* Re: [PATCH, MPX] MPX-specific changes in dl_runtime routines
  2015-07-11 20:27                       ` H.J. Lu
@ 2015-07-11 21:00                         ` H.J. Lu
  2015-07-11 23:50                         ` [PATCH] Save and restore xmm0-xmm7 in _dl_runtime_resolve H.J. Lu
  1 sibling, 0 replies; 27+ messages in thread
From: H.J. Lu @ 2015-07-11 21:00 UTC (permalink / raw)
  To: Ondřej Bílka; +Cc: libc-alpha

On Sat, Jul 11, 2015 at 1:27 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Sat, Jul 11, 2015 at 12:46:54PM +0200, Ondřej Bílka wrote:
>> On Thu, Jul 09, 2015 at 09:07:24AM -0700, H.J. Lu wrote:
>> > On Thu, Jul 9, 2015 at 7:28 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
>> > > On Thu, Jul 09, 2015 at 07:12:24AM -0700, H.J. Lu wrote:
>> > >> On Thu, Jul 9, 2015 at 6:37 AM, Zamyatin, Igor <igor.zamyatin@intel.com> wrote:
>> > >> >> On Wed, Jul 8, 2015 at 8:56 AM, Zamyatin, Igor <igor.zamyatin@intel.com>
>> > >> >> wrote:
>> > >> >> > Fixed in the attached patch
>> > >> >> >
>> > >> >>
>> > >> >> I fixed some typos and updated sysdeps/i386/configure for
>> > >> >> HAVE_MPX_SUPPORT.  Please verify both with HAVE_MPX_SUPPORT and
>> > >> >> without on i386 and x86-64.
>> > >> >
>> > >> > Done, all works fine
>> > >> >
>> > >>
>> > >> I checked it in for you.
>> > >>
>> > > These are nice but you could have same problem with lazy tls allocation.
>> > > I wrote patch to merge trampolines, which now conflicts. Could you write
>> > > similar patch to solve that? Original purpose was to always save xmm
>> > > registers so we could use sse2 routines which speeds up lookup time.
>> >
>> > So we will preserve only xmm0 to xmm7 in _dl_runtime_resolve? How
>> > much gain it will give us?
>> >
>> I couldn't measure that without patch. Gain now would be big as we now
>> use byte-by-byte loop to check symbol name which is slow, especially
>> with c++ name mangling. Would be following benchmark good to measure
>> speedup or do I need to measure startup time which is bit harder?
>>
>
> Please try this.
>
>
> H.J.
> ---
>  sysdeps/x86_64/dl-trampoline.S | 42 +++++++++++++++++++++++++++++++++++-------
>  1 file changed, 35 insertions(+), 7 deletions(-)
>
> diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
> index 678c57f..ea81d7d 100644
> --- a/sysdeps/x86_64/dl-trampoline.S
> +++ b/sysdeps/x86_64/dl-trampoline.S
> @@ -27,26 +27,36 @@
>  /* Area on stack to save and restore registers used for parameter
>     passing when calling _dl_fixup.  */
>  #ifdef __ILP32__
> -/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX.  */
> -# define REGISTER_SAVE_AREA    (8 * 7)
> -# define REGISTER_SAVE_RAX     0
> +/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as XMM0 to
> +   XMM7.  */
> +# define REGISTER_SAVE_AREA    (8 * 7 + 16 * 8)
> +/* Align XMM register save area to 16 bytes.  */
> +# define REGISTER_SAVE_XMM0    0
>  # define PRESERVE_BND_REGS_PREFIX
>  #else
> -/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as BND0,
> -   BND1, BND2, BND3.  */
> -# define REGISTER_SAVE_AREA    (8 * 7 + 16 * 4)
> +/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as
> +   BND0, BND1, BND2, BND3 and XMM0 to XMM7. */
> +# define REGISTER_SAVE_AREA    (8 * 7 + 16 * 4 + 16 * 8)
>  /* Align bound register save area to 16 bytes.  */
>  # define REGISTER_SAVE_BND0    0
>  # define REGISTER_SAVE_BND1    (REGISTER_SAVE_BND0 + 16)
>  # define REGISTER_SAVE_BND2    (REGISTER_SAVE_BND1 + 16)
>  # define REGISTER_SAVE_BND3    (REGISTER_SAVE_BND2 + 16)
> -# define REGISTER_SAVE_RAX     (REGISTER_SAVE_BND3 + 16)
> +# define REGISTER_SAVE_XMM0    (REGISTER_SAVE_BND3 + 16)
>  # ifdef HAVE_MPX_SUPPORT
>  #  define PRESERVE_BND_REGS_PREFIX bnd
>  # else
>  #  define PRESERVE_BND_REGS_PREFIX .byte 0xf2
>  # endif
>  #endif
> +#define REGISTER_SAVE_XMM1     (REGISTER_SAVE_XMM0 + 16)
> +#define REGISTER_SAVE_XMM2     (REGISTER_SAVE_XMM1 + 16)
> +#define REGISTER_SAVE_XMM3     (REGISTER_SAVE_XMM2 + 16)
> +#define REGISTER_SAVE_XMM4     (REGISTER_SAVE_XMM3 + 16)
> +#define REGISTER_SAVE_XMM5     (REGISTER_SAVE_XMM4 + 16)
> +#define REGISTER_SAVE_XMM6     (REGISTER_SAVE_XMM5 + 16)
> +#define REGISTER_SAVE_XMM7     (REGISTER_SAVE_XMM6 + 16)
> +#define REGISTER_SAVE_RAX      (REGISTER_SAVE_XMM7 + 16)
>  #define REGISTER_SAVE_RCX      (REGISTER_SAVE_RAX + 8)
>  #define REGISTER_SAVE_RDX      (REGISTER_SAVE_RCX + 8)
>  #define REGISTER_SAVE_RSI      (REGISTER_SAVE_RDX + 8)
> @@ -71,6 +81,15 @@ _dl_runtime_resolve:
>         movq %rdi, REGISTER_SAVE_RDI(%rsp)
>         movq %r8, REGISTER_SAVE_R8(%rsp)
>         movq %r9, REGISTER_SAVE_R9(%rsp)
> +       # movaps is 1-byte shorter.
> +       movaps %xmm0, REGISTER_SAVE_XMM0(%rsp)
> +       movaps %xmm1, REGISTER_SAVE_XMM1(%rsp)
> +       movaps %xmm2, REGISTER_SAVE_XMM2(%rsp)
> +       movaps %xmm3, REGISTER_SAVE_XMM3(%rsp)
> +       movaps %xmm4, REGISTER_SAVE_XMM4(%rsp)
> +       movaps %xmm5, REGISTER_SAVE_XMM5(%rsp)
> +       movaps %xmm6, REGISTER_SAVE_XMM6(%rsp)
> +       movaps %xmm7, REGISTER_SAVE_XMM7(%rsp)
>  #ifndef __ILP32__
>         # We also have to preserve bound registers.  These are nops if
>         # Intel MPX isn't available or disabled.
> @@ -123,6 +142,15 @@ _dl_runtime_resolve:
>         movq REGISTER_SAVE_RDX(%rsp), %rdx
>         movq REGISTER_SAVE_RCX(%rsp), %rcx
>         movq REGISTER_SAVE_RAX(%rsp), %rax
> +       # movaps is 1-byte shorter.
> +       movaps REGISTER_SAVE_XMM0(%rsp), %xmm0
> +       movaps REGISTER_SAVE_XMM1(%rsp), %xmm1
> +       movaps REGISTER_SAVE_XMM2(%rsp), %xmm2
> +       movaps REGISTER_SAVE_XMM3(%rsp), %xmm3
> +       movaps REGISTER_SAVE_XMM4(%rsp), %xmm4
> +       movaps REGISTER_SAVE_XMM5(%rsp), %xmm5
> +       movaps REGISTER_SAVE_XMM6(%rsp), %xmm6
> +       movaps REGISTER_SAVE_XMM7(%rsp), %xmm7
>         # Adjust stack(PLT did 2 pushes)
>         addq $(REGISTER_SAVE_AREA + 16), %rsp
>         cfi_adjust_cfa_offset(-(REGISTER_SAVE_AREA + 16))
> --
> 2.4.3
>

movaps doesn't work here, since the stack may be misaligned when
__tls_get_addr is called, due to:

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
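
To make the failure mode concrete, here is a standalone sketch (not
part of any patch in this thread): the aligned-store intrinsic
compiles to movaps, which faults on a 16-byte-misaligned operand,
while the unaligned variant compiles to movups and works on any
address:

#include <stdio.h>
#include <xmmintrin.h>

int
main (void)
{
  /* 16-byte-aligned buffer; buf + 2 is 8 bytes past that, so the
     stores below use a deliberately misaligned address.  */
  _Alignas (16) float buf[8];
  __m128 v = _mm_set1_ps (1.0f);

  _mm_storeu_ps (buf + 2, v);	/* movups: fine on any address */
  /* _mm_store_ps (buf + 2, v);	   movaps: #GP fault (SIGSEGV) */

  printf ("%f\n", buf[2]);
  return 0;
}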

-- 
H.J.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH] Save and restore xmm0-xmm7 in _dl_runtime_resolve
  2015-07-11 20:27                       ` H.J. Lu
  2015-07-11 21:00                         ` H.J. Lu
@ 2015-07-11 23:50                         ` H.J. Lu
  2015-07-12 21:50                           ` H.J. Lu
  2015-07-26 13:16                           ` Ondřej Bílka
  1 sibling, 2 replies; 27+ messages in thread
From: H.J. Lu @ 2015-07-11 23:50 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Ondřej Bílka, libc-alpha

On Sat, Jul 11, 2015 at 01:27:42PM -0700, H.J. Lu wrote:
> On Sat, Jul 11, 2015 at 12:46:54PM +0200, Ondřej Bílka wrote:
> > On Thu, Jul 09, 2015 at 09:07:24AM -0700, H.J. Lu wrote:
> > > On Thu, Jul 9, 2015 at 7:28 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
> > > > On Thu, Jul 09, 2015 at 07:12:24AM -0700, H.J. Lu wrote:
> > > >> On Thu, Jul 9, 2015 at 6:37 AM, Zamyatin, Igor <igor.zamyatin@intel.com> wrote:
> > > >> >> On Wed, Jul 8, 2015 at 8:56 AM, Zamyatin, Igor <igor.zamyatin@intel.com>
> > > >> >> wrote:
> > > >> >> > Fixed in the attached patch
> > > >> >> >
> > > >> >>
> > > >> >> I fixed some typos and updated sysdeps/i386/configure for
> > > >> >> HAVE_MPX_SUPPORT.  Please verify both with HAVE_MPX_SUPPORT and
> > > >> >> without on i386 and x86-64.
> > > >> >
> > > >> > Done, all works fine
> > > >> >
> > > >>
> > > >> I checked it in for you.
> > > >>
> > > > These are nice but you could have same problem with lazy tls allocation.
> > > > I wrote patch to merge trampolines, which now conflicts. Could you write
> > > > similar patch to solve that? Original purpose was to always save xmm
> > > > registers so we could use sse2 routines which speeds up lookup time.
> > > 
> > > So we will preserve only xmm0 to xmm7 in _dl_runtime_resolve? How
> > > much gain it will give us?
> > >
> > I couldn't measure that without patch. Gain now would be big as we now
> > use byte-by-byte loop to check symbol name which is slow, especially
> > with c++ name mangling. Would be following benchmark good to measure
> > speedup or do I need to measure startup time which is bit harder?
> > 
> 
> Please try this.
> 

We have to use movups instead of movaps due to

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
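
The patch below also adds a compile-time check that
REGISTER_SAVE_AREA is an odd multiple of 8.  The arithmetic behind
it, as a short derivation (assuming the psABI's 16-byte stack
alignment at call sites; this note is not in the original mail):

  In the caller, just before its call instruction, %rsp ≡ 0 (mod 16)
  per the psABI.  The call pushes 8 bytes and the PLT does two more
  pushes (16 bytes), so on entry to _dl_runtime_resolve we have
  %rsp ≡ 8 (mod 16).  After subq $REGISTER_SAVE_AREA, %rsp the save
  area base is 16-byte aligned iff REGISTER_SAVE_AREA ≡ 8 (mod 16),
  i.e. iff it is an odd multiple of 8.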


H.J.
---
 sysdeps/x86_64/dl-trampoline.S | 51 ++++++++++++++++++++++++++++++++++++------
 1 file changed, 44 insertions(+), 7 deletions(-)

diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
index 678c57f..4a1928d 100644
--- a/sysdeps/x86_64/dl-trampoline.S
+++ b/sysdeps/x86_64/dl-trampoline.S
@@ -27,26 +27,36 @@
 /* Area on stack to save and restore registers used for parameter
    passing when calling _dl_fixup.  */
 #ifdef __ILP32__
-/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX.  */
-# define REGISTER_SAVE_AREA	(8 * 7)
-# define REGISTER_SAVE_RAX	0
+/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as XMM0 to
+   XMM7.  */
+# define REGISTER_SAVE_AREA	(8 * 7 + 16 * 8)
+/* Align XMM register save area to 16 bytes.  */
+# define REGISTER_SAVE_XMM0	0
 # define PRESERVE_BND_REGS_PREFIX
 #else
-/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as BND0,
-   BND1, BND2, BND3.  */
-# define REGISTER_SAVE_AREA	(8 * 7 + 16 * 4)
+/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as
+   BND0, BND1, BND2, BND3 and XMM0 to XMM7. */
+# define REGISTER_SAVE_AREA	(8 * 7 + 16 * 4 + 16 * 8)
 /* Align bound register save area to 16 bytes.  */
 # define REGISTER_SAVE_BND0	0
 # define REGISTER_SAVE_BND1	(REGISTER_SAVE_BND0 + 16)
 # define REGISTER_SAVE_BND2	(REGISTER_SAVE_BND1 + 16)
 # define REGISTER_SAVE_BND3	(REGISTER_SAVE_BND2 + 16)
-# define REGISTER_SAVE_RAX	(REGISTER_SAVE_BND3 + 16)
+# define REGISTER_SAVE_XMM0	(REGISTER_SAVE_BND3 + 16)
 # ifdef HAVE_MPX_SUPPORT
 #  define PRESERVE_BND_REGS_PREFIX bnd
 # else
 #  define PRESERVE_BND_REGS_PREFIX .byte 0xf2
 # endif
 #endif
+#define REGISTER_SAVE_XMM1	(REGISTER_SAVE_XMM0 + 16)
+#define REGISTER_SAVE_XMM2	(REGISTER_SAVE_XMM1 + 16)
+#define REGISTER_SAVE_XMM3	(REGISTER_SAVE_XMM2 + 16)
+#define REGISTER_SAVE_XMM4	(REGISTER_SAVE_XMM3 + 16)
+#define REGISTER_SAVE_XMM5	(REGISTER_SAVE_XMM4 + 16)
+#define REGISTER_SAVE_XMM6	(REGISTER_SAVE_XMM5 + 16)
+#define REGISTER_SAVE_XMM7	(REGISTER_SAVE_XMM6 + 16)
+#define REGISTER_SAVE_RAX	(REGISTER_SAVE_XMM7 + 16)
 #define REGISTER_SAVE_RCX	(REGISTER_SAVE_RAX + 8)
 #define REGISTER_SAVE_RDX	(REGISTER_SAVE_RCX + 8)
 #define REGISTER_SAVE_RSI	(REGISTER_SAVE_RDX + 8)
@@ -54,6 +64,10 @@
 #define REGISTER_SAVE_R8	(REGISTER_SAVE_RDI + 8)
 #define REGISTER_SAVE_R9	(REGISTER_SAVE_R8 + 8)
 
+#if (REGISTER_SAVE_AREA % 16) != 8
+# error REGISTER_SAVE_AREA must be an odd multiple of 8
+#endif
+
 	.text
 	.globl _dl_runtime_resolve
 	.type _dl_runtime_resolve, @function
@@ -71,6 +85,20 @@ _dl_runtime_resolve:
 	movq %rdi, REGISTER_SAVE_RDI(%rsp)
 	movq %r8, REGISTER_SAVE_R8(%rsp)
 	movq %r9, REGISTER_SAVE_R9(%rsp)
+	# movaps would be 1 byte shorter, but we have to use movups
+	# instead, since __tls_get_addr may be called with a
+	# misaligned stack:
+	# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
+	# On Intel processors since 2011, movups is as fast as
+	# movaps on aligned addresses.
+	movups %xmm0, REGISTER_SAVE_XMM0(%rsp)
+	movups %xmm1, REGISTER_SAVE_XMM1(%rsp)
+	movups %xmm2, REGISTER_SAVE_XMM2(%rsp)
+	movups %xmm3, REGISTER_SAVE_XMM3(%rsp)
+	movups %xmm4, REGISTER_SAVE_XMM4(%rsp)
+	movups %xmm5, REGISTER_SAVE_XMM5(%rsp)
+	movups %xmm6, REGISTER_SAVE_XMM6(%rsp)
+	movups %xmm7, REGISTER_SAVE_XMM7(%rsp)
 #ifndef __ILP32__
 	# We also have to preserve bound registers.  These are nops if
 	# Intel MPX isn't available or disabled.
@@ -123,6 +151,15 @@ _dl_runtime_resolve:
 	movq REGISTER_SAVE_RDX(%rsp), %rdx
 	movq REGISTER_SAVE_RCX(%rsp), %rcx
 	movq REGISTER_SAVE_RAX(%rsp), %rax
+	# movups again: the stack may be misaligned (see above).
+	movups REGISTER_SAVE_XMM0(%rsp), %xmm0
+	movups REGISTER_SAVE_XMM1(%rsp), %xmm1
+	movups REGISTER_SAVE_XMM2(%rsp), %xmm2
+	movups REGISTER_SAVE_XMM3(%rsp), %xmm3
+	movups REGISTER_SAVE_XMM4(%rsp), %xmm4
+	movups REGISTER_SAVE_XMM5(%rsp), %xmm5
+	movups REGISTER_SAVE_XMM6(%rsp), %xmm6
+	movups REGISTER_SAVE_XMM7(%rsp), %xmm7
 	# Adjust stack(PLT did 2 pushes)
 	addq $(REGISTER_SAVE_AREA + 16), %rsp
 	cfi_adjust_cfa_offset(-(REGISTER_SAVE_AREA + 16))
-- 
2.4.3

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Save and restore xmm0-xmm7 in _dl_runtime_resolve
  2015-07-11 23:50                         ` [PATCH] Save and restore xmm0-xmm7 in _dl_runtime_resolve H.J. Lu
@ 2015-07-12 21:50                           ` H.J. Lu
  2015-07-26 13:16                           ` Ondřej Bílka
  1 sibling, 0 replies; 27+ messages in thread
From: H.J. Lu @ 2015-07-12 21:50 UTC (permalink / raw)
  Cc: Ondřej Bílka, libc-alpha

On Sat, Jul 11, 2015 at 4:50 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Sat, Jul 11, 2015 at 01:27:42PM -0700, H.J. Lu wrote:
>> On Sat, Jul 11, 2015 at 12:46:54PM +0200, Ondřej Bílka wrote:
>> > On Thu, Jul 09, 2015 at 09:07:24AM -0700, H.J. Lu wrote:
>> > > On Thu, Jul 9, 2015 at 7:28 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
>> > > > On Thu, Jul 09, 2015 at 07:12:24AM -0700, H.J. Lu wrote:
>> > > >> On Thu, Jul 9, 2015 at 6:37 AM, Zamyatin, Igor <igor.zamyatin@intel.com> wrote:
>> > > >> >> On Wed, Jul 8, 2015 at 8:56 AM, Zamyatin, Igor <igor.zamyatin@intel.com>
>> > > >> >> wrote:
>> > > >> >> > Fixed in the attached patch
>> > > >> >> >
>> > > >> >>
>> > > >> >> I fixed some typos and updated sysdeps/i386/configure for
>> > > >> >> HAVE_MPX_SUPPORT.  Please verify both with HAVE_MPX_SUPPORT and
>> > > >> >> without on i386 and x86-64.
>> > > >> >
>> > > >> > Done, all works fine
>> > > >> >
>> > > >>
>> > > >> I checked it in for you.
>> > > >>
>> > > > These are nice but you could have same problem with lazy tls allocation.
>> > > > I wrote patch to merge trampolines, which now conflicts. Could you write
>> > > > similar patch to solve that? Original purpose was to always save xmm
>> > > > registers so we could use sse2 routines which speeds up lookup time.
>> > >
>> > > So we will preserve only xmm0 to xmm7 in _dl_runtime_resolve? How
>> > > much gain it will give us?
>> > >
>> > I couldn't measure that without patch. Gain now would be big as we now
>> > use byte-by-byte loop to check symbol name which is slow, especially
>> > with c++ name mangling. Would be following benchmark good to measure
>> > speedup or do I need to measure startup time which is bit harder?
>> >
>>
>> Please try this.
>>
>
> We have to use movups instead of movaps due to
>
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
>

It won't work, since the stack may be misaligned when SSE2 functions
are called.  Please try the glibc hjl/pr18661 branch, where I fixed a
few stack-misalignment bugs in x86-64 assembly code and save/restore
xmm0-xmm7, together with the GCC hjl/pr58066/gcc-5-branch.


-- 
H.J.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Save and restore xmm0-xmm7 in _dl_runtime_resolve
  2015-07-11 23:50                         ` [PATCH] Save and restore xmm0-xmm7 in _dl_runtime_resolve H.J. Lu
  2015-07-12 21:50                           ` H.J. Lu
@ 2015-07-26 13:16                           ` Ondřej Bílka
  2015-07-26 19:38                             ` H.J. Lu
  1 sibling, 1 reply; 27+ messages in thread
From: Ondřej Bílka @ 2015-07-26 13:16 UTC (permalink / raw)
  To: H.J. Lu; +Cc: libc-alpha

[-- Attachment #1: Type: text/plain, Size: 3110 bytes --]

On Sat, Jul 11, 2015 at 04:50:02PM -0700, H.J. Lu wrote:
> On Sat, Jul 11, 2015 at 01:27:42PM -0700, H.J. Lu wrote:
> > On Sat, Jul 11, 2015 at 12:46:54PM +0200, Ondřej Bílka wrote:
> > > On Thu, Jul 09, 2015 at 09:07:24AM -0700, H.J. Lu wrote:
> > > > On Thu, Jul 9, 2015 at 7:28 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
> > > > > On Thu, Jul 09, 2015 at 07:12:24AM -0700, H.J. Lu wrote:
> > > > >> On Thu, Jul 9, 2015 at 6:37 AM, Zamyatin, Igor <igor.zamyatin@intel.com> wrote:
> > > > >> >> On Wed, Jul 8, 2015 at 8:56 AM, Zamyatin, Igor <igor.zamyatin@intel.com>
> > > > >> >> wrote:
> > > > >> >> > Fixed in the attached patch
> > > > >> >> >
> > > > >> >>
> > > > >> >> I fixed some typos and updated sysdeps/i386/configure for
> > > > >> >> HAVE_MPX_SUPPORT.  Please verify both with HAVE_MPX_SUPPORT and
> > > > >> >> without on i386 and x86-64.
> > > > >> >
> > > > >> > Done, all works fine
> > > > >> >
> > > > >>
> > > > >> I checked it in for you.
> > > > >>
> > > > > These are nice but you could have same problem with lazy tls allocation.
> > > > > I wrote patch to merge trampolines, which now conflicts. Could you write
> > > > > similar patch to solve that? Original purpose was to always save xmm
> > > > > registers so we could use sse2 routines which speeds up lookup time.
> > > > 
> > > > So we will preserve only xmm0 to xmm7 in _dl_runtime_resolve? How
> > > > much gain it will give us?
> > > >
> > > I couldn't measure that without patch. Gain now would be big as we now
> > > use byte-by-byte loop to check symbol name which is slow, especially
> > > with c++ name mangling. Would be following benchmark good to measure
> > > speedup or do I need to measure startup time which is bit harder?
> > > 
> > 
> > Please try this.
> > 
> 
> We have to use movups instead of movaps due to
> 
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
> 
>
Thanks, this looks promising.

I'm still thinking about how to do a definitive benchmark.  For now I
have evidence that this is likely an improvement, but nothing
conclusive.

I found that the benchmark I originally intended produces too much
noise, and I haven't gotten anything useful from it yet.  It created
1000 functions in a library and called them from main; performance
between runs varied by a factor of 3 for the same implementation.

I do have indirect evidence.  With the attached patch to use SSE2
routines, I decreased the startup time of the binaries run by
"make bench" by ~6000 cycles, and dlopen time by 4%, on Haswell and
Ivy Bridge.

See the Haswell results of

LD_DEBUG=statistics make bench &> old_rtld

which are large, so you can browse them here:

http://kam.mff.cuni.cz/~ondra/old_rtld
http://kam.mff.cuni.cz/~ondra/new_rtld

For the dlopen benchmark I measure, ten times over, the mean time of

dlsym (RTLD_DEFAULT, "memcpy");
dlsym (RTLD_DEFAULT, "strlen");

Without the patch I get
 624.49  559.58  556.6 556.04  558.42  557.86  559.46  555.17  556.93  555.32
and with the patch
  604.71  536.74  536.08  535.78  534.11  533.67  534.8 534.8 533.46 536.08
(means of roughly 564 vs. 542 cycles, about 4% faster).

I attached the WIP patches; I didn't change memcpy yet.

So if you have an idea how to measure the fixup change directly, it
would be welcome.
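
One way to hit the fixup path directly, though only as a sketch:
assume a hypothetical libfixup.so whose exported run () calls many
distinct libc functions exactly once each through its own PLT, and
time run () right after a fresh RTLD_LAZY dlopen.  With lazy binding
each of those first calls goes through _dl_runtime_resolve and
_dl_fixup, and dlclose drops the resolved GOT entries before the next
round:

#include <dlfcn.h>
#include <stdint.h>
#include <stdio.h>
#include <x86intrin.h>	/* __rdtsc; link with -ldl */

int
main (void)
{
  int round;

  for (round = 0; round < 10; round++)
    {
      /* RTLD_LAZY leaves the DSO's own PLT entries unresolved (this
	 assumes no LD_BIND_NOW in the environment and that the DSO
	 was not linked with -z now or marked NODELETE).  */
      void *h = dlopen ("libfixup.so", RTLD_LAZY);
      if (h == NULL)
	return 1;
      void (*run) (void) = (void (*) (void)) dlsym (h, "run");
      uint64_t start = __rdtsc ();
      /* Every first call inside run () resolves via
	 _dl_runtime_resolve -> _dl_fixup.  */
      run ();
      printf ("%llu cycles\n", (unsigned long long) (__rdtsc () - start));
      /* Unmap the DSO so the next round starts with a cold GOT.  */
      dlclose (h);
    }
  return 0;
}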



[-- Attachment #2: 0002-dlopen-benchmark.patch --]
[-- Type: text/plain, Size: 2905 bytes --]

From 8bad3c9751cba151b5f4ad2108bf2b860705c6eb Mon Sep 17 00:00:00 2001
From: Ondřej Bílka <neleai@seznam.cz>
Date: Sun, 26 Jul 2015 10:15:56 +0200
Subject: [PATCH 2/4] dlopen benchmark

---
 benchtests/Makefile       |  3 ++-
 benchtests/bench-dlopen.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 49 insertions(+), 1 deletion(-)
 create mode 100644 benchtests/bench-dlopen.c

diff --git a/benchtests/Makefile b/benchtests/Makefile
index 8e615e5..9e82e43 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -30,7 +30,7 @@ bench-pthread := pthread_once
 bench := $(bench-math) $(bench-pthread)
 
 # String function benchmarks.
-string-bench := bcopy bzero memccpy memchr memcmp memcpy memmem memmove \
+string-bench := dlopen bcopy bzero memccpy memchr memcmp memcpy memmem memmove \
 		mempcpy memset rawmemchr stpcpy stpncpy strcasecmp strcasestr \
 		strcat strchr strchrnul strcmp strcpy strcspn strlen \
 		strncasecmp strncat strncmp strncpy strnlen strpbrk strrchr \
@@ -57,6 +57,7 @@ CFLAGS-bench-ffsll.c += -fno-builtin
 
 bench-malloc := malloc-thread
 
+$(objpfx)bench-dlopen: -ldl
 $(addprefix $(objpfx)bench-,$(bench-math)): $(libm)
 $(addprefix $(objpfx)bench-,$(bench-pthread)): $(shared-thread-library)
 $(objpfx)bench-malloc-thread: $(shared-thread-library)
diff --git a/benchtests/bench-dlopen.c b/benchtests/bench-dlopen.c
new file mode 100644
index 0000000..b47d18b
--- /dev/null
+++ b/benchtests/bench-dlopen.c
@@ -0,0 +1,47 @@
+/* Measure dlsym lookup time.
+   Copyright (C) 2013-2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+#include <dlfcn.h>
+#include <string.h>
+#include "bench-timing.h"
+
+int
+main (void)
+{
+  long ret = 0;
+
+  size_t i, j, iters = 100;
+  timing_t start, stop, cur;
+  for (j = 0; j < 10; j++)
+    {
+      TIMING_NOW (start);
+      for (i = 0; i < iters; ++i)
+        {
+          ret += (long) dlsym (RTLD_DEFAULT, "strlen");
+          ret += (long) dlsym (RTLD_DEFAULT, "memcpy");
+        }
+      TIMING_NOW (stop);
+
+      TIMING_DIFF (cur, start, stop);
+
+      TIMING_PRINT_MEAN ((double) cur, (double) iters);
+    }
+
+  return ret;
+}
+
+
-- 
2.1.4


[-- Attachment #3: 0004-rtld.patch --]
[-- Type: text/plain, Size: 20179 bytes --]

From e4ca50b9d4ba9cd378a740623f2da889afbe6309 Mon Sep 17 00:00:00 2001
From: Ondřej Bílka <neleai@seznam.cz>
Date: Sun, 26 Jul 2015 10:39:15 +0200
Subject: [PATCH 4/4] rtld

---
 sysdeps/x86_64/multiarch/rtld-memcmp.c |   1 -
 sysdeps/x86_64/multiarch/rtld-memset.S |   1 -
 sysdeps/x86_64/rtld-memcmp.c           |   1 -
 sysdeps/x86_64/rtld-memset.S           |  37 -----
 sysdeps/x86_64/rtld-strchr.S           | 288 ---------------------------------
 sysdeps/x86_64/rtld-strlen.S           | 136 ----------------
 sysdeps/x86_64/strcmp.S                |   1 +
 7 files changed, 1 insertion(+), 464 deletions(-)
 delete mode 100644 sysdeps/x86_64/multiarch/rtld-memcmp.c
 delete mode 100644 sysdeps/x86_64/multiarch/rtld-memset.S
 delete mode 100644 sysdeps/x86_64/rtld-memcmp.c
 delete mode 100644 sysdeps/x86_64/rtld-memset.S
 delete mode 100644 sysdeps/x86_64/rtld-strchr.S
 delete mode 100644 sysdeps/x86_64/rtld-strlen.S

diff --git a/sysdeps/x86_64/multiarch/rtld-memcmp.c b/sysdeps/x86_64/multiarch/rtld-memcmp.c
deleted file mode 100644
index 0f27135..0000000
--- a/sysdeps/x86_64/multiarch/rtld-memcmp.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../rtld-memcmp.c"
diff --git a/sysdeps/x86_64/multiarch/rtld-memset.S b/sysdeps/x86_64/multiarch/rtld-memset.S
deleted file mode 100644
index 8092aa0..0000000
--- a/sysdeps/x86_64/multiarch/rtld-memset.S
+++ /dev/null
@@ -1 +0,0 @@
-#include "../rtld-memset.S"
diff --git a/sysdeps/x86_64/rtld-memcmp.c b/sysdeps/x86_64/rtld-memcmp.c
deleted file mode 100644
index 2ee4032..0000000
--- a/sysdeps/x86_64/rtld-memcmp.c
+++ /dev/null
@@ -1 +0,0 @@
-#include <string/memcmp.c>
diff --git a/sysdeps/x86_64/rtld-memset.S b/sysdeps/x86_64/rtld-memset.S
deleted file mode 100644
index f8df333..0000000
--- a/sysdeps/x86_64/rtld-memset.S
+++ /dev/null
@@ -1,37 +0,0 @@
-/* memset implementation for the dynamic linker.  This is separate from the
-   libc implementation to avoid writing to SSE registers.
-   Copyright (C) 2013-2015 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-#include "asm-syntax.h"
-
-
-	.text
-/* void *memset (void *dest, char c, size_t count)
-   dest	 => %rdi
-   c	 => %rsi
-   count => %rdx  */
-ENTRY (memset)
-	mov	%rdx, %rcx
-	movzbl	%sil, %eax
-	mov	%rdi, %rdx
-	rep	stosb
-	mov	%rdx, %rax
-	ret
-END (memset)
-libc_hidden_builtin_def (memset)
diff --git a/sysdeps/x86_64/rtld-strchr.S b/sysdeps/x86_64/rtld-strchr.S
deleted file mode 100644
index cc694d7..0000000
--- a/sysdeps/x86_64/rtld-strchr.S
+++ /dev/null
@@ -1,288 +0,0 @@
-/* strchr (str, ch) -- Return pointer to first occurrence of CH in STR.
-   For AMD x86-64.
-   Copyright (C) 2002-2015 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-#include "asm-syntax.h"
-
-
-	.text
-ENTRY (strchr)
-
-	/* Before we start with the main loop we process single bytes
-	   until the source pointer is aligned.  This has two reasons:
-	   1. aligned 64-bit memory access is faster
-	   and (more important)
-	   2. we process in the main loop 64 bit in one step although
-	      we don't know the end of the string.  But accessing at
-	      8-byte alignment guarantees that we never access illegal
-	      memory if this would not also be done by the trivial
-	      implementation (this is because all processor inherent
-	      boundaries are multiples of 8).  */
-
-	movq	%rdi, %rdx
-	andl	$7, %edx	/* Mask alignment bits  */
-	movq	%rdi, %rax	/* duplicate destination.  */
-	jz	1f		/* aligned => start loop */
-	neg	%edx
-	addl	$8, %edx	/* Align to 8 bytes.  */
-
-	/* Search the first bytes directly.  */
-0:	movb	(%rax), %cl	/* load byte  */
-	cmpb	%cl,%sil	/* compare byte.  */
-	je	6f		/* target found */
-	testb	%cl,%cl		/* is byte NUL? */
-	je	7f		/* yes => return NULL */
-	incq	%rax		/* increment pointer */
-	decl	%edx
-	jnz	0b
-
-
-1:
-	/* At the moment %rsi contains C.  What we need for the
-	   algorithm is C in all bytes of the register.  Avoid
-	   operations on 16 bit words because these require an
-	   prefix byte (and one more cycle).  */
-	/* Populate 8 bit data to full 64-bit.  */
-	movabs	$0x0101010101010101,%r9
-	movzbl	%sil,%edx
-	imul	%rdx,%r9
-
-	movq $0xfefefefefefefeff, %r8 /* Save magic.  */
-
-      /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
-	 change any of the hole bits of LONGWORD.
-
-	 1) Is this safe?  Will it catch all the zero bytes?
-	 Suppose there is a byte with all zeros.  Any carry bits
-	 propagating from its left will fall into the hole at its
-	 least significant bit and stop.  Since there will be no
-	 carry from its most significant bit, the LSB of the
-	 byte to the left will be unchanged, and the zero will be
-	 detected.
-
-	 2) Is this worthwhile?  Will it ignore everything except
-	 zero bytes?  Suppose every byte of QUARDWORD has a bit set
-	 somewhere.  There will be a carry into bit 8.	If bit 8
-	 is set, this will carry into bit 16.  If bit 8 is clear,
-	 one of bits 9-15 must be set, so there will be a carry
-	 into bit 16.  Similarly, there will be a carry into bit
-	 24 tec..  If one of bits 54-63 is set, there will be a carry
-	 into bit 64 (=carry flag), so all of the hole bits will
-	 be changed.
-
-	 3) But wait!  Aren't we looking for C, not zero?
-	 Good point.  So what we do is XOR LONGWORD with a longword,
-	 each of whose bytes is C.  This turns each byte that is C
-	 into a zero.  */
-
-	.p2align 4
-4:
-	/* Main Loop is unrolled 4 times.  */
-	/* First unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	xorq %r9, %rcx		/* XOR with qword c|...|c => bytes of str == c
-				   are now 0 */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 3f			/* found c => return pointer */
-
-	/* The quadword we looked at does not contain the value we're looking
-	   for.  Let's search now whether we have reached the end of the
-	   string.  */
-	xorq %r9, %rcx		/* restore original dword without reload */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 7f			/* highest byte is NUL => return NULL */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 7f			/* found NUL => return NULL */
-
-	/* Second unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	xorq %r9, %rcx		/* XOR with qword c|...|c => bytes of str == c
-				   are now 0 */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 3f			/* found c => return pointer */
-
-	/* The quadword we looked at does not contain the value we're looking
-	   for.  Let's search now whether we have reached the end of the
-	   string.  */
-	xorq %r9, %rcx		/* restore original dword without reload */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 7f			/* highest byte is NUL => return NULL */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 7f			/* found NUL => return NULL */
-	/* Third unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	xorq %r9, %rcx		/* XOR with qword c|...|c => bytes of str == c
-				   are now 0 */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 3f			/* found c => return pointer */
-
-	/* The quadword we looked at does not contain the value we're looking
-	   for.  Let's search now whether we have reached the end of the
-	   string.  */
-	xorq %r9, %rcx		/* restore original dword without reload */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 7f			/* highest byte is NUL => return NULL */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 7f			/* found NUL => return NULL */
-	/* Fourth unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	xorq %r9, %rcx		/* XOR with qword c|...|c => bytes of str == c
-				   are now 0 */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 3f			/* found c => return pointer */
-
-	/* The quadword we looked at does not contain the value we're looking
-	   for.  Let's search now whether we have reached the end of the
-	   string.  */
-	xorq %r9, %rcx		/* restore original dword without reload */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 7f			/* highest byte is NUL => return NULL */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jz 4b			/* no NUL found => restart loop */
-
-
-7:	/* Return NULL.  */
-	xorl %eax, %eax
-	retq
-
-
-	/* We now scan for the byte in which the character was matched.
-	   But we have to take care of the case that a NUL char is
-	   found before this in the dword.  Note that we XORed %rcx
-	   with the byte we're looking for, therefore the tests below look
-	   reversed.  */
-
-
-	.p2align 4		/* Align, it's a jump target.  */
-3:	movq	%r9,%rdx	/* move to %rdx so that we can access bytes */
-	subq	$8,%rax		/* correct pointer increment.  */
-	testb %cl, %cl		/* is first byte C? */
-	jz 6f			/* yes => return pointer */
-	cmpb %dl, %cl		/* is first byte NUL? */
-	je 7b			/* yes => return NULL */
-	incq %rax		/* increment pointer */
-
-	testb %ch, %ch		/* is second byte C? */
-	jz 6f			/* yes => return pointer */
-	cmpb %dl, %ch		/* is second byte NUL? */
-	je 7b			/* yes => return NULL? */
-	incq %rax		/* increment pointer */
-
-	shrq $16, %rcx		/* make upper bytes accessible */
-	testb %cl, %cl		/* is third byte C? */
-	jz 6f			/* yes => return pointer */
-	cmpb %dl, %cl		/* is third byte NUL? */
-	je 7b			/* yes => return NULL */
-	incq %rax		/* increment pointer */
-
-	testb %ch, %ch		/* is fourth byte C? */
-	jz 6f			/* yes => return pointer */
-	cmpb %dl, %ch		/* is fourth byte NUL? */
-	je 7b			/* yes => return NULL? */
-	incq %rax		/* increment pointer */
-
-	shrq $16, %rcx		/* make upper bytes accessible */
-	testb %cl, %cl		/* is fifth byte C? */
-	jz 6f			/* yes => return pointer */
-	cmpb %dl, %cl		/* is fifth byte NUL? */
-	je 7b			/* yes => return NULL */
-	incq %rax		/* increment pointer */
-
-	testb %ch, %ch		/* is sixth byte C? */
-	jz 6f			/* yes => return pointer */
-	cmpb %dl, %ch		/* is sixth byte NUL? */
-	je 7b			/* yes => return NULL? */
-	incq %rax		/* increment pointer */
-
-	shrq $16, %rcx		/* make upper bytes accessible */
-	testb %cl, %cl		/* is seventh byte C? */
-	jz 6f			/* yes => return pointer */
-	cmpb %dl, %cl		/* is seventh byte NUL? */
-	je 7b			/* yes => return NULL */
-
-	/* It must be in the eigth byte and it cannot be NUL.  */
-	incq %rax
-
-6:
-	nop
-	retq
-END (strchr)
-
-weak_alias (strchr, index)
-libc_hidden_builtin_def (strchr)
diff --git a/sysdeps/x86_64/rtld-strlen.S b/sysdeps/x86_64/rtld-strlen.S
deleted file mode 100644
index 1328652..0000000
--- a/sysdeps/x86_64/rtld-strlen.S
+++ /dev/null
@@ -1,136 +0,0 @@
-/* strlen(str) -- determine the length of the string STR.
-   Copyright (C) 2002-2015 Free Software Foundation, Inc.
-   Based on i486 version contributed by Ulrich Drepper <drepper@redhat.com>.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-#include "asm-syntax.h"
-
-
-	.text
-ENTRY (strlen)
-	movq %rdi, %rcx		/* Duplicate source pointer. */
-	andl $7, %ecx		/* mask alignment bits */
-	movq %rdi, %rax		/* duplicate destination.  */
-	jz 1f			/* aligned => start loop */
-
-	neg %ecx		/* We need to align to 8 bytes.  */
-	addl $8,%ecx
-	/* Search the first bytes directly.  */
-0:	cmpb $0x0,(%rax)	/* is byte NUL? */
-	je 2f			/* yes => return */
-	incq %rax		/* increment pointer */
-	decl %ecx
-	jnz 0b
-
-1:	movq $0xfefefefefefefeff,%r8 /* Save magic.  */
-
-	.p2align 4		/* Align loop.  */
-4:	/* Main Loop is unrolled 4 times.  */
-	/* First unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 3f			/* found NUL => return pointer */
-
-	/* Second unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 3f			/* found NUL => return pointer */
-
-	/* Third unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 3f			/* found NUL => return pointer */
-
-	/* Fourth unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jz 4b			/* no NUL found => continue loop */
-
-	.p2align 4		/* Align, it's a jump target.  */
-3:	subq $8,%rax		/* correct pointer increment.  */
-
-	testb %cl, %cl		/* is first byte NUL? */
-	jz 2f			/* yes => return */
-	incq %rax		/* increment pointer */
-
-	testb %ch, %ch		/* is second byte NUL? */
-	jz 2f			/* yes => return */
-	incq %rax		/* increment pointer */
-
-	testl $0x00ff0000, %ecx /* is third byte NUL? */
-	jz 2f			/* yes => return pointer */
-	incq %rax		/* increment pointer */
-
-	testl $0xff000000, %ecx /* is fourth byte NUL? */
-	jz 2f			/* yes => return pointer */
-	incq %rax		/* increment pointer */
-
-	shrq $32, %rcx		/* look at other half.  */
-
-	testb %cl, %cl		/* is first byte NUL? */
-	jz 2f			/* yes => return */
-	incq %rax		/* increment pointer */
-
-	testb %ch, %ch		/* is second byte NUL? */
-	jz 2f			/* yes => return */
-	incq %rax		/* increment pointer */
-
-	testl $0xff0000, %ecx	/* is third byte NUL? */
-	jz 2f			/* yes => return pointer */
-	incq %rax		/* increment pointer */
-2:
-	subq %rdi, %rax		/* compute difference to string start */
-	ret
-END (strlen)
-libc_hidden_builtin_def (strlen)
diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
index 1329649..02bb9a6 100644
--- a/sysdeps/x86_64/strcmp.S
+++ b/sysdeps/x86_64/strcmp.S
@@ -128,6 +128,7 @@ libc_hidden_def (__strncasecmp)
 ENTRY (STRCMP)
 #if !IS_IN (libc)
 /* Simple version since we can't use SSE registers in ld.so.  */
+	jmp __strcmp_sse2_unaligned
 L(oop):	movb	(%rdi), %al
 	cmpb	(%rsi), %al
 	jne	L(neq)
-- 
2.1.4


^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Save and restore xmm0-xmm7 in _dl_runtime_resolve
  2015-07-26 13:16                           ` Ondřej Bílka
@ 2015-07-26 19:38                             ` H.J. Lu
  2015-07-27 10:10                               ` Ondřej Bílka
  0 siblings, 1 reply; 27+ messages in thread
From: H.J. Lu @ 2015-07-26 19:38 UTC (permalink / raw)
  To: Ondřej Bílka; +Cc: libc-alpha

On Sun, Jul 26, 2015 at 6:16 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
> On Sat, Jul 11, 2015 at 04:50:02PM -0700, H.J. Lu wrote:
>> On Sat, Jul 11, 2015 at 01:27:42PM -0700, H.J. Lu wrote:
>> > On Sat, Jul 11, 2015 at 12:46:54PM +0200, Ondřej Bílka wrote:
>> > > On Thu, Jul 09, 2015 at 09:07:24AM -0700, H.J. Lu wrote:
>> > > > On Thu, Jul 9, 2015 at 7:28 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
>> > > > > On Thu, Jul 09, 2015 at 07:12:24AM -0700, H.J. Lu wrote:
>> > > > >> On Thu, Jul 9, 2015 at 6:37 AM, Zamyatin, Igor <igor.zamyatin@intel.com> wrote:
>> > > > >> >> On Wed, Jul 8, 2015 at 8:56 AM, Zamyatin, Igor <igor.zamyatin@intel.com>
>> > > > >> >> wrote:
>> > > > >> >> > Fixed in the attached patch
>> > > > >> >> >
>> > > > >> >>
>> > > > >> >> I fixed some typos and updated sysdeps/i386/configure for
>> > > > >> >> HAVE_MPX_SUPPORT.  Please verify both with HAVE_MPX_SUPPORT and
>> > > > >> >> without on i386 and x86-64.
>> > > > >> >
>> > > > >> > Done, all works fine
>> > > > >> >
>> > > > >>
>> > > > >> I checked it in for you.
>> > > > >>
>> > > > > These are nice but you could have same problem with lazy tls allocation.
>> > > > > I wrote patch to merge trampolines, which now conflicts. Could you write
>> > > > > similar patch to solve that? Original purpose was to always save xmm
>> > > > > registers so we could use sse2 routines which speeds up lookup time.
>> > > >
>> > > > So we will preserve only xmm0 to xmm7 in _dl_runtime_resolve? How
>> > > > much gain it will give us?
>> > > >
>> > > I couldn't measure that without patch. Gain now would be big as we now
>> > > use byte-by-byte loop to check symbol name which is slow, especially
>> > > with c++ name mangling. Would be following benchmark good to measure
>> > > speedup or do I need to measure startup time which is bit harder?
>> > >
>> >
>> > Please try this.
>> >
>>
>> We have to use movups instead of movaps due to
>>
>> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
>>
>>
> Thanks, this looks promising.
>
> I think how to do definite benchmark, Now I have evidence that its
> likely improvement but not definite.
>
> I found that benchmark that i intended causes too much noise and I
> didn't get useful from that yet. It was creating 1000 functions in
> library and calling them from main where performance between runs vary
> by factor of 3 for same implementation.
>
> I have indirect evidence. With attached patch to use sse2 routines I
> decreased startup time of running binaries when you run "make bench"
> by ~6000 cycles and dlopen time by 4% on haswell and ivy bridge.
>
> See results on haswell of
>
> LD_DEBUG=statistics make bench &> old_rtld
>
> that are large so you could browse these here
>
> http://kam.mff.cuni.cz/~ondra/old_rtld
> http://kam.mff.cuni.cz/~ondra/new_rtld
>
> For dlopen benchmark I measure ten times performance of
> dlopen(RTLD_DEFAULT,"memcpy");
> dlopen(RTLD_DEFAULT,"strlen");
>
> Without patch I get
>  624.49  559.58  556.6 556.04  558.42  557.86  559.46  555.17  556.93  555.32
> and with patch
>   604.71  536.74  536.08  535.78  534.11  533.67  534.8 534.8 533.46 536.08
>
> I attached vip patches, I didn't change memcpy yet.
>
> So if you have idea how directly measure fixup change it would be
> welcome.
>

There is a potential performance issue.  This won't change parameters
passed in 256-bit/512-bit vector registers, because an SSE load only
updates the lower 128 bits of a 256-bit/512-bit vector register while
preserving the upper bits.  But these SSE load operations may not be
fast on all current and future processors.  To load the entire
256-bit/512-bit vector registers, we would need to check CPU features
in each symbol lookup.  On the other hand, we can compile x86-64
ld.so with -msse2.  I don't know what the final performance impact is.


-- 
H.J.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Save and restore xmm0-xmm7 in _dl_runtime_resolve
  2015-07-26 19:38                             ` H.J. Lu
@ 2015-07-27 10:10                               ` Ondřej Bílka
  2015-07-27 13:14                                 ` H.J. Lu
  0 siblings, 1 reply; 27+ messages in thread
From: Ondřej Bílka @ 2015-07-27 10:10 UTC (permalink / raw)
  To: H.J. Lu; +Cc: libc-alpha

On Sun, Jul 26, 2015 at 12:38:20PM -0700, H.J. Lu wrote:
> On Sun, Jul 26, 2015 at 6:16 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
> > On Sat, Jul 11, 2015 at 04:50:02PM -0700, H.J. Lu wrote:
> >> On Sat, Jul 11, 2015 at 01:27:42PM -0700, H.J. Lu wrote:
> >> > On Sat, Jul 11, 2015 at 12:46:54PM +0200, Ondřej Bílka wrote:
> >> > > On Thu, Jul 09, 2015 at 09:07:24AM -0700, H.J. Lu wrote:
> >> > > > On Thu, Jul 9, 2015 at 7:28 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
> >> > > > > On Thu, Jul 09, 2015 at 07:12:24AM -0700, H.J. Lu wrote:
> >> > > > >> On Thu, Jul 9, 2015 at 6:37 AM, Zamyatin, Igor <igor.zamyatin@intel.com> wrote:
> >> > > > >> >> On Wed, Jul 8, 2015 at 8:56 AM, Zamyatin, Igor <igor.zamyatin@intel.com>
> >> > > > >> >> wrote:
> >> > > > >> >> > Fixed in the attached patch
> >> > > > >> >> >
> >> > > > >> >>
> >> > > > >> >> I fixed some typos and updated sysdeps/i386/configure for
> >> > > > >> >> HAVE_MPX_SUPPORT.  Please verify both with HAVE_MPX_SUPPORT and
> >> > > > >> >> without on i386 and x86-64.
> >> > > > >> >
> >> > > > >> > Done, all works fine
> >> > > > >> >
> >> > > > >>
> >> > > > >> I checked it in for you.
> >> > > > >>
> >> > > > > These are nice but you could have same problem with lazy tls allocation.
> >> > > > > I wrote patch to merge trampolines, which now conflicts. Could you write
> >> > > > > similar patch to solve that? Original purpose was to always save xmm
> >> > > > > registers so we could use sse2 routines which speeds up lookup time.
> >> > > >
> >> > > > So we will preserve only xmm0 to xmm7 in _dl_runtime_resolve? How
> >> > > > much gain it will give us?
> >> > > >
> >> > > I couldn't measure that without patch. Gain now would be big as we now
> >> > > use byte-by-byte loop to check symbol name which is slow, especially
> >> > > with c++ name mangling. Would be following benchmark good to measure
> >> > > speedup or do I need to measure startup time which is bit harder?
> >> > >
> >> >
> >> > Please try this.
> >> >
> >>
> >> We have to use movups instead of movaps due to
> >>
> >> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
> >>
> >>
> > Thanks, this looks promising.
> >
> > I think how to do definite benchmark, Now I have evidence that its
> > likely improvement but not definite.
> >
> > I found that benchmark that i intended causes too much noise and I
> > didn't get useful from that yet. It was creating 1000 functions in
> > library and calling them from main where performance between runs vary
> > by factor of 3 for same implementation.
> >
> > I have indirect evidence. With attached patch to use sse2 routines I
> > decreased startup time of running binaries when you run "make bench"
> > by ~6000 cycles and dlopen time by 4% on haswell and ivy bridge.
> >
> > See results on haswell of
> >
> > LD_DEBUG=statistics make bench &> old_rtld
> >
> > that are large so you could browse these here
> >
> > http://kam.mff.cuni.cz/~ondra/old_rtld
> > http://kam.mff.cuni.cz/~ondra/new_rtld
> >
> > For dlopen benchmark I measure ten times performance of
> > dlopen(RTLD_DEFAULT,"memcpy");
> > dlopen(RTLD_DEFAULT,"strlen");
> >
> > Without patch I get
> >  624.49  559.58  556.6 556.04  558.42  557.86  559.46  555.17  556.93  555.32
> > and with patch
> >   604.71  536.74  536.08  535.78  534.11  533.67  534.8 534.8 533.46 536.08
> >
> > I attached vip patches, I didn't change memcpy yet.
> >
> > So if you have idea how directly measure fixup change it would be
> > welcome.
> >
> 
> There is a potential performance issue.  This won't change parameters
> passed in S256-bit/512-bit vector registers because SSE load will only
> update the lower 128 bits of 256-bit/512-bit vector registers while
> preserving the upper bits.  But these SSE load operations may not be
> fast on all current and future processors.  To load the entire
> 256-bit/512-bit vector registers, we need to check CPU feature in
> each symbol lookup.  On the other hand, we can compile x86-64 ld.so
> with -msse2.  I don't know what the final performance impact is.
> 
Yes, these should be saved, due to the problems with modes.  There
could be a problem that saving them takes longer.  You don't need to
check CPU features on each call: make _dl_runtime_resolve a function
pointer and initialize it to the correct variant at startup (see the
sketch below).

It depends on whether the more complicated code is worth the gain.
On the other hand, reading the original code I found it was also
about fixing bug 15128, where an ifunc resolver clobbers xmm
registers; a similar problem exists with LD_AUDIT.

These could be solved with this approach, or by saving the xmm
registers only when an ifunc is called.  I don't know what the most
maintainable solution is.
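
A minimal sketch of that dispatch idea (hypothetical names; the real
entry points are assembly, and in ld.so the chosen variant could just
as well be written straight into GOT[2] during relocation setup,
avoiding the extra indirect call):

#include <cpuid.h>
#include <stdio.h>

/* Stand-ins for the two resolver variants (hypothetical names): one
   would save xmm0-xmm7, the other the full ymm0-ymm7.  */
static void resolve_sse (void) { }
static void resolve_avx (void) { }

static void (*dl_runtime_resolve_ptr) (void);

int
main (void)
{
  unsigned int eax, ebx, ecx, edx;

  /* CPUID leaf 1, ECX bit 28 is AVX.  A complete check would also
     test OSXSAVE and XCR0 (xgetbv) for enabled YMM state; that is
     omitted in this sketch.  */
  if (__get_cpuid (1, &eax, &ebx, &ecx, &edx) && (ecx & bit_AVX))
    dl_runtime_resolve_ptr = resolve_avx;
  else
    dl_runtime_resolve_ptr = resolve_sse;

  printf ("using the %s variant\n",
	  dl_runtime_resolve_ptr == resolve_avx ? "avx" : "sse");
  return 0;
}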

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Save and restore xmm0-xmm7 in _dl_runtime_resolve
  2015-07-27 10:10                               ` Ondřej Bílka
@ 2015-07-27 13:14                                 ` H.J. Lu
  2015-07-27 13:26                                   ` Ondřej Bílka
  0 siblings, 1 reply; 27+ messages in thread
From: H.J. Lu @ 2015-07-27 13:14 UTC (permalink / raw)
  To: Ondřej Bílka; +Cc: libc-alpha

On Mon, Jul 27, 2015 at 3:10 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
> On Sun, Jul 26, 2015 at 12:38:20PM -0700, H.J. Lu wrote:
>> On Sun, Jul 26, 2015 at 6:16 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
>> > On Sat, Jul 11, 2015 at 04:50:02PM -0700, H.J. Lu wrote:
>> >> On Sat, Jul 11, 2015 at 01:27:42PM -0700, H.J. Lu wrote:
>> >> > On Sat, Jul 11, 2015 at 12:46:54PM +0200, Ondřej Bílka wrote:
>> >> > > On Thu, Jul 09, 2015 at 09:07:24AM -0700, H.J. Lu wrote:
>> >> > > > On Thu, Jul 9, 2015 at 7:28 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
>> >> > > > > On Thu, Jul 09, 2015 at 07:12:24AM -0700, H.J. Lu wrote:
>> >> > > > >> On Thu, Jul 9, 2015 at 6:37 AM, Zamyatin, Igor <igor.zamyatin@intel.com> wrote:
>> >> > > > >> >> On Wed, Jul 8, 2015 at 8:56 AM, Zamyatin, Igor <igor.zamyatin@intel.com>
>> >> > > > >> >> wrote:
>> >> > > > >> >> > Fixed in the attached patch
>> >> > > > >> >> >
>> >> > > > >> >>
>> >> > > > >> >> I fixed some typos and updated sysdeps/i386/configure for
>> >> > > > >> >> HAVE_MPX_SUPPORT.  Please verify both with HAVE_MPX_SUPPORT and
>> >> > > > >> >> without on i386 and x86-64.
>> >> > > > >> >
>> >> > > > >> > Done, all works fine
>> >> > > > >> >
>> >> > > > >>
>> >> > > > >> I checked it in for you.
>> >> > > > >>
>> >> > > > > These are nice but you could have same problem with lazy tls allocation.
>> >> > > > > I wrote patch to merge trampolines, which now conflicts. Could you write
>> >> > > > > similar patch to solve that? Original purpose was to always save xmm
>> >> > > > > registers so we could use sse2 routines which speeds up lookup time.
>> >> > > >
>> >> > > > So we will preserve only xmm0 to xmm7 in _dl_runtime_resolve? How
>> >> > > > much gain it will give us?
>> >> > > >
>> >> > > I couldn't measure that without patch. Gain now would be big as we now
>> >> > > use byte-by-byte loop to check symbol name which is slow, especially
>> >> > > with c++ name mangling. Would be following benchmark good to measure
>> >> > > speedup or do I need to measure startup time which is bit harder?
>> >> > >
>> >> >
>> >> > Please try this.
>> >> >
>> >>
>> >> We have to use movups instead of movaps due to
>> >>
>> >> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
>> >>
>> >>
>> > Thanks, this looks promising.
>> >
>> > I think how to do definite benchmark, Now I have evidence that its
>> > likely improvement but not definite.
>> >
>> > I found that benchmark that i intended causes too much noise and I
>> > didn't get useful from that yet. It was creating 1000 functions in
>> > library and calling them from main where performance between runs vary
>> > by factor of 3 for same implementation.
>> >
>> > I have indirect evidence. With attached patch to use sse2 routines I
>> > decreased startup time of running binaries when you run "make bench"
>> > by ~6000 cycles and dlopen time by 4% on haswell and ivy bridge.
>> >
>> > See results on haswell of
>> >
>> > LD_DEBUG=statistics make bench &> old_rtld
>> >
>> > that are large so you could browse these here
>> >
>> > http://kam.mff.cuni.cz/~ondra/old_rtld
>> > http://kam.mff.cuni.cz/~ondra/new_rtld
>> >
>> > For dlopen benchmark I measure ten times performance of
>> > dlopen(RTLD_DEFAULT,"memcpy");
>> > dlopen(RTLD_DEFAULT,"strlen");
>> >
>> > Without patch I get
>> >  624.49  559.58  556.6 556.04  558.42  557.86  559.46  555.17  556.93  555.32
>> > and with patch
>> >   604.71  536.74  536.08  535.78  534.11  533.67  534.8 534.8 533.46 536.08
>> >
>> > I attached vip patches, I didn't change memcpy yet.
>> >
>> > So if you have idea how directly measure fixup change it would be
>> > welcome.
>> >
>>
>> There is a potential performance issue.  This won't change parameters
>> passed in S256-bit/512-bit vector registers because SSE load will only
>> update the lower 128 bits of 256-bit/512-bit vector registers while
>> preserving the upper bits.  But these SSE load operations may not be
>> fast on all current and future processors.  To load the entire
>> 256-bit/512-bit vector registers, we need to check CPU feature in
>> each symbol lookup.  On the other hand, we can compile x86-64 ld.so
>> with -msse2.  I don't know what the final performance impact is.
>>
> Yes, these should be saved due problems with modes. There could be
> problem that saving these takes longer. You don't need
> check cpu features on each call.
> Make _dl_runtime_resolve a function pointer and on
> startup initialize it to correct variant.

One more indirect call.

> This depends if more complicated code is worth gain. On other hand I
> read original code to find it was also about fixing bug 15128 where
> ifunc resolver clobbers xmm registers, similar problem is with LD_AUDIT.
>
> These could be solved with this or by saving xmm registers only when
> ifunc is called. I don't know whats best maintainable solution.

That is true.  Can you try compiling ld.so with -msse2?

-- 
H.J.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Save and restore xmm0-xmm7 in _dl_runtime_resolve
  2015-07-27 13:14                                 ` H.J. Lu
@ 2015-07-27 13:26                                   ` Ondřej Bílka
  2015-07-27 13:37                                     ` H.J. Lu
  0 siblings, 1 reply; 27+ messages in thread
From: Ondřej Bílka @ 2015-07-27 13:26 UTC (permalink / raw)
  To: H.J. Lu; +Cc: libc-alpha

On Mon, Jul 27, 2015 at 06:14:07AM -0700, H.J. Lu wrote:
> On Mon, Jul 27, 2015 at 3:10 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
> > On Sun, Jul 26, 2015 at 12:38:20PM -0700, H.J. Lu wrote:
> >> On Sun, Jul 26, 2015 at 6:16 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
> >> > On Sat, Jul 11, 2015 at 04:50:02PM -0700, H.J. Lu wrote:
> >> >> On Sat, Jul 11, 2015 at 01:27:42PM -0700, H.J. Lu wrote:
> >> >> > On Sat, Jul 11, 2015 at 12:46:54PM +0200, Ondřej Bílka wrote:
> >> >> > > On Thu, Jul 09, 2015 at 09:07:24AM -0700, H.J. Lu wrote:
> >> >> > > > On Thu, Jul 9, 2015 at 7:28 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
> >> >> > > > > On Thu, Jul 09, 2015 at 07:12:24AM -0700, H.J. Lu wrote:
> >> >> > > > >> On Thu, Jul 9, 2015 at 6:37 AM, Zamyatin, Igor <igor.zamyatin@intel.com> wrote:
> >> >> > > > >> >> On Wed, Jul 8, 2015 at 8:56 AM, Zamyatin, Igor <igor.zamyatin@intel.com>
> >> >> > > > >> >> wrote:
> >> >> > > > >> >> > Fixed in the attached patch
> >> >> > > > >> >> >
> >> >> > > > >> >>
> >> >> > > > >> >> I fixed some typos and updated sysdeps/i386/configure for
> >> >> > > > >> >> HAVE_MPX_SUPPORT.  Please verify both with HAVE_MPX_SUPPORT and
> >> >> > > > >> >> without on i386 and x86-64.
> >> >> > > > >> >
> >> >> > > > >> > Done, all works fine
> >> >> > > > >> >
> >> >> > > > >>
> >> >> > > > >> I checked it in for you.
> >> >> > > > >>
> >> >> > > > > These are nice, but you could have the same problem with lazy
> >> >> > > > > TLS allocation. I wrote a patch to merge the trampolines, which
> >> >> > > > > now conflicts. Could you write a similar patch to solve that?
> >> >> > > > > The original purpose was to always save xmm registers so we
> >> >> > > > > could use sse2 routines, which speeds up lookup time.
> >> >> > > >
> >> >> > > > So we will preserve only xmm0 to xmm7 in _dl_runtime_resolve? How
> >> >> > > > much gain will it give us?
> >> >> > > >
> >> >> > > I couldn't measure that without the patch. The gain now would be
> >> >> > > big, as we currently use a byte-by-byte loop to check the symbol
> >> >> > > name, which is slow, especially with C++ name mangling. Would the
> >> >> > > following benchmark be good to measure the speedup, or do I need
> >> >> > > to measure startup time, which is a bit harder?
> >> >> > >
> >> >> >
> >> >> > Please try this.
> >> >> >
> >> >>
> >> >> We have to use movups instead of movaps due to
> >> >>
> >> >> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
> >> >>
> >> >>
> >> > Thanks, this looks promising.
> >> >
> >> > I'm thinking about how to do a definitive benchmark. For now I have
> >> > evidence that it's likely an improvement, but it's not definite.
> >> >
> >> > I found that the benchmark I intended to use causes too much noise and
> >> > I haven't gotten anything useful from it yet. It created 1000 functions
> >> > in a library and called them from main, but performance between runs
> >> > varied by a factor of 3 for the same implementation.
> >> >
> >> > I have indirect evidence. With the attached patch to use sse2 routines
> >> > I decreased the startup time of the binaries run by "make bench"
> >> > by ~6000 cycles and dlopen time by 4% on Haswell and Ivy Bridge.
> >> >
> >> > See the results on Haswell of
> >> >
> >> > LD_DEBUG=statistics make bench &> old_rtld
> >> >
> >> > which are large, so you can browse them here:
> >> >
> >> > http://kam.mff.cuni.cz/~ondra/old_rtld
> >> > http://kam.mff.cuni.cz/~ondra/new_rtld
> >> >
> >> > For the dlopen benchmark I measure ten runs of
> >> > dlsym(RTLD_DEFAULT,"memcpy");
> >> > dlsym(RTLD_DEFAULT,"strlen");
> >> >
> >> > Without the patch I get
> >> >  624.49  559.58  556.6 556.04  558.42  557.86  559.46  555.17  556.93  555.32
> >> > and with the patch
> >> >   604.71  536.74  536.08  535.78  534.11  533.67  534.8 534.8 533.46 536.08
> >> >
> >> > I attached wip patches; I didn't change memcpy yet.
> >> >
> >> > So if you have an idea how to directly measure the fixup change, it
> >> > would be welcome.
> >> >
> >>
> >> There is a potential performance issue.  This won't change parameters
> >> passed in 256-bit/512-bit vector registers, because an SSE load only
> >> updates the lower 128 bits of a 256-bit/512-bit vector register while
> >> preserving the upper bits.  But these SSE load operations may not be
> >> fast on all current and future processors.  To load the entire
> >> 256-bit/512-bit vector registers, we would need to check the CPU
> >> features in each symbol lookup.  On the other hand, we can compile
> >> x86-64 ld.so with -msse2.  I don't know what the final performance
> >> impact is.
> >>
> > Yes, these should be saved due to problems with modes. There could be
> > a problem that saving them takes longer. You don't need to
> > check CPU features on each call.
> > Make _dl_runtime_resolve a function pointer and initialize it to the
> > correct variant at startup.
> 
> One more indirect call.
>
No, my proposal is different; we could do this:

void *_dl_runtime_resolve;
void startup (void)
{
  if (has_avx())
    _dl_runtime_resolve = _dl_runtime_resolve_avx;
  else
    _dl_runtime_resolve = _dl_runtime_resolve_sse;
}

Then we will assign the correct variant.
 

^ permalink raw reply	[flat|nested] 27+ messages in thread
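
The pattern proposed above, spelled out as a self-contained program (a
sketch; the variant bodies are stand-ins for the real trampolines, and
a complete AVX probe would also verify the XCR0 state via xgetbv):

#include <cpuid.h>
#include <stdio.h>

/* Stand-ins for the real trampoline variants.  */
static void resolve_avx (void) { puts ("avx variant"); }
static void resolve_sse (void) { puts ("sse variant"); }

/* Bound once at startup, then used like a normal entry point.  */
static void (*dl_runtime_resolve) (void);

/* CPUID-based AVX probe; incomplete, see the note above.  */
static int
has_avx (void)
{
  unsigned int eax, ebx, ecx, edx;
  if (!__get_cpuid (1, &eax, &ebx, &ecx, &edx))
    return 0;
  return (ecx & bit_OSXSAVE) && (ecx & bit_AVX);
}

static void
startup (void)
{
  dl_runtime_resolve = has_avx () ? resolve_avx : resolve_sse;
}

int
main (void)
{
  startup ();
  dl_runtime_resolve ();   /* No feature check on this call path.  */
  return 0;
}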

* Re: [PATCH] Save and restore xmm0-xmm7 in _dl_runtime_resolve
  2015-07-27 13:26                                   ` Ondřej Bílka
@ 2015-07-27 13:37                                     ` H.J. Lu
  2015-07-28 20:55                                       ` H.J. Lu
  0 siblings, 1 reply; 27+ messages in thread
From: H.J. Lu @ 2015-07-27 13:37 UTC (permalink / raw)
  To: Ondřej Bílka; +Cc: libc-alpha

On Mon, Jul 27, 2015 at 6:26 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
> On Mon, Jul 27, 2015 at 06:14:07AM -0700, H.J. Lu wrote:
>> On Mon, Jul 27, 2015 at 3:10 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
>> > On Sun, Jul 26, 2015 at 12:38:20PM -0700, H.J. Lu wrote:
>> >> On Sun, Jul 26, 2015 at 6:16 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
>> >> > On Sat, Jul 11, 2015 at 04:50:02PM -0700, H.J. Lu wrote:
>> >> >> On Sat, Jul 11, 2015 at 01:27:42PM -0700, H.J. Lu wrote:
>> >> >> > On Sat, Jul 11, 2015 at 12:46:54PM +0200, Ondřej Bílka wrote:
>> >> >> > > On Thu, Jul 09, 2015 at 09:07:24AM -0700, H.J. Lu wrote:
>> >> >> > > > On Thu, Jul 9, 2015 at 7:28 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
>> >> >> > > > > On Thu, Jul 09, 2015 at 07:12:24AM -0700, H.J. Lu wrote:
>> >> >> > > > >> On Thu, Jul 9, 2015 at 6:37 AM, Zamyatin, Igor <igor.zamyatin@intel.com> wrote:
>> >> >> > > > >> >> On Wed, Jul 8, 2015 at 8:56 AM, Zamyatin, Igor <igor.zamyatin@intel.com>
>> >> >> > > > >> >> wrote:
>> >> >> > > > >> >> > Fixed in the attached patch
>> >> >> > > > >> >> >
>> >> >> > > > >> >>
>> >> >> > > > >> >> I fixed some typos and updated sysdeps/i386/configure for
>> >> >> > > > >> >> HAVE_MPX_SUPPORT.  Please verify both with HAVE_MPX_SUPPORT and
>> >> >> > > > >> >> without on i386 and x86-64.
>> >> >> > > > >> >
>> >> >> > > > >> > Done, all works fine
>> >> >> > > > >> >
>> >> >> > > > >>
>> >> >> > > > >> I checked it in for you.
>> >> >> > > > >>
>> >> >> > > > > These are nice, but you could have the same problem with lazy
>> >> >> > > > > TLS allocation. I wrote a patch to merge the trampolines,
>> >> >> > > > > which now conflicts. Could you write a similar patch to solve
>> >> >> > > > > that? The original purpose was to always save xmm registers
>> >> >> > > > > so we could use sse2 routines, which speeds up lookup time.
>> >> >> > > >
>> >> >> > > > So we will preserve only xmm0 to xmm7 in _dl_runtime_resolve?
>> >> >> > > > How much gain will it give us?
>> >> >> > > >
>> >> >> > > I couldn't measure that without the patch. The gain now would be
>> >> >> > > big, as we currently use a byte-by-byte loop to check the symbol
>> >> >> > > name, which is slow, especially with C++ name mangling. Would the
>> >> >> > > following benchmark be good to measure the speedup, or do I need
>> >> >> > > to measure startup time, which is a bit harder?
>> >> >> > >
>> >> >> >
>> >> >> > Please try this.
>> >> >> >
>> >> >>
>> >> >> We have to use movups instead of movaps due to
>> >> >>
>> >> >> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
>> >> >>
>> >> >>
>> >> > Thanks, this looks promising.
>> >> >
>> >> > I'm thinking about how to do a definitive benchmark. For now I have
>> >> > evidence that it's likely an improvement, but it's not definite.
>> >> >
>> >> > I found that the benchmark I intended to use causes too much noise
>> >> > and I haven't gotten anything useful from it yet. It created 1000
>> >> > functions in a library and called them from main, but performance
>> >> > between runs varied by a factor of 3 for the same implementation.
>> >> >
>> >> > I have indirect evidence. With the attached patch to use sse2
>> >> > routines I decreased the startup time of the binaries run by "make
>> >> > bench" by ~6000 cycles and dlopen time by 4% on Haswell and Ivy Bridge.
>> >> >
>> >> > See the results on Haswell of
>> >> >
>> >> > LD_DEBUG=statistics make bench &> old_rtld
>> >> >
>> >> > which are large, so you can browse them here:
>> >> >
>> >> > http://kam.mff.cuni.cz/~ondra/old_rtld
>> >> > http://kam.mff.cuni.cz/~ondra/new_rtld
>> >> >
>> >> > For the dlopen benchmark I measure ten runs of
>> >> > dlsym(RTLD_DEFAULT,"memcpy");
>> >> > dlsym(RTLD_DEFAULT,"strlen");
>> >> >
>> >> > Without the patch I get
>> >> >  624.49  559.58  556.6 556.04  558.42  557.86  559.46  555.17  556.93  555.32
>> >> > and with the patch
>> >> >   604.71  536.74  536.08  535.78  534.11  533.67  534.8 534.8 533.46 536.08
>> >> >
>> >> > I attached wip patches; I didn't change memcpy yet.
>> >> >
>> >> > So if you have an idea how to directly measure the fixup change, it
>> >> > would be welcome.
>> >> >
>> >>
>> >> There is a potential performance issue.  This won't change parameters
>> >> passed in 256-bit/512-bit vector registers, because an SSE load only
>> >> updates the lower 128 bits of a 256-bit/512-bit vector register while
>> >> preserving the upper bits.  But these SSE load operations may not be
>> >> fast on all current and future processors.  To load the entire
>> >> 256-bit/512-bit vector registers, we would need to check the CPU
>> >> features in each symbol lookup.  On the other hand, we can compile
>> >> x86-64 ld.so with -msse2.  I don't know what the final performance
>> >> impact is.
>> >>
>> > Yes, these should be saved due to problems with modes. There could be
>> > a problem that saving them takes longer. You don't need to
>> > check CPU features on each call.
>> > Make _dl_runtime_resolve a function pointer and initialize it to the
>> > correct variant at startup.
>>
>> One more indirect call.
>>
> No, my proposal is different; we could do this:
>
> void *_dl_runtime_resolve;
> void startup (void)
> {
>   if (has_avx())
>     _dl_runtime_resolve = _dl_runtime_resolve_avx;
>   else
>     _dl_runtime_resolve = _dl_runtime_resolve_sse;
> }
>
> Then we will assign the correct variant.

Yes, this may work for both _dl_runtime_profile and
 _dl_runtime_resolve.  I will see what I can do.


-- 
H.J.

^ permalink raw reply	[flat|nested] 27+ messages in thread
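
The register behaviour this plan relies on, quoted again above, can be
demonstrated in isolation: a legacy-encoded SSE load writes only bits
0-127 of a vector register and leaves the ymm upper half intact (a
sketch; assumes an AVX-capable CPU and assembler):

#include <stdio.h>

int
main (void)
{
  float ones[8] = { 1, 1, 1, 1, 1, 1, 1, 1 };
  float twos[4] = { 2, 2, 2, 2 };
  float out[8];

  __asm__ volatile (
    "vmovups %1, %%ymm0\n\t"   /* ymm0 = eight 1.0f values          */
    "movups  %2, %%xmm0\n\t"   /* legacy SSE load: bits 0-127 only  */
    "vmovups %%ymm0, %0\n\t"   /* store all 256 bits back           */
    : "=m" (out)
    : "m" (ones), "m" (twos)
    : "xmm0");

  /* Prints "2 2 2 2 1 1 1 1": the upper half survived the SSE load.
     A VEX-encoded vmovups to an xmm register would zero it instead.  */
  for (int i = 0; i < 8; i++)
    printf ("%g ", out[i]);
  puts ("");
  return 0;
}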

* Re: [PATCH] Save and restore xmm0-xmm7 in _dl_runtime_resolve
  2015-07-27 13:37                                     ` H.J. Lu
@ 2015-07-28 20:55                                       ` H.J. Lu
  2015-07-29  2:03                                         ` H.J. Lu
  0 siblings, 1 reply; 27+ messages in thread
From: H.J. Lu @ 2015-07-28 20:55 UTC (permalink / raw)
  To: Ondřej Bílka; +Cc: libc-alpha

On Mon, Jul 27, 2015 at 6:37 AM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Mon, Jul 27, 2015 at 6:26 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
>> On Mon, Jul 27, 2015 at 06:14:07AM -0700, H.J. Lu wrote:
>>> >>
>>> >> There is a potential performance issue.  This won't change parameters
>>> >> passed in 256-bit/512-bit vector registers, because an SSE load only
>>> >> updates the lower 128 bits of a 256-bit/512-bit vector register while
>>> >> preserving the upper bits.  But these SSE load operations may not be
>>> >> fast on all current and future processors.  To load the entire
>>> >> 256-bit/512-bit vector registers, we would need to check the CPU
>>> >> features in each symbol lookup.  On the other hand, we can compile
>>> >> x86-64 ld.so with -msse2.  I don't know what the final performance
>>> >> impact is.
>>> >>
>>> > Yes, these should be saved due to problems with modes. There could be
>>> > a problem that saving them takes longer. You don't need to
>>> > check CPU features on each call.
>>> > Make _dl_runtime_resolve a function pointer and initialize it to the
>>> > correct variant at startup.
>>>
>>> One more indirect call.
>>>
>> No, my proposal is different; we could do this:
>>
>> void *_dl_runtime_resolve;
>> void startup (void)
>> {
>>   if (has_avx())
>>     _dl_runtime_resolve = _dl_runtime_resolve_avx;
>>   else
>>     _dl_runtime_resolve = _dl_runtime_resolve_sse;
>> }
>>
>> Then we will assign the correct variant.
>
> Yes, this may work for both _dl_runtime_profile and
>  _dl_runtime_resolve.  I will see what I can do.
>

Please try the hjl/pr18661 branch.  I implemented:

0000000000016fd0 t _dl_runtime_profile_avx
0000000000016b50 t _dl_runtime_profile_avx512
0000000000017450 t _dl_runtime_profile_sse
00000000000168d0 t _dl_runtime_resolve_avx
0000000000016780 t _dl_runtime_resolve_avx512
0000000000016a20 t _dl_runtime_resolve_sse

-- 
H.J.

^ permalink raw reply	[flat|nested] 27+ messages in thread
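
With three variants of each entry point, the one-time assignment from
earlier in the thread generalises to a tiered pick (hypothetical shape;
the predicates and stub bodies are placeholders, not the branch code):

#include <stdio.h>

typedef void (*resolver_fn) (void);

/* Stubs named after the entry points listed above.  */
static void resolve_avx512 (void) { puts ("_dl_runtime_resolve_avx512"); }
static void resolve_avx    (void) { puts ("_dl_runtime_resolve_avx"); }
static void resolve_sse    (void) { puts ("_dl_runtime_resolve_sse"); }

/* Pick the widest save/restore the CPU supports: zmm0-zmm7 for
   AVX-512, ymm0-ymm7 for AVX, xmm0-xmm7 otherwise.  */
static resolver_fn
select_resolver (int cpu_has_avx512, int cpu_has_avx)
{
  if (cpu_has_avx512)
    return resolve_avx512;
  if (cpu_has_avx)
    return resolve_avx;
  return resolve_sse;
}

int
main (void)
{
  select_resolver (0, 1) ();   /* Prints the avx variant's name.  */
  return 0;
}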

* Re: [PATCH] Save and restore xmm0-xmm7 in _dl_runtime_resolve
  2015-07-28 20:55                                       ` H.J. Lu
@ 2015-07-29  2:03                                         ` H.J. Lu
  2015-07-29 12:11                                           ` H.J. Lu
  0 siblings, 1 reply; 27+ messages in thread
From: H.J. Lu @ 2015-07-29  2:03 UTC (permalink / raw)
  To: Ondřej Bílka; +Cc: libc-alpha

On Tue, Jul 28, 2015 at 1:55 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Mon, Jul 27, 2015 at 6:37 AM, H.J. Lu <hjl.tools@gmail.com> wrote:
>> On Mon, Jul 27, 2015 at 6:26 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
>>> On Mon, Jul 27, 2015 at 06:14:07AM -0700, H.J. Lu wrote:
>>>> >>
>>>> >> There is a potential performance issue.  This won't change parameters
>>>> >> passed in 256-bit/512-bit vector registers, because an SSE load only
>>>> >> updates the lower 128 bits of a 256-bit/512-bit vector register while
>>>> >> preserving the upper bits.  But these SSE load operations may not be
>>>> >> fast on all current and future processors.  To load the entire
>>>> >> 256-bit/512-bit vector registers, we would need to check the CPU
>>>> >> features in each symbol lookup.  On the other hand, we can compile
>>>> >> x86-64 ld.so with -msse2.  I don't know what the final performance
>>>> >> impact is.
>>>> >>
>>>> > Yes, these should be saved due to problems with modes. There could be
>>>> > a problem that saving them takes longer. You don't need to
>>>> > check CPU features on each call.
>>>> > Make _dl_runtime_resolve a function pointer and initialize it to the
>>>> > correct variant at startup.
>>>>
>>>> One more indirect call.
>>>>
>>> No, my proposal is different; we could do this:
>>>
>>> void *_dl_runtime_resolve;
>>> void startup (void)
>>> {
>>>   if (has_avx())
>>>     _dl_runtime_resolve = _dl_runtime_resolve_avx;
>>>   else
>>>     _dl_runtime_resolve = _dl_runtime_resolve_sse;
>>> }
>>>
>>> Then we will assign the correct variant.
>>
>> Yes, this may work for both _dl_runtime_profile and
>>  _dl_runtime_resolve.  I will see what I can do.
>>
>
> Please try the hjl/pr18661 branch.  I implemented:
>
> 0000000000016fd0 t _dl_runtime_profile_avx
> 0000000000016b50 t _dl_runtime_profile_avx512
> 0000000000017450 t _dl_runtime_profile_sse
> 00000000000168d0 t _dl_runtime_resolve_avx
> 0000000000016780 t _dl_runtime_resolve_avx512
> 0000000000016a20 t _dl_runtime_resolve_sse

I enabled SSE in ld.so and it works fine.


-- 
H.J.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH] Save and restore xmm0-xmm7 in _dl_runtime_resolve
  2015-07-29  2:03                                         ` H.J. Lu
@ 2015-07-29 12:11                                           ` H.J. Lu
  0 siblings, 0 replies; 27+ messages in thread
From: H.J. Lu @ 2015-07-29 12:11 UTC (permalink / raw)
  To: Ondřej Bílka; +Cc: libc-alpha

On Tue, Jul 28, 2015 at 7:03 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Tue, Jul 28, 2015 at 1:55 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
>> On Mon, Jul 27, 2015 at 6:37 AM, H.J. Lu <hjl.tools@gmail.com> wrote:
>>> On Mon, Jul 27, 2015 at 6:26 AM, Ondřej Bílka <neleai@seznam.cz> wrote:
>>>> On Mon, Jul 27, 2015 at 06:14:07AM -0700, H.J. Lu wrote:
>>>>> >>
>>>>> >> There is a potential performance issue.  This won't change parameters
>>>>> >> passed in 256-bit/512-bit vector registers, because an SSE load only
>>>>> >> updates the lower 128 bits of a 256-bit/512-bit vector register while
>>>>> >> preserving the upper bits.  But these SSE load operations may not be
>>>>> >> fast on all current and future processors.  To load the entire
>>>>> >> 256-bit/512-bit vector registers, we would need to check the CPU
>>>>> >> features in each symbol lookup.  On the other hand, we can compile
>>>>> >> x86-64 ld.so with -msse2.  I don't know what the final performance
>>>>> >> impact is.
>>>>> >>
>>>>> > Yes, these should be saved due to problems with modes. There could be
>>>>> > a problem that saving them takes longer. You don't need to
>>>>> > check CPU features on each call.
>>>>> > Make _dl_runtime_resolve a function pointer and initialize it to the
>>>>> > correct variant at startup.
>>>>>
>>>>> One more indirect call.
>>>>>
>>>> No, my proposal is different; we could do this:
>>>>
>>>> void *_dl_runtime_resolve;
>>>> void startup (void)
>>>> {
>>>>   if (has_avx())
>>>>     _dl_runtime_resolve = _dl_runtime_resolve_avx;
>>>>   else
>>>>     _dl_runtime_resolve = _dl_runtime_resolve_sse;
>>>> }
>>>>
>>>> Then we will assign the correct variant.
>>>
>>> Yes, this may work for both _dl_runtime_profile and
>>>  _dl_runtime_resolve.  I will see what I can do.
>>>
>>
>> Please try the hjl/pr18661 branch.  I implemented:
>>
>> 0000000000016fd0 t _dl_runtime_profile_avx
>> 0000000000016b50 t _dl_runtime_profile_avx512
>> 0000000000017450 t _dl_runtime_profile_sse
>> 00000000000168d0 t _dl_runtime_resolve_avx
>> 0000000000016780 t _dl_runtime_resolve_avx512
>> 0000000000016a20 t _dl_runtime_resolve_sse
>
> I enabled SSE in ld.so and it works fine.
>

I fully enabled SSE in ld.so on the hjl/pr18661 branch.  I didn't notice
any issues on either AVX or non-AVX machines.  There may be some
performance improvement, but it is hard to tell.


-- 
H.J.

^ permalink raw reply	[flat|nested] 27+ messages in thread

end of thread, other threads:[~2015-07-29 12:11 UTC | newest]

Thread overview: 27+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-07-02 14:04 [PATCH, MPX] MPX-specific changes in dl_runtime routines Zamyatin, Igor
2015-07-02 14:33 ` H.J. Lu
2015-07-02 23:41   ` H.J. Lu
2015-07-03  2:38     ` H.J. Lu
2015-07-07 12:39     ` H.J. Lu
2015-07-08 15:25     ` Zamyatin, Igor
2015-07-08 15:34       ` H.J. Lu
2015-07-08 15:56         ` Zamyatin, Igor
2015-07-08 18:49           ` H.J. Lu
2015-07-09 13:37             ` Zamyatin, Igor
2015-07-09 14:12               ` H.J. Lu
2015-07-09 14:29                 ` Ondřej Bílka
2015-07-09 16:07                   ` H.J. Lu
2015-07-11 10:47                     ` Ondřej Bílka
2015-07-11 20:27                       ` H.J. Lu
2015-07-11 21:00                         ` H.J. Lu
2015-07-11 23:50                         ` [PATCH] Save and restore xmm0-xmm7 in _dl_runtime_resolve H.J. Lu
2015-07-12 21:50                           ` H.J. Lu
2015-07-26 13:16                           ` Ondřej Bílka
2015-07-26 19:38                             ` H.J. Lu
2015-07-27 10:10                               ` Ondřej Bílka
2015-07-27 13:14                                 ` H.J. Lu
2015-07-27 13:26                                   ` Ondřej Bílka
2015-07-27 13:37                                     ` H.J. Lu
2015-07-28 20:55                                       ` H.J. Lu
2015-07-29  2:03                                         ` H.J. Lu
2015-07-29 12:11                                           ` H.J. Lu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).