public inbox for binutils@sourceware.org
 help / color / mirror / Atom feed
* PR29189, dlltool delaylibs corrupt float/double arguments
@ 2023-05-17  0:52 Alan Modra
  2023-05-17 10:18 ` Jan Beulich
  0 siblings, 1 reply; 4+ messages in thread
From: Alan Modra @ 2023-05-17  0:52 UTC (permalink / raw)
  To: binutils; +Cc: Matthew Glazar, Jan Beulich, Dave Korn

PR29189 is an excellent bug report.  The reporter debugged the problem
to the point of finding out exactly where things were going wrong (in
a windows dll, so not a mingw problem) and even supplied a fix, giving
an ABI reference.  There hasn't been any action on the bug report due
to lack of an active mingw binutils maintainer, so I thought I'd take
a look as part of trying to whittle down the enormous number of
binutils bugzilla entries.

This is a rewrite of the patch given in the PR.  (It might even
resemble code emitted by Microsoft's LINK.EXE as reported in
https://www.mail-archive.com/llvm-bugs@lists.llvm.org/msg55205.html).
I'm not going to apply it without review from an x86 maintainer,
because it's been too long since I did any serious x86 specific work
and I don't want to break things for someone running an original amd64
machine.  The patch needs to be tested too.  I don't have a mingw
setup.

	PR 29189
	* dlltool.c (i386_x64_trampoline): Save and restore xmm0-3.  Make
	use of parameter save area for integer arg regs.  Comment.

diff --git a/binutils/dlltool.c b/binutils/dlltool.c
index 31c864d7d5c..eb2c4f34658 100644
--- a/binutils/dlltool.c
+++ b/binutils/dlltool.c
@@ -583,22 +583,37 @@ static const char i386_trampoline[] =
   "\tpopl %%ecx\n"
   "\tjmp *%%eax\n";
 
+/* Save integer arg regs in parameter space reserved by our caller
+   above the return address.  Allocate space for four fp arg regs plus
+   parameter space possibly used by __delayLoadHelper2 plus alignment.
+   We enter with the stack offset from 16-byte alignment by the return
+   address, so allocate 64 + 32 + 8 = 104 bytes.
+   FIXME: do we need to save avx ymm0-5 used to pass vectors args?  If
+   so, how to do it without blowing up on cpus without avx, cpuid?  */
 static const char i386_x64_trampoline[] =
-  "\tsubq $72, %%rsp\n"
-  "\t.seh_stackalloc 72\n"
+  "\tsubq $104, %%rsp\n"
+  "\t.seh_stackalloc 104\n"
   "\t.seh_endprologue\n"
-  "\tmovq %%rcx, 64(%%rsp)\n"
-  "\tmovq %%rdx, 56(%%rsp)\n"
-  "\tmovq %%r8, 48(%%rsp)\n"
-  "\tmovq %%r9, 40(%%rsp)\n"
-  "\tmovq  %%rax, %%rdx\n"
-  "\tleaq  __DELAY_IMPORT_DESCRIPTOR_%s(%%rip), %%rcx\n"
+  "\tmovq %%rcx, 104+8(%%rsp)\n"
+  "\tmovq %%rdx, 104+16(%%rsp)\n"
+  "\tmovq %%r8, 104+24(%%rsp)\n"
+  "\tmovq %%r9, 104+32(%%rsp)\n"
+  "\tmovaps %%xmm0, 32(%%rsp)\n"
+  "\tmovaps %%xmm1, 48(%%rsp)\n"
+  "\tmovaps %%xmm2, 64(%%rsp)\n"
+  "\tmovaps %%xmm3, 80(%%rsp)\n"
+  "\tmovq %%rax, %%rdx\n"
+  "\tleaq __DELAY_IMPORT_DESCRIPTOR_%s(%%rip), %%rcx\n"
   "\tcall __delayLoadHelper2\n"
-  "\tmovq 40(%%rsp), %%r9\n"
-  "\tmovq 48(%%rsp), %%r8\n"
-  "\tmovq 56(%%rsp), %%rdx\n"
-  "\tmovq 64(%%rsp), %%rcx\n"
-  "\taddq $72, %%rsp\n"
+  "\tmovq 104+8(%%rsp), %%rcx\n"
+  "\tmovq 104+16(%%rsp), %%rdx\n"
+  "\tmovq 104+24(%%rsp), %%r8\n"
+  "\tmovq 104+32(%%rsp), %%r9\n"
+  "\tmovaps 32(%%rsp), %%xmm0\n"
+  "\tmovaps 48(%%rsp), %%xmm1\n"
+  "\tmovaps 64(%%rsp), %%xmm2\n"
+  "\tmovaps 80(%%rsp), %%xmm3\n"
+  "\taddq $104, %%rsp\n"
   "\tjmp *%%rax\n";
 
 struct mac

-- 
Alan Modra
Australia Development Lab, IBM

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: PR29189, dlltool delaylibs corrupt float/double arguments
  2023-05-17  0:52 PR29189, dlltool delaylibs corrupt float/double arguments Alan Modra
@ 2023-05-17 10:18 ` Jan Beulich
  2023-05-17 23:52   ` Alan Modra
  0 siblings, 1 reply; 4+ messages in thread
From: Jan Beulich @ 2023-05-17 10:18 UTC (permalink / raw)
  To: Alan Modra; +Cc: Matthew Glazar, Dave Korn, binutils

On 17.05.2023 02:52, Alan Modra wrote:
> PR29189 is an excellent bug report.  The reporter debugged the problem
> to the point of finding out exactly where things were going wrong (in
> a windows dll, so not a mingw problem) and even supplied a fix, giving
> an ABI reference.  There hasn't been any action on the bug report due
> to lack of an active mingw binutils maintainer, so I thought I'd take
> a look as part of trying to whittle down the enormous number of
> binutils bugzilla entries.
> 
> This is a rewrite of the patch given in the PR.  (It might even
> resemble code emitted by Microsoft's LINK.EXE as reported in
> https://www.mail-archive.com/llvm-bugs@lists.llvm.org/msg55205.html).
> I'm not going to apply it without review from an x86 maintainer,
> because it's been too long since I did any serious x86 specific work
> and I don't want to break things for someone running an original amd64
> machine.  The patch needs to be tested too.  I don't have a mingw
> setup.
> 
> 	PR 29189
> 	* dlltool.c (i386_x64_trampoline): Save and restore xmm0-3.  Make
> 	use of parameter save area for integer arg regs.  Comment.

The change looks good to me, and I'm inclined to suggest that we postpone
dealing with wider vector registers (it's not just AVX, but also AVX512
which may need dealing with), in the hope that the system DLLs would only
ever touch the XMM ones, and only via legacy (pre-AVX) instructions (thus
leaving their upper halves / three quarters intact).

However, I'm inclined to suggest that we deal with %xmm4 and %xmm5 right
away: They're not callee preserved, so even a legacy-only system DLL
might clobber their low half/quarter.

Jan

> --- a/binutils/dlltool.c
> +++ b/binutils/dlltool.c
> @@ -583,22 +583,37 @@ static const char i386_trampoline[] =
>    "\tpopl %%ecx\n"
>    "\tjmp *%%eax\n";
>  
> +/* Save integer arg regs in parameter space reserved by our caller
> +   above the return address.  Allocate space for four fp arg regs plus
> +   parameter space possibly used by __delayLoadHelper2 plus alignment.
> +   We enter with the stack offset from 16-byte alignment by the return
> +   address, so allocate 64 + 32 + 8 = 104 bytes.
> +   FIXME: do we need to save avx ymm0-5 used to pass vectors args?  If
> +   so, how to do it without blowing up on cpus without avx, cpuid?  */
>  static const char i386_x64_trampoline[] =
> -  "\tsubq $72, %%rsp\n"
> -  "\t.seh_stackalloc 72\n"
> +  "\tsubq $104, %%rsp\n"
> +  "\t.seh_stackalloc 104\n"
>    "\t.seh_endprologue\n"
> -  "\tmovq %%rcx, 64(%%rsp)\n"
> -  "\tmovq %%rdx, 56(%%rsp)\n"
> -  "\tmovq %%r8, 48(%%rsp)\n"
> -  "\tmovq %%r9, 40(%%rsp)\n"
> -  "\tmovq  %%rax, %%rdx\n"
> -  "\tleaq  __DELAY_IMPORT_DESCRIPTOR_%s(%%rip), %%rcx\n"
> +  "\tmovq %%rcx, 104+8(%%rsp)\n"
> +  "\tmovq %%rdx, 104+16(%%rsp)\n"
> +  "\tmovq %%r8, 104+24(%%rsp)\n"
> +  "\tmovq %%r9, 104+32(%%rsp)\n"
> +  "\tmovaps %%xmm0, 32(%%rsp)\n"
> +  "\tmovaps %%xmm1, 48(%%rsp)\n"
> +  "\tmovaps %%xmm2, 64(%%rsp)\n"
> +  "\tmovaps %%xmm3, 80(%%rsp)\n"
> +  "\tmovq %%rax, %%rdx\n"
> +  "\tleaq __DELAY_IMPORT_DESCRIPTOR_%s(%%rip), %%rcx\n"
>    "\tcall __delayLoadHelper2\n"
> -  "\tmovq 40(%%rsp), %%r9\n"
> -  "\tmovq 48(%%rsp), %%r8\n"
> -  "\tmovq 56(%%rsp), %%rdx\n"
> -  "\tmovq 64(%%rsp), %%rcx\n"
> -  "\taddq $72, %%rsp\n"
> +  "\tmovq 104+8(%%rsp), %%rcx\n"
> +  "\tmovq 104+16(%%rsp), %%rdx\n"
> +  "\tmovq 104+24(%%rsp), %%r8\n"
> +  "\tmovq 104+32(%%rsp), %%r9\n"
> +  "\tmovaps 32(%%rsp), %%xmm0\n"
> +  "\tmovaps 48(%%rsp), %%xmm1\n"
> +  "\tmovaps 64(%%rsp), %%xmm2\n"
> +  "\tmovaps 80(%%rsp), %%xmm3\n"
> +  "\taddq $104, %%rsp\n"
>    "\tjmp *%%rax\n";
>  
>  struct mac
> 


^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: PR29189, dlltool delaylibs corrupt float/double arguments
  2023-05-17 10:18 ` Jan Beulich
@ 2023-05-17 23:52   ` Alan Modra
  2023-05-19  7:11     ` Jan Beulich
  0 siblings, 1 reply; 4+ messages in thread
From: Alan Modra @ 2023-05-17 23:52 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Matthew Glazar, Dave Korn, binutils

On Wed, May 17, 2023 at 12:18:15PM +0200, Jan Beulich wrote:
> On 17.05.2023 02:52, Alan Modra wrote:
> > PR29189 is an excellent bug report.  The reporter debugged the problem
> > to the point of finding out exactly where things were going wrong (in
> > a windows dll, so not a mingw problem) and even supplied a fix, giving
> > an ABI reference.  There hasn't been any action on the bug report due
> > to lack of an active mingw binutils maintainer, so I thought I'd take
> > a look as part of trying to whittle down the enormous number of
> > binutils bugzilla entries.
> > 
> > This is a rewrite of the patch given in the PR.  (It might even
> > resemble code emitted by Microsoft's LINK.EXE as reported in
> > https://www.mail-archive.com/llvm-bugs@lists.llvm.org/msg55205.html).
> > I'm not going to apply it without review from an x86 maintainer,
> > because it's been too long since I did any serious x86 specific work
> > and I don't want to break things for someone running an original amd64
> > machine.  The patch needs to be tested too.  I don't have a mingw
> > setup.
> > 
> > 	PR 29189
> > 	* dlltool.c (i386_x64_trampoline): Save and restore xmm0-3.  Make
> > 	use of parameter save area for integer arg regs.  Comment.
> 
> The change looks good to me, and I'm inclined to suggest that we postpone
> dealing with wider vector registers (it's not just AVX, but also AVX512
> which may need dealing with), in the hope that the system DLLs would only
> ever touch the XMM ones, and only via legacy (pre-AVX) instructions (thus
> leaving their upper halves / three quarters intact).

Yes, I think the same.  "only via pre-AVX insns" is more of a concern
than any real vector processing occurring.

> However, I'm inclined to suggest that we deal with %xmm4 and %xmm5 right
> away: They're not callee preserved, so even a legacy-only system DLL
> might clobber their low half/quarter.

That sounds very reasonable.  Here's an updated patch.

	PR 29189
	* dlltool.c (i386_x64_trampoline): Save and restore xmm0-5.  Make
	use of parameter save area for integer arg regs.  Comment.

diff --git a/binutils/dlltool.c b/binutils/dlltool.c
index 31c864d7d5c..3ae9de28f10 100644
--- a/binutils/dlltool.c
+++ b/binutils/dlltool.c
@@ -583,22 +583,48 @@ static const char i386_trampoline[] =
   "\tpopl %%ecx\n"
   "\tjmp *%%eax\n";
 
+/* Save integer arg regs in parameter space reserved by our caller
+   above the return address.  Allocate space for six fp arg regs plus
+   parameter space possibly used by __delayLoadHelper2 plus alignment.
+   We enter with the stack offset from 16-byte alignment by the return
+   address, so allocate 96 + 32 + 8 = 136 bytes.  Note that only the
+   first four xmm regs are used to pass fp args, but the first six
+   vector ymm (zmm too?) are used to pass vector args.  We are
+   assuming that volatile vector regs are not modified inside
+   __delayLoadHelper2.  However, it is known that at least xmm0 and
+   xmm1 are trashed in some versions of Microsoft dlls, and if xmm4 or
+   xmm5 are also used then that would trash the lower bits of ymm4 and
+   ymm5.  If it turns out that vector insns with a vex prefix are used
+   then we'll need to save ymm0-5 here but that can't be done without
+   first testing cpuid to see whether the instructions are available.  */
 static const char i386_x64_trampoline[] =
-  "\tsubq $72, %%rsp\n"
-  "\t.seh_stackalloc 72\n"
+  "\tsubq $136, %%rsp\n"
+  "\t.seh_stackalloc 136\n"
   "\t.seh_endprologue\n"
-  "\tmovq %%rcx, 64(%%rsp)\n"
-  "\tmovq %%rdx, 56(%%rsp)\n"
-  "\tmovq %%r8, 48(%%rsp)\n"
-  "\tmovq %%r9, 40(%%rsp)\n"
-  "\tmovq  %%rax, %%rdx\n"
-  "\tleaq  __DELAY_IMPORT_DESCRIPTOR_%s(%%rip), %%rcx\n"
+  "\tmovq %%rcx, 136+8(%%rsp)\n"
+  "\tmovq %%rdx, 136+16(%%rsp)\n"
+  "\tmovq %%r8, 136+24(%%rsp)\n"
+  "\tmovq %%r9, 136+32(%%rsp)\n"
+  "\tmovaps %%xmm0, 32(%%rsp)\n"
+  "\tmovaps %%xmm1, 48(%%rsp)\n"
+  "\tmovaps %%xmm2, 64(%%rsp)\n"
+  "\tmovaps %%xmm3, 80(%%rsp)\n"
+  "\tmovaps %%xmm4, 96(%%rsp)\n"
+  "\tmovaps %%xmm5, 112(%%rsp)\n"
+  "\tmovq %%rax, %%rdx\n"
+  "\tleaq __DELAY_IMPORT_DESCRIPTOR_%s(%%rip), %%rcx\n"
   "\tcall __delayLoadHelper2\n"
-  "\tmovq 40(%%rsp), %%r9\n"
-  "\tmovq 48(%%rsp), %%r8\n"
-  "\tmovq 56(%%rsp), %%rdx\n"
-  "\tmovq 64(%%rsp), %%rcx\n"
-  "\taddq $72, %%rsp\n"
+  "\tmovq 136+8(%%rsp), %%rcx\n"
+  "\tmovq 136+16(%%rsp), %%rdx\n"
+  "\tmovq 136+24(%%rsp), %%r8\n"
+  "\tmovq 136+32(%%rsp), %%r9\n"
+  "\tmovaps 32(%%rsp), %%xmm0\n"
+  "\tmovaps 48(%%rsp), %%xmm1\n"
+  "\tmovaps 64(%%rsp), %%xmm2\n"
+  "\tmovaps 80(%%rsp), %%xmm3\n"
+  "\tmovaps 96(%%rsp), %%xmm4\n"
+  "\tmovaps 112(%%rsp), %%xmm5\n"
+  "\taddq $136, %%rsp\n"
   "\tjmp *%%rax\n";
 
 struct mac

-- 
Alan Modra
Australia Development Lab, IBM

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: PR29189, dlltool delaylibs corrupt float/double arguments
  2023-05-17 23:52   ` Alan Modra
@ 2023-05-19  7:11     ` Jan Beulich
  0 siblings, 0 replies; 4+ messages in thread
From: Jan Beulich @ 2023-05-19  7:11 UTC (permalink / raw)
  To: Alan Modra; +Cc: Matthew Glazar, Dave Korn, binutils

On 18.05.2023 01:52, Alan Modra wrote:
> On Wed, May 17, 2023 at 12:18:15PM +0200, Jan Beulich wrote:
>> On 17.05.2023 02:52, Alan Modra wrote:
>>> PR29189 is an excellent bug report.  The reporter debugged the problem
>>> to the point of finding out exactly where things were going wrong (in
>>> a windows dll, so not a mingw problem) and even supplied a fix, giving
>>> an ABI reference.  There hasn't been any action on the bug report due
>>> to lack of an active mingw binutils maintainer, so I thought I'd take
>>> a look as part of trying to whittle down the enormous number of
>>> binutils bugzilla entries.
>>>
>>> This is a rewrite of the patch given in the PR.  (It might even
>>> resemble code emitted by Microsoft's LINK.EXE as reported in
>>> https://www.mail-archive.com/llvm-bugs@lists.llvm.org/msg55205.html).
>>> I'm not going to apply it without review from an x86 maintainer,
>>> because it's been too long since I did any serious x86 specific work
>>> and I don't want to break things for someone running an original amd64
>>> machine.  The patch needs to be tested too.  I don't have a mingw
>>> setup.
>>>
>>> 	PR 29189
>>> 	* dlltool.c (i386_x64_trampoline): Save and restore xmm0-3.  Make
>>> 	use of parameter save area for integer arg regs.  Comment.
>>
>> The change looks good to me, and I'm inclined to suggest that we postpone
>> dealing with wider vector registers (it's not just AVX, but also AVX512
>> which may need dealing with), in the hope that the system DLLs would only
>> ever touch the XMM ones, and only via legacy (pre-AVX) instructions (thus
>> leaving their upper halves / three quarters intact).
> 
> Yes, I think the same.  "only via pre-AVX insns" is more of a concern
> than any real vector processing occurring.
> 
>> However, I'm inclined to suggest that we deal with %xmm4 and %xmm5 right
>> away: They're not callee preserved, so even a legacy-only system DLL
>> might clobber their low half/quarter.
> 
> That sounds very reasonable.  Here's an updated patch.
> 
> 	PR 29189
> 	* dlltool.c (i386_x64_trampoline): Save and restore xmm0-5.  Make
> 	use of parameter save area for integer arg regs.  Comment.

Lgtm, fwiw. One more remark though (you may want to further extend the
comment):

> --- a/binutils/dlltool.c
> +++ b/binutils/dlltool.c
> @@ -583,22 +583,48 @@ static const char i386_trampoline[] =
>    "\tpopl %%ecx\n"
>    "\tjmp *%%eax\n";
>  
> +/* Save integer arg regs in parameter space reserved by our caller
> +   above the return address.  Allocate space for six fp arg regs plus
> +   parameter space possibly used by __delayLoadHelper2 plus alignment.
> +   We enter with the stack offset from 16-byte alignment by the return
> +   address, so allocate 96 + 32 + 8 = 136 bytes.  Note that only the
> +   first four xmm regs are used to pass fp args, but the first six
> +   vector ymm (zmm too?) are used to pass vector args.  We are
> +   assuming that volatile vector regs are not modified inside
> +   __delayLoadHelper2.  However, it is known that at least xmm0 and
> +   xmm1 are trashed in some versions of Microsoft dlls, and if xmm4 or
> +   xmm5 are also used then that would trash the lower bits of ymm4 and
> +   ymm5.  If it turns out that vector insns with a vex prefix are used
> +   then we'll need to save ymm0-5 here but that can't be done without
> +   first testing cpuid to see whether the instructions are available.  */

Checking CPUID alone isn't going to be sufficient. XCR0 would also need
consulting.

Jan

>  static const char i386_x64_trampoline[] =
> -  "\tsubq $72, %%rsp\n"
> -  "\t.seh_stackalloc 72\n"
> +  "\tsubq $136, %%rsp\n"
> +  "\t.seh_stackalloc 136\n"
>    "\t.seh_endprologue\n"
> -  "\tmovq %%rcx, 64(%%rsp)\n"
> -  "\tmovq %%rdx, 56(%%rsp)\n"
> -  "\tmovq %%r8, 48(%%rsp)\n"
> -  "\tmovq %%r9, 40(%%rsp)\n"
> -  "\tmovq  %%rax, %%rdx\n"
> -  "\tleaq  __DELAY_IMPORT_DESCRIPTOR_%s(%%rip), %%rcx\n"
> +  "\tmovq %%rcx, 136+8(%%rsp)\n"
> +  "\tmovq %%rdx, 136+16(%%rsp)\n"
> +  "\tmovq %%r8, 136+24(%%rsp)\n"
> +  "\tmovq %%r9, 136+32(%%rsp)\n"
> +  "\tmovaps %%xmm0, 32(%%rsp)\n"
> +  "\tmovaps %%xmm1, 48(%%rsp)\n"
> +  "\tmovaps %%xmm2, 64(%%rsp)\n"
> +  "\tmovaps %%xmm3, 80(%%rsp)\n"
> +  "\tmovaps %%xmm4, 96(%%rsp)\n"
> +  "\tmovaps %%xmm5, 112(%%rsp)\n"
> +  "\tmovq %%rax, %%rdx\n"
> +  "\tleaq __DELAY_IMPORT_DESCRIPTOR_%s(%%rip), %%rcx\n"
>    "\tcall __delayLoadHelper2\n"
> -  "\tmovq 40(%%rsp), %%r9\n"
> -  "\tmovq 48(%%rsp), %%r8\n"
> -  "\tmovq 56(%%rsp), %%rdx\n"
> -  "\tmovq 64(%%rsp), %%rcx\n"
> -  "\taddq $72, %%rsp\n"
> +  "\tmovq 136+8(%%rsp), %%rcx\n"
> +  "\tmovq 136+16(%%rsp), %%rdx\n"
> +  "\tmovq 136+24(%%rsp), %%r8\n"
> +  "\tmovq 136+32(%%rsp), %%r9\n"
> +  "\tmovaps 32(%%rsp), %%xmm0\n"
> +  "\tmovaps 48(%%rsp), %%xmm1\n"
> +  "\tmovaps 64(%%rsp), %%xmm2\n"
> +  "\tmovaps 80(%%rsp), %%xmm3\n"
> +  "\tmovaps 96(%%rsp), %%xmm4\n"
> +  "\tmovaps 112(%%rsp), %%xmm5\n"
> +  "\taddq $136, %%rsp\n"
>    "\tjmp *%%rax\n";
>  
>  struct mac
> 


^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2023-05-19  7:11 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-05-17  0:52 PR29189, dlltool delaylibs corrupt float/double arguments Alan Modra
2023-05-17 10:18 ` Jan Beulich
2023-05-17 23:52   ` Alan Modra
2023-05-19  7:11     ` Jan Beulich

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).