From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <libffi-discuss-return-1969-listarch-libffi-discuss=sources.redhat.com@sourceware.org>
Received: (qmail 8524 invoked by alias); 28 Oct 2014 18:32:20 -0000
Mailing-List: contact libffi-discuss-help@sourceware.org; run by ezmlm
Precedence: bulk
List-Id: <libffi-discuss.sourceware.org>
List-Subscribe: <mailto:libffi-discuss-subscribe@sourceware.org>
List-Archive: <http://sourceware.org/ml/libffi-discuss/>
List-Post: <mailto:libffi-discuss@sourceware.org>
List-Help: <mailto:libffi-discuss-help@sourceware.org>, <http://sourceware.org/ml/#faqs>
Sender: libffi-discuss-owner@sourceware.org
Received: (qmail 8446 invoked by uid 89); 28 Oct 2014 18:32:20 -0000
Authentication-Results: sourceware.org; auth=none
X-Virus-Found: No
X-Spam-SWARE-Status: No, score=-0.3 required=5.0 tests=AWL,BAYES_50,FREEMAIL_ENVFROM_END_DIGIT,FREEMAIL_FROM,KAM_STOCKGEN,RCVD_IN_DNSWL_LOW,SPF_PASS autolearn=no version=3.3.2
X-HELO: mail-qa0-f52.google.com
Received: from mail-qa0-f52.google.com (HELO mail-qa0-f52.google.com) (209.85.216.52) by sourceware.org (qpsmtpd/0.93/v0.84-503-g423c35a) with (AES128-SHA encrypted) ESMTPS; Tue, 28 Oct 2014 18:32:16 +0000
Received: by mail-qa0-f52.google.com with SMTP id u7so893542qaz.25        for <libffi-discuss@sourceware.org>; Tue, 28 Oct 2014 11:32:13 -0700 (PDT)
X-Received: by 10.140.105.37 with SMTP id b34mr7252684qgf.91.1414521133483;        Tue, 28 Oct 2014 11:32:13 -0700 (PDT)
Received: from anchor.com (50-194-63-110-static.hfc.comcastbusiness.net. [50.194.63.110])        by mx.google.com with ESMTPSA id j1sm1948207qao.38.2014.10.28.11.32.12        for <multiple recipients>        (version=TLSv1.2 cipher=ECDHE-RSA-AES128-GCM-SHA256 bits=128/128);        Tue, 28 Oct 2014 11:32:12 -0700 (PDT)
From: Richard Henderson <rth@twiddle.net>
To: libffi-discuss@sourceware.org
Cc: Kai Tietz <ktietz@redhat.com>
Subject: [PATCH 4/8] win64: Rewrite
Date: Tue, 28 Oct 2014 18:32:00 -0000
Message-Id: <1414521094-18403-5-git-send-email-rth@twiddle.net>
In-Reply-To: <1414521094-18403-1-git-send-email-rth@twiddle.net>
References: <1414521094-18403-1-git-send-email-rth@twiddle.net>
X-SW-Source: 2014/txt/msg00126.txt.bz2

It's way too different from the 32-bit ABIs with which it is
currently associated. As seen from all of the existing XFAILs.

Cc: Kai Tietz <ktietz@redhat.com>
---
 Makefile.am                                        |   4 +-
 src/x86/ffitarget.h                                |  29 +-
 src/x86/ffiw64.c                                   | 281 +++++++++
 src/x86/win64.S                                    | 693 ++++++---------------
 testsuite/libffi.call/call.exp                     |  13 +-
 testsuite/libffi.call/cls_align_longdouble_split.c |   2 -
 .../libffi.call/cls_align_longdouble_split2.c      |   2 -
 testsuite/libffi.call/cls_longdouble.c             |   2 -
 testsuite/libffi.call/float2.c                     |   3 -
 testsuite/libffi.call/huge_struct.c                |   2 -
 testsuite/libffi.call/return_ldl.c                 |   1 -
 11 files changed, 496 insertions(+), 536 deletions(-)
 create mode 100644 src/x86/ffiw64.c
diff --git a/Makefile.am b/Makefile.am
index 0e40451..3d1ecae 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -37,7 +37,7 @@ EXTRA_DIST = LICENSE ChangeLog.v1 ChangeLog.libgcj			\
 	 src/sh64/sysv.S src/sh64/ffitarget.h src/sparc/v8.S		\
 	 src/sparc/v9.S src/sparc/ffitarget.h src/sparc/ffi.c		\
 	 src/x86/darwin64.S src/x86/ffi.c src/x86/sysv.S		\
-	 src/x86/win32.S src/x86/darwin.S src/x86/win64.S		\
+	 src/x86/win32.S src/x86/darwin.S src/x86/ffiw64.c src/x86/win64.S \
 	 src/x86/freebsd.S src/x86/ffi64.c src/x86/unix64.S		\
 	 src/x86/ffitarget.h src/pa/ffitarget.h src/pa/ffi.c		\
 	 src/pa/linux.S src/pa/hpux32.S src/frv/ffi.c src/bfin/ffi.c	\
@@ -135,7 +135,7 @@ if X86_WIN32
 nodist_libffi_la_SOURCES += src/x86/ffi.c src/x86/win32.S
 endif
 if X86_WIN64
-nodist_libffi_la_SOURCES += src/x86/ffi.c src/x86/win64.S
+nodist_libffi_la_SOURCES += src/x86/ffiw64.c src/x86/win64.S
 endif
 if X86_DARWIN
 nodist_libffi_la_SOURCES += src/x86/ffi.c src/x86/darwin.S src/x86/ffi64.c src/x86/darwin64.S
diff --git a/src/x86/ffitarget.h b/src/x86/ffitarget.h
index 0d295e0..8c52573 100644
--- a/src/x86/ffitarget.h
+++ b/src/x86/ffitarget.h
@@ -127,25 +127,18 @@ typedef enum ffi_abi {
 #define FFI_TYPE_SMALL_STRUCT_4B (FFI_TYPE_LAST + 3)
 #define FFI_TYPE_MS_STRUCT       (FFI_TYPE_LAST + 4)
 
-#if defined (X86_64) || (defined (__x86_64__) && defined (X86_DARWIN))
-#define FFI_TRAMPOLINE_SIZE 24
-#define FFI_NATIVE_RAW_API 0
-#define FFI_GO_CLOSURES 1
+#if defined (X86_64) || defined(X86_WIN64) \
+    || (defined (__x86_64__) && defined (X86_DARWIN))
+# define FFI_TRAMPOLINE_SIZE 24
+# define FFI_NATIVE_RAW_API 0
+# define FFI_GO_CLOSURES 1
 #else
-#ifdef X86_WIN32
-#define FFI_TRAMPOLINE_SIZE 52
-#else
-#ifdef X86_WIN64
-#define FFI_TRAMPOLINE_SIZE 29
-#define FFI_NATIVE_RAW_API 0
-#define FFI_NO_RAW_API 1
-#else
-#define FFI_TRAMPOLINE_SIZE 10
-#endif
-#endif
-#ifndef X86_WIN64
-#define FFI_NATIVE_RAW_API 1  /* x86 has native raw api support */
-#endif
+# ifdef X86_WIN32
+#  define FFI_TRAMPOLINE_SIZE 52
+# else
+#  define FFI_TRAMPOLINE_SIZE 10
+# endif
+# define FFI_NATIVE_RAW_API 1  /* x86 has native raw api support */
 #endif
 
 #endif
diff --git a/src/x86/ffiw64.c b/src/x86/ffiw64.c
new file mode 100644
index 0000000..316f544
--- /dev/null
+++ b/src/x86/ffiw64.c
@@ -0,0 +1,281 @@
+/* -----------------------------------------------------------------------
+   ffiw64.c - Copyright (c) 2014 Red Hat, Inc.
+
+   x86 win64 Foreign Function Interface
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   ``Software''), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be included
+   in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+   WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+   DEALINGS IN THE SOFTWARE.
+   ----------------------------------------------------------------------- */
+
+#include <ffi.h>
+#include <ffi_common.h>
+#include <stdlib.h>
+#include <stdint.h>
+
+#ifdef X86_WIN64
+
+struct win64_call_frame
+{
+  UINT64 rbp;		/* 0 */
+  UINT64 retaddr;	/* 8 */
+  UINT64 fn;		/* 16 */
+  UINT64 flags;		/* 24 */
+  UINT64 rvalue;	/* 32 */
+};
+
+extern void ffi_call_win64 (void *stack, struct win64_call_frame *,
+			    void *closure) FFI_HIDDEN;
+
+ffi_status
+ffi_prep_cif_machdep (ffi_cif *cif)
+{
+  int flags, n;
+
+  if (cif->abi != FFI_WIN64)
+    return FFI_BAD_ABI;
+
+  flags = cif->rtype->type;
+  switch (flags)
+    {
+    default:
+      break;
+    case FFI_TYPE_LONGDOUBLE:
+      flags = FFI_TYPE_STRUCT;
+      break;
+    case FFI_TYPE_COMPLEX:
+      flags = FFI_TYPE_STRUCT;
+      /* FALLTHRU */
+    case FFI_TYPE_STRUCT:
+      switch (cif->rtype->size)
+	{
+	case 8:
+	  flags = FFI_TYPE_UINT64;
+	  break;
+	case 4:
+	  flags = FFI_TYPE_SMALL_STRUCT_4B;
+	  break;
+	case 2:
+	  flags = FFI_TYPE_SMALL_STRUCT_2B;
+	  break;
+	case 1:
+	  flags = FFI_TYPE_SMALL_STRUCT_1B;
+	  break;
+	}
+      break;
+    }
+  cif->flags = flags;
+
+  /* Each argument either fits in a register, an 8 byte slot, or is
+     passed by reference with the pointer in the 8 byte slot.  */
+  n = cif->nargs;
+  n += (flags == FFI_TYPE_STRUCT);
+  if (n < 4)
+    n = 4;
+  cif->bytes = n * 8;
+
+  return FFI_OK;
+}
+
+static void
+ffi_call_int (ffi_cif *cif, void (*fn)(void), void *rvalue,
+	      void **avalue, void *closure)
+{
+  int i, j, n, flags;
+  UINT64 *stack;
+  size_t rsize;
+  struct win64_call_frame *frame;
+
+  FFI_ASSERT(cif->abi == FFI_WIN64);
+
+  flags = cif->flags;
+  rsize = 0;
+
+  /* If we have no return value for a structure, we need to create one.
+     Otherwise we can ignore the return type entirely.  */
+  if (rvalue == NULL)
+    {
+      if (flags == FFI_TYPE_STRUCT)
+	rsize = cif->rtype->size;
+      else
+	flags = FFI_TYPE_VOID;
+    }
+
+  stack = alloca(cif->bytes + sizeof(struct win64_call_frame) + rsize);
+  frame = (struct win64_call_frame *)((char *)stack + cif->bytes);
+  if (rsize)
+    rvalue = frame + 1;
+
+  frame->fn = (uintptr_t)fn;
+  frame->flags = flags;
+  frame->rvalue = (uintptr_t)rvalue;
+
+  j = 0;
+  if (flags == FFI_TYPE_STRUCT)
+    {
+      stack[0] = (uintptr_t)rvalue;
+      j = 1;
+    }
+
+  for (i = 0, n = cif->nargs; i < n; ++i, ++j)
+    {
+      switch (cif->arg_types[i]->size)
+	{
+	case 8:
+	  stack[j] = *(UINT64 *)avalue[i];
+	  break;
+	case 4:
+	  stack[j] = *(UINT32 *)avalue[i];
+	  break;
+	case 2:
+	  stack[j] = *(UINT16 *)avalue[i];
+	  break;
+	case 1:
+	  stack[j] = *(UINT8 *)avalue[i];
+	  break;
+	default:
+	  stack[j] = (uintptr_t)avalue[i];
+	  break;
+	}
+    }
+
+  ffi_call_win64 (stack, frame, closure);
+}
+
+void
+ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
+{
+  ffi_call_int (cif, fn, rvalue, avalue, NULL);
+}
+
+void
+ffi_call_go (ffi_cif *cif, void (*fn)(void), void *rvalue,
+	     void **avalue, void *closure)
+{
+  ffi_call_int (cif, fn, rvalue, avalue, closure);
+}
+
+
+extern void ffi_closure_win64(void) FFI_HIDDEN;
+extern void ffi_go_closure_win64(void) FFI_HIDDEN;
+
+ffi_status
+ffi_prep_closure_loc (ffi_closure* closure,
+		      ffi_cif* cif,
+		      void (*fun)(ffi_cif*, void*, void**, void*),
+		      void *user_data,
+		      void *codeloc)
+{
+  static const unsigned char trampoline[16] = {
+    /* leaq  -0x7(%rip),%r10   # 0x0  */
+    0x4c, 0x8d, 0x15, 0xf9, 0xff, 0xff, 0xff,
+    /* jmpq  *0x3(%rip)        # 0x10 */
+    0xff, 0x25, 0x03, 0x00, 0x00, 0x00,
+    /* nopl  (%rax) */
+    0x0f, 0x1f, 0x00
+  };
+  void *tramp = closure->tramp;
+
+  if (cif->abi != FFI_WIN64)
+    return FFI_BAD_ABI;
+
+  memcpy (tramp, trampoline, sizeof(trampoline));
+  *(UINT64 *)(tramp + 16) = (uintptr_t)ffi_closure_win64;
+
+  closure->cif = cif;
+  closure->fun = fun;
+  closure->user_data = user_data;
+
+  return FFI_OK;
+}
+
+ffi_status
+ffi_prep_go_closure (ffi_go_closure* closure, ffi_cif* cif,
+		     void (*fun)(ffi_cif*, void*, void**, void*))
+{
+  if (cif->abi != FFI_WIN64)
+    return FFI_BAD_ABI;
+
+  closure->tramp = ffi_go_closure_win64;
+  closure->cif = cif;
+  closure->fun = fun;
+
+  return FFI_OK;
+}
+
+struct win64_closure_frame
+{
+  UINT64 rvalue[2];
+  UINT64 fargs[4];
+  UINT64 retaddr;
+  UINT64 args[];
+};
+
+int FFI_HIDDEN
+ffi_closure_win64_inner(ffi_cif *cif,
+			void (*fun)(ffi_cif*, void*, void**, void*),
+			void *user_data,
+			struct win64_closure_frame *frame)
+{
+  void **avalue;
+  void *rvalue;
+  int i, n, nreg, flags;
+
+  avalue = alloca(cif->nargs * sizeof(void *));
+  rvalue = frame->rvalue;
+  nreg = 0;
+
+  /* When returning a structure, the address is in the first argument.
+     We must also be prepared to return the same address in eax, so
+     install that address in the frame and pretend we return a pointer.  */
+  flags = cif->flags;
+  if (flags == FFI_TYPE_STRUCT)
+    {
+      rvalue = (void *)(uintptr_t)frame->args[0];
+      frame->rvalue[0] = frame->args[0];
+      nreg = 1;
+    }
+
+  for (i = 0, n = cif->nargs; i < n; ++i, ++nreg)
+    {
+      size_t size = cif->arg_types[i]->size;
+      size_t type = cif->arg_types[i]->type;
+      void *a;
+
+      if (type == FFI_TYPE_DOUBLE || type == FFI_TYPE_FLOAT)
+	{
+	  if (nreg < 4)
+	    a = &frame->fargs[nreg];
+	  else
+	    a = &frame->args[nreg];
+	}
+      else if (size == 1 || size == 2 || size == 4 || size == 8)
+	a = &frame->args[nreg];
+      else
+	a = (void *)(uintptr_t)frame->args[nreg];
+
+      avalue[i] = a;
+    }
+
+  /* Invoke the closure.  */
+  fun (cif, rvalue, avalue, user_data);
+  return flags;
+}
+
+#endif /* X86_WIN64 */
diff --git a/src/x86/win64.S b/src/x86/win64.S
index 687f97c..a5a20b6 100644
--- a/src/x86/win64.S
+++ b/src/x86/win64.S
@@ -1,264 +1,16 @@
 #define LIBFFI_ASM
 #include <fficonfig.h>
 #include <ffi.h>
+#include <ffi_cfi.h>
 
-/* Constants for ffi_call_win64 */
-#define STACK 0
-#define PREP_ARGS_FN 32
-#define ECIF 40
-#define CIF_BYTES 48
-#define CIF_FLAGS 56
-#define RVALUE 64
-#define FN 72
-
-/* ffi_call_win64 (void (*prep_args_fn)(char *, extended_cif *),
-		   extended_cif *ecif, unsigned bytes, unsigned flags,
-		   unsigned *rvalue, void (*fn)());
- */
-
-#ifdef _MSC_VER
-PUBLIC	ffi_call_win64
-
-EXTRN	__chkstk:NEAR
-EXTRN	ffi_closure_win64_inner:NEAR
-
-_TEXT	SEGMENT
-
-;;; ffi_closure_win64 will be called with these registers set:
-;;;    rax points to 'closure'
-;;;    r11 contains a bit mask that specifies which of the
-;;;    first four parameters are float or double
-;;;
-;;; It must move the parameters passed in registers to their stack location,
-;;; call ffi_closure_win64_inner for the actual work, then return the result.
-;;;
-ffi_closure_win64 PROC FRAME
-	;; copy register arguments onto stack
-	test	r11, 1
-	jne	first_is_float
-	mov	QWORD PTR [rsp+8], rcx
-	jmp	second
-first_is_float:
-	movlpd	QWORD PTR [rsp+8], xmm0
-
-second:
-	test	r11, 2
-	jne	second_is_float
-	mov	QWORD PTR [rsp+16], rdx
-	jmp	third
-second_is_float:
-	movlpd	QWORD PTR [rsp+16], xmm1
-
-third:
-	test	r11, 4
-	jne	third_is_float
-	mov	QWORD PTR [rsp+24], r8
-	jmp	fourth
-third_is_float:
-	movlpd	QWORD PTR [rsp+24], xmm2
-
-fourth:
-	test	r11, 8
-	jne	fourth_is_float
-	mov	QWORD PTR [rsp+32], r9
-	jmp	done
-fourth_is_float:
-	movlpd	QWORD PTR [rsp+32], xmm3
-
-done:
-	.ALLOCSTACK 40
-	sub	rsp, 40
-	.ENDPROLOG
-	mov	rcx, rax	; context is first parameter
-	mov	rdx, rsp	; stack is second parameter
-	add	rdx, 48		; point to start of arguments
-	mov	rax, ffi_closure_win64_inner
-	call	rax		; call the real closure function
-	add	rsp, 40
-	movd	xmm0, rax	; If the closure returned a float,
-				; ffi_closure_win64_inner wrote it to rax
-	ret	0
-ffi_closure_win64 ENDP
-
-ffi_call_win64 PROC FRAME
-	;; copy registers onto stack
-	mov	QWORD PTR [rsp+32], r9
-	mov	QWORD PTR [rsp+24], r8
-	mov	QWORD PTR [rsp+16], rdx
-	mov	QWORD PTR [rsp+8], rcx
-	.PUSHREG rbp
-	push	rbp
-	.ALLOCSTACK 48
-	sub	rsp, 48					; 00000030H
-	.SETFRAME rbp, 32
-	lea	rbp, QWORD PTR [rsp+32]
-	.ENDPROLOG
-
-	mov	eax, DWORD PTR CIF_BYTES[rbp]
-	add	rax, 15
-	and	rax, -16
-	call	__chkstk
-	sub	rsp, rax
-	lea	rax, QWORD PTR [rsp+32]
-	mov	QWORD PTR STACK[rbp], rax
-
-	mov	rdx, QWORD PTR ECIF[rbp]
-	mov	rcx, QWORD PTR STACK[rbp]
-	call	QWORD PTR PREP_ARGS_FN[rbp]
-
-	mov	rsp, QWORD PTR STACK[rbp]
-
-	movlpd	xmm3, QWORD PTR [rsp+24]
-	movd	r9, xmm3
-
-	movlpd	xmm2, QWORD PTR [rsp+16]
-	movd	r8, xmm2
-
-	movlpd	xmm1, QWORD PTR [rsp+8]
-	movd	rdx, xmm1
-
-	movlpd	xmm0, QWORD PTR [rsp]
-	movd	rcx, xmm0
-
-	call	QWORD PTR FN[rbp]
-ret_struct4b$:
- 	cmp	DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_SMALL_STRUCT_4B
- 	jne	ret_struct2b$
-
-	mov	rcx, QWORD PTR RVALUE[rbp]
-	mov	DWORD PTR [rcx], eax
-	jmp	ret_void$
-
-ret_struct2b$:
- 	cmp	DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_SMALL_STRUCT_2B
- 	jne	ret_struct1b$
-
-	mov	rcx, QWORD PTR RVALUE[rbp]
-	mov	WORD PTR [rcx], ax
-	jmp	ret_void$
-
-ret_struct1b$:
- 	cmp	DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_SMALL_STRUCT_1B
- 	jne	ret_uint8$
-
-	mov	rcx, QWORD PTR RVALUE[rbp]
-	mov	BYTE PTR [rcx], al
-	jmp	ret_void$
-
-ret_uint8$:
- 	cmp	DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_UINT8
- 	jne	ret_sint8$
-
-	mov	rcx, QWORD PTR RVALUE[rbp]
-	movzx   rax, al
-	mov	QWORD PTR [rcx], rax
-	jmp	ret_void$
-
-ret_sint8$:
- 	cmp	DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_SINT8
- 	jne	ret_uint16$
-
-	mov	rcx, QWORD PTR RVALUE[rbp]
-	movsx   rax, al
-	mov	QWORD PTR [rcx], rax
-	jmp	ret_void$
-
-ret_uint16$:
- 	cmp	DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_UINT16
- 	jne	ret_sint16$
-
-	mov	rcx, QWORD PTR RVALUE[rbp]
-	movzx   rax, ax
-	mov	QWORD PTR [rcx], rax
-	jmp	SHORT ret_void$
-
-ret_sint16$:
- 	cmp	DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_SINT16
- 	jne	ret_uint32$
-
-	mov	rcx, QWORD PTR RVALUE[rbp]
-	movsx   rax, ax
-	mov	QWORD PTR [rcx], rax
-	jmp	SHORT ret_void$
-
-ret_uint32$:
- 	cmp	DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_UINT32
- 	jne	ret_sint32$
-
-	mov	rcx, QWORD PTR RVALUE[rbp]
-	mov     eax, eax
-	mov	QWORD PTR [rcx], rax
-	jmp	SHORT ret_void$
-
-ret_sint32$:
- 	cmp	DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_SINT32
- 	jne	ret_float$
-
-	mov	rcx, QWORD PTR RVALUE[rbp]
-	cdqe
-	mov	QWORD PTR [rcx], rax
-	jmp	SHORT ret_void$
-
-ret_float$:
- 	cmp	DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_FLOAT
- 	jne	SHORT ret_double$
-
- 	mov	rax, QWORD PTR RVALUE[rbp]
- 	movss	DWORD PTR [rax], xmm0
- 	jmp	SHORT ret_void$
-
-ret_double$:
- 	cmp	DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_DOUBLE
- 	jne	SHORT ret_uint64$
-
- 	mov	rax, QWORD PTR RVALUE[rbp]
- 	movlpd	QWORD PTR [rax], xmm0
- 	jmp	SHORT ret_void$
-
-ret_uint64$:
-  	cmp	DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_UINT64
-  	jne	SHORT ret_sint64$
-
- 	mov	rcx, QWORD PTR RVALUE[rbp]
- 	mov	QWORD PTR [rcx], rax
- 	jmp	SHORT ret_void$
-
-ret_sint64$:
-  	cmp	DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_SINT64
-  	jne	SHORT ret_pointer$
-
- 	mov	rcx, QWORD PTR RVALUE[rbp]
- 	mov	QWORD PTR [rcx], rax
- 	jmp	SHORT ret_void$
-
-ret_pointer$:
-  	cmp	DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_POINTER
-  	jne	SHORT ret_int$
-
- 	mov	rcx, QWORD PTR RVALUE[rbp]
- 	mov	QWORD PTR [rcx], rax
- 	jmp	SHORT ret_void$
-
-ret_int$:
-  	cmp	DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_INT
-  	jne	SHORT ret_void$
-
-	mov	rcx, QWORD PTR RVALUE[rbp]
-	cdqe
-	mov	QWORD PTR [rcx], rax
- 	jmp	SHORT ret_void$
-
-ret_void$:
-	xor	rax, rax
-
-	lea	rsp, QWORD PTR [rbp+16]
-	pop	rbp
-	ret	0
-ffi_call_win64 ENDP
-_TEXT	ENDS
-END
+#if defined(HAVE_AS_CFI_PSEUDO_OP)
+        .cfi_sections   .debug_frame
+#endif
 
-#else
+#define arg0	%rcx
+#define arg1	%rdx
+#define arg2	%r8
+#define arg3	%r9
 
 #ifdef SYMBOL_UNDERSCORE
 #define SYMBOL_NAME(name) _##name
@@ -266,255 +18,202 @@ END
 #define SYMBOL_NAME(name) name
 #endif
 
-.text
-
-.extern SYMBOL_NAME(ffi_closure_win64_inner)
-
-# ffi_closure_win64 will be called with these registers set:
-#    rax points to 'closure'
-#    r11 contains a bit mask that specifies which of the
-#    first four parameters are float or double
-#
-# It must move the parameters passed in registers to their stack location,
-# call ffi_closure_win64_inner for the actual work, then return the result.
-#
-	.balign 16
-	.globl SYMBOL_NAME(ffi_closure_win64)
-	.seh_proc SYMBOL_NAME(ffi_closure_win64)
-SYMBOL_NAME(ffi_closure_win64):
-	# copy register arguments onto stack
-	test	$1,%r11
-	jne	.Lfirst_is_float
-	mov	%rcx, 8(%rsp)
-	jmp	.Lsecond
-.Lfirst_is_float:
-	movlpd	%xmm0, 8(%rsp)
-
-.Lsecond:
-	test	$2, %r11
-	jne	.Lsecond_is_float
-	mov	%rdx, 16(%rsp)
-	jmp	.Lthird
-.Lsecond_is_float:
-	movlpd	%xmm1, 16(%rsp)
-
-.Lthird:
-	test	$4, %r11
-	jne	.Lthird_is_float
-	mov	%r8,24(%rsp)
-	jmp	.Lfourth
-.Lthird_is_float:
-	movlpd	%xmm2, 24(%rsp)
-
-.Lfourth:
-	test	$8, %r11
-	jne	.Lfourth_is_float
-	mov	%r9, 32(%rsp)
-	jmp	.Ldone
-.Lfourth_is_float:
-	movlpd	%xmm3, 32(%rsp)
-
-.Ldone:
-	.seh_stackalloc 40
-	sub	$40, %rsp
+.macro E which
+	.align	8
+	.org	0b + \which * 8
+.endm
+
+	.text
+
+/* ffi_call_win64 (void *stack, struct win64_call_frame *frame, void *r10)
+
+   Bit o trickiness here -- FRAME is the base of the stack frame
+   for this function.  This has been allocated by ffi_call.  We also
+   deallocate some of the stack that has been alloca'd.  */
+
+	.align	8
+	.globl	ffi_call_win64
+
+	.seh_proc ffi_call_win64
+ffi_call_win64:
+	cfi_startproc
+	/* Set up the local stack frame and install it in rbp/rsp.  */
+	movq	(%rsp), %rax
+	movq	%rbp, (arg1)
+	movq	%rax, 8(arg1)
+	movq	arg1, %rbp
+	cfi_def_cfa(%rbp, 16)
+	cfi_rel_offset(%rbp, 0)
+	.seh_pushreg %rbp
+	.seh_setframe %rbp, 0
 	.seh_endprologue
-	mov	%rax, %rcx	# context is first parameter
-	mov	%rsp, %rdx	# stack is second parameter
-	add	$48, %rdx	# point to start of arguments
-	leaq	SYMBOL_NAME(ffi_closure_win64_inner)(%rip), %rax
-	callq	*%rax		# call the real closure function
-	add	$40, %rsp
-	movq	%rax, %xmm0	# If the closure returned a float,
-				# ffi_closure_win64_inner wrote it to rax
-	retq
+	movq	arg0, %rsp
+
+	movq	arg2, %r10
+
+	/* Load all slots into both general and xmm registers.  */
+	movq	(%rsp), %rcx
+	movsd	(%rsp), %xmm0
+	movq	8(%rsp), %rdx
+	movsd	8(%rsp), %xmm1
+	movq	16(%rsp), %r8
+	movsd	16(%rsp), %xmm2
+	movq	24(%rsp), %r9
+	movsd	24(%rsp), %xmm3
+
+	call	*16(%rbp)
+
+	movl	24(%rbp), %ecx
+	movq	32(%rbp), %r8
+	leaq	0f(%rip), %r10
+	cmpl	$FFI_TYPE_SMALL_STRUCT_4B, %ecx
+	leaq	(%r10, %rcx, 8), %r10
+	ja	99f
+	jmp	*%r10
+
+/* Below, we're space constrained most of the time.  Thus we eschew the
+   modern "mov, pop, ret" sequence (5 bytes) for "leave, ret" (2 bytes).  */
+.macro epilogue
+	leaveq
+	cfi_remember_state
+	cfi_def_cfa(%rsp, 8)
+	cfi_restore(%rbp)
+	ret
+	cfi_restore_state
+.endm
+
+	.align	8
+0:
+E FFI_TYPE_VOID
+	epilogue
+E FFI_TYPE_INT
+	movslq	%eax, %rax
+	movq	%rax, (%r8)
+	epilogue
+E FFI_TYPE_FLOAT
+	movss	%xmm0, (%r8)
+	epilogue
+E FFI_TYPE_DOUBLE
+	movsd	%xmm0, (%r8)
+	epilogue
+E FFI_TYPE_LONGDOUBLE
+	call	abort
+E FFI_TYPE_UINT8
+	movzbl	%al, %eax
+	movq	%rax, (%r8)
+	epilogue
+E FFI_TYPE_SINT8
+	movsbq	%al, %rax
+	jmp	98f
+E FFI_TYPE_UINT16
+	movzwl	%ax, %eax
+	movq	%rax, (%r8)
+	epilogue
+E FFI_TYPE_SINT16
+	movswq	%ax, %rax
+	jmp	98f
+E FFI_TYPE_UINT32
+	movl	%eax, %eax
+	movq	%rax, (%r8)
+	epilogue
+E FFI_TYPE_SINT32
+	movslq	%eax, %rax
+	movq	%rax, (%r8)
+	epilogue
+E FFI_TYPE_UINT64
+98:	movq	%rax, (%r8)
+	epilogue
+E FFI_TYPE_SINT64
+	movq	%rax, (%r8)
+	epilogue
+E FFI_TYPE_STRUCT
+	epilogue
+E FFI_TYPE_POINTER
+	movq	%rax, (%r8)
+	epilogue
+E FFI_TYPE_COMPLEX
+	call	abort
+E FFI_TYPE_SMALL_STRUCT_1B
+	movb	%al, (%r8)
+	epilogue
+E FFI_TYPE_SMALL_STRUCT_2B
+	movw	%ax, (%r8)
+	epilogue
+E FFI_TYPE_SMALL_STRUCT_4B
+	movl	%eax, (%r8)
+	epilogue
+
+	.align	8
+99:	call	abort
+
+.purgem epilogue
+
+	cfi_endproc
 	.seh_endproc
 
-	.balign 16
-	.globl	SYMBOL_NAME(ffi_call_win64)
-	.seh_proc SYMBOL_NAME(ffi_call_win64)
-SYMBOL_NAME(ffi_call_win64):
-	# copy registers onto stack
-	mov	%r9,32(%rsp)
-	mov	%r8,24(%rsp)
-	mov	%rdx,16(%rsp)
-	mov	%rcx,8(%rsp)
-	.seh_pushreg rbp
-	push	%rbp
-	.seh_stackalloc 48
-	sub	$48,%rsp
-	.seh_setframe rbp, 32
-	lea	32(%rsp),%rbp
-	.seh_endprologue
-
-	mov	CIF_BYTES(%rbp),%eax
-	add	$15, %rax
-	and	$-16, %rax
-	cmpq	$0x1000, %rax
-	jb	Lch_done
-Lch_probe:
-	subq	$0x1000,%rsp
-	orl	$0x0, (%rsp)
-	subq	$0x1000,%rax
-	cmpq	$0x1000,%rax
-	ja	Lch_probe
-Lch_done:
-	subq	%rax, %rsp
-	orl	$0x0, (%rsp)
-	lea	32(%rsp), %rax
-	mov	%rax, STACK(%rbp)
-
-	mov	ECIF(%rbp), %rdx
-	mov	STACK(%rbp), %rcx
-	callq	*PREP_ARGS_FN(%rbp)
-
-	mov	STACK(%rbp), %rsp
-
-	movlpd	24(%rsp), %xmm3
-	movd	%xmm3, %r9
-
-	movlpd	16(%rsp), %xmm2
-	movd	%xmm2, %r8
-
-	movlpd	8(%rsp), %xmm1
-	movd	%xmm1, %rdx
-
-	movlpd	(%rsp), %xmm0
-	movd	%xmm0, %rcx
-
-	callq	*FN(%rbp)
-.Lret_struct4b:
- 	cmpl	$FFI_TYPE_SMALL_STRUCT_4B, CIF_FLAGS(%rbp)
- 	jne .Lret_struct2b
-
-	mov	RVALUE(%rbp), %rcx
-	mov	%eax, (%rcx)
-	jmp	.Lret_void
-
-.Lret_struct2b:
-	cmpl	$FFI_TYPE_SMALL_STRUCT_2B, CIF_FLAGS(%rbp)
-	jne .Lret_struct1b
-
-	mov	RVALUE(%rbp), %rcx
-	mov	%ax, (%rcx)
-	jmp .Lret_void
-
-.Lret_struct1b:
-	cmpl	$FFI_TYPE_SMALL_STRUCT_1B, CIF_FLAGS(%rbp)
-	jne .Lret_uint8
 
-	mov	RVALUE(%rbp), %rcx
-	mov	%al, (%rcx)
-	jmp .Lret_void
-
-.Lret_uint8:
-	cmpl	$FFI_TYPE_UINT8, CIF_FLAGS(%rbp)
-	jne .Lret_sint8
-
-	mov     RVALUE(%rbp), %rcx
-	movzbq  %al, %rax
-	movq    %rax, (%rcx)
-	jmp .Lret_void
-
-.Lret_sint8:
-	cmpl	$FFI_TYPE_SINT8, CIF_FLAGS(%rbp)
-	jne .Lret_uint16
-
-	mov     RVALUE(%rbp), %rcx
-	movsbq  %al, %rax
-	movq    %rax, (%rcx)
-	jmp .Lret_void
-
-.Lret_uint16:
-	cmpl	$FFI_TYPE_UINT16, CIF_FLAGS(%rbp)
-	jne .Lret_sint16
-
-	mov     RVALUE(%rbp), %rcx
-	movzwq  %ax, %rax
-	movq    %rax, (%rcx)
-	jmp .Lret_void
-
-.Lret_sint16:
-	cmpl	$FFI_TYPE_SINT16, CIF_FLAGS(%rbp)
-	jne .Lret_uint32
-
-	mov     RVALUE(%rbp), %rcx
-	movswq  %ax, %rax
-	movq    %rax, (%rcx)
-	jmp .Lret_void
-
-.Lret_uint32:
-	cmpl	$FFI_TYPE_UINT32, CIF_FLAGS(%rbp)
-	jne .Lret_sint32
-
-	mov     RVALUE(%rbp), %rcx
-	movl    %eax, %eax
-	movq    %rax, (%rcx)
-	jmp .Lret_void
-
-.Lret_sint32:
- 	cmpl	$FFI_TYPE_SINT32, CIF_FLAGS(%rbp)
- 	jne	.Lret_float
-
-	mov	RVALUE(%rbp), %rcx
-	cltq
-	movq	%rax, (%rcx)
-	jmp	.Lret_void
-
-.Lret_float:
- 	cmpl	$FFI_TYPE_FLOAT, CIF_FLAGS(%rbp)
- 	jne	.Lret_double
-
- 	mov	RVALUE(%rbp), %rax
- 	movss	%xmm0, (%rax)
- 	jmp	.Lret_void
-
-.Lret_double:
- 	cmpl	$FFI_TYPE_DOUBLE, CIF_FLAGS(%rbp)
- 	jne	.Lret_uint64
-
- 	mov	RVALUE(%rbp), %rax
- 	movlpd	%xmm0, (%rax)
- 	jmp	.Lret_void
-
-.Lret_uint64:
-  	cmpl	$FFI_TYPE_UINT64, CIF_FLAGS(%rbp)
- 	jne	.Lret_sint64
-
- 	mov	RVALUE(%rbp), %rcx
- 	mov	%rax, (%rcx)
- 	jmp	.Lret_void
-
-.Lret_sint64:
-  	cmpl	$FFI_TYPE_SINT64, CIF_FLAGS(%rbp)
-  	jne	.Lret_pointer
-
- 	mov	RVALUE(%rbp), %rcx
- 	mov	%rax, (%rcx)
- 	jmp	.Lret_void
+/* 32 bytes of outgoing register stack space, 8 bytes of alignment,
+   16 bytes of result, 32 bytes of xmm registers.  */
+#define ffi_clo_FS	(32+8+16+32)
+#define ffi_clo_OFF_R	(32+8)
+#define ffi_clo_OFF_X	(32+8+16)
+
+	.align	8
+	.globl	ffi_go_closure_win64
+
+	.seh_proc ffi_go_closure_win64
+ffi_go_closure_win64:
+	cfi_startproc
+	/* Save all integer arguments into the incoming reg stack space.  */
+	movq	arg0, 8(%rsp)
+	movq	arg1, 16(%rsp)
+	movq	arg2, 24(%rsp)
+	movq	arg3, 32(%rsp)
+
+	movq	8(%r10), arg0			/* load cif */
+	movq	16(%r10), arg1			/* load fun */
+	movq	%r10, arg2			/* closure is user_data */
+	jmp	0f
+	cfi_endproc
+	.seh_endproc
 
-.Lret_pointer:
-  	cmpl	$FFI_TYPE_POINTER, CIF_FLAGS(%rbp)
-  	jne	.Lret_int
+	.align	8
+	.globl	ffi_closure_win64
+
+	.seh_proc ffi_closure_win64
+ffi_closure_win64:
+	cfi_startproc
+	/* Save all integer arguments into the incoming reg stack space.  */
+	movq	arg0, 8(%rsp)
+	movq	arg1, 16(%rsp)
+	movq	arg2, 24(%rsp)
+	movq	arg3, 32(%rsp)
+
+	movq	FFI_TRAMPOLINE_SIZE(%r10), arg0		/* load cif */
+	movq	FFI_TRAMPOLINE_SIZE+8(%r10), arg1	/* load fun */
+	movq	FFI_TRAMPOLINE_SIZE+16(%r10), arg2	/* load user_data */
+0:
+	subq	$ffi_clo_FS, %rsp
+	cfi_adjust_cfa_offset(ffi_clo_FS)
+	.seh_stackalloc ffi_clo_FS
+	.seh_endprologue
 
- 	mov	RVALUE(%rbp), %rcx
- 	mov	%rax, (%rcx)
- 	jmp	.Lret_void
+	/* Save all sse arguments into the stack frame.  */
+	movsd	%xmm0, ffi_clo_OFF_X(%rsp)
+	movsd	%xmm1, ffi_clo_OFF_X+8(%rsp)
+	movsd	%xmm2, ffi_clo_OFF_X+16(%rsp)
+	movsd	%xmm3, ffi_clo_OFF_X+24(%rsp)
 
-.Lret_int:
-  	cmpl	$FFI_TYPE_INT, CIF_FLAGS(%rbp)
-  	jne	.Lret_void
+	leaq	ffi_clo_OFF_R(%rsp), arg3
+	call	ffi_closure_win64_inner
 
-	mov	RVALUE(%rbp), %rcx
-	cltq
-	movq	%rax, (%rcx)
-	jmp	.Lret_void
+	/* Load the result into both possible result registers.  */
+	movq    ffi_clo_OFF_R(%rsp), %rax
+	movsd   ffi_clo_OFF_R(%rsp), %xmm0
 
-.Lret_void:
-	xor	%rax, %rax
+	addq	$ffi_clo_FS, %rsp
+	cfi_adjust_cfa_offset(-ffi_clo_FS)
+	ret
 
-	lea	16(%rbp), %rsp
-	pop	%rbp
-	retq
+	cfi_endproc
 	.seh_endproc
-#endif /* !_MSC_VER */
-
diff --git a/testsuite/libffi.call/call.exp b/testsuite/libffi.call/call.exp
index 5177f07..55de25c 100644
--- a/testsuite/libffi.call/call.exp
+++ b/testsuite/libffi.call/call.exp
@@ -24,16 +24,15 @@ set ctlist [lsearch -inline -all -glob [lsort [glob -nocomplain -- $srcdir/$subd
 
 run-many-tests $tlist ""
 
-if { ![istarget s390*] } {
-
+# ??? We really should preprocess ffi.h and grep
+# for FFI_TARGET_HAS_COMPLEX_TYPE.
+if { [istarget s390*]
+     || [istarget x86_64*] } {
+  run-many-tests $ctlist ""
+} else {
     foreach test $ctlist {
 	unsupported "$test"
     }
-
-} else {
-
-  run-many-tests $ctlist ""
-
 }
 
 dg-finish
diff --git a/testsuite/libffi.call/cls_align_longdouble_split.c b/testsuite/libffi.call/cls_align_longdouble_split.c
index 15f9365..cc1c43b 100644
--- a/testsuite/libffi.call/cls_align_longdouble_split.c
+++ b/testsuite/libffi.call/cls_align_longdouble_split.c
@@ -4,10 +4,8 @@
    PR:		none.
    Originator:	<hos@tamanegi.org> 20031203	 */
 
-/* { dg-excess-errors "no long double format" { xfail x86_64-*-mingw* x86_64-*-cygwin* } } */
 /* { dg-do run { xfail strongarm*-*-* xscale*-*-* } } */
 /* { dg-options -mlong-double-128 { target powerpc64*-*-linux* } } */
-/* { dg-output "" { xfail x86_64-*-mingw* x86_64-*-cygwin* } } */
 
 #include "ffitest.h"
 
diff --git a/testsuite/libffi.call/cls_align_longdouble_split2.c b/testsuite/libffi.call/cls_align_longdouble_split2.c
index ca1c356..5d3bec0 100644
--- a/testsuite/libffi.call/cls_align_longdouble_split2.c
+++ b/testsuite/libffi.call/cls_align_longdouble_split2.c
@@ -5,10 +5,8 @@
 	Originator:		Blake Chaffin	6/18/2007
 */
 
-/* { dg-excess-errors "no long double format" { xfail x86_64-*-mingw* x86_64-*-cygwin* } } */
 /* { dg-do run { xfail strongarm*-*-* } } */
 /* { dg-options -mlong-double-128 { target powerpc64*-*-linux* } } */
-/* { dg-output "" { xfail x86_64-*-mingw* x86_64-*-cygwin* } } */
 
 #include "ffitest.h"
 
diff --git a/testsuite/libffi.call/cls_longdouble.c b/testsuite/libffi.call/cls_longdouble.c
index 5dc9ac7..d24e72e 100644
--- a/testsuite/libffi.call/cls_longdouble.c
+++ b/testsuite/libffi.call/cls_longdouble.c
@@ -4,12 +4,10 @@
    PR:			none.
    Originator:	Blake Chaffin	*/
 
-/* { dg-excess-errors "no long double format" { xfail x86_64-*-mingw* x86_64-*-cygwin* } } */
 /* This test is known to PASS on armv7l-unknown-linux-gnueabihf, so I have
    remove the xfail for arm*-*-* below, until we know more.  */
 /* { dg-do run { xfail strongarm*-*-* xscale*-*-* } } */
 /* { dg-options -mlong-double-128 { target powerpc64*-*-linux* } } */
-/* { dg-output "" { xfail x86_64-*-mingw* x86_64-*-cygwin* } } */
 
 #include "ffitest.h"
 
diff --git a/testsuite/libffi.call/float2.c b/testsuite/libffi.call/float2.c
index a0b296c..aae1abf 100644
--- a/testsuite/libffi.call/float2.c
+++ b/testsuite/libffi.call/float2.c
@@ -4,9 +4,6 @@
    PR:		none.
    Originator:	From the original ffitest.c  */
 
-/* { dg-excess-errors "fails" { target x86_64-*-mingw* x86_64-*-cygwin* } } */
-/* { dg-do run { xfail x86_64-*-mingw* x86_64-*-cygwin* } } */
-
 #include "ffitest.h"
 #include "float.h"
 
diff --git a/testsuite/libffi.call/huge_struct.c b/testsuite/libffi.call/huge_struct.c
index 657fe54..187c42c 100644
--- a/testsuite/libffi.call/huge_struct.c
+++ b/testsuite/libffi.call/huge_struct.c
@@ -5,11 +5,9 @@
 	Originator:		Blake Chaffin	6/18/2007
 */
 
-/* { dg-excess-errors "" { target x86_64-*-mingw* x86_64-*-cygwin* } } */
 /* { dg-do run { xfail strongarm*-*-* xscale*-*-* } } */
 /* { dg-options -mlong-double-128 { target powerpc64*-*-linux* } } */
 /* { dg-options -Wformat=0 { target moxie*-*-elf } } */
-/* { dg-output "" { xfail x86_64-*-mingw* x86_64-*-cygwin* } } */
 
 #include "ffitest.h"
 
diff --git a/testsuite/libffi.call/return_ldl.c b/testsuite/libffi.call/return_ldl.c
index 5c2fe65..520e710 100644
--- a/testsuite/libffi.call/return_ldl.c
+++ b/testsuite/libffi.call/return_ldl.c
@@ -4,7 +4,6 @@
    PR:		none.
    Originator:	<andreast@gcc.gnu.org> 20071113  */
 
-/* { dg-do run { xfail x86_64-*-mingw* x86_64-*-cygwin* } } */
 #include "ffitest.h"
 
 static long double return_ldl(long double ldl)
-- 
1.9.3