public inbox for libffi-discuss@sourceware.org
 help / color / mirror / Atom feed
* [PATCH 0/8] Go closures for x86_64
@ 2014-10-28 18:32 Richard Henderson
  2014-10-28 18:32 ` [PATCH 1/8] Add entry points for interacting with Go Richard Henderson
                   ` (7 more replies)
  0 siblings, 8 replies; 9+ messages in thread
From: Richard Henderson @ 2014-10-28 18:32 UTC (permalink / raw)
  To: libffi-discuss

The first in a series of patch sets to implement the Go interfaces
I proposed a week or so ago, but mostly split into smaller pieces.
These patches will also include support for complex types as I go,
since Go needs them, and performing both updates at the same time
will make things easier.

This first set includes x86_64-linux and x86_64-cygwin.

It leaves x86_64-darwin in a broken state, but I'm unsure why it
has a completely different assembly file, rather than merely using
the preprocessor to handle the __USER_LABEL_PREFIX__.  I'm hoping
that someone who can test darwin can do this update.

The existing win64 support was poor, having a significant number
of failures (and some xfails to cover that up).  IMO the biggest
problem there was trying to combine it with the 32-bit targets.
The ABI is significantly different, and the amount of ifdefs
needed to force it in was ... ugly.

It leaves the Visual Studio build broken, in that I totally
rewrote all of the win64.S assembly, but did not keep the Microsoft
assembly.  Hopefully someone who cares about VS can handle that.

This patch set is available at

  git://github.com/rth7680/libffi.git go/x86


r~


Richard Henderson (8):
  Add entry points for interacting with Go
  Add ffi_cfi.h
  x86-64: Support go closures
  win64: Rewrite
  win64: Remove support from ffi.c
  x86_64: Fixups for x32
  x86_64: Decouple return types from FFI_TYPE constants
  x86_64: Add support for complex types

 Makefile.am                                        |   4 +-
 include/ffi.h.in                                   |  16 +
 include/ffi_cfi.h                                  |  53 ++
 src/x86/ffi.c                                      | 212 +------
 src/x86/ffi64.c                                    | 327 +++++++---
 src/x86/ffitarget.h                                |  29 +-
 src/x86/ffiw64.c                                   | 281 +++++++++
 src/x86/internal64.h                               |  22 +
 src/x86/unix64.S                                   | 552 ++++++++--------
 src/x86/win64.S                                    | 693 ++++++---------------
 testsuite/libffi.call/call.exp                     |  13 +-
 testsuite/libffi.call/cls_align_longdouble_split.c |   2 -
 .../libffi.call/cls_align_longdouble_split2.c      |   2 -
 testsuite/libffi.call/cls_longdouble.c             |   2 -
 testsuite/libffi.call/float2.c                     |   3 -
 testsuite/libffi.call/huge_struct.c                |   2 -
 testsuite/libffi.call/return_ldl.c                 |   1 -
 17 files changed, 1088 insertions(+), 1126 deletions(-)
 create mode 100644 include/ffi_cfi.h
 create mode 100644 src/x86/ffiw64.c
 create mode 100644 src/x86/internal64.h

-- 
1.9.3

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH 1/8] Add entry points for interacting with Go
  2014-10-28 18:32 [PATCH 0/8] Go closures for x86_64 Richard Henderson
@ 2014-10-28 18:32 ` Richard Henderson
  2014-10-28 18:32 ` [PATCH 3/8] x86-64: Support go closures Richard Henderson
                   ` (6 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: Richard Henderson @ 2014-10-28 18:32 UTC (permalink / raw)
  To: libffi-discuss

A "ffi_go_closure" is intended to be compatible with the
function descriptors used by Go, and ffi_call_go sets up
the static chain parameter for calling a Go function.

The entry points are disabled when a backend has not been
updated, much like we do for "normal" closures.
---
 include/ffi.h.in | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/include/ffi.h.in b/include/ffi.h.in
index f403ae0..c43d52f 100644
--- a/include/ffi.h.in
+++ b/include/ffi.h.in
@@ -428,6 +428,22 @@ ffi_prep_java_raw_closure_loc (ffi_java_raw_closure*,
 
 #endif /* FFI_CLOSURES */
 
+#if FFI_GO_CLOSURES
+
+typedef struct {
+  void      *tramp;
+  ffi_cif   *cif;
+  void     (*fun)(ffi_cif*,void*,void**,void*);
+} ffi_go_closure;
+
+ffi_status ffi_prep_go_closure (ffi_go_closure*, ffi_cif *,
+				void (*fun)(ffi_cif*,void*,void**,void*));
+
+void ffi_call_go (ffi_cif *cif, void (*fn)(void), void *rvalue,
+		  void **avalue, void *closure);
+
+#endif /* FFI_GO_CLOSURES */
+
 /* ---- Public interface definition -------------------------------------- */
 
 ffi_status ffi_prep_cif(ffi_cif *cif,
-- 
1.9.3

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH 3/8] x86-64: Support go closures
  2014-10-28 18:32 [PATCH 0/8] Go closures for x86_64 Richard Henderson
  2014-10-28 18:32 ` [PATCH 1/8] Add entry points for interacting with Go Richard Henderson
@ 2014-10-28 18:32 ` Richard Henderson
  2014-10-28 18:32 ` [PATCH 7/8] x86_64: Decouple return types from FFI_TYPE constants Richard Henderson
                   ` (5 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: Richard Henderson @ 2014-10-28 18:32 UTC (permalink / raw)
  To: libffi-discuss

Dump all of the hand-coded unwind info in favor of gas-generated CFI.
Move jump table data into .rodata.  Adjust ffi_call_unix64 to load the
static chain.  Split out sse portions of ffi_closure_unix64 to
ffi_closure_unix64_sse rather than test cif->flags at runtime.
---
 src/x86/ffi64.c     | 103 ++++++++++++-----
 src/x86/ffitarget.h |   2 +
 src/x86/unix64.S    | 323 +++++++++++++++++++++++++---------------------------
 3 files changed, 230 insertions(+), 198 deletions(-)

diff --git a/src/x86/ffi64.c b/src/x86/ffi64.c
index 5a5e043..384a93a 100644
--- a/src/x86/ffi64.c
+++ b/src/x86/ffi64.c
@@ -32,6 +32,7 @@
 
 #include <stdlib.h>
 #include <stdarg.h>
+#include <stdint.h>
 
 #ifdef __x86_64__
 
@@ -62,10 +63,12 @@ struct register_args
   /* Registers for argument passing.  */
   UINT64 gpr[MAX_GPR_REGS];
   union big_int_union sse[MAX_SSE_REGS];
+  UINT64 rax;	/* ssecount */
+  UINT64 r10;	/* static chain */
 };
 
 extern void ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags,
-			     void *raddr, void (*fnaddr)(void), unsigned ssecount);
+			     void *raddr, void (*fnaddr)(void)) FFI_HIDDEN;
 
 /* All reference to register classes here is identical to the code in
    gcc/config/i386/i386.c. Do *not* change one without the other.  */
@@ -358,6 +361,9 @@ ffi_prep_cif_machdep (ffi_cif *cif)
   enum x86_64_reg_class classes[MAX_CLASSES];
   size_t bytes, n;
 
+  if (cif->abi != FFI_UNIX64)
+    return FFI_BAD_ABI;
+
   gprcount = ssecount = 0;
 
   flags = cif->rtype->type;
@@ -419,8 +425,9 @@ ffi_prep_cif_machdep (ffi_cif *cif)
   return FFI_OK;
 }
 
-void
-ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
+static void
+ffi_call_int (ffi_cif *cif, void (*fn)(void), void *rvalue,
+	      void **avalue, void *closure)
 {
   enum x86_64_reg_class classes[MAX_CLASSES];
   char *stack, *argp;
@@ -445,6 +452,8 @@ ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
   reg_args = (struct register_args *) stack;
   argp = stack + sizeof (struct register_args);
 
+  reg_args->r10 = (uintptr_t) closure;
+
   gprcount = ssecount = 0;
 
   /* If the return value is passed in memory, add the pointer as the
@@ -521,13 +530,27 @@ ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
 	    }
 	}
     }
+  reg_args->rax = ssecount;
 
   ffi_call_unix64 (stack, cif->bytes + sizeof (struct register_args),
-		   cif->flags, rvalue, fn, ssecount);
+		   cif->flags, rvalue, fn);
 }
 
+void
+ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
+{
+  ffi_call_int (cif, fn, rvalue, avalue, NULL);
+}
+
+void
+ffi_call_go (ffi_cif *cif, void (*fn)(void), void *rvalue,
+	     void **avalue, void *closure)
+{
+  ffi_call_int (cif, fn, rvalue, avalue, closure);
+}
 
-extern void ffi_closure_unix64(void);
+extern void ffi_closure_unix64(void) FFI_HIDDEN;
+extern void ffi_closure_unix64_sse(void) FFI_HIDDEN;
 
 ffi_status
 ffi_prep_closure_loc (ffi_closure* closure,
@@ -536,29 +559,26 @@ ffi_prep_closure_loc (ffi_closure* closure,
 		      void *user_data,
 		      void *codeloc)
 {
-  volatile unsigned short *tramp;
-
-  /* Sanity check on the cif ABI.  */
-  {
-    int abi = cif->abi;
-    if (UNLIKELY (! (abi > FFI_FIRST_ABI && abi < FFI_LAST_ABI)))
-      return FFI_BAD_ABI;
-  }
-
-  tramp = (volatile unsigned short *) &closure->tramp[0];
+  static const unsigned char trampoline[16] = {
+    /* leaq  -0x7(%rip),%r10   # 0x0  */
+    0x4c, 0x8d, 0x15, 0xf9, 0xff, 0xff, 0xff,
+    /* jmpq  *0x3(%rip)        # 0x10 */
+    0xff, 0x25, 0x03, 0x00, 0x00, 0x00,
+    /* nopl  (%rax) */
+    0x0f, 0x1f, 0x00
+  };
+  void (*dest)(void);
 
-  tramp[0] = 0xbb49;		/* mov <code>, %r11	*/
-  *((unsigned long long * volatile) &tramp[1])
-    = (unsigned long) ffi_closure_unix64;
-  tramp[5] = 0xba49;		/* mov <data>, %r10	*/
-  *((unsigned long long * volatile) &tramp[6])
-    = (unsigned long) codeloc;
+  if (cif->abi != FFI_UNIX64)
+    return FFI_BAD_ABI;
 
-  /* Set the carry bit iff the function uses any sse registers.
-     This is clc or stc, together with the first byte of the jmp.  */
-  tramp[10] = cif->flags & (1 << 11) ? 0x49f9 : 0x49f8;
+  if (cif->flags & (1 << 11))
+    dest = ffi_closure_unix64_sse;
+  else
+    dest = ffi_closure_unix64;
 
-  tramp[11] = 0xe3ff;			/* jmp *%r11    */
+  memcpy (closure->tramp, trampoline, sizeof(trampoline));
+  *(UINT64 *)(closure->tramp + 16) = (uintptr_t)dest;
 
   closure->cif = cif;
   closure->fun = fun;
@@ -567,18 +587,20 @@ ffi_prep_closure_loc (ffi_closure* closure,
   return FFI_OK;
 }
 
-int
-ffi_closure_unix64_inner(ffi_closure *closure, void *rvalue,
-			 struct register_args *reg_args, char *argp)
+int FFI_HIDDEN
+ffi_closure_unix64_inner(ffi_cif *cif,
+			 void (*fun)(ffi_cif*, void*, void**, void*),
+			 void *user_data,
+			 void *rvalue,
+			 struct register_args *reg_args,
+			 char *argp)
 {
-  ffi_cif *cif;
   void **avalue;
   ffi_type **arg_types;
   long i, avn;
   int gprcount, ssecount, ngpr, nsse;
   int ret;
 
-  cif = closure->cif;
   avalue = alloca(cif->nargs * sizeof(void *));
   gprcount = ssecount = 0;
 
@@ -667,10 +689,29 @@ ffi_closure_unix64_inner(ffi_closure *closure, void *rvalue,
     }
 
   /* Invoke the closure.  */
-  closure->fun (cif, rvalue, avalue, closure->user_data);
+  fun (cif, rvalue, avalue, user_data);
 
   /* Tell assembly how to perform return type promotions.  */
   return ret;
 }
 
+extern void ffi_go_closure_unix64(void) FFI_HIDDEN;
+extern void ffi_go_closure_unix64_sse(void) FFI_HIDDEN;
+
+ffi_status
+ffi_prep_go_closure (ffi_go_closure* closure, ffi_cif* cif,
+		     void (*fun)(ffi_cif*, void*, void**, void*))
+{
+  if (cif->abi != FFI_UNIX64)
+    return FFI_BAD_ABI;
+
+  closure->tramp = (cif->flags & (1 << 11)
+		    ? ffi_go_closure_unix64_sse
+		    : ffi_go_closure_unix64);
+  closure->cif = cif;
+  closure->fun = fun;
+
+  return FFI_OK;
+}
+
 #endif /* __x86_64__ */
diff --git a/src/x86/ffitarget.h b/src/x86/ffitarget.h
index a236677..0d295e0 100644
--- a/src/x86/ffitarget.h
+++ b/src/x86/ffitarget.h
@@ -121,6 +121,7 @@ typedef enum ffi_abi {
 /* ---- Definitions for closures ----------------------------------------- */
 
 #define FFI_CLOSURES 1
+
 #define FFI_TYPE_SMALL_STRUCT_1B (FFI_TYPE_LAST + 1)
 #define FFI_TYPE_SMALL_STRUCT_2B (FFI_TYPE_LAST + 2)
 #define FFI_TYPE_SMALL_STRUCT_4B (FFI_TYPE_LAST + 3)
@@ -129,6 +130,7 @@ typedef enum ffi_abi {
 #if defined (X86_64) || (defined (__x86_64__) && defined (X86_DARWIN))
 #define FFI_TRAMPOLINE_SIZE 24
 #define FFI_NATIVE_RAW_API 0
+#define FFI_GO_CLOSURES 1
 #else
 #ifdef X86_WIN32
 #define FFI_TRAMPOLINE_SIZE 52
diff --git a/src/x86/unix64.S b/src/x86/unix64.S
index dcd6bc7..134cb3d 100644
--- a/src/x86/unix64.S
+++ b/src/x86/unix64.S
@@ -30,6 +30,7 @@
 #define LIBFFI_ASM	
 #include <fficonfig.h>
 #include <ffi.h>
+#include <ffi_cfi.h>
 
 .text
 
@@ -43,9 +44,10 @@
 	.align	2
 	.globl	ffi_call_unix64
 	.type	ffi_call_unix64,@function
+	FFI_HIDDEN(ffi_call_unix64)
 
 ffi_call_unix64:
-.LUW0:
+	cfi_startproc
 	movq	(%rsp), %r10		/* Load return address.  */
 	leaq	(%rdi, %rsi), %rax	/* Find local stack base.  */
 	movq	%rdx, (%rax)		/* Save flags.  */
@@ -53,24 +55,36 @@ ffi_call_unix64:
 	movq	%rbp, 16(%rax)		/* Save old frame pointer.  */
 	movq	%r10, 24(%rax)		/* Relocate return address.  */
 	movq	%rax, %rbp		/* Finalize local stack frame.  */
-.LUW1:
+
+	/* New stack frame based off rbp.  This is a itty bit of unwind
+	   trickery in that the CFA *has* changed.  There is no easy way
+	   to describe it correctly on entry to the function.  Fortunately,
+	   it doesn't matter too much since at all points we can correctly
+	   unwind back to ffi_call.  Note that the location to which we
+	   moved the return address is (the new) CFA-8, so from the
+	   perspective of the unwind info, it hasn't moved.  */
+	cfi_def_cfa(%rbp, 32)
+	cfi_rel_offset(%rbp, 16)
+
 	movq	%rdi, %r10		/* Save a copy of the register area. */
 	movq	%r8, %r11		/* Save a copy of the target fn.  */
 	movl	%r9d, %eax		/* Set number of SSE registers.  */
 
 	/* Load up all argument registers.  */
 	movq	(%r10), %rdi
-	movq	8(%r10), %rsi
-	movq	16(%r10), %rdx
-	movq	24(%r10), %rcx
-	movq	32(%r10), %r8
-	movq	40(%r10), %r9
+	movq	0x08(%r10), %rsi
+	movq	0x10(%r10), %rdx
+	movq	0x18(%r10), %rcx
+	movq	0x20(%r10), %r8
+	movq	0x28(%r10), %r9
+	movl	0xb0(%r10), %eax
 	testl	%eax, %eax
 	jnz	.Lload_sse
 .Lret_from_load_sse:
 
-	/* Deallocate the reg arg area.  */
-	leaq	176(%r10), %rsp
+	/* Deallocate the reg arg area, except for r10, then load via pop.  */
+	leaq	0xb8(%r10), %rsp
+	popq	%r10
 
 	/* Call the user function.  */
 	call	*%r11
@@ -81,7 +95,9 @@ ffi_call_unix64:
 	movq	0(%rbp), %rcx		/* Reload flags.  */
 	movq	8(%rbp), %rdi		/* Reload raddr.  */
 	movq	16(%rbp), %rbp		/* Reload old frame pointer.  */
-.LUW2:
+	cfi_remember_state
+	cfi_def_cfa(%rsp, 8)
+	cfi_restore(%rbp)
 
 	/* The first byte of the flags contains the FFI_TYPE.  */
 	movzbl	%cl, %r10d
@@ -90,6 +106,8 @@ ffi_call_unix64:
 	addq	%r11, %r10
 	jmp	*%r10
 
+	.section .rodata
+	.align	2
 .Lstore_table:
 	.long	.Lst_void-.Lstore_table		/* FFI_TYPE_VOID */
 	.long	.Lst_sint32-.Lstore_table	/* FFI_TYPE_INT */
@@ -106,6 +124,7 @@ ffi_call_unix64:
 	.long	.Lst_int64-.Lstore_table	/* FFI_TYPE_SINT64 */
 	.long	.Lst_struct-.Lstore_table	/* FFI_TYPE_STRUCT */
 	.long	.Lst_int64-.Lstore_table	/* FFI_TYPE_POINTER */
+	.previous
 
 	.align 2
 .Lst_void:
@@ -188,49 +207,83 @@ ffi_call_unix64:
 	   It's not worth an indirect jump to load the exact set of
 	   SSE registers needed; zero or all is a good compromise.  */
 	.align 2
-.LUW3:
+	cfi_restore_state
 .Lload_sse:
-	movdqa	48(%r10), %xmm0
-	movdqa	64(%r10), %xmm1
-	movdqa	80(%r10), %xmm2
-	movdqa	96(%r10), %xmm3
-	movdqa	112(%r10), %xmm4
-	movdqa	128(%r10), %xmm5
-	movdqa	144(%r10), %xmm6
-	movdqa	160(%r10), %xmm7
+	movdqa	0x30(%r10), %xmm0
+	movdqa	0x40(%r10), %xmm1
+	movdqa	0x50(%r10), %xmm2
+	movdqa	0x60(%r10), %xmm3
+	movdqa	0x70(%r10), %xmm4
+	movdqa	0x80(%r10), %xmm5
+	movdqa	0x90(%r10), %xmm6
+	movdqa	0xa0(%r10), %xmm7
 	jmp	.Lret_from_load_sse
 
-.LUW4:
+	cfi_endproc
 	.size    ffi_call_unix64,.-ffi_call_unix64
 
+/* 6 general registers, 8 vector registers,
+   16 bytes of rvalue, 8 bytes of alignment.  */
+#define ffi_closure_OFS_G	0
+#define ffi_closure_OFS_V	(6*8)
+#define ffi_closure_OFS_RVALUE	(ffi_closure_OFS_V + 8*16)
+#define ffi_closure_FS		(ffi_closure_OFS_RVALUE + 16 + 8)
+
+/* The location of rvalue within the red zone after deallocating the frame.  */
+#define ffi_closure_RED_RVALUE	(ffi_closure_OFS_RVALUE - ffi_closure_FS)
+
+	.align	2
+	.globl	ffi_closure_unix64_sse
+	.type	ffi_closure_unix64_sse,@function
+	FFI_HIDDEN(ffi_closure_unix64_sse)
+
+ffi_closure_unix64_sse:
+	cfi_startproc
+	subq	$ffi_closure_FS, %rsp
+	cfi_adjust_cfa_offset(ffi_closure_FS)
+
+	movdqa	%xmm0, ffi_closure_OFS_V+0x00(%rsp)
+	movdqa	%xmm1, ffi_closure_OFS_V+0x10(%rsp)
+	movdqa	%xmm2, ffi_closure_OFS_V+0x20(%rsp)
+	movdqa	%xmm3, ffi_closure_OFS_V+0x30(%rsp)
+	movdqa	%xmm4, ffi_closure_OFS_V+0x40(%rsp)
+	movdqa	%xmm5, ffi_closure_OFS_V+0x50(%rsp)
+	movdqa	%xmm6, ffi_closure_OFS_V+0x60(%rsp)
+	movdqa	%xmm7, ffi_closure_OFS_V+0x70(%rsp)
+	jmp	0f
+
+	cfi_endproc
+	.size	ffi_closure_unix64_sse,.-ffi_closure_unix64_sse
+
 	.align	2
-	.globl ffi_closure_unix64
+	.globl	ffi_closure_unix64
 	.type	ffi_closure_unix64,@function
+	FFI_HIDDEN(ffi_closure_unix64)
 
 ffi_closure_unix64:
-.LUW5:
-	/* The carry flag is set by the trampoline iff SSE registers
-	   are used.  Don't clobber it before the branch instruction.  */
-	leaq    -200(%rsp), %rsp
-.LUW6:
-	movq	%rdi, (%rsp)
-	movq    %rsi, 8(%rsp)
-	movq    %rdx, 16(%rsp)
-	movq    %rcx, 24(%rsp)
-	movq    %r8, 32(%rsp)
-	movq    %r9, 40(%rsp)
-	jc      .Lsave_sse
-.Lret_from_save_sse:
-
-	movq	%r10, %rdi
-	leaq	176(%rsp), %rsi
-	movq	%rsp, %rdx
-	leaq	208(%rsp), %rcx
-	call	ffi_closure_unix64_inner@PLT
+	cfi_startproc
+	subq	$ffi_closure_FS, %rsp
+	cfi_adjust_cfa_offset(ffi_closure_FS)
+0:
+	movq	%rdi, ffi_closure_OFS_G+0x00(%rsp)
+	movq    %rsi, ffi_closure_OFS_G+0x08(%rsp)
+	movq    %rdx, ffi_closure_OFS_G+0x10(%rsp)
+	movq    %rcx, ffi_closure_OFS_G+0x18(%rsp)
+	movq    %r8,  ffi_closure_OFS_G+0x20(%rsp)
+	movq    %r9,  ffi_closure_OFS_G+0x28(%rsp)
+
+	movq	24(%r10), %rdi				/* Load cif */
+	movq	32(%r10), %rsi				/* Load fun */
+	movq	40(%r10), %rdx				/* Load user_data */
+.Ldo_closure:
+	leaq	ffi_closure_OFS_RVALUE(%rsp), %rcx	/* Load rvalue */
+	movq	%rsp, %r8				/* Load reg_args */
+	leaq	ffi_closure_FS+8(%rsp), %r9		/* Load argp */
+	call	ffi_closure_unix64_inner
 
 	/* Deallocate stack frame early; return value is now in redzone.  */
-	addq	$200, %rsp
-.LUW7:
+	addq	$ffi_closure_FS, %rsp
+	cfi_adjust_cfa_offset(-ffi_closure_FS)
 
 	/* The first byte of the return value contains the FFI_TYPE.  */
 	movzbl	%al, %r10d
@@ -239,6 +292,8 @@ ffi_closure_unix64:
 	addq	%r11, %r10
 	jmp	*%r10
 
+	.section .rodata
+	.align	2
 .Lload_table:
 	.long	.Lld_void-.Lload_table		/* FFI_TYPE_VOID */
 	.long	.Lld_int32-.Lload_table		/* FFI_TYPE_INT */
@@ -255,6 +310,7 @@ ffi_closure_unix64:
 	.long	.Lld_int64-.Lload_table		/* FFI_TYPE_SINT64 */
 	.long	.Lld_struct-.Lload_table	/* FFI_TYPE_STRUCT */
 	.long	.Lld_int64-.Lload_table		/* FFI_TYPE_POINTER */
+	.previous
 
 	.align 2
 .Lld_void:
@@ -262,32 +318,32 @@ ffi_closure_unix64:
 
 	.align 2
 .Lld_int8:
-	movzbl	-24(%rsp), %eax
+	movzbl	ffi_closure_RED_RVALUE(%rsp), %eax
 	ret
 	.align 2
 .Lld_int16:
-	movzwl	-24(%rsp), %eax
+	movzwl	ffi_closure_RED_RVALUE(%rsp), %eax
 	ret
 	.align 2
 .Lld_int32:
-	movl	-24(%rsp), %eax
+	movl	ffi_closure_RED_RVALUE(%rsp), %eax
 	ret
 	.align 2
 .Lld_int64:
-	movq	-24(%rsp), %rax
+	movq	ffi_closure_RED_RVALUE(%rsp), %rax
 	ret
 
 	.align 2
 .Lld_float:
-	movss	-24(%rsp), %xmm0
+	movss	ffi_closure_RED_RVALUE(%rsp), %xmm0
 	ret
 	.align 2
 .Lld_double:
-	movsd	-24(%rsp), %xmm0
+	movsd	ffi_closure_RED_RVALUE(%rsp), %xmm0
 	ret
 	.align 2
 .Lld_ldouble:
-	fldt	-24(%rsp)
+	fldt	ffi_closure_RED_RVALUE(%rsp)
 	ret
 
 	.align 2
@@ -297,136 +353,69 @@ ffi_closure_unix64:
 	   both rdx and xmm1 with the second word.  For the remaining,
 	   bit 8 set means xmm0 gets the second word, and bit 9 means
 	   that rax gets the second word.  */
-	movq	-24(%rsp), %rcx
-	movq	-16(%rsp), %rdx
-	movq	-16(%rsp), %xmm1
+	movq	ffi_closure_RED_RVALUE(%rsp), %rcx
+	movq	ffi_closure_RED_RVALUE+8(%rsp), %rdx
+	movq	ffi_closure_RED_RVALUE+8(%rsp), %xmm1
 	testl	$0x100, %eax
 	cmovnz	%rdx, %rcx
 	movd	%rcx, %xmm0
 	testl	$0x200, %eax
-	movq	-24(%rsp), %rax
+	movq	ffi_closure_RED_RVALUE(%rsp), %rax
 	cmovnz	%rdx, %rax
 	ret
 
-	/* See the comment above .Lload_sse; the same logic applies here.  */
-	.align 2
-.LUW8:
-.Lsave_sse:
-	movdqa	%xmm0, 48(%rsp)
-	movdqa	%xmm1, 64(%rsp)
-	movdqa	%xmm2, 80(%rsp)
-	movdqa	%xmm3, 96(%rsp)
-	movdqa	%xmm4, 112(%rsp)
-	movdqa	%xmm5, 128(%rsp)
-	movdqa	%xmm6, 144(%rsp)
-	movdqa	%xmm7, 160(%rsp)
-	jmp	.Lret_from_save_sse
-
-.LUW9:
+	cfi_endproc
 	.size	ffi_closure_unix64,.-ffi_closure_unix64
 
-#ifdef __GNUC__
-/* Only emit DWARF unwind info when building with the GNU toolchain.  */
-
-#ifdef HAVE_AS_X86_64_UNWIND_SECTION_TYPE
-	.section	.eh_frame,"a",@unwind
-#else
-	.section	.eh_frame,"a",@progbits
-#endif
-.Lframe1:
-	.long	.LECIE1-.LSCIE1		/* CIE Length */
-.LSCIE1:
-	.long	0			/* CIE Identifier Tag */
-	.byte	1			/* CIE Version */
-	.ascii "zR\0"			/* CIE Augmentation */
-	.uleb128 1			/* CIE Code Alignment Factor */
-	.sleb128 -8			/* CIE Data Alignment Factor */
-	.byte	0x10			/* CIE RA Column */
-	.uleb128 1			/* Augmentation size */
-	.byte	0x1b			/* FDE Encoding (pcrel sdata4) */
-	.byte	0xc			/* DW_CFA_def_cfa, %rsp offset 8 */
-	.uleb128 7
-	.uleb128 8
-	.byte	0x80+16			/* DW_CFA_offset, %rip offset 1*-8 */
-	.uleb128 1
-	.align 8
-.LECIE1:
-.LSFDE1:
-	.long	.LEFDE1-.LASFDE1	/* FDE Length */
-.LASFDE1:
-	.long	.LASFDE1-.Lframe1	/* FDE CIE offset */
-#if HAVE_AS_X86_PCREL
-	.long	.LUW0-.			/* FDE initial location */
-#else
-	.long	.LUW0@rel
-#endif
-	.long	.LUW4-.LUW0		/* FDE address range */
-	.uleb128 0x0			/* Augmentation size */
-
-	.byte	0x4			/* DW_CFA_advance_loc4 */
-	.long	.LUW1-.LUW0
-
-	/* New stack frame based off rbp.  This is a itty bit of unwind
-	   trickery in that the CFA *has* changed.  There is no easy way
-	   to describe it correctly on entry to the function.  Fortunately,
-	   it doesn't matter too much since at all points we can correctly
-	   unwind back to ffi_call.  Note that the location to which we
-	   moved the return address is (the new) CFA-8, so from the
-	   perspective of the unwind info, it hasn't moved.  */
-	.byte	0xc			/* DW_CFA_def_cfa, %rbp offset 32 */
-	.uleb128 6
-	.uleb128 32
-	.byte	0x80+6			/* DW_CFA_offset, %rbp offset 2*-8 */
-	.uleb128 2
-	.byte	0xa			/* DW_CFA_remember_state */
-
-	.byte	0x4			/* DW_CFA_advance_loc4 */
-	.long	.LUW2-.LUW1
-	.byte	0xc			/* DW_CFA_def_cfa, %rsp offset 8 */
-	.uleb128 7
-	.uleb128 8
-	.byte	0xc0+6			/* DW_CFA_restore, %rbp */
-
-	.byte	0x4			/* DW_CFA_advance_loc4 */
-	.long	.LUW3-.LUW2
-	.byte	0xb			/* DW_CFA_restore_state */
-
-	.align 8
-.LEFDE1:
-.LSFDE3:
-	.long	.LEFDE3-.LASFDE3	/* FDE Length */
-.LASFDE3:
-	.long	.LASFDE3-.Lframe1	/* FDE CIE offset */
-#if HAVE_AS_X86_PCREL
-	.long	.LUW5-.			/* FDE initial location */
-#else
-	.long	.LUW5@rel
-#endif
-	.long	.LUW9-.LUW5		/* FDE address range */
-	.uleb128 0x0			/* Augmentation size */
-
-	.byte	0x4			/* DW_CFA_advance_loc4 */
-	.long	.LUW6-.LUW5
-	.byte	0xe			/* DW_CFA_def_cfa_offset */
-	.uleb128 208
-	.byte	0xa			/* DW_CFA_remember_state */
-
-	.byte	0x4			/* DW_CFA_advance_loc4 */
-	.long	.LUW7-.LUW6
-	.byte	0xe			/* DW_CFA_def_cfa_offset */
-	.uleb128 8
-
-	.byte	0x4			/* DW_CFA_advance_loc4 */
-	.long	.LUW8-.LUW7
-	.byte	0xb			/* DW_CFA_restore_state */
+	.align	2
+	.globl	ffi_go_closure_unix64_sse
+	.type	ffi_go_closure_unix64_sse,@function
+	FFI_HIDDEN(ffi_go_closure_unix64_sse)
+
+ffi_go_closure_unix64_sse:
+	cfi_startproc
+	subq	$ffi_closure_FS, %rsp
+	cfi_adjust_cfa_offset(ffi_closure_FS)
+
+	movdqa	%xmm0, ffi_closure_OFS_V+0x00(%rsp)
+	movdqa	%xmm1, ffi_closure_OFS_V+0x10(%rsp)
+	movdqa	%xmm2, ffi_closure_OFS_V+0x20(%rsp)
+	movdqa	%xmm3, ffi_closure_OFS_V+0x30(%rsp)
+	movdqa	%xmm4, ffi_closure_OFS_V+0x40(%rsp)
+	movdqa	%xmm5, ffi_closure_OFS_V+0x50(%rsp)
+	movdqa	%xmm6, ffi_closure_OFS_V+0x60(%rsp)
+	movdqa	%xmm7, ffi_closure_OFS_V+0x70(%rsp)
+	jmp	0f
+
+	cfi_endproc
+	.size	ffi_go_closure_unix64_sse,.-ffi_go_closure_unix64_sse
 
-	.align 8
-.LEFDE3:
+	.align	2
+	.globl	ffi_go_closure_unix64
+	.type	ffi_go_closure_unix64,@function
+	FFI_HIDDEN(ffi_go_closure_unix64)
+
+ffi_go_closure_unix64:
+	cfi_startproc
+	subq	$ffi_closure_FS, %rsp
+	cfi_adjust_cfa_offset(ffi_closure_FS)
+0:
+	movq	%rdi, ffi_closure_OFS_G+0x00(%rsp)
+	movq    %rsi, ffi_closure_OFS_G+0x08(%rsp)
+	movq    %rdx, ffi_closure_OFS_G+0x10(%rsp)
+	movq    %rcx, ffi_closure_OFS_G+0x18(%rsp)
+	movq    %r8,  ffi_closure_OFS_G+0x20(%rsp)
+	movq    %r9,  ffi_closure_OFS_G+0x28(%rsp)
+
+	movq	8(%r10), %rdi		/* Load cif */
+	movq	16(%r10), %rsi		/* Load fun */
+	movq	%r10, %rdx		/* Load closure (user_data) */
+	jmp	.Ldo_closure
+
+	cfi_endproc
+	.size	ffi_go_closure_unix64,.-ffi_go_closure_unix64
 
-#endif /* __GNUC__ */
-	
 #endif /* __x86_64__ */
-
 #if defined __ELF__ && defined __linux__
 	.section	.note.GNU-stack,"",@progbits
 #endif
-- 
1.9.3

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH 2/8] Add ffi_cfi.h
  2014-10-28 18:32 [PATCH 0/8] Go closures for x86_64 Richard Henderson
                   ` (2 preceding siblings ...)
  2014-10-28 18:32 ` [PATCH 7/8] x86_64: Decouple return types from FFI_TYPE constants Richard Henderson
@ 2014-10-28 18:32 ` Richard Henderson
  2014-10-28 18:32 ` [PATCH 4/8] win64: Rewrite Richard Henderson
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: Richard Henderson @ 2014-10-28 18:32 UTC (permalink / raw)
  To: libffi-discuss

Have one copy of the HAVE_AS_CFI_PSEUDO_OP code
to share between all backends.
---
 include/ffi_cfi.h | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)
 create mode 100644 include/ffi_cfi.h

diff --git a/include/ffi_cfi.h b/include/ffi_cfi.h
new file mode 100644
index 0000000..6cca20c
--- /dev/null
+++ b/include/ffi_cfi.h
@@ -0,0 +1,53 @@
+/* -----------------------------------------------------------------------
+   ffi_cfi.h - Copyright (c) 2014  Red Hat, Inc.
+
+   Conditionally assemble cfi directives. Only necessary for building libffi.
+   ----------------------------------------------------------------------- */
+
+#ifndef FFI_CFI_H
+#define FFI_CFI_H
+
+#ifdef HAVE_AS_CFI_PSEUDO_OP
+
+# define cfi_startproc			.cfi_startproc
+# define cfi_endproc			.cfi_endproc
+# define cfi_def_cfa(reg, off)		.cfi_def_cfa reg, off
+# define cfi_def_cfa_register(reg)	.cfi_def_cfa_register reg
+# define cfi_def_cfa_offset(off)	.cfi_def_cfa_offset off
+# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
+# define cfi_offset(reg, off)		.cfi_offset reg, off
+# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
+# define cfi_register(r1, r2)		.cfi_register r1, r2
+# define cfi_return_column(reg)		.cfi_return_column reg
+# define cfi_restore(reg)		.cfi_restore reg
+# define cfi_same_value(reg)		.cfi_same_value reg
+# define cfi_undefined(reg)		.cfi_undefined reg
+# define cfi_remember_state		.cfi_remember_state
+# define cfi_restore_state		.cfi_restore_state
+# define cfi_window_save		.cfi_window_save
+# define cfi_personality(enc, exp)	.cfi_personality enc, exp
+# define cfi_lsda(enc, exp)		.cfi_lsda enc, exp
+
+#else
+
+# define cfi_startproc
+# define cfi_endproc
+# define cfi_def_cfa(reg, off)
+# define cfi_def_cfa_register(reg)
+# define cfi_def_cfa_offset(off)
+# define cfi_adjust_cfa_offset(off)
+# define cfi_offset(reg, off)
+# define cfi_rel_offset(reg, off)
+# define cfi_register(r1, r2)
+# define cfi_return_column(reg)
+# define cfi_restore(reg)
+# define cfi_same_value(reg)
+# define cfi_undefined(reg)
+# define cfi_remember_state
+# define cfi_restore_state
+# define cfi_window_save
+# define cfi_personality(enc, exp)
+# define cfi_lsda(enc, exp)
+
+#endif /* HAVE_AS_CFI_PSEUDO_OP */
+#endif /* FFI_CFI_H */
-- 
1.9.3

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH 4/8] win64: Rewrite
  2014-10-28 18:32 [PATCH 0/8] Go closures for x86_64 Richard Henderson
                   ` (3 preceding siblings ...)
  2014-10-28 18:32 ` [PATCH 2/8] Add ffi_cfi.h Richard Henderson
@ 2014-10-28 18:32 ` Richard Henderson
  2014-10-28 18:32 ` [PATCH 5/8] win64: Remove support from ffi.c Richard Henderson
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: Richard Henderson @ 2014-10-28 18:32 UTC (permalink / raw)
  To: libffi-discuss; +Cc: Kai Tietz

It's way too different from the 32-bit ABIs with which it is
currently associated, as seen from all of the existing XFAILs.

Cc: Kai Tietz <ktietz@redhat.com>
---
 Makefile.am                                        |   4 +-
 src/x86/ffitarget.h                                |  29 +-
 src/x86/ffiw64.c                                   | 281 +++++++++
 src/x86/win64.S                                    | 693 ++++++---------------
 testsuite/libffi.call/call.exp                     |  13 +-
 testsuite/libffi.call/cls_align_longdouble_split.c |   2 -
 .../libffi.call/cls_align_longdouble_split2.c      |   2 -
 testsuite/libffi.call/cls_longdouble.c             |   2 -
 testsuite/libffi.call/float2.c                     |   3 -
 testsuite/libffi.call/huge_struct.c                |   2 -
 testsuite/libffi.call/return_ldl.c                 |   1 -
 11 files changed, 496 insertions(+), 536 deletions(-)
 create mode 100644 src/x86/ffiw64.c

diff --git a/Makefile.am b/Makefile.am
index 0e40451..3d1ecae 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -37,7 +37,7 @@ EXTRA_DIST = LICENSE ChangeLog.v1 ChangeLog.libgcj			\
 	 src/sh64/sysv.S src/sh64/ffitarget.h src/sparc/v8.S		\
 	 src/sparc/v9.S src/sparc/ffitarget.h src/sparc/ffi.c		\
 	 src/x86/darwin64.S src/x86/ffi.c src/x86/sysv.S		\
-	 src/x86/win32.S src/x86/darwin.S src/x86/win64.S		\
+	 src/x86/win32.S src/x86/darwin.S src/x86/ffiw64.c src/x86/win64.S \
 	 src/x86/freebsd.S src/x86/ffi64.c src/x86/unix64.S		\
 	 src/x86/ffitarget.h src/pa/ffitarget.h src/pa/ffi.c		\
 	 src/pa/linux.S src/pa/hpux32.S src/frv/ffi.c src/bfin/ffi.c	\
@@ -135,7 +135,7 @@ if X86_WIN32
 nodist_libffi_la_SOURCES += src/x86/ffi.c src/x86/win32.S
 endif
 if X86_WIN64
-nodist_libffi_la_SOURCES += src/x86/ffi.c src/x86/win64.S
+nodist_libffi_la_SOURCES += src/x86/ffiw64.c src/x86/win64.S
 endif
 if X86_DARWIN
 nodist_libffi_la_SOURCES += src/x86/ffi.c src/x86/darwin.S src/x86/ffi64.c src/x86/darwin64.S
diff --git a/src/x86/ffitarget.h b/src/x86/ffitarget.h
index 0d295e0..8c52573 100644
--- a/src/x86/ffitarget.h
+++ b/src/x86/ffitarget.h
@@ -127,25 +127,18 @@ typedef enum ffi_abi {
 #define FFI_TYPE_SMALL_STRUCT_4B (FFI_TYPE_LAST + 3)
 #define FFI_TYPE_MS_STRUCT       (FFI_TYPE_LAST + 4)
 
-#if defined (X86_64) || (defined (__x86_64__) && defined (X86_DARWIN))
-#define FFI_TRAMPOLINE_SIZE 24
-#define FFI_NATIVE_RAW_API 0
-#define FFI_GO_CLOSURES 1
+#if defined (X86_64) || defined(X86_WIN64) \
+    || (defined (__x86_64__) && defined (X86_DARWIN))
+# define FFI_TRAMPOLINE_SIZE 24
+# define FFI_NATIVE_RAW_API 0
+# define FFI_GO_CLOSURES 1
 #else
-#ifdef X86_WIN32
-#define FFI_TRAMPOLINE_SIZE 52
-#else
-#ifdef X86_WIN64
-#define FFI_TRAMPOLINE_SIZE 29
-#define FFI_NATIVE_RAW_API 0
-#define FFI_NO_RAW_API 1
-#else
-#define FFI_TRAMPOLINE_SIZE 10
-#endif
-#endif
-#ifndef X86_WIN64
-#define FFI_NATIVE_RAW_API 1  /* x86 has native raw api support */
-#endif
+# ifdef X86_WIN32
+#  define FFI_TRAMPOLINE_SIZE 52
+# else
+#  define FFI_TRAMPOLINE_SIZE 10
+# endif
+# define FFI_NATIVE_RAW_API 1  /* x86 has native raw api support */
 #endif
 
 #endif
diff --git a/src/x86/ffiw64.c b/src/x86/ffiw64.c
new file mode 100644
index 0000000..316f544
--- /dev/null
+++ b/src/x86/ffiw64.c
@@ -0,0 +1,281 @@
+/* -----------------------------------------------------------------------
+   ffiw64.c - Copyright (c) 2014 Red Hat, Inc.
+
+   x86 win64 Foreign Function Interface
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   ``Software''), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be included
+   in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+   WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+   DEALINGS IN THE SOFTWARE.
+   ----------------------------------------------------------------------- */
+
+#include <ffi.h>
+#include <ffi_common.h>
+#include <stdlib.h>
+#include <stdint.h>
+
+#ifdef X86_WIN64
+
+struct win64_call_frame
+{
+  UINT64 rbp;		/* 0: caller's %rbp, stored here by ffi_call_win64 */
+  UINT64 retaddr;	/* 8: return address, copied here by ffi_call_win64 */
+  UINT64 fn;		/* 16: address of the function to call */
+  UINT64 flags;		/* 24: selects how the result is stored */
+  UINT64 rvalue;	/* 32: address that receives the result */
+};
+
+extern void ffi_call_win64 (void *stack, struct win64_call_frame *,
+			    void *closure) FFI_HIDDEN;
+
+ffi_status
+ffi_prep_cif_machdep (ffi_cif *cif)
+{
+  int flags, n;
+
+  if (cif->abi != FFI_WIN64)
+    return FFI_BAD_ABI;
+  /* Derive cif->flags (how the result is stored) from the return type.  */
+  flags = cif->rtype->type;
+  switch (flags)
+    {
+    default:
+      break;
+    case FFI_TYPE_LONGDOUBLE:
+      flags = FFI_TYPE_STRUCT;	/* handled like a struct of any size */
+      break;
+    case FFI_TYPE_COMPLEX:
+      flags = FFI_TYPE_STRUCT;	/* then refined by size, as for structs */
+      /* FALLTHRU */
+    case FFI_TYPE_STRUCT:
+      switch (cif->rtype->size)
+	{
+	case 8:
+	  flags = FFI_TYPE_UINT64;
+	  break;
+	case 4:
+	  flags = FFI_TYPE_SMALL_STRUCT_4B;
+	  break;
+	case 2:
+	  flags = FFI_TYPE_SMALL_STRUCT_2B;
+	  break;
+	case 1:
+	  flags = FFI_TYPE_SMALL_STRUCT_1B;
+	  break;
+	}			/* any other size stays FFI_TYPE_STRUCT */
+      break;
+    }
+  cif->flags = flags;
+
+  /* Each argument either fits in a register, an 8 byte slot, or is
+     passed by reference with the pointer in the 8 byte slot.  */
+  n = cif->nargs;
+  n += (flags == FFI_TYPE_STRUCT);	/* extra slot for the result address */
+  if (n < 4)			/* ffi_call_win64 always loads four slots */
+    n = 4;
+  cif->bytes = n * 8;
+
+  return FFI_OK;
+}
+
+static void
+ffi_call_int (ffi_cif *cif, void (*fn)(void), void *rvalue,
+	      void **avalue, void *closure)
+{
+  int i, j, n, flags;
+  UINT64 *stack;
+  size_t rsize;
+  struct win64_call_frame *frame;
+  /* Fill the argument slots and call frame, then enter ffi_call_win64.  */
+  FFI_ASSERT(cif->abi == FFI_WIN64);
+
+  flags = cif->flags;
+  rsize = 0;
+
+  /* If we have no return value for a structure, we need to create one.
+     Otherwise we can ignore the return type entirely.  */
+  if (rvalue == NULL)
+    {
+      if (flags == FFI_TYPE_STRUCT)
+	rsize = cif->rtype->size;
+      else
+	flags = FFI_TYPE_VOID;
+    }
+  /* Layout: cif->bytes of slots, the call frame, then rsize of result.  */
+  stack = alloca(cif->bytes + sizeof(struct win64_call_frame) + rsize);
+  frame = (struct win64_call_frame *)((char *)stack + cif->bytes);
+  if (rsize)
+    rvalue = frame + 1;		/* scratch result right after the frame */
+
+  frame->fn = (uintptr_t)fn;
+  frame->flags = flags;
+  frame->rvalue = (uintptr_t)rvalue;
+  /* For FFI_TYPE_STRUCT the first slot carries the result address.  */
+  j = 0;
+  if (flags == FFI_TYPE_STRUCT)
+    {
+      stack[0] = (uintptr_t)rvalue;
+      j = 1;
+    }
+
+  for (i = 0, n = cif->nargs; i < n; ++i, ++j)
+    {
+      switch (cif->arg_types[i]->size)
+	{
+	case 8:
+	  stack[j] = *(UINT64 *)avalue[i];
+	  break;
+	case 4:
+	  stack[j] = *(UINT32 *)avalue[i];	/* zero-extended into slot */
+	  break;
+	case 2:
+	  stack[j] = *(UINT16 *)avalue[i];	/* zero-extended into slot */
+	  break;
+	case 1:
+	  stack[j] = *(UINT8 *)avalue[i];	/* zero-extended into slot */
+	  break;
+	default:
+	  stack[j] = (uintptr_t)avalue[i];	/* by reference: store address */
+	  break;
+	}
+    }
+
+  ffi_call_win64 (stack, frame, closure);
+}
+
+void
+ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
+{
+  ffi_call_int (cif, fn, rvalue, avalue, NULL);	/* no closure pointer */
+}
+
+void
+ffi_call_go (ffi_cif *cif, void (*fn)(void), void *rvalue,
+	     void **avalue, void *closure)
+{
+  ffi_call_int (cif, fn, rvalue, avalue, closure);  /* closure ends up in r10 */
+}
+
+
+extern void ffi_closure_win64(void) FFI_HIDDEN;
+extern void ffi_go_closure_win64(void) FFI_HIDDEN;
+
+ffi_status
+ffi_prep_closure_loc (ffi_closure* closure,
+		      ffi_cif* cif,
+		      void (*fun)(ffi_cif*, void*, void**, void*),
+		      void *user_data,
+		      void *codeloc)	/* not used by this implementation */
+{
+  static const unsigned char trampoline[16] = {
+    /* leaq  -0x7(%rip),%r10   # 0x0: r10 = start of the trampoline */
+    0x4c, 0x8d, 0x15, 0xf9, 0xff, 0xff, 0xff,
+    /* jmpq  *0x3(%rip)        # 0x10: target is stored after the code */
+    0xff, 0x25, 0x03, 0x00, 0x00, 0x00,
+    /* nopl  (%rax)            # pads the code to 16 bytes */
+    0x0f, 0x1f, 0x00
+  };
+  void *tramp = closure->tramp;
+
+  if (cif->abi != FFI_WIN64)
+    return FFI_BAD_ABI;
+  /* 16 bytes of code, then the 8-byte address the jmpq loads.  */
+  memcpy (tramp, trampoline, sizeof(trampoline));
+  *(UINT64 *)((char *)tramp + 16) = (uintptr_t)ffi_closure_win64;
+  /* Offset via char *: arithmetic on void * is only a GNU extension.  */
+  closure->cif = cif;
+  closure->fun = fun;
+  closure->user_data = user_data;
+
+  return FFI_OK;
+}
+
+ffi_status
+ffi_prep_go_closure (ffi_go_closure* closure, ffi_cif* cif,
+		     void (*fun)(ffi_cif*, void*, void**, void*))
+{
+  if (cif->abi != FFI_WIN64)
+    return FFI_BAD_ABI;
+  /* No code is written here: tramp just holds the asm entry point.  */
+  closure->tramp = ffi_go_closure_win64;
+  closure->cif = cif;
+  closure->fun = fun;
+
+  return FFI_OK;
+}
+
+struct win64_closure_frame
+{
+  UINT64 rvalue[2];	/* ffi_clo_OFF_R: result, reloaded into rax and xmm0 */
+  UINT64 fargs[4];	/* ffi_clo_OFF_X: xmm0-xmm3 as saved by win64.S */
+  UINT64 retaddr;	/* return address of the closure's caller */
+  UINT64 args[];	/* argument slots; first four saved from rcx/rdx/r8/r9 */
+};
+
+int FFI_HIDDEN
+ffi_closure_win64_inner(ffi_cif *cif,
+			void (*fun)(ffi_cif*, void*, void**, void*),
+			void *user_data,
+			struct win64_closure_frame *frame)
+{
+  void **avalue;
+  void *rvalue;
+  int i, n, nreg, flags;
+  /* Build avalue[] from the saved argument slots, then call FUN.  */
+  avalue = alloca(cif->nargs * sizeof(void *));
+  rvalue = frame->rvalue;	/* default: result buffer inside the frame */
+  nreg = 0;			/* index of the next argument slot */
+
+  /* When returning a structure, the address is in the first argument.
+     We must also be prepared to return the same address in rax, so
+     install that address in the frame and pretend we return a pointer.  */
+  flags = cif->flags;
+  if (flags == FFI_TYPE_STRUCT)
+    {
+      rvalue = (void *)(uintptr_t)frame->args[0];
+      frame->rvalue[0] = frame->args[0];
+      nreg = 1;			/* real arguments start at the second slot */
+    }
+
+  for (i = 0, n = cif->nargs; i < n; ++i, ++nreg)
+    {
+      size_t size = cif->arg_types[i]->size;
+      size_t type = cif->arg_types[i]->type;
+      void *a;
+
+      if (type == FFI_TYPE_DOUBLE || type == FFI_TYPE_FLOAT)
+	{
+	  if (nreg < 4)
+	    a = &frame->fargs[nreg];	/* first four slots: saved xmm copy */
+	  else
+	    a = &frame->args[nreg];
+	}
+      else if (size == 1 || size == 2 || size == 4 || size == 8)
+	a = &frame->args[nreg];		/* value is stored in the slot itself */
+      else
+	a = (void *)(uintptr_t)frame->args[nreg];  /* slot holds a pointer */
+
+      avalue[i] = a;
+    }
+
+  /* Invoke the closure.  */
+  fun (cif, rvalue, avalue, user_data);
+  return flags;
+}
+
+#endif /* X86_WIN64 */
diff --git a/src/x86/win64.S b/src/x86/win64.S
index 687f97c..a5a20b6 100644
--- a/src/x86/win64.S
+++ b/src/x86/win64.S
@@ -1,264 +1,16 @@
 #define LIBFFI_ASM
 #include <fficonfig.h>
 #include <ffi.h>
+#include <ffi_cfi.h>
 
-/* Constants for ffi_call_win64 */
-#define STACK 0
-#define PREP_ARGS_FN 32
-#define ECIF 40
-#define CIF_BYTES 48
-#define CIF_FLAGS 56
-#define RVALUE 64
-#define FN 72
-
-/* ffi_call_win64 (void (*prep_args_fn)(char *, extended_cif *),
-		   extended_cif *ecif, unsigned bytes, unsigned flags,
-		   unsigned *rvalue, void (*fn)());
- */
-
-#ifdef _MSC_VER
-PUBLIC	ffi_call_win64
-
-EXTRN	__chkstk:NEAR
-EXTRN	ffi_closure_win64_inner:NEAR
-
-_TEXT	SEGMENT
-
-;;; ffi_closure_win64 will be called with these registers set:
-;;;    rax points to 'closure'
-;;;    r11 contains a bit mask that specifies which of the
-;;;    first four parameters are float or double
-;;;
-;;; It must move the parameters passed in registers to their stack location,
-;;; call ffi_closure_win64_inner for the actual work, then return the result.
-;;;
-ffi_closure_win64 PROC FRAME
-	;; copy register arguments onto stack
-	test	r11, 1
-	jne	first_is_float
-	mov	QWORD PTR [rsp+8], rcx
-	jmp	second
-first_is_float:
-	movlpd	QWORD PTR [rsp+8], xmm0
-
-second:
-	test	r11, 2
-	jne	second_is_float
-	mov	QWORD PTR [rsp+16], rdx
-	jmp	third
-second_is_float:
-	movlpd	QWORD PTR [rsp+16], xmm1
-
-third:
-	test	r11, 4
-	jne	third_is_float
-	mov	QWORD PTR [rsp+24], r8
-	jmp	fourth
-third_is_float:
-	movlpd	QWORD PTR [rsp+24], xmm2
-
-fourth:
-	test	r11, 8
-	jne	fourth_is_float
-	mov	QWORD PTR [rsp+32], r9
-	jmp	done
-fourth_is_float:
-	movlpd	QWORD PTR [rsp+32], xmm3
-
-done:
-	.ALLOCSTACK 40
-	sub	rsp, 40
-	.ENDPROLOG
-	mov	rcx, rax	; context is first parameter
-	mov	rdx, rsp	; stack is second parameter
-	add	rdx, 48		; point to start of arguments
-	mov	rax, ffi_closure_win64_inner
-	call	rax		; call the real closure function
-	add	rsp, 40
-	movd	xmm0, rax	; If the closure returned a float,
-				; ffi_closure_win64_inner wrote it to rax
-	ret	0
-ffi_closure_win64 ENDP
-
-ffi_call_win64 PROC FRAME
-	;; copy registers onto stack
-	mov	QWORD PTR [rsp+32], r9
-	mov	QWORD PTR [rsp+24], r8
-	mov	QWORD PTR [rsp+16], rdx
-	mov	QWORD PTR [rsp+8], rcx
-	.PUSHREG rbp
-	push	rbp
-	.ALLOCSTACK 48
-	sub	rsp, 48					; 00000030H
-	.SETFRAME rbp, 32
-	lea	rbp, QWORD PTR [rsp+32]
-	.ENDPROLOG
-
-	mov	eax, DWORD PTR CIF_BYTES[rbp]
-	add	rax, 15
-	and	rax, -16
-	call	__chkstk
-	sub	rsp, rax
-	lea	rax, QWORD PTR [rsp+32]
-	mov	QWORD PTR STACK[rbp], rax
-
-	mov	rdx, QWORD PTR ECIF[rbp]
-	mov	rcx, QWORD PTR STACK[rbp]
-	call	QWORD PTR PREP_ARGS_FN[rbp]
-
-	mov	rsp, QWORD PTR STACK[rbp]
-
-	movlpd	xmm3, QWORD PTR [rsp+24]
-	movd	r9, xmm3
-
-	movlpd	xmm2, QWORD PTR [rsp+16]
-	movd	r8, xmm2
-
-	movlpd	xmm1, QWORD PTR [rsp+8]
-	movd	rdx, xmm1
-
-	movlpd	xmm0, QWORD PTR [rsp]
-	movd	rcx, xmm0
-
-	call	QWORD PTR FN[rbp]
-ret_struct4b$:
- 	cmp	DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_SMALL_STRUCT_4B
- 	jne	ret_struct2b$
-
-	mov	rcx, QWORD PTR RVALUE[rbp]
-	mov	DWORD PTR [rcx], eax
-	jmp	ret_void$
-
-ret_struct2b$:
- 	cmp	DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_SMALL_STRUCT_2B
- 	jne	ret_struct1b$
-
-	mov	rcx, QWORD PTR RVALUE[rbp]
-	mov	WORD PTR [rcx], ax
-	jmp	ret_void$
-
-ret_struct1b$:
- 	cmp	DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_SMALL_STRUCT_1B
- 	jne	ret_uint8$
-
-	mov	rcx, QWORD PTR RVALUE[rbp]
-	mov	BYTE PTR [rcx], al
-	jmp	ret_void$
-
-ret_uint8$:
- 	cmp	DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_UINT8
- 	jne	ret_sint8$
-
-	mov	rcx, QWORD PTR RVALUE[rbp]
-	movzx   rax, al
-	mov	QWORD PTR [rcx], rax
-	jmp	ret_void$
-
-ret_sint8$:
- 	cmp	DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_SINT8
- 	jne	ret_uint16$
-
-	mov	rcx, QWORD PTR RVALUE[rbp]
-	movsx   rax, al
-	mov	QWORD PTR [rcx], rax
-	jmp	ret_void$
-
-ret_uint16$:
- 	cmp	DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_UINT16
- 	jne	ret_sint16$
-
-	mov	rcx, QWORD PTR RVALUE[rbp]
-	movzx   rax, ax
-	mov	QWORD PTR [rcx], rax
-	jmp	SHORT ret_void$
-
-ret_sint16$:
- 	cmp	DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_SINT16
- 	jne	ret_uint32$
-
-	mov	rcx, QWORD PTR RVALUE[rbp]
-	movsx   rax, ax
-	mov	QWORD PTR [rcx], rax
-	jmp	SHORT ret_void$
-
-ret_uint32$:
- 	cmp	DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_UINT32
- 	jne	ret_sint32$
-
-	mov	rcx, QWORD PTR RVALUE[rbp]
-	mov     eax, eax
-	mov	QWORD PTR [rcx], rax
-	jmp	SHORT ret_void$
-
-ret_sint32$:
- 	cmp	DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_SINT32
- 	jne	ret_float$
-
-	mov	rcx, QWORD PTR RVALUE[rbp]
-	cdqe
-	mov	QWORD PTR [rcx], rax
-	jmp	SHORT ret_void$
-
-ret_float$:
- 	cmp	DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_FLOAT
- 	jne	SHORT ret_double$
-
- 	mov	rax, QWORD PTR RVALUE[rbp]
- 	movss	DWORD PTR [rax], xmm0
- 	jmp	SHORT ret_void$
-
-ret_double$:
- 	cmp	DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_DOUBLE
- 	jne	SHORT ret_uint64$
-
- 	mov	rax, QWORD PTR RVALUE[rbp]
- 	movlpd	QWORD PTR [rax], xmm0
- 	jmp	SHORT ret_void$
-
-ret_uint64$:
-  	cmp	DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_UINT64
-  	jne	SHORT ret_sint64$
-
- 	mov	rcx, QWORD PTR RVALUE[rbp]
- 	mov	QWORD PTR [rcx], rax
- 	jmp	SHORT ret_void$
-
-ret_sint64$:
-  	cmp	DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_SINT64
-  	jne	SHORT ret_pointer$
-
- 	mov	rcx, QWORD PTR RVALUE[rbp]
- 	mov	QWORD PTR [rcx], rax
- 	jmp	SHORT ret_void$
-
-ret_pointer$:
-  	cmp	DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_POINTER
-  	jne	SHORT ret_int$
-
- 	mov	rcx, QWORD PTR RVALUE[rbp]
- 	mov	QWORD PTR [rcx], rax
- 	jmp	SHORT ret_void$
-
-ret_int$:
-  	cmp	DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_INT
-  	jne	SHORT ret_void$
-
-	mov	rcx, QWORD PTR RVALUE[rbp]
-	cdqe
-	mov	QWORD PTR [rcx], rax
- 	jmp	SHORT ret_void$
-
-ret_void$:
-	xor	rax, rax
-
-	lea	rsp, QWORD PTR [rbp+16]
-	pop	rbp
-	ret	0
-ffi_call_win64 ENDP
-_TEXT	ENDS
-END
+#if defined(HAVE_AS_CFI_PSEUDO_OP)
+        .cfi_sections   .debug_frame
+#endif
 
-#else
+#define arg0	%rcx
+#define arg1	%rdx
+#define arg2	%r8
+#define arg3	%r9
 
 #ifdef SYMBOL_UNDERSCORE
 #define SYMBOL_NAME(name) _##name
@@ -266,255 +18,202 @@ END
 #define SYMBOL_NAME(name) name
 #endif
 
-.text
-
-.extern SYMBOL_NAME(ffi_closure_win64_inner)
-
-# ffi_closure_win64 will be called with these registers set:
-#    rax points to 'closure'
-#    r11 contains a bit mask that specifies which of the
-#    first four parameters are float or double
-#
-# It must move the parameters passed in registers to their stack location,
-# call ffi_closure_win64_inner for the actual work, then return the result.
-#
-	.balign 16
-	.globl SYMBOL_NAME(ffi_closure_win64)
-	.seh_proc SYMBOL_NAME(ffi_closure_win64)
-SYMBOL_NAME(ffi_closure_win64):
-	# copy register arguments onto stack
-	test	$1,%r11
-	jne	.Lfirst_is_float
-	mov	%rcx, 8(%rsp)
-	jmp	.Lsecond
-.Lfirst_is_float:
-	movlpd	%xmm0, 8(%rsp)
-
-.Lsecond:
-	test	$2, %r11
-	jne	.Lsecond_is_float
-	mov	%rdx, 16(%rsp)
-	jmp	.Lthird
-.Lsecond_is_float:
-	movlpd	%xmm1, 16(%rsp)
-
-.Lthird:
-	test	$4, %r11
-	jne	.Lthird_is_float
-	mov	%r8,24(%rsp)
-	jmp	.Lfourth
-.Lthird_is_float:
-	movlpd	%xmm2, 24(%rsp)
-
-.Lfourth:
-	test	$8, %r11
-	jne	.Lfourth_is_float
-	mov	%r9, 32(%rsp)
-	jmp	.Ldone
-.Lfourth_is_float:
-	movlpd	%xmm3, 32(%rsp)
-
-.Ldone:
-	.seh_stackalloc 40
-	sub	$40, %rsp
+.macro E which
+	.align	8
+	.org	0b + \which * 8
+.endm
+
+	.text
+
+/* ffi_call_win64 (void *stack, struct win64_call_frame *frame, void *r10)
+
+   Bit o trickiness here -- FRAME is the base of the stack frame
+   for this function.  This has been allocated by ffi_call.  We also
+   deallocate some of the stack that has been alloca'd.  */
+
+	.align	8
+	.globl	ffi_call_win64
+
+	.seh_proc ffi_call_win64
+ffi_call_win64:
+	cfi_startproc
+	/* Set up the local stack frame and install it in rbp/rsp.  */
+	movq	(%rsp), %rax
+	movq	%rbp, (arg1)
+	movq	%rax, 8(arg1)
+	movq	arg1, %rbp
+	cfi_def_cfa(%rbp, 16)
+	cfi_rel_offset(%rbp, 0)
+	.seh_pushreg %rbp
+	.seh_setframe %rbp, 0
 	.seh_endprologue
-	mov	%rax, %rcx	# context is first parameter
-	mov	%rsp, %rdx	# stack is second parameter
-	add	$48, %rdx	# point to start of arguments
-	leaq	SYMBOL_NAME(ffi_closure_win64_inner)(%rip), %rax
-	callq	*%rax		# call the real closure function
-	add	$40, %rsp
-	movq	%rax, %xmm0	# If the closure returned a float,
-				# ffi_closure_win64_inner wrote it to rax
-	retq
+	movq	arg0, %rsp
+
+	movq	arg2, %r10
+
+	/* Load all slots into both general and xmm registers.  */
+	movq	(%rsp), %rcx
+	movsd	(%rsp), %xmm0
+	movq	8(%rsp), %rdx
+	movsd	8(%rsp), %xmm1
+	movq	16(%rsp), %r8
+	movsd	16(%rsp), %xmm2
+	movq	24(%rsp), %r9
+	movsd	24(%rsp), %xmm3
+
+	call	*16(%rbp)
+
+	movl	24(%rbp), %ecx
+	movq	32(%rbp), %r8
+	leaq	0f(%rip), %r10
+	cmpl	$FFI_TYPE_SMALL_STRUCT_4B, %ecx
+	leaq	(%r10, %rcx, 8), %r10
+	ja	99f
+	jmp	*%r10
+
+/* Below, we're space constrained most of the time.  Thus we eschew the
+   modern "mov, pop, ret" sequence (5 bytes) for "leave, ret" (2 bytes).  */
+.macro epilogue
+	leaveq
+	cfi_remember_state
+	cfi_def_cfa(%rsp, 8)
+	cfi_restore(%rbp)
+	ret
+	cfi_restore_state
+.endm
+
+	.align	8
+0:
+E FFI_TYPE_VOID
+	epilogue
+E FFI_TYPE_INT
+	movslq	%eax, %rax
+	movq	%rax, (%r8)
+	epilogue
+E FFI_TYPE_FLOAT
+	movss	%xmm0, (%r8)
+	epilogue
+E FFI_TYPE_DOUBLE
+	movsd	%xmm0, (%r8)
+	epilogue
+E FFI_TYPE_LONGDOUBLE
+	call	abort
+E FFI_TYPE_UINT8
+	movzbl	%al, %eax
+	movq	%rax, (%r8)
+	epilogue
+E FFI_TYPE_SINT8
+	movsbq	%al, %rax
+	jmp	98f
+E FFI_TYPE_UINT16
+	movzwl	%ax, %eax
+	movq	%rax, (%r8)
+	epilogue
+E FFI_TYPE_SINT16
+	movswq	%ax, %rax
+	jmp	98f
+E FFI_TYPE_UINT32
+	movl	%eax, %eax
+	movq	%rax, (%r8)
+	epilogue
+E FFI_TYPE_SINT32
+	movslq	%eax, %rax
+	movq	%rax, (%r8)
+	epilogue
+E FFI_TYPE_UINT64
+98:	movq	%rax, (%r8)
+	epilogue
+E FFI_TYPE_SINT64
+	movq	%rax, (%r8)
+	epilogue
+E FFI_TYPE_STRUCT
+	epilogue
+E FFI_TYPE_POINTER
+	movq	%rax, (%r8)
+	epilogue
+E FFI_TYPE_COMPLEX
+	call	abort
+E FFI_TYPE_SMALL_STRUCT_1B
+	movb	%al, (%r8)
+	epilogue
+E FFI_TYPE_SMALL_STRUCT_2B
+	movw	%ax, (%r8)
+	epilogue
+E FFI_TYPE_SMALL_STRUCT_4B
+	movl	%eax, (%r8)
+	epilogue
+
+	.align	8
+99:	call	abort
+
+.purgem epilogue
+
+	cfi_endproc
 	.seh_endproc
 
-	.balign 16
-	.globl	SYMBOL_NAME(ffi_call_win64)
-	.seh_proc SYMBOL_NAME(ffi_call_win64)
-SYMBOL_NAME(ffi_call_win64):
-	# copy registers onto stack
-	mov	%r9,32(%rsp)
-	mov	%r8,24(%rsp)
-	mov	%rdx,16(%rsp)
-	mov	%rcx,8(%rsp)
-	.seh_pushreg rbp
-	push	%rbp
-	.seh_stackalloc 48
-	sub	$48,%rsp
-	.seh_setframe rbp, 32
-	lea	32(%rsp),%rbp
-	.seh_endprologue
-
-	mov	CIF_BYTES(%rbp),%eax
-	add	$15, %rax
-	and	$-16, %rax
-	cmpq	$0x1000, %rax
-	jb	Lch_done
-Lch_probe:
-	subq	$0x1000,%rsp
-	orl	$0x0, (%rsp)
-	subq	$0x1000,%rax
-	cmpq	$0x1000,%rax
-	ja	Lch_probe
-Lch_done:
-	subq	%rax, %rsp
-	orl	$0x0, (%rsp)
-	lea	32(%rsp), %rax
-	mov	%rax, STACK(%rbp)
-
-	mov	ECIF(%rbp), %rdx
-	mov	STACK(%rbp), %rcx
-	callq	*PREP_ARGS_FN(%rbp)
-
-	mov	STACK(%rbp), %rsp
-
-	movlpd	24(%rsp), %xmm3
-	movd	%xmm3, %r9
-
-	movlpd	16(%rsp), %xmm2
-	movd	%xmm2, %r8
-
-	movlpd	8(%rsp), %xmm1
-	movd	%xmm1, %rdx
-
-	movlpd	(%rsp), %xmm0
-	movd	%xmm0, %rcx
-
-	callq	*FN(%rbp)
-.Lret_struct4b:
- 	cmpl	$FFI_TYPE_SMALL_STRUCT_4B, CIF_FLAGS(%rbp)
- 	jne .Lret_struct2b
-
-	mov	RVALUE(%rbp), %rcx
-	mov	%eax, (%rcx)
-	jmp	.Lret_void
-
-.Lret_struct2b:
-	cmpl	$FFI_TYPE_SMALL_STRUCT_2B, CIF_FLAGS(%rbp)
-	jne .Lret_struct1b
-
-	mov	RVALUE(%rbp), %rcx
-	mov	%ax, (%rcx)
-	jmp .Lret_void
-
-.Lret_struct1b:
-	cmpl	$FFI_TYPE_SMALL_STRUCT_1B, CIF_FLAGS(%rbp)
-	jne .Lret_uint8
 
-	mov	RVALUE(%rbp), %rcx
-	mov	%al, (%rcx)
-	jmp .Lret_void
-
-.Lret_uint8:
-	cmpl	$FFI_TYPE_UINT8, CIF_FLAGS(%rbp)
-	jne .Lret_sint8
-
-	mov     RVALUE(%rbp), %rcx
-	movzbq  %al, %rax
-	movq    %rax, (%rcx)
-	jmp .Lret_void
-
-.Lret_sint8:
-	cmpl	$FFI_TYPE_SINT8, CIF_FLAGS(%rbp)
-	jne .Lret_uint16
-
-	mov     RVALUE(%rbp), %rcx
-	movsbq  %al, %rax
-	movq    %rax, (%rcx)
-	jmp .Lret_void
-
-.Lret_uint16:
-	cmpl	$FFI_TYPE_UINT16, CIF_FLAGS(%rbp)
-	jne .Lret_sint16
-
-	mov     RVALUE(%rbp), %rcx
-	movzwq  %ax, %rax
-	movq    %rax, (%rcx)
-	jmp .Lret_void
-
-.Lret_sint16:
-	cmpl	$FFI_TYPE_SINT16, CIF_FLAGS(%rbp)
-	jne .Lret_uint32
-
-	mov     RVALUE(%rbp), %rcx
-	movswq  %ax, %rax
-	movq    %rax, (%rcx)
-	jmp .Lret_void
-
-.Lret_uint32:
-	cmpl	$FFI_TYPE_UINT32, CIF_FLAGS(%rbp)
-	jne .Lret_sint32
-
-	mov     RVALUE(%rbp), %rcx
-	movl    %eax, %eax
-	movq    %rax, (%rcx)
-	jmp .Lret_void
-
-.Lret_sint32:
- 	cmpl	$FFI_TYPE_SINT32, CIF_FLAGS(%rbp)
- 	jne	.Lret_float
-
-	mov	RVALUE(%rbp), %rcx
-	cltq
-	movq	%rax, (%rcx)
-	jmp	.Lret_void
-
-.Lret_float:
- 	cmpl	$FFI_TYPE_FLOAT, CIF_FLAGS(%rbp)
- 	jne	.Lret_double
-
- 	mov	RVALUE(%rbp), %rax
- 	movss	%xmm0, (%rax)
- 	jmp	.Lret_void
-
-.Lret_double:
- 	cmpl	$FFI_TYPE_DOUBLE, CIF_FLAGS(%rbp)
- 	jne	.Lret_uint64
-
- 	mov	RVALUE(%rbp), %rax
- 	movlpd	%xmm0, (%rax)
- 	jmp	.Lret_void
-
-.Lret_uint64:
-  	cmpl	$FFI_TYPE_UINT64, CIF_FLAGS(%rbp)
- 	jne	.Lret_sint64
-
- 	mov	RVALUE(%rbp), %rcx
- 	mov	%rax, (%rcx)
- 	jmp	.Lret_void
-
-.Lret_sint64:
-  	cmpl	$FFI_TYPE_SINT64, CIF_FLAGS(%rbp)
-  	jne	.Lret_pointer
-
- 	mov	RVALUE(%rbp), %rcx
- 	mov	%rax, (%rcx)
- 	jmp	.Lret_void
+/* 32 bytes of outgoing register stack space, 8 bytes of alignment,
+   16 bytes of result, 32 bytes of xmm registers.  */
+#define ffi_clo_FS	(32+8+16+32)
+#define ffi_clo_OFF_R	(32+8)
+#define ffi_clo_OFF_X	(32+8+16)
+
+	.align	8
+	.globl	ffi_go_closure_win64
+
+	.seh_proc ffi_go_closure_win64
+ffi_go_closure_win64:
+	cfi_startproc
+	/* Save all integer arguments into the incoming reg stack space.  */
+	movq	arg0, 8(%rsp)
+	movq	arg1, 16(%rsp)
+	movq	arg2, 24(%rsp)
+	movq	arg3, 32(%rsp)
+
+	movq	8(%r10), arg0			/* load cif */
+	movq	16(%r10), arg1			/* load fun */
+	movq	%r10, arg2			/* closure is user_data */
+	jmp	0f
+	cfi_endproc
+	.seh_endproc
 
-.Lret_pointer:
-  	cmpl	$FFI_TYPE_POINTER, CIF_FLAGS(%rbp)
-  	jne	.Lret_int
+	.align	8
+	.globl	ffi_closure_win64
+
+	.seh_proc ffi_closure_win64
+ffi_closure_win64:
+	cfi_startproc
+	/* Save all integer arguments into the incoming reg stack space.  */
+	movq	arg0, 8(%rsp)
+	movq	arg1, 16(%rsp)
+	movq	arg2, 24(%rsp)
+	movq	arg3, 32(%rsp)
+
+	movq	FFI_TRAMPOLINE_SIZE(%r10), arg0		/* load cif */
+	movq	FFI_TRAMPOLINE_SIZE+8(%r10), arg1	/* load fun */
+	movq	FFI_TRAMPOLINE_SIZE+16(%r10), arg2	/* load user_data */
+0:
+	subq	$ffi_clo_FS, %rsp
+	cfi_adjust_cfa_offset(ffi_clo_FS)
+	.seh_stackalloc ffi_clo_FS
+	.seh_endprologue
 
- 	mov	RVALUE(%rbp), %rcx
- 	mov	%rax, (%rcx)
- 	jmp	.Lret_void
+	/* Save all sse arguments into the stack frame.  */
+	movsd	%xmm0, ffi_clo_OFF_X(%rsp)
+	movsd	%xmm1, ffi_clo_OFF_X+8(%rsp)
+	movsd	%xmm2, ffi_clo_OFF_X+16(%rsp)
+	movsd	%xmm3, ffi_clo_OFF_X+24(%rsp)
 
-.Lret_int:
-  	cmpl	$FFI_TYPE_INT, CIF_FLAGS(%rbp)
-  	jne	.Lret_void
+	leaq	ffi_clo_OFF_R(%rsp), arg3
+	call	ffi_closure_win64_inner
 
-	mov	RVALUE(%rbp), %rcx
-	cltq
-	movq	%rax, (%rcx)
-	jmp	.Lret_void
+	/* Load the result into both possible result registers.  */
+	movq    ffi_clo_OFF_R(%rsp), %rax
+	movsd   ffi_clo_OFF_R(%rsp), %xmm0
 
-.Lret_void:
-	xor	%rax, %rax
+	addq	$ffi_clo_FS, %rsp
+	cfi_adjust_cfa_offset(-ffi_clo_FS)
+	ret
 
-	lea	16(%rbp), %rsp
-	pop	%rbp
-	retq
+	cfi_endproc
 	.seh_endproc
-#endif /* !_MSC_VER */
-
diff --git a/testsuite/libffi.call/call.exp b/testsuite/libffi.call/call.exp
index 5177f07..55de25c 100644
--- a/testsuite/libffi.call/call.exp
+++ b/testsuite/libffi.call/call.exp
@@ -24,16 +24,15 @@ set ctlist [lsearch -inline -all -glob [lsort [glob -nocomplain -- $srcdir/$subd
 
 run-many-tests $tlist ""
 
-if { ![istarget s390*] } {
-
+# ??? We really should preprocess ffi.h and grep
+# for FFI_TARGET_HAS_COMPLEX_TYPE.
+if { [istarget s390*]
+     || [istarget x86_64*] } {
+  run-many-tests $ctlist ""
+} else {
     foreach test $ctlist {
 	unsupported "$test"
     }
-
-} else {
-
-  run-many-tests $ctlist ""
-
 }
 
 dg-finish
diff --git a/testsuite/libffi.call/cls_align_longdouble_split.c b/testsuite/libffi.call/cls_align_longdouble_split.c
index 15f9365..cc1c43b 100644
--- a/testsuite/libffi.call/cls_align_longdouble_split.c
+++ b/testsuite/libffi.call/cls_align_longdouble_split.c
@@ -4,10 +4,8 @@
    PR:		none.
    Originator:	<hos@tamanegi.org> 20031203	 */
 
-/* { dg-excess-errors "no long double format" { xfail x86_64-*-mingw* x86_64-*-cygwin* } } */
 /* { dg-do run { xfail strongarm*-*-* xscale*-*-* } } */
 /* { dg-options -mlong-double-128 { target powerpc64*-*-linux* } } */
-/* { dg-output "" { xfail x86_64-*-mingw* x86_64-*-cygwin* } } */
 
 #include "ffitest.h"
 
diff --git a/testsuite/libffi.call/cls_align_longdouble_split2.c b/testsuite/libffi.call/cls_align_longdouble_split2.c
index ca1c356..5d3bec0 100644
--- a/testsuite/libffi.call/cls_align_longdouble_split2.c
+++ b/testsuite/libffi.call/cls_align_longdouble_split2.c
@@ -5,10 +5,8 @@
 	Originator:		Blake Chaffin	6/18/2007
 */
 
-/* { dg-excess-errors "no long double format" { xfail x86_64-*-mingw* x86_64-*-cygwin* } } */
 /* { dg-do run { xfail strongarm*-*-* } } */
 /* { dg-options -mlong-double-128 { target powerpc64*-*-linux* } } */
-/* { dg-output "" { xfail x86_64-*-mingw* x86_64-*-cygwin* } } */
 
 #include "ffitest.h"
 
diff --git a/testsuite/libffi.call/cls_longdouble.c b/testsuite/libffi.call/cls_longdouble.c
index 5dc9ac7..d24e72e 100644
--- a/testsuite/libffi.call/cls_longdouble.c
+++ b/testsuite/libffi.call/cls_longdouble.c
@@ -4,12 +4,10 @@
    PR:			none.
    Originator:	Blake Chaffin	*/
 
-/* { dg-excess-errors "no long double format" { xfail x86_64-*-mingw* x86_64-*-cygwin* } } */
 /* This test is known to PASS on armv7l-unknown-linux-gnueabihf, so I have
    remove the xfail for arm*-*-* below, until we know more.  */
 /* { dg-do run { xfail strongarm*-*-* xscale*-*-* } } */
 /* { dg-options -mlong-double-128 { target powerpc64*-*-linux* } } */
-/* { dg-output "" { xfail x86_64-*-mingw* x86_64-*-cygwin* } } */
 
 #include "ffitest.h"
 
diff --git a/testsuite/libffi.call/float2.c b/testsuite/libffi.call/float2.c
index a0b296c..aae1abf 100644
--- a/testsuite/libffi.call/float2.c
+++ b/testsuite/libffi.call/float2.c
@@ -4,9 +4,6 @@
    PR:		none.
    Originator:	From the original ffitest.c  */
 
-/* { dg-excess-errors "fails" { target x86_64-*-mingw* x86_64-*-cygwin* } } */
-/* { dg-do run { xfail x86_64-*-mingw* x86_64-*-cygwin* } } */
-
 #include "ffitest.h"
 #include "float.h"
 
diff --git a/testsuite/libffi.call/huge_struct.c b/testsuite/libffi.call/huge_struct.c
index 657fe54..187c42c 100644
--- a/testsuite/libffi.call/huge_struct.c
+++ b/testsuite/libffi.call/huge_struct.c
@@ -5,11 +5,9 @@
 	Originator:		Blake Chaffin	6/18/2007
 */
 
-/* { dg-excess-errors "" { target x86_64-*-mingw* x86_64-*-cygwin* } } */
 /* { dg-do run { xfail strongarm*-*-* xscale*-*-* } } */
 /* { dg-options -mlong-double-128 { target powerpc64*-*-linux* } } */
 /* { dg-options -Wformat=0 { target moxie*-*-elf } } */
-/* { dg-output "" { xfail x86_64-*-mingw* x86_64-*-cygwin* } } */
 
 #include "ffitest.h"
 
diff --git a/testsuite/libffi.call/return_ldl.c b/testsuite/libffi.call/return_ldl.c
index 5c2fe65..520e710 100644
--- a/testsuite/libffi.call/return_ldl.c
+++ b/testsuite/libffi.call/return_ldl.c
@@ -4,7 +4,6 @@
    PR:		none.
    Originator:	<andreast@gcc.gnu.org> 20071113  */
 
-/* { dg-do run { xfail x86_64-*-mingw* x86_64-*-cygwin* } } */
 #include "ffitest.h"
 
 static long double return_ldl(long double ldl)
-- 
1.9.3

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH 8/8] x86_64: Add support for complex types
  2014-10-28 18:32 [PATCH 0/8] Go closures for x86_64 Richard Henderson
                   ` (5 preceding siblings ...)
  2014-10-28 18:32 ` [PATCH 5/8] win64: Remove support from ffi.c Richard Henderson
@ 2014-10-28 18:32 ` Richard Henderson
  2014-10-28 18:32 ` [PATCH 6/8] x86_64: Fixups for x32 Richard Henderson
  7 siblings, 0 replies; 9+ messages in thread
From: Richard Henderson @ 2014-10-28 18:32 UTC (permalink / raw)
  To: libffi-discuss

---
 src/x86/ffi64.c      | 97 +++++++++++++++++++++++++++++++++++++++++++++-------
 src/x86/internal64.h |  6 ++--
 src/x86/unix64.S     | 63 ++++++++++++++++++----------------
 3 files changed, 122 insertions(+), 44 deletions(-)

diff --git a/src/x86/ffi64.c b/src/x86/ffi64.c
index a03061b..650f7bb 100644
--- a/src/x86/ffi64.c
+++ b/src/x86/ffi64.c
@@ -171,6 +171,7 @@ classify_argument (ffi_type *type, enum x86_64_reg_class classes[],
     case FFI_TYPE_UINT64:
     case FFI_TYPE_SINT64:
     case FFI_TYPE_POINTER:
+    do_integer:
       {
 	size_t size = byte_offset + type->size;
 
@@ -301,11 +302,42 @@ classify_argument (ffi_type *type, enum x86_64_reg_class classes[],
 	  }
 	return words;
       }
-
-    default:
-      FFI_ASSERT(0);
+    case FFI_TYPE_COMPLEX:
+      {
+	ffi_type *inner = type->elements[0];
+	switch (inner->type)
+	  {
+	  case FFI_TYPE_INT:
+	  case FFI_TYPE_UINT8:
+	  case FFI_TYPE_SINT8:
+	  case FFI_TYPE_UINT16:
+	  case FFI_TYPE_SINT16:
+	  case FFI_TYPE_UINT32:
+	  case FFI_TYPE_SINT32:
+	  case FFI_TYPE_UINT64:
+	  case FFI_TYPE_SINT64:
+	    goto do_integer;
+
+	  case FFI_TYPE_FLOAT:
+	    classes[0] = X86_64_SSE_CLASS;
+	    if (byte_offset % 8)
+	      {
+		classes[1] = X86_64_SSESF_CLASS;
+		return 2;
+	      }
+	    return 1;
+	  case FFI_TYPE_DOUBLE:
+	    classes[0] = classes[1] = X86_64_SSEDF_CLASS;
+	    return 2;
+#if FFI_TYPE_LONGDOUBLE != FFI_TYPE_DOUBLE
+	  case FFI_TYPE_LONGDOUBLE:
+	    classes[0] = X86_64_COMPLEX_X87_CLASS;
+	    return 1;
+#endif
+	  }
+      }
     }
-  return 0; /* Never reached.  */
+  abort();
 }
 
 /* Examine the argument and return set number of register required in each
@@ -360,7 +392,7 @@ ffi_prep_cif_machdep (ffi_cif *cif)
 {
   int gprcount, ssecount, i, avn, ngpr, nsse, flags;
   enum x86_64_reg_class classes[MAX_CLASSES];
-  size_t bytes, n;
+  size_t bytes, n, rtype_size;
   ffi_type *rtype;
 
   if (cif->abi != FFI_UNIX64)
@@ -369,6 +401,7 @@ ffi_prep_cif_machdep (ffi_cif *cif)
   gprcount = ssecount = 0;
 
   rtype = cif->rtype;
+  rtype_size = rtype->size;
   switch (rtype->type)
     {
     case FFI_TYPE_VOID:
@@ -421,16 +454,54 @@ ffi_prep_cif_machdep (ffi_cif *cif)
 	}
       else
 	{
-	  /* Mark which registers the result appears in.  */
 	  _Bool sse0 = SSE_CLASS_P (classes[0]);
-	  _Bool sse1 = n == 2 && SSE_CLASS_P (classes[1]);
-	  if (sse0)
-	    flags = (sse1 ? UNIX64_RET_ST_XMM0_XMM1 : UNIX64_RET_ST_XMM0_RAX);
-	  else
-	    flags = (sse1 ? UNIX64_RET_ST_RAX_XMM0 : UNIX64_RET_ST_RAX_RDX);
 
-	  /* Mark the true size of the structure.  */
-	  flags |= rtype->size << UNIX64_SIZE_SHIFT;
+	  if (rtype_size == 4 && sse0)
+	    flags = UNIX64_RET_XMM32;
+	  else if (rtype_size == 8)
+	    flags = sse0 ? UNIX64_RET_XMM64 : UNIX64_RET_INT64;
+	  else
+	    {
+	      _Bool sse1 = n == 2 && SSE_CLASS_P (classes[1]);
+	      if (sse0 && sse1)
+		flags = UNIX64_RET_ST_XMM0_XMM1;
+	      else if (sse0)
+		flags = UNIX64_RET_ST_XMM0_RAX;
+	      else if (sse1)
+		flags = UNIX64_RET_ST_RAX_XMM0;
+	      else
+		flags = UNIX64_RET_ST_RAX_RDX;
+	      flags |= rtype_size << UNIX64_SIZE_SHIFT;
+	    }
+	}
+      break;
+    case FFI_TYPE_COMPLEX:
+      switch (rtype->elements[0]->type)
+	{
+	case FFI_TYPE_UINT8:
+	case FFI_TYPE_SINT8:
+	case FFI_TYPE_UINT16:
+	case FFI_TYPE_SINT16:
+	case FFI_TYPE_INT:
+	case FFI_TYPE_UINT32:
+	case FFI_TYPE_SINT32:
+	case FFI_TYPE_UINT64:
+	case FFI_TYPE_SINT64:
+	  flags = UNIX64_RET_ST_RAX_RDX | (rtype_size << UNIX64_SIZE_SHIFT);
+	  break;
+	case FFI_TYPE_FLOAT:
+	  flags = UNIX64_RET_XMM64;
+	  break;
+	case FFI_TYPE_DOUBLE:
+	  flags = UNIX64_RET_ST_XMM0_XMM1 | (16 << UNIX64_SIZE_SHIFT);
+	  break;
+#if FFI_TYPE_LONGDOUBLE != FFI_TYPE_DOUBLE
+	case FFI_TYPE_LONGDOUBLE:
+	  flags = UNIX64_RET_X87_2;
+	  break;
+#endif
+	default:
+	  return FFI_BAD_TYPEDEF;
 	}
       break;
     default:
diff --git a/src/x86/internal64.h b/src/x86/internal64.h
index 07b1b10..512e955 100644
--- a/src/x86/internal64.h
+++ b/src/x86/internal64.h
@@ -9,11 +9,13 @@
 #define UNIX64_RET_XMM32	8
 #define UNIX64_RET_XMM64	9
 #define UNIX64_RET_X87		10
-#define UNIX64_RET_ST_RAX_RDX	11
+#define UNIX64_RET_X87_2	11
 #define UNIX64_RET_ST_XMM0_RAX	12
 #define UNIX64_RET_ST_RAX_XMM0	13
 #define UNIX64_RET_ST_XMM0_XMM1	14
-#define UNIX64_RET_LAST		14
+#define UNIX64_RET_ST_RAX_RDX	15
+
+#define UNIX64_RET_LAST		15
 
 #define UNIX64_FLAG_RET_IN_MEM	(1 << 10)
 #define UNIX64_FLAG_XMM_ARGS	(1 << 11)
diff --git a/src/x86/unix64.S b/src/x86/unix64.S
index 0151229..6066bbf 100644
--- a/src/x86/unix64.S
+++ b/src/x86/unix64.S
@@ -156,9 +156,10 @@ E UNIX64_RET_XMM64
 E UNIX64_RET_X87
 	fstpt	(%rdi)
 	ret
-E UNIX64_RET_ST_RAX_RDX
-	movq	%rdx, 8(%rsi)
-	jmp	2f
+E UNIX64_RET_X87_2
+	fstpt	(%rdi)
+	fstpt	16(%rdi)
+	ret
 E UNIX64_RET_ST_XMM0_RAX
 	movq	%rax, 8(%rsi)
 	jmp	3f
@@ -167,14 +168,15 @@ E UNIX64_RET_ST_RAX_XMM0
 	jmp	2f
 E UNIX64_RET_ST_XMM0_XMM1
 	movq	%xmm1, 8(%rsi)
-
-	.align 8
-3:	movq	%xmm0, (%rsi)
+	jmp	3f
+E UNIX64_RET_ST_RAX_RDX
+	movq	%rdx, 8(%rsi)
+2:	movq	%rax, (%rsi)
 	shrl	$UNIX64_SIZE_SHIFT, %ecx
 	rep movsb
 	ret
 	.align 8
-2:	movq	%rax, (%rsi)
+3:	movq	%xmm0, (%rsi)
 	shrl	$UNIX64_SIZE_SHIFT, %ecx
 	rep movsb
 	ret
@@ -201,11 +203,11 @@ E UNIX64_RET_ST_XMM0_XMM1
 	.size    ffi_call_unix64,.-ffi_call_unix64
 
 /* 6 general registers, 8 vector registers,
-   16 bytes of rvalue, 8 bytes of alignment.  */
+   32 bytes of rvalue, 8 bytes of alignment.  */
 #define ffi_closure_OFS_G	0
 #define ffi_closure_OFS_V	(6*8)
 #define ffi_closure_OFS_RVALUE	(ffi_closure_OFS_V + 8*16)
-#define ffi_closure_FS		(ffi_closure_OFS_RVALUE + 16 + 8)
+#define ffi_closure_FS		(ffi_closure_OFS_RVALUE + 32 + 8)
 
 /* The location of rvalue within the red zone after deallocating the frame.  */
 #define ffi_closure_RED_RVALUE	(ffi_closure_OFS_RVALUE - ffi_closure_FS)
@@ -275,6 +277,7 @@ ffi_closure_unix64:
 	leaq	0f(%rip), %r11
 	ja	9f
 	leaq	(%r11, %r10, 8), %r10
+	leaq	ffi_closure_RED_RVALUE(%rsp), %rsi
 	jmp	*%r10
 
 	.align	8
@@ -282,52 +285,54 @@ ffi_closure_unix64:
 E UNIX64_RET_VOID
 	ret
 E UNIX64_RET_UINT8
-	movzbl	ffi_closure_RED_RVALUE(%rsp), %eax
+	movzbl	(%rsi), %eax
 	ret
 E UNIX64_RET_UINT16
-	movzwl	ffi_closure_RED_RVALUE(%rsp), %eax
+	movzwl	(%rsi), %eax
 	ret
 E UNIX64_RET_UINT32
-	movl	ffi_closure_RED_RVALUE(%rsp), %eax
+	movl	(%rsi), %eax
 	ret
 E UNIX64_RET_SINT8
-	movsbl	ffi_closure_RED_RVALUE(%rsp), %eax
+	movsbl	(%rsi), %eax
 	ret
 E UNIX64_RET_SINT16
-	movswl	ffi_closure_RED_RVALUE(%rsp), %eax
+	movswl	(%rsi), %eax
 	ret
 E UNIX64_RET_SINT32
-	movl	ffi_closure_RED_RVALUE(%rsp), %eax
+	movl	(%rsi), %eax
 	ret
 E UNIX64_RET_INT64
-	movq	ffi_closure_RED_RVALUE(%rsp), %rax
+	movq	(%rsi), %rax
 	ret
 E UNIX64_RET_XMM32
-	movd	ffi_closure_RED_RVALUE(%rsp), %xmm0
+	movd	(%rsi), %xmm0
 	ret
 E UNIX64_RET_XMM64
-	movq	ffi_closure_RED_RVALUE(%rsp), %xmm0
+	movq	(%rsi), %xmm0
 	ret
 E UNIX64_RET_X87
-	fldt	ffi_closure_RED_RVALUE(%rsp)
+	fldt	(%rsi)
+	ret
+E UNIX64_RET_X87_2
+	fldt	16(%rsi)
+	fldt	(%rsi)
 	ret
-E UNIX64_RET_ST_RAX_RDX
-	movq	ffi_closure_RED_RVALUE+8(%rsp), %rdx
-	jmp	2f
 E UNIX64_RET_ST_XMM0_RAX
-	movq	ffi_closure_RED_RVALUE+8(%rsp), %rax
+	movq	8(%rsi), %rax
 	jmp	3f
 E UNIX64_RET_ST_RAX_XMM0
-	movq	ffi_closure_RED_RVALUE+8(%rsp), %xmm0
+	movq	8(%rsi), %xmm0
 	jmp	2f
 E UNIX64_RET_ST_XMM0_XMM1
-	movq	ffi_closure_RED_RVALUE+8(%rsp), %xmm1
-
-	.align	8
-3:	movq	ffi_closure_RED_RVALUE(%rsp), %xmm0
+	movq	8(%rsi), %xmm1
+	jmp	3f
+E UNIX64_RET_ST_RAX_RDX
+	movq	8(%rsi), %rdx
+2:	movq	(%rsi), %rax
 	ret
 	.align	8
-2:	movq	ffi_closure_RED_RVALUE(%rsp), %rax
+3:	movq	(%rsi), %xmm0
 	ret
 
 9:	call	abort@PLT
-- 
1.9.3

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH 7/8] x86_64: Decouple return types from FFI_TYPE constants
  2014-10-28 18:32 [PATCH 0/8] Go closures for x86_64 Richard Henderson
  2014-10-28 18:32 ` [PATCH 1/8] Add entry points for interacting with Go Richard Henderson
  2014-10-28 18:32 ` [PATCH 3/8] x86-64: Support go closures Richard Henderson
@ 2014-10-28 18:32 ` Richard Henderson
  2014-10-28 18:32 ` [PATCH 2/8] Add ffi_cfi.h Richard Henderson
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 9+ messages in thread
From: Richard Henderson @ 2014-10-28 18:32 UTC (permalink / raw)
  To: libffi-discuss

Decoupling the return codes from the FFI_TYPE constants lets us better
support structure returns, and serves as preparation for complex types.
---
 src/x86/ffi64.c      | 142 ++++++++++++++++++-------------
 src/x86/internal64.h |  20 +++++
 src/x86/unix64.S     | 236 +++++++++++++++++++++------------------------------
 3 files changed, 202 insertions(+), 196 deletions(-)
 create mode 100644 src/x86/internal64.h

diff --git a/src/x86/ffi64.c b/src/x86/ffi64.c
index 65fb595..a03061b 100644
--- a/src/x86/ffi64.c
+++ b/src/x86/ffi64.c
@@ -33,6 +33,7 @@
 #include <stdlib.h>
 #include <stdarg.h>
 #include <stdint.h>
+#include "internal64.h"
 
 #ifdef __x86_64__
 
@@ -191,7 +192,7 @@ classify_argument (ffi_type *type, enum x86_64_reg_class classes[],
 	  }
 	else if (size <= 16)
 	  {
-	    classes[0] = classes[1] = X86_64_INTEGERSI_CLASS;
+	    classes[0] = classes[1] = X86_64_INTEGER_CLASS;
 	    return 2;
 	  }
 	else
@@ -360,15 +361,55 @@ ffi_prep_cif_machdep (ffi_cif *cif)
   int gprcount, ssecount, i, avn, ngpr, nsse, flags;
   enum x86_64_reg_class classes[MAX_CLASSES];
   size_t bytes, n;
+  ffi_type *rtype;
 
   if (cif->abi != FFI_UNIX64)
     return FFI_BAD_ABI;
 
   gprcount = ssecount = 0;
 
-  flags = cif->rtype->type;
-  if (flags != FFI_TYPE_VOID)
+  rtype = cif->rtype;
+  switch (rtype->type)
     {
+    case FFI_TYPE_VOID:
+      flags = UNIX64_RET_VOID;
+      break;
+    case FFI_TYPE_UINT8:
+      flags = UNIX64_RET_UINT8;
+      break;
+    case FFI_TYPE_SINT8:
+      flags = UNIX64_RET_SINT8;
+      break;
+    case FFI_TYPE_UINT16:
+      flags = UNIX64_RET_UINT16;
+      break;
+    case FFI_TYPE_SINT16:
+      flags = UNIX64_RET_SINT16;
+      break;
+    case FFI_TYPE_UINT32:
+      flags = UNIX64_RET_UINT32;
+      break;
+    case FFI_TYPE_INT:
+    case FFI_TYPE_SINT32:
+      flags = UNIX64_RET_SINT32;
+      break;
+    case FFI_TYPE_UINT64:
+    case FFI_TYPE_SINT64:
+      flags = UNIX64_RET_INT64;
+      break;
+    case FFI_TYPE_POINTER:
+      flags = (sizeof(void *) == 4 ? UNIX64_RET_UINT32 : UNIX64_RET_INT64);
+      break;
+    case FFI_TYPE_FLOAT:
+      flags = UNIX64_RET_XMM32;
+      break;
+    case FFI_TYPE_DOUBLE:
+      flags = UNIX64_RET_XMM64;
+      break;
+    case FFI_TYPE_LONGDOUBLE:
+      flags = UNIX64_RET_X87;
+      break;
+    case FFI_TYPE_STRUCT:
       n = examine_argument (cif->rtype, classes, 1, &ngpr, &nsse);
       if (n == 0)
 	{
@@ -376,22 +417,24 @@ ffi_prep_cif_machdep (ffi_cif *cif)
 	     memory is the first argument.  Allocate a register for it.  */
 	  gprcount++;
 	  /* We don't have to do anything in asm for the return.  */
-	  flags = FFI_TYPE_VOID;
+	  flags = UNIX64_RET_VOID | UNIX64_FLAG_RET_IN_MEM;
 	}
-      else if (flags == FFI_TYPE_STRUCT)
+      else
 	{
 	  /* Mark which registers the result appears in.  */
 	  _Bool sse0 = SSE_CLASS_P (classes[0]);
 	  _Bool sse1 = n == 2 && SSE_CLASS_P (classes[1]);
-	  if (sse0 && !sse1)
-	    flags |= 1 << 8;
-	  else if (!sse0 && sse1)
-	    flags |= 1 << 9;
-	  else if (sse0 && sse1)
-	    flags |= 1 << 10;
+	  if (sse0)
+	    flags = (sse1 ? UNIX64_RET_ST_XMM0_XMM1 : UNIX64_RET_ST_XMM0_RAX);
+	  else
+	    flags = (sse1 ? UNIX64_RET_ST_RAX_XMM0 : UNIX64_RET_ST_RAX_RDX);
+
 	  /* Mark the true size of the structure.  */
-	  flags |= cif->rtype->size << 12;
+	  flags |= rtype->size << UNIX64_SIZE_SHIFT;
 	}
+      break;
+    default:
+      return FFI_BAD_TYPEDEF;
     }
 
   /* Go over all arguments and determine the way they should be passed.
@@ -418,9 +461,10 @@ ffi_prep_cif_machdep (ffi_cif *cif)
 	}
     }
   if (ssecount)
-    flags |= 1 << 11;
+    flags |= UNIX64_FLAG_XMM_ARGS;
+
   cif->flags = flags;
-  cif->bytes = (unsigned)ALIGN (bytes, 8);
+  cif->bytes = ALIGN (bytes, 8);
 
   return FFI_OK;
 }
@@ -432,20 +476,22 @@ ffi_call_int (ffi_cif *cif, void (*fn)(void), void *rvalue,
   enum x86_64_reg_class classes[MAX_CLASSES];
   char *stack, *argp;
   ffi_type **arg_types;
-  int gprcount, ssecount, ngpr, nsse, i, avn;
-  _Bool ret_in_memory;
+  int gprcount, ssecount, ngpr, nsse, i, avn, flags;
   struct register_args *reg_args;
 
   /* Can't call 32-bit mode from 64-bit mode.  */
   FFI_ASSERT (cif->abi == FFI_UNIX64);
 
   /* If the return value is a struct and we don't have a return value
-     address then we need to make one.  Note the setting of flags to
-     VOID above in ffi_prep_cif_machdep.  */
-  ret_in_memory = (cif->rtype->type == FFI_TYPE_STRUCT
-		   && (cif->flags & 0xff) == FFI_TYPE_VOID);
-  if (rvalue == NULL && ret_in_memory)
-    rvalue = alloca (cif->rtype->size);
+     address then we need to make one.  Otherwise we can ignore it.  */
+  flags = cif->flags;
+  if (rvalue == NULL)
+    {
+      if (flags & UNIX64_FLAG_RET_IN_MEM)
+	rvalue = alloca (cif->rtype->size);
+      else
+	flags = UNIX64_RET_VOID;
+    }
 
   /* Allocate the space for the arguments, plus 4 words of temp space.  */
   stack = alloca (sizeof (struct register_args) + cif->bytes + 4*8);
@@ -458,7 +504,7 @@ ffi_call_int (ffi_cif *cif, void (*fn)(void), void *rvalue,
 
   /* If the return value is passed in memory, add the pointer as the
      first integer argument.  */
-  if (ret_in_memory)
+  if (flags & UNIX64_FLAG_RET_IN_MEM)
     reg_args->gpr[gprcount++] = (unsigned long) rvalue;
 
   avn = cif->nargs;
@@ -503,17 +549,17 @@ ffi_call_int (ffi_cif *cif, void (*fn)(void), void *rvalue,
 		  switch (arg_types[i]->type)
 		    {
 		    case FFI_TYPE_SINT8:
-		      *(SINT64 *)&reg_args->gpr[gprcount] = (SINT64) *((SINT8 *) a);
+		      reg_args->gpr[gprcount] = (SINT64) *((SINT8 *) a);
 		      break;
 		    case FFI_TYPE_SINT16:
-		      *(SINT64 *)&reg_args->gpr[gprcount] = (SINT64) *((SINT16 *) a);
+		      reg_args->gpr[gprcount] = (SINT64) *((SINT16 *) a);
 		      break;
 		    case FFI_TYPE_SINT32:
-		      *(SINT64 *)&reg_args->gpr[gprcount] = (SINT64) *((SINT32 *) a);
+		      reg_args->gpr[gprcount] = (SINT64) *((SINT32 *) a);
 		      break;
 		    default:
 		      reg_args->gpr[gprcount] = 0;
-		      memcpy (&reg_args->gpr[gprcount], a, size < 8 ? size : 8);
+		      memcpy (&reg_args->gpr[gprcount], a, size);
 		    }
 		  gprcount++;
 		  break;
@@ -533,7 +579,7 @@ ffi_call_int (ffi_cif *cif, void (*fn)(void), void *rvalue,
   reg_args->rax = ssecount;
 
   ffi_call_unix64 (stack, cif->bytes + sizeof (struct register_args),
-		   cif->flags, rvalue, fn);
+		   flags, rvalue, fn);
 }
 
 void
@@ -573,7 +619,7 @@ ffi_prep_closure_loc (ffi_closure* closure,
   if (cif->abi != FFI_UNIX64)
     return FFI_BAD_ABI;
 
-  if (cif->flags & (1 << 11))
+  if (cif->flags & UNIX64_FLAG_XMM_ARGS)
     dest = ffi_closure_unix64_sse;
   else
     dest = ffi_closure_unix64;
@@ -600,39 +646,17 @@ ffi_closure_unix64_inner(ffi_cif *cif,
   ffi_type **arg_types;
   long i, avn;
   int gprcount, ssecount, ngpr, nsse;
-  int ret;
+  int flags;
 
-  avalue = alloca(cif->nargs * sizeof(void *));
+  avn = cif->nargs;
+  flags = cif->flags;
+  avalue = alloca(avn * sizeof(void *));
   gprcount = ssecount = 0;
 
-  ret = cif->rtype->type;
-  if (ret != FFI_TYPE_VOID)
-    {
-      enum x86_64_reg_class classes[MAX_CLASSES];
-      size_t n = examine_argument (cif->rtype, classes, 1, &ngpr, &nsse);
-      if (n == 0)
-	{
-	  /* The return value goes in memory.  Arrange for the closure
-	     return value to go directly back to the original caller.  */
-	  rvalue = (void *) (unsigned long) reg_args->gpr[gprcount++];
-	  /* We don't have to do anything in asm for the return.  */
-	  ret = FFI_TYPE_VOID;
-	}
-      else if (ret == FFI_TYPE_STRUCT && n == 2)
-	{
-	  /* Mark which register the second word of the structure goes in.  */
-	  _Bool sse0 = SSE_CLASS_P (classes[0]);
-	  _Bool sse1 = SSE_CLASS_P (classes[1]);
-	  if (!sse0 && sse1)
-	    ret |= 1 << 8;
-	  else if (sse0 && !sse1)
-	    ret |= 1 << 9;
-	}
-    }
+  if (flags & UNIX64_FLAG_RET_IN_MEM)
+    rvalue = (void *)(uintptr_t)reg_args->gpr[gprcount++];
 
-  avn = cif->nargs;
   arg_types = cif->arg_types;
-
   for (i = 0; i < avn; ++i)
     {
       enum x86_64_reg_class classes[MAX_CLASSES];
@@ -693,7 +717,7 @@ ffi_closure_unix64_inner(ffi_cif *cif,
   fun (cif, rvalue, avalue, user_data);
 
   /* Tell assembly how to perform return type promotions.  */
-  return ret;
+  return flags;
 }
 
 extern void ffi_go_closure_unix64(void) FFI_HIDDEN;
@@ -706,7 +730,7 @@ ffi_prep_go_closure (ffi_go_closure* closure, ffi_cif* cif,
   if (cif->abi != FFI_UNIX64)
     return FFI_BAD_ABI;
 
-  closure->tramp = (cif->flags & (1 << 11)
+  closure->tramp = (cif->flags & UNIX64_FLAG_XMM_ARGS
 		    ? ffi_go_closure_unix64_sse
 		    : ffi_go_closure_unix64);
   closure->cif = cif;
diff --git a/src/x86/internal64.h b/src/x86/internal64.h
new file mode 100644
index 0000000..07b1b10
--- /dev/null
+++ b/src/x86/internal64.h
@@ -0,0 +1,20 @@
+#define UNIX64_RET_VOID		0
+#define UNIX64_RET_UINT8	1
+#define UNIX64_RET_UINT16	2
+#define UNIX64_RET_UINT32	3
+#define UNIX64_RET_SINT8	4
+#define UNIX64_RET_SINT16	5
+#define UNIX64_RET_SINT32	6
+#define UNIX64_RET_INT64	7
+#define UNIX64_RET_XMM32	8
+#define UNIX64_RET_XMM64	9
+#define UNIX64_RET_X87		10
+#define UNIX64_RET_ST_RAX_RDX	11
+#define UNIX64_RET_ST_XMM0_RAX	12
+#define UNIX64_RET_ST_RAX_XMM0	13
+#define UNIX64_RET_ST_XMM0_XMM1	14
+#define UNIX64_RET_LAST		14
+
+#define UNIX64_FLAG_RET_IN_MEM	(1 << 10)
+#define UNIX64_FLAG_XMM_ARGS	(1 << 11)
+#define UNIX64_SIZE_SHIFT	12
diff --git a/src/x86/unix64.S b/src/x86/unix64.S
index 797b9d9..0151229 100644
--- a/src/x86/unix64.S
+++ b/src/x86/unix64.S
@@ -31,9 +31,15 @@
 #include <fficonfig.h>
 #include <ffi.h>
 #include <ffi_cfi.h>
+#include "internal64.h"
 
 	.text
 
+.macro E index
+	.align	8
+	.org	0b + \index * 8, 0x90
+.endm
+
 /* ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags,
 	            void *raddr, void (*fnaddr)(void));
 
@@ -41,7 +47,7 @@
    for this function.  This has been allocated by ffi_call.  We also
    deallocate some of the stack that has been alloca'd.  */
 
-	.align	2
+	.align	8
 	.globl	ffi_call_unix64
 	.type	ffi_call_unix64,@function
 	FFI_HIDDEN(ffi_call_unix64)
@@ -100,109 +106,81 @@ ffi_call_unix64:
 	cfi_restore(%rbp)
 
 	/* The first byte of the flags contains the FFI_TYPE.  */
+	cmpb	$UNIX64_RET_LAST, %cl
 	movzbl	%cl, %r10d
-	leaq	.Lstore_table(%rip), %r11
-	movslq	(%r11, %r10, 4), %r10
-	addq	%r11, %r10
-	jmp	*%r10
+	leaq	0f(%rip), %r11
+	ja	9f
+	leaq	(%r11, %r10, 8), %r10
 
-	.section .rodata
-	.align	2
-.Lstore_table:
-	.long	.Lst_void-.Lstore_table		/* FFI_TYPE_VOID */
-	.long	.Lst_sint32-.Lstore_table	/* FFI_TYPE_INT */
-	.long	.Lst_float-.Lstore_table	/* FFI_TYPE_FLOAT */
-	.long	.Lst_double-.Lstore_table	/* FFI_TYPE_DOUBLE */
-	.long	.Lst_ldouble-.Lstore_table	/* FFI_TYPE_LONGDOUBLE */
-	.long	.Lst_uint8-.Lstore_table	/* FFI_TYPE_UINT8 */
-	.long	.Lst_sint8-.Lstore_table	/* FFI_TYPE_SINT8 */
-	.long	.Lst_uint16-.Lstore_table	/* FFI_TYPE_UINT16 */
-	.long	.Lst_sint16-.Lstore_table	/* FFI_TYPE_SINT16 */
-	.long	.Lst_uint32-.Lstore_table	/* FFI_TYPE_UINT32 */
-	.long	.Lst_sint32-.Lstore_table	/* FFI_TYPE_SINT32 */
-	.long	.Lst_int64-.Lstore_table	/* FFI_TYPE_UINT64 */
-	.long	.Lst_int64-.Lstore_table	/* FFI_TYPE_SINT64 */
-	.long	.Lst_struct-.Lstore_table	/* FFI_TYPE_STRUCT */
-	.long	.Lst_int64-.Lstore_table	/* FFI_TYPE_POINTER */
-	.previous
+	/* Prep for the structure cases: scratch area in redzone.  */
+	leaq	-20(%rsp), %rsi
+	jmp	*%r10
 
-	.align 2
-.Lst_void:
+	.align	8
+0:
+E UNIX64_RET_VOID
 	ret
-	.align 2
-
-.Lst_uint8:
-	movzbq	%al, %rax
+E UNIX64_RET_UINT8
+	movzbl	%al, %eax
 	movq	%rax, (%rdi)
 	ret
-	.align 2
-.Lst_sint8:
-	movsbq	%al, %rax
+E UNIX64_RET_UINT16
+	movzwl	%ax, %eax
 	movq	%rax, (%rdi)
 	ret
-	.align 2
-.Lst_uint16:
-	movzwq	%ax, %rax
+E UNIX64_RET_UINT32
+	movl	%eax, %eax
 	movq	%rax, (%rdi)
-	.align 2
-.Lst_sint16:
-	movswq	%ax, %rax
+	ret
+E UNIX64_RET_SINT8
+	movsbq	%al, %rax
 	movq	%rax, (%rdi)
 	ret
-	.align 2
-.Lst_uint32:
-	movl	%eax, %eax
+E UNIX64_RET_SINT16
+	movswq	%ax, %rax
 	movq	%rax, (%rdi)
-	.align 2
-.Lst_sint32:
+	ret
+E UNIX64_RET_SINT32
 	cltq
 	movq	%rax, (%rdi)
 	ret
-	.align 2
-.Lst_int64:
+E UNIX64_RET_INT64
 	movq	%rax, (%rdi)
 	ret
-
-	.align 2
-.Lst_float:
-	movss	%xmm0, (%rdi)
+E UNIX64_RET_XMM32
+	movd	%xmm0, (%rdi)
 	ret
-	.align 2
-.Lst_double:
-	movsd	%xmm0, (%rdi)
+E UNIX64_RET_XMM64
+	movq	%xmm0, (%rdi)
 	ret
-.Lst_ldouble:
+E UNIX64_RET_X87
 	fstpt	(%rdi)
 	ret
-
-	.align 2
-.Lst_struct:
-	leaq	-20(%rsp), %rsi		/* Scratch area in redzone.  */
-
-	/* We have to locate the values now, and since we don't want to
-	   write too much data into the user's return value, we spill the
-	   value to a 16 byte scratch area first.  Bits 8, 9, and 10
-	   control where the values are located.  Only one of the three
-	   bits will be set; see ffi_prep_cif_machdep for the pattern.  */
-	movd	%xmm0, %r10
-	movd	%xmm1, %r11
-	testl	$0x100, %ecx
-	cmovnz	%rax, %rdx
-	cmovnz	%r10, %rax
-	testl	$0x200, %ecx
-	cmovnz	%r10, %rdx
-	testl	$0x400, %ecx
-	cmovnz	%r10, %rax
-	cmovnz	%r11, %rdx
-	movq	%rax, (%rsi)
+E UNIX64_RET_ST_RAX_RDX
 	movq	%rdx, 8(%rsi)
-
-	/* Bits 12-31 contain the true size of the structure.  Copy from
-	   the scratch area to the true destination.  */
-	shrl	$12, %ecx
+	jmp	2f
+E UNIX64_RET_ST_XMM0_RAX
+	movq	%rax, 8(%rsi)
+	jmp	3f
+E UNIX64_RET_ST_RAX_XMM0
+	movq	%xmm0, 8(%rsi)
+	jmp	2f
+E UNIX64_RET_ST_XMM0_XMM1
+	movq	%xmm1, 8(%rsi)
+
+	.align 8
+3:	movq	%xmm0, (%rsi)
+	shrl	$UNIX64_SIZE_SHIFT, %ecx
+	rep movsb
+	ret
+	.align 8
+2:	movq	%rax, (%rsi)
+	shrl	$UNIX64_SIZE_SHIFT, %ecx
 	rep movsb
 	ret
 
+9:	call	abort@PLT
+
 	/* Many times we can avoid loading any SSE registers at all.
 	   It's not worth an indirect jump to load the exact set of
 	   SSE registers needed; zero or all is a good compromise.  */
@@ -292,84 +270,68 @@ ffi_closure_unix64:
 	cfi_adjust_cfa_offset(-ffi_closure_FS)
 
 	/* The first byte of the return value contains the FFI_TYPE.  */
+	cmpb	$UNIX64_RET_LAST, %al
 	movzbl	%al, %r10d
-	leaq	.Lload_table(%rip), %r11
-	movslq	(%r11, %r10, 4), %r10
-	addq	%r11, %r10
+	leaq	0f(%rip), %r11
+	ja	9f
+	leaq	(%r11, %r10, 8), %r10
 	jmp	*%r10
 
-	.section .rodata
-	.align	2
-.Lload_table:
-	.long	.Lld_void-.Lload_table		/* FFI_TYPE_VOID */
-	.long	.Lld_int32-.Lload_table		/* FFI_TYPE_INT */
-	.long	.Lld_float-.Lload_table		/* FFI_TYPE_FLOAT */
-	.long	.Lld_double-.Lload_table	/* FFI_TYPE_DOUBLE */
-	.long	.Lld_ldouble-.Lload_table	/* FFI_TYPE_LONGDOUBLE */
-	.long	.Lld_int8-.Lload_table		/* FFI_TYPE_UINT8 */
-	.long	.Lld_int8-.Lload_table		/* FFI_TYPE_SINT8 */
-	.long	.Lld_int16-.Lload_table		/* FFI_TYPE_UINT16 */
-	.long	.Lld_int16-.Lload_table		/* FFI_TYPE_SINT16 */
-	.long	.Lld_int32-.Lload_table		/* FFI_TYPE_UINT32 */
-	.long	.Lld_int32-.Lload_table		/* FFI_TYPE_SINT32 */
-	.long	.Lld_int64-.Lload_table		/* FFI_TYPE_UINT64 */
-	.long	.Lld_int64-.Lload_table		/* FFI_TYPE_SINT64 */
-	.long	.Lld_struct-.Lload_table	/* FFI_TYPE_STRUCT */
-	.long	.Lld_int64-.Lload_table		/* FFI_TYPE_POINTER */
-	.previous
-
-	.align 2
-.Lld_void:
+	.align	8
+0:
+E UNIX64_RET_VOID
 	ret
-
-	.align 2
-.Lld_int8:
+E UNIX64_RET_UINT8
 	movzbl	ffi_closure_RED_RVALUE(%rsp), %eax
 	ret
-	.align 2
-.Lld_int16:
+E UNIX64_RET_UINT16
 	movzwl	ffi_closure_RED_RVALUE(%rsp), %eax
 	ret
-	.align 2
-.Lld_int32:
+E UNIX64_RET_UINT32
 	movl	ffi_closure_RED_RVALUE(%rsp), %eax
 	ret
-	.align 2
-.Lld_int64:
+E UNIX64_RET_SINT8
+	movsbl	ffi_closure_RED_RVALUE(%rsp), %eax
+	ret
+E UNIX64_RET_SINT16
+	movswl	ffi_closure_RED_RVALUE(%rsp), %eax
+	ret
+E UNIX64_RET_SINT32
+	movl	ffi_closure_RED_RVALUE(%rsp), %eax
+	ret
+E UNIX64_RET_INT64
 	movq	ffi_closure_RED_RVALUE(%rsp), %rax
 	ret
-
-	.align 2
-.Lld_float:
-	movss	ffi_closure_RED_RVALUE(%rsp), %xmm0
+E UNIX64_RET_XMM32
+	movd	ffi_closure_RED_RVALUE(%rsp), %xmm0
 	ret
-	.align 2
-.Lld_double:
-	movsd	ffi_closure_RED_RVALUE(%rsp), %xmm0
+E UNIX64_RET_XMM64
+	movq	ffi_closure_RED_RVALUE(%rsp), %xmm0
 	ret
-	.align 2
-.Lld_ldouble:
+E UNIX64_RET_X87
 	fldt	ffi_closure_RED_RVALUE(%rsp)
 	ret
-
-	.align 2
-.Lld_struct:
-	/* There are four possibilities here, %rax/%rdx, %xmm0/%rax,
-	   %rax/%xmm0, %xmm0/%xmm1.  We collapse two by always loading
-	   both rdx and xmm1 with the second word.  For the remaining,
-	   bit 8 set means xmm0 gets the second word, and bit 9 means
-	   that rax gets the second word.  */
-	movq	ffi_closure_RED_RVALUE(%rsp), %rcx
+E UNIX64_RET_ST_RAX_RDX
 	movq	ffi_closure_RED_RVALUE+8(%rsp), %rdx
+	jmp	2f
+E UNIX64_RET_ST_XMM0_RAX
+	movq	ffi_closure_RED_RVALUE+8(%rsp), %rax
+	jmp	3f
+E UNIX64_RET_ST_RAX_XMM0
+	movq	ffi_closure_RED_RVALUE+8(%rsp), %xmm0
+	jmp	2f
+E UNIX64_RET_ST_XMM0_XMM1
 	movq	ffi_closure_RED_RVALUE+8(%rsp), %xmm1
-	testl	$0x100, %eax
-	cmovnz	%rdx, %rcx
-	movd	%rcx, %xmm0
-	testl	$0x200, %eax
-	movq	ffi_closure_RED_RVALUE(%rsp), %rax
-	cmovnz	%rdx, %rax
+
+	.align	8
+3:	movq	ffi_closure_RED_RVALUE(%rsp), %xmm0
+	ret
+	.align	8
+2:	movq	ffi_closure_RED_RVALUE(%rsp), %rax
 	ret
 
+9:	call	abort@PLT
+
 	cfi_endproc
 	.size	ffi_closure_unix64,.-ffi_closure_unix64
 
-- 
1.9.3

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH 5/8] win64: Remove support from ffi.c
  2014-10-28 18:32 [PATCH 0/8] Go closures for x86_64 Richard Henderson
                   ` (4 preceding siblings ...)
  2014-10-28 18:32 ` [PATCH 4/8] win64: Rewrite Richard Henderson
@ 2014-10-28 18:32 ` Richard Henderson
  2014-10-28 18:32 ` [PATCH 8/8] x86_64: Add support for complex types Richard Henderson
  2014-10-28 18:32 ` [PATCH 6/8] x86_64: Fixups for x32 Richard Henderson
  7 siblings, 0 replies; 9+ messages in thread
From: Richard Henderson @ 2014-10-28 18:32 UTC (permalink / raw)
  To: libffi-discuss; +Cc: Kai Tietz

This file is now 32-bit only.

Cc: Kai Tietz <ktietz@redhat.com>
---
 src/x86/ffi.c | 212 +++-------------------------------------------------------
 1 file changed, 8 insertions(+), 204 deletions(-)

diff --git a/src/x86/ffi.c b/src/x86/ffi.c
index 006c95d..c387fb5 100644
--- a/src/x86/ffi.c
+++ b/src/x86/ffi.c
@@ -28,18 +28,12 @@
    DEALINGS IN THE SOFTWARE.
    ----------------------------------------------------------------------- */
 
-#if !defined(__x86_64__) || defined(_WIN64) || defined(__CYGWIN__)
-
-#ifdef _WIN64
-#include <windows.h>
-#endif
+#ifndef __x86_64__
 
 #include <ffi.h>
 #include <ffi_common.h>
-
 #include <stdlib.h>
 
-
 /* ffi_prep_args is called by the assembly routine once stack space
    has been allocated for the function's arguments */
 
@@ -50,26 +44,17 @@ unsigned int ffi_prep_args(char *stack, extended_cif *ecif)
   register void **p_argv;
   register char *argp;
   register ffi_type **p_arg;
-#ifndef X86_WIN64
   const int cabi = ecif->cif->abi;
   const int dir = (cabi == FFI_PASCAL || cabi == FFI_REGISTER) ? -1 : +1;
   unsigned int stack_args_count = 0;
   void *p_stack_data[3];
   char *argp2 = stack;
-#else
-  #define dir 1
-#endif
 
   argp = stack;
 
   if ((ecif->cif->flags == FFI_TYPE_STRUCT
-       || ecif->cif->flags == FFI_TYPE_MS_STRUCT)
-#ifdef X86_WIN64
-      && ((ecif->cif->rtype->size & (1 | 2 | 4 | 8)) == 0)
-#endif
-      )
+       || ecif->cif->flags == FFI_TYPE_MS_STRUCT))
     {
-#ifndef X86_WIN64
       /* For fastcall/thiscall/register this is first register-passed
          argument.  */
       if (cabi == FFI_THISCALL || cabi == FFI_FASTCALL || cabi == FFI_REGISTER)
@@ -77,7 +62,6 @@ unsigned int ffi_prep_args(char *stack, extended_cif *ecif)
           p_stack_data[stack_args_count] = argp;
           ++stack_args_count;
         }
-#endif
 
       *(void **) argp = ecif->rvalue;
       argp += sizeof(void*);
@@ -105,24 +89,6 @@ unsigned int ffi_prep_args(char *stack, extended_cif *ecif)
 
       size_t z = (*p_arg)->size;
 
-#ifdef X86_WIN64
-      if (z > FFI_SIZEOF_ARG
-          || ((*p_arg)->type == FFI_TYPE_STRUCT
-              && (z & (1 | 2 | 4 | 8)) == 0)
-#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
-          || ((*p_arg)->type == FFI_TYPE_LONGDOUBLE)
-#endif
-          )
-        {
-          z = FFI_SIZEOF_ARG;
-          *(void **)argp = *p_argv;
-        }
-      else if ((*p_arg)->type == FFI_TYPE_FLOAT)
-        {
-          memcpy(argp, *p_argv, z);
-        }
-      else
-#endif
       if (z < FFI_SIZEOF_ARG)
         {
           z = FFI_SIZEOF_ARG;
@@ -165,7 +131,6 @@ unsigned int ffi_prep_args(char *stack, extended_cif *ecif)
           memcpy(argp, *p_argv, z);
         }
 
-#ifndef X86_WIN64
     /* For thiscall/fastcall/register convention register-passed arguments
        are the first two none-floating-point arguments with a size
        smaller or equal to sizeof (void*).  */
@@ -188,18 +153,13 @@ unsigned int ffi_prep_args(char *stack, extended_cif *ecif)
         p_stack_data[stack_args_count] = argp;
         ++stack_args_count;
       }
-#endif
 
-#ifdef X86_WIN64
-      argp += (z + sizeof(void*) - 1) & ~(sizeof(void*) - 1);
-#else
       argp += z;
-#endif
     }
 
-#ifndef X86_WIN64
-  /* We need to move the register-passed arguments for thiscall/fastcall/register
-     on top of stack, so that those can be moved to registers by call-handler.  */
+  /* We need to move the register-passed arguments for thiscall,
+     fastcall, register on top of stack, so that those can be moved
+     to registers by call-handler.  */
   if (stack_args_count > 0)
     {
       if (dir < 0 && stack_args_count > 1)
@@ -225,7 +185,6 @@ unsigned int ffi_prep_args(char *stack, extended_cif *ecif)
     }
 
     return stack_args_count;
-#endif
     return 0;
 }
 
@@ -243,25 +202,16 @@ ffi_status ffi_prep_cif_machdep(ffi_cif *cif)
     case FFI_TYPE_UINT16:
     case FFI_TYPE_SINT8:
     case FFI_TYPE_SINT16:
-#ifdef X86_WIN64
-    case FFI_TYPE_UINT32:
-    case FFI_TYPE_SINT32:
-#endif
     case FFI_TYPE_SINT64:
     case FFI_TYPE_FLOAT:
     case FFI_TYPE_DOUBLE:
-#ifndef X86_WIN64
 #if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
     case FFI_TYPE_LONGDOUBLE:
 #endif
-#endif
       cif->flags = (unsigned) cif->rtype->type;
       break;
 
     case FFI_TYPE_UINT64:
-#ifdef X86_WIN64
-    case FFI_TYPE_POINTER:
-#endif
       cif->flags = FFI_TYPE_SINT64;
       break;
 
@@ -277,11 +227,7 @@ ffi_status ffi_prep_cif_machdep(ffi_cif *cif)
         }
       else if (cif->rtype->size == 4)
         {
-#ifdef X86_WIN64
-          cif->flags = FFI_TYPE_SMALL_STRUCT_4B;
-#else
           cif->flags = FFI_TYPE_INT; /* same as int type */
-#endif
         }
       else if (cif->rtype->size == 8)
         {
@@ -302,14 +248,7 @@ ffi_status ffi_prep_cif_machdep(ffi_cif *cif)
       break;
 
     default:
-#ifdef X86_WIN64
-      cif->flags = FFI_TYPE_SINT64;
-      break;
-    case FFI_TYPE_INT:
-      cif->flags = FFI_TYPE_SINT32;
-#else
       cif->flags = FFI_TYPE_INT;
-#endif
       break;
     }
 
@@ -320,32 +259,19 @@ ffi_status ffi_prep_cif_machdep(ffi_cif *cif)
       cif->bytes += (unsigned)ALIGN((*ptr)->size, FFI_SIZEOF_ARG);
     }
 
-#ifdef X86_WIN64
-  /* ensure space for storing four registers */
-  cif->bytes += 4 * FFI_SIZEOF_ARG;
-#endif
-
 #ifndef X86_WIN32
-#ifndef X86_WIN64
   if (cif->abi == FFI_SYSV || cif->abi == FFI_UNIX64)
-#endif
     cif->bytes = (cif->bytes + 15) & ~0xF;
 #endif
 
   return FFI_OK;
 }
 
-#ifdef X86_WIN64
-extern int
-ffi_call_win64(unsigned int (*)(char *, extended_cif *), extended_cif *,
-               unsigned, unsigned, unsigned *, void (*fn)(void));
-#else
 extern void
 ffi_call_win32(unsigned int (*)(char *, extended_cif *), extended_cif *,
                unsigned, unsigned, unsigned, unsigned *, void (*fn)(void));
 extern void ffi_call_SYSV(void (*)(char *, extended_cif *), extended_cif *,
                           unsigned, unsigned, unsigned *, void (*fn)(void));
-#endif
 
 void ffi_call(ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
 {
@@ -357,33 +283,18 @@ void ffi_call(ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
   /* If the return value is a struct and we don't have a return */
   /* value address then we need to make one                     */
 
-#ifdef X86_WIN64
-  if (rvalue == NULL
-      && cif->flags == FFI_TYPE_STRUCT
-      && ((cif->rtype->size & (1 | 2 | 4 | 8)) == 0))
-    {
-      ecif.rvalue = alloca((cif->rtype->size + 0xF) & ~0xF);
-    }
-#else
   if (rvalue == NULL
       && (cif->flags == FFI_TYPE_STRUCT
           || cif->flags == FFI_TYPE_MS_STRUCT))
     {
       ecif.rvalue = alloca(cif->rtype->size);
     }
-#endif
   else
     ecif.rvalue = rvalue;
     
   
   switch (cif->abi) 
     {
-#ifdef X86_WIN64
-    case FFI_WIN64:
-      ffi_call_win64(ffi_prep_args, &ecif, cif->bytes,
-                     cif->flags, ecif.rvalue, fn);
-      break;
-#else
 #ifndef X86_WIN32
     case FFI_SYSV:
       ffi_call_SYSV(ffi_prep_args, &ecif, cif->bytes, cif->flags, ecif.rvalue,
@@ -401,7 +312,6 @@ void ffi_call(ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
       ffi_call_win32(ffi_prep_args, &ecif, cif->abi, cif->bytes, cif->flags,
                      ecif.rvalue, fn);
       break;
-#endif
     default:
       FFI_ASSERT(0);
       break;
@@ -427,47 +337,13 @@ void FFI_HIDDEN ffi_closure_raw_SYSV (ffi_raw_closure *)
 void FFI_HIDDEN ffi_closure_raw_THISCALL (ffi_raw_closure *)
      __attribute__ ((regparm(1)));
 #endif
-#ifndef X86_WIN64
 void FFI_HIDDEN ffi_closure_STDCALL (ffi_closure *);
 void FFI_HIDDEN ffi_closure_THISCALL (ffi_closure *);
 void FFI_HIDDEN ffi_closure_FASTCALL (ffi_closure *);
 void FFI_HIDDEN ffi_closure_REGISTER (ffi_closure *);
-#else
-void FFI_HIDDEN ffi_closure_win64 (ffi_closure *);
-#endif
 
 /* This function is jumped to by the trampoline */
 
-#ifdef X86_WIN64
-void * FFI_HIDDEN
-ffi_closure_win64_inner (ffi_closure *closure, void *args) {
-  ffi_cif       *cif;
-  void         **arg_area;
-  void          *result;
-  void          *resp = &result;
-
-  cif         = closure->cif;
-  arg_area    = (void**) alloca (cif->nargs * sizeof (void*));  
-
-  /* this call will initialize ARG_AREA, such that each
-   * element in that array points to the corresponding 
-   * value on the stack; and if the function returns
-   * a structure, it will change RESP to point to the
-   * structure return address.  */
-
-  ffi_prep_incoming_args(args, &resp, arg_area, cif);
-  
-  (closure->fun) (cif, resp, arg_area, closure->user_data);
-
-  /* The result is returned in rax.  This does the right thing for
-     result types except for floats; we have to 'mov xmm0, rax' in the
-     caller to correct this.
-     TODO: structure sizes of 3 5 6 7 are returned by reference, too!!!
-  */
-  return cif->rtype->size > sizeof(void *) ? resp : *(void **)resp;
-}
-
-#else
 unsigned int FFI_HIDDEN __attribute__ ((regparm(1)))
 ffi_closure_SYSV_inner (ffi_closure *closure, void **respp, void *args)
 {
@@ -514,7 +390,6 @@ ffi_closure_WIN32_inner (ffi_closure *closure, void **respp, void *args)
 
   return ret;
 }
-#endif /* !X86_WIN64 */
 
 static unsigned int
 ffi_prep_incoming_args(char *stack, void **rvalue, void **avalue,
@@ -524,7 +399,6 @@ ffi_prep_incoming_args(char *stack, void **rvalue, void **avalue,
   register void **p_argv;
   register char *argp;
   register ffi_type **p_arg;
-#ifndef X86_WIN64
   const int cabi = cif->abi;
   const int dir = (cabi == FFI_PASCAL || cabi == FFI_REGISTER) ? -1 : +1;
   const unsigned int max_stack_count = (cabi == FFI_THISCALL) ? 1
@@ -533,37 +407,25 @@ ffi_prep_incoming_args(char *stack, void **rvalue, void **avalue,
                                      : 0;
   unsigned int passed_regs = 0;
   void *p_stack_data[3] = { stack - 1 };
-#else
-  #define dir 1
-#endif
 
   argp = stack;
-#ifndef X86_WIN64
   argp += max_stack_count * FFI_SIZEOF_ARG;
-#endif
 
   if ((cif->flags == FFI_TYPE_STRUCT
-       || cif->flags == FFI_TYPE_MS_STRUCT)
-#ifdef X86_WIN64
-      && ((cif->rtype->size & (1 | 2 | 4 | 8)) == 0)
-#endif
-      )
+       || cif->flags == FFI_TYPE_MS_STRUCT))
     {
-#ifndef X86_WIN64
       if (passed_regs < max_stack_count)
         {
           *rvalue = *(void**) (stack + (passed_regs*FFI_SIZEOF_ARG));
           ++passed_regs;
         }
       else
-#endif
         {
           *rvalue = *(void **) argp;
           argp += sizeof(void *);
         }
     }
 
-#ifndef X86_WIN64
   /* Do register arguments first  */
   for (i = 0, p_arg = cif->arg_types; 
        i < cif->nargs && passed_regs < max_stack_count;
@@ -581,7 +443,6 @@ ffi_prep_incoming_args(char *stack, void **rvalue, void **avalue,
       avalue[i] = stack + (passed_regs*FFI_SIZEOF_ARG);
       ++passed_regs;
     }
-#endif
 
   p_arg = cif->arg_types;
   p_argv = avalue;
@@ -605,20 +466,6 @@ ffi_prep_incoming_args(char *stack, void **rvalue, void **avalue,
 
       size_t z = (*p_arg)->size;
 
-#ifdef X86_WIN64
-      if (z > FFI_SIZEOF_ARG
-          || ((*p_arg)->type == FFI_TYPE_STRUCT
-              && (z & (1 | 2 | 4 | 8)) == 0)
-#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
-          || ((*p_arg)->type == FFI_TYPE_LONGDOUBLE)
-#endif
-          )
-        {
-          z = FFI_SIZEOF_ARG;
-          *p_argv = *(void **)argp;
-        }
-      else
-#else
       if (passed_regs > 0
           && z <= FFI_SIZEOF_ARG
           && (p_argv == p_stack_data[0]
@@ -629,40 +476,17 @@ ffi_prep_incoming_args(char *stack, void **rvalue, void **avalue,
           continue;
         }
       else
-#endif
         {
           /* because we're little endian, this is what it turns into.   */
           *p_argv = (void*) argp;
         }
 
-#ifdef X86_WIN64
-      argp += (z + sizeof(void*) - 1) & ~(sizeof(void*) - 1);
-#else
       argp += z;
-#endif
     }
 
   return (size_t)argp - (size_t)stack;
 }
 
-#define FFI_INIT_TRAMPOLINE_WIN64(TRAMP,FUN,CTX,MASK) \
-{ unsigned char *__tramp = (unsigned char*)(TRAMP); \
-   void*  __fun = (void*)(FUN); \
-   void*  __ctx = (void*)(CTX); \
-   *(unsigned char*) &__tramp[0] = 0x41; \
-   *(unsigned char*) &__tramp[1] = 0xbb; \
-   *(unsigned int*) &__tramp[2] = MASK; /* mov $mask, %r11 */ \
-   *(unsigned char*) &__tramp[6] = 0x48; \
-   *(unsigned char*) &__tramp[7] = 0xb8; \
-   *(void**) &__tramp[8] = __ctx; /* mov __ctx, %rax */ \
-   *(unsigned char *)  &__tramp[16] = 0x49; \
-   *(unsigned char *)  &__tramp[17] = 0xba; \
-   *(void**) &__tramp[18] = __fun; /* mov __fun, %r10 */ \
-   *(unsigned char *)  &__tramp[26] = 0x41; \
-   *(unsigned char *)  &__tramp[27] = 0xff; \
-   *(unsigned char *)  &__tramp[28] = 0xe2; /* jmp %r10 */ \
- }
-
 /* How to make a trampoline.  Derived from gcc/config/i386/i386.c. */
 
 #define FFI_INIT_TRAMPOLINE(TRAMP,FUN,CTX) \
@@ -723,18 +547,6 @@ ffi_prep_closure_loc (ffi_closure* closure,
                       void *user_data,
                       void *codeloc)
 {
-#ifdef X86_WIN64
-#define ISFLOAT(IDX) (cif->arg_types[IDX]->type == FFI_TYPE_FLOAT || cif->arg_types[IDX]->type == FFI_TYPE_DOUBLE)
-#define FLAG(IDX) (cif->nargs>(IDX)&&ISFLOAT(IDX)?(1<<(IDX)):0)
-  if (cif->abi == FFI_WIN64) 
-    {
-      int mask = FLAG(0)|FLAG(1)|FLAG(2)|FLAG(3);
-      FFI_INIT_TRAMPOLINE_WIN64 (&closure->tramp[0],
-                                 &ffi_closure_win64,
-                                 codeloc, mask);
-      /* make sure we can execute here */
-    }
-#else
   if (cif->abi == FFI_SYSV)
     {
       FFI_INIT_TRAMPOLINE (&closure->tramp[0],
@@ -773,7 +585,6 @@ ffi_prep_closure_loc (ffi_closure* closure,
                            (void*)codeloc);
     }
 #endif /* X86_WIN32 */
-#endif /* !X86_WIN64 */
   else
     {
       return FFI_BAD_ABI;
@@ -843,7 +654,6 @@ ffi_prep_args_raw(char *stack, extended_cif *ecif)
   const ffi_cif *cif = ecif->cif;
   unsigned int i, passed_regs = 0;
   
-#ifndef X86_WIN64
   const unsigned int abi = cif->abi;
   const unsigned int max_regs = (abi == FFI_THISCALL) ? 1
                               : (abi == FFI_FASTCALL) ? 2
@@ -865,7 +675,6 @@ ffi_prep_args_raw(char *stack, extended_cif *ecif)
 
       ++passed_regs;
     }
-#endif
 
   memcpy (stack, ecif->avalue, cif->bytes);
   return passed_regs;
@@ -909,7 +718,6 @@ ffi_raw_call(ffi_cif *cif, void (*fn)(void), void *rvalue, ffi_raw *fake_avalue)
     case FFI_SYSV:
     case FFI_MS_CDECL:
 #endif
-#ifndef X86_WIN64
     case FFI_STDCALL:
     case FFI_THISCALL:
     case FFI_FASTCALL:
@@ -918,14 +726,10 @@ ffi_raw_call(ffi_cif *cif, void (*fn)(void), void *rvalue, ffi_raw *fake_avalue)
       ffi_call_win32(ffi_prep_args_raw, &ecif, cif->abi, cif->bytes, cif->flags,
                      ecif.rvalue, fn);
       break;
-#endif
     default:
       FFI_ASSERT(0);
       break;
     }
 }
-
-#endif
-
-#endif /* !__x86_64__  || X86_WIN64 */
-
+#endif /* !FFI_NO_RAW_API */
+#endif /* !__x86_64__ */
-- 
1.9.3

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH 6/8] x86_64: Fixups for x32
  2014-10-28 18:32 [PATCH 0/8] Go closures for x86_64 Richard Henderson
                   ` (6 preceding siblings ...)
  2014-10-28 18:32 ` [PATCH 8/8] x86_64: Add support for complex types Richard Henderson
@ 2014-10-28 18:32 ` Richard Henderson
  7 siblings, 0 replies; 9+ messages in thread
From: Richard Henderson @ 2014-10-28 18:32 UTC (permalink / raw)
  To: libffi-discuss

---
 src/x86/ffi64.c  |  5 +++--
 src/x86/unix64.S | 20 ++++++++++++++++----
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/src/x86/ffi64.c b/src/x86/ffi64.c
index 384a93a..65fb595 100644
--- a/src/x86/ffi64.c
+++ b/src/x86/ffi64.c
@@ -568,6 +568,7 @@ ffi_prep_closure_loc (ffi_closure* closure,
     0x0f, 0x1f, 0x00
   };
   void (*dest)(void);
+  char *tramp = closure->tramp;
 
   if (cif->abi != FFI_UNIX64)
     return FFI_BAD_ABI;
@@ -577,8 +578,8 @@ ffi_prep_closure_loc (ffi_closure* closure,
   else
     dest = ffi_closure_unix64;
 
-  memcpy (closure->tramp, trampoline, sizeof(trampoline));
-  *(UINT64 *)(closure->tramp + 16) = (uintptr_t)dest;
+  memcpy (tramp, trampoline, sizeof(trampoline));
+  *(UINT64 *)(tramp + 16) = (uintptr_t)dest;
 
   closure->cif = cif;
   closure->fun = fun;
diff --git a/src/x86/unix64.S b/src/x86/unix64.S
index 134cb3d..797b9d9 100644
--- a/src/x86/unix64.S
+++ b/src/x86/unix64.S
@@ -32,7 +32,7 @@
 #include <ffi.h>
 #include <ffi_cfi.h>
 
-.text
+	.text
 
 /* ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags,
 	            void *raddr, void (*fnaddr)(void));
@@ -272,9 +272,15 @@ ffi_closure_unix64:
 	movq    %r8,  ffi_closure_OFS_G+0x20(%rsp)
 	movq    %r9,  ffi_closure_OFS_G+0x28(%rsp)
 
-	movq	24(%r10), %rdi				/* Load cif */
-	movq	32(%r10), %rsi				/* Load fun */
-	movq	40(%r10), %rdx				/* Load user_data */
+#ifdef __ILP32__
+	movl	FFI_TRAMPOLINE_SIZE(%r10), %edi		/* Load cif */
+	movl	FFI_TRAMPOLINE_SIZE+4(%r10), %esi	/* Load fun */
+	movl	FFI_TRAMPOLINE_SIZE+8(%r10), %edx	/* Load user_data */
+#else
+	movq	FFI_TRAMPOLINE_SIZE(%r10), %rdi		/* Load cif */
+	movq	FFI_TRAMPOLINE_SIZE+8(%r10), %rsi	/* Load fun */
+	movq	FFI_TRAMPOLINE_SIZE+16(%r10), %rdx	/* Load user_data */
+#endif
 .Ldo_closure:
 	leaq	ffi_closure_OFS_RVALUE(%rsp), %rcx	/* Load rvalue */
 	movq	%rsp, %r8				/* Load reg_args */
@@ -407,9 +413,15 @@ ffi_go_closure_unix64:
 	movq    %r8,  ffi_closure_OFS_G+0x20(%rsp)
 	movq    %r9,  ffi_closure_OFS_G+0x28(%rsp)
 
+#ifdef __ILP32__
+	movl	4(%r10), %edi		/* Load cif */
+	movl	8(%r10), %esi		/* Load fun */
+	movl	%r10d, %edx		/* Load closure (user_data) */
+#else
 	movq	8(%r10), %rdi		/* Load cif */
 	movq	16(%r10), %rsi		/* Load fun */
 	movq	%r10, %rdx		/* Load closure (user_data) */
+#endif
 	jmp	.Ldo_closure
 
 	cfi_endproc
-- 
1.9.3

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2014-10-28 18:32 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-10-28 18:32 [PATCH 0/8] Go closures for x86_64 Richard Henderson
2014-10-28 18:32 ` [PATCH 1/8] Add entry points for interacting with Go Richard Henderson
2014-10-28 18:32 ` [PATCH 3/8] x86-64: Support go closures Richard Henderson
2014-10-28 18:32 ` [PATCH 7/8] x86_64: Decouple return types from FFI_TYPE constants Richard Henderson
2014-10-28 18:32 ` [PATCH 2/8] Add ffi_cfi.h Richard Henderson
2014-10-28 18:32 ` [PATCH 4/8] win64: Rewrite Richard Henderson
2014-10-28 18:32 ` [PATCH 5/8] win64: Remove support from ffi.c Richard Henderson
2014-10-28 18:32 ` [PATCH 8/8] x86_64: Add support for complex types Richard Henderson
2014-10-28 18:32 ` [PATCH 6/8] x86_64: Fixups for x32 Richard Henderson

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).