public inbox for libffi-discuss@sourceware.org
 help / color / mirror / Atom feed
* [PATCH 09/13] libgo: Remove __go_get/set_closure
  2014-10-10 20:43 [PATCH 00/13] Go closures, libffi, and the static chain Richard Henderson
                   ` (8 preceding siblings ...)
  2014-10-10 20:43 ` [PATCH 01/13] Make TARGET_STATIC_CHAIN allow a function type Richard Henderson
@ 2014-10-10 20:43 ` Richard Henderson
  2014-10-10 20:43 ` [PATCH 08/13] libgo: Use the new libffi interfaces for Go Richard Henderson
                   ` (5 subsequent siblings)
  15 siblings, 0 replies; 43+ messages in thread
From: Richard Henderson @ 2014-10-10 20:43 UTC (permalink / raw)
  To: gcc-patches; +Cc: libffi-discuss, gofrontend-dev

---
 libgo/runtime/proc.c    | 20 --------------------
 libgo/runtime/runtime.h |  4 ----
 2 files changed, 24 deletions(-)

diff --git a/libgo/runtime/proc.c b/libgo/runtime/proc.c
index 87cd3ed..e52d37c 100644
--- a/libgo/runtime/proc.c
+++ b/libgo/runtime/proc.c
@@ -3370,26 +3370,6 @@ runtime_proc_scan(struct Workbuf** wbufp, void (*enqueue1)(struct Workbuf**, Obj
 	enqueue1(wbufp, (Obj){(byte*)&runtime_sched, sizeof runtime_sched, 0});
 }
 
-// When a function calls a closure, it passes the closure value to
-// __go_set_closure immediately before the function call.  When a
-// function uses a closure, it calls __go_get_closure immediately on
-// function entry.  This is a hack, but it will work on any system.
-// It would be better to use the static chain register when there is
-// one.  It is also worth considering expanding these functions
-// directly in the compiler.
-
-void
-__go_set_closure(void* v)
-{
-	g->closure = v;
-}
-
-void *
-__go_get_closure(void)
-{
-	return g->closure;
-}
-
 // Return whether we are waiting for a GC.  This gc toolchain uses
 // preemption instead.
 bool
diff --git a/libgo/runtime/runtime.h b/libgo/runtime/runtime.h
index c96290a..5a9bc71 100644
--- a/libgo/runtime/runtime.h
+++ b/libgo/runtime/runtime.h
@@ -195,7 +195,6 @@ struct	Location
 
 struct	G
 {
-	void*	closure;	// Closure value.
 	Defer*	defer;
 	Panic*	panic;
 	void*	exception;	// current exception being thrown
@@ -834,9 +833,6 @@ int32 getproccount(void);
 
 #define PREFETCH(p) __builtin_prefetch(p)
 
-void	__go_set_closure(void*);
-void*	__go_get_closure(void);
-
 bool	runtime_gcwaiting(void);
 void	runtime_badsignal(int);
 Defer*	runtime_newdefer(void);
-- 
1.9.3

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 08/13] libgo: Use the new libffi interfaces for Go
  2014-10-10 20:43 [PATCH 00/13] Go closures, libffi, and the static chain Richard Henderson
                   ` (9 preceding siblings ...)
  2014-10-10 20:43 ` [PATCH 09/13] libgo: Remove __go_get/set_closure Richard Henderson
@ 2014-10-10 20:43 ` Richard Henderson
  2014-10-10 20:43 ` [PATCH 02/13] Allow the front-end to create calls with a static chain Richard Henderson
                   ` (4 subsequent siblings)
  15 siblings, 0 replies; 43+ messages in thread
From: Richard Henderson @ 2014-10-10 20:43 UTC (permalink / raw)
  To: gcc-patches; +Cc: libffi-discuss, gofrontend-dev

This does drop support for targets whose libffi hasn't been updated,
but if we go this way, updating those remaining targets should be
fairly easy to do.
---
 libgo/go/reflect/makefunc.go      | 49 ++++++++++------------------
 libgo/go/reflect/makefunc_ffi.go  | 67 ++++++++++++--------------------------
 libgo/go/reflect/makefunc_ffi_c.c | 68 +++++++++------------------------------
 libgo/go/reflect/value.go         |  3 ++
 libgo/runtime/go-reflect-call.c   | 10 ++----
 5 files changed, 59 insertions(+), 138 deletions(-)

diff --git a/libgo/go/reflect/makefunc.go b/libgo/go/reflect/makefunc.go
index 977aacf..23c63a7 100644
--- a/libgo/go/reflect/makefunc.go
+++ b/libgo/go/reflect/makefunc.go
@@ -14,7 +14,11 @@ import (
 // makeFuncImpl is the closure value implementing the function
 // returned by MakeFunc.
 type makeFuncImpl struct {
-	code uintptr
+	// These first three words are laid out like ffi_go_closure.
+	code	uintptr
+	ffi_cif	unsafe.Pointer
+	ffi_fun	func(unsafe.Pointer, unsafe.Pointer)
+
 	typ  *funcType
 	fn   func([]Value) []Value
 
@@ -22,10 +26,6 @@ type makeFuncImpl struct {
 	// method values.
 	method int
 	rcvr   Value
-
-	// When using FFI, hold onto the FFI closure for the garbage
-	// collector.
-	ffi *ffiData
 }
 
 // MakeFunc returns a new function of the given Type
@@ -58,25 +58,18 @@ func MakeFunc(typ Type, fn func(args []Value) (results []Value)) Value {
 	t := typ.common()
 	ftyp := (*funcType)(unsafe.Pointer(t))
 
-	var code uintptr
-	var ffi *ffiData
-	switch runtime.GOARCH {
-	case "amd64", "386":
-		// Indirect Go func value (dummy) to obtain actual
-		// code address. (A Go func value is a pointer to a C
-		// function pointer. http://golang.org/s/go11func.)
-		dummy := makeFuncStub
-		code = **(**uintptr)(unsafe.Pointer(&dummy))
-	default:
-		code, ffi = makeFuncFFI(ftyp, fn)
-	}
-
 	impl := &makeFuncImpl{
-		code:   code,
 		typ:    ftyp,
 		fn:     fn,
 		method: -1,
-		ffi:    ffi,
+	}
+
+	switch runtime.GOARCH {
+	case "amd64", "386":
+		impl.code = makeFuncStubCode
+	default:
+		impl.fn = fn
+		makeFuncFFI(ftyp, impl)
 	}
 
 	return Value{t, unsafe.Pointer(&impl), flag(Func<<flagKindShift) | flagIndir}
@@ -125,13 +118,9 @@ func makeMethodValue(op string, v Value) Value {
 
 	switch runtime.GOARCH {
 	case "amd64", "386":
-		// Indirect Go func value (dummy) to obtain actual
-		// code address. (A Go func value is a pointer to a C
-		// function pointer. http://golang.org/s/go11func.)
-		dummy := makeFuncStub
-		fv.code = **(**uintptr)(unsafe.Pointer(&dummy))
+		fv.code = makeFuncStubCode;
 	default:
-		fv.code, fv.ffi = makeFuncFFI(ftyp, fv.call)
+		makeFuncFFI(ftyp, fv)
 	}
 
 	return Value{ft, unsafe.Pointer(&fv), v.flag&flagRO | flag(Func)<<flagKindShift | flagIndir}
@@ -160,13 +149,9 @@ func makeValueMethod(v Value) Value {
 
 	switch runtime.GOARCH {
 	case "amd64", "386":
-		// Indirect Go func value (dummy) to obtain actual
-		// code address. (A Go func value is a pointer to a C
-		// function pointer. http://golang.org/s/go11func.)
-		dummy := makeFuncStub
-		impl.code = **(**uintptr)(unsafe.Pointer(&dummy))
+		impl.code = makeFuncStubCode
 	default:
-		impl.code, impl.ffi = makeFuncFFI(ftyp, impl.call)
+		makeFuncFFI(ftyp, impl)
 	}
 
 	return Value{t, unsafe.Pointer(&impl), flag(Func<<flagKindShift) | flagIndir}
diff --git a/libgo/go/reflect/makefunc_ffi.go b/libgo/go/reflect/makefunc_ffi.go
index a13ef17..5c764e3 100644
--- a/libgo/go/reflect/makefunc_ffi.go
+++ b/libgo/go/reflect/makefunc_ffi.go
@@ -5,52 +5,27 @@
 package reflect
 
 import (
-	"runtime"
 	"unsafe"
 )
 
-// The ffi function, written in C, allocates an FFI closure.  It
-// returns the code and data pointers.  When the code pointer is
-// called, it will call callback.  CIF is an FFI data structure
-// allocated as part of the closure, and is returned to ensure that
-// the GC retains it.
-func ffi(ftyp *funcType, callback func(unsafe.Pointer, unsafe.Pointer)) (code uintptr, data uintptr, cif unsafe.Pointer)
-
-// The ffiFree function, written in C, releases the FFI closure.
-func ffiFree(uintptr)
-
-// An ffiData holds the information needed to preserve an FFI closure
-// for the garbage collector.
-type ffiData struct {
-	code     uintptr
-	data     uintptr
-	cif      unsafe.Pointer
-	callback func(unsafe.Pointer, unsafe.Pointer)
-}
-
-// The makeFuncFFI function uses libffi closures to implement
-// reflect.MakeFunc.  This is used for processors for which we don't
-// have more efficient support.
-func makeFuncFFI(ftyp *funcType, fn func(args []Value) (results []Value)) (uintptr, *ffiData) {
-	callback := func(params, results unsafe.Pointer) {
-		ffiCall(ftyp, fn, params, results)
-	}
-
-	code, data, cif := ffi(ftyp, callback)
-
-	c := &ffiData{code: code, data: data, cif: cif, callback: callback}
-
-	runtime.SetFinalizer(c,
-		func(p *ffiData) {
-			ffiFree(p.data)
-		})
-
-	return code, c
-}
-
-// ffiCall takes pointers to the parameters, calls the function, and
-// stores the results back into memory.
-func ffiCall(ftyp *funcType, fn func([]Value) []Value, params unsafe.Pointer, results unsafe.Pointer) {
+// The makeFuncFFI function, written in C, fills in an FFI closure.
+// It arranges for FFICallbackGo to be invoked from FFI via ffi_callback.
+func makeFuncFFI(ftyp *funcType, impl *makeFuncImpl)
+
+// FFICallbackGo implements the Go side of the libffi callback.
+// It is exported so that C code can call it.
+//
+// The call chain arriving here looks like
+//   some_go_caller
+//   ->some_ffi_internals
+//     ->ffi_callback (in C)
+//       ->FFICallbackGo
+//
+// The ffi_callback handles __go_makefunc_ffi_can_recover, and
+// then passes off the data as received from ffi here.
+
+func FFICallbackGo(results unsafe.Pointer, params unsafe.Pointer, impl *makeFuncImpl) {
+	ftyp := impl.typ
 	in := make([]Value, 0, len(ftyp.in))
 	ap := params
 	for _, rt := range ftyp.in {
@@ -61,18 +36,18 @@ func ffiCall(ftyp *funcType, fn func([]Value) []Value, params unsafe.Pointer, re
 		ap = (unsafe.Pointer)(uintptr(ap) + ptrSize)
 	}
 
-	out := fn(in)
+	out := impl.call(in)
 
 	off := uintptr(0)
 	for i, typ := range ftyp.out {
 		v := out[i]
 		if v.typ != typ {
-			panic("reflect: function created by MakeFunc using " + funcName(fn) +
+			panic("reflect: function created by MakeFunc using " + funcName(impl.fn) +
 				" returned wrong type: have " +
 				out[i].typ.String() + " for " + typ.String())
 		}
 		if v.flag&flagRO != 0 {
-			panic("reflect: function created by MakeFunc using " + funcName(fn) +
+			panic("reflect: function created by MakeFunc using " + funcName(impl.fn) +
 				" returned value obtained from unexported field")
 		}
 
diff --git a/libgo/go/reflect/makefunc_ffi_c.c b/libgo/go/reflect/makefunc_ffi_c.c
index a3dfd4a..727ae81 100644
--- a/libgo/go/reflect/makefunc_ffi_c.c
+++ b/libgo/go/reflect/makefunc_ffi_c.c
@@ -10,7 +10,7 @@
 
 #include "go-ffi.h"
 
-#if FFI_CLOSURES
+#if FFI_GO_CLOSURES
 #define USE_LIBFFI_CLOSURES
 #endif
 
@@ -18,36 +18,28 @@
 
 /* Declare C functions with the names used to call from Go.  */
 
-struct ffi_ret {
-  void *code;
-  void *data;
-  void *cif;
-};
-
-struct ffi_ret ffi(const struct __go_func_type *ftyp, FuncVal *callback)
-  __asm__ (GOSYM_PREFIX "reflect.ffi");
-
-void ffiFree(void *data)
-  __asm__ (GOSYM_PREFIX "reflect.ffiFree");
+void makeFuncFFI(const struct __go_func_type *ftyp, ffi_go_closure *impl)
+  __asm__ (GOSYM_PREFIX "reflect.makeFuncFFI");
 
 #ifdef USE_LIBFFI_CLOSURES
 
-/* The function that we pass to ffi_prep_closure_loc.  This calls the
-   Go callback function (passed in user_data) with the pointer to the
-   arguments and the results area.  */
+/* The function that we pass to ffi_prep_go_closure.  This calls the Go
+   function FFICallbackGo with the results area, the pointer to the
+   arguments, and the closure structure.  */
+
+void FFICallbackGo(void *result, void **args, ffi_go_closure *closure)
+  __asm__ (GOSYM_PREFIX "reflect.FFICallbackGo");
 
 static void ffi_callback (ffi_cif *, void *, void **, void *)
   __asm__ ("reflect.ffi_callback");
 
 static void
 ffi_callback (ffi_cif* cif __attribute__ ((unused)), void *results,
-	      void **args, void *user_data)
+	      void **args, void *closure)
 {
   Location locs[8];
   int n;
   int i;
-  FuncVal *fv;
-  void (*f) (void *, void *);
 
   /* This function is called from some series of FFI closure functions
      called by a Go function.  We want to see whether the caller of
@@ -69,10 +61,7 @@ ffi_callback (ffi_cif* cif __attribute__ ((unused)), void *results,
   if (i < n)
     __go_makefunc_ffi_can_recover (locs + i, n - i);
 
-  fv = (FuncVal *) user_data;
-  __go_set_closure (fv);
-  f = (void *) fv->fn;
-  f (args, results);
+  FFICallbackGo(results, args, closure);
 
   if (i < n)
     __go_makefunc_returning ();
@@ -80,46 +69,21 @@ ffi_callback (ffi_cif* cif __attribute__ ((unused)), void *results,
 
 /* Allocate an FFI closure and arrange to call ffi_callback.  */
 
-struct ffi_ret
-ffi (const struct __go_func_type *ftyp, FuncVal *callback)
+void
+makeFuncFFI(const struct __go_func_type *ftyp, ffi_go_closure *impl)
 {
   ffi_cif *cif;
-  void *code;
-  void *data;
-  struct ffi_ret ret;
 
   cif = (ffi_cif *) __go_alloc (sizeof (ffi_cif));
   __go_func_to_cif (ftyp, 0, 0, cif);
-  data = ffi_closure_alloc (sizeof (ffi_closure), &code);
-  if (data == NULL)
-    runtime_panicstring ("ffi_closure_alloc failed");
-  if (ffi_prep_closure_loc (data, cif, ffi_callback, callback, code)
-      != FFI_OK)
-    runtime_panicstring ("ffi_prep_closure_loc failed");
-  ret.code = code;
-  ret.data = data;
-  ret.cif = cif;
-  return ret;
-}
-
-/* Free the FFI closure.  */
 
-void
-ffiFree (void *data)
-{
-  ffi_closure_free (data);
+  ffi_prep_go_closure(impl, cif, ffi_callback);
 }
 
 #else /* !defined(USE_LIBFFI_CLOSURES) */
 
-struct ffi_ret
-ffi(const struct __go_func_type *ftyp, FuncVal *callback)
-{
-  runtime_panicstring ("libgo built without FFI does not support "
-		       "reflect.MakeFunc");
-}
-
-void ffiFree(void *data)
+void
+makeFuncFFI(const struct __go_func_type *ftyp, ffi_go_closure *impl)
 {
   runtime_panicstring ("libgo built without FFI does not support "
 		       "reflect.MakeFunc");
diff --git a/libgo/go/reflect/value.go b/libgo/go/reflect/value.go
index c390b8e..1e0b537 100644
--- a/libgo/go/reflect/value.go
+++ b/libgo/go/reflect/value.go
@@ -427,6 +427,9 @@ func (v Value) CallSlice(in []Value) []Value {
 
 var callGC bool // for testing; see TestCallMethodJump
 
+// Indirect Go func value (makeFuncStubFn) to obtain actual
+// code address. (A Go func value is a pointer to a C
+// function pointer. http://golang.org/s/go11func.)
 var makeFuncStubFn = makeFuncStub
 var makeFuncStubCode = **(**uintptr)(unsafe.Pointer(&makeFuncStubFn))
 
diff --git a/libgo/runtime/go-reflect-call.c b/libgo/runtime/go-reflect-call.c
index dfc703e..692c8cc 100644
--- a/libgo/runtime/go-reflect-call.c
+++ b/libgo/runtime/go-reflect-call.c
@@ -202,11 +202,7 @@ go_set_results (const struct __go_func_type *func, unsigned char *call_result,
 
    If IS_METHOD is true this is a call to a method expression.  The
    first argument is the receiver.  It is described in FUNC_TYPE, but
-   regardless of FUNC_TYPE, it is passed as a pointer.
-
-   If neither IS_INTERFACE nor IS_METHOD is true then we are calling a
-   function indirectly, and we must pass a closure pointer via
-   __go_set_closure.  The pointer to pass is simply FUNC_VAL.  */
+   regardless of FUNC_TYPE, it is passed as a pointer.  */
 
 void
 reflect_call (const struct __go_func_type *func_type, FuncVal *func_val,
@@ -221,9 +217,7 @@ reflect_call (const struct __go_func_type *func_type, FuncVal *func_val,
 
   call_result = (unsigned char *) malloc (go_results_size (func_type));
 
-  if (!is_interface && !is_method)
-    __go_set_closure (func_val);
-  ffi_call (&cif, func_val->fn, call_result, params);
+  ffi_call_go (&cif, func_val->fn, call_result, params, func_val);
 
   /* Some day we may need to free result values if RESULTS is
      NULL.  */
-- 
1.9.3

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 04/13] Use the static chain as the closure parameter from Go
  2014-10-10 20:43 [PATCH 00/13] Go closures, libffi, and the static chain Richard Henderson
@ 2014-10-10 20:43 ` Richard Henderson
  2014-10-10 20:43 ` [PATCH 11/13] libffi: Support go closures on aarch64 Richard Henderson
                   ` (14 subsequent siblings)
  15 siblings, 0 replies; 43+ messages in thread
From: Richard Henderson @ 2014-10-10 20:43 UTC (permalink / raw)
  To: gcc-patches; +Cc: libffi-discuss, gofrontend-dev

This is not a standalone patch; it only touches the Go front end.
Further changes are required for the Go runtime.
---
 gcc/go/go-gcc.cc                 | 44 ++++++++++++++++++++++++++++++++++++++--
 gcc/go/gofrontend/backend.h      |  7 ++++++-
 gcc/go/gofrontend/expressions.cc | 21 ++++++++-----------
 gcc/go/gofrontend/gogo.cc        | 29 +++++++++++++-------------
 gcc/go/gofrontend/gogo.h         | 14 +++++++++++++
 gcc/go/gofrontend/runtime.def    |  6 ------
 6 files changed, 85 insertions(+), 36 deletions(-)

diff --git a/gcc/go/go-gcc.cc b/gcc/go/go-gcc.cc
index 6bac84f..01cd473 100644
--- a/gcc/go/go-gcc.cc
+++ b/gcc/go/go-gcc.cc
@@ -304,7 +304,7 @@ class Gcc_backend : public Backend
 
   Bexpression*
   call_expression(Bexpression* fn, const std::vector<Bexpression*>& args,
-                  Location);
+                  Bexpression* static_chain, Location);
 
   // Statements.
 
@@ -385,6 +385,9 @@ class Gcc_backend : public Backend
 		     Location);
 
   Bvariable*
+  static_chain_variable(Bfunction*, const std::string&, Btype*, Location);
+
+  Bvariable*
   temporary_variable(Bfunction*, Bblock*, Btype*, Bexpression*, bool,
 		     Location, Bstatement**);
 
@@ -1759,7 +1762,7 @@ Gcc_backend::array_index_expression(Bexpression* array, Bexpression* index,
 Bexpression*
 Gcc_backend::call_expression(Bexpression* fn_expr,
                              const std::vector<Bexpression*>& fn_args,
-                             Location location)
+                             Bexpression* chain_expr, Location location)
 {
   tree fn = fn_expr->get_tree();
   if (fn == error_mark_node || TREE_TYPE(fn) == error_mark_node)
@@ -1819,6 +1822,9 @@ Gcc_backend::call_expression(Bexpression* fn_expr,
                            excess_type != NULL_TREE ? excess_type : rettype,
                            fn, nargs, args);
 
+  if (chain_expr)
+    CALL_EXPR_STATIC_CHAIN (ret) = chain_expr->get_tree();
+
   if (excess_type != NULL_TREE)
     {
       // Calling convert here can undo our excess precision change.
@@ -2440,6 +2446,40 @@ Gcc_backend::parameter_variable(Bfunction* function, const std::string& name,
   return new Bvariable(decl);
 }
 
+// Make a static chain variable.
+
+Bvariable*
+Gcc_backend::static_chain_variable(Bfunction* function, const std::string& name,
+				   Btype* btype, Location location)
+{
+  tree type_tree = btype->get_tree();
+  if (type_tree == error_mark_node)
+    return this->error_variable();
+  tree decl = build_decl(location.gcc_location(), PARM_DECL,
+			 get_identifier_from_string(name), type_tree);
+  tree fndecl = function->get_tree();
+  DECL_CONTEXT(decl) = fndecl;
+  DECL_ARG_TYPE(decl) = type_tree;
+  TREE_USED(decl) = 1;
+  DECL_ARTIFICIAL(decl) = 1;
+  DECL_IGNORED_P(decl) = 1;
+  TREE_READONLY(decl) = 1;
+
+  struct function *f = DECL_STRUCT_FUNCTION(fndecl);
+  if (f == NULL)
+    {
+      push_struct_function(fndecl);
+      pop_cfun();
+      f = DECL_STRUCT_FUNCTION(fndecl);
+    }
+  gcc_assert(f->static_chain_decl == NULL);
+  f->static_chain_decl = decl;
+  DECL_STATIC_CHAIN(fndecl) = 1;
+
+  go_preserve_from_gc(decl);
+  return new Bvariable(decl);
+}
+
 // Make a temporary variable.
 
 Bvariable*
diff --git a/gcc/go/gofrontend/backend.h b/gcc/go/gofrontend/backend.h
index 98c36c1..d0a7995 100644
--- a/gcc/go/gofrontend/backend.h
+++ b/gcc/go/gofrontend/backend.h
@@ -374,7 +374,7 @@ class Backend
   // Create an expression for a call to FN with ARGS.
   virtual Bexpression*
   call_expression(Bexpression* fn, const std::vector<Bexpression*>& args,
-                  Location) = 0;
+		  Bexpression* static_chain, Location) = 0;
 
   // Statements.
 
@@ -528,6 +528,11 @@ class Backend
 		     Btype* type, bool is_address_taken,
 		     Location location) = 0;
 
+  // Create a static chain parameter.  This is the closure parameter.
+  virtual Bvariable*
+  static_chain_variable(Bfunction* function, const std::string& name,
+		        Btype* type, Location location) = 0;
+
   // Create a temporary variable.  A temporary variable has no name,
   // just a type.  We pass in FUNCTION and BLOCK in case they are
   // needed.  If INIT is not NULL, the variable should be initialized
diff --git a/gcc/go/gofrontend/expressions.cc b/gcc/go/gofrontend/expressions.cc
index df1650a..37650eb 100644
--- a/gcc/go/gofrontend/expressions.cc
+++ b/gcc/go/gofrontend/expressions.cc
@@ -6621,6 +6621,7 @@ Bound_method_expression::create_thunk(Gogo* gogo, const Method* method,
 
   Variable* cvar = new Variable(closure_type, NULL, false, false, false, loc);
   cvar->set_is_used();
+  cvar->set_is_closure();
   Named_object* cp = Named_object::make_variable("$closure", NULL, cvar);
   new_no->func_value()->set_closure_var(cp);
 
@@ -9571,19 +9572,11 @@ Call_expression::do_get_backend(Translate_context* context)
       fn_args[0] = first_arg->get_backend(context);
     }
 
-  if (!has_closure_arg)
-    go_assert(closure == NULL);
+  Bexpression* bclosure = NULL;
+  if (has_closure_arg)
+    bclosure = closure->get_backend(context);
   else
-    {
-      // Pass the closure argument by calling the function function
-      // __go_set_closure.  In the order_evaluations pass we have
-      // ensured that if any parameters contain call expressions, they
-      // will have been moved out to temporary variables.
-      go_assert(closure != NULL);
-      Expression* set_closure =
-          Runtime::make_call(Runtime::SET_CLOSURE, location, 1, closure);
-      fn = Expression::make_compound(set_closure, fn, location);
-    }
+    go_assert(closure == NULL);
 
   Bexpression* bfn = fn->get_backend(context);
 
@@ -9599,7 +9592,8 @@ Call_expression::do_get_backend(Translate_context* context)
       bfn = gogo->backend()->convert_expression(bft, bfn, location);
     }
 
-  Bexpression* call = gogo->backend()->call_expression(bfn, fn_args, location);
+  Bexpression* call = gogo->backend()->call_expression(bfn, fn_args,
+						       bclosure, location);
 
   if (this->results_ != NULL)
     {
@@ -11373,6 +11367,7 @@ Interface_field_reference_expression::create_thunk(Gogo* gogo,
 
   Variable* cvar = new Variable(closure_type, NULL, false, false, false, loc);
   cvar->set_is_used();
+  cvar->set_is_closure();
   Named_object* cp = Named_object::make_variable("$closure", NULL, cvar);
   new_no->func_value()->set_closure_var(cp);
 
diff --git a/gcc/go/gofrontend/gogo.cc b/gcc/go/gofrontend/gogo.cc
index 81a555f..3a7e686 100644
--- a/gcc/go/gofrontend/gogo.cc
+++ b/gcc/go/gofrontend/gogo.cc
@@ -702,7 +702,8 @@ Gogo::init_imports(std::vector<Bstatement*>& init_stmts)
       Bexpression* pfunc_code =
           this->backend()->function_code_expression(pfunc, unknown_loc);
       Bexpression* pfunc_call =
-          this->backend()->call_expression(pfunc_code, empty_args, unknown_loc);
+	this->backend()->call_expression(pfunc_code, empty_args,
+					 NULL, unknown_loc);
       init_stmts.push_back(this->backend()->expression_statement(pfunc_call));
     }
 }
@@ -1354,7 +1355,7 @@ Gogo::write_globals()
           this->backend()->function_code_expression(initfn, func_loc);
       Bexpression* call = this->backend()->call_expression(func_code,
                                                            empty_args,
-                                                           func_loc);
+							   NULL, func_loc);
       init_stmts.push_back(this->backend()->expression_statement(call));
     }
 
@@ -3856,6 +3857,7 @@ Build_recover_thunks::function(Named_object* orig_no)
       Variable* orig_closure_var = orig_closure_no->var_value();
       Variable* new_var = new Variable(orig_closure_var->type(), NULL, false,
 				       false, false, location);
+      new_var->set_is_closure();
       snprintf(buf, sizeof buf, "closure.%u", count);
       ++count;
       Named_object* new_closure_no = Named_object::make_variable(buf, NULL,
@@ -4466,6 +4468,7 @@ Function::closure_var()
       Variable* var = new Variable(Type::make_pointer_type(struct_type),
 				   NULL, false, false, false, loc);
       var->set_is_used();
+      var->set_is_closure();
       this->closure_var_ = Named_object::make_variable("$closure", NULL, var);
       // Note that the new variable is not in any binding contour.
     }
@@ -5128,18 +5131,12 @@ Function::build(Gogo* gogo, Named_object* named_function)
       return;
     }
 
-  // If we need a closure variable, fetch it by calling a runtime
-  // function.  The caller will have called __go_set_closure before
-  // the function call.
+  // If we need a closure variable, make sure to create it.
+  // It gets installed in the function as a side effect of creation.
   if (this->closure_var_ != NULL)
     {
-      Bvariable* closure_bvar =
-	this->closure_var_->get_backend_variable(gogo, named_function);
-      vars.push_back(closure_bvar);
-
-      Expression* closure =
-          Runtime::make_call(Runtime::GET_CLOSURE, this->location_, 0);
-      var_inits.push_back(closure->get_backend(&context));
+      go_assert(this->closure_var_->var_value()->is_closure());
+      this->closure_var_->get_backend_variable(gogo, named_function);
     }
 
   if (this->block_ != NULL)
@@ -5673,7 +5670,8 @@ Variable::Variable(Type* type, Expression* init, bool is_global,
 		   Location location)
   : type_(type), init_(init), preinit_(NULL), location_(location),
     backend_(NULL), is_global_(is_global), is_parameter_(is_parameter),
-    is_receiver_(is_receiver), is_varargs_parameter_(false), is_used_(false),
+    is_closure_(false), is_receiver_(is_receiver),
+    is_varargs_parameter_(false), is_used_(false),
     is_address_taken_(false), is_non_escaping_address_taken_(false),
     seen_(false), init_is_lowered_(false), init_is_flattened_(false),
     type_from_init_tuple_(false), type_from_range_index_(false),
@@ -6212,7 +6210,10 @@ Variable::get_backend_variable(Gogo* gogo, Named_object* function,
 	      Bfunction* bfunction = function->func_value()->get_decl();
 	      bool is_address_taken = (this->is_non_escaping_address_taken_
 				       && !this->is_in_heap());
-	      if (is_parameter)
+	      if (this->is_closure())
+		bvar = backend->static_chain_variable(bfunction, n, btype,
+						      this->location_);
+	      else if (is_parameter)
 		bvar = backend->parameter_variable(bfunction, n, btype,
 						   is_address_taken,
 						   this->location_);
diff --git a/gcc/go/gofrontend/gogo.h b/gcc/go/gofrontend/gogo.h
index 01390ac..6cc72e2 100644
--- a/gcc/go/gofrontend/gogo.h
+++ b/gcc/go/gofrontend/gogo.h
@@ -1340,6 +1340,18 @@ class Variable
   is_parameter() const
   { return this->is_parameter_; }
 
+  // Return whether this is a closure (static chain) parameter.
+  bool
+  is_closure() const
+  { return this->is_closure_; }
+
+  // Change this parameter to be a closure.
+  void
+  set_is_closure()
+  {
+    this->is_closure_ = true;
+  }
+
   // Return whether this is the receiver parameter of a method.
   bool
   is_receiver() const
@@ -1561,6 +1573,8 @@ class Variable
   bool is_global_ : 1;
   // Whether this is a function parameter.
   bool is_parameter_ : 1;
+  // Whether this is a closure parameter.
+  bool is_closure_ : 1;
   // Whether this is the receiver parameter of a method.
   bool is_receiver_ : 1;
   // Whether this is the varargs parameter of a function.
diff --git a/gcc/go/gofrontend/runtime.def b/gcc/go/gofrontend/runtime.def
index 8c6e82b..d7d4edb 100644
--- a/gcc/go/gofrontend/runtime.def
+++ b/gcc/go/gofrontend/runtime.def
@@ -230,12 +230,6 @@ DEF_GO_RUNTIME(NEW_NOPOINTERS, "__go_new_nopointers", P1(UINTPTR), R1(POINTER))
 // Start a new goroutine.
 DEF_GO_RUNTIME(GO, "__go_go", P2(FUNC_PTR, POINTER), R0())
 
-// Get the function closure.
-DEF_GO_RUNTIME(GET_CLOSURE, "__go_get_closure", P0(), R1(POINTER))
-
-// Set the function closure.
-DEF_GO_RUNTIME(SET_CLOSURE, "__go_set_closure", P1(POINTER), R0())
-
 // Defer a function.
 DEF_GO_RUNTIME(DEFER, "__go_defer", P3(BOOLPTR, FUNC_PTR, POINTER), R0())
 
-- 
1.9.3

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 03/13] HACK!  Allow the static chain to be set from C
  2014-10-10 20:43 [PATCH 00/13] Go closures, libffi, and the static chain Richard Henderson
                   ` (5 preceding siblings ...)
  2014-10-10 20:43 ` [PATCH 13/13] libffi: Support go closures on i386 Richard Henderson
@ 2014-10-10 20:43 ` Richard Henderson
  2014-10-11  0:33   ` Ian Lance Taylor
  2014-10-14 18:44   ` [PATCH v2 03/13] " Richard Henderson
  2014-10-10 20:43 ` [PATCH 06/13] libffi: Add entry points for interacting with Go Richard Henderson
                   ` (8 subsequent siblings)
  15 siblings, 2 replies; 43+ messages in thread
From: Richard Henderson @ 2014-10-10 20:43 UTC (permalink / raw)
  To: gcc-patches; +Cc: libffi-discuss, gofrontend-dev

This is awful syntax, and therefore contains no documentation.
But we'll need to be able to set the static chain on a few calls
within the Go runtime, so we need to expose this by some means.

It currently looks like

        function(args...) __builtin_call_chain(pointer)

because that was easy to parse.

I've considered alternatives such as

        __builtin_call_static_chain(function, pointer)(args...)
    
wherein the builtin returns function, and magically remembers
that a static chain should be applied.

Or

        __builtin_call_go(function, pointer, args...)

which is a bit more specific.

Or

        __builtin_call_go(descriptor, args...)

where descriptor must be a pointer to a structure type with a
function pointer field at offset zero.  The builtin would automatically
perform the dereference, and apply the arguments to the function type
of that field.  That seems like overkill for exactly two uses within
libgo.  We'd have to imagine the gccgo community writing all sorts of
C-to-Go style plugins and whatnot before that becomes worthwhile.

Other reasonable approaches solicited...

---
 gcc/c-family/c-common.c             |  1 +
 gcc/c-family/c-common.h             |  2 +-
 gcc/c/c-parser.c                    | 29 +++++++++++++++++++++++++++++
 gcc/testsuite/gcc.dg/static-chain.c | 31 +++++++++++++++++++++++++++++++
 4 files changed, 62 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.dg/static-chain.c

diff --git a/gcc/c-family/c-common.c b/gcc/c-family/c-common.c
index e69d128..5d1bff7 100644
--- a/gcc/c-family/c-common.c
+++ b/gcc/c-family/c-common.c
@@ -442,6 +442,7 @@ const struct c_common_resword c_common_reswords[] =
   { "__attribute__",	RID_ATTRIBUTE,	0 },
   { "__auto_type",	RID_AUTO_TYPE,	D_CONLY },
   { "__bases",          RID_BASES, D_CXXONLY },
+  { "__builtin_call_chain", RID_BUILTIN_CALL_CHAIN, D_CONLY },
   { "__builtin_choose_expr", RID_CHOOSE_EXPR, D_CONLY },
   { "__builtin_complex", RID_BUILTIN_COMPLEX, D_CONLY },
   { "__builtin_shuffle", RID_BUILTIN_SHUFFLE, 0 },
diff --git a/gcc/c-family/c-common.h b/gcc/c-family/c-common.h
index 1e3477f..6fe4748 100644
--- a/gcc/c-family/c-common.h
+++ b/gcc/c-family/c-common.h
@@ -102,7 +102,7 @@ enum rid
   RID_EXTENSION, RID_IMAGPART, RID_REALPART, RID_LABEL,      RID_CHOOSE_EXPR,
   RID_TYPES_COMPATIBLE_P,      RID_BUILTIN_COMPLEX,	     RID_BUILTIN_SHUFFLE,
   RID_DFLOAT32, RID_DFLOAT64, RID_DFLOAT128,
-  RID_FRACT, RID_ACCUM, RID_AUTO_TYPE,
+  RID_FRACT, RID_ACCUM, RID_AUTO_TYPE, RID_BUILTIN_CALL_CHAIN,
 
   /* C11 */
   RID_ALIGNAS, RID_GENERIC,
diff --git a/gcc/c/c-parser.c b/gcc/c/c-parser.c
index 0d159fd..8ec6f48 100644
--- a/gcc/c/c-parser.c
+++ b/gcc/c/c-parser.c
@@ -7776,6 +7776,35 @@ c_parser_postfix_expression_after_primary (c_parser *parser,
 	      release_tree_vector (origtypes);
 	    }
 	  arg_loc.release ();
+
+	  if (c_parser_next_token_is_keyword (parser, RID_BUILTIN_CALL_CHAIN))
+	    {
+	      vec<c_expr_t, va_gc> *cexpr_list;
+	      tree chain_value;
+
+	      c_parser_consume_token (parser);
+	      if (!c_parser_get_builtin_args (parser, "__builtin_call_chain",
+					      &cexpr_list, true))
+		break;
+
+	      if (vec_safe_length (cexpr_list) != 1)
+		{
+		  error_at (expr_loc, "wrong number of arguments to "
+			    "%<__builtin_call_chain%>");
+		  break;
+		}
+	      chain_value = (*cexpr_list)[0].value;
+	      mark_exp_read (chain_value);
+
+	      if (TREE_CODE (TREE_TYPE (chain_value)) != POINTER_TYPE)
+		{
+		  error_at (expr_loc, "argument to %<__builtin_call_chain%> "
+			    "must be pointer type");
+		  break;
+		}
+
+	      CALL_EXPR_STATIC_CHAIN (expr.value) = chain_value;
+	    }
 	  break;
 	case CPP_DOT:
 	  /* Structure element reference.  */
diff --git a/gcc/testsuite/gcc.dg/static-chain.c b/gcc/testsuite/gcc.dg/static-chain.c
new file mode 100644
index 0000000..ff99fdc
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/static-chain.c
@@ -0,0 +1,31 @@
+/* { dg-do run } */
+
+#if defined(__x86_64__)
+# define CHAIN	"%r10"
+#elif defined(__i386__)
+# define CHAIN  "%ecx"
+#elif defined(__aarch64__)
+# define CHAIN  "x18"
+#endif
+
+#ifdef CHAIN
+void *__attribute__((noinline, noclone)) foo(void)
+{
+  register void *chain __asm__(CHAIN);
+  return chain;
+}
+
+void * (*ptr)(void) = foo;
+extern void abort(void);
+
+int main()
+{
+  char c;
+  void *x = ptr() __builtin_call_chain(&c);
+  if (x != &c)
+    abort();
+  return 0;
+}
+#else
+int main() { return 0; }
+#endif
-- 
1.9.3

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 13/13] libffi: Support go closures on i386
  2014-10-10 20:43 [PATCH 00/13] Go closures, libffi, and the static chain Richard Henderson
                   ` (4 preceding siblings ...)
  2014-10-10 20:43 ` [PATCH 10/13] libffi: Rewrite aarch64 Richard Henderson
@ 2014-10-10 20:43 ` Richard Henderson
  2014-10-10 20:43 ` [PATCH 03/13] HACK! Allow the static chain to be set from C Richard Henderson
                   ` (9 subsequent siblings)
  15 siblings, 0 replies; 43+ messages in thread
From: Richard Henderson @ 2014-10-10 20:43 UTC (permalink / raw)
  To: gcc-patches; +Cc: libffi-discuss, gofrontend-dev

---
 libffi/src/x86/ffi.c  |  88 ++++++++++++++++++++++++++++++++++++-----
 libffi/src/x86/sysv.S | 107 +++++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 166 insertions(+), 29 deletions(-)

diff --git a/libffi/src/x86/ffi.c b/libffi/src/x86/ffi.c
index e3f82ef..77abbe3 100644
--- a/libffi/src/x86/ffi.c
+++ b/libffi/src/x86/ffi.c
@@ -162,8 +162,9 @@ struct ffi_call_frame
 extern void ffi_call_i386(struct ffi_call_frame *, char *)
 	FFI_HIDDEN __attribute__((fastcall));
 
-void
-ffi_call(ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
+static void
+ffi_call_int (ffi_cif *cif, void (*fn)(void), void *rvalue,
+	      void **avalue, void *closure)
 {
   size_t rsize;
   struct ffi_call_frame *frame;
@@ -206,6 +207,21 @@ ffi_call(ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
   frame->flags = flags;
   frame->rvalue = rvalue;
 
+  /* Install the closure as the static chain value.  Note that the
+     static chain isn't part of an official ABI, it's what gcc itself
+     allocates for a given ABI.  Generally, this is a register that's
+     predictably unused on entry.  */
+  switch (cabi)
+    {
+    case FFI_THISCALL:
+    case FFI_FASTCALL:
+      frame->eax = (unsigned)closure;
+      break;
+    default:
+      frame->ecx = (unsigned)closure;
+      break;
+    }
+
   narg_reg = 0;
   switch (flags)
     {
@@ -265,6 +281,20 @@ ffi_call(ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
   ffi_call_i386(frame, stack);
 }
 
+void
+ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
+{
+  ffi_call_int (cif, fn, rvalue, avalue, NULL);
+}
+
+void
+ffi_call_go (ffi_cif *cif, void (*fn)(void), void *rvalue,
+	     void **avalue, void *closure)
+{
+  ffi_call_int (cif, fn, rvalue, avalue, closure);
+}
+
+
 /* ------- Closure API support ----------------------------------- */
 
 /* How to make a trampoline.  Derived from gcc/config/i386/i386.c. */
@@ -321,18 +351,19 @@ ffi_prep_closure_loc (ffi_closure* closure,
 
 struct ffi_closure_frame
 {
-  unsigned rettemp[4];	/* 0 */
-  unsigned eax;		/* 16 */
-  unsigned edx;		/* 20 */
-  unsigned ecx;		/* 24 */
-  ffi_closure *closure;	/* 28 */
+  unsigned rettemp[4];				/* 0 */
+  unsigned eax;					/* 16 */
+  unsigned edx;					/* 20 */
+  unsigned ecx;					/* 24 */
+  ffi_cif *cif;					/* 28 */
+  void (*fun)(ffi_cif*,void*,void**,void*);	/* 32 */
+  void *user_data;				/* 36 */
 };
 
 unsigned int FFI_HIDDEN __attribute__ ((fastcall))
 ffi_closure_inner (struct ffi_closure_frame *frame, char *argp)
 {
-  ffi_closure *closure = frame->closure;
-  ffi_cif *cif = closure->cif;
+  ffi_cif *cif = frame->cif;
   int cabi, i, n, flags, narg_reg;
   ffi_type **arg_types;
   void *rvalue;
@@ -389,7 +420,7 @@ ffi_closure_inner (struct ffi_closure_frame *frame, char *argp)
       avalue[i] = valp;
     }
 
-  closure->fun (cif, rvalue, avalue, closure->user_data);
+  frame->fun (cif, rvalue, avalue, frame->user_data);
 
   if (cabi == FFI_STDCALL)
     return flags + (cif->bytes << X86_RET_POP_SHIFT);
@@ -559,4 +590,41 @@ ffi_raw_call(ffi_cif *cif, void (*fn)(void), void *rvalue, ffi_raw *avalue)
   ffi_call_i386(frame, stack);
 }
 
+/* ------- Go API support ---------------------------------------- */
+
+extern void ffi_go_closure_eax (void) FFI_HIDDEN;
+extern void ffi_go_closure_ecx (void) FFI_HIDDEN;
+extern void ffi_go_closure_stdcall (void) FFI_HIDDEN;
+
+ffi_status
+ffi_prep_go_closure (ffi_go_closure* closure, ffi_cif* cif,
+		     void (*fun)(ffi_cif*, void*, void**, void*))
+{
+  void (*dest)(void);
+
+  /* See the comment in ffi_call_int about the static chain.  */
+  switch (cif->abi)
+    {
+    case FFI_SYSV:
+    case FFI_MS_CDECL:
+      dest = ffi_go_closure_ecx;
+      break;
+    case FFI_THISCALL:
+    case FFI_FASTCALL:
+      dest = ffi_go_closure_eax;
+      break;
+    case FFI_STDCALL:
+      dest = ffi_go_closure_stdcall;
+      break;
+    default:
+      return FFI_BAD_ABI;
+    }
+
+  closure->tramp = dest;
+  closure->cif = cif;
+  closure->fun = fun;
+
+  return FFI_OK;
+}
+
 #endif /* !__x86_64__ */
diff --git a/libffi/src/x86/sysv.S b/libffi/src/x86/sysv.S
index d8256d0..2709b11 100644
--- a/libffi/src/x86/sysv.S
+++ b/libffi/src/x86/sysv.S
@@ -165,26 +165,37 @@ E X86_RET_UNUSED15
 	.cfi_endproc
 	.size	ffi_call_i386, . - ffi_call_i386
 
-/* The closure entry points are reached from the ffi_closure trampoline.
-   On entry, %eax contains the address of the ffi_closure.  */
-
-#define	ffi_closure_FS	(12 + 4*4 + 16)
+#define	ffi_closure_FS	(4 + 3*4 + 3*4 + 16)
 
-.macro FFI_CLOSURE_FIRST
-	subl	$ffi_closure_FS, %esp
-	.cfi_adjust_cfa_offset ffi_closure_FS
+/* Macros to help set up the ffi_closure_frame structure.  */
 
-	movl	%edx, 20(%esp)		/* save incoming register args */
+.macro FFI_CLOSURE_SAVE_REGS
+	movl	%eax, 16(%esp)		/* save incoming register args */
+	movl	%edx, 20(%esp)
 	movl	%ecx, 24(%esp)
-	movl	%eax, 28(%esp)		/* trampoline loaded closure */
+.endm
+
+.macro FFI_CLOSURE_COPY_TRAMP_DATA
+	movl	12(%eax), %edx		/* copy cif */
+	movl	16(%eax), %ecx		/* copy fun */
+	movl	20(%eax), %eax		/* copy user_data */
+	movl	%edx, 28(%esp)
+	movl	%ecx, 32(%esp)
+	movl	%eax, 36(%esp)
+.endm
 
-	movl	%esp, %ecx		/* pass save area to C */
-	leal	ffi_closure_FS+4(%esp), %edx
+.macro FFI_CLOSURE_COPY_USER_DATA base, ofs, t1
+	movl	\ofs(\base), \t1
+	movl	\t1, 36(%esp)
+.endm
 
+.macro FFI_CLOSURE_CALL_INNER
+	movl	%esp, %ecx			/* load ffi_closure_data */
+	leal	ffi_closure_FS+4(%esp), %edx	/* load incoming stack */
 #ifdef __PIC__
-	movl	%ebx, 32(%esp)		/* save ebx */
+	movl	%ebx, 40(%esp)			/* save ebx */
 	.cfi_rel_offset %esp, 32
-	call	__x86.get_pc_thunk.bx
+	call	__x86.get_pc_thunk.bx		/* load got register */
 	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
 #endif
 #if defined HAVE_HIDDEN_VISIBILITY_ATTRIBUTE || !defined __PIC__
@@ -194,11 +205,11 @@ E X86_RET_UNUSED15
 #endif
 .endm
 
-.macro FFI_CLOSURE_SECOND
+.macro FFI_CLOSURE_MASK_AND_JUMP
 	andl	$X86_RET_TYPE_MASK, %eax
 #ifdef __PIC__
 	leal	0f@GOTOFF(%ebx, %eax, 8), %eax
-	movl	32(%esp), %ebx		/* restore ebx */
+	movl	40(%esp), %ebx		/* restore ebx */
 	.cfi_restore %ebx
 #else
 	leal	0f(, %eax, 8), %eax
@@ -206,6 +217,36 @@ E X86_RET_UNUSED15
 	jmp	*%eax
 .endm
 
+/* The go closure entry points are called directly from Go code.
+   The suffix is the register in which the static chain is located.  */
+
+
+.macro FFI_GO_CLOSURE	suffix, chain, t1, t2
+	.align	16
+	.globl	ffi_go_closure_\suffix
+	.type	ffi_go_closure_\suffix, @function
+	FFI_HIDDEN (ffi_go_closure_\suffix)
+ffi_go_closure_\suffix:
+	.cfi_startproc
+	subl	$ffi_closure_FS, %esp
+	.cfi_adjust_cfa_offset ffi_closure_FS
+	FFI_CLOSURE_SAVE_REGS
+	movl	4(\chain), \t1		/* copy cif */
+	movl	8(\chain), \t2		/* copy fun */
+	movl	\t1, 28(%esp)
+	movl	\t2, 32(%esp)
+	movl	\chain, 36(%esp)	/* closure is user_data  */
+	jmp	88f
+	.cfi_endproc
+	.size	ffi_go_closure_\suffix, . - ffi_go_closure_\suffix
+.endm
+
+FFI_GO_CLOSURE	eax, %eax, %edx, %ecx
+FFI_GO_CLOSURE	ecx, %ecx, %edx, %eax
+
+/* The closure entry points are reached from the ffi_closure trampoline.
+   On entry, %eax contains the address of the ffi_closure.  */
+
 	.align	16
 	.globl	ffi_closure_i386
 	.type	ffi_closure_i386, @function
@@ -213,8 +254,16 @@ E X86_RET_UNUSED15
 
 ffi_closure_i386:
 	.cfi_startproc
-	FFI_CLOSURE_FIRST
-	FFI_CLOSURE_SECOND
+	subl	$ffi_closure_FS, %esp
+	.cfi_adjust_cfa_offset ffi_closure_FS
+
+	FFI_CLOSURE_SAVE_REGS
+	FFI_CLOSURE_COPY_TRAMP_DATA
+
+88:	/* Entry point from preceding Go closures.  */
+
+	FFI_CLOSURE_CALL_INNER
+	FFI_CLOSURE_MASK_AND_JUMP
 
 	.align	8
 0:
@@ -284,6 +333,8 @@ E X86_RET_UNUSED15
 	.cfi_endproc
 	.size	ffi_closure_i386, . - ffi_closure_i386
 
+FFI_GO_CLOSURE	stdcall, %ecx, %edx, %eax
+
 	.align	16
 	.globl	ffi_closure_i386_stdcall
 	.type	ffi_closure_i386_stdcall, @function
@@ -291,16 +342,34 @@ E X86_RET_UNUSED15
 
 ffi_closure_i386_stdcall:
 	.cfi_startproc
-	FFI_CLOSURE_FIRST
+	subl	$ffi_closure_FS, %esp
+	.cfi_adjust_cfa_offset ffi_closure_FS
+
+	FFI_CLOSURE_SAVE_REGS
+	FFI_CLOSURE_COPY_TRAMP_DATA
+
+88:	/* Entry point from preceding Go closure.  */
+
+	FFI_CLOSURE_CALL_INNER
 
 	movl	%eax, %ecx
 	shrl	$4, %ecx			    /* isolate pop count */
 	leal	ffi_closure_FS(%esp, %ecx), %ecx    /* compute popped esp */
 	movl	ffi_closure_FS(%esp), %edx	    /* move return address */
 	movl	%edx, (%ecx)
+
+	/* New pseudo-stack frame based off ecx.  This is unwind trickery
+	   in that the CFA *has* changed, to the proper popped stack address.
+	   Note that the location to which we moved the return address
+	   is (the new) CFA-4, so that's unchanged.  */
 	.cfi_def_cfa %ecx, 4
+	/* Normally esp is unwound to CFA + the caller's ARGS_SIZE.
+	   We've just set the CFA to that final value.  Tell the unwinder
+	   to restore esp from CFA without the ARGS_SIZE:
+	   DW_CFA_val_expression %esp, DW_OP_call_frame_cfa.  */
+	.cfi_escape 0x16, 4, 1, 0x9c
 
-	FFI_CLOSURE_SECOND
+	FFI_CLOSURE_MASK_AND_JUMP
 
 	.align	8
 0:
-- 
1.9.3

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 02/13] Allow the front-end to create calls with a static chain
  2014-10-10 20:43 [PATCH 00/13] Go closures, libffi, and the static chain Richard Henderson
                   ` (10 preceding siblings ...)
  2014-10-10 20:43 ` [PATCH 08/13] libgo: Use the new libffi interfaces for Go Richard Henderson
@ 2014-10-10 20:43 ` Richard Henderson
  2014-10-10 20:43 ` [PATCH 07/13] libffi: Support go closures on x86_64 Richard Henderson
                   ` (3 subsequent siblings)
  15 siblings, 0 replies; 43+ messages in thread
From: Richard Henderson @ 2014-10-10 20:43 UTC (permalink / raw)
  To: gcc-patches; +Cc: libffi-discuss, gofrontend-dev

And, at the same time, allow indirect calls to have a static chain.
We'll always eliminate the static chain if we can prove it's unused.
---
 gcc/calls.c       | 14 ++++++++------
 gcc/gimple-fold.c | 21 +++++++++++++++++++++
 gcc/gimplify.c    | 17 ++++++++++++++++-
 gcc/tree-cfg.c    | 22 +++++++---------------
 4 files changed, 52 insertions(+), 22 deletions(-)

diff --git a/gcc/calls.c b/gcc/calls.c
index 9c19f38..68abe46 100644
--- a/gcc/calls.c
+++ b/gcc/calls.c
@@ -173,7 +173,7 @@ static void restore_fixed_argument_area (rtx, rtx, int, int);
    CALL_INSN_FUNCTION_USAGE information.  */
 
 rtx
-prepare_call_address (tree fndecl, rtx funexp, rtx static_chain_value,
+prepare_call_address (tree fndecl_or_type, rtx funexp, rtx static_chain_value,
 		      rtx *call_fusage, int reg_parm_seen, int sibcallp)
 {
   /* Make a valid memory address and copy constants through pseudo-regs,
@@ -193,12 +193,13 @@ prepare_call_address (tree fndecl, rtx funexp, rtx static_chain_value,
 #endif
     }
 
-  if (static_chain_value != 0)
+  if (static_chain_value != 0
+      && (TREE_CODE (fndecl_or_type) != FUNCTION_DECL
+	  || DECL_STATIC_CHAIN (fndecl_or_type)))
     {
       rtx chain;
 
-      gcc_assert (fndecl);
-      chain = targetm.calls.static_chain (fndecl, false);
+      chain = targetm.calls.static_chain (fndecl_or_type, false);
       static_chain_value = convert_memory_address (Pmode, static_chain_value);
 
       emit_move_insn (chain, static_chain_value);
@@ -3106,8 +3107,9 @@ expand_call (tree exp, rtx target, int ignore)
 	}
 
       after_args = get_last_insn ();
-      funexp = prepare_call_address (fndecl, funexp, static_chain_value,
-				     &call_fusage, reg_parm_seen, pass == 0);
+      funexp = prepare_call_address (fndecl ? fndecl : fntype, funexp,
+				     static_chain_value, &call_fusage,
+				     reg_parm_seen, pass == 0);
 
       load_register_parameters (args, num_actuals, &call_fusage, flags,
 				pass == 0, &sibcall_failure);
diff --git a/gcc/gimple-fold.c b/gcc/gimple-fold.c
index 8ac2211..d5baa72 100644
--- a/gcc/gimple-fold.c
+++ b/gcc/gimple-fold.c
@@ -2636,6 +2636,27 @@ gimple_fold_call (gimple_stmt_iterator *gsi, bool inplace)
 	}
     }
 
+  /* Check for indirect calls that became direct calls, and then
+     no longer require a static chain.  */
+  if (gimple_call_chain (stmt))
+    {
+      tree fn = gimple_call_fndecl (stmt);
+      if (fn && !DECL_STATIC_CHAIN (fn))
+	{
+	  gimple_call_set_chain (stmt, NULL);
+	  changed = true;
+	}
+      else
+	{
+	  tree tmp = maybe_fold_reference (gimple_call_chain (stmt), false);
+	  if (tmp)
+	    {
+	      gimple_call_set_chain (stmt, tmp);
+	      changed = true;
+	    }
+	}
+    }
+
   if (inplace)
     return changed;
 
diff --git a/gcc/gimplify.c b/gcc/gimplify.c
index 6ee85f2..a08e217 100644
--- a/gcc/gimplify.c
+++ b/gcc/gimplify.c
@@ -2414,7 +2414,7 @@ gimplify_call_expr (tree *expr_p, gimple_seq *pre_p, bool want_value)
 	}
     }
 
-  /* Finally, gimplify the function arguments.  */
+  /* Gimplify the function arguments.  */
   if (nargs > 0)
     {
       for (i = (PUSH_ARGS_REVERSED ? nargs - 1 : 0);
@@ -2436,6 +2436,21 @@ gimplify_call_expr (tree *expr_p, gimple_seq *pre_p, bool want_value)
         }
     }
 
+  /* Gimplify the static chain.  */
+  if (CALL_EXPR_STATIC_CHAIN (*expr_p))
+    {
+      if (fndecl && !DECL_STATIC_CHAIN (fndecl))
+	CALL_EXPR_STATIC_CHAIN (*expr_p) = NULL;
+      else
+	{
+	  enum gimplify_status t;
+	  t = gimplify_arg (&CALL_EXPR_STATIC_CHAIN (*expr_p), pre_p,
+			    EXPR_LOCATION (*expr_p));
+	  if (t == GS_ERROR)
+	    ret = GS_ERROR;
+	}
+    }
+
   /* Verify the function result.  */
   if (want_value && fndecl
       && VOID_TYPE_P (TREE_TYPE (TREE_TYPE (fnptrtype))))
diff --git a/gcc/tree-cfg.c b/gcc/tree-cfg.c
index e88842a..5fd56c1 100644
--- a/gcc/tree-cfg.c
+++ b/gcc/tree-cfg.c
@@ -3297,22 +3297,14 @@ verify_gimple_call (gimple stmt)
       return true;
     }
 
-  /* If there is a static chain argument, this should not be an indirect
-     call, and the decl should have DECL_STATIC_CHAIN set.  */
-  if (gimple_call_chain (stmt))
+  /* If there is a static chain argument, the call should either be
+     indirect, or the decl should have DECL_STATIC_CHAIN set.  */
+  if (gimple_call_chain (stmt)
+      && fndecl
+      && !DECL_STATIC_CHAIN (fndecl))
     {
-      if (!gimple_call_fndecl (stmt))
-	{
-	  error ("static chain in indirect gimple call");
-	  return true;
-	}
-      fn = TREE_OPERAND (fn, 0);
-
-      if (!DECL_STATIC_CHAIN (fn))
-	{
-	  error ("static chain with function that doesn%'t use one");
-	  return true;
-	}
+      error ("static chain with function that doesn%'t use one");
+      return true;
     }
 
   /* ???  The C frontend passes unpromoted arguments in case it
-- 
1.9.3

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 10/13] libffi: Rewrite aarch64
  2014-10-10 20:43 [PATCH 00/13] Go closures, libffi, and the static chain Richard Henderson
                   ` (3 preceding siblings ...)
  2014-10-10 20:43 ` [PATCH 12/13] libffi: Rewrite i386 sysv Richard Henderson
@ 2014-10-10 20:43 ` Richard Henderson
  2014-10-10 20:43 ` [PATCH 13/13] libffi: Support go closures on i386 Richard Henderson
                   ` (10 subsequent siblings)
  15 siblings, 0 replies; 43+ messages in thread
From: Richard Henderson @ 2014-10-10 20:43 UTC (permalink / raw)
  To: gcc-patches; +Cc: libffi-discuss, gofrontend-dev

(1) Invent a new "internal.h" rather than polluting the public ffitarget.h
    with stuff that ought not be exposed.

(2) Rewrite is_hfa so that it is not so horribly computationally expensive
    and, more to the point, so that it no longer requires us to _re_compute
    the same stuff in order to actually do anything with the type.

(3) Don't use the out-dated prep_args callback form for ffi_call.
    The x86_64 port has for years shown how to do this with a single alloca,
    but new ports keep copying i386 which still does it the inefficient way.
---
 libffi/src/aarch64/ffi.c       | 1362 +++++++++++++++-------------------------
 libffi/src/aarch64/ffitarget.h |   17 +-
 libffi/src/aarch64/internal.h  |   43 ++
 libffi/src/aarch64/sysv.S      |  499 ++++++++-------
 4 files changed, 816 insertions(+), 1105 deletions(-)
 create mode 100644 libffi/src/aarch64/internal.h

diff --git a/libffi/src/aarch64/ffi.c b/libffi/src/aarch64/ffi.c
index 1405665..c409c0c 100644
--- a/libffi/src/aarch64/ffi.c
+++ b/libffi/src/aarch64/ffi.c
@@ -20,42 +20,37 @@ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
 
 #include <stdio.h>
-
+#include <stdlib.h>
+#include <stdint.h>
 #include <ffi.h>
 #include <ffi_common.h>
+#include "internal.h"
 
-#include <stdlib.h>
-
-/* Stack alignment requirement in bytes */
+/* Stack alignment requirement in bytes.  */
 #define AARCH64_STACK_ALIGN 16
 
+/* Number of X and V argument registers.  */
 #define N_X_ARG_REG 8
 #define N_V_ARG_REG 8
 
-#define AARCH64_FFI_WITH_V (1 << AARCH64_FFI_WITH_V_BIT)
-
 union _d
 {
   UINT64 d;
   UINT32 s[2];
 };
 
-struct call_context
+struct _v
 {
-  UINT64 x [AARCH64_N_XREG];
-  struct
-  {
-    union _d d[2];
-  } v [AARCH64_N_VREG];
+  union _d d[2] __attribute__((aligned(16)));
 };
 
-static void *
-get_x_addr (struct call_context *context, unsigned n)
+struct call_context
 {
-  return &context->x[n];
-}
+  UINT64 x[N_X_ARG_REG];
+  struct _v v[N_V_ARG_REG];
+};
 
-static void *
+static inline UINT32 *
 get_s_addr (struct call_context *context, unsigned n)
 {
 #if defined __AARCH64EB__
@@ -65,557 +60,371 @@ get_s_addr (struct call_context *context, unsigned n)
 #endif
 }
 
-static void *
+static inline UINT64 *
 get_d_addr (struct call_context *context, unsigned n)
 {
 #if defined __AARCH64EB__
-  return &context->v[n].d[1];
+  return &context->v[n].d[1].d;
 #else
-  return &context->v[n].d[0];
+  return &context->v[n].d[0].d;
 #endif
 }
 
-static void *
-get_v_addr (struct call_context *context, unsigned n)
-{
-  return &context->v[n];
-}
-
-/* Return the memory location at which a basic type would reside
-   were it to have been stored in register n.  */
-
-static void *
-get_basic_type_addr (unsigned short type, struct call_context *context,
-		     unsigned n)
-{
-  switch (type)
-    {
-    case FFI_TYPE_FLOAT:
-      return get_s_addr (context, n);
-    case FFI_TYPE_DOUBLE:
-      return get_d_addr (context, n);
-    case FFI_TYPE_LONGDOUBLE:
-      return get_v_addr (context, n);
-    case FFI_TYPE_UINT8:
-    case FFI_TYPE_SINT8:
-    case FFI_TYPE_UINT16:
-    case FFI_TYPE_SINT16:
-    case FFI_TYPE_UINT32:
-    case FFI_TYPE_SINT32:
-    case FFI_TYPE_INT:
-    case FFI_TYPE_POINTER:
-    case FFI_TYPE_UINT64:
-    case FFI_TYPE_SINT64:
-      return get_x_addr (context, n);
-    default:
-      FFI_ASSERT (0);
-      return NULL;
-    }
-}
-
-/* Return the alignment width for each of the basic types.  */
-
-static size_t
-get_basic_type_alignment (unsigned short type)
-{
-  switch (type)
-    {
-    case FFI_TYPE_FLOAT:
-    case FFI_TYPE_DOUBLE:
-      return sizeof (UINT64);
-    case FFI_TYPE_LONGDOUBLE:
-      return sizeof (long double);
-    case FFI_TYPE_UINT8:
-    case FFI_TYPE_SINT8:
-    case FFI_TYPE_UINT16:
-    case FFI_TYPE_SINT16:
-    case FFI_TYPE_UINT32:
-    case FFI_TYPE_INT:
-    case FFI_TYPE_SINT32:
-    case FFI_TYPE_POINTER:
-    case FFI_TYPE_UINT64:
-    case FFI_TYPE_SINT64:
-      return sizeof (UINT64);
+extern void ffi_call_SYSV (void *frame, void *rvalue,
+			   struct call_context *context,
+			   unsigned flags, void (*fn)(void)) FFI_HIDDEN;
 
-    default:
-      FFI_ASSERT (0);
-      return 0;
-    }
-}
+extern void ffi_closure_SYSV (void) FFI_HIDDEN;
+extern void ffi_closure_SYSV_V (void) FFI_HIDDEN;
 
-/* Return the size in bytes for each of the basic types.  */
+/* A subroutine of is_hfa.  Given a structure type, return the type code
+   of the first non-structure element.  Recurse for structure elements.
+   Return -1 if the structure is in fact empty, i.e. no nested elements.  */
 
-static size_t
-get_basic_type_size (unsigned short type)
+static int
+is_hfa0 (const ffi_type *ty)
 {
-  switch (type)
-    {
-    case FFI_TYPE_FLOAT:
-      return sizeof (UINT32);
-    case FFI_TYPE_DOUBLE:
-      return sizeof (UINT64);
-    case FFI_TYPE_LONGDOUBLE:
-      return sizeof (long double);
-    case FFI_TYPE_UINT8:
-      return sizeof (UINT8);
-    case FFI_TYPE_SINT8:
-      return sizeof (SINT8);
-    case FFI_TYPE_UINT16:
-      return sizeof (UINT16);
-    case FFI_TYPE_SINT16:
-      return sizeof (SINT16);
-    case FFI_TYPE_UINT32:
-      return sizeof (UINT32);
-    case FFI_TYPE_INT:
-    case FFI_TYPE_SINT32:
-      return sizeof (SINT32);
-    case FFI_TYPE_POINTER:
-    case FFI_TYPE_UINT64:
-      return sizeof (UINT64);
-    case FFI_TYPE_SINT64:
-      return sizeof (SINT64);
-
-    default:
-      FFI_ASSERT (0);
-      return 0;
-    }
-}
+  ffi_type **elements = ty->elements;
+  int i, ret = -1;
 
-extern void
-ffi_call_SYSV (unsigned (*)(struct call_context *context, unsigned char *,
-			    extended_cif *),
-               struct call_context *context,
-               extended_cif *,
-               unsigned,
-               void (*fn)(void));
-
-extern void
-ffi_closure_SYSV (ffi_closure *);
-
-/* Test for an FFI floating point representation.  */
+  if (elements != NULL)
+    for (i = 0; elements[i]; ++i)
+      {
+	ret = elements[i]->type;
+        if (ret == FFI_TYPE_STRUCT)
+	  {
+	    ret = is_hfa0 (elements[i]);
+	    if (ret < 0)
+	      continue;
+	  }
+	break;
+      }
 
-static unsigned
-is_floating_type (unsigned short type)
-{
-  return (type == FFI_TYPE_FLOAT || type == FFI_TYPE_DOUBLE
-	  || type == FFI_TYPE_LONGDOUBLE);
+  return ret;
 }
 
-/* Test for a homogeneous structure.  */
+/* A subroutine of is_hfa.  Given a structure type, return true if all
+   of the non-structure elements are the same as CANDIDATE.  */
 
-static unsigned short
-get_homogeneous_type (ffi_type *ty)
+static int
+is_hfa1 (const ffi_type *ty, int candidate)
 {
-  if (ty->type == FFI_TYPE_STRUCT && ty->elements)
-    {
-      unsigned i;
-      unsigned short candidate_type
-	= get_homogeneous_type (ty->elements[0]);
-      for (i =1; ty->elements[i]; i++)
-	{
-	  unsigned short iteration_type = 0;
-	  /* If we have a nested struct, we must find its homogeneous type.
-	     If that fits with our candidate type, we are still
-	     homogeneous.  */
-	  if (ty->elements[i]->type == FFI_TYPE_STRUCT
-	      && ty->elements[i]->elements)
-	    {
-	      iteration_type = get_homogeneous_type (ty->elements[i]);
-	    }
-	  else
-	    {
-	      iteration_type = ty->elements[i]->type;
-	    }
+  ffi_type **elements = ty->elements;
+  int i;
 
-	  /* If we are not homogeneous, return FFI_TYPE_STRUCT.  */
-	  if (candidate_type != iteration_type)
-	    return FFI_TYPE_STRUCT;
-	}
-      return candidate_type;
-    }
+  if (elements != NULL)
+    for (i = 0; elements[i]; ++i)
+      {
+	int t = elements[i]->type;
+	if (t == FFI_TYPE_STRUCT)
+	  {
+	    if (!is_hfa1 (elements[i], candidate))
+	      return 0;
+	  }
+	else if (t != candidate)
+	  return 0;
+      }
 
-  /* Base case, we have no more levels of nesting, so we
-     are a basic type, and so, trivially homogeneous in that type.  */
-  return ty->type;
+  return 1;
 }
 
-/* Determine the number of elements within a STRUCT.
+/* Determine if TY is a homogeneous floating point aggregate (HFA).
+   That is, a structure consisting of 1 to 4 members of all the same type,
+   where that type is a floating point scalar.
 
-   Note, we must handle nested structs.
+   Returns non-zero iff TY is an HFA.  The result is an encoded value where
+   bits 0-7 contain the type code, and bits 8-10 contain the element count.  */
 
-   If ty is not a STRUCT this function will return 0.  */
-
-static unsigned
-element_count (ffi_type *ty)
+static int
+is_hfa(const ffi_type *ty)
 {
-  if (ty->type == FFI_TYPE_STRUCT && ty->elements)
-    {
-      unsigned n;
-      unsigned elems = 0;
-      for (n = 0; ty->elements[n]; n++)
-	{
-	  if (ty->elements[n]->type == FFI_TYPE_STRUCT
-	      && ty->elements[n]->elements)
-	    elems += element_count (ty->elements[n]);
-	  else
-	    elems++;
-	}
-      return elems;
-    }
-  return 0;
-}
+  ffi_type **elements;
+  int candidate, i;
+  size_t size, ele_count;
 
-/* Test for a homogeneous floating point aggregate.
+  /* Quickest tests first.  */
+  if (ty->type != FFI_TYPE_STRUCT)
+    return 0;
 
-   A homogeneous floating point aggregate is a homogeneous aggregate of
-   a half- single- or double- precision floating point type with one
-   to four elements.  Note that this includes nested structs of the
-   basic type.  */
+  /* No HFA types are smaller than 4 bytes, or larger than 64 bytes.  */
+  size = ty->size;
+  if (size < 4 || size > 64)
+    return 0;
 
-static int
-is_hfa (ffi_type *ty)
-{
-  if (ty->type == FFI_TYPE_STRUCT
-      && ty->elements[0]
-      && is_floating_type (get_homogeneous_type (ty)))
+  /* Find the type of the first non-structure member.  */
+  elements = ty->elements;
+  candidate = elements[0]->type;
+  if (candidate == FFI_TYPE_STRUCT)
     {
-      unsigned n = element_count (ty);
-      return n >= 1 && n <= 4;
+      for (i = 0; ; ++i)
+	{
+	  candidate = is_hfa0 (elements[i]);
+	  if (candidate >= 0)
+	    break;
+	}
     }
-  return 0;
-}
-
-/* Test if an ffi_type is a candidate for passing in a register.
-
-   This test does not check that sufficient registers of the
-   appropriate class are actually available, merely that IFF
-   sufficient registers are available then the argument will be passed
-   in register(s).
-
-   Note that an ffi_type that is deemed to be a register candidate
-   will always be returned in registers.
 
-   Returns 1 if a register candidate else 0.  */
-
-static int
-is_register_candidate (ffi_type *ty)
-{
-  switch (ty->type)
+  /* If the first member is not a floating point type, it's not an HFA.
+     Also quickly re-check the size of the structure.  */
+  switch (candidate)
     {
-    case FFI_TYPE_VOID:
     case FFI_TYPE_FLOAT:
+      ele_count = size / sizeof(float);
+      if (size != ele_count * sizeof(float))
+	return 0;
+      break;
     case FFI_TYPE_DOUBLE:
+      ele_count = size / sizeof(double);
+      if (size != ele_count * sizeof(double))
+	return 0;
+      break;
     case FFI_TYPE_LONGDOUBLE:
-    case FFI_TYPE_UINT8:
-    case FFI_TYPE_UINT16:
-    case FFI_TYPE_UINT32:
-    case FFI_TYPE_UINT64:
-    case FFI_TYPE_POINTER:
-    case FFI_TYPE_SINT8:
-    case FFI_TYPE_SINT16:
-    case FFI_TYPE_SINT32:
-    case FFI_TYPE_INT:
-    case FFI_TYPE_SINT64:
-      return 1;
-
-    case FFI_TYPE_STRUCT:
-      if (is_hfa (ty))
-        {
-          return 1;
-        }
-      else if (ty->size > 16)
-        {
-          /* Too large. Will be replaced with a pointer to memory. The
-             pointer MAY be passed in a register, but the value will
-             not. This test specifically fails since the argument will
-             never be passed by value in registers. */
-          return 0;
-        }
-      else
-        {
-          /* Might be passed in registers depending on the number of
-             registers required. */
-          return (ty->size + 7) / 8 < N_X_ARG_REG;
-        }
+      ele_count = size / sizeof(long double);
+      if (size != ele_count * sizeof(long double))
+	return 0;
       break;
-
     default:
-      FFI_ASSERT (0);
-      break;
+      return 0;
     }
+  if (ele_count > 4)
+    return 0;
 
-  return 0;
-}
-
-/* Test if an ffi_type argument or result is a candidate for a vector
-   register.  */
-
-static int
-is_v_register_candidate (ffi_type *ty)
-{
-  return is_floating_type (ty->type)
-	   || (ty->type == FFI_TYPE_STRUCT && is_hfa (ty));
-}
-
-/* Representation of the procedure call argument marshalling
-   state.
-
-   The terse state variable names match the names used in the AARCH64
-   PCS. */
-
-struct arg_state
-{
-  unsigned ngrn;                /* Next general-purpose register number. */
-  unsigned nsrn;                /* Next vector register number. */
-  unsigned nsaa;                /* Next stack offset. */
-};
-
-/* Initialize a procedure call argument marshalling state.  */
-static void
-arg_init (struct arg_state *state, unsigned call_frame_size)
-{
-  state->ngrn = 0;
-  state->nsrn = 0;
-  state->nsaa = 0;
-}
-
-/* Return the number of available consecutive core argument
-   registers.  */
-
-static unsigned
-available_x (struct arg_state *state)
-{
-  return N_X_ARG_REG - state->ngrn;
-}
-
-/* Return the number of available consecutive vector argument
-   registers.  */
-
-static unsigned
-available_v (struct arg_state *state)
-{
-  return N_V_ARG_REG - state->nsrn;
-}
-
-static void *
-allocate_to_x (struct call_context *context, struct arg_state *state)
-{
-  FFI_ASSERT (state->ngrn < N_X_ARG_REG)
-  return get_x_addr (context, (state->ngrn)++);
-}
-
-static void *
-allocate_to_s (struct call_context *context, struct arg_state *state)
-{
-  FFI_ASSERT (state->nsrn < N_V_ARG_REG)
-  return get_s_addr (context, (state->nsrn)++);
-}
-
-static void *
-allocate_to_d (struct call_context *context, struct arg_state *state)
-{
-  FFI_ASSERT (state->nsrn < N_V_ARG_REG)
-  return get_d_addr (context, (state->nsrn)++);
-}
-
-static void *
-allocate_to_v (struct call_context *context, struct arg_state *state)
-{
-  FFI_ASSERT (state->nsrn < N_V_ARG_REG)
-  return get_v_addr (context, (state->nsrn)++);
-}
-
-/* Allocate an aligned slot on the stack and return a pointer to it.  */
-static void *
-allocate_to_stack (struct arg_state *state, void *stack, unsigned alignment,
-		   unsigned size)
-{
-  void *allocation;
-
-  /* Round up the NSAA to the larger of 8 or the natural
-     alignment of the argument's type.  */
-  state->nsaa = ALIGN (state->nsaa, alignment);
-  state->nsaa = ALIGN (state->nsaa, alignment);
-  state->nsaa = ALIGN (state->nsaa, 8);
-
-  allocation = stack + state->nsaa;
+  /* Finally, make sure that all scalar elements are the same type.  */
+  for (i = 0; elements[i]; ++i)
+    {
+      if (elements[i]->type == FFI_TYPE_STRUCT)
+	{
+	  if (!is_hfa1 (elements[i], candidate))
+	    return 0;
+	}
+      else if (elements[i]->type != candidate)
+	return 0;
+    }
 
-  state->nsaa += size;
-  return allocation;
+  /* All tests succeeded.  Encode the result.  */
+  return (ele_count << 8) | candidate;
 }
 
-static void
-copy_basic_type (void *dest, void *source, unsigned short type)
+/* Extend a basic type to fill a 64-bit slot.  */
+static UINT64
+extend_basic_type (UINT64 ret, unsigned short type)
 {
-  /* This is neccessary to ensure that basic types are copied
-     sign extended to 64-bits as libffi expects.  */
   switch (type)
     {
     case FFI_TYPE_FLOAT:
-      *(float *) dest = *(float *) source;
-      break;
-    case FFI_TYPE_DOUBLE:
-      *(double *) dest = *(double *) source;
-      break;
-    case FFI_TYPE_LONGDOUBLE:
-      *(long double *) dest = *(long double *) source;
+      ret = (UINT32)ret;
+#if defined __AARCH64EB__
+      ret <<= 32;
+#endif
       break;
     case FFI_TYPE_UINT8:
-      *(ffi_arg *) dest = *(UINT8 *) source;
+      ret = (UINT8)ret;
       break;
     case FFI_TYPE_SINT8:
-      *(ffi_sarg *) dest = *(SINT8 *) source;
+      ret = (SINT8)ret;
       break;
     case FFI_TYPE_UINT16:
-      *(ffi_arg *) dest = *(UINT16 *) source;
+      ret = (UINT16)ret;
       break;
     case FFI_TYPE_SINT16:
-      *(ffi_sarg *) dest = *(SINT16 *) source;
+      ret = (SINT16)ret;
       break;
     case FFI_TYPE_UINT32:
-      *(ffi_arg *) dest = *(UINT32 *) source;
+      ret = (UINT32)ret;
       break;
     case FFI_TYPE_INT:
     case FFI_TYPE_SINT32:
-      *(ffi_sarg *) dest = *(SINT32 *) source;
+      ret = (SINT32)ret;
       break;
-    case FFI_TYPE_POINTER:
+    case FFI_TYPE_DOUBLE:
     case FFI_TYPE_UINT64:
-      *(ffi_arg *) dest = *(UINT64 *) source;
-      break;
     case FFI_TYPE_SINT64:
-      *(ffi_sarg *) dest = *(SINT64 *) source;
       break;
-
+    case FFI_TYPE_POINTER:
+      ret = (uintptr_t)ret;
+      break;
     default:
-      FFI_ASSERT (0);
+      abort ();
     }
+  return ret;
 }
 
-static void
-copy_hfa_to_reg_or_stack (void *memory,
-			  ffi_type *ty,
-			  struct call_context *context,
-			  unsigned char *stack,
-			  struct arg_state *state)
+ffi_status FFI_HIDDEN
+ffi_prep_cif_machdep (ffi_cif *cif)
 {
-  unsigned elems = element_count (ty);
-  if (available_v (state) < elems)
-    {
-      /* There are insufficient V registers. Further V register allocations
-	 are prevented, the NSAA is adjusted (by allocate_to_stack ())
-	 and the argument is copied to memory at the adjusted NSAA.  */
-      state->nsrn = N_V_ARG_REG;
-      memcpy (allocate_to_stack (state, stack, ty->alignment, ty->size),
-	      memory,
-	      ty->size);
-    }
-  else
-    {
-      int i;
-      unsigned short type = get_homogeneous_type (ty);
-      unsigned elems = element_count (ty);
-      for (i = 0; i < elems; i++)
-	{
-	  void *reg = allocate_to_v (context, state);
-	  copy_basic_type (reg, memory, type);
-	  memory += get_basic_type_size (type);
-	}
-    }
-}
+  int flags, h, i;
+  ffi_type *rtype;
 
-/* Either allocate an appropriate register for the argument type, or if
-   none are available, allocate a stack slot and return a pointer
-   to the allocated space.  */
+  /* Round the stack up to a multiple of the stack alignment requirement. */
+  cif->bytes = ALIGN (cif->bytes, AARCH64_STACK_ALIGN);
 
-static void *
-allocate_to_register_or_stack (struct call_context *context,
-			       unsigned char *stack,
-			       struct arg_state *state,
-			       unsigned short type)
-{
-  size_t alignment = get_basic_type_alignment (type);
-  size_t size = alignment;
-  switch (type)
+  rtype = cif->rtype;
+  switch (rtype->type)
     {
-    case FFI_TYPE_FLOAT:
-      /* This is the only case for which the allocated stack size
-	 should not match the alignment of the type.  */
-      size = sizeof (UINT32);
-      /* Fall through.  */
-    case FFI_TYPE_DOUBLE:
-      if (state->nsrn < N_V_ARG_REG)
-	return allocate_to_d (context, state);
-      state->nsrn = N_V_ARG_REG;
-      break;
-    case FFI_TYPE_LONGDOUBLE:
-      if (state->nsrn < N_V_ARG_REG)
-	return allocate_to_v (context, state);
-      state->nsrn = N_V_ARG_REG;
+    case FFI_TYPE_VOID:
+      flags = AARCH64_RET_VOID;
       break;
     case FFI_TYPE_UINT8:
-    case FFI_TYPE_SINT8:
     case FFI_TYPE_UINT16:
-    case FFI_TYPE_SINT16:
     case FFI_TYPE_UINT32:
-    case FFI_TYPE_SINT32:
+      flags = AARCH64_RET_UINT32;
+      break;
     case FFI_TYPE_INT:
-    case FFI_TYPE_POINTER:
+    case FFI_TYPE_SINT8:
+    case FFI_TYPE_SINT16:
+    case FFI_TYPE_SINT32:
+      flags = AARCH64_RET_SINT32;
+      break;
     case FFI_TYPE_UINT64:
     case FFI_TYPE_SINT64:
-      if (state->ngrn < N_X_ARG_REG)
-	return allocate_to_x (context, state);
-      state->ngrn = N_X_ARG_REG;
+      flags = AARCH64_RET_INT64;
+      break;
+    case FFI_TYPE_POINTER:
+      flags = (sizeof(void *) == 8 ? AARCH64_RET_INT64 : AARCH64_RET_UINT32);
+      break;
+    case FFI_TYPE_FLOAT:
+      flags = AARCH64_RET_FLOAT;
+      break;
+    case FFI_TYPE_DOUBLE:
+      flags = AARCH64_RET_DOUBLE;
+      break;
+    case FFI_TYPE_LONGDOUBLE:
+      flags = AARCH64_RET_LDOUBLE;
+      break;
+    case FFI_TYPE_STRUCT:
+      h = is_hfa (rtype);
+      switch (h & 0xff)
+        {
+        case FFI_TYPE_FLOAT:
+          flags = AARCH64_RET_HFA_FLOAT;
+          break;
+        case FFI_TYPE_DOUBLE:
+          flags = AARCH64_RET_HFA_DOUBLE;
+          break;
+        case FFI_TYPE_LONGDOUBLE:
+          flags = AARCH64_RET_HFA_LDOUBLE;
+          break;
+        default:
+	  flags = (rtype->size > 16
+		   ? AARCH64_RET_LG_STRUCT
+		   : AARCH64_RET_SM_STRUCT);
+          break;
+	}
       break;
     default:
-      FFI_ASSERT (0);
+      abort ();
     }
 
-    return allocate_to_stack (state, stack, alignment, size);
-}
-
-/* Copy a value to an appropriate register, or if none are
-   available, to the stack.  */
+  /* Note if any argument requires fp registers.  */
+  for (i = 0; i < cif->nargs; i++)
+    {
+      ffi_type *ty = cif->arg_types[i];
+      int tt = ty->type;
+      if (tt == FFI_TYPE_FLOAT
+          || tt == FFI_TYPE_DOUBLE
+          || tt == FFI_TYPE_LONGDOUBLE
+	  || is_hfa (ty))
+	{
+	  flags |= AARCH64_FLAG_ARG_V;
+	  break;
+	}
+    }
 
-static void
-copy_to_register_or_stack (struct call_context *context,
-			   unsigned char *stack,
-			   struct arg_state *state,
-			   void *value,
-			   unsigned short type)
-{
-  copy_basic_type (
-	  allocate_to_register_or_stack (context, stack, state, type),
-	  value,
-	  type);
+  cif->flags = flags;
+  return FFI_OK;
 }
 
-/* Marshall the arguments from FFI representation to procedure call
-   context and stack.  */
-
-static unsigned
-aarch64_prep_args (struct call_context *context, unsigned char *stack,
-		   extended_cif *ecif)
+/* Call a function with the provided arguments and capture the return
+   value.  */
+void
+ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
 {
-  int i;
-  struct arg_state state;
+  struct call_context *context;
+  UINT64 *stack, *slot;
+  void *frame, *local_rvalue;
+  ffi_type **arg_types;
+  int i, h, nargs, ngrn, nsrn, nsaa;
+  size_t size, stack_space, ret_space;
 
-  arg_init (&state, ALIGN(ecif->cif->bytes, 16));
+  FFI_ASSERT (cif->abi == FFI_SYSV);
 
-  for (i = 0; i < ecif->cif->nargs; i++)
+  ret_space = 0;
+  h = cif->flags & AARCH64_FLAG_RET_MASK;
+  switch (h)
     {
-      ffi_type *ty = ecif->cif->arg_types[i];
-      switch (ty->type)
-	{
-	case FFI_TYPE_VOID:
-	  FFI_ASSERT (0);
-	  break;
+    case AARCH64_RET_HFA_FLOAT:
+      /* The assembly always writes 4 elements.  */
+      ret_space = 4 * sizeof(float);
+      break;
+    case AARCH64_RET_HFA_DOUBLE:
+      ret_space = 4 * sizeof(double);
+      break;
+    case AARCH64_RET_HFA_LDOUBLE:
+      ret_space = 4 * sizeof(long double);
+      break;
+    case AARCH64_RET_SM_STRUCT:
+      ret_space = 16;
+      break;
+    case AARCH64_RET_LG_STRUCT:
+      if (rvalue == NULL)
+	ret_space = cif->rtype->size;
+      break;
+    }
 
+  /* Allocate the space for all of the arguments, the context, the local
+     stack frame for ffi_call_SYSV, and (possibly) the return value.  */
+  stack_space = ALIGN (cif->bytes, 16);
+  context = alloca (sizeof(struct call_context)
+		    + stack_space
+                    + 4 * sizeof(UINT64)
+		    + ret_space);
+  stack = (UINT64 *)(context + 1);
+  frame = (char *)stack + stack_space;
+
+  local_rvalue = rvalue;
+  if (ret_space)
+    local_rvalue = (char *)frame + 4 * sizeof(UINT64);
+
+  ngrn = nsrn = nsaa = 0;
+  arg_types = cif->arg_types;
+  nargs = cif->nargs;
+
+  for (i = 0; i < nargs; i++)
+    {
+      ffi_type *ty = arg_types[i];
+      unsigned short t = ty->type;
+
+      switch (t)
+	{
 	/* If the argument is a basic type the argument is allocated to an
 	   appropriate register, or if none are available, to the stack.  */
 	case FFI_TYPE_FLOAT:
+	  if (nsrn < N_V_ARG_REG)
+	    slot = get_d_addr (context, nsrn++);
+	  else
+	    slot = &stack[nsaa++];
+	  *slot = extend_basic_type (*(UINT32 *)avalue[i], t);
+	  break;
 	case FFI_TYPE_DOUBLE:
+	  if (nsrn < N_V_ARG_REG)
+	    slot = get_d_addr (context, nsrn++);
+	  else
+	    slot = &stack[nsaa++];
+	  *slot = extend_basic_type (*(UINT64 *)avalue[i], t);
+	  break;
+
 	case FFI_TYPE_LONGDOUBLE:
+	  if (nsrn < N_V_ARG_REG)
+	    slot = &context->v[nsrn++].d[0].d;
+	  else
+	    {
+	      nsaa = ALIGN (nsaa, 2);
+	      slot = &stack[nsaa];
+	      nsaa += 2;
+	    }
+	  memcpy (slot, avalue[i], sizeof(long double));
+	  break;
+
 	case FFI_TYPE_UINT8:
 	case FFI_TYPE_SINT8:
 	case FFI_TYPE_UINT16:
@@ -626,207 +435,111 @@ aarch64_prep_args (struct call_context *context, unsigned char *stack,
 	case FFI_TYPE_POINTER:
 	case FFI_TYPE_UINT64:
 	case FFI_TYPE_SINT64:
-	  copy_to_register_or_stack (context, stack, &state,
-				     ecif->avalue[i], ty->type);
+	  if (ngrn < N_X_ARG_REG)
+	    slot = &context->x[ngrn++];
+	  else
+	    slot = &stack[nsaa++];
+	  *slot = extend_basic_type (*(ffi_arg *)avalue[i], t);
 	  break;
 
+	case FFI_TYPE_VOID:
+	  /* Note that libgo passes void as a parameter for a
+	     struct with no fields.  */
 	case FFI_TYPE_STRUCT:
-	  if (is_hfa (ty))
-	    {
-	      copy_hfa_to_reg_or_stack (ecif->avalue[i], ty, context,
-					stack, &state);
-	    }
-	  else if (ty->size > 16)
-	    {
-	      /* If the argument is a composite type that is larger than 16
-		 bytes, then the argument has been copied to memory, and
-		 the argument is replaced by a pointer to the copy.  */
+	  {
+	    size_t slot_count;
 
-	      copy_to_register_or_stack (context, stack, &state,
-					 &(ecif->avalue[i]), FFI_TYPE_POINTER);
-	    }
-	  else if (available_x (&state) >= (ty->size + 7) / 8)
-	    {
-	      /* If the argument is a composite type and the size in
-		 double-words is not more than the number of available
-		 X registers, then the argument is copied into consecutive
-		 X registers.  */
-	      int j;
-	      for (j = 0; j < (ty->size + 7) / 8; j++)
-		{
-		  memcpy (allocate_to_x (context, &state),
-			  &(((UINT64 *) ecif->avalue[i])[j]),
-			  sizeof (UINT64));
-		}
-	    }
-	  else
-	    {
-	      /* Otherwise, there are insufficient X registers. Further X
-		 register allocations are prevented, the NSAA is adjusted
-		 (by allocate_to_stack ()) and the argument is copied to
-		 memory at the adjusted NSAA.  */
-	      state.ngrn = N_X_ARG_REG;
-
-	      memcpy (allocate_to_stack (&state, stack, ty->alignment,
-					 ty->size), ecif->avalue + i, ty->size);
-	    }
+	    size = ty->size;
+            slot_count = (size + 7) / 8;
+	    h = is_hfa (ty);
+	    if (h)
+	      {
+		int j, reg_count = h >> 8, tt = h & 0xff;
+
+		if (nsrn + reg_count <= N_V_ARG_REG)
+		  {
+		    switch (tt)
+		      {
+		      case FFI_TYPE_FLOAT:
+		        {
+			  UINT32 *src = avalue[i];
+		          for (j = 0; j < reg_count; ++j)
+			    *get_s_addr (context, nsrn + j) = src[j];
+		        }
+		        break;
+
+		      case FFI_TYPE_DOUBLE:
+		        {
+			  UINT64 *src = avalue[i];
+		          for (j = 0; j < reg_count; ++j)
+			    *get_d_addr (context, nsrn + j) = src[j];
+		        }
+		        break;
+
+		      case FFI_TYPE_LONGDOUBLE:
+		        memcpy(&context->v[nsrn], avalue[i], size);
+		        break;
+
+		      default:
+		        abort ();
+		    }
+		    nsrn += reg_count;
+		    break;
+		  }
+		/* All out of fp registers.  Copy to the stack.  */
+		nsrn = N_V_ARG_REG;
+	      }
+	    else if (size > 16)
+	      {
+		/* If the argument is a composite type that is larger than
+		   16 bytes, then the argument has been copied to memory,
+		   and the argument is replaced by a pointer.  */
+		if (ngrn < N_X_ARG_REG)
+		  slot = &context->x[ngrn++];
+		else
+		  slot = &stack[nsaa++];
+		*slot = (uintptr_t)avalue[i];
+		break;
+	      }
+	    else
+	      {
+	        if (ty->alignment == 16)
+		  ngrn = ALIGN (ngrn, 2);
+
+	        if (ngrn + slot_count <= N_X_ARG_REG)
+		  {
+		    slot = &context->x[ngrn];
+		    ngrn += slot_count;
+		    memcpy (slot, avalue[i], size);
+		    break;
+		  }
+		/* All out of general registers.  Copy to the stack.  */
+		ngrn = N_X_ARG_REG;
+	      }
+	    if (ty->alignment > 8)
+	      {
+		int a = ty->alignment / 8;
+		nsaa = ALIGN (nsaa, a);
+	      }
+	    memcpy (&stack[nsaa], avalue[i], size);
+	    nsaa += slot_count;
+	  }
 	  break;
 
 	default:
-	  FFI_ASSERT (0);
+	  abort ();
 	  break;
 	}
     }
 
-  return ecif->cif->aarch64_flags;
+  size = cif->rtype->size;
+  ffi_call_SYSV (frame, local_rvalue, context, cif->flags, fn);
+  if (local_rvalue != rvalue && rvalue != NULL)
+    memcpy (rvalue, local_rvalue, size);
 }
 
-ffi_status
-ffi_prep_cif_machdep (ffi_cif *cif)
-{
-  /* Round the stack up to a multiple of the stack alignment requirement. */
-  cif->bytes =
-    (cif->bytes + (AARCH64_STACK_ALIGN - 1)) & ~ (AARCH64_STACK_ALIGN - 1);
-
-  /* Initialize our flags. We are interested if this CIF will touch a
-     vector register, if so we will enable context save and load to
-     those registers, otherwise not. This is intended to be friendly
-     to lazy float context switching in the kernel.  */
-  cif->aarch64_flags = 0;
-
-  if (is_v_register_candidate (cif->rtype))
-    {
-      cif->aarch64_flags |= AARCH64_FFI_WITH_V;
-    }
-  else
-    {
-      int i;
-      for (i = 0; i < cif->nargs; i++)
-        if (is_v_register_candidate (cif->arg_types[i]))
-          {
-            cif->aarch64_flags |= AARCH64_FFI_WITH_V;
-            break;
-          }
-    }
-
-  return FFI_OK;
-}
-
-/* Call a function with the provided arguments and capture the return
-   value.  */
-void
-ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
-{
-  extended_cif ecif;
-
-  ecif.cif = cif;
-  ecif.avalue = avalue;
-  ecif.rvalue = rvalue;
-
-  switch (cif->abi)
-    {
-    case FFI_SYSV:
-      {
-        struct call_context context;
-	unsigned stack_bytes;
-
-	/* Figure out the total amount of stack space we need, the
-	   above call frame space needs to be 16 bytes aligned to
-	   ensure correct alignment of the first object inserted in
-	   that space hence the ALIGN applied to cif->bytes.*/
-	stack_bytes = ALIGN(cif->bytes, 16);
-
-	memset (&context, 0, sizeof (context));
-        if (is_register_candidate (cif->rtype))
-          {
-            ffi_call_SYSV (aarch64_prep_args, &context, &ecif, stack_bytes, fn);
-            switch (cif->rtype->type)
-              {
-              case FFI_TYPE_VOID:
-              case FFI_TYPE_FLOAT:
-              case FFI_TYPE_DOUBLE:
-              case FFI_TYPE_LONGDOUBLE:
-              case FFI_TYPE_UINT8:
-              case FFI_TYPE_SINT8:
-              case FFI_TYPE_UINT16:
-              case FFI_TYPE_SINT16:
-              case FFI_TYPE_UINT32:
-              case FFI_TYPE_SINT32:
-              case FFI_TYPE_POINTER:
-              case FFI_TYPE_UINT64:
-              case FFI_TYPE_INT:
-              case FFI_TYPE_SINT64:
-		{
-		  void *addr = get_basic_type_addr (cif->rtype->type,
-						    &context, 0);
-		  copy_basic_type (rvalue, addr, cif->rtype->type);
-		  break;
-		}
-
-              case FFI_TYPE_STRUCT:
-                if (is_hfa (cif->rtype))
-		  {
-		    int j;
-		    unsigned short type = get_homogeneous_type (cif->rtype);
-		    unsigned elems = element_count (cif->rtype);
-		    for (j = 0; j < elems; j++)
-		      {
-			void *reg = get_basic_type_addr (type, &context, j);
-			copy_basic_type (rvalue, reg, type);
-			rvalue += get_basic_type_size (type);
-		      }
-		  }
-                else if ((cif->rtype->size + 7) / 8 < N_X_ARG_REG)
-                  {
-                    unsigned size = ALIGN (cif->rtype->size, sizeof (UINT64));
-                    memcpy (rvalue, get_x_addr (&context, 0), size);
-                  }
-                else
-                  {
-                    FFI_ASSERT (0);
-                  }
-                break;
-
-              default:
-                FFI_ASSERT (0);
-                break;
-              }
-          }
-        else
-          {
-            memcpy (get_x_addr (&context, 8), &rvalue, sizeof (UINT64));
-            ffi_call_SYSV (aarch64_prep_args, &context, &ecif,
-			   stack_bytes, fn);
-          }
-        break;
-      }
-
-    default:
-      FFI_ASSERT (0);
-      break;
-    }
-}
-
-static unsigned char trampoline [] =
-{ 0x70, 0x00, 0x00, 0x58,	/* ldr	x16, 1f	*/
-  0x91, 0x00, 0x00, 0x10,	/* adr	x17, 2f	*/
-  0x00, 0x02, 0x1f, 0xd6	/* br	x16	*/
-};
-
 /* Build a trampoline.  */
 
-#define FFI_INIT_TRAMPOLINE(TRAMP,FUN,CTX,FLAGS)			\
-  ({unsigned char *__tramp = (unsigned char*)(TRAMP);			\
-    UINT64  __fun = (UINT64)(FUN);					\
-    UINT64  __ctx = (UINT64)(CTX);					\
-    UINT64  __flags = (UINT64)(FLAGS);					\
-    memcpy (__tramp, trampoline, sizeof (trampoline));			\
-    memcpy (__tramp + 12, &__fun, sizeof (__fun));			\
-    memcpy (__tramp + 20, &__ctx, sizeof (__ctx));			\
-    memcpy (__tramp + 28, &__flags, sizeof (__flags));			\
-    __clear_cache(__tramp, __tramp + FFI_TRAMPOLINE_SIZE);		\
-  })
-
 ffi_status
 ffi_prep_closure_loc (ffi_closure* closure,
                       ffi_cif* cif,
@@ -834,15 +547,29 @@ ffi_prep_closure_loc (ffi_closure* closure,
                       void *user_data,
                       void *codeloc)
 {
+  static const unsigned char trampoline[16] = {
+    0x90, 0x00, 0x00, 0x58,     /* ldr  x16, tramp+16 */
+    0xf1, 0xff, 0xff, 0x10,     /* adr  x17, tramp+0  */
+    0x00, 0x02, 0x1f, 0xd6,     /* br   x16           */
+  };
+  char *tramp = &closure->tramp[0];
+  void (*entry)(void);
+
   if (cif->abi != FFI_SYSV)
     return FFI_BAD_ABI;
 
-  FFI_INIT_TRAMPOLINE (&closure->tramp[0], &ffi_closure_SYSV, codeloc,
-		       cif->aarch64_flags);
+  entry = (cif->flags & AARCH64_FLAG_ARG_V
+	   ? ffi_closure_SYSV_V : ffi_closure_SYSV);
+
+  memcpy (tramp, trampoline, sizeof(trampoline));
+
+  *(UINT64 *)(tramp + 16) = (uintptr_t)entry;
 
-  closure->cif  = cif;
+  closure->cif = cif;
+  closure->fun = fun;
   closure->user_data = user_data;
-  closure->fun  = fun;
+
+  __clear_cache (tramp, tramp + sizeof(trampoline));
 
   return FFI_OK;
 }
@@ -863,26 +590,33 @@ ffi_prep_closure_loc (ffi_closure* closure,
    desriptors, invokes the wrapped function, then marshalls the return
    value back into the call context.  */
 
-void
-ffi_closure_SYSV_inner (ffi_closure *closure, struct call_context *context,
-			void *stack)
+UINT64 FFI_HIDDEN
+ffi_closure_SYSV_inner (ffi_cif *cif,
+			void (*fun)(ffi_cif*,void*,void**,void*),
+			void *user_data,
+			struct call_context *context,
+			UINT64 *stack, void *rvalue)
 {
-  ffi_cif *cif = closure->cif;
   void **avalue = (void**) alloca (cif->nargs * sizeof (void*));
-  void *rvalue = NULL;
-  int i;
-  struct arg_state state;
-
-  arg_init (&state, ALIGN(cif->bytes, 16));
-
-  for (i = 0; i < cif->nargs; i++)
+  ffi_type **arg_types;
+  int i, nargs, h, ngrn, nsrn, nsaa;
+  size_t size;
+
+  ngrn = nsrn = nsaa = 0;
+  arg_types = cif->arg_types;
+  nargs = cif->nargs;
+  
+  for (i = 0; i < nargs; i++)
     {
-      ffi_type *ty = cif->arg_types[i];
+      ffi_type *ty = arg_types[i];
+      int t = ty->type;
+      void *slot;
 
-      switch (ty->type)
+      switch (t)
 	{
 	case FFI_TYPE_VOID:
-	  FFI_ASSERT (0);
+	  /* ??? abort */
+	  slot = NULL;
 	  break;
 
 	case FFI_TYPE_UINT8:
@@ -895,182 +629,128 @@ ffi_closure_SYSV_inner (ffi_closure *closure, struct call_context *context,
 	case FFI_TYPE_POINTER:
 	case FFI_TYPE_UINT64:
 	case FFI_TYPE_SINT64:
-	case  FFI_TYPE_FLOAT:
-	case  FFI_TYPE_DOUBLE:
-	case  FFI_TYPE_LONGDOUBLE:
-	  avalue[i] = allocate_to_register_or_stack (context, stack,
-						     &state, ty->type);
+	  if (ngrn < N_X_ARG_REG)
+	    slot = &context->x[ngrn++];
+	  else
+	    slot = &stack[nsaa++];
+	  *(ffi_arg *)slot = extend_basic_type (*(UINT64 *)slot, t);
 	  break;
 
-	case FFI_TYPE_STRUCT:
-	  if (is_hfa (ty))
-	    {
-	      unsigned n = element_count (ty);
-	      if (available_v (&state) < n)
-		{
-		  state.nsrn = N_V_ARG_REG;
-		  avalue[i] = allocate_to_stack (&state, stack, ty->alignment,
-						 ty->size);
-		}
-	      else
-		{
-		  switch (get_homogeneous_type (ty))
-		    {
-		    case FFI_TYPE_FLOAT:
-		      {
-			/* Eeek! We need a pointer to the structure,
-			   however the homogeneous float elements are
-			   being passed in individual S registers,
-			   therefore the structure is not represented as
-			   a contiguous sequence of bytes in our saved
-			   register context. We need to fake up a copy
-			   of the structure layed out in memory
-			   correctly. The fake can be tossed once the
-			   closure function has returned hence alloca()
-			   is sufficient. */
-			int j;
-			UINT32 *p = avalue[i] = alloca (ty->size);
-			for (j = 0; j < element_count (ty); j++)
-			  memcpy (&p[j],
-				  allocate_to_s (context, &state),
-				  sizeof (*p));
-			break;
-		      }
-
-		    case FFI_TYPE_DOUBLE:
-		      {
-			/* Eeek! We need a pointer to the structure,
-			   however the homogeneous float elements are
-			   being passed in individual S registers,
-			   therefore the structure is not represented as
-			   a contiguous sequence of bytes in our saved
-			   register context. We need to fake up a copy
-			   of the structure layed out in memory
-			   correctly. The fake can be tossed once the
-			   closure function has returned hence alloca()
-			   is sufficient. */
-			int j;
-			UINT64 *p = avalue[i] = alloca (ty->size);
-			for (j = 0; j < element_count (ty); j++)
-			  memcpy (&p[j],
-				  allocate_to_d (context, &state),
-				  sizeof (*p));
-			break;
-		      }
+	case FFI_TYPE_FLOAT:
+	  if (nsrn < N_V_ARG_REG)
+	    slot = get_s_addr (context, nsrn++);
+	  else
+	    slot = &stack[nsaa++];
+	  break;
 
-		    case FFI_TYPE_LONGDOUBLE:
-			  memcpy (&avalue[i],
-				  allocate_to_v (context, &state),
-				  sizeof (*avalue));
-		      break;
+	case FFI_TYPE_DOUBLE:
+	  if (nsrn < N_V_ARG_REG)
+	    slot = get_d_addr (context, nsrn++);
+	  else
+	    slot = &stack[nsaa++];
+	  break;
 
-		    default:
-		      FFI_ASSERT (0);
-		      break;
-		    }
-		}
-	    }
-	  else if (ty->size > 16)
-	    {
-	      /* Replace Composite type of size greater than 16 with a
-		 pointer.  */
-	      memcpy (&avalue[i],
-		      allocate_to_register_or_stack (context, stack,
-						     &state, FFI_TYPE_POINTER),
-		      sizeof (avalue[i]));
-	    }
-	  else if (available_x (&state) >= (ty->size + 7) / 8)
-	    {
-	      avalue[i] = get_x_addr (context, state.ngrn);
-	      state.ngrn += (ty->size + 7) / 8;
-	    }
+	case FFI_TYPE_LONGDOUBLE:
+	  if (nsrn < N_V_ARG_REG)
+	    slot = &context->v[nsrn++];
 	  else
 	    {
-	      state.ngrn = N_X_ARG_REG;
-
-	      avalue[i] = allocate_to_stack (&state, stack, ty->alignment,
-					     ty->size);
+	      nsaa = ALIGN (nsaa, 2);
+	      slot = &stack[nsaa];
+	      nsaa += 2;
 	    }
 	  break;
 
-	default:
-	  FFI_ASSERT (0);
+	case FFI_TYPE_STRUCT:
+	  {
+	    size_t slot_count;
+
+	    size = ty->size;
+	    slot_count = (size + 7) / 8;
+	    h = is_hfa (ty);
+	    if (h)
+	      {
+		int reg_count = h >> 8;
+		int tt = h & 0xff;
+		int j;
+
+		if (nsrn + reg_count <= N_V_ARG_REG)
+		  {
+		    switch (tt)
+		      {
+		      case FFI_TYPE_FLOAT:
+			{
+			  UINT32 *dst = alloca (size);
+			  for (j = 0; j < reg_count; ++j)
+			    dst[j] = *get_s_addr(context, nsrn + j);
+			  slot = dst;
+			}
+			break;
+		      case FFI_TYPE_DOUBLE:
+			{
+			  UINT64 *dst = alloca (size);
+			  for (j = 0; j < reg_count; ++j)
+			    dst[j] = *get_d_addr(context, nsrn + j);
+			  slot = dst;
+			}
+			break;
+		      case FFI_TYPE_LONGDOUBLE:
+			slot = &context->v[nsrn];
+			break;
+		      default:
+			abort ();
+		      }
+		    nsrn += reg_count;
+		    break;
+		  }
+		/* All out of fp registers.  It's on the stack.  */
+		nsrn = N_V_ARG_REG;
+	      }
+	    else if (size > 16)
+	      {
+		/* The argument is passed by indirection.  */
+		if (ngrn < N_X_ARG_REG)
+		  slot = (void *)(uintptr_t)context->x[ngrn++];
+		else
+		  slot = (void *)(uintptr_t)stack[nsaa++];
+		break;
+	      }
+	    else
+	      {
+		if (ty->alignment == 16)
+		  ngrn = ALIGN (ngrn, 2);
+
+		if (ngrn + slot_count <= N_X_ARG_REG)
+		  {
+		    slot = &context->x[ngrn];
+		    ngrn += slot_count;
+		    break;
+		  }
+		/* All out of general registers.  It's on the stack.  */
+                ngrn = N_X_ARG_REG;
+	      }
+	    if (ty->alignment > 8)
+	      {
+		int a = ty->alignment / 8;
+		nsaa = ALIGN (nsaa, a);
+	      }
+	    slot = &stack[nsaa];
+	    nsaa += slot_count;
+	  }
 	  break;
+
+	default:
+	  abort ();
 	}
+
+      avalue[i] = slot;
     }
 
-  /* Figure out where the return value will be passed, either in
-     registers or in a memory block allocated by the caller and passed
-     in x8.  */
+  h = cif->flags & AARCH64_FLAG_RET_MASK;
+  if (h != AARCH64_RET_LG_STRUCT)
+    rvalue = context + 1;
 
-  if (is_register_candidate (cif->rtype))
-    {
-      /* Register candidates are *always* returned in registers. */
-
-      /* Allocate a scratchpad for the return value, we will let the
-         callee scrible the result into the scratch pad then move the
-         contents into the appropriate return value location for the
-         call convention.  */
-      rvalue = alloca (cif->rtype->size);
-      (closure->fun) (cif, rvalue, avalue, closure->user_data);
-
-      /* Copy the return value into the call context so that it is returned
-         as expected to our caller.  */
-      switch (cif->rtype->type)
-        {
-        case FFI_TYPE_VOID:
-          break;
+  fun (cif, rvalue, avalue, user_data);
 
-        case FFI_TYPE_UINT8:
-        case FFI_TYPE_UINT16:
-        case FFI_TYPE_UINT32:
-        case FFI_TYPE_POINTER:
-        case FFI_TYPE_UINT64:
-        case FFI_TYPE_SINT8:
-        case FFI_TYPE_SINT16:
-        case FFI_TYPE_INT:
-        case FFI_TYPE_SINT32:
-        case FFI_TYPE_SINT64:
-        case FFI_TYPE_FLOAT:
-        case FFI_TYPE_DOUBLE:
-        case FFI_TYPE_LONGDOUBLE:
-	  {
-	    void *addr = get_basic_type_addr (cif->rtype->type, context, 0);
-	    copy_basic_type (addr, rvalue, cif->rtype->type);
-            break;
-	  }
-        case FFI_TYPE_STRUCT:
-          if (is_hfa (cif->rtype))
-	    {
-	      int i;
-	      unsigned short type = get_homogeneous_type (cif->rtype);
-	      unsigned elems = element_count (cif->rtype);
-	      for (i = 0; i < elems; i++)
-		{
-		  void *reg = get_basic_type_addr (type, context, i);
-		  copy_basic_type (reg, rvalue, type);
-		  rvalue += get_basic_type_size (type);
-		}
-	    }
-          else if ((cif->rtype->size + 7) / 8 < N_X_ARG_REG)
-            {
-              unsigned size = ALIGN (cif->rtype->size, sizeof (UINT64)) ;
-              memcpy (get_x_addr (context, 0), rvalue, size);
-            }
-          else
-            {
-              FFI_ASSERT (0);
-            }
-          break;
-        default:
-          FFI_ASSERT (0);
-          break;
-        }
-    }
-  else
-    {
-      memcpy (&rvalue, get_x_addr (context, 8), sizeof (UINT64));
-      (closure->fun) (cif, rvalue, avalue, closure->user_data);
-    }
+  return h;
 }
-
diff --git a/libffi/src/aarch64/ffitarget.h b/libffi/src/aarch64/ffitarget.h
index 6f1a348..ecfa159 100644
--- a/libffi/src/aarch64/ffitarget.h
+++ b/libffi/src/aarch64/ffitarget.h
@@ -27,8 +27,8 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
 #endif
 
 #ifndef LIBFFI_ASM
-typedef unsigned long ffi_arg;
-typedef signed long ffi_sarg;
+typedef unsigned long long ffi_arg;
+typedef signed long long ffi_sarg;
 
 typedef enum ffi_abi
   {
@@ -42,18 +42,7 @@ typedef enum ffi_abi
 /* ---- Definitions for closures ----------------------------------------- */
 
 #define FFI_CLOSURES 1
-#define FFI_TRAMPOLINE_SIZE 36
+#define FFI_TRAMPOLINE_SIZE  24
 #define FFI_NATIVE_RAW_API 0
 
-/* ---- Internal ---- */
-
-
-#define FFI_EXTRA_CIF_FIELDS unsigned aarch64_flags
-
-#define AARCH64_FFI_WITH_V_BIT 0
-
-#define AARCH64_N_XREG 32
-#define AARCH64_N_VREG 32
-#define AARCH64_CALL_CONTEXT_SIZE (AARCH64_N_XREG * 8 + AARCH64_N_VREG * 16)
-
 #endif
diff --git a/libffi/src/aarch64/internal.h b/libffi/src/aarch64/internal.h
new file mode 100644
index 0000000..63cf683
--- /dev/null
+++ b/libffi/src/aarch64/internal.h
@@ -0,0 +1,43 @@
+/* 
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+``Software''), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
+
+/* ---- Internal ---- */
+
+#define AARCH64_RET_UINT32		0
+#define AARCH64_RET_SINT32		1
+#define AARCH64_RET_INT64		2
+#define AARCH64_RET_SM_STRUCT		3
+#define AARCH64_RET_FLOAT		4
+#define AARCH64_RET_DOUBLE		5
+#define AARCH64_RET_LDOUBLE		6
+#define AARCH64_RET_HFA_FLOAT		7
+#define AARCH64_RET_HFA_DOUBLE		8
+#define AARCH64_RET_HFA_LDOUBLE		13
+#define AARCH64_RET_LG_STRUCT		14
+#define AARCH64_RET_VOID		15
+#define AARCH64_FLAG_RET_MASK		15
+
+#define AARCH64_FLAG_ARG_V_BIT		4
+#define AARCH64_FLAG_ARG_V		(1 << AARCH64_FLAG_ARG_V_BIT)
+
+#define AARCH64_N_VREG 8
+#define AARCH64_N_XREG 8
+#define AARCH64_CALL_CONTEXT_SIZE (AARCH64_N_VREG * 16 + AARCH64_N_XREG * 8)
+
diff --git a/libffi/src/aarch64/sysv.S b/libffi/src/aarch64/sysv.S
index ffb16f8..126c527 100644
--- a/libffi/src/aarch64/sysv.S
+++ b/libffi/src/aarch64/sysv.S
@@ -22,286 +22,285 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
 #define LIBFFI_ASM
 #include <fficonfig.h>
 #include <ffi.h>
+#include "internal.h"
 
-#define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
-#define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
-#define cfi_restore(reg)		.cfi_restore reg
-#define cfi_def_cfa_register(reg)	.cfi_def_cfa_register reg
-
-        .text
-        .globl ffi_call_SYSV
-        .type ffi_call_SYSV, #function
+	.text
 
 /* ffi_call_SYSV()
 
-   Create a stack frame, setup an argument context, call the callee
-   and extract the result.
-
-   The maximum required argument stack size is provided,
-   ffi_call_SYSV() allocates that stack space then calls the
-   prepare_fn to populate register context and stack.  The
-   argument passing registers are loaded from the register
-   context and the callee called, on return the register passing
-   register are saved back to the context.  Our caller will
-   extract the return value from the final state of the saved
-   register context.
+   Install an argument context and a stack frame.
+   Call the callee and extract the result.
 
    Prototype:
 
-   extern unsigned
-   ffi_call_SYSV (void (*)(struct call_context *context, unsigned char *,
-			   extended_cif *),
-                  struct call_context *context,
-                  extended_cif *,
-                  unsigned required_stack_size,
-                  void (*fn)(void));
-
-   Therefore on entry we have:
-
-   x0 prepare_fn
-   x1 &context
-   x2 &ecif
-   x3 bytes
-   x4 fn
-
-   This function uses the following stack frame layout:
-
-   ==
-                saved x30(lr)
-   x29(fp)->    saved x29(fp)
-                saved x24
-                saved x23
-                saved x22
-   sp'    ->    saved x21
-                ...
-   sp     ->    (constructed callee stack arguments)
-   ==
+   extern void
+   ffi_call_SYSV (void *frame, void *rvalue, struct call_context *context,
+		  unsigned flags, void (*fn)(void))
 
-   Voila! */
+   This function uses an unusual stack layout.  Our local frame has
+   been allocated by the caller in FRAME with the outgoing arguments
+   in CONTEXT, and the outgoing stack arguments above CONTEXT.  */
 
-#define ffi_call_SYSV_FS (8 * 4)
+	.globl	ffi_call_SYSV
+	.hidden	ffi_call_SYSV
+	.type	ffi_call_SYSV, %function
+	.balign 32
 
-        .cfi_startproc
 ffi_call_SYSV:
-        stp     x29, x30, [sp, #-16]!
-	cfi_adjust_cfa_offset (16)
-        cfi_rel_offset (x29, 0)
-        cfi_rel_offset (x30, 8)
-
-        mov     x29, sp
-	cfi_def_cfa_register (x29)
-        sub     sp, sp, #ffi_call_SYSV_FS
-
-        stp     x21, x22, [sp, 0]
-        cfi_rel_offset (x21, 0 - ffi_call_SYSV_FS)
-        cfi_rel_offset (x22, 8 - ffi_call_SYSV_FS)
-
-        stp     x23, x24, [sp, 16]
-        cfi_rel_offset (x23, 16 - ffi_call_SYSV_FS)
-        cfi_rel_offset (x24, 24 - ffi_call_SYSV_FS)
-
-        mov     x21, x1
-        mov     x22, x2
-        mov     x24, x4
-
-        /* Allocate the stack space for the actual arguments, many
-           arguments will be passed in registers, but we assume
-           worst case and allocate sufficient stack for ALL of
-           the arguments.  */
-        sub     sp, sp, x3
-
-        /* unsigned (*prepare_fn) (struct call_context *context,
-				   unsigned char *stack, extended_cif *ecif);
-	 */
-        mov     x23, x0
-        mov     x0, x1
-        mov     x1, sp
-        /* x2 already in place */
-        blr     x23
-
-        /* Preserve the flags returned.  */
-        mov     x23, x0
-
-        /* Figure out if we should touch the vector registers.  */
-        tbz     x23, #AARCH64_FFI_WITH_V_BIT, 1f
-
-        /* Load the vector argument passing registers.  */
-        ldp     q0, q1, [x21, #8*32 +  0]
-        ldp     q2, q3, [x21, #8*32 + 32]
-        ldp     q4, q5, [x21, #8*32 + 64]
-        ldp     q6, q7, [x21, #8*32 + 96]
-1:
-        /* Load the core argument passing registers.  */
-        ldp     x0, x1, [x21,  #0]
-        ldp     x2, x3, [x21, #16]
-        ldp     x4, x5, [x21, #32]
-        ldp     x6, x7, [x21, #48]
-
-        /* Don't forget x8 which may be holding the address of a return buffer.
-	 */
-        ldr     x8,     [x21, #8*8]
-
-        blr     x24
-
-        /* Save the core argument passing registers.  */
-        stp     x0, x1, [x21,  #0]
-        stp     x2, x3, [x21, #16]
-        stp     x4, x5, [x21, #32]
-        stp     x6, x7, [x21, #48]
-
-        /* Note nothing useful ever comes back in x8!  */
-
-        /* Figure out if we should touch the vector registers.  */
-        tbz     x23, #AARCH64_FFI_WITH_V_BIT, 1f
-
-        /* Save the vector argument passing registers.  */
-        stp     q0, q1, [x21, #8*32 + 0]
-        stp     q2, q3, [x21, #8*32 + 32]
-        stp     q4, q5, [x21, #8*32 + 64]
-        stp     q6, q7, [x21, #8*32 + 96]
+	.cfi_startproc
+	.cfi_def_cfa x0, 32
+	stp	x29, x30, [x0]		/* Save fp, lr in our frame.  */
+	mov	x29, x0			/* Set up our new frame.  */
+	.cfi_def_cfa_register x29
+	.cfi_rel_offset x29, 0
+	.cfi_rel_offset x30, 8
+
+	/* Move parameters out of the way. */
+	stp	x3, x1, [x0, #16]	/* flags, rvalue */
+	mov	x8, x1			/* rvalue into place */
+	mov	x10, x2			/* context */
+	mov	x11, x4			/* fn */
+
+	/* Load the vector argument passing registers, if needed.  */
+	tbz     w3, #AARCH64_FLAG_ARG_V_BIT, 1f
+	ldp     q0, q1, [x10, #8*AARCH64_N_XREG + 0]
+	ldp     q2, q3, [x10, #8*AARCH64_N_XREG + 32]
+	ldp     q4, q5, [x10, #8*AARCH64_N_XREG + 64]
+	ldp     q6, q7, [x10, #8*AARCH64_N_XREG + 96]
 1:
-        /* All done, unwind our stack frame.  */
-        ldp     x21, x22, [x29,  # - ffi_call_SYSV_FS]
-        cfi_restore (x21)
-        cfi_restore (x22)
-
-        ldp     x23, x24, [x29,  # - ffi_call_SYSV_FS + 16]
-        cfi_restore (x23)
-        cfi_restore (x24)
-
-        mov     sp, x29
-	cfi_def_cfa_register (sp)
-
-        ldp     x29, x30, [sp], #16
-	cfi_adjust_cfa_offset (-16)
-        cfi_restore (x29)
-        cfi_restore (x30)
-
-        ret
-
-        .cfi_endproc
-        .size ffi_call_SYSV, .-ffi_call_SYSV
-
-#define ffi_closure_SYSV_FS (8 * 2 + AARCH64_CALL_CONTEXT_SIZE)
+	/* Load the core argument passing registers.  */
+	ldp     x0, x1, [x10, #16*0]
+	ldp     x2, x3, [x10, #16*1]
+	ldp     x4, x5, [x10, #16*2]
+	ldp     x6, x7, [x10, #16*3]
+
+	/* Setup SP for the stacked arguments.  */
+	add	sp, x10, #AARCH64_CALL_CONTEXT_SIZE
+
+	/* Call fn.  */
+	blr     x11
+
+	/* Recover the flags value and result address.  */
+	ldp	x3, x8, [x29, #16]
+
+	/* Store the return type.
+	   Each case uses 8 bytes, so compute it directly.  */
+	adr	x2, 3f
+	and	w3, w3, #AARCH64_FLAG_RET_MASK
+	add	x2, x2, x3, lsl #3
+	br	x2
+
+	/* Store results into the rvalue.  Note that for most integer
+	   cases this is actually ffi_arg, aka a 64-bit result.
+	   For the HFA cases, and the (small) struct case, we've arranged
+	   for temporary storage, so store the largest possible.
+	   For the large struct case, we've remapped to VOID, since
+	   the callee has already done the store via x8.  */
+	.balign 8
+/* 0: AARCH64_RET_UINT32 */
+3:	mov	w0, w0
+	b	4f
+/* 1: AARCH64_RET_SINT32 */
+	sxtw	x0, w0
+	nop
+/* 2: AARCH64_RET_INT64 */
+4:	str	x0, [x8]
+	b	9f
+/* 3: AARCH64_RET_SM_STRUCT */
+	stp	x0, x1, [x8]
+	b	9f
+/* 4: AARCH64_RET_FLOAT */
+	str	s0, [x8]
+	b	9f
+/* 5: AARCH64_RET_DOUBLE */
+	str	d0, [x8]
+	b	9f
+/* 6: AARCH64_RET_LDOUBLE */
+	str	q0, [x8]
+	b	9f
+/* 7: AARCH64_RET_HFA_FLOAT */
+	st4	{ v0.s, v1.s, v2.s, v3.s }[0], [x8]
+	b	9f
+/* 8: AARCH64_RET_HFA_DOUBLE */
+	st4	{ v0.d, v1.d, v2.d, v3.d }[0], [x8]
+	b	9f
+/* 9: invalid */
+	brk	#1000
+	nop
+/* A: invalid */
+	brk	#1000
+	nop
+/* B: invalid */
+	brk	#1000
+	nop
+/* C: invalid */
+	brk	#1000
+	nop
+/* D: AARCH64_RET_HFA_LDOUBLE */
+	stp	q0, q1, [x8]
+	stp	q2, q3, [x8, #32]
+/* E: AARCH64_RET_LG_STRUCT */
+	nop
+	nop
+/* F: AARCH64_RET_VOID */
+9:	ldp     x29, x30, [x29]
+	.cfi_def_cfa sp, 0
+	.cfi_restore x29
+	.cfi_restore x30
+	ret
+	.cfi_endproc
+	.size ffi_call_SYSV, .-ffi_call_SYSV
 
 /* ffi_closure_SYSV
 
    Closure invocation glue. This is the low level code invoked directly by
    the closure trampoline to setup and call a closure.
 
-   On entry x17 points to a struct trampoline_data, x16 has been clobbered
-   all other registers are preserved.
+   On entry x17 points to a ffi_closure, x16 has been clobbered,
+   and all other registers are preserved.
 
    We allocate a call context and save the argument passing registers,
    then invoked the generic C ffi_closure_SYSV_inner() function to do all
    the real work, on return we load the result passing registers back from
    the call context.
 
-   On entry
-
-   extern void
-   ffi_closure_SYSV (struct trampoline_data *);
-
-   struct trampoline_data
-   {
-        UINT64 *ffi_closure;
-        UINT64 flags;
-   };
+   We use two separate entry points, depending on whether there are
+   any vector argument registers.
 
    This function uses the following stack frame layout:
 
    ==
-                saved x30(lr)
-   x29(fp)->    saved x29(fp)
-                saved x22
-                saved x21
-                ...
-   sp     ->    call_context
+		temporary return slot
+		call_context
+		saved x30(lr)
+   sp, x29->    saved x29(fp)
    ==
 
    Voila!  */
 
-        .text
-        .globl ffi_closure_SYSV
-        .cfi_startproc
+#define ffi_closure_FS (16 + AARCH64_CALL_CONTEXT_SIZE + 64)
+
+	.globl	ffi_closure_SYSV_V
+	.hidden	ffi_closure_SYSV_V
+	.type	ffi_closure_SYSV_V, %function
+	.balign 32
+
+ffi_closure_SYSV_V:
+	.cfi_startproc
+	stp     x29, x30, [sp, #-ffi_closure_FS]!
+	.cfi_adjust_cfa_offset ffi_closure_FS
+	.cfi_rel_offset x29, 0
+	.cfi_rel_offset x30, 8
+	mov     x29, sp
+
+	/* Save the argument passing vector registers.  */
+	stp     q0, q1, [sp, #16 + 8*AARCH64_N_XREG + 0]
+	stp     q2, q3, [sp, #16 + 8*AARCH64_N_XREG + 32]
+	stp     q4, q5, [sp, #16 + 8*AARCH64_N_XREG + 64]
+	stp     q6, q7, [sp, #16 + 8*AARCH64_N_XREG + 96]
+	b	0f
+
+	.cfi_endproc
+	.size	ffi_closure_SYSV_V, . - ffi_closure_SYSV_V
+
+	.globl	ffi_closure_SYSV
+	.hidden	ffi_closure_SYSV
+	.type	ffi_closure_SYSV, %function
+	.balign 32
+
 ffi_closure_SYSV:
-        stp     x29, x30, [sp, #-16]!
-	cfi_adjust_cfa_offset (16)
-        cfi_rel_offset (x29, 0)
-        cfi_rel_offset (x30, 8)
-
-        mov     x29, sp
-        cfi_def_cfa_register (x29)
-
-        sub     sp, sp, #ffi_closure_SYSV_FS
-
-        stp     x21, x22, [x29, #-16]
-        cfi_rel_offset (x21, -16)
-        cfi_rel_offset (x22, -8)
-
-        /* Load x21 with &call_context.  */
-        mov     x21, sp
-        /* Preserve our struct trampoline_data *  */
-        mov     x22, x17
-
-        /* Save the rest of the argument passing registers.  */
-        stp     x0, x1, [x21, #0]
-        stp     x2, x3, [x21, #16]
-        stp     x4, x5, [x21, #32]
-        stp     x6, x7, [x21, #48]
-        /* Don't forget we may have been given a result scratch pad address.
-	 */
-        str     x8,     [x21, #64]
-
-        /* Figure out if we should touch the vector registers.  */
-        ldr     x0, [x22, #8]
-        tbz     x0, #AARCH64_FFI_WITH_V_BIT, 1f
-
-        /* Save the argument passing vector registers.  */
-        stp     q0, q1, [x21, #8*32 + 0]
-        stp     q2, q3, [x21, #8*32 + 32]
-        stp     q4, q5, [x21, #8*32 + 64]
-        stp     q6, q7, [x21, #8*32 + 96]
-1:
-        /* Load &ffi_closure..  */
-        ldr     x0, [x22, #0]
-        mov     x1, x21
-        /* Compute the location of the stack at the point that the
-           trampoline was called.  */
-        add     x2, x29, #16
-
-        bl      ffi_closure_SYSV_inner
-
-        /* Figure out if we should touch the vector registers.  */
-        ldr     x0, [x22, #8]
-        tbz     x0, #AARCH64_FFI_WITH_V_BIT, 1f
-
-        /* Load the result passing vector registers.  */
-        ldp     q0, q1, [x21, #8*32 + 0]
-        ldp     q2, q3, [x21, #8*32 + 32]
-        ldp     q4, q5, [x21, #8*32 + 64]
-        ldp     q6, q7, [x21, #8*32 + 96]
-1:
-        /* Load the result passing core registers.  */
-        ldp     x0, x1, [x21,  #0]
-        ldp     x2, x3, [x21, #16]
-        ldp     x4, x5, [x21, #32]
-        ldp     x6, x7, [x21, #48]
-        /* Note nothing usefull is returned in x8.  */
-
-        /* We are done, unwind our frame.  */
-        ldp     x21, x22, [x29,  #-16]
-        cfi_restore (x21)
-        cfi_restore (x22)
-
-        mov     sp, x29
-        cfi_def_cfa_register (sp)
-
-        ldp     x29, x30, [sp], #16
-	cfi_adjust_cfa_offset (-16)
-        cfi_restore (x29)
-        cfi_restore (x30)
-
-        ret
-        .cfi_endproc
-        .size ffi_closure_SYSV, .-ffi_closure_SYSV
+	.cfi_startproc
+	stp     x29, x30, [sp, #-ffi_closure_FS]!
+	.cfi_adjust_cfa_offset ffi_closure_FS
+	.cfi_rel_offset x29, 0
+	.cfi_rel_offset x30, 8
+	mov     x29, sp
+
+	/* Save the argument passing core registers.  */
+0:	stp     x0, x1, [sp, #16 + 0]
+	stp     x2, x3, [sp, #16 + 16]
+	stp     x4, x5, [sp, #16 + 32]
+	stp     x6, x7, [sp, #16 + 48]
+
+	ldp	x0, x1, [x17, #FFI_TRAMPOLINE_SIZE]  /* Load cif, fun */
+	ldr	x2, [x17, #FFI_TRAMPOLINE_SIZE + 16] /* Load user_data */
+
+.Ldo_closure:
+	add	x3, sp, #16			/* Load &call_context.  */
+	add	x4, sp, #ffi_closure_FS		/* Load incoming sp value.  */
+	mov	x5, x8				/* Load struct return.  */
+	bl      ffi_closure_SYSV_inner
+
+	/* Load the return type.  Each case uses 8 bytes, so compute it
+	   directly.  Load x8 with address of the temporary return slot.  */
+	adr	x1, 3f
+	and	w0, w0, #AARCH64_FLAG_RET_MASK
+	add	x1, x1, x0, lsl #3
+	add	x8, sp, #16 + AARCH64_CALL_CONTEXT_SIZE
+	br	x1
+
+	/* Load results from temporary storage.  Note that for most integer
+	   cases this is actually ffi_arg, aka a 64-bit result.  For the HFA
+	   cases and the (small) struct case, we can load the maximum width.
+	   For the large struct case, we've remapped to VOID.  */
+#if defined __AARCH64EB__
+# define INT32OFS  4
+#else
+# define INT32OFS  0
+#endif
+
+	.balign 8
+/* 0: AARCH64_RET_UINT32 */
+3:	ldr	w0, [x8, #INT32OFS]
+	b	9f
+/* 1: AARCH64_RET_SINT32 */
+	ldrsw	x0, [x8, #INT32OFS]
+	b	9f
+/* 2: AARCH64_RET_INT64 */
+	ldr	x0, [x8]
+	b	9f
+/* 3: AARCH64_RET_SM_STRUCT */
+	ldp	x0, x1, [x8]
+	b	9f
+/* 4: AARCH64_RET_FLOAT */
+	ldr	s0, [x8]
+	b	9f
+/* 5: AARCH64_RET_DOUBLE */
+	ldr	d0, [x8]
+	b	9f
+/* 6: AARCH64_RET_LDOUBLE */
+	ldr	q0, [x8]
+	b	9f
+/* 7: AARCH64_RET_HFA_FLOAT */
+	ld4	{ v0.s, v1.s, v2.s, v3.s }[0], [x8]
+	b	9f
+/* 8: AARCH64_RET_HFA_DOUBLE */
+	ld1	{ v0.1d, v1.1d, v2.1d, v3.1d }, [x8]
+	b	9f
+/* 9: invalid */
+	brk	#1000
+	nop
+/* A: invalid */
+	brk	#1000
+	nop
+/* B: invalid */
+	brk	#1000
+	nop
+/* C: invalid */
+	brk	#1000
+	nop
+/* D: AARCH64_RET_HFA_LDOUBLE */
+	ldp	q0, q1, [x8]
+	ldp	q2, q3, [x8, #32]
+/* E: AARCH64_RET_LG_STRUCT */
+	nop
+	nop
+/* F: AARCH64_RET_VOID */
+9:	/* We are done, unwind our frame.  */
+	ldp     x29, x30, [sp], #ffi_closure_FS
+	.cfi_adjust_cfa_offset -ffi_closure_FS
+	.cfi_restore x29
+	.cfi_restore x30
+	ret
+	.cfi_endproc
+	.size ffi_closure_SYSV, .-ffi_closure_SYSV
-- 
1.9.3

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 00/13] Go closures, libffi, and the static chain
@ 2014-10-10 20:43 Richard Henderson
  2014-10-10 20:43 ` [PATCH 04/13] Use the static chain as the closure parameter from Go Richard Henderson
                   ` (15 more replies)
  0 siblings, 16 replies; 43+ messages in thread
From: Richard Henderson @ 2014-10-10 20:43 UTC (permalink / raw)
  To: gcc-patches; +Cc: libffi-discuss, gofrontend-dev

Pardon the wide distribution, the obvious hacks, and the failure
to properly split up the largest of the libffi patches.

The background here is my thread from last week[1], and Ian's reply[2],
wherein he rightly points out that not needing to play games with
mmap in order to implement closures for Go is a strong reason to
continue using custom code within libgo.

While that thread did have a go at implementing that custom code for
aarch64, I still think that replicating libffi's calling convention
knowledge for every interesting target is a mistake.

So instead I thought about how I'd add some support for Go directly
into libffi.  After all, we've got some custom code in libffi for
Java, why couldn't Go have the same treatment?

The sticking point, as far as I could see, is __go_set_closure.  I didn't
like the idea of libffi needing a callback into libgo in order to
accomplish the goal.

But the comment immediately before __go_set_closure itself says
that it would be better to use the static chain register.  So I set
about to see how easy that would be to accomplish.  (And, not for
nothing, such a change would make gccgo-compiled programs faster
by avoiding the library calls.)

The following patch set enables this for x86_64, i386, and aarch64[3].

The first two patches enable a static chain to be set by the front end
on CALL_EXPRs, and to be used with indirect function calls.  The third
patch is a horrible hack to expose this feature to the C front end.

The 4th patch changes gccgo to use the static chain.  I don't bother
with checking to see that the target has one.  All targets currently
supported by libgo have one, so I don't really see this as a stumbling
block.

The 5th patch changes libgo to use the static chain.  I admit that I
haven't tested this patch solo; I simply split it out of a larger patch
for clarity.

The 6th patch adds interfaces to libffi for Go; these interfaces are
used within libgo in the 8th patch.

Patches 7, 10, 11, 12, 13 are all enabling the new libffi interface on
the aforementioned targets.  There's lots of cleanup in here, and I
owe the libffi list smaller reviewable changes.  I ask that libffi
ignore patches 10 and 12 for now and comment on the meat instead.

Before I go too much farther down this road, I wanted to get some
feedback.  FWIW, a complete tree can be found at [4].

Thanks,


r~


[1] https://gcc.gnu.org/ml/gcc-patches/2014-10/msg00098.html
[2] https://gcc.gnu.org/ml/gcc-patches/2014-10/msg00102.html
[3] Except that after rebasing the tree on yesterday's trunk,
    I discovered that i386 and aarch64 both have bootstrap
    problems on trunk.  Ouch.
[4] git://github.com/rth7680/gcc.git rth/go-closure


Richard Henderson (13):
  Make TARGET_STATIC_CHAIN allow a function type
  Allow the front-end to create calls with a static chain
  HACK!  Allow the static chain to be set from C
  Use the static chain as the closure parameter from Go
  libgo: Use the static chain for the closure
  libffi: Add entry points for interacting with Go
  libffi: Support go closures on x86_64
  libgo: Use the new libffi interfaces for Go
  libgo: Remove __go_get/set_closure
  libffi: Rewrite aarch64
  libffi: Support go closures on aarch64
  libffi: Rewrite i386 sysv
  libffi: Support go closures on i386

 gcc/c-family/c-common.c             |    1 +
 gcc/c-family/c-common.h             |    2 +-
 gcc/c/c-parser.c                    |   29 +
 gcc/calls.c                         |   14 +-
 gcc/config/i386/i386.c              |   19 +-
 gcc/config/moxie/moxie.c            |    5 +-
 gcc/config/xtensa/xtensa.c          |    2 +-
 gcc/doc/tm.texi                     |    2 +-
 gcc/gimple-fold.c                   |   21 +
 gcc/gimplify.c                      |   17 +-
 gcc/go/go-gcc.cc                    |   44 +-
 gcc/go/gofrontend/backend.h         |    7 +-
 gcc/go/gofrontend/expressions.cc    |   21 +-
 gcc/go/gofrontend/gogo.cc           |   29 +-
 gcc/go/gofrontend/gogo.h            |   14 +
 gcc/go/gofrontend/runtime.def       |    6 -
 gcc/target.def                      |    6 +-
 gcc/targhooks.c                     |    5 +-
 gcc/testsuite/gcc.dg/static-chain.c |   31 +
 gcc/tree-cfg.c                      |   22 +-
 libffi/include/ffi.h.in             |   16 +
 libffi/src/aarch64/ffi.c            | 1380 ++++++++++++++---------------------
 libffi/src/aarch64/ffitarget.h      |   18 +-
 libffi/src/aarch64/internal.h       |   43 ++
 libffi/src/aarch64/sysv.S           |  557 +++++++-------
 libffi/src/x86/ffi.c                | 1161 ++++++++++++-----------------
 libffi/src/x86/ffi64.c              |  103 ++-
 libffi/src/x86/ffitarget.h          |  112 ++-
 libffi/src/x86/internal.h           |   48 ++
 libffi/src/x86/sysv.S               | 1003 +++++++++++++++----------
 libffi/src/x86/unix64.S             |  319 ++++----
 libgo/go/reflect/makefunc.go        |   49 +-
 libgo/go/reflect/makefunc_386.S     |   22 +-
 libgo/go/reflect/makefunc_amd64.S   |   13 +-
 libgo/go/reflect/makefunc_ffi.go    |   67 +-
 libgo/go/reflect/makefunc_ffi_c.c   |   68 +-
 libgo/go/reflect/value.go           |    3 +
 libgo/runtime/go-reflect-call.c     |   10 +-
 libgo/runtime/malloc.goc            |    8 -
 libgo/runtime/mgc0.c                |    3 +-
 libgo/runtime/proc.c                |   20 -
 libgo/runtime/runtime.h             |    4 -
 libgo/runtime/time.goc              |    3 +-
 43 files changed, 2624 insertions(+), 2703 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/static-chain.c
 create mode 100644 libffi/src/aarch64/internal.h
 create mode 100644 libffi/src/x86/internal.h

-- 
1.9.3

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 05/13] libgo: Use the static chain for the closure
  2014-10-10 20:43 [PATCH 00/13] Go closures, libffi, and the static chain Richard Henderson
  2014-10-10 20:43 ` [PATCH 04/13] Use the static chain as the closure parameter from Go Richard Henderson
  2014-10-10 20:43 ` [PATCH 11/13] libffi: Support go closures on aarch64 Richard Henderson
@ 2014-10-10 20:43 ` Richard Henderson
  2014-10-10 20:43 ` [PATCH 12/13] libffi: Rewrite i386 sysv Richard Henderson
                   ` (12 subsequent siblings)
  15 siblings, 0 replies; 43+ messages in thread
From: Richard Henderson @ 2014-10-10 20:43 UTC (permalink / raw)
  To: gcc-patches; +Cc: libffi-discuss, gofrontend-dev

Doesn't delete the __go_get/set_closure routines yet, as they're
still referenced by the ffi code, to be updated in another patch.
---
 libgo/go/reflect/makefunc_386.S   | 22 +++++++++-------------
 libgo/go/reflect/makefunc_amd64.S | 13 ++++---------
 libgo/runtime/malloc.goc          |  8 --------
 libgo/runtime/mgc0.c              |  3 +--
 libgo/runtime/time.goc            |  3 +--
 5 files changed, 15 insertions(+), 34 deletions(-)

diff --git a/libgo/go/reflect/makefunc_386.S b/libgo/go/reflect/makefunc_386.S
index 0e2e764..c1caf1e 100644
--- a/libgo/go/reflect/makefunc_386.S
+++ b/libgo/go/reflect/makefunc_386.S
@@ -38,7 +38,8 @@ reflect.makeFuncStub:
 	movl	%esp, %ebp
 .LCFI1:
 	pushl	%ebx		/* In case this is PIC.  */
-	subl	$36, %esp	/* Enough for args and to align stack.  */
+	pushl	%ecx		/* Save static chain.  */
+	subl	$32, %esp	/* Enough for args and to align stack.  */
 .LCFI2:
 
 #ifdef __PIC__
@@ -47,7 +48,7 @@ reflect.makeFuncStub:
 #endif
 
 	leal	8(%ebp), %eax	/* Set esp field in struct.  */
-	movl	%eax, -24(%ebp)
+	movl	%eax, -32(%ebp)
 
 	/* For MakeFunc functions that call recover.  */
 	movl	4(%ebp), %eax
@@ -58,15 +59,10 @@ reflect.makeFuncStub:
 	call	__go_makefunc_can_recover
 #endif
 
-#ifdef __PIC__
-	call	__go_get_closure@PLT
-#else
-	call	__go_get_closure
-#endif
-
+	movl	-8(%ebp), %eax	/* Recover static chain.  */
 	movl	%eax, 4(%esp)
 
-	leal	-24(%ebp), %eax
+	leal	-32(%ebp), %eax
 	movl	%eax, (%esp)
 
 #ifdef __PIC__
@@ -84,21 +80,21 @@ reflect.makeFuncStub:
 
 	/* Set return registers.  */
 
-	movl	-20(%ebp), %eax
+	movl	-28(%ebp), %eax
 
 	cmpb	$0, -7(%ebp)
 	je	2f
 
-	fldl	-16(%ebp)
+	fldl	-24(%ebp)
 
 #ifdef __SSE2__
 	/* In case we are compiling with -msseregparm.  This won't work
 	   correctly if only SSE1 is supported, but that seems unlikely.  */
-	movsd	-16(%ebp), %xmm0
+	movsd	-24(%ebp), %xmm0
 #endif
 
 2:
-	movb	-8(%ebp), %dl
+	movb	-16(%ebp), %dl
 
 	addl	$36, %esp
 	popl	%ebx
diff --git a/libgo/go/reflect/makefunc_amd64.S b/libgo/go/reflect/makefunc_amd64.S
index 88302ee..f7db24f 100644
--- a/libgo/go/reflect/makefunc_amd64.S
+++ b/libgo/go/reflect/makefunc_amd64.S
@@ -41,7 +41,7 @@ reflect.makeFuncStub:
 	movq	%rsp, %rbp
 .LCFI1:
 
-	subq	$0xc0, %rsp		# Space for struct on stack.
+	subq	$0xd0, %rsp		# Space for struct on stack.
 
 	movq	%rax, 0x0(%rsp)
 	movq	%rdi, 0x8(%rsp)
@@ -61,6 +61,8 @@ reflect.makeFuncStub:
 	movdqa	%xmm6, 0xa0(%rsp)
 	movdqa	%xmm7, 0xb0(%rsp)
 
+	movq	%r10, 0xc0(%rsp)	# Save static chain around call.
+
 	/* For MakeFunc functions that call recover.  */
 	movq	8(%rbp), %rdi
 #ifdef __PIC__
@@ -69,14 +71,7 @@ reflect.makeFuncStub:
 	call	__go_makefunc_can_recover
 #endif
 
-	# Get function type.
-#ifdef __PIC__
-	call	__go_get_closure@PLT
-#else
-	call	__go_get_closure
-#endif
-	movq	%rax, %rsi
-
+	movq	0xc0(%rsp), %rsi	# Recover static chain.
 	movq	%rsp, %rdi
 
 #ifdef __PIC__
diff --git a/libgo/runtime/malloc.goc b/libgo/runtime/malloc.goc
index c5e64c8..0288722 100644
--- a/libgo/runtime/malloc.goc
+++ b/libgo/runtime/malloc.goc
@@ -84,7 +84,6 @@ runtime_mallocgc(uintptr size, uintptr typ, uint32 flag)
 	MLink *v, *next;
 	byte *tiny;
 	bool incallback;
-	void *closure;
 
 	if(size == 0) {
 		// All 0-length allocations use this pointer.
@@ -96,10 +95,6 @@ runtime_mallocgc(uintptr size, uintptr typ, uint32 flag)
 	m = runtime_m();
 	g = runtime_g();
 
-	// We should not be called in between __go_set_closure and the
-	// actual function call, but cope with it if we are.
-	closure = g->closure;
-
 	incallback = false;
 	if(m->mcache == nil && g->ncgo > 0) {
 		// For gccgo this case can occur when a cgo or SWIG function
@@ -180,7 +175,6 @@ runtime_mallocgc(uintptr size, uintptr typ, uint32 flag)
 					m->locks--;
 					if(incallback)
 						runtime_entersyscall();
-					g->closure = closure;
 					return v;
 				}
 			}
@@ -270,8 +264,6 @@ runtime_mallocgc(uintptr size, uintptr typ, uint32 flag)
 	if(incallback)
 		runtime_entersyscall();
 
-	g->closure = closure;
-
 	return v;
 }
 
diff --git a/libgo/runtime/mgc0.c b/libgo/runtime/mgc0.c
index dda1845..7726eec 100644
--- a/libgo/runtime/mgc0.c
+++ b/libgo/runtime/mgc0.c
@@ -133,8 +133,7 @@ clearpools(void)
 
 	// clear sync.Pool's
 	if(poolcleanup != nil) {
-		__go_set_closure(poolcleanup);
-		poolcleanup->fn();
+		poolcleanup->fn() __builtin_call_chain(poolcleanup);
 	}
 
 	for(pp=runtime_allp; (p=*pp) != nil; pp++) {
diff --git a/libgo/runtime/time.goc b/libgo/runtime/time.goc
index 220629b..645164c 100644
--- a/libgo/runtime/time.goc
+++ b/libgo/runtime/time.goc
@@ -239,8 +239,7 @@ timerproc(void* dummy __attribute__ ((unused)))
 			runtime_unlock(&timers);
 			if(raceenabled)
 				runtime_raceacquire(t);
-			__go_set_closure(fv);
-			f(now, arg);
+			f(now, arg) __builtin_call_chain(fv);
 
 			// clear f and arg to avoid leak while sleeping for next timer
 			f = nil;
-- 
1.9.3

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 01/13] Make TARGET_STATIC_CHAIN allow a function type
  2014-10-10 20:43 [PATCH 00/13] Go closures, libffi, and the static chain Richard Henderson
                   ` (7 preceding siblings ...)
  2014-10-10 20:43 ` [PATCH 06/13] libffi: Add entry points for interacting with Go Richard Henderson
@ 2014-10-10 20:43 ` Richard Henderson
  2014-10-10 20:43 ` [PATCH 09/13] libgo: Remove __go_get/set_closure Richard Henderson
                   ` (6 subsequent siblings)
  15 siblings, 0 replies; 43+ messages in thread
From: Richard Henderson @ 2014-10-10 20:43 UTC (permalink / raw)
  To: gcc-patches; +Cc: libffi-discuss, gofrontend-dev

As opposed to always being a decl.  This is a prerequisite
to allowing the static chain to be loaded for indirect calls.
---
 gcc/config/i386/i386.c     | 19 +++++++++++++------
 gcc/config/moxie/moxie.c   |  5 +----
 gcc/config/xtensa/xtensa.c |  2 +-
 gcc/doc/tm.texi            |  2 +-
 gcc/target.def             |  6 +++---
 gcc/targhooks.c            |  5 +----
 6 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 4c4a6eb..d39e91b 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -26845,13 +26845,10 @@ ix86_minimum_alignment (tree exp, enum machine_mode mode,
    This is a register, unless all free registers are used by arguments.  */
 
 static rtx
-ix86_static_chain (const_tree fndecl, bool incoming_p)
+ix86_static_chain (const_tree fndecl_or_type, bool incoming_p)
 {
   unsigned regno;
 
-  if (!DECL_STATIC_CHAIN (fndecl))
-    return NULL;
-
   if (TARGET_64BIT)
     {
       /* We always use R10 in 64-bit mode.  */
@@ -26859,13 +26856,23 @@ ix86_static_chain (const_tree fndecl, bool incoming_p)
     }
   else
     {
-      tree fntype;
+      const_tree fntype, fndecl;
       unsigned int ccvt;
 
       /* By default in 32-bit mode we use ECX to pass the static chain.  */
       regno = CX_REG;
 
-      fntype = TREE_TYPE (fndecl);
+      if (TREE_CODE (fndecl_or_type) == FUNCTION_DECL)
+	{
+          fntype = TREE_TYPE (fndecl_or_type);
+	  fndecl = fndecl_or_type;
+	}
+      else
+	{
+	  fntype = fndecl_or_type;
+	  fndecl = NULL;
+	}
+
       ccvt = ix86_get_callcvt (fntype);
       if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
 	{
diff --git a/gcc/config/moxie/moxie.c b/gcc/config/moxie/moxie.c
index 1e289068..2aa06fd 100644
--- a/gcc/config/moxie/moxie.c
+++ b/gcc/config/moxie/moxie.c
@@ -513,13 +513,10 @@ moxie_arg_partial_bytes (cumulative_args_t cum_v,
 /* Worker function for TARGET_STATIC_CHAIN.  */
 
 static rtx
-moxie_static_chain (const_tree fndecl, bool incoming_p)
+moxie_static_chain (const_tree ARG_UNUSED (fndecl_or_type), bool incoming_p)
 {
   rtx addr, mem;
 
-  if (!DECL_STATIC_CHAIN (fndecl))
-    return NULL;
-
   if (incoming_p)
     addr = plus_constant (Pmode, arg_pointer_rtx, 2 * UNITS_PER_WORD);
   else
diff --git a/gcc/config/xtensa/xtensa.c b/gcc/config/xtensa/xtensa.c
index dc0629a..6c5d4fb 100644
--- a/gcc/config/xtensa/xtensa.c
+++ b/gcc/config/xtensa/xtensa.c
@@ -3603,7 +3603,7 @@ xtensa_function_value_regno_p (const unsigned int regno)
    expressions that denote where they are stored.  */
 
 static rtx
-xtensa_static_chain (const_tree ARG_UNUSED (fndecl), bool incoming_p)
+xtensa_static_chain (const_tree ARG_UNUSED (fndecl_or_type), bool incoming_p)
 {
   rtx base = incoming_p ? arg_pointer_rtx : stack_pointer_rtx;
   return gen_frame_mem (Pmode, plus_constant (Pmode, base,
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 10af50e..557d6b5 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -3462,7 +3462,7 @@ If the static chain is passed in memory, these macros should not be
 defined; instead, the @code{TARGET_STATIC_CHAIN} hook should be used.
 @end defmac
 
-@deftypefn {Target Hook} rtx TARGET_STATIC_CHAIN (const_tree @var{fndecl}, bool @var{incoming_p})
+@deftypefn {Target Hook} rtx TARGET_STATIC_CHAIN (const_tree @var{fndecl_or_type}, bool @var{incoming_p})
 This hook replaces the use of @code{STATIC_CHAIN_REGNUM} et al for
 targets that may use different static chain locations for different
 nested functions.  This may be required if the target has function
diff --git a/gcc/target.def b/gcc/target.def
index ce11eae..b966a72 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -4147,8 +4147,8 @@ false for naked functions.  The default implementation always returns true.",
  bool, (void),
  hook_bool_void_true)
 
-/* Return an rtx for the static chain for FNDECL.  If INCOMING_P is true,
-       then it should be for the callee; otherwise for the caller.  */
+/* Return an rtx for the static chain for FNDECL_OR_TYPE.  If INCOMING_P
+   is true, then it should be for the callee; otherwise for the caller.  */
 DEFHOOK
 (static_chain,
  "This hook replaces the use of @code{STATIC_CHAIN_REGNUM} et al for\n\
@@ -4170,7 +4170,7 @@ will be at an offset from the frame pointer.\n\
 The variables @code{stack_pointer_rtx}, @code{frame_pointer_rtx}, and\n\
 @code{arg_pointer_rtx} will have been initialized and should be used\n\
 to refer to those items.",
- rtx, (const_tree fndecl, bool incoming_p),
+ rtx, (const_tree fndecl_or_type, bool incoming_p),
  default_static_chain)
 
 /* Fill in the trampoline at MEM with a call to FNDECL and a
diff --git a/gcc/targhooks.c b/gcc/targhooks.c
index 9f15559..99ea5ad 100644
--- a/gcc/targhooks.c
+++ b/gcc/targhooks.c
@@ -837,11 +837,8 @@ default_internal_arg_pointer (void)
 }
 
 rtx
-default_static_chain (const_tree fndecl, bool incoming_p)
+default_static_chain (const_tree ARG_UNUSED (fndecl_or_type), bool incoming_p)
 {
-  if (!DECL_STATIC_CHAIN (fndecl))
-    return NULL;
-
   if (incoming_p)
     {
 #ifdef STATIC_CHAIN_INCOMING_REGNUM
-- 
1.9.3

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 12/13] libffi: Rewrite i386 sysv
  2014-10-10 20:43 [PATCH 00/13] Go closures, libffi, and the static chain Richard Henderson
                   ` (2 preceding siblings ...)
  2014-10-10 20:43 ` [PATCH 05/13] libgo: Use the static chain for the closure Richard Henderson
@ 2014-10-10 20:43 ` Richard Henderson
  2014-10-10 20:43 ` [PATCH 10/13] libffi: Rewrite aarch64 Richard Henderson
                   ` (11 subsequent siblings)
  15 siblings, 0 replies; 43+ messages in thread
From: Richard Henderson @ 2014-10-10 20:43 UTC (permalink / raw)
  To: gcc-patches; +Cc: libffi-discuss, gofrontend-dev

(1) Invent a new "internal.h" rather than polluting the public ffitarget.h
    with stuff that ought not be exposed.

(2) Reduce the ifdefs to a minimum.  Support the Windows and SysV ABIs at
    the same time.  After all, it's possible to write functions for any of
    these ABIs with GCC at any time by using attributes.

(3) The Win64 port should be (but hasn't been) moved to ffi64.c so that we
    can call between the ABIs on Unix too.  Again, one can always use
    attributes.

(4) Don't use the outdated prep_args callback form for ffi_call.

(5) Assume gas .cfi directives.  The PIC/non-PIC paths are IMO impossible
    to maintain with hand-written unwind info.

---
 libffi/src/x86/ffi.c       | 1097 ++++++++++++++++----------------------------
 libffi/src/x86/ffitarget.h |  112 ++---
 libffi/src/x86/internal.h  |   48 ++
 libffi/src/x86/sysv.S      |  932 +++++++++++++++++++++----------------
 4 files changed, 1049 insertions(+), 1140 deletions(-)
 create mode 100644 libffi/src/x86/internal.h

diff --git a/libffi/src/x86/ffi.c b/libffi/src/x86/ffi.c
index 6338de2..e3f82ef 100644
--- a/libffi/src/x86/ffi.c
+++ b/libffi/src/x86/ffi.c
@@ -28,620 +28,261 @@
    DEALINGS IN THE SOFTWARE.
    ----------------------------------------------------------------------- */
 
-#if !defined(__x86_64__) || defined(_WIN64) || defined(__CYGWIN__)
-
-#ifdef _WIN64
-#include <windows.h>
-#endif
+#ifndef __x86_64__
 
+#include <stdlib.h>
 #include <ffi.h>
 #include <ffi_common.h>
+#include "internal.h"
 
-#include <stdlib.h>
 
-/* ffi_prep_args is called by the assembly routine once stack space
-   has been allocated for the function's arguments */
-
-void ffi_prep_args(char *stack, extended_cif *ecif)
+/* Perform machine dependent cif processing */
+ffi_status FFI_HIDDEN
+ffi_prep_cif_machdep(ffi_cif *cif)
 {
-  register unsigned int i;
-  register void **p_argv;
-  register char *argp;
-  register ffi_type **p_arg;
-#ifdef X86_WIN32
-  size_t p_stack_args[2];
-  void *p_stack_data[2];
-  char *argp2 = stack;
-  int stack_args_count = 0;
-  int cabi = ecif->cif->abi;
-#endif
-
-  argp = stack;
-
-  if ((ecif->cif->flags == FFI_TYPE_STRUCT
-       || ecif->cif->flags == FFI_TYPE_MS_STRUCT)
-#ifdef X86_WIN64
-      && (ecif->cif->rtype->size != 1 && ecif->cif->rtype->size != 2
-          && ecif->cif->rtype->size != 4 && ecif->cif->rtype->size != 8)
-#endif
-      )
-    {
-      *(void **) argp = ecif->rvalue;
-#ifdef X86_WIN32
-      /* For fastcall/thiscall this is first register-passed
-         argument.  */
-      if (cabi == FFI_THISCALL || cabi == FFI_FASTCALL)
-	{
-	  p_stack_args[stack_args_count] = sizeof (void*);
-	  p_stack_data[stack_args_count] = argp;
-	  ++stack_args_count;
-	}
-#endif
-      argp += sizeof(void*);
-    }
+  size_t bytes = 0;
+  int i, n, flags, cabi = cif->abi;
 
-  p_argv = ecif->avalue;
-
-  for (i = ecif->cif->nargs, p_arg = ecif->cif->arg_types;
-       i != 0;
-       i--, p_arg++)
+  switch (cabi)
     {
-      size_t z;
-
-      /* Align if necessary */
-      if ((sizeof(void*) - 1) & (size_t) argp)
-        argp = (char *) ALIGN(argp, sizeof(void*));
-
-      z = (*p_arg)->size;
-#ifdef X86_WIN64
-      if (z > sizeof(ffi_arg)
-          || ((*p_arg)->type == FFI_TYPE_STRUCT
-              && (z != 1 && z != 2 && z != 4 && z != 8))
-#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
-          || ((*p_arg)->type == FFI_TYPE_LONGDOUBLE)
-#endif
-          )
-        {
-          z = sizeof(ffi_arg);
-          *(void **)argp = *p_argv;
-        }
-      else if ((*p_arg)->type == FFI_TYPE_FLOAT)
-        {
-          memcpy(argp, *p_argv, z);
-        }
-      else
-#endif
-      if (z < sizeof(ffi_arg))
-        {
-          z = sizeof(ffi_arg);
-          switch ((*p_arg)->type)
-            {
-            case FFI_TYPE_SINT8:
-              *(ffi_sarg *) argp = (ffi_sarg)*(SINT8 *)(* p_argv);
-              break;
-
-            case FFI_TYPE_UINT8:
-              *(ffi_arg *) argp = (ffi_arg)*(UINT8 *)(* p_argv);
-              break;
-
-            case FFI_TYPE_SINT16:
-              *(ffi_sarg *) argp = (ffi_sarg)*(SINT16 *)(* p_argv);
-              break;
-
-            case FFI_TYPE_UINT16:
-              *(ffi_arg *) argp = (ffi_arg)*(UINT16 *)(* p_argv);
-              break;
-
-            case FFI_TYPE_SINT32:
-              *(ffi_sarg *) argp = (ffi_sarg)*(SINT32 *)(* p_argv);
-              break;
-
-            case FFI_TYPE_UINT32:
-              *(ffi_arg *) argp = (ffi_arg)*(UINT32 *)(* p_argv);
-              break;
-
-            case FFI_TYPE_STRUCT:
-              *(ffi_arg *) argp = *(ffi_arg *)(* p_argv);
-              break;
-
-            default:
-              FFI_ASSERT(0);
-            }
-        }
-      else
-        {
-          memcpy(argp, *p_argv, z);
-        }
-
-#ifdef X86_WIN32
-    /* For thiscall/fastcall convention register-passed arguments
-       are the first two none-floating-point arguments with a size
-       smaller or equal to sizeof (void*).  */
-    if ((cabi == FFI_THISCALL && stack_args_count < 1)
-        || (cabi == FFI_FASTCALL && stack_args_count < 2))
-      {
-	if (z <= 4
-	    && ((*p_arg)->type != FFI_TYPE_FLOAT
-	        && (*p_arg)->type != FFI_TYPE_STRUCT))
-	  {
-	    p_stack_args[stack_args_count] = z;
-	    p_stack_data[stack_args_count] = argp;
-	    ++stack_args_count;
-	  }
-      }
-#endif
-      p_argv++;
-#ifdef X86_WIN64
-      argp += (z + sizeof(void*) - 1) & ~(sizeof(void*) - 1);
-#else
-      argp += z;
-#endif
-    }
-
-#ifdef X86_WIN32
-  /* We need to move the register-passed arguments for thiscall/fastcall
-     on top of stack, so that those can be moved to registers ecx/edx by
-     call-handler.  */
-  if (stack_args_count > 0)
-    {
-      size_t zz = (p_stack_args[0] + 3) & ~3;
-      char *h;
-
-      /* Move first argument to top-stack position.  */
-      if (p_stack_data[0] != argp2)
-	{
-	  h = alloca (zz + 1);
-	  memcpy (h, p_stack_data[0], zz);
-	  memmove (argp2 + zz, argp2,
-	           (size_t) ((char *) p_stack_data[0] - (char*)argp2));
-	  memcpy (argp2, h, zz);
-	}
-
-      argp2 += zz;
-      --stack_args_count;
-      if (zz > 4)
-	stack_args_count = 0;
-
-      /* If we have a second argument, then move it on top
-         after the first one.  */
-      if (stack_args_count > 0 && p_stack_data[1] != argp2)
-	{
-	  zz = p_stack_args[1];
-	  zz = (zz + 3) & ~3;
-	  h = alloca (zz + 1);
-	  h = alloca (zz + 1);
-	  memcpy (h, p_stack_data[1], zz);
-	  memmove (argp2 + zz, argp2, (size_t) ((char*) p_stack_data[1] - (char*)argp2));
-	  memcpy (argp2, h, zz);
-	}
+    case FFI_SYSV:
+    case FFI_STDCALL:
+    case FFI_THISCALL:
+    case FFI_FASTCALL:
+    case FFI_MS_CDECL:
+      break;
+    default:
+      return FFI_BAD_ABI;
     }
-#endif
-  return;
-}
-
-/* Perform machine dependent cif processing */
-ffi_status ffi_prep_cif_machdep(ffi_cif *cif)
-{
-  unsigned int i;
-  ffi_type **ptr;
 
-  /* Set the return type flag */
   switch (cif->rtype->type)
     {
     case FFI_TYPE_VOID:
-    case FFI_TYPE_UINT8:
-    case FFI_TYPE_UINT16:
-    case FFI_TYPE_SINT8:
-    case FFI_TYPE_SINT16:
-#ifdef X86_WIN64
-    case FFI_TYPE_UINT32:
+      flags = X86_RET_VOID;
+      break;
+    case FFI_TYPE_INT:
     case FFI_TYPE_SINT32:
-#endif
-    case FFI_TYPE_SINT64:
+    case FFI_TYPE_UINT32:
+    case FFI_TYPE_POINTER:
+      flags = X86_RET_INT32;
+      break;
     case FFI_TYPE_FLOAT:
+      flags = X86_RET_FLOAT;
+      break;
     case FFI_TYPE_DOUBLE:
-#ifndef X86_WIN64
-#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
+      flags = X86_RET_DOUBLE;
+      break;
     case FFI_TYPE_LONGDOUBLE:
-#endif
-#endif
-      cif->flags = (unsigned) cif->rtype->type;
+      flags = X86_RET_LDOUBLE;
+      break;
+    case FFI_TYPE_UINT8:
+      flags = X86_RET_UINT8;
+      break;
+    case FFI_TYPE_SINT8:
+      flags = X86_RET_SINT8;
+      break;
+    case FFI_TYPE_UINT16:
+      flags = X86_RET_UINT16;
+      break;
+    case FFI_TYPE_SINT16:
+      flags = X86_RET_SINT16;
       break;
-
     case FFI_TYPE_UINT64:
-#ifdef X86_WIN64
-    case FFI_TYPE_POINTER:
-#endif
-      cif->flags = FFI_TYPE_SINT64;
+    case FFI_TYPE_SINT64:
+      flags = X86_RET_INT64;
       break;
-
     case FFI_TYPE_STRUCT:
-#ifndef X86
-      if (cif->rtype->size == 1)
-        {
-          cif->flags = FFI_TYPE_SMALL_STRUCT_1B; /* same as char size */
-        }
-      else if (cif->rtype->size == 2)
-        {
-          cif->flags = FFI_TYPE_SMALL_STRUCT_2B; /* same as short size */
-        }
-      else if (cif->rtype->size == 4)
-        {
-#ifdef X86_WIN64
-          cif->flags = FFI_TYPE_SMALL_STRUCT_4B;
-#else
-          cif->flags = FFI_TYPE_INT; /* same as int type */
-#endif
-        }
-      else if (cif->rtype->size == 8)
-        {
-          cif->flags = FFI_TYPE_SINT64; /* same as int64 type */
-        }
-      else
-#endif
-        {
-#ifdef X86_WIN32
-          if (cif->abi == FFI_MS_CDECL)
-            cif->flags = FFI_TYPE_MS_STRUCT;
-          else
-#endif
-            cif->flags = FFI_TYPE_STRUCT;
-          /* allocate space for return value pointer */
-          cif->bytes += ALIGN(sizeof(void*), FFI_SIZEOF_ARG);
-        }
+      switch (cabi)
+	{
+	case FFI_THISCALL:
+	case FFI_FASTCALL:
+	case FFI_MS_CDECL:
+	  flags = X86_RET_STRUCTECX;
+	  break;
+	default:
+	  flags = X86_RET_STRUCTPOP;
+	  break;
+	}
+      /* Allocate space for return value pointer.  */
+      bytes += ALIGN(sizeof(void*), FFI_SIZEOF_ARG);
       break;
-
     default:
-#ifdef X86_WIN64
-      cif->flags = FFI_TYPE_SINT64;
-      break;
-    case FFI_TYPE_INT:
-      cif->flags = FFI_TYPE_SINT32;
-#else
-      cif->flags = FFI_TYPE_INT;
-#endif
-      break;
+      abort();
     }
+  cif->flags = flags;
 
-  for (ptr = cif->arg_types, i = cif->nargs; i > 0; i--, ptr++)
+  for (i = 0, n = cif->nargs; i < n; ++i)
     {
-      if (((*ptr)->alignment - 1) & cif->bytes)
-        cif->bytes = ALIGN(cif->bytes, (*ptr)->alignment);
-      cif->bytes += ALIGN((*ptr)->size, FFI_SIZEOF_ARG);
-    }
-
-#ifdef X86_WIN64
-  /* ensure space for storing four registers */
-  cif->bytes += 4 * sizeof(ffi_arg);
-#endif
+      ffi_type *t = cif->arg_types[i];
 
-#ifdef X86_DARWIN
-  cif->bytes = (cif->bytes + 15) & ~0xF;
-#endif
+      bytes = ALIGN (bytes, t->alignment);
+      bytes += ALIGN(t->size, FFI_SIZEOF_ARG);
+    }
 
   return FFI_OK;
 }
 
-#ifdef X86_WIN64
-extern int
-ffi_call_win64(void (*)(char *, extended_cif *), extended_cif *,
-               unsigned, unsigned, unsigned *, void (*fn)(void));
-#elif defined(X86_WIN32)
-extern void
-ffi_call_win32(void (*)(char *, extended_cif *), extended_cif *,
-               unsigned, unsigned, unsigned, unsigned *, void (*fn)(void));
-#else
-extern void ffi_call_SYSV(void (*)(char *, extended_cif *), extended_cif *,
-                          unsigned, unsigned, unsigned *, void (*fn)(void));
-#endif
-
-void ffi_call(ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
+static ffi_arg
+extend_basic_type(void *arg, int type)
 {
-  extended_cif ecif;
-
-  ecif.cif = cif;
-  ecif.avalue = avalue;
-  
-  /* If the return value is a struct and we don't have a return */
-  /* value address then we need to make one                     */
-
-#ifdef X86_WIN64
-  if (rvalue == NULL
-      && cif->flags == FFI_TYPE_STRUCT
-      && cif->rtype->size != 1 && cif->rtype->size != 2
-      && cif->rtype->size != 4 && cif->rtype->size != 8)
+  switch (type)
     {
-      ecif.rvalue = alloca((cif->rtype->size + 0xF) & ~0xF);
-    }
-#else
-  if (rvalue == NULL
-      && (cif->flags == FFI_TYPE_STRUCT
-          || cif->flags == FFI_TYPE_MS_STRUCT))
-    {
-      ecif.rvalue = alloca(cif->rtype->size);
-    }
-#endif
-  else
-    ecif.rvalue = rvalue;
-    
-  
-  switch (cif->abi) 
-    {
-#ifdef X86_WIN64
-    case FFI_WIN64:
-      ffi_call_win64(ffi_prep_args, &ecif, cif->bytes,
-                     cif->flags, ecif.rvalue, fn);
-      break;
-#elif defined(X86_WIN32)
-    case FFI_SYSV:
-    case FFI_STDCALL:
-    case FFI_MS_CDECL:
-      ffi_call_win32(ffi_prep_args, &ecif, cif->abi, cif->bytes, cif->flags,
-		     ecif.rvalue, fn);
-      break;
-    case FFI_THISCALL:
-    case FFI_FASTCALL:
-      {
-	unsigned int abi = cif->abi;
-	unsigned int i, passed_regs = 0;
-
-	if (cif->flags == FFI_TYPE_STRUCT)
-	  ++passed_regs;
+    case FFI_TYPE_SINT8:
+      return *(SINT8 *)arg;
+    case FFI_TYPE_UINT8:
+      return *(UINT8 *)arg;
+    case FFI_TYPE_SINT16:
+      return *(SINT16 *)arg;
+    case FFI_TYPE_UINT16:
+      return *(UINT16 *)arg;
 
-	for (i=0; i < cif->nargs && passed_regs < 2;i++)
-	  {
-	    size_t sz;
+    case FFI_TYPE_SINT32:
+    case FFI_TYPE_UINT32:
+    case FFI_TYPE_POINTER:
+    case FFI_TYPE_FLOAT:
+      return *(UINT32 *)arg;
 
-	    if (cif->arg_types[i]->type == FFI_TYPE_FLOAT
-	        || cif->arg_types[i]->type == FFI_TYPE_STRUCT)
-	      continue;
-	    sz = (cif->arg_types[i]->size + 3) & ~3;
-	    if (sz == 0 || sz > 4)
-	      continue;
-	    ++passed_regs;
-	  }
-	if (passed_regs < 2 && abi == FFI_FASTCALL)
-	  abi = FFI_THISCALL;
-	if (passed_regs < 1 && abi == FFI_THISCALL)
-	  abi = FFI_STDCALL;
-        ffi_call_win32(ffi_prep_args, &ecif, abi, cif->bytes, cif->flags,
-                       ecif.rvalue, fn);
-      }
-      break;
-#else
-    case FFI_SYSV:
-      ffi_call_SYSV(ffi_prep_args, &ecif, cif->bytes, cif->flags, ecif.rvalue,
-                    fn);
-      break;
-#endif
     default:
-      FFI_ASSERT(0);
-      break;
+      abort();
     }
 }
 
-
-/** private members **/
-
-/* The following __attribute__((regparm(1))) decorations will have no effect
-   on MSVC - standard cdecl convention applies. */
-static void ffi_prep_incoming_args_SYSV (char *stack, void **ret,
-                                         void** args, ffi_cif* cif);
-void FFI_HIDDEN ffi_closure_SYSV (ffi_closure *)
-     __attribute__ ((regparm(1)));
-unsigned int FFI_HIDDEN ffi_closure_SYSV_inner (ffi_closure *, void **, void *)
-     __attribute__ ((regparm(1)));
-void FFI_HIDDEN ffi_closure_raw_SYSV (ffi_raw_closure *)
-     __attribute__ ((regparm(1)));
-#ifdef X86_WIN32
-void FFI_HIDDEN ffi_closure_raw_THISCALL (ffi_raw_closure *)
-     __attribute__ ((regparm(1)));
-void FFI_HIDDEN ffi_closure_STDCALL (ffi_closure *)
-     __attribute__ ((regparm(1)));
-void FFI_HIDDEN ffi_closure_THISCALL (ffi_closure *)
-     __attribute__ ((regparm(1)));
-#endif
-#ifdef X86_WIN64
-void FFI_HIDDEN ffi_closure_win64 (ffi_closure *);
-#endif
-
-/* This function is jumped to by the trampoline */
-
-#ifdef X86_WIN64
-void * FFI_HIDDEN
-ffi_closure_win64_inner (ffi_closure *closure, void *args) {
-  ffi_cif       *cif;
-  void         **arg_area;
-  void          *result;
-  void          *resp = &result;
-
-  cif         = closure->cif;
-  arg_area    = (void**) alloca (cif->nargs * sizeof (void*));  
-
-  /* this call will initialize ARG_AREA, such that each
-   * element in that array points to the corresponding 
-   * value on the stack; and if the function returns
-   * a structure, it will change RESP to point to the
-   * structure return address.  */
-
-  ffi_prep_incoming_args_SYSV(args, &resp, arg_area, cif);
-  
-  (closure->fun) (cif, resp, arg_area, closure->user_data);
-
-  /* The result is returned in rax.  This does the right thing for
-     result types except for floats; we have to 'mov xmm0, rax' in the
-     caller to correct this.
-     TODO: structure sizes of 3 5 6 7 are returned by reference, too!!!
-  */
-  return cif->rtype->size > sizeof(void *) ? resp : *(void **)resp;
-}
-
-#else
-unsigned int FFI_HIDDEN __attribute__ ((regparm(1)))
-ffi_closure_SYSV_inner (ffi_closure *closure, void **respp, void *args)
+struct ffi_call_frame
 {
-  /* our various things...  */
-  ffi_cif       *cif;
-  void         **arg_area;
+  void *ebp;		/* 0 */
+  void *retaddr;	/* 4 */
+  void (*fn)(void);	/* 8 */
+  int flags;		/* 12 */
+  void *rvalue;		/* 16 */
+  unsigned eax;		/* 20 */
+  unsigned edx;		/* 24 */
+  unsigned ecx;		/* 28 */
+};
+
+extern void ffi_call_i386(struct ffi_call_frame *, char *)
+	FFI_HIDDEN __attribute__((fastcall));
 
-  cif         = closure->cif;
-  arg_area    = (void**) alloca (cif->nargs * sizeof (void*));  
-
-  /* this call will initialize ARG_AREA, such that each
-   * element in that array points to the corresponding 
-   * value on the stack; and if the function returns
-   * a structure, it will change RESP to point to the
-   * structure return address.  */
-
-  ffi_prep_incoming_args_SYSV(args, respp, arg_area, cif);
+void
+ffi_call(ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
+{
+  size_t rsize;
+  struct ffi_call_frame *frame;
+  char *stack, *argp;
+  ffi_type **arg_types;
+  int flags, cabi, i, n, narg_reg;
+  size_t bytes;
+
+  flags = cif->flags;
+  cabi = cif->abi;
+
+  rsize = 0;
+  if (rvalue == NULL)
+    {
+      switch (flags)
+	{
+	case X86_RET_FLOAT:
+	case X86_RET_DOUBLE:
+	case X86_RET_LDOUBLE:
+	case X86_RET_STRUCTPOP:
+	case X86_RET_STRUCTECX:
+	  /* The float cases need to pop the 387 stack.
+	     The struct cases need to pass a valid pointer to the callee.  */
+	  rsize = cif->rtype->size;
+	  break;
+	default:
+	  /* We can just pretend the callee returns nothing.  */
+	  flags = X86_RET_VOID;
+	  break;
+	}
+    }
 
-  (closure->fun) (cif, *respp, arg_area, closure->user_data);
+  bytes = ALIGN (cif->bytes, 16);
+  argp = stack = alloca(bytes + sizeof(*frame) + rsize);
+  frame = (struct ffi_call_frame *)(stack + bytes);
+  if (rsize)
+    rvalue = frame + 1;
 
-  return cif->flags;
-}
-#endif /* !X86_WIN64 */
+  frame->fn = fn;
+  frame->flags = flags;
+  frame->rvalue = rvalue;
 
-static void
-ffi_prep_incoming_args_SYSV(char *stack, void **rvalue, void **avalue,
-                            ffi_cif *cif)
-{
-  register unsigned int i;
-  register void **p_argv;
-  register char *argp;
-  register ffi_type **p_arg;
-
-  argp = stack;
-
-#ifdef X86_WIN64
-  if (cif->rtype->size > sizeof(ffi_arg)
-      || (cif->flags == FFI_TYPE_STRUCT
-          && (cif->rtype->size != 1 && cif->rtype->size != 2
-              && cif->rtype->size != 4 && cif->rtype->size != 8))) {
-    *rvalue = *(void **) argp;
-    argp += sizeof(void *);
-  }
-#else
-  if ( cif->flags == FFI_TYPE_STRUCT
-       || cif->flags == FFI_TYPE_MS_STRUCT ) {
-    *rvalue = *(void **) argp;
-    argp += sizeof(void *);
-  }
-#endif
-
-  p_argv = avalue;
+  narg_reg = 0;
+  switch (flags)
+    {
+    case X86_RET_STRUCTECX:
+      if (cabi == FFI_THISCALL || cabi == FFI_FASTCALL)
+	{
+	  /* For fastcall/thiscall this is first register-passed argument.  */
+	  frame->ecx = (unsigned)rvalue;
+	  narg_reg = 1;
+	  break;
+	}
+      /* fallthru */
+    case X86_RET_STRUCTPOP:
+      *(void **)argp = rvalue;
+      argp += sizeof(void*);
+      break;
+    }
 
-  for (i = cif->nargs, p_arg = cif->arg_types; (i != 0); i--, p_arg++)
+  arg_types = cif->arg_types;
+  for (i = 0, n = cif->nargs; i < n; ++i)
     {
-      size_t z;
+      size_t z = arg_types[i]->size;
+      int t = arg_types[i]->type;
+      void *valp = avalue[i];
 
-      /* Align if necessary */
-      if ((sizeof(void*) - 1) & (size_t) argp) {
-        argp = (char *) ALIGN(argp, sizeof(void*));
-      }
+      if (z <= sizeof(ffi_arg) && t != FFI_TYPE_STRUCT)
+	{
+	  ffi_arg val = extend_basic_type (valp, t);
+
+	  /* For thiscall/fastcall convention register-passed arguments
+	     are the first two non-floating-point, non-aggregate arguments
+	     with a size smaller or equal to sizeof(ffi_arg).  */
+	  if (t != FFI_TYPE_FLOAT
+	      && ((cabi == FFI_THISCALL && narg_reg < 1)
+		  || (cabi == FFI_FASTCALL && narg_reg < 2)))
+	    {
+	      if (narg_reg == 0)
+		frame->ecx = val;
+	      else
+		frame->edx = val;
+	      narg_reg++;
+	      continue;
+	    }
 
-#ifdef X86_WIN64
-      if ((*p_arg)->size > sizeof(ffi_arg)
-          || ((*p_arg)->type == FFI_TYPE_STRUCT
-              && ((*p_arg)->size != 1 && (*p_arg)->size != 2
-                  && (*p_arg)->size != 4 && (*p_arg)->size != 8)))
-        {
-          z = sizeof(void *);
-          *p_argv = *(void **)argp;
+	  *(ffi_arg *)argp = val;
+	  z = sizeof(ffi_arg);
         }
       else
-#endif
         {
-          z = (*p_arg)->size;
-          
-          /* because we're little endian, this is what it turns into.   */
-          
-          *p_argv = (void*) argp;
+          memcpy(argp, valp, z);
+	  z = ALIGN(z, sizeof(ffi_arg));
         }
-          
-      p_argv++;
-#ifdef X86_WIN64
-      argp += (z + sizeof(void*) - 1) & ~(sizeof(void*) - 1);
-#else
+
       argp += z;
-#endif
     }
-  
-  return;
+
+  ffi_call_i386(frame, stack);
 }
 
-#define FFI_INIT_TRAMPOLINE_WIN64(TRAMP,FUN,CTX,MASK) \
-{ unsigned char *__tramp = (unsigned char*)(TRAMP); \
-   void*  __fun = (void*)(FUN); \
-   void*  __ctx = (void*)(CTX); \
-   *(unsigned char*) &__tramp[0] = 0x41; \
-   *(unsigned char*) &__tramp[1] = 0xbb; \
-   *(unsigned int*) &__tramp[2] = MASK; /* mov $mask, %r11 */ \
-   *(unsigned char*) &__tramp[6] = 0x48; \
-   *(unsigned char*) &__tramp[7] = 0xb8; \
-   *(void**) &__tramp[8] = __ctx; /* mov __ctx, %rax */ \
-   *(unsigned char *)  &__tramp[16] = 0x49; \
-   *(unsigned char *)  &__tramp[17] = 0xba; \
-   *(void**) &__tramp[18] = __fun; /* mov __fun, %r10 */ \
-   *(unsigned char *)  &__tramp[26] = 0x41; \
-   *(unsigned char *)  &__tramp[27] = 0xff; \
-   *(unsigned char *)  &__tramp[28] = 0xe2; /* jmp %r10 */ \
- }
+/* ------- Closure API support ----------------------------------- */
 
 /* How to make a trampoline.  Derived from gcc/config/i386/i386.c. */
 
-#define FFI_INIT_TRAMPOLINE(TRAMP,FUN,CTX) \
-{ unsigned char *__tramp = (unsigned char*)(TRAMP); \
-   unsigned int  __fun = (unsigned int)(FUN); \
-   unsigned int  __ctx = (unsigned int)(CTX); \
-   unsigned int  __dis = __fun - (__ctx + 10);  \
-   *(unsigned char*) &__tramp[0] = 0xb8; \
-   *(unsigned int*)  &__tramp[1] = __ctx; /* movl __ctx, %eax */ \
-   *(unsigned char *)  &__tramp[5] = 0xe9; \
-   *(unsigned int*)  &__tramp[6] = __dis; /* jmp __fun  */ \
- }
-
-#define FFI_INIT_TRAMPOLINE_THISCALL(TRAMP,FUN,CTX,SIZE) \
-{ unsigned char *__tramp = (unsigned char*)(TRAMP); \
-   unsigned int  __fun = (unsigned int)(FUN); \
-   unsigned int  __ctx = (unsigned int)(CTX); \
-   unsigned int  __dis = __fun - (__ctx + 49);  \
-   unsigned short __size = (unsigned short)(SIZE); \
-   *(unsigned int *) &__tramp[0] = 0x8324048b;	/* mov (%esp), %eax */ \
-   *(unsigned int *) &__tramp[4] = 0x4c890cec;	/* sub $12, %esp */ \
-   *(unsigned int *) &__tramp[8] = 0x04890424;	/* mov %ecx, 4(%esp) */ \
-   *(unsigned char*) &__tramp[12] = 0x24;	/* mov %eax, (%esp) */ \
-   *(unsigned char*) &__tramp[13] = 0xb8; \
-   *(unsigned int *) &__tramp[14] = __size;	/* mov __size, %eax */ \
-   *(unsigned int *) &__tramp[18] = 0x08244c8d;	/* lea 8(%esp), %ecx */ \
-   *(unsigned int *) &__tramp[22] = 0x4802e8c1; /* shr $2, %eax ; dec %eax */ \
-   *(unsigned short*) &__tramp[26] = 0x0b74;	/* jz 1f */ \
-   *(unsigned int *) &__tramp[28] = 0x8908518b;	/* 2b: mov 8(%ecx), %edx */ \
-   *(unsigned int *) &__tramp[32] = 0x04c18311; /* mov %edx, (%ecx) ; add $4, %ecx */ \
-   *(unsigned char*) &__tramp[36] = 0x48;	/* dec %eax */ \
-   *(unsigned short*) &__tramp[37] = 0xf575;	/* jnz 2b ; 1f: */ \
-   *(unsigned char*) &__tramp[39] = 0xb8; \
-   *(unsigned int*)  &__tramp[40] = __ctx; /* movl __ctx, %eax */ \
-   *(unsigned char *)  &__tramp[44] = 0xe8; \
-   *(unsigned int*)  &__tramp[45] = __dis; /* call __fun  */ \
-   *(unsigned char*)  &__tramp[49] = 0xc2; /* ret  */ \
-   *(unsigned short*)  &__tramp[50] = (__size + 8); /* ret (__size + 8)  */ \
- }
-
-#define FFI_INIT_TRAMPOLINE_STDCALL(TRAMP,FUN,CTX,SIZE)  \
-{ unsigned char *__tramp = (unsigned char*)(TRAMP); \
-   unsigned int  __fun = (unsigned int)(FUN); \
-   unsigned int  __ctx = (unsigned int)(CTX); \
-   unsigned int  __dis = __fun - (__ctx + 10); \
-   unsigned short __size = (unsigned short)(SIZE); \
-   *(unsigned char*) &__tramp[0] = 0xb8; \
-   *(unsigned int*)  &__tramp[1] = __ctx; /* movl __ctx, %eax */ \
-   *(unsigned char *)  &__tramp[5] = 0xe8; \
-   *(unsigned int*)  &__tramp[6] = __dis; /* call __fun  */ \
-   *(unsigned char *)  &__tramp[10] = 0xc2; \
-   *(unsigned short*)  &__tramp[11] = __size; /* ret __size  */ \
- }
-
-/* the cif must already be prep'ed */
+extern void ffi_closure_i386(void) FFI_HIDDEN;
+extern void ffi_closure_i386_stdcall(void) FFI_HIDDEN;
+
+static void
+ffi_init_trampoline(char *tramp, void (*dest)(void), void *codeloc)
+{
+  unsigned diff = (unsigned)dest;
+
+  tramp[0] = 0xb8;			/* movl codeloc, %eax */
+  *(void **)(tramp + 1) = codeloc;
+  tramp[5] = 0xe9;			/* jmp  ffi_closure_i386 */
+  diff -= (unsigned)tramp + 10;
+  *(unsigned*)(tramp + 6) = diff;
+}
 
 ffi_status
 ffi_prep_closure_loc (ffi_closure* closure,
@@ -650,61 +291,116 @@ ffi_prep_closure_loc (ffi_closure* closure,
                       void *user_data,
                       void *codeloc)
 {
-#ifdef X86_WIN64
-#define ISFLOAT(IDX) (cif->arg_types[IDX]->type == FFI_TYPE_FLOAT || cif->arg_types[IDX]->type == FFI_TYPE_DOUBLE)
-#define FLAG(IDX) (cif->nargs>(IDX)&&ISFLOAT(IDX)?(1<<(IDX)):0)
-  if (cif->abi == FFI_WIN64) 
-    {
-      int mask = FLAG(0)|FLAG(1)|FLAG(2)|FLAG(3);
-      FFI_INIT_TRAMPOLINE_WIN64 (&closure->tramp[0],
-                                 &ffi_closure_win64,
-                                 codeloc, mask);
-      /* make sure we can execute here */
-    }
-#else
-  if (cif->abi == FFI_SYSV)
-    {
-      FFI_INIT_TRAMPOLINE (&closure->tramp[0],
-                           &ffi_closure_SYSV,
-                           (void*)codeloc);
-    }
-#ifdef X86_WIN32
-  else if (cif->abi == FFI_THISCALL)
-    {
-      FFI_INIT_TRAMPOLINE_THISCALL (&closure->tramp[0],
-				    &ffi_closure_THISCALL,
-				    (void*)codeloc,
-				    cif->bytes);
-    }
-  else if (cif->abi == FFI_STDCALL)
+  void (*fn)(void);
+
+  switch (cif->abi)
     {
-      FFI_INIT_TRAMPOLINE_STDCALL (&closure->tramp[0],
-                                   &ffi_closure_STDCALL,
-                                   (void*)codeloc, cif->bytes);
+    case FFI_SYSV:
+    case FFI_THISCALL:
+    case FFI_FASTCALL:
+    case FFI_MS_CDECL:
+      fn = ffi_closure_i386;
+      break;
+
+    case FFI_STDCALL:
+      fn = ffi_closure_i386_stdcall;
+      break;
+
+    default:
+      return FFI_BAD_ABI;
     }
-  else if (cif->abi == FFI_MS_CDECL)
+
+  ffi_init_trampoline (closure->tramp, fn, codeloc);
+
+  closure->cif = cif;
+  closure->fun = fun;
+  closure->user_data = user_data;
+
+  return FFI_OK;
+}
+
+struct ffi_closure_frame
+{
+  unsigned rettemp[4];	/* 0 */
+  unsigned eax;		/* 16 */
+  unsigned edx;		/* 20 */
+  unsigned ecx;		/* 24 */
+  ffi_closure *closure;	/* 28 */
+};
+
+unsigned int FFI_HIDDEN __attribute__ ((fastcall))
+ffi_closure_inner (struct ffi_closure_frame *frame, char *argp)
+{
+  ffi_closure *closure = frame->closure;
+  ffi_cif *cif = closure->cif;
+  int cabi, i, n, flags, narg_reg;
+  ffi_type **arg_types;
+  void *rvalue;
+  void **avalue;
+
+  cabi = cif->abi;
+  flags = cif->flags;
+  narg_reg = 0;
+  rvalue = frame->rettemp;
+
+  switch (flags)
     {
-      FFI_INIT_TRAMPOLINE (&closure->tramp[0],
-                           &ffi_closure_SYSV,
-                           (void*)codeloc);
+    case X86_RET_STRUCTECX:
+      if (cabi == FFI_THISCALL || cabi == FFI_FASTCALL)
+	{
+	  rvalue = (void *)frame->ecx;
+	  narg_reg = 1;
+	  break;
+	}
+      /* fallthru */
+    case X86_RET_STRUCTPOP:
+      rvalue = *(void **)argp;
+      argp += sizeof(void *);
+      break;
     }
-#endif /* X86_WIN32 */
-#endif /* !X86_WIN64 */
-  else
+
+  n = cif->nargs;
+  avalue = alloca(sizeof(void *) * n);
+
+  arg_types = cif->arg_types;
+  for (i = 0; i < n; ++i)
     {
-      return FFI_BAD_ABI;
+      size_t z = arg_types[i]->size;
+      int t = arg_types[i]->type;
+      void *valp;
+
+      if (z <= sizeof(ffi_arg)
+	  && t != FFI_TYPE_STRUCT && t != FFI_TYPE_FLOAT
+	  && ((cabi == FFI_THISCALL && narg_reg < 1)
+	      || (cabi == FFI_FASTCALL && narg_reg < 2)))
+	{
+	  if (narg_reg == 0)
+	    valp = &frame->ecx;
+	  else
+	    valp = &frame->edx;
+	}
+      else
+	{
+	  valp = argp;
+	  z = ALIGN (z, 4);
+	  argp += z;
+	}
+
+      avalue[i] = valp;
     }
-    
-  closure->cif  = cif;
-  closure->user_data = user_data;
-  closure->fun  = fun;
 
-  return FFI_OK;
+  closure->fun (cif, rvalue, avalue, closure->user_data);
+
+  if (cabi == FFI_STDCALL)
+    return flags + (cif->bytes << X86_RET_POP_SHIFT);
+  else
+    return flags;
 }
 
 /* ------- Native raw API support -------------------------------- */
 
-#if !FFI_NO_RAW_API
+extern void ffi_closure_raw_SYSV(void) FFI_HIDDEN;
+extern void ffi_closure_raw_THISCALL(void) FFI_HIDDEN;
 
 ffi_status
 ffi_prep_raw_closure_loc (ffi_raw_closure* closure,
@@ -713,131 +409,154 @@ ffi_prep_raw_closure_loc (ffi_raw_closure* closure,
                           void *user_data,
                           void *codeloc)
 {
+  void (*fn)(void);
   int i;
 
-  if (cif->abi != FFI_SYSV) {
-#ifdef X86_WIN32
-    if (cif->abi != FFI_THISCALL)
-#endif
-    return FFI_BAD_ABI;
-  }
-
-  /* we currently don't support certain kinds of arguments for raw
-     closures.  This should be implemented by a separate assembly
-     language routine, since it would require argument processing,
-     something we don't do now for performance.  */
-
+  /* We currently don't support certain kinds of arguments for raw closures.
+     This should be implemented by a separate assembly language routine,
+     since it would require argument processing, something we don't do now
+     for performance.  */
   for (i = cif->nargs-1; i >= 0; i--)
     {
       FFI_ASSERT (cif->arg_types[i]->type != FFI_TYPE_STRUCT);
       FFI_ASSERT (cif->arg_types[i]->type != FFI_TYPE_LONGDOUBLE);
     }
   
-#ifdef X86_WIN32
-  if (cif->abi == FFI_SYSV)
-    {
-#endif
-  FFI_INIT_TRAMPOLINE (&closure->tramp[0], &ffi_closure_raw_SYSV,
-                       codeloc);
-#ifdef X86_WIN32
-    }
-  else if (cif->abi == FFI_THISCALL)
+  switch (cif->abi)
     {
-      FFI_INIT_TRAMPOLINE_THISCALL (&closure->tramp[0], &ffi_closure_raw_THISCALL,
-				    codeloc, cif->bytes);
-    }
-#endif
-  closure->cif  = cif;
+    case FFI_SYSV:
+      fn = ffi_closure_raw_SYSV;
+      break;
+    case FFI_THISCALL:
+      fn = ffi_closure_raw_THISCALL;
+      break;
+    default:
+      return FFI_BAD_ABI;
+  }
+
+  ffi_init_trampoline (closure->tramp, fn, codeloc);
+
+  closure->cif = cif;
+  closure->fun = fun;
   closure->user_data = user_data;
-  closure->fun  = fun;
 
   return FFI_OK;
 }
 
-static void 
-ffi_prep_args_raw(char *stack, extended_cif *ecif)
-{
-  memcpy (stack, ecif->avalue, ecif->cif->bytes);
-}
-
-/* we borrow this routine from libffi (it must be changed, though, to
- * actually call the function passed in the first argument.  as of
- * libffi-1.20, this is not the case.)
- */
-
 void
-ffi_raw_call(ffi_cif *cif, void (*fn)(void), void *rvalue, ffi_raw *fake_avalue)
+ffi_raw_call(ffi_cif *cif, void (*fn)(void), void *rvalue, ffi_raw *avalue)
 {
-  extended_cif ecif;
-  void **avalue = (void **)fake_avalue;
-
-  ecif.cif = cif;
-  ecif.avalue = avalue;
+  size_t rsize;
+  struct ffi_call_frame *frame;
+  char *stack, *argp;
+  int flags, cabi, narg_reg;
+  size_t bytes;
   
-  /* If the return value is a struct and we don't have a return */
-  /* value address then we need to make one                     */
+  flags = cif->flags;
+  cabi = cif->abi;
+  
+  rsize = 0;
+  if (rvalue == NULL)
+    {
+      switch (flags)
+	{
+	case X86_RET_FLOAT:
+	case X86_RET_DOUBLE:
+	case X86_RET_LDOUBLE:
+	case X86_RET_STRUCTPOP:
+	case X86_RET_STRUCTECX:
+	  /* The float cases need to pop the 387 stack.
+	     The struct cases need to pass a valid pointer to the callee.  */
+	  rsize = cif->rtype->size;
+	  break;
+	default:
+	  /* We can just pretend the callee returns nothing.  */
+	  flags = X86_RET_VOID;
+	  break;
+	}
+    }
+
+  bytes = cif->bytes;
+  argp = stack = alloca(bytes + sizeof(*frame) + rsize);
+  frame = (struct ffi_call_frame *)(stack + bytes);
+  if (rsize)
+    rvalue = frame + 1;
 
-  if (rvalue == NULL
-      && (cif->flags == FFI_TYPE_STRUCT
-          || cif->flags == FFI_TYPE_MS_STRUCT))
+  frame->fn = fn;
+  frame->flags = flags;
+  frame->rvalue = rvalue;
+
+  narg_reg = 0;
+  switch (flags)
     {
-      ecif.rvalue = alloca(cif->rtype->size);
+    case X86_RET_STRUCTECX:
+      if (cabi == FFI_THISCALL || cabi == FFI_FASTCALL)
+	{
+	  /* fastcall/thiscall: passed as the first register argument.  */
+	  frame->ecx = (unsigned)rvalue;
+	  narg_reg = 1;
+	  break;
+	}
+      /* fallthru */
+    case X86_RET_STRUCTPOP:
+      *(void **)argp = rvalue;
+      argp += sizeof(void *);
+      bytes -= sizeof(void *);
+      break;
     }
-  else
-    ecif.rvalue = rvalue;
-    
-  
+
   switch (cif->abi) 
     {
-#ifdef X86_WIN32
     case FFI_SYSV:
     case FFI_STDCALL:
     case FFI_MS_CDECL:
-      ffi_call_win32(ffi_prep_args_raw, &ecif, cif->abi, cif->bytes, cif->flags,
-		     ecif.rvalue, fn);
+      memcpy (argp, avalue, bytes);
       break;
+
     case FFI_THISCALL:
     case FFI_FASTCALL:
       {
-	unsigned int abi = cif->abi;
-	unsigned int i, passed_regs = 0;
-
-	if (cif->flags == FFI_TYPE_STRUCT)
-	  ++passed_regs;
-
-	for (i=0; i < cif->nargs && passed_regs < 2;i++)
+	int narg_max = (cif->abi == FFI_FASTCALL ? 2 : 1);
+	ffi_type **arg_types = cif->arg_types;
+	int i, n = cif->nargs;
+	
+	for (i = 0; i < n && narg_reg < narg_max; i++)
 	  {
-	    size_t sz;
-
-	    if (cif->arg_types[i]->type == FFI_TYPE_FLOAT
-	        || cif->arg_types[i]->type == FFI_TYPE_STRUCT)
-	      continue;
-	    sz = (cif->arg_types[i]->size + 3) & ~3;
-	    if (sz == 0 || sz > 4)
-	      continue;
-	    ++passed_regs;
+	    size_t z = arg_types[i]->size;
+	    int t = arg_types[i]->type;
+
+	    if (z <= sizeof(ffi_arg)
+		&& t != FFI_TYPE_STRUCT
+		&& t != FFI_TYPE_FLOAT)
+	      {
+		ffi_arg val = extend_basic_type (avalue, t);
+		if (narg_reg == 0)
+		  frame->ecx = val;
+		else
+		  frame->edx = val;
+		narg_reg++;
+		z = sizeof(ffi_arg);
+	      }
+	    else
+	      {
+		memcpy (argp, avalue, z);
+		z = ALIGN (z, sizeof(ffi_arg));
+		argp += z;
+	      }
+	    avalue += z;
+	    bytes -= z;
 	  }
-	if (passed_regs < 2 && abi == FFI_FASTCALL)
-	  cif->abi = abi = FFI_THISCALL;
-	if (passed_regs < 1 && abi == FFI_THISCALL)
-	  cif->abi = abi = FFI_STDCALL;
-        ffi_call_win32(ffi_prep_args_raw, &ecif, abi, cif->bytes, cif->flags,
-                       ecif.rvalue, fn);
+	if (i < n)
+	  memcpy (argp, avalue, bytes);
       }
       break;
-#else
-    case FFI_SYSV:
-      ffi_call_SYSV(ffi_prep_args_raw, &ecif, cif->bytes, cif->flags,
-                    ecif.rvalue, fn);
-      break;
-#endif
+
     default:
       FFI_ASSERT(0);
-      break;
+      return;
     }
-}
 
-#endif
-
-#endif /* !__x86_64__  || X86_WIN64 */
+  ffi_call_i386(frame, stack);
+}
 
+#endif /* !__x86_64__ */
diff --git a/libffi/src/x86/ffitarget.h b/libffi/src/x86/ffitarget.h
index 592d6f8..bf8d8c6 100644
--- a/libffi/src/x86/ffitarget.h
+++ b/libffi/src/x86/ffitarget.h
@@ -36,107 +36,85 @@
 
 /* ---- System specific configurations ----------------------------------- */
 
-/* For code common to all platforms on x86 and x86_64. */
-#define X86_ANY
-
 #if defined (X86_64) && defined (__i386__)
-#undef X86_64
-#define X86
+# undef X86_64
+# define X86
+#elif defined (X86_WIN64)
+# define X86_64
+#elif defined (X86_DARWIN) && defined (__x86_64__)
+# define X86_64
 #endif
 
 #ifdef X86_WIN64
-#define FFI_SIZEOF_ARG 8
 #define USE_BUILTIN_FFS 0 /* not yet implemented in mingw-64 */
 #endif
 
 /* ---- Generic type definitions ----------------------------------------- */
 
 #ifndef LIBFFI_ASM
-#ifdef X86_WIN64
-#ifdef _MSC_VER
-typedef unsigned __int64       ffi_arg;
-typedef __int64                ffi_sarg;
-#else
-typedef unsigned long long     ffi_arg;
-typedef long long              ffi_sarg;
-#endif
-#else
-#if defined __x86_64__ && defined __ILP32__
-#define FFI_SIZEOF_ARG 8
-#define FFI_SIZEOF_JAVA_RAW  4
-typedef unsigned long long     ffi_arg;
-typedef long long              ffi_sarg;
+
+#ifdef X86_64
+# ifdef _MSC_VER
+typedef unsigned __int64    ffi_arg;
+typedef __int64             ffi_sarg;
+# else
+typedef unsigned long long  ffi_arg;
+typedef long long           ffi_sarg;
+# endif
+# define FFI_SIZEOF_ARG 8
+# ifdef __ILP32__
+#  define FFI_SIZEOF_JAVA_RAW  4
+# endif
 #else
-typedef unsigned long          ffi_arg;
-typedef signed long            ffi_sarg;
-#endif
-#endif
+typedef unsigned long       ffi_arg;
+typedef signed long         ffi_sarg;
+# define FFI_SIZEOF_ARG 4
+#endif /* X86_64 */
 
 typedef enum ffi_abi {
   FFI_FIRST_ABI = 0,
 
-  /* ---- Intel x86 Win32 ---------- */
-#ifdef X86_WIN32
+#ifdef X86_64
+  FFI_WIN64,
+  FFI_UNIX64,
+  FFI_LAST_ABI,
+# ifdef X86_WIN64
+  FFI_DEFAULT_ABI = FFI_WIN64,
+# else
+  FFI_DEFAULT_ABI = FFI_UNIX64,
+# endif
+#else
   FFI_SYSV,
   FFI_STDCALL,
   FFI_THISCALL,
   FFI_FASTCALL,
   FFI_MS_CDECL,
   FFI_LAST_ABI,
-#ifdef _MSC_VER
+# ifdef _MSC_VER
   FFI_DEFAULT_ABI = FFI_MS_CDECL
-#else
-  FFI_DEFAULT_ABI = FFI_SYSV
-#endif
-
-#elif defined(X86_WIN64)
-  FFI_WIN64,
-  FFI_LAST_ABI,
-  FFI_DEFAULT_ABI = FFI_WIN64
-
-#else
-  /* ---- Intel x86 and AMD x86-64 - */
-  FFI_SYSV,
-  FFI_UNIX64,   /* Unix variants all use the same ABI for x86-64  */
-  FFI_LAST_ABI,
-#if defined(__i386__) || defined(__i386)
+# else
   FFI_DEFAULT_ABI = FFI_SYSV
-#else
-  FFI_DEFAULT_ABI = FFI_UNIX64
-#endif
-#endif
+# endif
+#endif /* X86_64 */
 } ffi_abi;
-#endif
+
+#endif /* !LIBFFI_ASM */
 
 /* ---- Definitions for closures ----------------------------------------- */
 
 #define FFI_CLOSURES 1
 #define FFI_GO_CLOSURES 1
 
-#define FFI_TYPE_SMALL_STRUCT_1B (FFI_TYPE_LAST + 1)
-#define FFI_TYPE_SMALL_STRUCT_2B (FFI_TYPE_LAST + 2)
-#define FFI_TYPE_SMALL_STRUCT_4B (FFI_TYPE_LAST + 3)
-#define FFI_TYPE_MS_STRUCT       (FFI_TYPE_LAST + 4)
-
-#if defined (X86_64) || (defined (__x86_64__) && defined (X86_DARWIN))
+#ifdef X86_64
 #define FFI_TRAMPOLINE_SIZE 24
-#define FFI_NATIVE_RAW_API 0
-#else
-#ifdef X86_WIN32
-#define FFI_TRAMPOLINE_SIZE 52
-#else
-#ifdef X86_WIN64
-#define FFI_TRAMPOLINE_SIZE 29
-#define FFI_NATIVE_RAW_API 0
-#define FFI_NO_RAW_API 1
 #else
 #define FFI_TRAMPOLINE_SIZE 10
 #endif
-#endif
-#ifndef X86_WIN64
-#define FFI_NATIVE_RAW_API 1	/* x86 has native raw api support */
-#endif
-#endif
 
+#ifdef X86_64
+# define FFI_NATIVE_RAW_API 0
+#else
+# define FFI_NATIVE_RAW_API 1
 #endif
 
+#endif /* LIBFFI_TARGET_H */
diff --git a/libffi/src/x86/internal.h b/libffi/src/x86/internal.h
new file mode 100644
index 0000000..e1df862
--- /dev/null
+++ b/libffi/src/x86/internal.h
@@ -0,0 +1,48 @@
+/* -----------------------------------------------------------------*-C-*-
+   internal.h - Copyright (c) 2012  Anthony Green
+                Copyright (c) 1996-2003, 2010  Red Hat, Inc.
+                Copyright (C) 2008  Free Software Foundation, Inc.
+
+   Internal configuration macros for x86 and x86-64.
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   ``Software''), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be included
+   in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+   WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+   DEALINGS IN THE SOFTWARE.
+
+   ----------------------------------------------------------------------- */
+
+#define X86_RET_FLOAT		0
+#define X86_RET_DOUBLE		1
+#define X86_RET_LDOUBLE		2
+#define X86_RET_SINT8		3
+#define X86_RET_SINT16		4
+#define X86_RET_UINT8		5
+#define X86_RET_UINT16		6
+#define X86_RET_INT64		7
+#define X86_RET_INT32		8
+#define X86_RET_VOID		9
+#define X86_RET_STRUCTPOP	10
+#define X86_RET_STRUCTECX	11
+#define X86_RET_UNUSED12	12
+#define X86_RET_UNUSED13	13
+#define X86_RET_UNUSED14	14
+#define X86_RET_UNUSED15	15
+
+#define X86_RET_TYPE_MASK	15
+#define X86_RET_POP_SHIFT	4
diff --git a/libffi/src/x86/sysv.S b/libffi/src/x86/sysv.S
index f108dd8..d8256d0 100644
--- a/libffi/src/x86/sysv.S
+++ b/libffi/src/x86/sysv.S
@@ -24,226 +24,363 @@
    DEALINGS IN THE SOFTWARE.
    ----------------------------------------------------------------------- */
 
-#ifndef __x86_64__
+#if !(defined(X86_WIN64) || defined(__x86_64__))
 
 #define LIBFFI_ASM	
 #include <fficonfig.h>
 #include <ffi.h>
+#include "internal.h"
+
+	.text
+
+	.align	16
+	.globl	ffi_call_i386
+        .type	ffi_call_i386,@function
+	FFI_HIDDEN (ffi_call_i386)
+
+/* This macro allows the safe creation of jump tables without an
+   actual table.  The entry points into the table are all 8 bytes.
+   The use of ORG asserts that we're at the correct location.  */
+.macro	E	which
+	.align	8
+	.org	0b + \which * 8
+.endm
+
+/* This is declared as
+
+   void ffi_call_i386(struct ffi_call_frame *frame, char *argp)
+	__attribute__((fastcall));
+
+   Thus the arguments are present in
+
+	ecx: frame
+	edx: argp
+*/
+
+ffi_call_i386:
+	.cfi_startproc
+	movl	(%esp), %eax		/* move the return address */
+	movl	%ebp, (%ecx)		/* store %ebp into local frame */
+	movl	%eax, 4(%ecx)		/* store retaddr into local frame */
+
+	/* New stack frame based off ebp.  This is an itty bit of unwind
+	   trickery in that the CFA *has* changed.  There is no easy way
+	   to describe it correctly on entry to the function.  Fortunately,
+	   it doesn't matter too much since at all points we can correctly
+	   unwind back to ffi_call.  Note that the location to which we
+	   moved the return address is (the new) CFA-4, so from the
+	   perspective of the unwind info, it hasn't moved.  */
+	movl	%ecx, %ebp
+	.cfi_def_cfa %ebp, 8
+	.cfi_rel_offset %ebp, 0
+
+	movl	%edx, %esp		/* set outgoing argument stack */
+	movl	20(%ebp), %eax		/* set register arguments */
+	movl	24(%ebp), %edx
+	movl	28(%ebp), %ecx
+
+	call	*8(%ebp)
+
+	movl	12(%ebp), %ecx		/* load return type code */
+	movl	%ebx, 8(%ebp)		/* preserve %ebx */
+	.cfi_rel_offset %ebx, 8
+
+	andl	$X86_RET_TYPE_MASK, %ecx
+#ifdef __PIC__
+	call	__x86.get_pc_thunk.bx
+1:	leal	0f-1b(%ebx, %ecx, 8), %ebx
+#else
+	leal	0f(,%ecx, 8), %ebx
+#endif
+	movl	16(%ebp), %ecx		/* load result address */
+	jmp	*%ebx
 
-.text
-
-.globl ffi_prep_args
+	.align	8
+0:
+E X86_RET_FLOAT
+	fstps	(%ecx)
+	jmp	9f
+
+E X86_RET_DOUBLE
+	fstpl	(%ecx)
+	jmp	9f
+
+E X86_RET_LDOUBLE
+	fstpt	(%ecx)
+	jmp	9f
+
+E X86_RET_SINT8
+	movsbl	%al, %eax
+	mov	%eax, (%ecx)
+	jmp	9f
+
+E X86_RET_SINT16
+	movswl	%ax, %eax
+	mov	%eax, (%ecx)
+	jmp	9f
+
+E X86_RET_UINT8
+	movzbl	%al, %eax
+	movl	%eax, (%ecx)		/* %ecx = result address */
+	jmp	9f
+
+E X86_RET_UINT16
+	movzwl	%ax, %eax
+	movl	%eax, (%ecx)		/* %ecx = result address */
+	jmp	9f
+
+E X86_RET_INT64
+	movl	%edx, 4(%ecx)
+	/* fallthru */
+E X86_RET_INT32
+	movl	%eax, (%ecx)
+	/* fallthru */
+E X86_RET_VOID
+9:	movl	8(%ebp), %ebx
+	movl	%ebp, %esp
+	popl	%ebp
+	.cfi_remember_state
+	.cfi_def_cfa %esp, 4
+	.cfi_restore %ebx
+	.cfi_restore %ebp
+	ret
+	.cfi_restore_state
 
-	.align 4
-.globl ffi_call_SYSV
-        .type    ffi_call_SYSV,@function
+	/* No struct return path need do anything special.  */
+E X86_RET_STRUCTPOP
+	jmp	9b
+E X86_RET_STRUCTECX
+	jmp	9b
 
-ffi_call_SYSV:
-.LFB1:
-        pushl %ebp
-.LCFI0:
-        movl  %esp,%ebp
-.LCFI1:
-	/* Make room for all of the new args.  */
-	movl  16(%ebp),%ecx
-	subl  %ecx,%esp
+	/* Fill out the table so that bad values are predictable.  */
+E X86_RET_UNUSED12
+	ud2
+E X86_RET_UNUSED13
+	ud2
+E X86_RET_UNUSED14
+	ud2
+E X86_RET_UNUSED15
+	ud2
 
-        /* Align the stack pointer to 16-bytes */
-        andl  $0xfffffff0, %esp
+	.cfi_endproc
+	.size	ffi_call_i386, . - ffi_call_i386
 
-	movl  %esp,%eax
+/* The closure entry points are reached from the ffi_closure trampoline.
+   On entry, %eax contains the address of the ffi_closure.  */
 
-	/* Place all of the ffi_prep_args in position  */
-	pushl 12(%ebp)
-	pushl %eax
-	call  *8(%ebp)
+#define	ffi_closure_FS	(12 + 4*4 + 16)
 
-	/* Return stack to previous state and call the function  */
-	addl  $8,%esp	
+.macro FFI_CLOSURE_FIRST
+	subl	$ffi_closure_FS, %esp
+	.cfi_adjust_cfa_offset ffi_closure_FS
 
-	call  *28(%ebp)
+	movl	%edx, 20(%esp)		/* save incoming register args */
+	movl	%ecx, 24(%esp)
+	movl	%eax, 28(%esp)		/* trampoline loaded closure */
 
-	/* Load %ecx with the return type code  */
-	movl  20(%ebp),%ecx	
+	movl	%esp, %ecx		/* pass save area to C */
+	leal	ffi_closure_FS+4(%esp), %edx
 
-	/* Protect %esi.  We're going to pop it in the epilogue.  */
-	pushl %esi
+#ifdef __PIC__
+	movl	%ebx, 32(%esp)		/* save ebx */
+	.cfi_rel_offset %ebx, 32
+	call	__x86.get_pc_thunk.bx
+	addl	$_GLOBAL_OFFSET_TABLE_, %ebx
+#endif
+#if defined HAVE_HIDDEN_VISIBILITY_ATTRIBUTE || !defined __PIC__
+	call	ffi_closure_inner
+#else
+	call	ffi_closure_inner@PLT
+#endif
+.endm
 
-	/* If the return value pointer is NULL, assume no return value.  */
-	cmpl  $0,24(%ebp)
-	jne  0f
+.macro FFI_CLOSURE_SECOND
+	andl	$X86_RET_TYPE_MASK, %eax
+#ifdef __PIC__
+	leal	0f@GOTOFF(%ebx, %eax, 8), %eax
+	movl	32(%esp), %ebx		/* restore ebx */
+	.cfi_restore %ebx
+#else
+	leal	0f(, %eax, 8), %eax
+#endif
+	jmp	*%eax
+.endm
 
-	/* Even if there is no space for the return value, we are 
-	   obliged to handle floating-point values.  */
-	cmpl  $FFI_TYPE_FLOAT,%ecx
-	jne   noretval
-	fstp  %st(0)
+	.align	16
+	.globl	ffi_closure_i386
+	.type	ffi_closure_i386, @function
+	FFI_HIDDEN (ffi_closure_i386)
 
-        jmp   epilogue
+ffi_closure_i386:
+	.cfi_startproc
+	FFI_CLOSURE_FIRST
+	FFI_CLOSURE_SECOND
 
+	.align	8
 0:
-	call  1f
-
-.Lstore_table:
-	.long	noretval-.Lstore_table	/* FFI_TYPE_VOID */
-	.long	retint-.Lstore_table	/* FFI_TYPE_INT */
-	.long	retfloat-.Lstore_table	/* FFI_TYPE_FLOAT */
-	.long	retdouble-.Lstore_table	/* FFI_TYPE_DOUBLE */
-	.long	retlongdouble-.Lstore_table	/* FFI_TYPE_LONGDOUBLE */
-	.long	retuint8-.Lstore_table	/* FFI_TYPE_UINT8 */
-	.long	retsint8-.Lstore_table	/* FFI_TYPE_SINT8 */
-	.long	retuint16-.Lstore_table	/* FFI_TYPE_UINT16 */
-	.long	retsint16-.Lstore_table	/* FFI_TYPE_SINT16 */
-	.long	retint-.Lstore_table	/* FFI_TYPE_UINT32 */
-	.long	retint-.Lstore_table	/* FFI_TYPE_SINT32 */
-	.long	retint64-.Lstore_table	/* FFI_TYPE_UINT64 */
-	.long	retint64-.Lstore_table	/* FFI_TYPE_SINT64 */
-	.long	retstruct-.Lstore_table	/* FFI_TYPE_STRUCT */
-	.long	retint-.Lstore_table	/* FFI_TYPE_POINTER */
-
-1:
-	pop  %esi
-	add  (%esi, %ecx, 4), %esi
-	jmp  *%esi
-
-	/* Sign/zero extend as appropriate.  */
-retsint8:
-	movsbl  %al, %eax
-	jmp  retint
-
-retsint16:
-	movswl  %ax, %eax
-	jmp  retint
-
-retuint8:
-	movzbl  %al, %eax
-	jmp  retint
-
-retuint16:
-	movzwl  %ax, %eax
-	jmp  retint
-
-retfloat:
-	/* Load %ecx with the pointer to storage for the return value  */
-	movl  24(%ebp),%ecx	
-	fstps (%ecx)
-	jmp   epilogue
-
-retdouble:
-	/* Load %ecx with the pointer to storage for the return value  */
-	movl  24(%ebp),%ecx	
-	fstpl (%ecx)
-	jmp   epilogue
-
-retlongdouble:
-	/* Load %ecx with the pointer to storage for the return value  */
-	movl  24(%ebp),%ecx	
-	fstpt (%ecx)
-	jmp   epilogue
-	
-retint64:	
-	/* Load %ecx with the pointer to storage for the return value  */
-	movl  24(%ebp),%ecx	
-	movl  %eax,0(%ecx)
-	movl  %edx,4(%ecx)
-	jmp   epilogue
-	
-retint:
-	/* Load %ecx with the pointer to storage for the return value  */
-	movl  24(%ebp),%ecx	
-	movl  %eax,0(%ecx)
-
-retstruct:
-	/* Nothing to do!  */
-
-noretval:
-epilogue:
-        popl %esi
-        movl %ebp,%esp
-        popl %ebp
-        ret
-.LFE1:
-.ffi_call_SYSV_end:
-        .size    ffi_call_SYSV,.ffi_call_SYSV_end-ffi_call_SYSV
-
-	.align	4
-FFI_HIDDEN (ffi_closure_SYSV)
-.globl ffi_closure_SYSV
-	.type	ffi_closure_SYSV, @function
-
-ffi_closure_SYSV:
-.LFB2:
-	pushl	%ebp
-.LCFI2:
-	movl	%esp, %ebp
-.LCFI3:
-	subl	$40, %esp
-	leal	-24(%ebp), %edx
-	movl	%edx, -12(%ebp)	/* resp */
-	leal	8(%ebp), %edx
-	movl	%edx, 4(%esp)	/* args = __builtin_dwarf_cfa () */
-	leal	-12(%ebp), %edx
-	movl	%edx, (%esp)	/* &resp */
-#if defined HAVE_HIDDEN_VISIBILITY_ATTRIBUTE || !defined __PIC__
-	call	ffi_closure_SYSV_inner
-#else
-	movl	%ebx, 8(%esp)
-.LCFI7:
-	call	1f
-1:	popl	%ebx
-	addl	$_GLOBAL_OFFSET_TABLE_+[.-1b], %ebx
-	call	ffi_closure_SYSV_inner@PLT
-	movl	8(%esp), %ebx
-#endif
-	movl	-12(%ebp), %ecx
-	cmpl	$FFI_TYPE_INT, %eax
-	je	.Lcls_retint
-
-	/* Handle FFI_TYPE_UINT8, FFI_TYPE_SINT8, FFI_TYPE_UINT16,
-	   FFI_TYPE_SINT16, FFI_TYPE_UINT32, FFI_TYPE_SINT32.  */
-	cmpl	$FFI_TYPE_UINT64, %eax
-	jge	0f
-	cmpl	$FFI_TYPE_UINT8, %eax
-	jge	.Lcls_retint
-	
-0:	cmpl	$FFI_TYPE_FLOAT, %eax
-	je	.Lcls_retfloat
-	cmpl	$FFI_TYPE_DOUBLE, %eax
-	je	.Lcls_retdouble
-	cmpl	$FFI_TYPE_LONGDOUBLE, %eax
-	je	.Lcls_retldouble
-	cmpl	$FFI_TYPE_SINT64, %eax
-	je	.Lcls_retllong
-	cmpl	$FFI_TYPE_STRUCT, %eax
-	je	.Lcls_retstruct
-.Lcls_epilogue:
-	movl	%ebp, %esp
-	popl	%ebp
+E X86_RET_FLOAT
+	flds	(%esp)
+	jmp	9f
+
+E X86_RET_DOUBLE
+	fldl	(%esp)
+	jmp	9f
+
+E X86_RET_LDOUBLE
+	fldt	(%esp)
+	jmp	9f
+
+E X86_RET_SINT8
+	movsbl	(%esp), %eax
+	jmp	9f
+
+E X86_RET_SINT16
+	movswl	(%esp), %eax
+	jmp	9f
+
+E X86_RET_UINT8
+	movzbl	(%esp), %eax
+	jmp	9f
+
+E X86_RET_UINT16
+	movzwl	(%esp), %eax
+	jmp	9f
+
+E X86_RET_INT64
+	movl	4(%esp), %edx
+	/* fallthru */
+E X86_RET_INT32
+	movl	(%esp), %eax
+	/* fallthru */
+E X86_RET_VOID
+9:	addl	$ffi_closure_FS, %esp
+	.cfi_adjust_cfa_offset -ffi_closure_FS
 	ret
-.Lcls_retint:
-	movl	(%ecx), %eax
-	jmp	.Lcls_epilogue
-.Lcls_retfloat:
-	flds	(%ecx)
-	jmp	.Lcls_epilogue
-.Lcls_retdouble:
-	fldl	(%ecx)
-	jmp	.Lcls_epilogue
-.Lcls_retldouble:
-	fldt	(%ecx)
-	jmp	.Lcls_epilogue
-.Lcls_retllong:
-	movl	(%ecx), %eax
-	movl	4(%ecx), %edx
-	jmp	.Lcls_epilogue
-.Lcls_retstruct:
-	movl	%ebp, %esp
-	popl	%ebp
+	.cfi_adjust_cfa_offset ffi_closure_FS
+
+E X86_RET_STRUCTPOP
+	addl	$ffi_closure_FS, %esp
+	.cfi_adjust_cfa_offset -ffi_closure_FS
 	ret	$4
-.LFE2:
-	.size	ffi_closure_SYSV, .-ffi_closure_SYSV
+	.cfi_adjust_cfa_offset ffi_closure_FS
+
+E X86_RET_STRUCTECX
+	movl	24(%esp), %ecx
+	addl	$ffi_closure_FS, %esp
+	.cfi_adjust_cfa_offset -ffi_closure_FS
+	ret
+	.cfi_adjust_cfa_offset ffi_closure_FS
+
+	/* Fill out the table so that bad values are predictable.  */
+E X86_RET_UNUSED12
+	ud2
+E X86_RET_UNUSED13
+	ud2
+E X86_RET_UNUSED14
+	ud2
+E X86_RET_UNUSED15
+	ud2
+
+	.cfi_endproc
+	.size	ffi_closure_i386, . - ffi_closure_i386
+
+	.align	16
+	.globl	ffi_closure_i386_stdcall
+	.type	ffi_closure_i386_stdcall, @function
+	FFI_HIDDEN(ffi_closure_i386_stdcall)
+
+ffi_closure_i386_stdcall:
+	.cfi_startproc
+	FFI_CLOSURE_FIRST
+
+	movl	%eax, %ecx
+	shrl	$4, %ecx			    /* isolate pop count */
+	leal	ffi_closure_FS(%esp, %ecx), %ecx    /* compute popped esp */
+	movl	ffi_closure_FS(%esp), %edx	    /* move return address */
+	movl	%edx, (%ecx)
+	.cfi_def_cfa %ecx, 4
+
+	FFI_CLOSURE_SECOND
+
+	.align	8
+0:
+E X86_RET_FLOAT
+	flds	(%esp)
+	movl	%ecx, %esp
+	ret
+
+E X86_RET_DOUBLE
+	fldl	(%esp)
+	movl	%ecx, %esp
+	ret
 
-#if !FFI_NO_RAW_API
+E X86_RET_LDOUBLE
+	fldt	(%esp)
+	movl	%ecx, %esp
+	ret
+
+E X86_RET_SINT8
+	movsbl	(%esp), %eax
+	movl	%ecx, %esp
+	ret
+
+E X86_RET_SINT16
+	movswl	(%esp), %eax
+	movl	%ecx, %esp
+	ret
+
+E X86_RET_UINT8
+	movzbl	(%esp), %eax
+	movl	%ecx, %esp
+	ret
+
+E X86_RET_UINT16
+	movzwl	(%esp), %eax
+	movl	%ecx, %esp
+	ret
+
+E X86_RET_INT64
+	popl	%eax
+	popl	%edx
+	movl	%ecx, %esp
+	ret
+
+E X86_RET_INT32
+	movl	(%esp), %eax
+	movl	%ecx, %esp
+	ret
+
+E X86_RET_VOID
+	movl	%ecx, %esp
+	ret
+
+E X86_RET_STRUCTPOP
+	movl	%ecx, %esp
+	ret
+
+E X86_RET_STRUCTECX
+	/* This entry is one byte too big for the 8 byte slot.  */
+	jmp	9f
+
+	/* Fill out the table so that bad values are predictable.  */
+E X86_RET_UNUSED12
+	ud2
+E X86_RET_UNUSED13
+	ud2
+E X86_RET_UNUSED14
+	ud2
+E X86_RET_UNUSED15
+	ud2
+
+	.align	8
+9:	movl	24(%esp), %eax
+	movl	%ecx, %esp
+	.cfi_def_cfa_register %esp
+	movl	%eax, %ecx
+	ret
+
+	.cfi_endproc
+	.size	ffi_closure_i386_stdcall, . - ffi_closure_i386_stdcall
 
 /* Precalculate for e.g. the Solaris 10/x86 assembler.  */
 #if FFI_TRAMPOLINE_SIZE == 10
@@ -261,208 +398,235 @@ ffi_closure_SYSV:
 #endif
 #define CIF_FLAGS_OFFSET 20
 
-	.align	4
-FFI_HIDDEN (ffi_closure_raw_SYSV)
-.globl ffi_closure_raw_SYSV
+	.align	16
+	.globl	ffi_closure_raw_SYSV
 	.type	ffi_closure_raw_SYSV, @function
+	FFI_HIDDEN (ffi_closure_raw_SYSV)
+
+#define ffi_closure_raw_SYSV_FS  (12 + 16 + 4*4)
 
 ffi_closure_raw_SYSV:
-.LFB3:
-	pushl	%ebp
-.LCFI4:
-	movl	%esp, %ebp
-.LCFI5:
-	pushl	%esi
-.LCFI6:
-	subl	$36, %esp
-	movl	RAW_CLOSURE_CIF_OFFSET(%eax), %esi	 /* closure->cif */
-	movl	RAW_CLOSURE_USER_DATA_OFFSET(%eax), %edx /* closure->user_data */
-	movl	%edx, 12(%esp)	/* user_data */
-	leal	8(%ebp), %edx	/* __builtin_dwarf_cfa () */
-	movl	%edx, 8(%esp)	/* raw_args */
-	leal	-24(%ebp), %edx
-	movl	%edx, 4(%esp)	/* &res */
-	movl	%esi, (%esp)	/* cif */
-	call	*RAW_CLOSURE_FUN_OFFSET(%eax)		 /* closure->fun */
-	movl	CIF_FLAGS_OFFSET(%esi), %eax		 /* rtype */
-	cmpl	$FFI_TYPE_INT, %eax
-	je	.Lrcls_retint
-
-	/* Handle FFI_TYPE_UINT8, FFI_TYPE_SINT8, FFI_TYPE_UINT16,
-	   FFI_TYPE_SINT16, FFI_TYPE_UINT32, FFI_TYPE_SINT32.  */
-	cmpl	$FFI_TYPE_UINT64, %eax
-	jge	0f
-	cmpl	$FFI_TYPE_UINT8, %eax
-	jge	.Lrcls_retint
-0:
-	cmpl	$FFI_TYPE_FLOAT, %eax
-	je	.Lrcls_retfloat
-	cmpl	$FFI_TYPE_DOUBLE, %eax
-	je	.Lrcls_retdouble
-	cmpl	$FFI_TYPE_LONGDOUBLE, %eax
-	je	.Lrcls_retldouble
-	cmpl	$FFI_TYPE_SINT64, %eax
-	je	.Lrcls_retllong
-.Lrcls_epilogue:
-	addl	$36, %esp
-	popl	%esi
-	popl	%ebp
-	ret
-.Lrcls_retint:
-	movl	-24(%ebp), %eax
-	jmp	.Lrcls_epilogue
-.Lrcls_retfloat:
-	flds	-24(%ebp)
-	jmp	.Lrcls_epilogue
-.Lrcls_retdouble:
-	fldl	-24(%ebp)
-	jmp	.Lrcls_epilogue
-.Lrcls_retldouble:
-	fldt	-24(%ebp)
-	jmp	.Lrcls_epilogue
-.Lrcls_retllong:
-	movl	-24(%ebp), %eax
-	movl	-20(%ebp), %edx
-	jmp	.Lrcls_epilogue
-.LFE3:
-	.size	ffi_closure_raw_SYSV, .-ffi_closure_raw_SYSV
-#endif
+	.cfi_startproc
+	subl	$ffi_closure_raw_SYSV_FS, %esp
+	.cfi_adjust_cfa_offset ffi_closure_raw_SYSV_FS
+	movl	%ebx, 32(%esp)
+	.cfi_rel_offset %ebx, 32
 
-#if defined __PIC__
-# if defined __sun__ && defined __svr4__
-/* 32-bit Solaris 2/x86 uses datarel encoding for PIC.  GNU ld before 2.22
-   doesn't correctly sort .eh_frame_hdr with mixed encodings, so match this.  */
-#  define FDE_ENCODING		0x30	/* datarel */
-#  define FDE_ENCODE(X)		X@GOTOFF
-# else
-#  define FDE_ENCODING		0x1b	/* pcrel sdata4 */
-#  if defined HAVE_AS_X86_PCREL
-#   define FDE_ENCODE(X)	X-.
-#  else
-#   define FDE_ENCODE(X)	X@rel
-#  endif
-# endif
-#else
-# define FDE_ENCODING		0	/* absolute */
-# define FDE_ENCODE(X)		X
-#endif
+	/* Install each of the arguments to the closure in turn.  */
+	movl	RAW_CLOSURE_USER_DATA_OFFSET(%eax), %edx /* user_data */
+	movl	%edx, 12(%esp)
+
+	leal	ffi_closure_raw_SYSV_FS+4(%esp), %edx	/* raw_args */
+	movl	%edx, 8(%esp)
+
+	leal	16(%esp), %edx				/* &res */
+	movl	%edx, 4(%esp)
+
+	movl	RAW_CLOSURE_CIF_OFFSET(%eax), %ebx	/* cif */
+	movl	%ebx, (%esp)
+
+	call	*RAW_CLOSURE_FUN_OFFSET(%eax)
+
+	movl	CIF_FLAGS_OFFSET(%ebx), %eax		/* load rtype */
+	andl	$X86_RET_TYPE_MASK, %eax
 
-	.section	.eh_frame,EH_FRAME_FLAGS,@progbits
-.Lframe1:
-	.long	.LECIE1-.LSCIE1	/* Length of Common Information Entry */
-.LSCIE1:
-	.long	0x0	/* CIE Identifier Tag */
-	.byte	0x1	/* CIE Version */
-#ifdef HAVE_AS_ASCII_PSEUDO_OP
-#ifdef __PIC__
-	.ascii "zR\0"	/* CIE Augmentation */
-#else
-	.ascii "\0"	/* CIE Augmentation */
-#endif
-#elif defined HAVE_AS_STRING_PSEUDO_OP
 #ifdef __PIC__
-	.string "zR"	/* CIE Augmentation */
-#else
-	.string ""	/* CIE Augmentation */
-#endif
+	call	__x86.get_pc_thunk.bx
+1:	leal	0f-1b(%ebx, %eax, 8), %eax
 #else
-#error missing .ascii/.string
-#endif
-	.byte	0x1	/* .uleb128 0x1; CIE Code Alignment Factor */
-	.byte	0x7c	/* .sleb128 -4; CIE Data Alignment Factor */
-	.byte	0x8	/* CIE RA Column */
-#ifdef __PIC__
-	.byte	0x1	/* .uleb128 0x1; Augmentation size */
-	.byte	FDE_ENCODING
-#endif
-	.byte	0xc	/* DW_CFA_def_cfa */
-	.byte	0x4	/* .uleb128 0x4 */
-	.byte	0x4	/* .uleb128 0x4 */
-	.byte	0x88	/* DW_CFA_offset, column 0x8 */
-	.byte	0x1	/* .uleb128 0x1 */
-	.align 4
-.LECIE1:
-.LSFDE1:
-	.long	.LEFDE1-.LASFDE1	/* FDE Length */
-.LASFDE1:
-	.long	.LASFDE1-.Lframe1	/* FDE CIE offset */
-	.long	FDE_ENCODE(.LFB1)	/* FDE initial location */
-	.long	.LFE1-.LFB1		/* FDE address range */
-#ifdef __PIC__
-	.byte	0x0	/* .uleb128 0x0; Augmentation size */
-#endif
-	.byte	0x4	/* DW_CFA_advance_loc4 */
-	.long	.LCFI0-.LFB1
-	.byte	0xe	/* DW_CFA_def_cfa_offset */
-	.byte	0x8	/* .uleb128 0x8 */
-	.byte	0x85	/* DW_CFA_offset, column 0x5 */
-	.byte	0x2	/* .uleb128 0x2 */
-	.byte	0x4	/* DW_CFA_advance_loc4 */
-	.long	.LCFI1-.LCFI0
-	.byte	0xd	/* DW_CFA_def_cfa_register */
-	.byte	0x5	/* .uleb128 0x5 */
-	.align 4
-.LEFDE1:
-.LSFDE2:
-	.long	.LEFDE2-.LASFDE2	/* FDE Length */
-.LASFDE2:
-	.long	.LASFDE2-.Lframe1	/* FDE CIE offset */
-	.long	FDE_ENCODE(.LFB2)	/* FDE initial location */
-	.long	.LFE2-.LFB2		/* FDE address range */
-#ifdef __PIC__
-	.byte	0x0	/* .uleb128 0x0; Augmentation size */
-#endif
-	.byte	0x4	/* DW_CFA_advance_loc4 */
-	.long	.LCFI2-.LFB2
-	.byte	0xe	/* DW_CFA_def_cfa_offset */
-	.byte	0x8	/* .uleb128 0x8 */
-	.byte	0x85	/* DW_CFA_offset, column 0x5 */
-	.byte	0x2	/* .uleb128 0x2 */
-	.byte	0x4	/* DW_CFA_advance_loc4 */
-	.long	.LCFI3-.LCFI2
-	.byte	0xd	/* DW_CFA_def_cfa_register */
-	.byte	0x5	/* .uleb128 0x5 */
-#if !defined HAVE_HIDDEN_VISIBILITY_ATTRIBUTE && defined __PIC__
-	.byte	0x4	/* DW_CFA_advance_loc4 */
-	.long	.LCFI7-.LCFI3
-	.byte	0x83	/* DW_CFA_offset, column 0x3 */
-	.byte	0xa	/* .uleb128 0xa */
+	leal	0f(,%eax, 8), %eax
 #endif
-	.align 4
-.LEFDE2:
 
-#if !FFI_NO_RAW_API
+	movl	32(%esp), %ebx				/* restore ebx early */
+	.cfi_restore %ebx
+	jmp	*%eax
+
+	.align	8
+0:
+E X86_RET_FLOAT
+	flds	16(%esp)		/* result buffer (&res) is 16(%esp) */
+	jmp	9f
+
+E X86_RET_DOUBLE
+	fldl	16(%esp)
+	jmp	9f
+
+E X86_RET_LDOUBLE
+	fldt	16(%esp)
+	jmp	9f
+
+E X86_RET_SINT8
+	movsbl	16(%esp), %eax
+	jmp	9f
+
+E X86_RET_SINT16
+	movswl	16(%esp), %eax
+	jmp	9f
+
+E X86_RET_UINT8
+	movzbl	16(%esp), %eax
+	jmp	9f
+
+E X86_RET_UINT16
+	movzwl	16(%esp), %eax
+	jmp	9f
+
+E X86_RET_INT64
+	movl	20(%esp), %edx
+	/* fallthru */
+E X86_RET_INT32
+	movl	16(%esp), %eax
+E X86_RET_VOID
+9:	addl	$ffi_closure_raw_SYSV_FS, %esp
+	.cfi_adjust_cfa_offset -ffi_closure_raw_SYSV_FS
+	ret
+	.cfi_adjust_cfa_offset ffi_closure_raw_SYSV_FS
+
+	/* We should never get here.  */
+E X86_RET_STRUCTPOP
+	ud2
+E X86_RET_STRUCTECX
+	ud2
+E X86_RET_UNUSED12
+	ud2
+E X86_RET_UNUSED13
+	ud2
+E X86_RET_UNUSED14
+	ud2
+E X86_RET_UNUSED15
+	ud2
+
+	.cfi_endproc
+	.size	ffi_closure_raw_SYSV, .-ffi_closure_raw_SYSV
+
+	.align	16
+	.globl	ffi_closure_raw_THISCALL
+	.type	ffi_closure_raw_THISCALL, @function
+	FFI_HIDDEN (ffi_closure_raw_THISCALL)
+
+#define ffi_closure_raw_TC_FS  (8 + 16 + 4*4)
+	
+ffi_closure_raw_THISCALL:
+	.cfi_startproc
+	/* Rearrange the stack such that %ecx is the first argument.
+	   This means moving the return address.  */
+	popl	%edx
+	.cfi_adjust_cfa_offset -4
+	.cfi_register %eip, %edx
+	pushl	%ecx
+	.cfi_adjust_cfa_offset 4
+	pushl	%edx
+	.cfi_adjust_cfa_offset 4
+	.cfi_rel_offset %eip, 0
+	subl	$ffi_closure_raw_TC_FS, %esp
+	.cfi_adjust_cfa_offset ffi_closure_raw_TC_FS
+	movl	%ebx, 32(%esp)
+	.cfi_rel_offset %ebx, 32
+
+	/* Install each of the arguments to the closure in turn.  */
+	movl	RAW_CLOSURE_USER_DATA_OFFSET(%eax), %edx /* user_data */
+	movl	%edx, 12(%esp)
+
+	leal	ffi_closure_raw_TC_FS+4(%esp), %edx	/* raw_args */
+	movl	%edx, 8(%esp)
+
+	leal	16(%esp), %edx				/* &res */
+	movl	%edx, 4(%esp)
+
+	movl	RAW_CLOSURE_CIF_OFFSET(%eax), %ebx	/* cif */
+	movl	%ebx, (%esp)
+
+	call	*RAW_CLOSURE_FUN_OFFSET(%eax)
+
+	movl	CIF_FLAGS_OFFSET(%ebx), %eax		/* load rtype */
+	andl	$X86_RET_TYPE_MASK, %eax
 
-.LSFDE3:
-	.long	.LEFDE3-.LASFDE3	/* FDE Length */
-.LASFDE3:
-	.long	.LASFDE3-.Lframe1	/* FDE CIE offset */
-	.long	FDE_ENCODE(.LFB3)	/* FDE initial location */
-	.long	.LFE3-.LFB3		/* FDE address range */
 #ifdef __PIC__
-	.byte	0x0	/* .uleb128 0x0; Augmentation size */
+	call	__x86.get_pc_thunk.bx
+1:	leal	0f-1b(%ebx, %eax, 8), %eax
+#else
+	leal	0f(,%eax, 8), %eax
 #endif
-	.byte	0x4	/* DW_CFA_advance_loc4 */
-	.long	.LCFI4-.LFB3
-	.byte	0xe	/* DW_CFA_def_cfa_offset */
-	.byte	0x8	/* .uleb128 0x8 */
-	.byte	0x85	/* DW_CFA_offset, column 0x5 */
-	.byte	0x2	/* .uleb128 0x2 */
-	.byte	0x4	/* DW_CFA_advance_loc4 */
-	.long	.LCFI5-.LCFI4
-	.byte	0xd	/* DW_CFA_def_cfa_register */
-	.byte	0x5	/* .uleb128 0x5 */
-	.byte	0x4	/* DW_CFA_advance_loc4 */
-	.long	.LCFI6-.LCFI5
-	.byte	0x86	/* DW_CFA_offset, column 0x6 */
-	.byte	0x3	/* .uleb128 0x3 */
-	.align 4
-.LEFDE3:
 
-#endif
+	movl	32(%esp), %ebx				/* restore ebx early */
+	.cfi_restore %ebx
+	jmp	*%eax
 
+	.align	8
+0:
+E X86_RET_FLOAT
+	flds	16(%esp)		/* result buffer (&res) is 16(%esp) */
+	jmp	9f
+
+E X86_RET_DOUBLE
+	fldl	16(%esp)
+	jmp	9f
+
+E X86_RET_LDOUBLE
+	fldt	16(%esp)
+	jmp	9f
+
+E X86_RET_SINT8
+	movsbl	16(%esp), %eax
+	jmp	9f
+
+E X86_RET_SINT16
+	movswl	16(%esp), %eax
+	jmp	9f
+
+E X86_RET_UINT8
+	movzbl	16(%esp), %eax
+	jmp	9f
+
+E X86_RET_UINT16
+	movzwl	16(%esp), %eax
+	jmp	9f
+
+E X86_RET_INT64
+	movl	20(%esp), %edx
+	/* fallthru */
+E X86_RET_INT32
+	movl	16(%esp), %eax
+	/* fallthru */
+E X86_RET_VOID
+9:	addl	$ffi_closure_raw_TC_FS, %esp
+	.cfi_adjust_cfa_offset -ffi_closure_raw_TC_FS
+	/* Remove the extra %ecx argument we pushed.  */
+	ret	$4
+	.cfi_adjust_cfa_offset ffi_closure_raw_TC_FS
+
+	/* We should never get here.  */
+E X86_RET_STRUCTPOP
+	ud2
+E X86_RET_STRUCTECX
+	ud2
+E X86_RET_UNUSED12
+	ud2
+E X86_RET_UNUSED13
+	ud2
+E X86_RET_UNUSED14
+	ud2
+E X86_RET_UNUSED15
+	ud2
+
+	.cfi_endproc
+	.size	ffi_closure_raw_THISCALL, .-ffi_closure_raw_THISCALL
 #endif /* ifndef __x86_64__ */
 
 #if defined __ELF__ && defined __linux__
 	.section	.note.GNU-stack,"",@progbits
 #endif
+
+#ifdef __PIC__
+        .section .text.__x86.get_pc_thunk.bx,"axG",@progbits,__x86.get_pc_thunk.bx,comdat
+	.globl	__x86.get_pc_thunk.bx
+	.hidden	__x86.get_pc_thunk.bx
+	.type	__x86.get_pc_thunk.bx, @function
+__x86.get_pc_thunk.bx:
+	.cfi_startproc
+	movl	(%esp), %ebx
+	ret
+	.cfi_endproc
+	.size	__x86.get_pc_thunk.bx, . - __x86.get_pc_thunk.bx
+#endif
-- 
1.9.3

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 11/13] libffi: Support go closures on aarch64
  2014-10-10 20:43 [PATCH 00/13] Go closures, libffi, and the static chain Richard Henderson
  2014-10-10 20:43 ` [PATCH 04/13] Use the static chain as the closure parameter from Go Richard Henderson
@ 2014-10-10 20:43 ` Richard Henderson
  2014-10-10 20:43 ` [PATCH 05/13] libgo: Use the static chain for the closure Richard Henderson
                   ` (13 subsequent siblings)
  15 siblings, 0 replies; 43+ messages in thread
From: Richard Henderson @ 2014-10-10 20:43 UTC (permalink / raw)
  To: gcc-patches; +Cc: libffi-discuss, gofrontend-dev

---
 libffi/src/aarch64/ffi.c       | 42 ++++++++++++++++++++++++++---
 libffi/src/aarch64/ffitarget.h |  3 ++-
 libffi/src/aarch64/sysv.S      | 60 +++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 99 insertions(+), 6 deletions(-)

diff --git a/libffi/src/aarch64/ffi.c b/libffi/src/aarch64/ffi.c
index c409c0c..1fe5a60 100644
--- a/libffi/src/aarch64/ffi.c
+++ b/libffi/src/aarch64/ffi.c
@@ -72,10 +72,13 @@ get_d_addr (struct call_context *context, unsigned n)
 
 extern void ffi_call_SYSV (void *frame, void *rvalue,
 			   struct call_context *context,
-			   unsigned flags, void (*fn)(void)) FFI_HIDDEN;
+			   unsigned flags, void (*fn)(void),
+			   void *static_chain) FFI_HIDDEN;
 
 extern void ffi_closure_SYSV (void) FFI_HIDDEN;
 extern void ffi_closure_SYSV_V (void) FFI_HIDDEN;
+extern void ffi_go_closure_SYSV (void) FFI_HIDDEN;
+extern void ffi_go_closure_SYSV_V (void) FFI_HIDDEN;
 
 /* A subroutine of is_hfa.  Given a structure type, return the type code
    of the first non-structure element.  Recurse for structure elements.
@@ -336,8 +339,9 @@ ffi_prep_cif_machdep (ffi_cif *cif)
 
 /* Call a function with the provided arguments and capture the return
    value.  */
-void
-ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
+static void
+ffi_call_int (ffi_cif *cif, void (*fn)(void), void *rvalue,
+              void **avalue, void *closure)
 {
   struct call_context *context;
   UINT64 *stack, *slot;
@@ -533,11 +537,24 @@ ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
     }
 
   size = cif->rtype->size;
-  ffi_call_SYSV (frame, local_rvalue, context, cif->flags, fn);
+  ffi_call_SYSV (frame, local_rvalue, context, cif->flags, fn, closure);
   if (local_rvalue != rvalue && rvalue != NULL)
     memcpy (rvalue, local_rvalue, size);
 }
 
+void
+ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
+{
+  ffi_call_int (cif, fn, rvalue, avalue, NULL);
+}
+
+void
+ffi_call_go (ffi_cif *cif, void (*fn)(void), void *rvalue,
+	     void **avalue, void *closure)
+{
+  ffi_call_int (cif, fn, rvalue, avalue, closure);
+}
+
 /* Build a trampoline.  */
 
 ffi_status
@@ -574,6 +591,23 @@ ffi_prep_closure_loc (ffi_closure* closure,
   return FFI_OK;
 }
 
+/* Build a Go language closure.  */
+
+ffi_status
+ffi_prep_go_closure (ffi_go_closure* closure, ffi_cif* cif,
+                     void (*fun)(ffi_cif*,void*,void**,void*))
+{
+  if (cif->abi != FFI_SYSV)
+    return FFI_BAD_ABI;
+
+  closure->tramp = (cif->flags & AARCH64_FLAG_ARG_V
+		    ? ffi_go_closure_SYSV_V : ffi_go_closure_SYSV);
+  closure->cif = cif;
+  closure->fun = fun;
+
+  return FFI_OK;
+}
+
 /* Primary handler to setup and invoke a function within a closure.
 
    A closure when invoked enters via the assembler wrapper
diff --git a/libffi/src/aarch64/ffitarget.h b/libffi/src/aarch64/ffitarget.h
index ecfa159..bb7340b 100644
--- a/libffi/src/aarch64/ffitarget.h
+++ b/libffi/src/aarch64/ffitarget.h
@@ -42,7 +42,8 @@ typedef enum ffi_abi
 /* ---- Definitions for closures ----------------------------------------- */
 
 #define FFI_CLOSURES 1
-#define FFI_TRAMPOLINE_SIZE  24
+#define FFI_GO_CLOSURES 1
+#define FFI_TRAMPOLINE_SIZE 24
 #define FFI_NATIVE_RAW_API 0
 
 #endif
diff --git a/libffi/src/aarch64/sysv.S b/libffi/src/aarch64/sysv.S
index 126c527..0544176 100644
--- a/libffi/src/aarch64/sysv.S
+++ b/libffi/src/aarch64/sysv.S
@@ -35,7 +35,7 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
 
    extern void
    ffi_call_SYSV (void *frame, void *rvalue, struct call_context *context,
-		  unsigned flags, void (*fn)(void))
+		  unsigned flags, void (*fn)(void), void *static_chain)
 
    This function uses an unusual stack layout.  Our local frame has
    been allocated by the caller in FRAME with the outgoing arguments
@@ -60,6 +60,7 @@ ffi_call_SYSV:
 	mov	x8, x1			/* rvalue into place */
 	mov	x10, x2			/* context */
 	mov	x11, x4			/* fn */
+	mov	x18, x5			/* static chain into place */
 
 	/* Load the vector argument passing registers, if needed.  */
 	tbz     w3, #AARCH64_FLAG_ARG_V_BIT, 1f
@@ -304,3 +305,60 @@ ffi_closure_SYSV:
 	ret
 	.cfi_endproc
 	.size ffi_closure_SYSV, .-ffi_closure_SYSV
+
+/* ffi_go_closure_SYSV
+
+   Similarly for a Go closure.  The difference here is that the
+   calling convention loads x18 with the ffi_go_closure structure
+   automatically.  Further, the ffi_go_closure is also the user_data
+   that we want passed to the inner function.
+*/
+
+	.globl	ffi_go_closure_SYSV_V
+	.hidden	ffi_go_closure_SYSV_V
+	.type	ffi_go_closure_SYSV_V, %function
+	.balign 32
+
+ffi_go_closure_SYSV_V:
+	.cfi_startproc
+	stp     x29, x30, [sp, #-ffi_closure_FS]!
+	.cfi_adjust_cfa_offset ffi_closure_FS
+	.cfi_rel_offset x29, 0
+	.cfi_rel_offset x30, 8
+	mov     x29, sp
+
+	/* Save the argument passing vector registers.  */
+	stp     q0, q1, [sp, #16 + 8*AARCH64_N_XREG + 0]
+	stp     q2, q3, [sp, #16 + 8*AARCH64_N_XREG + 32]
+	stp     q4, q5, [sp, #16 + 8*AARCH64_N_XREG + 64]
+	stp     q6, q7, [sp, #16 + 8*AARCH64_N_XREG + 96]
+	b	0f
+
+	.cfi_endproc
+	.size	ffi_go_closure_SYSV_V, . - ffi_go_closure_SYSV_V
+
+	.globl	ffi_go_closure_SYSV
+	.hidden	ffi_go_closure_SYSV
+	.type	ffi_go_closure_SYSV, %function
+	.balign 32
+
+ffi_go_closure_SYSV:
+	.cfi_startproc
+	stp     x29, x30, [sp, #-ffi_closure_FS]!
+	.cfi_adjust_cfa_offset ffi_closure_FS
+	.cfi_rel_offset x29, 0
+	.cfi_rel_offset x30, 8
+	mov     x29, sp
+
+	/* Save the argument passing core registers.  */
+0:	stp     x0, x1, [sp, #16 + 0]
+	stp     x2, x3, [sp, #16 + 16]
+	stp     x4, x5, [sp, #16 + 32]
+	stp     x6, x7, [sp, #16 + 48]
+
+	ldp	x0, x1, [x18, #8]			/* cif and fun */
+	mov	x2, x18					/* user_data */
+	b	.Ldo_closure
+
+	.cfi_endproc
+	.size	ffi_go_closure_SYSV, . - ffi_go_closure_SYSV
-- 
1.9.3

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 07/13] libffi: Support go closures on x86_64
  2014-10-10 20:43 [PATCH 00/13] Go closures, libffi, and the static chain Richard Henderson
                   ` (11 preceding siblings ...)
  2014-10-10 20:43 ` [PATCH 02/13] Allow the front-end to create calls with a static chain Richard Henderson
@ 2014-10-10 20:43 ` Richard Henderson
  2014-10-11  0:23 ` [PATCH 00/13] Go closures, libffi, and the static chain Ian Lance Taylor
                   ` (2 subsequent siblings)
  15 siblings, 0 replies; 43+ messages in thread
From: Richard Henderson @ 2014-10-10 20:43 UTC (permalink / raw)
  To: gcc-patches; +Cc: libffi-discuss, gofrontend-dev

Still missing changes for darwin, win64, and all 32-bit abis.
Drops all of the hand-coded unwind info in favor of gas-generated
info, as I can't be bothered to do the updates by hand again.
---
 libffi/src/x86/ffi64.c     | 103 ++++++++++-----
 libffi/src/x86/ffitarget.h |   2 +
 libffi/src/x86/unix64.S    | 319 ++++++++++++++++++++++-----------------------
 3 files changed, 230 insertions(+), 194 deletions(-)

diff --git a/libffi/src/x86/ffi64.c b/libffi/src/x86/ffi64.c
index 1daa1c0..428168c 100644
--- a/libffi/src/x86/ffi64.c
+++ b/libffi/src/x86/ffi64.c
@@ -31,6 +31,7 @@
 
 #include <stdlib.h>
 #include <stdarg.h>
+#include <stdint.h>
 
 #ifdef __x86_64__
 
@@ -48,10 +49,12 @@ struct register_args
   /* Registers for argument passing.  */
   UINT64 gpr[MAX_GPR_REGS];
   UINT128 sse[MAX_SSE_REGS];
+  UINT64 rax;	/* ssecount */
+  UINT64 r10;	/* static chain */
 };
 
 extern void ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags,
-			     void *raddr, void (*fnaddr)(void), unsigned ssecount);
+			     void *raddr, void (*fnaddr)(void)) FFI_HIDDEN;
 
 /* All reference to register classes here is identical to the code in
    gcc/config/i386/i386.c. Do *not* change one without the other.  */
@@ -341,6 +344,9 @@ ffi_prep_cif_machdep (ffi_cif *cif)
   enum x86_64_reg_class classes[MAX_CLASSES];
   size_t bytes;
 
+  if (cif->abi != FFI_UNIX64)
+    return FFI_BAD_ABI;
+
   gprcount = ssecount = 0;
 
   flags = cif->rtype->type;
@@ -402,8 +408,9 @@ ffi_prep_cif_machdep (ffi_cif *cif)
   return FFI_OK;
 }
 
-void
-ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
+static void
+ffi_call_int (ffi_cif *cif, void (*fn)(void), void *rvalue,
+	      void **avalue, void *closure)
 {
   enum x86_64_reg_class classes[MAX_CLASSES];
   char *stack, *argp;
@@ -428,6 +435,8 @@ ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
   reg_args = (struct register_args *) stack;
   argp = stack + sizeof (struct register_args);
 
+  reg_args->r10 = (unsigned long) closure;
+
   gprcount = ssecount = 0;
 
   /* If the return value is passed in memory, add the pointer as the
@@ -488,13 +497,27 @@ ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
 	    }
 	}
     }
+  reg_args->rax = ssecount;
 
   ffi_call_unix64 (stack, cif->bytes + sizeof (struct register_args),
-		   cif->flags, rvalue, fn, ssecount);
+		   cif->flags, rvalue, fn);
 }
 
+void
+ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
+{
+  ffi_call_int (cif, fn, rvalue, avalue, NULL);
+}
+
+void
+ffi_call_go (ffi_cif *cif, void (*fn)(void), void *rvalue,
+	     void **avalue, void *closure)
+{
+  ffi_call_int (cif, fn, rvalue, avalue, closure);
+}
 
-extern void ffi_closure_unix64(void);
+extern void ffi_closure_unix64(void) FFI_HIDDEN;
+extern void ffi_closure_unix64_sse(void) FFI_HIDDEN;
 
 ffi_status
 ffi_prep_closure_loc (ffi_closure* closure,
@@ -503,29 +526,26 @@ ffi_prep_closure_loc (ffi_closure* closure,
 		      void *user_data,
 		      void *codeloc)
 {
-  volatile unsigned short *tramp;
-
-  /* Sanity check on the cif ABI.  */
-  {
-    int abi = cif->abi;
-    if (UNLIKELY (! (abi > FFI_FIRST_ABI && abi < FFI_LAST_ABI)))
-      return FFI_BAD_ABI;
-  }
-
-  tramp = (volatile unsigned short *) &closure->tramp[0];
+  static const unsigned char trampoline[16] = {
+    /* leaq  -0x7(%rip),%r10   # 0x0  */
+    0x4c, 0x8d, 0x15, 0xf9, 0xff, 0xff, 0xff,
+    /* jmpq  *0x3(%rip)        # 0x10 */
+    0xff, 0x25, 0x03, 0x00, 0x00, 0x00,
+    /* nopl  (%rax) */
+    0x0f, 0x1f, 0x00
+  };
+  void (*dest)(void);
 
-  tramp[0] = 0xbb49;		/* mov <code>, %r11	*/
-  *((unsigned long long * volatile) &tramp[1])
-    = (unsigned long) ffi_closure_unix64;
-  tramp[5] = 0xba49;		/* mov <data>, %r10	*/
-  *((unsigned long long * volatile) &tramp[6])
-    = (unsigned long) codeloc;
+  if (cif->abi != FFI_UNIX64)
+    return FFI_BAD_ABI;
 
-  /* Set the carry bit iff the function uses any sse registers.
-     This is clc or stc, together with the first byte of the jmp.  */
-  tramp[10] = cif->flags & (1 << 11) ? 0x49f9 : 0x49f8;
+  if (cif->flags & (1 << 11))
+    dest = ffi_closure_unix64_sse;
+  else
+    dest = ffi_closure_unix64;
 
-  tramp[11] = 0xe3ff;			/* jmp *%r11    */
+  memcpy (closure->tramp, trampoline, sizeof(trampoline));
+  *(UINT64 *)(closure->tramp + 16) = (uintptr_t)dest;
 
   closure->cif = cif;
   closure->fun = fun;
@@ -534,18 +554,20 @@ ffi_prep_closure_loc (ffi_closure* closure,
   return FFI_OK;
 }
 
-int
-ffi_closure_unix64_inner(ffi_closure *closure, void *rvalue,
-			 struct register_args *reg_args, char *argp)
+int FFI_HIDDEN
+ffi_closure_unix64_inner(ffi_cif *cif,
+			 void (*fun)(ffi_cif*, void*, void**, void*),
+			 void *user_data,
+			 void *rvalue,
+			 struct register_args *reg_args,
+			 char *argp)
 {
-  ffi_cif *cif;
   void **avalue;
   ffi_type **arg_types;
   long i, avn;
   int gprcount, ssecount, ngpr, nsse;
   int ret;
 
-  cif = closure->cif;
   avalue = alloca(cif->nargs * sizeof(void *));
   gprcount = ssecount = 0;
 
@@ -634,10 +656,29 @@ ffi_closure_unix64_inner(ffi_closure *closure, void *rvalue,
     }
 
   /* Invoke the closure.  */
-  closure->fun (cif, rvalue, avalue, closure->user_data);
+  fun (cif, rvalue, avalue, user_data);
 
   /* Tell assembly how to perform return type promotions.  */
   return ret;
 }
 
+extern void ffi_go_closure_unix64(void) FFI_HIDDEN;
+extern void ffi_go_closure_unix64_sse(void) FFI_HIDDEN;
+
+ffi_status
+ffi_prep_go_closure (ffi_go_closure* closure, ffi_cif* cif,
+		     void (*fun)(ffi_cif*, void*, void**, void*))
+{
+  if (cif->abi != FFI_UNIX64)
+    return FFI_BAD_ABI;
+
+  closure->tramp = (cif->flags & (1 << 11)
+		    ? ffi_go_closure_unix64_sse
+		    : ffi_go_closure_unix64);
+  closure->cif = cif;
+  closure->fun = fun;
+
+  return FFI_OK;
+}
+
 #endif /* __x86_64__ */
diff --git a/libffi/src/x86/ffitarget.h b/libffi/src/x86/ffitarget.h
index 46f294c..592d6f8 100644
--- a/libffi/src/x86/ffitarget.h
+++ b/libffi/src/x86/ffitarget.h
@@ -111,6 +111,8 @@ typedef enum ffi_abi {
 /* ---- Definitions for closures ----------------------------------------- */
 
 #define FFI_CLOSURES 1
+#define FFI_GO_CLOSURES 1
+
 #define FFI_TYPE_SMALL_STRUCT_1B (FFI_TYPE_LAST + 1)
 #define FFI_TYPE_SMALL_STRUCT_2B (FFI_TYPE_LAST + 2)
 #define FFI_TYPE_SMALL_STRUCT_4B (FFI_TYPE_LAST + 3)
diff --git a/libffi/src/x86/unix64.S b/libffi/src/x86/unix64.S
index 7a6619a..3881f51 100644
--- a/libffi/src/x86/unix64.S
+++ b/libffi/src/x86/unix64.S
@@ -41,10 +41,11 @@
 
 	.align	2
 	.globl	ffi_call_unix64
+	.hidden	ffi_call_unix64
 	.type	ffi_call_unix64,@function
 
 ffi_call_unix64:
-.LUW0:
+	.cfi_startproc
 	movq	(%rsp), %r10		/* Load return address.  */
 	leaq	(%rdi, %rsi), %rax	/* Find local stack base.  */
 	movq	%rdx, (%rax)		/* Save flags.  */
@@ -52,24 +53,36 @@ ffi_call_unix64:
 	movq	%rbp, 16(%rax)		/* Save old frame pointer.  */
 	movq	%r10, 24(%rax)		/* Relocate return address.  */
 	movq	%rax, %rbp		/* Finalize local stack frame.  */
-.LUW1:
+
+	/* New stack frame based off rbp.  This is an itty bit of unwind
+	   trickery in that the CFA *has* changed.  There is no easy way
+	   to describe it correctly on entry to the function.  Fortunately,
+	   it doesn't matter too much since at all points we can correctly
+	   unwind back to ffi_call.  Note that the location to which we
+	   moved the return address is (the new) CFA-8, so from the
+	   perspective of the unwind info, it hasn't moved.  */
+	.cfi_def_cfa %rbp, 32
+	.cfi_rel_offset %rbp, 16
+
 	movq	%rdi, %r10		/* Save a copy of the register area. */
 	movq	%r8, %r11		/* Save a copy of the target fn.  */
 	movl	%r9d, %eax		/* Set number of SSE registers.  */
 
 	/* Load up all argument registers.  */
 	movq	(%r10), %rdi
-	movq	8(%r10), %rsi
-	movq	16(%r10), %rdx
-	movq	24(%r10), %rcx
-	movq	32(%r10), %r8
-	movq	40(%r10), %r9
+	movq	0x08(%r10), %rsi
+	movq	0x10(%r10), %rdx
+	movq	0x18(%r10), %rcx
+	movq	0x20(%r10), %r8
+	movq	0x28(%r10), %r9
+	movl	0xb0(%r10), %eax
 	testl	%eax, %eax
 	jnz	.Lload_sse
 .Lret_from_load_sse:
 
-	/* Deallocate the reg arg area.  */
-	leaq	176(%r10), %rsp
+	/* Deallocate the reg arg area, except for r10, then load via pop.  */
+	leaq	0xb8(%r10), %rsp
+	popq	%r10
 
 	/* Call the user function.  */
 	call	*%r11
@@ -80,7 +93,9 @@ ffi_call_unix64:
 	movq	0(%rbp), %rcx		/* Reload flags.  */
 	movq	8(%rbp), %rdi		/* Reload raddr.  */
 	movq	16(%rbp), %rbp		/* Reload old frame pointer.  */
-.LUW2:
+	.cfi_remember_state
+	.cfi_def_cfa %rsp, 8
+	.cfi_restore %rbp
 
 	/* The first byte of the flags contains the FFI_TYPE.  */
 	movzbl	%cl, %r10d
@@ -89,6 +104,8 @@ ffi_call_unix64:
 	addq	%r11, %r10
 	jmp	*%r10
 
+	.section .rodata
+	.align	2
 .Lstore_table:
 	.long	.Lst_void-.Lstore_table		/* FFI_TYPE_VOID */
 	.long	.Lst_sint32-.Lstore_table	/* FFI_TYPE_INT */
@@ -105,6 +122,7 @@ ffi_call_unix64:
 	.long	.Lst_int64-.Lstore_table	/* FFI_TYPE_SINT64 */
 	.long	.Lst_struct-.Lstore_table	/* FFI_TYPE_STRUCT */
 	.long	.Lst_int64-.Lstore_table	/* FFI_TYPE_POINTER */
+	.previous
 
 	.align 2
 .Lst_void:
@@ -187,49 +205,83 @@ ffi_call_unix64:
 	   It's not worth an indirect jump to load the exact set of
 	   SSE registers needed; zero or all is a good compromise.  */
 	.align 2
-.LUW3:
+	.cfi_restore_state
 .Lload_sse:
-	movdqa	48(%r10), %xmm0
-	movdqa	64(%r10), %xmm1
-	movdqa	80(%r10), %xmm2
-	movdqa	96(%r10), %xmm3
-	movdqa	112(%r10), %xmm4
-	movdqa	128(%r10), %xmm5
-	movdqa	144(%r10), %xmm6
-	movdqa	160(%r10), %xmm7
+	movdqa	0x30(%r10), %xmm0
+	movdqa	0x40(%r10), %xmm1
+	movdqa	0x50(%r10), %xmm2
+	movdqa	0x60(%r10), %xmm3
+	movdqa	0x70(%r10), %xmm4
+	movdqa	0x80(%r10), %xmm5
+	movdqa	0x90(%r10), %xmm6
+	movdqa	0xa0(%r10), %xmm7
 	jmp	.Lret_from_load_sse
 
-.LUW4:
+	.cfi_endproc
 	.size    ffi_call_unix64,.-ffi_call_unix64
 
+/* 6 general registers, 8 vector registers,
+   16 bytes of rvalue, 8 bytes of alignment.  */
+#define ffi_closure_OFS_G	0
+#define ffi_closure_OFS_V	(6*8)
+#define ffi_closure_OFS_RVALUE	(ffi_closure_OFS_V + 8*16)
+#define ffi_closure_FS		(ffi_closure_OFS_RVALUE + 16 + 8)
+
+/* The location of rvalue within the red zone after deallocating the frame.  */
+#define ffi_closure_RED_RVALUE	(ffi_closure_OFS_RVALUE - ffi_closure_FS)
+
+	.align	2
+	.globl	ffi_closure_unix64_sse
+	.hidden	ffi_closure_unix64_sse
+	.type	ffi_closure_unix64_sse,@function
+
+ffi_closure_unix64_sse:
+	.cfi_startproc
+	subq	$ffi_closure_FS, %rsp
+	.cfi_adjust_cfa_offset ffi_closure_FS
+
+	movdqa	%xmm0, ffi_closure_OFS_V+0x00(%rsp)
+	movdqa	%xmm1, ffi_closure_OFS_V+0x10(%rsp)
+	movdqa	%xmm2, ffi_closure_OFS_V+0x20(%rsp)
+	movdqa	%xmm3, ffi_closure_OFS_V+0x30(%rsp)
+	movdqa	%xmm4, ffi_closure_OFS_V+0x40(%rsp)
+	movdqa	%xmm5, ffi_closure_OFS_V+0x50(%rsp)
+	movdqa	%xmm6, ffi_closure_OFS_V+0x60(%rsp)
+	movdqa	%xmm7, ffi_closure_OFS_V+0x70(%rsp)
+	jmp	0f
+
+	.cfi_endproc
+	.size	ffi_closure_unix64_sse,.-ffi_closure_unix64_sse
+
 	.align	2
-	.globl ffi_closure_unix64
+	.globl	ffi_closure_unix64
+	.hidden	ffi_closure_unix64
 	.type	ffi_closure_unix64,@function
 
 ffi_closure_unix64:
-.LUW5:
-	/* The carry flag is set by the trampoline iff SSE registers
-	   are used.  Don't clobber it before the branch instruction.  */
-	leaq    -200(%rsp), %rsp
-.LUW6:
-	movq	%rdi, (%rsp)
-	movq    %rsi, 8(%rsp)
-	movq    %rdx, 16(%rsp)
-	movq    %rcx, 24(%rsp)
-	movq    %r8, 32(%rsp)
-	movq    %r9, 40(%rsp)
-	jc      .Lsave_sse
-.Lret_from_save_sse:
-
-	movq	%r10, %rdi
-	leaq	176(%rsp), %rsi
-	movq	%rsp, %rdx
-	leaq	208(%rsp), %rcx
-	call	ffi_closure_unix64_inner@PLT
+	.cfi_startproc
+	subq	$ffi_closure_FS, %rsp
+	.cfi_adjust_cfa_offset ffi_closure_FS
+0:
+	movq	%rdi, ffi_closure_OFS_G+0x00(%rsp)
+	movq    %rsi, ffi_closure_OFS_G+0x08(%rsp)
+	movq    %rdx, ffi_closure_OFS_G+0x10(%rsp)
+	movq    %rcx, ffi_closure_OFS_G+0x18(%rsp)
+	movq    %r8,  ffi_closure_OFS_G+0x20(%rsp)
+	movq    %r9,  ffi_closure_OFS_G+0x28(%rsp)
+
+	movq	24(%r10), %rdi				/* Load cif */
+	movq	32(%r10), %rsi				/* Load fun */
+	movq	40(%r10), %rdx				/* Load user_data */
+.Ldo_closure:
+	leaq	ffi_closure_OFS_RVALUE(%rsp), %rcx	/* Load rvalue */
+	movq	%rsp, %r8				/* Load reg_args */
+	leaq	ffi_closure_FS+8(%rsp), %r9		/* Load argp */
+	call	ffi_closure_unix64_inner
 
 	/* Deallocate stack frame early; return value is now in redzone.  */
-	addq	$200, %rsp
-.LUW7:
+	addq	$ffi_closure_FS, %rsp
+	.cfi_adjust_cfa_offset -ffi_closure_FS
 
 	/* The first byte of the return value contains the FFI_TYPE.  */
 	movzbl	%al, %r10d
@@ -238,6 +290,8 @@ ffi_closure_unix64:
 	addq	%r11, %r10
 	jmp	*%r10
 
+	.section .rodata
+	.align	2
 .Lload_table:
 	.long	.Lld_void-.Lload_table		/* FFI_TYPE_VOID */
 	.long	.Lld_int32-.Lload_table		/* FFI_TYPE_INT */
@@ -254,6 +308,7 @@ ffi_closure_unix64:
 	.long	.Lld_int64-.Lload_table		/* FFI_TYPE_SINT64 */
 	.long	.Lld_struct-.Lload_table	/* FFI_TYPE_STRUCT */
 	.long	.Lld_int64-.Lload_table		/* FFI_TYPE_POINTER */
+	.previous
 
 	.align 2
 .Lld_void:
@@ -261,32 +316,32 @@ ffi_closure_unix64:
 
 	.align 2
 .Lld_int8:
-	movzbl	-24(%rsp), %eax
+	movzbl	ffi_closure_RED_RVALUE(%rsp), %eax
 	ret
 	.align 2
 .Lld_int16:
-	movzwl	-24(%rsp), %eax
+	movzwl	ffi_closure_RED_RVALUE(%rsp), %eax
 	ret
 	.align 2
 .Lld_int32:
-	movl	-24(%rsp), %eax
+	movl	ffi_closure_RED_RVALUE(%rsp), %eax
 	ret
 	.align 2
 .Lld_int64:
-	movq	-24(%rsp), %rax
+	movq	ffi_closure_RED_RVALUE(%rsp), %rax
 	ret
 
 	.align 2
 .Lld_float:
-	movss	-24(%rsp), %xmm0
+	movss	ffi_closure_RED_RVALUE(%rsp), %xmm0
 	ret
 	.align 2
 .Lld_double:
-	movsd	-24(%rsp), %xmm0
+	movsd	ffi_closure_RED_RVALUE(%rsp), %xmm0
 	ret
 	.align 2
 .Lld_ldouble:
-	fldt	-24(%rsp)
+	fldt	ffi_closure_RED_RVALUE(%rsp)
 	ret
 
 	.align 2
@@ -296,131 +351,69 @@ ffi_closure_unix64:
 	   both rdx and xmm1 with the second word.  For the remaining,
 	   bit 8 set means xmm0 gets the second word, and bit 9 means
 	   that rax gets the second word.  */
-	movq	-24(%rsp), %rcx
-	movq	-16(%rsp), %rdx
-	movq	-16(%rsp), %xmm1
+	movq	ffi_closure_RED_RVALUE(%rsp), %rcx
+	movq	ffi_closure_RED_RVALUE+8(%rsp), %rdx
+	movq	ffi_closure_RED_RVALUE+8(%rsp), %xmm1
 	testl	$0x100, %eax
 	cmovnz	%rdx, %rcx
 	movd	%rcx, %xmm0
 	testl	$0x200, %eax
-	movq	-24(%rsp), %rax
+	movq	ffi_closure_RED_RVALUE(%rsp), %rax
 	cmovnz	%rdx, %rax
 	ret
 
-	/* See the comment above .Lload_sse; the same logic applies here.  */
-	.align 2
-.LUW8:
-.Lsave_sse:
-	movdqa	%xmm0, 48(%rsp)
-	movdqa	%xmm1, 64(%rsp)
-	movdqa	%xmm2, 80(%rsp)
-	movdqa	%xmm3, 96(%rsp)
-	movdqa	%xmm4, 112(%rsp)
-	movdqa	%xmm5, 128(%rsp)
-	movdqa	%xmm6, 144(%rsp)
-	movdqa	%xmm7, 160(%rsp)
-	jmp	.Lret_from_save_sse
-
-.LUW9:
+	.cfi_endproc
 	.size	ffi_closure_unix64,.-ffi_closure_unix64
 
-#ifdef HAVE_AS_X86_64_UNWIND_SECTION_TYPE
-	.section	.eh_frame,"a",@unwind
-#else
-	.section	.eh_frame,"a",@progbits
-#endif
-.Lframe1:
-	.long	.LECIE1-.LSCIE1		/* CIE Length */
-.LSCIE1:
-	.long	0			/* CIE Identifier Tag */
-	.byte	1			/* CIE Version */
-	.ascii "zR\0"			/* CIE Augmentation */
-	.uleb128 1			/* CIE Code Alignment Factor */
-	.sleb128 -8			/* CIE Data Alignment Factor */
-	.byte	0x10			/* CIE RA Column */
-	.uleb128 1			/* Augmentation size */
-	.byte	0x1b			/* FDE Encoding (pcrel sdata4) */
-	.byte	0xc			/* DW_CFA_def_cfa, %rsp offset 8 */
-	.uleb128 7
-	.uleb128 8
-	.byte	0x80+16			/* DW_CFA_offset, %rip offset 1*-8 */
-	.uleb128 1
-	.align 8
-.LECIE1:
-.LSFDE1:
-	.long	.LEFDE1-.LASFDE1	/* FDE Length */
-.LASFDE1:
-	.long	.LASFDE1-.Lframe1	/* FDE CIE offset */
-#if HAVE_AS_X86_PCREL
-	.long	.LUW0-.			/* FDE initial location */
-#else
-	.long	.LUW0@rel
-#endif
-	.long	.LUW4-.LUW0		/* FDE address range */
-	.uleb128 0x0			/* Augmentation size */
-
-	.byte	0x4			/* DW_CFA_advance_loc4 */
-	.long	.LUW1-.LUW0
-
-	/* New stack frame based off rbp.  This is a itty bit of unwind
-	   trickery in that the CFA *has* changed.  There is no easy way
-	   to describe it correctly on entry to the function.  Fortunately,
-	   it doesn't matter too much since at all points we can correctly
-	   unwind back to ffi_call.  Note that the location to which we
-	   moved the return address is (the new) CFA-8, so from the
-	   perspective of the unwind info, it hasn't moved.  */
-	.byte	0xc			/* DW_CFA_def_cfa, %rbp offset 32 */
-	.uleb128 6
-	.uleb128 32
-	.byte	0x80+6			/* DW_CFA_offset, %rbp offset 2*-8 */
-	.uleb128 2
-	.byte	0xa			/* DW_CFA_remember_state */
-
-	.byte	0x4			/* DW_CFA_advance_loc4 */
-	.long	.LUW2-.LUW1
-	.byte	0xc			/* DW_CFA_def_cfa, %rsp offset 8 */
-	.uleb128 7
-	.uleb128 8
-	.byte	0xc0+6			/* DW_CFA_restore, %rbp */
-
-	.byte	0x4			/* DW_CFA_advance_loc4 */
-	.long	.LUW3-.LUW2
-	.byte	0xb			/* DW_CFA_restore_state */
-
-	.align 8
-.LEFDE1:
-.LSFDE3:
-	.long	.LEFDE3-.LASFDE3	/* FDE Length */
-.LASFDE3:
-	.long	.LASFDE3-.Lframe1	/* FDE CIE offset */
-#if HAVE_AS_X86_PCREL
-	.long	.LUW5-.			/* FDE initial location */
-#else
-	.long	.LUW5@rel
-#endif
-	.long	.LUW9-.LUW5		/* FDE address range */
-	.uleb128 0x0			/* Augmentation size */
-
-	.byte	0x4			/* DW_CFA_advance_loc4 */
-	.long	.LUW6-.LUW5
-	.byte	0xe			/* DW_CFA_def_cfa_offset */
-	.uleb128 208
-	.byte	0xa			/* DW_CFA_remember_state */
-
-	.byte	0x4			/* DW_CFA_advance_loc4 */
-	.long	.LUW7-.LUW6
-	.byte	0xe			/* DW_CFA_def_cfa_offset */
-	.uleb128 8
-
-	.byte	0x4			/* DW_CFA_advance_loc4 */
-	.long	.LUW8-.LUW7
-	.byte	0xb			/* DW_CFA_restore_state */
-
-	.align 8
-.LEFDE3:
+	.align	2
+	.globl	ffi_go_closure_unix64_sse
+	.hidden	ffi_go_closure_unix64_sse
+	.type	ffi_go_closure_unix64_sse,@function
+
+ffi_go_closure_unix64_sse:
+	.cfi_startproc
+	subq	$ffi_closure_FS, %rsp
+	.cfi_adjust_cfa_offset ffi_closure_FS
+
+	movdqa	%xmm0, ffi_closure_OFS_V+0x00(%rsp)
+	movdqa	%xmm1, ffi_closure_OFS_V+0x10(%rsp)
+	movdqa	%xmm2, ffi_closure_OFS_V+0x20(%rsp)
+	movdqa	%xmm3, ffi_closure_OFS_V+0x30(%rsp)
+	movdqa	%xmm4, ffi_closure_OFS_V+0x40(%rsp)
+	movdqa	%xmm5, ffi_closure_OFS_V+0x50(%rsp)
+	movdqa	%xmm6, ffi_closure_OFS_V+0x60(%rsp)
+	movdqa	%xmm7, ffi_closure_OFS_V+0x70(%rsp)
+	jmp	0f
+
+	.cfi_endproc
+	.size	ffi_go_closure_unix64_sse,.-ffi_go_closure_unix64_sse
 
-#endif /* __x86_64__ */
+	.align	2
+	.globl	ffi_go_closure_unix64
+	.hidden	ffi_go_closure_unix64
+	.type	ffi_go_closure_unix64,@function
+
+ffi_go_closure_unix64:
+	.cfi_startproc
+	subq	$ffi_closure_FS, %rsp
+	.cfi_adjust_cfa_offset ffi_closure_FS
+0:
+	movq	%rdi, ffi_closure_OFS_G+0x00(%rsp)
+	movq    %rsi, ffi_closure_OFS_G+0x08(%rsp)
+	movq    %rdx, ffi_closure_OFS_G+0x10(%rsp)
+	movq    %rcx, ffi_closure_OFS_G+0x18(%rsp)
+	movq    %r8,  ffi_closure_OFS_G+0x20(%rsp)
+	movq    %r9,  ffi_closure_OFS_G+0x28(%rsp)
+
+	movq	8(%r10), %rdi		/* Load cif */
+	movq	16(%r10), %rsi		/* Load fun */
+	movq	%r10, %rdx		/* Load closure (user_data) */
+	jmp	.Ldo_closure
+
+	.cfi_endproc
+	.size	ffi_go_closure_unix64,.-ffi_go_closure_unix64
 
 #if defined __ELF__ && defined __linux__
 	.section	.note.GNU-stack,"",@progbits
 #endif
+#endif /* x86_64 */
-- 
1.9.3

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH 06/13] libffi: Add entry points for interacting with Go
  2014-10-10 20:43 [PATCH 00/13] Go closures, libffi, and the static chain Richard Henderson
                   ` (6 preceding siblings ...)
  2014-10-10 20:43 ` [PATCH 03/13] HACK! Allow the static chain to be set from C Richard Henderson
@ 2014-10-10 20:43 ` Richard Henderson
  2014-10-10 20:43 ` [PATCH 01/13] Make TARGET_STATIC_CHAIN allow a function type Richard Henderson
                   ` (7 subsequent siblings)
  15 siblings, 0 replies; 43+ messages in thread
From: Richard Henderson @ 2014-10-10 20:43 UTC (permalink / raw)
  To: gcc-patches; +Cc: libffi-discuss, gofrontend-dev

A "ffi_go_closure" is intended to be compatible with the
function descriptors used by Go, and ffi_call_go sets up
the static chain parameter for calling a Go function.

The entry points are disabled when a backend has not been
updated, much like we do for "normal" closures.
---
 libffi/include/ffi.h.in | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/libffi/include/ffi.h.in b/libffi/include/ffi.h.in
index 380673b..ccd4ac0 100644
--- a/libffi/include/ffi.h.in
+++ b/libffi/include/ffi.h.in
@@ -390,6 +390,22 @@ ffi_prep_java_raw_closure_loc (ffi_java_raw_closure*,
 
 #endif /* FFI_CLOSURES */
 
+#if FFI_GO_CLOSURES
+
+typedef struct {
+  void      *tramp;
+  ffi_cif   *cif;
+  void     (*fun)(ffi_cif*,void*,void**,void*);
+} ffi_go_closure;
+
+ffi_status ffi_prep_go_closure (ffi_go_closure*, ffi_cif *,
+				void (*fun)(ffi_cif*,void*,void**,void*));
+
+void ffi_call_go (ffi_cif *cif, void (*fn)(void), void *rvalue,
+		  void **avalue, void *closure);
+
+#endif /* FFI_GO_CLOSURES */
+
 /* ---- Public interface definition -------------------------------------- */
 
 ffi_status ffi_prep_cif(ffi_cif *cif,
-- 
1.9.3

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 00/13] Go closures, libffi, and the static chain
  2014-10-10 20:43 [PATCH 00/13] Go closures, libffi, and the static chain Richard Henderson
                   ` (12 preceding siblings ...)
  2014-10-10 20:43 ` [PATCH 07/13] libffi: Support go closures on x86_64 Richard Henderson
@ 2014-10-11  0:23 ` Ian Lance Taylor
  2014-11-05 21:34 ` Lynn A. Boger
  2014-12-11  9:06 ` Dominik Vogt
  15 siblings, 0 replies; 43+ messages in thread
From: Ian Lance Taylor @ 2014-10-11  0:23 UTC (permalink / raw)
  To: Richard Henderson; +Cc: gcc-patches, libffi-discuss, gofrontend-dev

On Fri, Oct 10, 2014 at 1:42 PM, Richard Henderson <rth@redhat.com> wrote:
>
> So instead I thought about how I'd add some support for Go directly
> into libffi.  After all, we've got some custom code in libffi for
> Java, why couldn't Go have the same treatment?
>
> The stickler, as far as I could see, is __go_set_context.  I didn't
> like the idea of libffi needing a callback into libgo in order to
> accomplish the goal.
>
> But the comment immediately before __go_set_closure itself says
> that it would be better to use the static chain register.  So I set
> about to see how easy that would be to accomplish.  (And not for
> nothing such a change would make gccgo compiled programs faster
> by avoiding the library calls.)
>
> The following patch set enables this for x86_64, i386, and aarch64[3].

...

> Before I go too much farther down this road, I wanted to get some
> feedback.  FWIW, a complete tree can be found at [4].

I think this is a great idea.  Thanks.

Ian

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 03/13] HACK! Allow the static chain to be set from C
  2014-10-10 20:43 ` [PATCH 03/13] HACK! Allow the static chain to be set from C Richard Henderson
@ 2014-10-11  0:33   ` Ian Lance Taylor
       [not found]     ` <CAMn1gO7vJOcNi218p9m32de_rrnKBrUcGF-EKP3dJwaL+8BtUw@mail.gmail.com>
  2014-10-14 18:44   ` [PATCH v2 03/13] " Richard Henderson
  1 sibling, 1 reply; 43+ messages in thread
From: Ian Lance Taylor @ 2014-10-11  0:33 UTC (permalink / raw)
  To: Richard Henderson; +Cc: gcc-patches, libffi-discuss, gofrontend-dev

On Fri, Oct 10, 2014 at 1:42 PM, Richard Henderson <rth@redhat.com> wrote:
>
> This is awful syntax, and therefore contains no documentation.
> But we'll need to be able to set the static chain on a few calls
> within the Go runtime, so we need to expose this by some means.
>
> It currently looks like
>
>         function(args...) __builtin_call_chain(pointer)
>
> because that was easy to parse.

How crazy would it be to move __builtin_call_chain into the function
arguments, as in
    function(a1, a2, __builtin_call_chain(pointer))
This __builtin_call_chain call would be removed from the argument list
so type checking would only look at a1, a2.  It would just set the
static chain value.  That at least puts the call_chain in the right
place, which is a special kind of function argument.

Ian

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [gofrontend-dev] Re: [PATCH 03/13] HACK! Allow the static chain to be set from C
       [not found]     ` <CAMn1gO7vJOcNi218p9m32de_rrnKBrUcGF-EKP3dJwaL+8BtUw@mail.gmail.com>
@ 2014-10-11  1:42       ` Peter Collingbourne
  2014-10-11  4:24         ` Richard Henderson
  0 siblings, 1 reply; 43+ messages in thread
From: Peter Collingbourne @ 2014-10-11  1:42 UTC (permalink / raw)
  To: Ian Lance Taylor
  Cc: Richard Henderson, gcc-patches, libffi-discuss, gofrontend-dev

On Fri, Oct 10, 2014 at 6:06 PM, Peter Collingbourne <pcc@google.com> wrote:
> On Fri, Oct 10, 2014 at 5:33 PM, 'Ian Lance Taylor' via gofrontend-dev
> <gofrontend-dev@googlegroups.com> wrote:
>>
>> On Fri, Oct 10, 2014 at 1:42 PM, Richard Henderson <rth@redhat.com> wrote:
>> >
>> > This is awful syntax, and therefore contains no documentation.
>> > But we'll need to be able to set the static chain on a few calls
>> > within the Go runtime, so we need to expose this by some means.
>> >
>> > It currently looks like
>> >
>> >         function(args...) __builtin_call_chain(pointer)
>> >
>> > because that was easy to parse.
>>
>> How crazy would it be to move __builtin_call_chain into the function
>> arguments, as in
>>     function(a1, a2, __builtin_call_chain(pointer))
>> This __builtin_call_chain call would be removed from the argument list
>> so type checking would only look at a1, a2.  It would just set the
>> static chain value.  That at least puts the call_chain in the right
>> place, which is a special kind of function argument.
>
>
> Clang will need to be able to parse this syntax too, so let's not do
> anything that diverges too much from the standard.
>
> Can we perhaps make this look like a new calling convention? So e.g. you
> could do:
>
> (((void (__attribute__((chaincall)) *)(void *, int, int)))function)(pointer,
> a1, a2);

A colleague has suggested a perhaps nicer syntax:

__builtin_call_chain(pointer, call) where call must be a call expression

Peter

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [gofrontend-dev] Re: [PATCH 03/13] HACK! Allow the static chain to be set from C
  2014-10-11  1:42       ` [gofrontend-dev] " Peter Collingbourne
@ 2014-10-11  4:24         ` Richard Henderson
  2014-10-13  8:10           ` Richard Biener
  0 siblings, 1 reply; 43+ messages in thread
From: Richard Henderson @ 2014-10-11  4:24 UTC (permalink / raw)
  To: Peter Collingbourne, Ian Lance Taylor
  Cc: gcc-patches, libffi-discuss, gofrontend-dev

On 10/10/2014 06:42 PM, Peter Collingbourne wrote:
> A colleague has suggested a perhaps nicer syntax:
> 
> __builtin_call_chain(pointer, call) where call must be a call expression

I like this.

Unlike the other suggestions, it doesn't mess with the parsing of the "regular"
part of the function call.  And, depending on what point the builtin is lowered
and applied to the AST, it might not require any parsing changes at all.

I'll have a look at this next week.  Thanks.


r~

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [gofrontend-dev] Re: [PATCH 03/13] HACK! Allow the static chain to be set from C
  2014-10-11  4:24         ` Richard Henderson
@ 2014-10-13  8:10           ` Richard Biener
  2014-10-13 18:46             ` Peter Collingbourne
  0 siblings, 1 reply; 43+ messages in thread
From: Richard Biener @ 2014-10-13  8:10 UTC (permalink / raw)
  To: Richard Henderson
  Cc: Peter Collingbourne, Ian Lance Taylor, gcc-patches,
	libffi-discuss, gofrontend-dev

On Sat, Oct 11, 2014 at 6:23 AM, Richard Henderson <rth@redhat.com> wrote:
> On 10/10/2014 06:42 PM, Peter Collingbourne wrote:
>> A colleague has suggested a perhaps nicer syntax:
>>
>> __builtin_call_chain(pointer, call) where call must be a call expression
>
> I like this.
>
> Unlike the other suggestions, it doesn't mess with the parsing of the "regular"
> part of the function call.  And, depending on what point the builtin is lowered
> and applied to the AST, it might not require any parsing changes at all.
>
> I'll have a look at this next week.  Thanks.

Does the frontend know that the call expects a static chain?  If so
I like Ian's suggestion more:

"
How crazy would it be to move __builtin_call_chain into the function
arguments, as in
    function(a1, a2, __builtin_call_chain(pointer))
This __builtin_call_chain call would be removed from the argument list
so type checking would only look at a1, a2.  It would just set the
static chain value.  That at least puts the call_chain in the right
place, which is a special kind of function argument.
"

Richard.

>
> r~

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [gofrontend-dev] Re: [PATCH 03/13] HACK! Allow the static chain to be set from C
  2014-10-13  8:10           ` Richard Biener
@ 2014-10-13 18:46             ` Peter Collingbourne
  0 siblings, 0 replies; 43+ messages in thread
From: Peter Collingbourne @ 2014-10-13 18:46 UTC (permalink / raw)
  To: Richard Biener
  Cc: Richard Henderson, Ian Lance Taylor, gcc-patches, libffi-discuss,
	gofrontend-dev

On Mon, Oct 13, 2014 at 1:10 AM, Richard Biener
<richard.guenther@gmail.com> wrote:
> On Sat, Oct 11, 2014 at 6:23 AM, Richard Henderson <rth@redhat.com> wrote:
>> On 10/10/2014 06:42 PM, Peter Collingbourne wrote:
>>> A colleague has suggested a perhaps nicer syntax:
>>>
>>> __builtin_call_chain(pointer, call) where call must be a call expression
>>
>> I like this.
>>
>> Unlike the other suggestions, it doesn't mess with the parsing of the "regular"
>> part of the function call.  And, depending on what point the builtin is lowered
>> and applied to the AST, it might not require any parsing changes at all.
>>
>> I'll have a look at this next week.  Thanks.
>
> Does the frontend know that the call expects a static chain?

The chain is not part of the function type, so no.

Peter

^ permalink raw reply	[flat|nested] 43+ messages in thread

* [PATCH v2 03/13] Allow the static chain to be set from C
  2014-10-10 20:43 ` [PATCH 03/13] HACK! Allow the static chain to be set from C Richard Henderson
  2014-10-11  0:33   ` Ian Lance Taylor
@ 2014-10-14 18:44   ` Richard Henderson
  1 sibling, 0 replies; 43+ messages in thread
From: Richard Henderson @ 2014-10-14 18:44 UTC (permalink / raw)
  To: gcc-patches; +Cc: libffi-discuss, gofrontend-dev

[-- Attachment #1: Type: text/plain, Size: 240 bytes --]

Replacing the hacky v1 with the proposed syntax relayed by PCC,
and changing the name to __builtin_call_with_static_chain.  Which
is kinda long, but at least it's more properly descriptive.

Adds documentation and an errors test case.


r~

[-- Attachment #2: 0003-Allow-the-static-chain-to-be-set-from-C.patch --]
[-- Type: text/x-patch, Size: 6160 bytes --]

From 7e31234f2e112bad576b748b2ff6cc615194c0f7 Mon Sep 17 00:00:00 2001
From: Richard Henderson <rth@redhat.com>
Date: Tue, 7 Oct 2014 12:17:28 -0700
Subject: [PATCH 03/13] Allow the static chain to be set from C

We need to be able to set the static chain on a few calls within the
Go runtime, so expose this with __builtin_call_with_static_chain.
---
 gcc/c-family/c-common.c      |  2 ++
 gcc/c-family/c-common.h      |  2 +-
 gcc/c/c-parser.c             | 40 ++++++++++++++++++++++++++++++++++++++++
 gcc/doc/extend.texi          | 13 +++++++++++++
 gcc/testsuite/gcc.dg/cwsc0.c | 18 ++++++++++++++++++
 gcc/testsuite/gcc.dg/cwsc1.c | 31 +++++++++++++++++++++++++++++++
 6 files changed, 105 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.dg/cwsc0.c
 create mode 100644 gcc/testsuite/gcc.dg/cwsc1.c

diff --git a/gcc/c-family/c-common.c b/gcc/c-family/c-common.c
index 23163f5..f1bf47b 100644
--- a/gcc/c-family/c-common.c
+++ b/gcc/c-family/c-common.c
@@ -442,6 +442,8 @@ const struct c_common_resword c_common_reswords[] =
   { "__attribute__",	RID_ATTRIBUTE,	0 },
   { "__auto_type",	RID_AUTO_TYPE,	D_CONLY },
   { "__bases",          RID_BASES, D_CXXONLY },
+  { "__builtin_call_with_static_chain",
+    RID_BUILTIN_CALL_WITH_STATIC_CHAIN, D_CONLY },
   { "__builtin_choose_expr", RID_CHOOSE_EXPR, D_CONLY },
   { "__builtin_complex", RID_BUILTIN_COMPLEX, D_CONLY },
   { "__builtin_shuffle", RID_BUILTIN_SHUFFLE, 0 },
diff --git a/gcc/c-family/c-common.h b/gcc/c-family/c-common.h
index 1e3477f..da1c12e 100644
--- a/gcc/c-family/c-common.h
+++ b/gcc/c-family/c-common.h
@@ -102,7 +102,7 @@ enum rid
   RID_EXTENSION, RID_IMAGPART, RID_REALPART, RID_LABEL,      RID_CHOOSE_EXPR,
   RID_TYPES_COMPATIBLE_P,      RID_BUILTIN_COMPLEX,	     RID_BUILTIN_SHUFFLE,
   RID_DFLOAT32, RID_DFLOAT64, RID_DFLOAT128,
-  RID_FRACT, RID_ACCUM, RID_AUTO_TYPE,
+  RID_FRACT, RID_ACCUM, RID_AUTO_TYPE, RID_BUILTIN_CALL_WITH_STATIC_CHAIN,
 
   /* C11 */
   RID_ALIGNAS, RID_GENERIC,
diff --git a/gcc/c/c-parser.c b/gcc/c/c-parser.c
index 346448a..708a125 100644
--- a/gcc/c/c-parser.c
+++ b/gcc/c/c-parser.c
@@ -7372,6 +7372,46 @@ c_parser_postfix_expression (c_parser *parser)
 	      = comptypes (e1, e2) ? integer_one_node : integer_zero_node;
 	  }
 	  break;
+	case RID_BUILTIN_CALL_WITH_STATIC_CHAIN:
+	  {
+	    vec<c_expr_t, va_gc> *cexpr_list;
+	    c_expr_t *e2_p;
+	    tree chain_value;
+
+	    c_parser_consume_token (parser);
+	    if (!c_parser_get_builtin_args (parser,
+					    "__builtin_call_with_static_chain",
+					    &cexpr_list, false))
+	      {
+		expr.value = error_mark_node;
+		break;
+	      }
+	    if (vec_safe_length (cexpr_list) != 2)
+	      {
+		error_at (loc, "wrong number of arguments to "
+			       "%<__builtin_call_with_static_chain%>");
+		expr.value = error_mark_node;
+		break;
+	      }
+
+	    expr = (*cexpr_list)[0];
+	    e2_p = &(*cexpr_list)[1];
+	    *e2_p = convert_lvalue_to_rvalue (loc, *e2_p, true, true);
+	    chain_value = e2_p->value;
+	    mark_exp_read (chain_value);
+
+	    if (TREE_CODE (expr.value) != CALL_EXPR)
+	      error_at (loc, "first argument to "
+			"%<__builtin_call_with_static_chain%> "
+			"must be a call expression");
+	    else if (TREE_CODE (TREE_TYPE (chain_value)) != POINTER_TYPE)
+	      error_at (loc, "second argument to "
+			"%<__builtin_call_with_static_chain%> "
+			"must be a pointer type");
+	    else
+	      CALL_EXPR_STATIC_CHAIN (expr.value) = chain_value;
+	    break;
+	  }
 	case RID_BUILTIN_COMPLEX:
 	  {
 	    vec<c_expr_t, va_gc> *cexpr_list;
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index 6db142e..f092ea1 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -8639,6 +8639,7 @@ in the Cilk Plus language manual which can be found at
 @node Other Builtins
 @section Other Built-in Functions Provided by GCC
 @cindex built-in functions
+@findex __builtin_call_with_static_chain
 @findex __builtin_fpclassify
 @findex __builtin_isfinite
 @findex __builtin_isnormal
@@ -9227,6 +9228,18 @@ depending on the arguments' types.  For example:
 
 @end deftypefn
 
+@deftypefn {Built-in Function} @var{type} __builtin_call_with_static_chain (@var{call_exp}, @var{pointer_exp})
+
+The @var{call_exp} expression must be a function call, and the
+@var{pointer_exp} expression must be a pointer.  The @var{pointer_exp}
+is passed to the function call in the target's static chain location.
+The result of the builtin is the result of the function call.
+
+@emph{Note:} This builtin is only available for C@.
+This builtin can be used to call Go closures from C.
+
+@end deftypefn
+
 @deftypefn {Built-in Function} @var{type} __builtin_choose_expr (@var{const_exp}, @var{exp1}, @var{exp2})
 
 You can use the built-in function @code{__builtin_choose_expr} to
diff --git a/gcc/testsuite/gcc.dg/cwsc0.c b/gcc/testsuite/gcc.dg/cwsc0.c
new file mode 100644
index 0000000..4918b85
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/cwsc0.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+
+#include <stddef.h>
+
+#define cwsc  __builtin_call_with_static_chain
+
+void foo(void);
+void test(int (*f)(void), char *p)
+{
+  cwsc(f(), p);
+  cwsc(p, f());		/* { dg-error "must be a call" } */
+  cwsc(f() + 1, p);	/* { dg-error "must be a call" } */
+  cwsc(f(), 0);		/* { dg-error "must be a pointer" } */
+  cwsc(f(), NULL);
+  cwsc(foo, p);		/* { dg-error "must be a call" } */
+  cwsc(foo(), p);
+  cwsc(foo(), foo);
+}
diff --git a/gcc/testsuite/gcc.dg/cwsc1.c b/gcc/testsuite/gcc.dg/cwsc1.c
new file mode 100644
index 0000000..4ab86fb
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/cwsc1.c
@@ -0,0 +1,31 @@
+/* { dg-do run } */
+
+#if defined(__x86_64__)
+# define CHAIN	"%r10"
+#elif defined(__i386__)
+# define CHAIN  "%ecx"
+#elif defined(__aarch64__)
+# define CHAIN  "x18"
+#endif
+
+#ifdef CHAIN
+void *__attribute__((noinline, noclone)) foo(void)
+{
+  register void *chain __asm__(CHAIN);
+  return chain;
+}
+
+void * (*ptr)(void) = foo;
+extern void abort(void);
+
+int main()
+{
+  char c;
+  void *x = __builtin_call_with_static_chain(ptr(), &c);
+  if (x != &c)
+    abort();
+  return 0;
+}
+#else
+int main() { return 0; }
+#endif
-- 
1.9.3


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 00/13] Go closures, libffi, and the static chain
  2014-10-10 20:43 [PATCH 00/13] Go closures, libffi, and the static chain Richard Henderson
                   ` (13 preceding siblings ...)
  2014-10-11  0:23 ` [PATCH 00/13] Go closures, libffi, and the static chain Ian Lance Taylor
@ 2014-11-05 21:34 ` Lynn A. Boger
  2014-11-06  6:59   ` Richard Henderson
  2014-12-11  9:06 ` Dominik Vogt
  15 siblings, 1 reply; 43+ messages in thread
From: Lynn A. Boger @ 2014-11-05 21:34 UTC (permalink / raw)
  To: Richard Henderson, gcc-patches; +Cc: libffi-discuss, gofrontend-dev

What about the libffi changes that are needed to make this work on other 
platforms, like PowerPC?

On 10/10/2014 03:42 PM, Richard Henderson wrote:
> Pardon the wide distribution, the obvious hacks, and the failure
> to properly split up the largest of the libffi patches.
>
> The background here is my thread from last week[1], and Ian's reply[2],
> wherein he rightly points out that not needing to play games with
> mmap in order to implement closures for Go is a strong reason to
> continue using custom code within libgo.
>
> While that thread did have a go at implementing that custom code for
> aarch64, I still think that replicating libffi's calling convention
> knowledge for every interesting target is a mistake.
>
> So instead I thought about how I'd add some support for Go directly
> into libffi.  After all, we've got some custom code in libffi for
> Java, why couldn't Go have the same treatment?
>
> The stickler, as far as I could see, is __go_set_closure.  I didn't
> like the idea of libffi needing a callback into libgo in order to
> accomplish the goal.
>
> But the comment immediately before __go_set_closure itself says
> that it would be better to use the static chain register.  So I set
> about to see how easy that would be to accomplish.  (And not for
> nothing such a change would make gccgo compiled programs faster
> by avoiding the library calls.)
>
> The following patch set enables this for x86_64, i386, and aarch64[3].
>
> The first two patches enable a static chain to be set by the front end
> on CALL_EXPRs, and to be used with indirect function calls.  The third
> patch is a horrible hack to expose this feature to the C front end.
>
> The 4th patch changes gccgo to use the static chain.  I don't bother
> with checking to see that the target has one.  All targets currently
> supported by libgo have one, so I don't really see this as a stumbling
> block.
>
> The 5th patch changes libgo to use the static chain.  I admit that I
> haven't tested this patch solo; I simply split it out of a larger patch
> for clarity.
>
> The 6th patch adds interfaces to libffi for Go; these interfaces are
> used within libgo in the 8th patch.
>
> Patches 7, 10, 11, 12, 13 are all enabling the new libffi interface on
> the aforementioned targets.  There's lots of cleanup in here, and I
> owe the libffi list smaller reviewable changes.  I ask that libffi
> ignore patches 10 and 12 for now and comment on the meat instead.
>
> Before I go too much farther down this road, I wanted to get some
> feedback.  FWIW, a complete tree can be found at [4].
>
> Thanks,
>
>
> r~
>
>
> [1] https://gcc.gnu.org/ml/gcc-patches/2014-10/msg00098.html
> [2] https://gcc.gnu.org/ml/gcc-patches/2014-10/msg00102.html
> [3] Except that after rebasing the tree on yesterday's trunk,
>      I discovered that i386 and aarch64 both have bootstrap
>      problems on trunk.  Ouch.
> [4] git://github.com/rth7680/gcc.git rth/go-closure
>
>
> Richard Henderson (13):
>    Make TARGET_STATIC_CHAIN allow a function type
>    Allow the front-end to create calls with a static chain
>    HACK!  Allow the static chain to be set from C
>    Use the static chain as the closure parameter from Go
>    libgo: Use the static chain for the closure
>    libffi: Add entry points for interacting with Go
>    libffi: Support go closures on x86_64
>    libgo: Use the new libffi interfaces for Go
>    libgo: Remove __go_get/set_closure
>    libffi: Rewrite aarch64
>    libffi: Support go closures on aarch64
>    libffi: Rewrite i386 sysv
>    libffi: Support go closures on i386
>
>   gcc/c-family/c-common.c             |    1 +
>   gcc/c-family/c-common.h             |    2 +-
>   gcc/c/c-parser.c                    |   29 +
>   gcc/calls.c                         |   14 +-
>   gcc/config/i386/i386.c              |   19 +-
>   gcc/config/moxie/moxie.c            |    5 +-
>   gcc/config/xtensa/xtensa.c          |    2 +-
>   gcc/doc/tm.texi                     |    2 +-
>   gcc/gimple-fold.c                   |   21 +
>   gcc/gimplify.c                      |   17 +-
>   gcc/go/go-gcc.cc                    |   44 +-
>   gcc/go/gofrontend/backend.h         |    7 +-
>   gcc/go/gofrontend/expressions.cc    |   21 +-
>   gcc/go/gofrontend/gogo.cc           |   29 +-
>   gcc/go/gofrontend/gogo.h            |   14 +
>   gcc/go/gofrontend/runtime.def       |    6 -
>   gcc/target.def                      |    6 +-
>   gcc/targhooks.c                     |    5 +-
>   gcc/testsuite/gcc.dg/static-chain.c |   31 +
>   gcc/tree-cfg.c                      |   22 +-
>   libffi/include/ffi.h.in             |   16 +
>   libffi/src/aarch64/ffi.c            | 1380 ++++++++++++++---------------------
>   libffi/src/aarch64/ffitarget.h      |   18 +-
>   libffi/src/aarch64/internal.h       |   43 ++
>   libffi/src/aarch64/sysv.S           |  557 +++++++-------
>   libffi/src/x86/ffi.c                | 1161 ++++++++++++-----------------
>   libffi/src/x86/ffi64.c              |  103 ++-
>   libffi/src/x86/ffitarget.h          |  112 ++-
>   libffi/src/x86/internal.h           |   48 ++
>   libffi/src/x86/sysv.S               | 1003 +++++++++++++++----------
>   libffi/src/x86/unix64.S             |  319 ++++----
>   libgo/go/reflect/makefunc.go        |   49 +-
>   libgo/go/reflect/makefunc_386.S     |   22 +-
>   libgo/go/reflect/makefunc_amd64.S   |   13 +-
>   libgo/go/reflect/makefunc_ffi.go    |   67 +-
>   libgo/go/reflect/makefunc_ffi_c.c   |   68 +-
>   libgo/go/reflect/value.go           |    3 +
>   libgo/runtime/go-reflect-call.c     |   10 +-
>   libgo/runtime/malloc.goc            |    8 -
>   libgo/runtime/mgc0.c                |    3 +-
>   libgo/runtime/proc.c                |   20 -
>   libgo/runtime/runtime.h             |    4 -
>   libgo/runtime/time.goc              |    3 +-
>   43 files changed, 2624 insertions(+), 2703 deletions(-)
>   create mode 100644 gcc/testsuite/gcc.dg/static-chain.c
>   create mode 100644 libffi/src/aarch64/internal.h
>   create mode 100644 libffi/src/x86/internal.h
>

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 00/13] Go closures, libffi, and the static chain
  2014-11-05 21:34 ` Lynn A. Boger
@ 2014-11-06  6:59   ` Richard Henderson
  2014-11-06 12:48     ` Alan Modra
  0 siblings, 1 reply; 43+ messages in thread
From: Richard Henderson @ 2014-11-06  6:59 UTC (permalink / raw)
  To: Lynn A. Boger, gcc-patches; +Cc: libffi-discuss, gofrontend-dev

On 11/05/2014 10:33 PM, Lynn A. Boger wrote:
> What about the libffi changes that are needed to make this work on other
> platforms, like PowerPC?

I've been working my way through the currently supported libgo targets on
libffi-discuss, hoping to get them in upstream libffi before importing to gcc.

I haven't done powerpc yet.  If you'd like to help, I'd be delighted.


r~

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 00/13] Go closures, libffi, and the static chain
  2014-11-06  6:59   ` Richard Henderson
@ 2014-11-06 12:48     ` Alan Modra
  2014-11-06 13:04       ` Richard Henderson
  2014-11-06 13:10       ` Lynn A. Boger
  0 siblings, 2 replies; 43+ messages in thread
From: Alan Modra @ 2014-11-06 12:48 UTC (permalink / raw)
  To: Richard Henderson
  Cc: Lynn A. Boger, gcc-patches, libffi-discuss, gofrontend-dev

On Thu, Nov 06, 2014 at 07:59:16AM +0100, Richard Henderson wrote:
> I haven't done powerpc yet.  If you'd like to help, I'd be delighted.

I was going to say that it doesn't look too difficult, but then I
noticed we have a problem.  PowerPC uses r11 as the static chain,
a register that is allowed to be used by linkage stubs.

So any call to a shared libffi will (or may) blow away r11.  On ppc32,
every plt call currently uses r11.  On ppc64 ELFv1 most plt calls do
(in fact r11 is loaded from the third word of the plt function
descriptor if using a standard plt stub).  On ppc64 ELFv2 just the
lazy plt resolution trashes r11.

-- 
Alan Modra
Australia Development Lab, IBM

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 00/13] Go closures, libffi, and the static chain
  2014-11-06 12:48     ` Alan Modra
@ 2014-11-06 13:04       ` Richard Henderson
  2014-11-06 17:45         ` [gofrontend-dev] " Ian Taylor
  2014-11-06 13:10       ` Lynn A. Boger
  1 sibling, 1 reply; 43+ messages in thread
From: Richard Henderson @ 2014-11-06 13:04 UTC (permalink / raw)
  To: Lynn A. Boger, gcc-patches, libffi-discuss, gofrontend-dev, ian

On 11/06/2014 01:48 PM, Alan Modra wrote:
> On Thu, Nov 06, 2014 at 07:59:16AM +0100, Richard Henderson wrote:
>> I haven't done powerpc yet.  If you'd like to help, I'd be delighted.
> 
> I was going to say that it doesn't look too difficult, but then I
> noticed we have a problem.  PowerPC uses r11 as the static chain,
> a register that is allowed to be used by linkage stubs.

Hum.

At the moment, the static chain is not part of the ABI -- it's private to the
translation unit.  But as soon as we start using this for the Go closure, it
does become part of the ABI, so it would be best if we can choose a different
register.

That said, this *may* not actually be a problem.  It's not the direct (possibly
lazy bound) call into libffi that needs a static chain, it's the indirect call
that libffi produces.  And the indirect calls that Go produces.

I'm pretty sure that there are no dynamically linked Go calls that require the
static chain.  They're used for closures, which are either fully indirect from
a different translation unit, or locally bound closures through which the
optimizer has seen the construction, and optimized to a direct call.

Ian, have I missed a case where a closure could wind up with a direct call to a
lazy bound function?


r~

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 00/13] Go closures, libffi, and the static chain
  2014-11-06 12:48     ` Alan Modra
  2014-11-06 13:04       ` Richard Henderson
@ 2014-11-06 13:10       ` Lynn A. Boger
  2014-11-06 13:17         ` Richard Henderson
  1 sibling, 1 reply; 43+ messages in thread
From: Lynn A. Boger @ 2014-11-06 13:10 UTC (permalink / raw)
  To: Richard Henderson, gcc-patches, libffi-discuss, gofrontend-dev

Aren't there cases where the static chain register is needed?  How does 
that work if it could be trashed on a plt call?

On 11/06/2014 06:48 AM, Alan Modra wrote:
> On Thu, Nov 06, 2014 at 07:59:16AM +0100, Richard Henderson wrote:
>> I haven't done powerpc yet.  If you'd like to help, I'd be delighted.
> I was going to say that it doesn't look too difficult, but then I
> noticed we have a problem.  PowerPC uses r11 as the static chain,
> a register that is allowed to be used by linkage stubs.
>
> So any call to a shared libffi will (or may) blow away r11.  On ppc32,
> every plt call currently uses r11.  On ppc64 ELFv1 most plt calls do
> (in fact r11 is loaded from the third word of the plt function
> descriptor if using a standard plt stub).  On ppc64 ELFv2 just the
> lazy plt resolution trashes r11.
>

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 00/13] Go closures, libffi, and the static chain
  2014-11-06 13:10       ` Lynn A. Boger
@ 2014-11-06 13:17         ` Richard Henderson
  0 siblings, 0 replies; 43+ messages in thread
From: Richard Henderson @ 2014-11-06 13:17 UTC (permalink / raw)
  To: Lynn A. Boger, gcc-patches, libffi-discuss, gofrontend-dev

On 11/06/2014 02:10 PM, Lynn A. Boger wrote:
> Aren't there cases where the static chain register is needed?  How does that
> work if it could be trashed on a plt call?

No.  At the moment the static chain is only used for nested functions, which
are local to the translation unit, and are therefore locally bound, and
therefore never go through the plt.


r~

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [gofrontend-dev] Re: [PATCH 00/13] Go closures, libffi, and the static chain
  2014-11-06 13:04       ` Richard Henderson
@ 2014-11-06 17:45         ` Ian Taylor
  2014-11-07  7:39           ` Richard Henderson
  0 siblings, 1 reply; 43+ messages in thread
From: Ian Taylor @ 2014-11-06 17:45 UTC (permalink / raw)
  To: Richard Henderson
  Cc: Lynn A. Boger, gcc-patches, libffi-discuss, gofrontend-dev, ian

On Thu, Nov 6, 2014 at 5:04 AM, Richard Henderson <rth@redhat.com> wrote:
>
> That said, this *may* not actually be a problem.  It's not the direct (possibly
> lazy bound) call into libffi that needs a static chain, it's the indirect call
> that libffi produces.  And the indirect calls that Go produces.
>
> I'm pretty sure that there are no dynamically linked Go calls that require the
> static chain.  They're used for closures, which are either fully indirect from
> a different translation unit, or locally bound closures through which the
> optimizer has seen the construction, and optimized to a direct call.
>
> Ian, have I missed a case where a closure could wind up with a direct call to a
> lazy bound function?

I think you've covered all the cases.  The closure value is only
required when calling a nested function.  There is no way to refer
directly to a nested function defined in a different shared library.
The only way you can get such a reference is if some function in that
shared library returns it.

So we are OK assuming that when returning a nested function, which is
always known to be locally defined, we never return a reference to the
PLT, but always return a fully resolved function address.  That seems
like a plausible assumption, particularly since we should never need
to set up a PLT for a nested function, since it can never be called
directly from a different shared library.

Ian

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [gofrontend-dev] Re: [PATCH 00/13] Go closures, libffi, and the static chain
  2014-11-06 17:45         ` [gofrontend-dev] " Ian Taylor
@ 2014-11-07  7:39           ` Richard Henderson
  2014-11-07  8:50             ` Jay
  2014-11-07 16:06             ` Ian Taylor
  0 siblings, 2 replies; 43+ messages in thread
From: Richard Henderson @ 2014-11-07  7:39 UTC (permalink / raw)
  To: Ian Taylor
  Cc: Lynn A. Boger, gcc-patches, libffi-discuss, gofrontend-dev, ian

On 11/06/2014 06:45 PM, Ian Taylor wrote:
> On Thu, Nov 6, 2014 at 5:04 AM, Richard Henderson <rth@redhat.com> wrote:
>>
>> That said, this *may* not actually be a problem.  It's not the direct (possibly
>> lazy bound) call into libffi that needs a static chain, it's the indirect call
>> that libffi produces.  And the indirect calls that Go produces.
>>
>> I'm pretty sure that there are no dynamically linked Go calls that require the
>> static chain.  They're used for closures, which are either fully indirect from
>> a different translation unit, or locally bound closures through which the
>> optimizer has seen the construction, and optimized to a direct call.
>>
>> Ian, have I missed a case where a closure could wind up with a direct call to a
>> lazy bound function?
> 
> I think you've covered all the cases.  The closure value is only
> required when calling a nested function.  There is no way to refer
> directly to a nested function defined in a different shared library.
> The only way you can get such a reference is if some function in that
> shared library returns it.

Sorry, I wasn't clear.  I know nested functions must be local.

I'm asking about Go closures, supposing we go ahead with the change to
make them use the static chain register.

I'm merely pretty sure that calling a closure is either fully indirect
or local direct.

Certainly there are cases in the testsuite where -O3 is able to look
through the creation of a closure and have a direct call to the function.

Given that closures are custom created for the data at the creation
site, it seems unlikely that the optimizer could look through that and
come up with a dynamically bound function.


r~

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [gofrontend-dev] Re: [PATCH 00/13] Go closures, libffi, and the static chain
  2014-11-07  7:39           ` Richard Henderson
@ 2014-11-07  8:50             ` Jay
  2014-11-07 16:06             ` Ian Taylor
  1 sibling, 0 replies; 43+ messages in thread
From: Jay @ 2014-11-07  8:50 UTC (permalink / raw)
  To: Richard Henderson
  Cc: Ian Taylor, Lynn A. Boger, gcc-patches,
	<libffi-discuss@sourceware.org>,
	gofrontend-dev, <ian@airs.com>

I worked on what I suspect is similar stuff.

I ran into the problem -- pardon me if my terminology is wrong -- that PLT thunks for nested functions trashed registers that were in use. My solution was to mark them "hidden" (or whatever the term is for not replaceable); also not exported, but I recall that not replaceable is the more important part.

 - Jay

On Nov 6, 2014, at 11:38 PM, Richard Henderson <rth@redhat.com> wrote:

> On 11/06/2014 06:45 PM, Ian Taylor wrote:
>> On Thu, Nov 6, 2014 at 5:04 AM, Richard Henderson <rth@redhat.com> wrote:
>>> 
>>> That said, this *may* not actually be a problem.  It's not the direct (possibly
>>> lazy bound) call into libffi that needs a static chain, it's the indirect call
>>> that libffi produces.  And the indirect calls that Go produces.
>>> 
>>> I'm pretty sure that there are no dynamically linked Go calls that require the
>>> static chain.  They're used for closures, which are either fully indirect from
>>> a different translation unit, or locally bound closures through which the
>>> optimizer has seen the construction, and optimized to a direct call.
>>> 
>>> Ian, have I missed a case where a closure could wind up with a direct call to a
>>> lazy bound function?
>> 
>> I think you've covered all the cases.  The closure value is only
>> required when calling a nested function.  There is no way to refer
>> directly to a nested function defined in a different shared library.
>> The only way you can get such a reference is if some function in that
>> shared library returns it.
> 
> Sorry, I wasn't clear.  I know nested functions must be local.
> 
> I'm asking about Go closures, supposing we go ahead with the change to
> make them use the static chain register.
> 
> I'm merely pretty sure that calling a closure is either fully indirect
> or local direct.
> 
> Certainly there are cases in the testsuite where -O3 is able to look
> through the creation of a closure and have a direct call to the function.
> 
> Given that closures are custom created for the data at the creation
> site, it seems unlikely that the optimizer could look through that and
> come up with a dynamically bound function.
> 
> 
> r~

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [gofrontend-dev] Re: [PATCH 00/13] Go closures, libffi, and the static chain
  2014-11-07  7:39           ` Richard Henderson
  2014-11-07  8:50             ` Jay
@ 2014-11-07 16:06             ` Ian Taylor
  2014-11-07 23:55               ` Alan Modra
  1 sibling, 1 reply; 43+ messages in thread
From: Ian Taylor @ 2014-11-07 16:06 UTC (permalink / raw)
  To: Richard Henderson
  Cc: Lynn A. Boger, gcc-patches, libffi-discuss, gofrontend-dev,
	Ian Lance Taylor

On Thu, Nov 6, 2014 at 11:38 PM, Richard Henderson <rth@redhat.com> wrote:
> On 11/06/2014 06:45 PM, Ian Taylor wrote:
>> On Thu, Nov 6, 2014 at 5:04 AM, Richard Henderson <rth@redhat.com> wrote:
>>>
>>> That said, this *may* not actually be a problem.  It's not the direct (possibly
>>> lazy bound) call into libffi that needs a static chain, it's the indirect call
>>> that libffi produces.  And the indirect calls that Go produces.
>>>
>>> I'm pretty sure that there are no dynamically linked Go calls that require the
>>> static chain.  They're used for closures, which are either fully indirect from
>>> a different translation unit, or locally bound closures through which the
>>> optimizer has seen the construction, and optimized to a direct call.
>>>
>>> Ian, have I missed a case where a closure could wind up with a direct call to a
>>> lazy bound function?
>>
>> I think you've covered all the cases.  The closure value is only
>> required when calling a nested function.  There is no way to refer
>> directly to a nested function defined in a different shared library.
>> The only way you can get such a reference is if some function in that
>> shared library returns it.
>
> Sorry, I wasn't clear.  I know nested functions must be local.
>
> I'm asking about Go closures, supposing we go ahead with the change to
> make them use the static chain register.

I think we're saying the same thing.

Closures exist only for nested functions and for functions created by
reflect.MakeFunc and friends.

Storing a top-level function into a variable will give you something
that looks like it has a closure, but the closure will always be empty
and it will never be used.  The indirect call will set the closure
value in the static chain register, but the register will not be used
by the function being called.

> I'm merely pretty sure that calling a closure is either fully indirect
> or local direct.

Yes.

> Certainly there are cases in the testsuite where -O3 is able to look
> through the creation of a closure and have a direct call to the function.
>
> Given that closures are custom created for the data at the creation
> site, it seems unlikely that the optimizer could look through that and
> come up with a dynamically bound function.

Yes.

Ian

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [gofrontend-dev] Re: [PATCH 00/13] Go closures, libffi, and the static chain
  2014-11-07 16:06             ` Ian Taylor
@ 2014-11-07 23:55               ` Alan Modra
  0 siblings, 0 replies; 43+ messages in thread
From: Alan Modra @ 2014-11-07 23:55 UTC (permalink / raw)
  To: Ian Taylor
  Cc: Richard Henderson, Lynn A. Boger, gcc-patches, libffi-discuss,
	gofrontend-dev, Ian Lance Taylor

On Fri, Nov 07, 2014 at 08:06:52AM -0800, Ian Taylor wrote:
> Closures exist only for nested functions and for functions created by
> reflect.MakeFunc and friends.
> 
> Storing a top-level function into a variable will give you something
> that looks like it has a closure, but the closure will always be empty
> and it will never be used.  The indirect call will set the closure
> value in the static chain register, but the register will not be used
> by the function being called.

Good, this was something I was still worried about, because the mere
fact that a call is indirect doesn't guarantee it won't hit a plt
stub.  Many ABIs define the address of a non-local function in an
executable to be the address of the plt stub code for that function.
So it is quite possible for an indirect call to bounce through a plt
stub.  Various linker optimisations make this fairly uncommon, for
instance the GNU powerpc linkers won't do this unless the function
address is taken in the executable by non-PIC (see
pointer_equality_needed in BFD code).

-- 
Alan Modra
Australia Development Lab, IBM

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 00/13] Go closures, libffi, and the static chain
  2014-10-10 20:43 [PATCH 00/13] Go closures, libffi, and the static chain Richard Henderson
                   ` (14 preceding siblings ...)
  2014-11-05 21:34 ` Lynn A. Boger
@ 2014-12-11  9:06 ` Dominik Vogt
  2014-12-11  9:21   ` Alan Modra
  2014-12-11 19:38   ` Richard Henderson
  15 siblings, 2 replies; 43+ messages in thread
From: Dominik Vogt @ 2014-12-11  9:06 UTC (permalink / raw)
  To: libffi-discuss, gcc-patches, gofrontend-dev; +Cc: Andreas Krebbel

On Fri, Oct 10, 2014 at 01:42:40PM -0700, Richard Henderson wrote:
> The background here is my thread from last week[1], and Ian's reply[2],
> wherein he rightly points out that not needing to play games with
> mmap in order to implement closures for Go is a strong reason to
> continue using custom code within libgo.
> 
> While that thread did have a go at implementing that custom code for
> aarch64, I still think that replicating libffi's calling convention
> knowledge for every interesting target is a mistake.
> 
> So instead I thought about how I'd add some support for Go directly
> into libffi.
...
> But the comment immediately before __go_set_closure itself says
> that it would be better to use the static chain register.
...
> Before I go too much farther down this road, I wanted to get some
> feedback.  FWIW, a complete tree can be found at [4].
...
> [4] git://github.com/rth7680/gcc.git rth/go-closure

1)

On s390x, the static chain register cannot be used for passing the
Go closure pointer to a function:  According to the ABI, the
dynamic linker is allowed to destroy the contents of r0 (the static
chain register), eventually causing a crash if libgo is linked
dynamically.  The assumption that the static chain register can be
used to pass information to a function is wrong for s390x.

2)

With this branch, the reflection tests on amd64 crash:

  $ cd <gcc source tree>/build
  # build gcc
  $ cd <targetdir>/libgo
  $ make reflect/check

  -->

-- snip --
Aborted

reflect.call
	../../../libgo/runtime/go-reflect-call.c:216
reflect.call.N13_reflect.Value
	GCCDIR/build-go-closure/x86_64-unknown-linux-gnu/libgo/gotest30365/test/value.go:579
reflect.Call.N13_reflect.Value
	GCCDIR/build-go-closure/x86_64-unknown-linux-gnu/libgo/gotest30365/test/value.go:412
reflect_test.TestCallWithStruct
	GCCDIR/build-go-closure/x86_64-unknown-linux-gnu/libgo/gotest30365/test/all_test.go:1490
testing.tRunner
	../../../libgo/go/testing/testing.go:422

goroutine 16 [chan receive]:
testing.RunTests
	../../../libgo/go/testing/testing.go:505
testing.Main
	../../../libgo/go/testing/testing.go:435
main.main
	GCCDIR/build-go-closure/x86_64-unknown-linux-gnu/libgo/gotest30365/test/_testmain.go:124
created by main
	../../../libgo/runtime/go-main.c:42

goroutine 18 [finalizer wait]:
created by runtime_createfing
	../../../libgo/runtime/mgc0.c:2572

goroutine 53 [sleep]:
reflect_test.selectWatcher
	GCCDIR/build-go-closure/x86_64-unknown-linux-gnu/libgo/gotest30365/test/all_test.go:1377
created by reflect_test.$nested2
	GCCDIR/build-go-closure/x86_64-unknown-linux-gnu/libgo/gotest30365/test/all_test.go:1107
FAIL: reflect
make: *** [reflect/check] Error 1
-- snip --

Ciao

Dominik ^_^  ^_^

-- 

Dominik Vogt
IBM Germany

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 00/13] Go closures, libffi, and the static chain
  2014-12-11  9:06 ` Dominik Vogt
@ 2014-12-11  9:21   ` Alan Modra
  2014-12-11 10:31     ` [gofrontend-dev] " Dominik Vogt
  2014-12-12 13:57     ` Dominik Vogt
  2014-12-11 19:38   ` Richard Henderson
  1 sibling, 2 replies; 43+ messages in thread
From: Alan Modra @ 2014-12-11  9:21 UTC (permalink / raw)
  To: libffi-discuss, gcc-patches, gofrontend-dev, Andreas Krebbel

On Thu, Dec 11, 2014 at 10:06:23AM +0100, Dominik Vogt wrote:
> On s390x, the static chain register cannot be used for passing the
> Go closure pointer to a function:  According to the Abi, the
> dynamic linker is allowed to destroy the contents of r0 (static
> chain register) eventually causing a crash if libgo is linked
> dynamically.  The assumption that the static chain register can be
> used to pass information to a function is wrong for s390x.

I was worried about exactly the same "problem" on powerpc with r11
being used for the static chain and also destroyed in linkage stubs.
It turns out we don't traverse any linkage stubs.

See https://gcc.gnu.org/ml/gcc-patches/2014-11/msg00446.html.  

-- 
Alan Modra
Australia Development Lab, IBM

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [gofrontend-dev] Re: [PATCH 00/13] Go closures, libffi, and the static chain
  2014-12-11  9:21   ` Alan Modra
@ 2014-12-11 10:31     ` Dominik Vogt
  2014-12-11 12:25       ` Dominik Vogt
  2014-12-12 13:57     ` Dominik Vogt
  1 sibling, 1 reply; 43+ messages in thread
From: Dominik Vogt @ 2014-12-11 10:31 UTC (permalink / raw)
  To: libffi-discuss, gcc-patches, gofrontend-dev; +Cc: Andreas Krebbel

On Thu, Dec 11, 2014 at 07:51:44PM +1030, Alan Modra wrote:
> On Thu, Dec 11, 2014 at 10:06:23AM +0100, Dominik Vogt wrote:
> > On s390x, the static chain register cannot be used for passing the
> > Go closure pointer to a function:  According to the Abi, the
> > dynamic linker is allowed to destroy the contents of r0 (static
> > chain register) eventually causing a crash if libgo is linked
> > dynamically.  The assumption that the static chain register can be
> > used to pass information to a function is wrong for s390x.
> 
> I was worried about exactly the same "problem" on powerpc with r11
> being used for the static chain and also destroyed in linkage stubs.
> It turns out we don't traverse any linkage stubs.

Just to make this clear:  It's not something that *might* happen.
It *does* happen on s390[x] which does not use libffi but the hand
written code in makefunc_s390.S and makefuncgo_s390[x].go.

The same may not happen when calling functions through libffi
(which may be dynamically linked) because ffi_call_go() is passed
the closure pointer as an argument and not in the static chain
register.

> See https://gcc.gnu.org/ml/gcc-patches/2014-11/msg00446.html.  

Thanks for the link.

Ciao

Dominik ^_^  ^_^

-- 

Dominik Vogt
IBM Germany

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [gofrontend-dev] Re: [PATCH 00/13] Go closures, libffi, and the static chain
  2014-12-11 10:31     ` [gofrontend-dev] " Dominik Vogt
@ 2014-12-11 12:25       ` Dominik Vogt
  2014-12-11 19:56         ` Richard Henderson
  0 siblings, 1 reply; 43+ messages in thread
From: Dominik Vogt @ 2014-12-11 12:25 UTC (permalink / raw)
  To: libffi-discuss, gcc-patches, gofrontend-dev; +Cc: Andreas Krebbel

On Thu, Dec 11, 2014 at 11:31:06AM +0100, Dominik Vogt wrote:
> Just to make this clear:  It's not something that *might* happen.
> It *does* happen on s390[x] which does not use libffi but the hand
> written code in makefunc_s390.S and makefuncgo_s390[x].go.
> 
> The same may not happen when calling functions through libffi
> (which may be dynamically linked) because ffi_call_go() is passed
> the closure pointer as an argument and not in the static chain
> register.

Update:  If I disable the custom s390x code and switch to the
implementation just using libffi for reflection calls, the same
crash occurs with the testing/quick libgo test case.  The called
function sees a bogus value written by the dynamic linker as the
closure pointer, for example with this line in the test code:

  CheckEqual(fComplex64, fComplex64, nil)

Ciao

Dominik ^_^  ^_^

-- 

Dominik Vogt
IBM Germany

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [PATCH 00/13] Go closures, libffi, and the static chain
  2014-12-11  9:06 ` Dominik Vogt
  2014-12-11  9:21   ` Alan Modra
@ 2014-12-11 19:38   ` Richard Henderson
  1 sibling, 0 replies; 43+ messages in thread
From: Richard Henderson @ 2014-12-11 19:38 UTC (permalink / raw)
  To: libffi-discuss, gcc-patches, gofrontend-dev, Andreas Krebbel

On 12/11/2014 01:06 AM, Dominik Vogt wrote:
> reflect.call
> 	../../../libgo/runtime/go-reflect-call.c:216
> reflect.call.N13_reflect.Value
> 	GCCDIR/build-go-closure/x86_64-unknown-linux-gnu/libgo/gotest30365/test/value.go:579
> reflect.Call.N13_reflect.Value
> 	GCCDIR/build-go-closure/x86_64-unknown-linux-gnu/libgo/gotest30365/test/value.go:412
> reflect_test.TestCallWithStruct
> 	GCCDIR/build-go-closure/x86_64-unknown-linux-gnu/libgo/gotest30365/test/all_test.go:1490
> testing.tRunner
> 	../../../libgo/go/testing/testing.go:422

Indeed.  libgo uses ffi_type_void to represent empty structures,
and libffi would crash for x86_64 when passing such parameters.

This does go back to an open bug report about how libffi handles
empty structures in general.

I've fixed this on the branch, and I'll push this through the
proper channels later.


r~

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [gofrontend-dev] Re: [PATCH 00/13] Go closures, libffi, and the static chain
  2014-12-11 12:25       ` Dominik Vogt
@ 2014-12-11 19:56         ` Richard Henderson
  2014-12-12 12:06           ` Dominik Vogt
  0 siblings, 1 reply; 43+ messages in thread
From: Richard Henderson @ 2014-12-11 19:56 UTC (permalink / raw)
  To: libffi-discuss, gcc-patches, gofrontend-dev; +Cc: Andreas Krebbel, Alan Modra

On 12/11/2014 04:25 AM, Dominik Vogt wrote:
> Update:  If I disable the custom s390x code and switch to the
> implementation just using libffi for reflection calls, the same
> crash occurs with the testing/quick libgo test case.  The called
> function sees a bogus value written by the dynamic linker as the
> closure pointer, for example with this line in the test code:
> 
>   CheckEqual(fComplex64, fComplex64, nil)

The compiler should be generating a static structure for these.
On x86_64, I see

Relocation section '.rela.rodata.testing_quick.fComplex64$descriptor' at offset
0x5d4c0 contains 1 entries:
  Offset          Info           Type           Sym. Value    Sym. Name + Addend
000000000000  000200000001 R_X86_64_64       0000000000000000 .text + c0

00000000000000c0 t quick.fComplex64

so that is in fact a direct relocation, and will not go via the dynamic linker.
 Is the s390 port somehow putting the address of a plt entry here?


r~

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [gofrontend-dev] Re: [PATCH 00/13] Go closures, libffi, and the static chain
  2014-12-11 19:56         ` Richard Henderson
@ 2014-12-12 12:06           ` Dominik Vogt
  2014-12-12 18:14             ` Richard Henderson
  0 siblings, 1 reply; 43+ messages in thread
From: Dominik Vogt @ 2014-12-12 12:06 UTC (permalink / raw)
  To: libffi-discuss, gcc-patches, gofrontend-dev

[-- Attachment #1: Type: text/plain, Size: 1620 bytes --]

On Thu, Dec 11, 2014 at 11:56:00AM -0800, Richard Henderson wrote:
> On 12/11/2014 04:25 AM, Dominik Vogt wrote:
> > Update:  If I disable the custom s390x code and switch to the
> > implementation just using libffi for reflection calls, the same
> > crash occurs with the testing/quick libgo test case.  The called
> > function sees a bogus value written by the dynamic linker as the
> > closure pointer, for example with this line in the test code:
> > 
> >   CheckEqual(fComplex64, fComplex64, nil)

>  Is the s390 port somehow putting the address of a plt entry here?

Digging through the test program with the debugger reveals that
the register corruption is not caused by dynamic linking.
Instead, libgo lacks a patch that is necessary for complex
support.  Without that, ffi_prep_args treats _Complex like a
struct with two elements (which it is not on s390[x]) and messes
up the layout of the stack arguments, eventually loading the wrong
values into the registers when the test function is called.  It
turns out that the bad value in r0 was just a red herring in this
case.

I'm not sure I've posted the missing patch anywhere yet, so it's
attached to this message.  At the moment it enables
FFI_TYPE_COMPLEX only for s390[x], but eventually this should be
used unconditionally.

--

(This still leaves the dynamic linking issue if we do not use
libffi for reflection calls with x86* and s390[x].  Is the plan to
remove the platform specific abi code for the few platforms that
have it?  I see no way to make them work with the static chain
patch anyway.)

Ciao

Dominik ^_^  ^_^

-- 

Dominik Vogt
IBM Germany

[-- Attachment #2: 0001-libgo-Enable-complex-number-support-from-libffi.patch --]
[-- Type: text/x-diff, Size: 2432 bytes --]

From 84235d9e7ba8a55dea182adc4007bfab6a35fb1f Mon Sep 17 00:00:00 2001
From: Dominik Vogt <vogt@de.ibm.com>
Date: Wed, 29 Oct 2014 09:08:01 +0100
Subject: [PATCH] libgo: Enable complex number support from libffi.

---
 libgo/runtime/go-ffi.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/libgo/runtime/go-ffi.c b/libgo/runtime/go-ffi.c
index 21879b9..42462d0 100644
--- a/libgo/runtime/go-ffi.c
+++ b/libgo/runtime/go-ffi.c
@@ -150,11 +150,26 @@ go_complex_to_ffi (ffi_type *float_type)
   ffi_type *ret;
 
   ret = (ffi_type *) __go_alloc (sizeof (ffi_type));
+  /* Use libffi with complex type support for targets that have it.  This should
+     be the case for all targets eventually, so the #else branch should then be
+     removed.  */
+#if defined (__s390__) && defined (FFI_TYPE_COMPLEX)
+  ret->type = FFI_TYPE_COMPLEX;
+  ret->size = 2 * float_type->size;
+  ret->alignment = float_type->alignment;
+  ret->elements = (ffi_type **) __go_alloc (2 * sizeof (ffi_type *));
+  ret->elements[0] = float_type;
+  ret->elements[1] = NULL;
+#else
+  /* Warning: This works only on platforms that define C _Complex types like
+     structures in their Abi.  */
   ret->type = FFI_TYPE_STRUCT;
   ret->elements = (ffi_type **) __go_alloc (3 * sizeof (ffi_type *));
   ret->elements[0] = float_type;
   ret->elements[1] = float_type;
   ret->elements[2] = NULL;
+#endif
+
   return ret;
 }
 
@@ -184,6 +199,9 @@ go_type_to_ffi (const struct __go_type_descriptor *descriptor)
 #ifdef __alpha__
       runtime_throw("the libffi library does not support Complex64 type with "
 		    "reflect.Call or runtime.SetFinalizer");
+#elif defined(__s390__) && !defined(FFI_TYPE_COMPLEX)
+      runtime_throw("the libffi library does not support Complex64 type with "
+		    "reflect.Call or runtime.SetFinalizer");
 #else
       if (sizeof (float) == 4)
 	return go_complex_to_ffi (&ffi_type_float);
@@ -193,6 +211,9 @@ go_type_to_ffi (const struct __go_type_descriptor *descriptor)
 #ifdef __alpha__
       runtime_throw("the libffi library does not support Complex128 type with "
 		    "reflect.Call or runtime.SetFinalizer");
+#elif defined(__s390__) && !defined(FFI_TYPE_COMPLEX)
+      runtime_throw("the libffi library does not support Complex128 type with "
+		    "reflect.Call or runtime.SetFinalizer");
 #else
       if (sizeof (double) == 8)
 	return go_complex_to_ffi (&ffi_type_double);
-- 
1.8.4.2


^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [gofrontend-dev] Re: [PATCH 00/13] Go closures, libffi, and the static chain
  2014-12-11  9:21   ` Alan Modra
  2014-12-11 10:31     ` [gofrontend-dev] " Dominik Vogt
@ 2014-12-12 13:57     ` Dominik Vogt
  1 sibling, 0 replies; 43+ messages in thread
From: Dominik Vogt @ 2014-12-12 13:57 UTC (permalink / raw)
  To: libffi-discuss, gcc-patches, gofrontend-dev; +Cc: Andreas Krebbel

[-- Attachment #1: Type: text/plain, Size: 611 bytes --]

On Thu, Dec 11, 2014 at 07:51:44PM +1030, Alan Modra wrote:
> I was worried about exactly the same "problem" on powerpc with r11
> being used for the static chain and also destroyed in linkage stubs.
> It turns out we don't traverse any linkage stubs.
> 
> See https://gcc.gnu.org/ml/gcc-patches/2014-11/msg00446.html.  

I've written a small test suite that tests reflection calls over
module boundaries (see attachment).  Build with "make" and then
just run "./main".  The program must not crash; it does not check
consistency of the function arguments.

Ciao

Dominik ^_^  ^_^

-- 

Dominik Vogt
IBM Germany

[-- Attachment #2: closure_reflect_tests.tgz --]
[-- Type: application/x-gtar-compressed, Size: 1101 bytes --]

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [gofrontend-dev] Re: [PATCH 00/13] Go closures, libffi, and the static chain
  2014-12-12 12:06           ` Dominik Vogt
@ 2014-12-12 18:14             ` Richard Henderson
  2014-12-15  9:42               ` Dominik Vogt
  0 siblings, 1 reply; 43+ messages in thread
From: Richard Henderson @ 2014-12-12 18:14 UTC (permalink / raw)
  To: libffi-discuss, gcc-patches, gofrontend-dev

On 12/12/2014 04:06 AM, Dominik Vogt wrote:
> I'm not sure I've posted the missing patch anywhere yet, so it's
> attached to this message.  At the moment it enables
> FFI_TYPE_COMPLEX only for s390[x], but eventually this should be
> used unconditionally.

Thanks for that.  I'd been meaning to get around to that.  I'll change the test
to use FFI_TARGET_HAS_COMPLEX_TYPE and apply it to my branch.

> (This still leaves the dynamic linking issue if we do not use
> libffi for reflection calls with x86* and s390[x].  Is the plan to
> remove the platform specific abi code for the few platforms that
> have it?  I see no way to make them work with the static chain
> patch anyway.)

Well, the x86 paths were updated to work with the static chain, but indeed that
required assembly rather than cheating and using C as you did.

But removing all of that was always my goal.  Indeed, my branch now has a patch
to remove all of the target-specific code.  Tested only on x86_64 so far, but I
plan to test i686 today.


r~

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [gofrontend-dev] Re: [PATCH 00/13] Go closures, libffi, and the static chain
  2014-12-12 18:14             ` Richard Henderson
@ 2014-12-15  9:42               ` Dominik Vogt
  2014-12-15 20:11                 ` Richard Henderson
  0 siblings, 1 reply; 43+ messages in thread
From: Dominik Vogt @ 2014-12-15  9:42 UTC (permalink / raw)
  To: libffi-discuss, gcc-patches, gofrontend-dev

On Fri, Dec 12, 2014 at 10:14:21AM -0800, Richard Henderson wrote:
> On 12/12/2014 04:06 AM, Dominik Vogt wrote:
> > I'm not sure I've posted the missing patch anywhere yet, so it's
> > attached to this message.  At the moment it enables
> > FFI_TYPE_COMPLEX only for s390[x], but eventually this should be
> > used unconditionally.
> 
> Thanks for that.  I'd been meaning to get around to that.  I'll change the test
> to use FFI_TARGET_HAS_COMPLEX_TYPE and apply it to my branch.

Good.  I'm not sure whether it's a good idea to expose
FFI_TARGET_HAS_COMPLEX_TYPE as part of the libffi interface
though.  It was meant as a temporary thing to be removed once all
platforms supported by libffi have implemented complex support.  A
while ago I posted a patch to change the macro's name to begin
with an underscore to make that clearer.

> > (This still leaves the dynamic linking issue if we do not use
> > libffi for reflection calls with x86* and s390[x].  Is the plan to
> > remove the platform specific abi code for the few platforms that
> > have it?  I see no way to make them work with the static chain
> > patch anyway.)
> 
> Well, the x86 paths were updated to work with the static chain, but indeed that
> required assembly rather than cheating and using C as you did.
> 
> But removing all of that was always my goal.  Indeed, my branch now has a patch
> to remove all of the target-specific code.

Fine with that.  I wouldn't have written the s390-specific ABI code
in Go if libffi had been an option back then.

Ciao

Dominik ^_^  ^_^

-- 

Dominik Vogt
IBM Germany

^ permalink raw reply	[flat|nested] 43+ messages in thread

* Re: [gofrontend-dev] Re: [PATCH 00/13] Go closures, libffi, and the static chain
  2014-12-15  9:42               ` Dominik Vogt
@ 2014-12-15 20:11                 ` Richard Henderson
  0 siblings, 0 replies; 43+ messages in thread
From: Richard Henderson @ 2014-12-15 20:11 UTC (permalink / raw)
  To: libffi-discuss, gcc-patches, gofrontend-dev

On 12/15/2014 03:42 AM, Dominik Vogt wrote:
> On Fri, Dec 12, 2014 at 10:14:21AM -0800, Richard Henderson wrote:
>> On 12/12/2014 04:06 AM, Dominik Vogt wrote:
>>> I'm not sure I've posted the missing patch anywhere yet, so it's
>>> attached to this message.  At the moment it enables
>>> FFI_TYPE_COMPLEX only for s390[x], but eventually this should be
>>> used unconditionally.
>>
>> Thanks for that.  I'd been meaning to get around to that.  I'll change the test
>> to use FFI_TARGET_HAS_COMPLEX_TYPE and apply it to my branch.
> 
> Good.  I'm not sure whether it's a good idea to expose
> FFI_TARGET_HAS_COMPLEX_TYPE as part of the libffi interface
> though.  It was meant as a temporary thing to be removed once all
> platforms supported by libffi have implemented complex support.  A
> while ago I've posted a patch to change the macro's name to begin
> with an underscore to make that clearer.

It's our copy of libffi -- I think we can assume any internals we like.

Similarly, when I finish writing the bits that allow libffi to
handle empty structures, I don't plan to conditionalize libgo,
I simply plan to assume it works.


r~

^ permalink raw reply	[flat|nested] 43+ messages in thread

end of thread, other threads:[~2014-12-15 20:11 UTC | newest]

Thread overview: 43+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-10-10 20:43 [PATCH 00/13] Go closures, libffi, and the static chain Richard Henderson
2014-10-10 20:43 ` [PATCH 04/13] Use the static chain as the closure parameter from Go Richard Henderson
2014-10-10 20:43 ` [PATCH 11/13] libffi: Support go closures on aarch64 Richard Henderson
2014-10-10 20:43 ` [PATCH 05/13] libgo: Use the static chain for the closure Richard Henderson
2014-10-10 20:43 ` [PATCH 12/13] libffi: Rewrite i386 sysv Richard Henderson
2014-10-10 20:43 ` [PATCH 10/13] libffi: Rewrite aarch64 Richard Henderson
2014-10-10 20:43 ` [PATCH 13/13] libffi: Support go closures on i386 Richard Henderson
2014-10-10 20:43 ` [PATCH 03/13] HACK! Allow the static chain to be set from C Richard Henderson
2014-10-11  0:33   ` Ian Lance Taylor
     [not found]     ` <CAMn1gO7vJOcNi218p9m32de_rrnKBrUcGF-EKP3dJwaL+8BtUw@mail.gmail.com>
2014-10-11  1:42       ` [gofrontend-dev] " Peter Collingbourne
2014-10-11  4:24         ` Richard Henderson
2014-10-13  8:10           ` Richard Biener
2014-10-13 18:46             ` Peter Collingbourne
2014-10-14 18:44   ` [PATCH v2 03/13] " Richard Henderson
2014-10-10 20:43 ` [PATCH 06/13] libffi: Add entry points for interacting with Go Richard Henderson
2014-10-10 20:43 ` [PATCH 01/13] Make TARGET_STATIC_CHAIN allow a function type Richard Henderson
2014-10-10 20:43 ` [PATCH 09/13] libgo: Remove __go_get/set_closure Richard Henderson
2014-10-10 20:43 ` [PATCH 08/13] libgo: Use the new libffi interfaces for Go Richard Henderson
2014-10-10 20:43 ` [PATCH 02/13] Allow the front-end to create calls with a static chain Richard Henderson
2014-10-10 20:43 ` [PATCH 07/13] libffi: Support go closures on x86_64 Richard Henderson
2014-10-11  0:23 ` [PATCH 00/13] Go closures, libffi, and the static chain Ian Lance Taylor
2014-11-05 21:34 ` Lynn A. Boger
2014-11-06  6:59   ` Richard Henderson
2014-11-06 12:48     ` Alan Modra
2014-11-06 13:04       ` Richard Henderson
2014-11-06 17:45         ` [gofrontend-dev] " Ian Taylor
2014-11-07  7:39           ` Richard Henderson
2014-11-07  8:50             ` Jay
2014-11-07 16:06             ` Ian Taylor
2014-11-07 23:55               ` Alan Modra
2014-11-06 13:10       ` Lynn A. Boger
2014-11-06 13:17         ` Richard Henderson
2014-12-11  9:06 ` Dominik Vogt
2014-12-11  9:21   ` Alan Modra
2014-12-11 10:31     ` [gofrontend-dev] " Dominik Vogt
2014-12-11 12:25       ` Dominik Vogt
2014-12-11 19:56         ` Richard Henderson
2014-12-12 12:06           ` Dominik Vogt
2014-12-12 18:14             ` Richard Henderson
2014-12-15  9:42               ` Dominik Vogt
2014-12-15 20:11                 ` Richard Henderson
2014-12-12 13:57     ` Dominik Vogt
2014-12-11 19:38   ` Richard Henderson

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).