From: Richard Henderson <rth@redhat.com>
To: gcc-patches@gcc.gnu.org
Cc: libffi-discuss@sourceware.org, gofrontend-dev@googlegroups.com
Subject: [PATCH 10/13] libffi: Rewrite aarch64
Date: Fri, 10 Oct 2014 20:43:00 -0000
Message-Id: <1412973773-3942-11-git-send-email-rth@redhat.com>
In-Reply-To: <1412973773-3942-1-git-send-email-rth@redhat.com>
References: <1412973773-3942-1-git-send-email-rth@redhat.com>

(1) Invent a new "internal.h" rather than polluting the public ffitarget.h with stuff that ought not be exposed.

(2) Rewrite is_hfa so that it is not so horribly computationally expensive, and, more to the point, so that it does not require us to re-compute the same data in order to actually do anything with the type.

(3) Don't use the outdated prep_args callback form for ffi_call. The x86_64 port has shown for years how to do this with a single alloca, but new ports keep copying i386, which still does it the inefficient way.
---
 libffi/src/aarch64/ffi.c       | 1362 +++++++++++++++-------------------
 libffi/src/aarch64/ffitarget.h |   17 +-
 libffi/src/aarch64/internal.h  |   43 ++
 libffi/src/aarch64/sysv.S      |  499 ++++++++-------
 4 files changed, 816 insertions(+), 1105 deletions(-)
 create mode 100644 libffi/src/aarch64/internal.h

diff --git a/libffi/src/aarch64/ffi.c b/libffi/src/aarch64/ffi.c
index 1405665..c409c0c 100644
--- a/libffi/src/aarch64/ffi.c
+++ b/libffi/src/aarch64/ffi.c
@@ -20,42 +20,37 @@
 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
 #include <stdio.h>
-
+#include <stdlib.h>
+#include <stdint.h>
 #include <ffi.h>
 #include <ffi_common.h>
+#include "internal.h"
-#include <stdlib.h>
-
-/* Stack alignment requirement in bytes */
+/* Stack alignment requirement in bytes. */
 #define AARCH64_STACK_ALIGN 16
+/* Number of X and V argument registers.
*/ #define N_X_ARG_REG 8 #define N_V_ARG_REG 8 -#define AARCH64_FFI_WITH_V (1 << AARCH64_FFI_WITH_V_BIT) - union _d { UINT64 d; UINT32 s[2]; }; -struct call_context +struct _v { - UINT64 x [AARCH64_N_XREG]; - struct - { - union _d d[2]; - } v [AARCH64_N_VREG]; + union _d d[2] __attribute__((aligned(16))); }; -static void * -get_x_addr (struct call_context *context, unsigned n) +struct call_context { - return &context->x[n]; -} + UINT64 x[N_X_ARG_REG]; + struct _v v[N_V_ARG_REG]; +}; -static void * +static inline UINT32 * get_s_addr (struct call_context *context, unsigned n) { #if defined __AARCH64EB__ @@ -65,557 +60,371 @@ get_s_addr (struct call_context *context, unsigned n) #endif } -static void * +static inline UINT64 * get_d_addr (struct call_context *context, unsigned n) { #if defined __AARCH64EB__ - return &context->v[n].d[1]; + return &context->v[n].d[1].d; #else - return &context->v[n].d[0]; + return &context->v[n].d[0].d; #endif } -static void * -get_v_addr (struct call_context *context, unsigned n) -{ - return &context->v[n]; -} - -/* Return the memory location at which a basic type would reside - were it to have been stored in register n. */ - -static void * -get_basic_type_addr (unsigned short type, struct call_context *context, - unsigned n) -{ - switch (type) - { - case FFI_TYPE_FLOAT: - return get_s_addr (context, n); - case FFI_TYPE_DOUBLE: - return get_d_addr (context, n); - case FFI_TYPE_LONGDOUBLE: - return get_v_addr (context, n); - case FFI_TYPE_UINT8: - case FFI_TYPE_SINT8: - case FFI_TYPE_UINT16: - case FFI_TYPE_SINT16: - case FFI_TYPE_UINT32: - case FFI_TYPE_SINT32: - case FFI_TYPE_INT: - case FFI_TYPE_POINTER: - case FFI_TYPE_UINT64: - case FFI_TYPE_SINT64: - return get_x_addr (context, n); - default: - FFI_ASSERT (0); - return NULL; - } -} - -/* Return the alignment width for each of the basic types. */ - -static size_t -get_basic_type_alignment (unsigned short type) -{ - switch (type) - { - case FFI_TYPE_FLOAT: - case FFI_TYPE_DOUBLE: - return sizeof (UINT64); - case FFI_TYPE_LONGDOUBLE: - return sizeof (long double); - case FFI_TYPE_UINT8: - case FFI_TYPE_SINT8: - case FFI_TYPE_UINT16: - case FFI_TYPE_SINT16: - case FFI_TYPE_UINT32: - case FFI_TYPE_INT: - case FFI_TYPE_SINT32: - case FFI_TYPE_POINTER: - case FFI_TYPE_UINT64: - case FFI_TYPE_SINT64: - return sizeof (UINT64); +extern void ffi_call_SYSV (void *frame, void *rvalue, + struct call_context *context, + unsigned flags, void (*fn)(void)) FFI_HIDDEN; - default: - FFI_ASSERT (0); - return 0; - } -} +extern void ffi_closure_SYSV (void) FFI_HIDDEN; +extern void ffi_closure_SYSV_V (void) FFI_HIDDEN; -/* Return the size in bytes for each of the basic types. */ +/* A subroutine of is_hfa. Given a structure type, return the type code + of the first non-structure element. Recurse for structure elements. + Return -1 if the structure is in fact empty, i.e. no nested elements. 
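(Illustration, not part of the patch: for an ffi_type describing struct { struct { float f; } a; float b; }, the first non-structure element is the nested float, so is_hfa0 yields FFI_TYPE_FLOAT; if the first member were itself an empty structure, its -1 result makes the scan continue on to the next element rather than break.)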
*/ -static size_t -get_basic_type_size (unsigned short type) +static int +is_hfa0 (const ffi_type *ty) { - switch (type) - { - case FFI_TYPE_FLOAT: - return sizeof (UINT32); - case FFI_TYPE_DOUBLE: - return sizeof (UINT64); - case FFI_TYPE_LONGDOUBLE: - return sizeof (long double); - case FFI_TYPE_UINT8: - return sizeof (UINT8); - case FFI_TYPE_SINT8: - return sizeof (SINT8); - case FFI_TYPE_UINT16: - return sizeof (UINT16); - case FFI_TYPE_SINT16: - return sizeof (SINT16); - case FFI_TYPE_UINT32: - return sizeof (UINT32); - case FFI_TYPE_INT: - case FFI_TYPE_SINT32: - return sizeof (SINT32); - case FFI_TYPE_POINTER: - case FFI_TYPE_UINT64: - return sizeof (UINT64); - case FFI_TYPE_SINT64: - return sizeof (SINT64); - - default: - FFI_ASSERT (0); - return 0; - } -} + ffi_type **elements = ty->elements; + int i, ret = -1; -extern void -ffi_call_SYSV (unsigned (*)(struct call_context *context, unsigned char *, - extended_cif *), - struct call_context *context, - extended_cif *, - unsigned, - void (*fn)(void)); - -extern void -ffi_closure_SYSV (ffi_closure *); - -/* Test for an FFI floating point representation. */ + if (elements != NULL) + for (i = 0; elements[i]; ++i) + { + ret = elements[i]->type; + if (ret == FFI_TYPE_STRUCT) + { + ret = is_hfa0 (elements[i]); + if (ret < 0) + continue; + } + break; + } -static unsigned -is_floating_type (unsigned short type) -{ - return (type == FFI_TYPE_FLOAT || type == FFI_TYPE_DOUBLE - || type == FFI_TYPE_LONGDOUBLE); + return ret; } -/* Test for a homogeneous structure. */ +/* A subroutine of is_hfa. Given a structure type, return true if all + of the non-structure elements are the same as CANDIDATE. */ -static unsigned short -get_homogeneous_type (ffi_type *ty) +static int +is_hfa1 (const ffi_type *ty, int candidate) { - if (ty->type == FFI_TYPE_STRUCT && ty->elements) - { - unsigned i; - unsigned short candidate_type - = get_homogeneous_type (ty->elements[0]); - for (i =1; ty->elements[i]; i++) - { - unsigned short iteration_type = 0; - /* If we have a nested struct, we must find its homogeneous type. - If that fits with our candidate type, we are still - homogeneous. */ - if (ty->elements[i]->type == FFI_TYPE_STRUCT - && ty->elements[i]->elements) - { - iteration_type = get_homogeneous_type (ty->elements[i]); - } - else - { - iteration_type = ty->elements[i]->type; - } + ffi_type **elements = ty->elements; + int i; - /* If we are not homogeneous, return FFI_TYPE_STRUCT. */ - if (candidate_type != iteration_type) - return FFI_TYPE_STRUCT; - } - return candidate_type; - } + if (elements != NULL) + for (i = 0; elements[i]; ++i) + { + int t = elements[i]->type; + if (t == FFI_TYPE_STRUCT) + { + if (!is_hfa1 (elements[i], candidate)) + return 0; + } + else if (t != candidate) + return 0; + } - /* Base case, we have no more levels of nesting, so we - are a basic type, and so, trivially homogeneous in that type. */ - return ty->type; + return 1; } -/* Determine the number of elements within a STRUCT. +/* Determine if TY is a homogeneous floating point aggregate (HFA). + That is, a structure consisting of 1 to 4 members, all of the same type, + where that type is a floating point scalar. - Note, we must handle nested structs. + Returns non-zero iff TY is an HFA. The result is an encoded value where + bits 0-7 contain the type code, and bits 8-10 contain the element count. - If ty is not a STRUCT this function will return 0. 
*/ - -static unsigned -element_count (ffi_type *ty) +static int +is_hfa(const ffi_type *ty) { - if (ty->type == FFI_TYPE_STRUCT && ty->elements) - { - unsigned n; - unsigned elems = 0; - for (n = 0; ty->elements[n]; n++) - { - if (ty->elements[n]->type == FFI_TYPE_STRUCT - && ty->elements[n]->elements) - elems += element_count (ty->elements[n]); - else - elems++; - } - return elems; - } - return 0; -} + ffi_type **elements; + int candidate, i; + size_t size, ele_count; -/* Test for a homogeneous floating point aggregate. + /* Quickest tests first. */ + if (ty->type != FFI_TYPE_STRUCT) + return 0; - A homogeneous floating point aggregate is a homogeneous aggregate of - a half- single- or double- precision floating point type with one - to four elements. Note that this includes nested structs of the - basic type. */ + /* No HFA types are smaller than 4 bytes, or larger than 64 bytes. */ + size = ty->size; + if (size < 4 || size > 64) + return 0; -static int -is_hfa (ffi_type *ty) -{ - if (ty->type == FFI_TYPE_STRUCT - && ty->elements[0] - && is_floating_type (get_homogeneous_type (ty))) + /* Find the type of the first non-structure member. */ + elements = ty->elements; + candidate = elements[0]->type; + if (candidate == FFI_TYPE_STRUCT) { - unsigned n = element_count (ty); - return n >= 1 && n <= 4; + for (i = 0; ; ++i) + { + candidate = is_hfa0 (elements[i]); + if (candidate >= 0) + break; + } } - return 0; -} - -/* Test if an ffi_type is a candidate for passing in a register. - - This test does not check that sufficient registers of the - appropriate class are actually available, merely that IFF - sufficient registers are available then the argument will be passed - in register(s). - - Note that an ffi_type that is deemed to be a register candidate - will always be returned in registers. - Returns 1 if a register candidate else 0. */ - -static int -is_register_candidate (ffi_type *ty) -{ - switch (ty->type) + /* If the first member is not a floating point type, it's not an HFA. + Also quickly re-check the size of the structure. */ + switch (candidate) { - case FFI_TYPE_VOID: case FFI_TYPE_FLOAT: + ele_count = size / sizeof(float); + if (size != ele_count * sizeof(float)) + return 0; + break; case FFI_TYPE_DOUBLE: + ele_count = size / sizeof(double); + if (size != ele_count * sizeof(double)) + return 0; + break; case FFI_TYPE_LONGDOUBLE: - case FFI_TYPE_UINT8: - case FFI_TYPE_UINT16: - case FFI_TYPE_UINT32: - case FFI_TYPE_UINT64: - case FFI_TYPE_POINTER: - case FFI_TYPE_SINT8: - case FFI_TYPE_SINT16: - case FFI_TYPE_SINT32: - case FFI_TYPE_INT: - case FFI_TYPE_SINT64: - return 1; - - case FFI_TYPE_STRUCT: - if (is_hfa (ty)) - { - return 1; - } - else if (ty->size > 16) - { - /* Too large. Will be replaced with a pointer to memory. The - pointer MAY be passed in a register, but the value will - not. This test specifically fails since the argument will - never be passed by value in registers. */ - return 0; - } - else - { - /* Might be passed in registers depending on the number of - registers required. */ - return (ty->size + 7) / 8 < N_X_ARG_REG; - } + ele_count = size / sizeof(long double); + if (size != ele_count * sizeof(long double)) + return 0; break; - default: - FFI_ASSERT (0); - break; + return 0; } + if (ele_count > 4) + return 0; - return 0; -} - -/* Test if an ffi_type argument or result is a candidate for a vector - register. 
*/ - -static int -is_v_register_candidate (ffi_type *ty) -{ - return is_floating_type (ty->type) - || (ty->type == FFI_TYPE_STRUCT && is_hfa (ty)); -} - -/* Representation of the procedure call argument marshalling - state. - - The terse state variable names match the names used in the AARCH64 - PCS. */ - -struct arg_state -{ - unsigned ngrn; /* Next general-purpose register number. */ - unsigned nsrn; /* Next vector register number. */ - unsigned nsaa; /* Next stack offset. */ -}; - -/* Initialize a procedure call argument marshalling state. */ -static void -arg_init (struct arg_state *state, unsigned call_frame_size) -{ - state->ngrn = 0; - state->nsrn = 0; - state->nsaa = 0; -} - -/* Return the number of available consecutive core argument - registers. */ - -static unsigned -available_x (struct arg_state *state) -{ - return N_X_ARG_REG - state->ngrn; -} - -/* Return the number of available consecutive vector argument - registers. */ - -static unsigned -available_v (struct arg_state *state) -{ - return N_V_ARG_REG - state->nsrn; -} - -static void * -allocate_to_x (struct call_context *context, struct arg_state *state) -{ - FFI_ASSERT (state->ngrn < N_X_ARG_REG) - return get_x_addr (context, (state->ngrn)++); -} - -static void * -allocate_to_s (struct call_context *context, struct arg_state *state) -{ - FFI_ASSERT (state->nsrn < N_V_ARG_REG) - return get_s_addr (context, (state->nsrn)++); -} - -static void * -allocate_to_d (struct call_context *context, struct arg_state *state) -{ - FFI_ASSERT (state->nsrn < N_V_ARG_REG) - return get_d_addr (context, (state->nsrn)++); -} - -static void * -allocate_to_v (struct call_context *context, struct arg_state *state) -{ - FFI_ASSERT (state->nsrn < N_V_ARG_REG) - return get_v_addr (context, (state->nsrn)++); -} - -/* Allocate an aligned slot on the stack and return a pointer to it. */ -static void * -allocate_to_stack (struct arg_state *state, void *stack, unsigned alignment, - unsigned size) -{ - void *allocation; - - /* Round up the NSAA to the larger of 8 or the natural - alignment of the argument's type. */ - state->nsaa = ALIGN (state->nsaa, alignment); - state->nsaa = ALIGN (state->nsaa, alignment); - state->nsaa = ALIGN (state->nsaa, 8); - - allocation = stack + state->nsaa; + /* Finally, make sure that all scalar elements are the same type. */ + for (i = 0; elements[i]; ++i) + { + if (elements[i]->type == FFI_TYPE_STRUCT) + { + if (!is_hfa1 (elements[i], candidate)) + return 0; + } + else if (elements[i]->type != candidate) + return 0; + } - state->nsaa += size; - return allocation; + /* All tests succeeded. Encode the result. */ + return (ele_count << 8) | candidate; } -static void -copy_basic_type (void *dest, void *source, unsigned short type) +/* Extend a basic type to fill a 64-bit slot. */ +static UINT64 +extend_basic_type (UINT64 ret, unsigned short type) { - /* This is neccessary to ensure that basic types are copied - sign extended to 64-bits as libffi expects. 
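(Illustration, not part of the patch: read back through a 64-bit ffi_arg, a SINT8 holding -1 must appear as 0xffffffffffffffff, while a UINT8 holding 0xff must appear as 0x00000000000000ff; the new extend_basic_type below performs the same widening on a value already loaded into a 64-bit slot.)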
*/ switch (type) { case FFI_TYPE_FLOAT: - *(float *) dest = *(float *) source; - break; - case FFI_TYPE_DOUBLE: - *(double *) dest = *(double *) source; - break; - case FFI_TYPE_LONGDOUBLE: - *(long double *) dest = *(long double *) source; + ret = (UINT32)ret; +#if defined __AARCH64EB__ + ret <<= 32; +#endif break; case FFI_TYPE_UINT8: - *(ffi_arg *) dest = *(UINT8 *) source; + ret = (UINT8)ret; break; case FFI_TYPE_SINT8: - *(ffi_sarg *) dest = *(SINT8 *) source; + ret = (SINT8)ret; break; case FFI_TYPE_UINT16: - *(ffi_arg *) dest = *(UINT16 *) source; + ret = (UINT16)ret; break; case FFI_TYPE_SINT16: - *(ffi_sarg *) dest = *(SINT16 *) source; + ret = (SINT16)ret; break; case FFI_TYPE_UINT32: - *(ffi_arg *) dest = *(UINT32 *) source; + ret = (UINT32)ret; break; case FFI_TYPE_INT: case FFI_TYPE_SINT32: - *(ffi_sarg *) dest = *(SINT32 *) source; + ret = (SINT32)ret; break; - case FFI_TYPE_POINTER: + case FFI_TYPE_DOUBLE: case FFI_TYPE_UINT64: - *(ffi_arg *) dest = *(UINT64 *) source; - break; case FFI_TYPE_SINT64: - *(ffi_sarg *) dest = *(SINT64 *) source; break; - + case FFI_TYPE_POINTER: + ret = (uintptr_t)ret; + break; default: - FFI_ASSERT (0); + abort (); } + return ret; } -static void -copy_hfa_to_reg_or_stack (void *memory, - ffi_type *ty, - struct call_context *context, - unsigned char *stack, - struct arg_state *state) +ffi_status FFI_HIDDEN +ffi_prep_cif_machdep (ffi_cif *cif) { - unsigned elems = element_count (ty); - if (available_v (state) < elems) - { - /* There are insufficient V registers. Further V register allocations - are prevented, the NSAA is adjusted (by allocate_to_stack ()) - and the argument is copied to memory at the adjusted NSAA. */ - state->nsrn = N_V_ARG_REG; - memcpy (allocate_to_stack (state, stack, ty->alignment, ty->size), - memory, - ty->size); - } - else - { - int i; - unsigned short type = get_homogeneous_type (ty); - unsigned elems = element_count (ty); - for (i = 0; i < elems; i++) - { - void *reg = allocate_to_v (context, state); - copy_basic_type (reg, memory, type); - memory += get_basic_type_size (type); - } - } -} + int flags, h, i; + ffi_type *rtype; -/* Either allocate an appropriate register for the argument type, or if - none are available, allocate a stack slot and return a pointer - to the allocated space. */ + /* Round the stack up to a multiple of the stack alignment requirement. */ + cif->bytes = ALIGN (cif->bytes, AARCH64_STACK_ALIGN); -static void * -allocate_to_register_or_stack (struct call_context *context, - unsigned char *stack, - struct arg_state *state, - unsigned short type) -{ - size_t alignment = get_basic_type_alignment (type); - size_t size = alignment; - switch (type) + rtype = cif->rtype; + switch (rtype->type) { - case FFI_TYPE_FLOAT: - /* This is the only case for which the allocated stack size - should not match the alignment of the type. */ - size = sizeof (UINT32); - /* Fall through. 
*/ - case FFI_TYPE_DOUBLE: - if (state->nsrn < N_V_ARG_REG) - return allocate_to_d (context, state); - state->nsrn = N_V_ARG_REG; - break; - case FFI_TYPE_LONGDOUBLE: - if (state->nsrn < N_V_ARG_REG) - return allocate_to_v (context, state); - state->nsrn = N_V_ARG_REG; + case FFI_TYPE_VOID: + flags = AARCH64_RET_VOID; break; case FFI_TYPE_UINT8: - case FFI_TYPE_SINT8: case FFI_TYPE_UINT16: - case FFI_TYPE_SINT16: case FFI_TYPE_UINT32: - case FFI_TYPE_SINT32: + flags = AARCH64_RET_UINT32; + break; case FFI_TYPE_INT: - case FFI_TYPE_POINTER: + case FFI_TYPE_SINT8: + case FFI_TYPE_SINT16: + case FFI_TYPE_SINT32: + flags = AARCH64_RET_SINT32; + break; case FFI_TYPE_UINT64: case FFI_TYPE_SINT64: - if (state->ngrn < N_X_ARG_REG) - return allocate_to_x (context, state); - state->ngrn = N_X_ARG_REG; + flags = AARCH64_RET_INT64; + break; + case FFI_TYPE_POINTER: + flags = (sizeof(void *) == 8 ? AARCH64_RET_INT64 : AARCH64_RET_UINT32); + break; + case FFI_TYPE_FLOAT: + flags = AARCH64_RET_FLOAT; + break; + case FFI_TYPE_DOUBLE: + flags = AARCH64_RET_DOUBLE; + break; + case FFI_TYPE_LONGDOUBLE: + flags = AARCH64_RET_LDOUBLE; + break; + case FFI_TYPE_STRUCT: + h = is_hfa (rtype); + switch (h & 0xff) + { + case FFI_TYPE_FLOAT: + flags = AARCH64_RET_HFA_FLOAT; + break; + case FFI_TYPE_DOUBLE: + flags = AARCH64_RET_HFA_DOUBLE; + break; + case FFI_TYPE_LONGDOUBLE: + flags = AARCH64_RET_HFA_LDOUBLE; + break; + default: + flags = (rtype->size > 16 + ? AARCH64_RET_LG_STRUCT + : AARCH64_RET_SM_STRUCT); + break; + } break; default: - FFI_ASSERT (0); + abort (); } - return allocate_to_stack (state, stack, alignment, size); -} - -/* Copy a value to an appropriate register, or if none are - available, to the stack. */ + /* Note if any argument requires fp registers. */ + for (i = 0; i < cif->nargs; i++) + { + ffi_type *ty = cif->arg_types[i]; + int tt = ty->type; + if (tt == FFI_TYPE_FLOAT + || tt == FFI_TYPE_DOUBLE + || tt == FFI_TYPE_LONGDOUBLE + || is_hfa (ty)) + { + flags |= AARCH64_FLAG_ARG_V; + break; + } + } -static void -copy_to_register_or_stack (struct call_context *context, - unsigned char *stack, - struct arg_state *state, - void *value, - unsigned short type) -{ - copy_basic_type ( - allocate_to_register_or_stack (context, stack, state, type), - value, - type); + cif->flags = flags; + return FFI_OK; } -/* Marshall the arguments from FFI representation to procedure call - context and stack. */ - -static unsigned -aarch64_prep_args (struct call_context *context, unsigned char *stack, - extended_cif *ecif) +/* Call a function with the provided arguments and capture the return + value. */ +void +ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue) { - int i; - struct arg_state state; + struct call_context *context; + UINT64 *stack, *slot; + void *frame, *local_rvalue; + ffi_type **arg_types; + int i, h, nargs, ngrn, nsrn, nsaa; + size_t size, stack_space, ret_space; - arg_init (&state, ALIGN(ecif->cif->bytes, 16)); + FFI_ASSERT (cif->abi == FFI_SYSV); - for (i = 0; i < ecif->cif->nargs; i++) + ret_space = 0; + h = cif->flags & AARCH64_FLAG_RET_MASK; + switch (h) { - ffi_type *ty = ecif->cif->arg_types[i]; - switch (ty->type) - { - case FFI_TYPE_VOID: - FFI_ASSERT (0); - break; + case AARCH64_RET_HFA_FLOAT: + /* The assembly always writes 4 elements. 
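(Illustration, not part of the patch: a two-float HFA occupies only 8 bytes, yet the st4 store in sysv.S writes all four lanes, i.e. 4 * sizeof(float) = 16 bytes; bouncing through this oversized temporary keeps the callee's store from overrunning a caller-supplied 8-byte rvalue.)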
*/ + ret_space = 4 * sizeof(float); + break; + case AARCH64_RET_HFA_DOUBLE: + ret_space = 4 * sizeof(double); + break; + case AARCH64_RET_HFA_LDOUBLE: + ret_space = 4 * sizeof(long double); + break; + case AARCH64_RET_SM_STRUCT: + ret_space = 16; + break; + case AARCH64_RET_LG_STRUCT: + if (rvalue == NULL) + ret_space = cif->rtype->size; + break; + } + /* Allocate the space for all of the arguments, the context, the local + stack frame for ffi_call_SYSV, and (possibly) the return value. */ + stack_space = ALIGN (cif->bytes, 16); + context = alloca (sizeof(struct call_context) + + stack_space + + 4 * sizeof(UINT64) + + ret_space); + stack = (UINT64 *)(context + 1); + frame = (char *)stack + stack_space; + + local_rvalue = rvalue; + if (ret_space) + local_rvalue = (char *)frame + 4 * sizeof(UINT64); + + ngrn = nsrn = nsaa = 0; + arg_types = cif->arg_types; + nargs = cif->nargs; + + for (i = 0; i < nargs; i++) + { + ffi_type *ty = arg_types[i]; + unsigned short t = ty->type; + + switch (t) + { /* If the argument is a basic type the argument is allocated to an appropriate register, or if none are available, to the stack. */ case FFI_TYPE_FLOAT: + if (nsrn < N_V_ARG_REG) + slot = get_d_addr (context, nsrn++); + else + slot = &stack[nsaa++]; + *slot = extend_basic_type (*(UINT32 *)avalue[i], t); + break; case FFI_TYPE_DOUBLE: + if (nsrn < N_V_ARG_REG) + slot = get_d_addr (context, nsrn++); + else + slot = &stack[nsaa++]; + *slot = extend_basic_type (*(UINT64 *)avalue[i], t); + break; + case FFI_TYPE_LONGDOUBLE: + if (nsrn < N_V_ARG_REG) + slot = &context->v[nsrn++].d[0].d; + else + { + nsaa = ALIGN (nsaa, 2); + slot = &stack[nsaa]; + nsaa += 2; + } + memcpy (slot, avalue[i], sizeof(long double)); + break; + case FFI_TYPE_UINT8: case FFI_TYPE_SINT8: case FFI_TYPE_UINT16: @@ -626,207 +435,111 @@ aarch64_prep_args (struct call_context *context, unsigned char *stack, case FFI_TYPE_POINTER: case FFI_TYPE_UINT64: case FFI_TYPE_SINT64: - copy_to_register_or_stack (context, stack, &state, - ecif->avalue[i], ty->type); + if (ngrn < N_X_ARG_REG) + slot = &context->x[ngrn++]; + else + slot = &stack[nsaa++]; + *slot = extend_basic_type (*(ffi_arg *)avalue[i], t); break; + case FFI_TYPE_VOID: + /* Note that libgo passes void as a parameter for a + struct with no fields. */ case FFI_TYPE_STRUCT: - if (is_hfa (ty)) - { - copy_hfa_to_reg_or_stack (ecif->avalue[i], ty, context, - stack, &state); - } - else if (ty->size > 16) - { - /* If the argument is a composite type that is larger than 16 - bytes, then the argument has been copied to memory, and - the argument is replaced by a pointer to the copy. */ + { + size_t slot_count; - copy_to_register_or_stack (context, stack, &state, - &(ecif->avalue[i]), FFI_TYPE_POINTER); - } - else if (available_x (&state) >= (ty->size + 7) / 8) - { - /* If the argument is a composite type and the size in - double-words is not more than the number of available - X registers, then the argument is copied into consecutive - X registers. */ - int j; - for (j = 0; j < (ty->size + 7) / 8; j++) - { - memcpy (allocate_to_x (context, &state), - &(((UINT64 *) ecif->avalue[i])[j]), - sizeof (UINT64)); - } - } - else - { - /* Otherwise, there are insufficient X registers. Further X - register allocations are prevented, the NSAA is adjusted - (by allocate_to_stack ()) and the argument is copied to - memory at the adjusted NSAA. 
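(Illustration, not part of the patch: with ngrn == 7, a 16-byte struct would need two X registers; under the AAPCS64 an aggregate is never split between registers and the stack, so the whole struct goes to memory and ngrn is pinned at 8 to block any later X allocation.)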
*/ - state.ngrn = N_X_ARG_REG; - - memcpy (allocate_to_stack (&state, stack, ty->alignment, - ty->size), ecif->avalue + i, ty->size); - } + size = ty->size; + slot_count = (size + 7) / 8; + h = is_hfa (ty); + if (h) + { + int j, reg_count = h >> 8, tt = h & 0xff; + + if (nsrn + reg_count <= N_V_ARG_REG) + { + switch (tt) + { + case FFI_TYPE_FLOAT: + { + UINT32 *src = avalue[i]; + for (j = 0; j < reg_count; ++j) + *get_s_addr (context, nsrn + j) = src[j]; + } + break; + + case FFI_TYPE_DOUBLE: + { + UINT64 *src = avalue[i]; + for (j = 0; j < reg_count; ++j) + *get_d_addr (context, nsrn + j) = src[j]; + } + break; + + case FFI_TYPE_LONGDOUBLE: + memcpy(&context->v[nsrn], avalue[i], size); + break; + + default: + abort (); + } + nsrn += reg_count; + break; + } + /* All out of fp registers. Copy to the stack. */ + nsrn = N_V_ARG_REG; + } + else if (size > 16) + { + /* If the argument is a composite type that is larger than + 16 bytes, then the argument has been copied to memory, + and the argument is replaced by a pointer. */ + if (ngrn < N_X_ARG_REG) + slot = &context->x[ngrn++]; + else + slot = &stack[nsaa++]; + *slot = (uintptr_t)avalue[i]; + break; + } + else + { + if (ty->alignment == 16) + ngrn = ALIGN (ngrn, 2); + + if (ngrn + slot_count <= N_X_ARG_REG) + { + slot = &context->x[ngrn]; + ngrn += slot_count; + memcpy (slot, avalue[i], size); + break; + } + /* All out of general registers. Copy to the stack. */ + ngrn = N_X_ARG_REG; + } + if (ty->alignment > 8) + { + int a = ty->alignment / 8; + nsaa = ALIGN (nsaa, a); + } + memcpy (&stack[nsaa], avalue[i], size); + nsaa += slot_count; + } break; default: - FFI_ASSERT (0); + abort (); break; } } - return ecif->cif->aarch64_flags; + size = cif->rtype->size; + ffi_call_SYSV (frame, local_rvalue, context, cif->flags, fn); + if (local_rvalue != rvalue && rvalue != NULL) + memcpy (rvalue, local_rvalue, size); } -ffi_status -ffi_prep_cif_machdep (ffi_cif *cif) -{ - /* Round the stack up to a multiple of the stack alignment requirement. */ - cif->bytes = - (cif->bytes + (AARCH64_STACK_ALIGN - 1)) & ~ (AARCH64_STACK_ALIGN - 1); - - /* Initialize our flags. We are interested if this CIF will touch a - vector register, if so we will enable context save and load to - those registers, otherwise not. This is intended to be friendly - to lazy float context switching in the kernel. */ - cif->aarch64_flags = 0; - - if (is_v_register_candidate (cif->rtype)) - { - cif->aarch64_flags |= AARCH64_FFI_WITH_V; - } - else - { - int i; - for (i = 0; i < cif->nargs; i++) - if (is_v_register_candidate (cif->arg_types[i])) - { - cif->aarch64_flags |= AARCH64_FFI_WITH_V; - break; - } - } - - return FFI_OK; -} - -/* Call a function with the provided arguments and capture the return - value. 
*/ -void -ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue) -{ - extended_cif ecif; - - ecif.cif = cif; - ecif.avalue = avalue; - ecif.rvalue = rvalue; - - switch (cif->abi) - { - case FFI_SYSV: - { - struct call_context context; - unsigned stack_bytes; - - /* Figure out the total amount of stack space we need, the - above call frame space needs to be 16 bytes aligned to - ensure correct alignment of the first object inserted in - that space hence the ALIGN applied to cif->bytes.*/ - stack_bytes = ALIGN(cif->bytes, 16); - - memset (&context, 0, sizeof (context)); - if (is_register_candidate (cif->rtype)) - { - ffi_call_SYSV (aarch64_prep_args, &context, &ecif, stack_bytes, fn); - switch (cif->rtype->type) - { - case FFI_TYPE_VOID: - case FFI_TYPE_FLOAT: - case FFI_TYPE_DOUBLE: - case FFI_TYPE_LONGDOUBLE: - case FFI_TYPE_UINT8: - case FFI_TYPE_SINT8: - case FFI_TYPE_UINT16: - case FFI_TYPE_SINT16: - case FFI_TYPE_UINT32: - case FFI_TYPE_SINT32: - case FFI_TYPE_POINTER: - case FFI_TYPE_UINT64: - case FFI_TYPE_INT: - case FFI_TYPE_SINT64: - { - void *addr = get_basic_type_addr (cif->rtype->type, - &context, 0); - copy_basic_type (rvalue, addr, cif->rtype->type); - break; - } - - case FFI_TYPE_STRUCT: - if (is_hfa (cif->rtype)) - { - int j; - unsigned short type = get_homogeneous_type (cif->rtype); - unsigned elems = element_count (cif->rtype); - for (j = 0; j < elems; j++) - { - void *reg = get_basic_type_addr (type, &context, j); - copy_basic_type (rvalue, reg, type); - rvalue += get_basic_type_size (type); - } - } - else if ((cif->rtype->size + 7) / 8 < N_X_ARG_REG) - { - unsigned size = ALIGN (cif->rtype->size, sizeof (UINT64)); - memcpy (rvalue, get_x_addr (&context, 0), size); - } - else - { - FFI_ASSERT (0); - } - break; - - default: - FFI_ASSERT (0); - break; - } - } - else - { - memcpy (get_x_addr (&context, 8), &rvalue, sizeof (UINT64)); - ffi_call_SYSV (aarch64_prep_args, &context, &ecif, - stack_bytes, fn); - } - break; - } - - default: - FFI_ASSERT (0); - break; - } -} - -static unsigned char trampoline [] = -{ 0x70, 0x00, 0x00, 0x58, /* ldr x16, 1f */ - 0x91, 0x00, 0x00, 0x10, /* adr x17, 2f */ - 0x00, 0x02, 0x1f, 0xd6 /* br x16 */ -}; - /* Build a trampoline. */ -#define FFI_INIT_TRAMPOLINE(TRAMP,FUN,CTX,FLAGS) \ - ({unsigned char *__tramp = (unsigned char*)(TRAMP); \ - UINT64 __fun = (UINT64)(FUN); \ - UINT64 __ctx = (UINT64)(CTX); \ - UINT64 __flags = (UINT64)(FLAGS); \ - memcpy (__tramp, trampoline, sizeof (trampoline)); \ - memcpy (__tramp + 12, &__fun, sizeof (__fun)); \ - memcpy (__tramp + 20, &__ctx, sizeof (__ctx)); \ - memcpy (__tramp + 28, &__flags, sizeof (__flags)); \ - __clear_cache(__tramp, __tramp + FFI_TRAMPOLINE_SIZE); \ - }) - ffi_status ffi_prep_closure_loc (ffi_closure* closure, ffi_cif* cif, @@ -834,15 +547,29 @@ ffi_prep_closure_loc (ffi_closure* closure, void *user_data, void *codeloc) { + static const unsigned char trampoline[16] = { + 0x90, 0x00, 0x00, 0x58, /* ldr x16, 16 */ + 0xf1, 0xff, 0xff, 0x10, /* adr x17, 0 */ + 0x00, 0x02, 0x1f, 0xd6, /* br x16 */ + }; + char *tramp = &closure->tramp[0]; + void (*entry)(void); + if (cif->abi != FFI_SYSV) return FFI_BAD_ABI; - FFI_INIT_TRAMPOLINE (&closure->tramp[0], &ffi_closure_SYSV, codeloc, - cif->aarch64_flags); + entry = (cif->flags & AARCH64_FLAG_ARG_V + ? 
ffi_closure_SYSV_V : ffi_closure_SYSV); + + memcpy (tramp, trampoline, sizeof(trampoline)); + + *(UINT64 *)(tramp + 16) = (uintptr_t)entry; - closure->cif = cif; + closure->cif = cif; + closure->fun = fun; closure->user_data = user_data; - closure->fun = fun; + + __clear_cache (tramp, tramp + sizeof(trampoline)); return FFI_OK; } @@ -863,26 +590,33 @@ ffi_prep_closure_loc (ffi_closure* closure, desriptors, invokes the wrapped function, then marshalls the return value back into the call context. */ -void -ffi_closure_SYSV_inner (ffi_closure *closure, struct call_context *context, - void *stack) +UINT64 FFI_HIDDEN +ffi_closure_SYSV_inner (ffi_cif *cif, + void (*fun)(ffi_cif*,void*,void**,void*), + void *user_data, + struct call_context *context, + UINT64 *stack, void *rvalue) { - ffi_cif *cif = closure->cif; void **avalue = (void**) alloca (cif->nargs * sizeof (void*)); - void *rvalue = NULL; - int i; - struct arg_state state; - - arg_init (&state, ALIGN(cif->bytes, 16)); - - for (i = 0; i < cif->nargs; i++) + ffi_type **arg_types; + int i, nargs, h, ngrn, nsrn, nsaa; + size_t size; + + ngrn = nsrn = nsaa = 0; + arg_types = cif->arg_types; + nargs = cif->nargs; + + for (i = 0; i < nargs; i++) { - ffi_type *ty = cif->arg_types[i]; + ffi_type *ty = arg_types[i]; + int t = ty->type; + void *slot; - switch (ty->type) + switch (t) { case FFI_TYPE_VOID: - FFI_ASSERT (0); + /* ??? abort */ + slot = NULL; break; case FFI_TYPE_UINT8: @@ -895,182 +629,128 @@ ffi_closure_SYSV_inner (ffi_closure *closure, struct call_context *context, case FFI_TYPE_POINTER: case FFI_TYPE_UINT64: case FFI_TYPE_SINT64: - case FFI_TYPE_FLOAT: - case FFI_TYPE_DOUBLE: - case FFI_TYPE_LONGDOUBLE: - avalue[i] = allocate_to_register_or_stack (context, stack, - &state, ty->type); + if (ngrn < N_X_ARG_REG) + slot = &context->x[ngrn++]; + else + slot = &stack[nsaa++]; + *(ffi_arg *)slot = extend_basic_type (*(UINT64 *)slot, t); break; - case FFI_TYPE_STRUCT: - if (is_hfa (ty)) - { - unsigned n = element_count (ty); - if (available_v (&state) < n) - { - state.nsrn = N_V_ARG_REG; - avalue[i] = allocate_to_stack (&state, stack, ty->alignment, - ty->size); - } - else - { - switch (get_homogeneous_type (ty)) - { - case FFI_TYPE_FLOAT: - { - /* Eeek! We need a pointer to the structure, - however the homogeneous float elements are - being passed in individual S registers, - therefore the structure is not represented as - a contiguous sequence of bytes in our saved - register context. We need to fake up a copy - of the structure layed out in memory - correctly. The fake can be tossed once the - closure function has returned hence alloca() - is sufficient. */ - int j; - UINT32 *p = avalue[i] = alloca (ty->size); - for (j = 0; j < element_count (ty); j++) - memcpy (&p[j], - allocate_to_s (context, &state), - sizeof (*p)); - break; - } - - case FFI_TYPE_DOUBLE: - { - /* Eeek! We need a pointer to the structure, - however the homogeneous float elements are - being passed in individual S registers, - therefore the structure is not represented as - a contiguous sequence of bytes in our saved - register context. We need to fake up a copy - of the structure layed out in memory - correctly. The fake can be tossed once the - closure function has returned hence alloca() - is sufficient. 
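(Illustration, not part of the patch: a struct of two doubles arrives in d0 and d1, which lie 16 bytes apart in the saved V-register context, so the two 8-byte values must be re-packed into 16 contiguous bytes before the closure body can read the struct through a normal pointer.)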
*/ - int j; - UINT64 *p = avalue[i] = alloca (ty->size); - for (j = 0; j < element_count (ty); j++) - memcpy (&p[j], - allocate_to_d (context, &state), - sizeof (*p)); - break; - } + case FFI_TYPE_FLOAT: + if (nsrn < N_V_ARG_REG) + slot = get_s_addr (context, nsrn++); + else + slot = &stack[nsaa++]; + break; - case FFI_TYPE_LONGDOUBLE: - memcpy (&avalue[i], - allocate_to_v (context, &state), - sizeof (*avalue)); - break; + case FFI_TYPE_DOUBLE: + if (nsrn < N_V_ARG_REG) + slot = get_d_addr (context, nsrn++); + else + slot = &stack[nsaa++]; + break; - default: - FFI_ASSERT (0); - break; - } - } - } - else if (ty->size > 16) - { - /* Replace Composite type of size greater than 16 with a - pointer. */ - memcpy (&avalue[i], - allocate_to_register_or_stack (context, stack, - &state, FFI_TYPE_POINTER), - sizeof (avalue[i])); - } - else if (available_x (&state) >= (ty->size + 7) / 8) - { - avalue[i] = get_x_addr (context, state.ngrn); - state.ngrn += (ty->size + 7) / 8; - } + case FFI_TYPE_LONGDOUBLE: + if (nsrn < N_V_ARG_REG) + slot = &context->v[nsrn++]; else { - state.ngrn = N_X_ARG_REG; - - avalue[i] = allocate_to_stack (&state, stack, ty->alignment, - ty->size); + nsaa = ALIGN (nsaa, 2); + slot = &stack[nsaa]; + nsaa += 2; } break; - default: - FFI_ASSERT (0); + case FFI_TYPE_STRUCT: + { + size_t slot_count; + + size = ty->size; + slot_count = (size + 7) / 8; + h = is_hfa (ty); + if (h) + { + int reg_count = h >> 8; + int tt = h & 0xff; + int j; + + if (nsrn + reg_count <= N_V_ARG_REG) + { + switch (tt) + { + case FFI_TYPE_FLOAT: + { + UINT32 *dst = alloca (size); + for (j = 0; j < reg_count; ++j) + dst[j] = *get_s_addr(context, nsrn + j); + slot = dst; + } + break; + case FFI_TYPE_DOUBLE: + { + UINT64 *dst = alloca (size); + for (j = 0; j < reg_count; ++j) + dst[j] = *get_d_addr(context, nsrn + j); + slot = dst; + } + break; + case FFI_TYPE_LONGDOUBLE: + slot = &context->v[nsrn]; + break; + default: + abort (); + } + nsrn += reg_count; + break; + } + /* All out of fp registers. It's on the stack. */ + nsrn = N_V_ARG_REG; + } + else if (size > 16) + { + /* The argument is passed by indirection. */ + if (ngrn < N_X_ARG_REG) + slot = (void *)(uintptr_t)context->x[ngrn++]; + else + slot = (void *)(uintptr_t)stack[nsaa++]; + break; + } + else + { + if (ty->alignment == 16) + ngrn = ALIGN (ngrn, 2); + + if (ngrn + slot_count <= N_X_ARG_REG) + { + slot = &context->x[ngrn]; + ngrn += slot_count; + break; + } + /* All out of general registers. Copy to the stack. */ + ngrn = N_X_ARG_REG; + } + if (ty->alignment > 8) + { + int a = ty->alignment / 8; + nsaa = ALIGN (nsaa, a); + } + slot = &stack[nsaa]; + nsaa += slot_count; + } break; + + default: + abort (); } + + avalue[i] = slot; } - /* Figure out where the return value will be passed, either in - registers or in a memory block allocated by the caller and passed - in x8. */ + h = cif->flags & AARCH64_FLAG_RET_MASK; + if (h != AARCH64_RET_LG_STRUCT) + rvalue = context + 1; - if (is_register_candidate (cif->rtype)) - { - /* Register candidates are *always* returned in registers. */ - - /* Allocate a scratchpad for the return value, we will let the - callee scrible the result into the scratch pad then move the - contents into the appropriate return value location for the - call convention. */ - rvalue = alloca (cif->rtype->size); - (closure->fun) (cif, rvalue, avalue, closure->user_data); - - /* Copy the return value into the call context so that it is returned - as expected to our caller. 
*/ - switch (cif->rtype->type) - { - case FFI_TYPE_VOID: - break; + fun (cif, rvalue, avalue, user_data); - case FFI_TYPE_UINT8: - case FFI_TYPE_UINT16: - case FFI_TYPE_UINT32: - case FFI_TYPE_POINTER: - case FFI_TYPE_UINT64: - case FFI_TYPE_SINT8: - case FFI_TYPE_SINT16: - case FFI_TYPE_INT: - case FFI_TYPE_SINT32: - case FFI_TYPE_SINT64: - case FFI_TYPE_FLOAT: - case FFI_TYPE_DOUBLE: - case FFI_TYPE_LONGDOUBLE: - { - void *addr = get_basic_type_addr (cif->rtype->type, context, 0); - copy_basic_type (addr, rvalue, cif->rtype->type); - break; - } - case FFI_TYPE_STRUCT: - if (is_hfa (cif->rtype)) - { - int i; - unsigned short type = get_homogeneous_type (cif->rtype); - unsigned elems = element_count (cif->rtype); - for (i = 0; i < elems; i++) - { - void *reg = get_basic_type_addr (type, context, i); - copy_basic_type (reg, rvalue, type); - rvalue += get_basic_type_size (type); - } - } - else if ((cif->rtype->size + 7) / 8 < N_X_ARG_REG) - { - unsigned size = ALIGN (cif->rtype->size, sizeof (UINT64)) ; - memcpy (get_x_addr (context, 0), rvalue, size); - } - else - { - FFI_ASSERT (0); - } - break; - default: - FFI_ASSERT (0); - break; - } - } - else - { - memcpy (&rvalue, get_x_addr (context, 8), sizeof (UINT64)); - (closure->fun) (cif, rvalue, avalue, closure->user_data); - } + return h; } - diff --git a/libffi/src/aarch64/ffitarget.h b/libffi/src/aarch64/ffitarget.h index 6f1a348..ecfa159 100644 --- a/libffi/src/aarch64/ffitarget.h +++ b/libffi/src/aarch64/ffitarget.h @@ -27,8 +27,8 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #endif #ifndef LIBFFI_ASM -typedef unsigned long ffi_arg; -typedef signed long ffi_sarg; +typedef unsigned long long ffi_arg; +typedef signed long long ffi_sarg; typedef enum ffi_abi { @@ -42,18 +42,7 @@ typedef enum ffi_abi /* ---- Definitions for closures ----------------------------------------- */ #define FFI_CLOSURES 1 -#define FFI_TRAMPOLINE_SIZE 36 +#define FFI_TRAMPOLINE_SIZE 24 #define FFI_NATIVE_RAW_API 0 -/* ---- Internal ---- */ - - -#define FFI_EXTRA_CIF_FIELDS unsigned aarch64_flags - -#define AARCH64_FFI_WITH_V_BIT 0 - -#define AARCH64_N_XREG 32 -#define AARCH64_N_VREG 32 -#define AARCH64_CALL_CONTEXT_SIZE (AARCH64_N_XREG * 8 + AARCH64_N_VREG * 16) - #endif diff --git a/libffi/src/aarch64/internal.h b/libffi/src/aarch64/internal.h new file mode 100644 index 0000000..63cf683 --- /dev/null +++ b/libffi/src/aarch64/internal.h @@ -0,0 +1,43 @@ +/* +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +``Software''), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ + +/* ---- Internal ---- */ + +#define AARCH64_RET_UINT32 0 +#define AARCH64_RET_SINT32 1 +#define AARCH64_RET_INT64 2 +#define AARCH64_RET_SM_STRUCT 3 +#define AARCH64_RET_FLOAT 4 +#define AARCH64_RET_DOUBLE 5 +#define AARCH64_RET_LDOUBLE 6 +#define AARCH64_RET_HFA_FLOAT 7 +#define AARCH64_RET_HFA_DOUBLE 8 +#define AARCH64_RET_HFA_LDOUBLE 13 +#define AARCH64_RET_LG_STRUCT 14 +#define AARCH64_RET_VOID 15 +#define AARCH64_FLAG_RET_MASK 15 + +#define AARCH64_FLAG_ARG_V_BIT 4 +#define AARCH64_FLAG_ARG_V (1 << AARCH64_FLAG_ARG_V_BIT) + +#define AARCH64_N_VREG 8 +#define AARCH64_N_XREG 8 +#define AARCH64_CALL_CONTEXT_SIZE (AARCH64_N_VREG * 16 + AARCH64_N_XREG * 8) + diff --git a/libffi/src/aarch64/sysv.S b/libffi/src/aarch64/sysv.S index ffb16f8..126c527 100644 --- a/libffi/src/aarch64/sysv.S +++ b/libffi/src/aarch64/sysv.S @@ -22,286 +22,285 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #define LIBFFI_ASM #include <fficonfig.h> #include <ffi.h> +#include "internal.h" -#define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off -#define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off -#define cfi_restore(reg) .cfi_restore reg -#define cfi_def_cfa_register(reg) .cfi_def_cfa_register reg - - .text - .globl ffi_call_SYSV - .type ffi_call_SYSV, #function + .text /* ffi_call_SYSV() - Create a stack frame, setup an argument context, call the callee - and extract the result. - - The maximum required argument stack size is provided, - ffi_call_SYSV() allocates that stack space then calls the - prepare_fn to populate register context and stack. The - argument passing registers are loaded from the register - context and the callee called, on return the register passing - register are saved back to the context. Our caller will - extract the return value from the final state of the saved - register context. + Install an argument context and a stack frame. + Call the callee and extract the result. Prototype: - extern unsigned - ffi_call_SYSV (void (*)(struct call_context *context, unsigned char *, - extended_cif *), - struct call_context *context, - extended_cif *, - unsigned required_stack_size, - void (*fn)(void)); - - Therefore on entry we have: - - x0 prepare_fn - x1 &context - x2 &ecif - x3 bytes - x4 fn - - This function uses the following stack frame layout: - - == - saved x30(lr) - x29(fp)-> saved x29(fp) - saved x24 - saved x23 - saved x22 - sp' -> saved x21 - ... - sp -> (constructed callee stack arguments) - == + extern void + ffi_call_SYSV (void *frame, void *rvalue, struct call_context *context, + unsigned flags, void (*fn)(void)) - Voila! */ + This function uses an unusual stack layout. Our local frame has + been allocated by the caller in FRAME with the outgoing arguments + in CONTEXT, and the outgoing stack arguments above CONTEXT. */ -#define ffi_call_SYSV_FS (8 * 4) + .globl ffi_call_SYSV + .hidden ffi_call_SYSV + .type ffi_call_SYSV, %function + .balign 32 - .cfi_startproc ffi_call_SYSV: - stp x29, x30, [sp, #-16]! 
- cfi_adjust_cfa_offset (16) - cfi_rel_offset (x29, 0) - cfi_rel_offset (x30, 8) - - mov x29, sp - cfi_def_cfa_register (x29) - sub sp, sp, #ffi_call_SYSV_FS - - stp x21, x22, [sp, 0] - cfi_rel_offset (x21, 0 - ffi_call_SYSV_FS) - cfi_rel_offset (x22, 8 - ffi_call_SYSV_FS) - - stp x23, x24, [sp, 16] - cfi_rel_offset (x23, 16 - ffi_call_SYSV_FS) - cfi_rel_offset (x24, 24 - ffi_call_SYSV_FS) - - mov x21, x1 - mov x22, x2 - mov x24, x4 - - /* Allocate the stack space for the actual arguments, many - arguments will be passed in registers, but we assume - worst case and allocate sufficient stack for ALL of - the arguments. */ - sub sp, sp, x3 - - /* unsigned (*prepare_fn) (struct call_context *context, - unsigned char *stack, extended_cif *ecif); - */ - mov x23, x0 - mov x0, x1 - mov x1, sp - /* x2 already in place */ - blr x23 - - /* Preserve the flags returned. */ - mov x23, x0 - - /* Figure out if we should touch the vector registers. */ - tbz x23, #AARCH64_FFI_WITH_V_BIT, 1f - - /* Load the vector argument passing registers. */ - ldp q0, q1, [x21, #8*32 + 0] - ldp q2, q3, [x21, #8*32 + 32] - ldp q4, q5, [x21, #8*32 + 64] - ldp q6, q7, [x21, #8*32 + 96] -1: - /* Load the core argument passing registers. */ - ldp x0, x1, [x21, #0] - ldp x2, x3, [x21, #16] - ldp x4, x5, [x21, #32] - ldp x6, x7, [x21, #48] - - /* Don't forget x8 which may be holding the address of a return buffer. - */ - ldr x8, [x21, #8*8] - - blr x24 - - /* Save the core argument passing registers. */ - stp x0, x1, [x21, #0] - stp x2, x3, [x21, #16] - stp x4, x5, [x21, #32] - stp x6, x7, [x21, #48] - - /* Note nothing useful ever comes back in x8! */ - - /* Figure out if we should touch the vector registers. */ - tbz x23, #AARCH64_FFI_WITH_V_BIT, 1f - - /* Save the vector argument passing registers. */ - stp q0, q1, [x21, #8*32 + 0] - stp q2, q3, [x21, #8*32 + 32] - stp q4, q5, [x21, #8*32 + 64] - stp q6, q7, [x21, #8*32 + 96] + .cfi_startproc + .cfi_def_cfa x0, 32 + stp x29, x30, [x0] /* Save fp, lr in our frame. */ + mov x29, x0 /* Set up our new frame. */ + .cfi_def_cfa_register x29 + .cfi_rel_offset x29, 0 + .cfi_rel_offset x30, 8 + + /* Move parameters out of the way. */ + stp x3, x1, [x0, #16] /* flags, rvalue */ + mov x8, x1 /* rvalue into place */ + mov x10, x2 /* context */ + mov x11, x4 /* fn */ + + /* Load the vector argument passing registers, if needed. */ + tbz w3, #AARCH64_FLAG_ARG_V_BIT, 1f + ldp q0, q1, [x10, #8*AARCH64_N_XREG + 0] + ldp q2, q3, [x10, #8*AARCH64_N_XREG + 32] + ldp q4, q5, [x10, #8*AARCH64_N_XREG + 64] + ldp q6, q7, [x10, #8*AARCH64_N_XREG + 96] 1: - /* All done, unwind our stack frame. */ - ldp x21, x22, [x29, # - ffi_call_SYSV_FS] - cfi_restore (x21) - cfi_restore (x22) - - ldp x23, x24, [x29, # - ffi_call_SYSV_FS + 16] - cfi_restore (x23) - cfi_restore (x24) - - mov sp, x29 - cfi_def_cfa_register (sp) - - ldp x29, x30, [sp], #16 - cfi_adjust_cfa_offset (-16) - cfi_restore (x29) - cfi_restore (x30) - - ret - - .cfi_endproc - .size ffi_call_SYSV, .-ffi_call_SYSV - -#define ffi_closure_SYSV_FS (8 * 2 + AARCH64_CALL_CONTEXT_SIZE) + /* Load the core argument passing registers. */ + ldp x0, x1, [x10, #16*0] + ldp x2, x3, [x10, #16*1] + ldp x4, x5, [x10, #16*2] + ldp x6, x7, [x10, #16*3] + + /* Setup SP for the stacked arguments. */ + add sp, x10, #AARCH64_CALL_CONTEXT_SIZE + + /* Call fn. */ + blr x11 + + /* Recover the flags value and result address. */ + ldp x3, x8, [x29, #16] + + /* Store the return type. + Each case uses 8 bytes, so compute it directly. 
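(Illustration, not part of the patch: the branch target is 3f + 8 * (flags & AARCH64_FLAG_RET_MASK), so e.g. AARCH64_RET_DOUBLE, value 5, lands on the str d0 entry at 3f + 40; this is why every case below must be exactly two instructions, 8 bytes, wide.)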
*/ + adr x2, 3f + and w3, w3, #AARCH64_FLAG_RET_MASK + add x2, x2, x3, lsl #3 + br x2 + + /* Store results into the rvalue. Note that for most integer + cases this is actually ffi_arg, aka a 64-bit result. + For the HFA cases, and the (small) struct case, we've arranged + for temporary storage, so store the largest possible. + For the large struct case, we've remapped to VOID, since + the callee has already done the store via x8. */ + .balign 8 +/* 0: AARCH64_RET_UINT32 */ +3: mov w0, w0 + b 4f +/* 1: AARCH64_RET_SINT32 */ + sxtw x0, w0 + nop +/* 2: AARCH64_RET_INT64 */ +4: str x0, [x8] + b 9f +/* 3: AARCH64_RET_SM_STRUCT */ + stp x0, x1, [x8] + b 9f +/* 4: AARCH64_RET_FLOAT */ + str s0, [x8] + b 9f +/* 5: AARCH64_RET_DOUBLE */ + str d0, [x8] + b 9f +/* 6: AARCH64_RET_LONGDOUBLE */ + str q0, [x8] + b 9f +/* 7: AARCH64_RET_HFA_FLOAT */ + st4 { v0.s, v1.s, v2.s, v3.s }[0], [x8] + b 9f +/* 8: AARCH64_RET_HFA_DOUBLE */ + st4 { v0.d, v1.d, v2.d, v3.d }[0], [x8] + b 9f +/* 9: invalid */ + brk #1000 + nop +/* A: invalid */ + brk #1000 + nop +/* B: invalid */ + brk #1000 + nop +/* C: invalid */ + brk #1000 + nop +/* D: AARCH64_RET_HFA_LDOUBLE */ + stp q0, q1, [x8] + stp q2, q3, [x8, #32] +/* E: AARCH64_RET_LG_STRUCT */ + nop + nop +/* F: AARCH64_RET_VOID */ +9: ldp x29, x30, [x29] + .cfi_def_cfa sp, 0 + .cfi_restore x29 + .cfi_restore x30 + ret + .cfi_endproc + .size ffi_call_SYSV, .-ffi_call_SYSV /* ffi_closure_SYSV Closure invocation glue. This is the low level code invoked directly by the closure trampoline to setup and call a closure. - On entry x17 points to a struct trampoline_data, x16 has been clobbered - all other registers are preserved. + On entry x17 points to a ffi_closure, x16 has been clobbered, + and all other registers are preserved. We allocate a call context and save the argument passing registers, then invoked the generic C ffi_closure_SYSV_inner() function to do all the real work, on return we load the result passing registers back from the call context. - On entry - - extern void - ffi_closure_SYSV (struct trampoline_data *); - - struct trampoline_data - { - UINT64 *ffi_closure; - UINT64 flags; - }; + We use two separate entry points, depending on whether there are + any vector argument registers. This function uses the following stack frame layout: == - saved x30(lr) - x29(fp)-> saved x29(fp) - saved x22 - saved x21 - ... - sp -> call_context + temporary return slot + call_context + saved x30(lr) + sp, x29-> saved x29(fp) == Voila! */ - .text - .globl ffi_closure_SYSV - .cfi_startproc +#define ffi_closure_FS (16 + AARCH64_CALL_CONTEXT_SIZE + 64) + + .globl ffi_closure_SYSV_V + .hidden ffi_closure_SYSV_V + .type ffi_closure_SYSV_V, %function + .balign 32 + +ffi_closure_SYSV_V: + .cfi_startproc + stp x29, x30, [sp, #-ffi_closure_FS]! + .cfi_adjust_cfa_offset ffi_closure_FS + .cfi_rel_offset x29, 0 + .cfi_rel_offset x30, 8 + mov x29, sp + + /* Save the argument passing vector registers. */ + stp q0, q1, [sp, #16 + 8*AARCH64_N_XREG + 0] + stp q2, q3, [sp, #16 + 8*AARCH64_N_XREG + 32] + stp q4, q5, [sp, #16 + 8*AARCH64_N_XREG + 64] + stp q6, q7, [sp, #16 + 8*AARCH64_N_XREG + 96] + b 0f + + .cfi_endproc + .size ffi_closure_SYSV_V, . - ffi_closure_SYSV_V + + .globl ffi_closure_SYSV + .hidden ffi_closure_SYSV + .type ffi_closure_SYSV, %function + .balign 32 + ffi_closure_SYSV: - stp x29, x30, [sp, #-16]! 
- cfi_adjust_cfa_offset (16) - cfi_rel_offset (x29, 0) - cfi_rel_offset (x30, 8) - - mov x29, sp - cfi_def_cfa_register (x29) - - sub sp, sp, #ffi_closure_SYSV_FS - - stp x21, x22, [x29, #-16] - cfi_rel_offset (x21, -16) - cfi_rel_offset (x22, -8) - - /* Load x21 with &call_context. */ - mov x21, sp - /* Preserve our struct trampoline_data * */ - mov x22, x17 - - /* Save the rest of the argument passing registers. */ - stp x0, x1, [x21, #0] - stp x2, x3, [x21, #16] - stp x4, x5, [x21, #32] - stp x6, x7, [x21, #48] - /* Don't forget we may have been given a result scratch pad address. - */ - str x8, [x21, #64] - - /* Figure out if we should touch the vector registers. */ - ldr x0, [x22, #8] - tbz x0, #AARCH64_FFI_WITH_V_BIT, 1f - - /* Save the argument passing vector registers. */ - stp q0, q1, [x21, #8*32 + 0] - stp q2, q3, [x21, #8*32 + 32] - stp q4, q5, [x21, #8*32 + 64] - stp q6, q7, [x21, #8*32 + 96] -1: - /* Load &ffi_closure.. */ - ldr x0, [x22, #0] - mov x1, x21 - /* Compute the location of the stack at the point that the - trampoline was called. */ - add x2, x29, #16 - - bl ffi_closure_SYSV_inner - - /* Figure out if we should touch the vector registers. */ - ldr x0, [x22, #8] - tbz x0, #AARCH64_FFI_WITH_V_BIT, 1f - - /* Load the result passing vector registers. */ - ldp q0, q1, [x21, #8*32 + 0] - ldp q2, q3, [x21, #8*32 + 32] - ldp q4, q5, [x21, #8*32 + 64] - ldp q6, q7, [x21, #8*32 + 96] -1: - /* Load the result passing core registers. */ - ldp x0, x1, [x21, #0] - ldp x2, x3, [x21, #16] - ldp x4, x5, [x21, #32] - ldp x6, x7, [x21, #48] - /* Note nothing usefull is returned in x8. */ - - /* We are done, unwind our frame. */ - ldp x21, x22, [x29, #-16] - cfi_restore (x21) - cfi_restore (x22) - - mov sp, x29 - cfi_def_cfa_register (sp) - - ldp x29, x30, [sp], #16 - cfi_adjust_cfa_offset (-16) - cfi_restore (x29) - cfi_restore (x30) - - ret - .cfi_endproc - .size ffi_closure_SYSV, .-ffi_closure_SYSV + .cfi_startproc + stp x29, x30, [sp, #-ffi_closure_FS]! + .cfi_adjust_cfa_offset ffi_closure_FS + .cfi_rel_offset x29, 0 + .cfi_rel_offset x30, 8 + mov x29, sp + + /* Save the argument passing core registers. */ +0: stp x0, x1, [sp, #16 + 0] + stp x2, x3, [sp, #16 + 16] + stp x4, x5, [sp, #16 + 32] + stp x6, x7, [sp, #16 + 48] + + ldp x0, x1, [x17, #FFI_TRAMPOLINE_SIZE] /* Load cif, fun */ + ldr x2, [x17, #FFI_TRAMPOLINE_SIZE + 16] /* Load user_data */ + +.Ldo_closure: + add x3, sp, #16 /* Load &call_context. */ + add x4, sp, #ffi_closure_FS /* Load incoming sp value. */ + mov x5, x8 /* Load struct return. */ + bl ffi_closure_SYSV_inner + + /* Load the return type. Each case uses 8 bytes, so compute it + directly. Load x8 with address of the temporary return slot. */ + adr x1, 3f + and w0, w0, #AARCH64_FLAG_RET_MASK + add x1, x1, x0, lsl #3 + add x8, sp, #16 + AARCH64_CALL_CONTEXT_SIZE + br x1 + + /* Load results from temporary storage. Note that for most integer + cases this is actually ffi_arg, aka a 64-bit result. For the HFA + cases and the (small) struct case, we can load the maximum width. + For the large struct case, we've remapped to VOID. 
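(Illustration, not part of the patch: x8 is aimed at the 64-byte temporary slot above the call context, the trailing + 64 in ffi_closure_FS; ffi_closure_SYSV_inner redirected rvalue there for every case except AARCH64_RET_LG_STRUCT, so e.g. an HFA of four long doubles is reloaded into q0-q3 from the 64 bytes written to that slot.)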
*/ +#if defined __AARCH64EB__ +# define INT32OFS 4 +#else +# define INT32OFS 0 +#endif + + .balign 8 +/* 0: AARCH64_RET_UINT32 */ +3: ldr w0, [x8, #INT32OFS] + b 9f +/* 1: AARCH64_RET_SINT32 */ + ldrsw x0, [x8, #INT32OFS] + b 9f +/* 2: AARCH64_RET_INT64 */ + ldr x0, [x8] + b 9f +/* 3: AARCH64_RET_SM_STRUCT */ + ldp x0, x1, [x8] + b 9f +/* 4: AARCH64_RET_FLOAT */ + ldr s0, [x8] + b 9f +/* 5: AARCH64_RET_DOUBLE */ + ldr d0, [x8] + b 9f +/* 6: AARCH64_RET_LONGDOUBLE */ + ldr q0, [x8] + b 9f +/* 7: AARCH64_RET_HFA_FLOAT */ + ld4 { v0.s, v1.s, v2.s, v3.s }[0], [x8] + b 9f +/* 8: AARCH64_RET_HFA_DOUBLE */ + ld1 { v0.1d, v1.1d, v2.1d, v3.1d }, [x8] + b 9f +/* 9: invalid */ + brk #1000 + nop +/* A: invalid */ + brk #1000 + nop +/* B: invalid */ + brk #1000 + nop +/* C: invalid */ + brk #1000 + nop +/* D: AARCH64_RET_HFA_LDOUBLE */ + ldp q0, q1, [x8] + ldp q2, q3, [x8, #32] +/* E: AARCH64_RET_LG_STRUCT */ + nop + nop +/* F: AARCH64_RET_VOID */ +9: /* We are done, unwind our frame. */ + ldp x29, x30, [sp], #ffi_closure_FS + .cfi_adjust_cfa_offset -ffi_closure_FS + .cfi_restore x29 + .cfi_restore x30 + ret + .cfi_endproc + .size ffi_closure_SYSV, .-ffi_closure_SYSV -- 1.9.3
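As a closing illustration (not part of the patch, and using the hypothetical names point and midpoint; only the ffi_* names are libffi's public API), here is a minimal caller-side sketch of the kind of HFA call the rewritten marshalling handles:

#include <stdio.h>
#include <ffi.h>

typedef struct { float x, y; } point;  /* an HFA of two floats */

static point midpoint (point a, point b)
{
  point m = { (a.x + b.x) / 2, (a.y + b.y) / 2 };
  return m;
}

int main (void)
{
  ffi_type *elems[] = { &ffi_type_float, &ffi_type_float, NULL };
  ffi_type point_type = { 0, 0, FFI_TYPE_STRUCT, elems };
  ffi_type *args[] = { &point_type, &point_type };
  point a = { 0, 2 }, b = { 4, 6 }, r;
  void *values[] = { &a, &b };
  ffi_cif cif;

  /* size and alignment of point_type are computed by ffi_prep_cif.  */
  if (ffi_prep_cif (&cif, FFI_DEFAULT_ABI, 2, &point_type, args) != FFI_OK)
    return 1;

  /* Both arguments travel in s0-s3; the result comes back in s0/s1.  */
  ffi_call (&cif, FFI_FN (midpoint), &r, values);
  printf ("(%g, %g)\n", r.x, r.y);  /* prints (2, 4) */
  return 0;
}

The two-float result also shows why ffi_call above reserves a four-element bounce buffer before copying the final 8 bytes into r.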