public inbox for libffi-discuss@sourceware.org
* [PATCH 00/16] Go closures for aarch64
@ 2014-10-28 18:53 Richard Henderson
  2014-10-28 18:53 ` [PATCH 03/16] aarch64: Always distinguish LONGDOUBLE Richard Henderson
                   ` (16 more replies)
  0 siblings, 17 replies; 20+ messages in thread
From: Richard Henderson @ 2014-10-28 18:53 UTC
  To: libffi-discuss

This patch set fixes a compilation error introduced by the iOS merge,
tidies up the port significantly, and finally adds support for
complex types and Go closures.


r~


Richard Henderson (16):
  aarch64: Fix non-apple compilation
  aarch64: Improve is_hfa
  aarch64: Always distinguish LONGDOUBLE
  aarch64: Simplify AARCH64_STACK_ALIGN
  aarch64: Reduce the size of register_context
  aarch64: Use correct return registers
  aarch64: Treat void return as not passed in registers
  aarch64: Tidy up abi manipulation
  aarch64: Merge prep_args with ffi_call
  aarch64: Move return value handling into ffi_call_SYSV
  aarch64: Move return value handling into ffi_closure_SYSV
  aarch64: Unify scalar fp and hfa handling
  aarch64: Remove aarch64_flags
  aarch64: Add support for complex types
  aarch64: Move x8 out of call_context
  aarch64: Add support for Go closures

 src/aarch64/ffi.c              | 1477 ++++++++++++++++------------------------
 src/aarch64/ffitarget.h        |   14 +-
 src/aarch64/internal.h         |   67 ++
 src/aarch64/sysv.S             |  589 +++++++++-------
 testsuite/libffi.call/call.exp |   10 +-
 5 files changed, 1008 insertions(+), 1149 deletions(-)
 create mode 100644 src/aarch64/internal.h

-- 
1.9.3

* [PATCH 01/16] aarch64: Fix non-apple compilation
  2014-10-28 18:53 [PATCH 00/16] Go closures for aarch64 Richard Henderson
  2014-10-28 18:53 ` [PATCH 03/16] aarch64: Always distinguish LONGDOUBLE Richard Henderson
@ 2014-10-28 18:53 ` Richard Henderson
  2014-10-28 18:54 ` [PATCH 11/16] aarch64: Move return value handling into ffi_closure_SYSV Richard Henderson
                   ` (14 subsequent siblings)
  16 siblings, 0 replies; 20+ messages in thread
From: Richard Henderson @ 2014-10-28 18:53 UTC
  To: libffi-discuss; +Cc: Richard Henderson

From: Richard Henderson <rth@redhat.com>

---
 src/aarch64/ffi.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/aarch64/ffi.c b/src/aarch64/ffi.c
index 5369ea4..cdb7816 100644
--- a/src/aarch64/ffi.c
+++ b/src/aarch64/ffi.c
@@ -782,7 +782,9 @@ ffi_prep_cif_machdep (ffi_cif *cif)
           }
     }
 
+#if defined (__APPLE__)
   cif->aarch64_nfixedargs = 0;
+#endif
 
   return FFI_OK;
 }
-- 
1.9.3

* [PATCH 03/16] aarch64: Always distinguish LONGDOUBLE
  2014-10-28 18:53 [PATCH 00/16] Go closures for aarch64 Richard Henderson
@ 2014-10-28 18:53 ` Richard Henderson
  2014-10-28 18:53 ` [PATCH 01/16] aarch64: Fix non-apple compilation Richard Henderson
                   ` (15 subsequent siblings)
  16 siblings, 0 replies; 20+ messages in thread
From: Richard Henderson @ 2014-10-28 18:53 UTC
  To: libffi-discuss; +Cc: Richard Henderson

From: Richard Henderson <rth@redhat.com>

Avoid ifdeffery by forcing FFI_TYPE_LONGDOUBLE to be different from
FFI_TYPE_DOUBLE.  The distinct value will simply be unused on hosts
that define the two types identically.
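
As a sketch of the problem being avoided (illustration only, not part
of the patch): when the two macros expand to the same value, a switch
listing both cases fails to compile with a duplicate-case error, which
is what forced the old #ifdef guards:

    /* Hypothetical example, assuming FFI_TYPE_DOUBLE happens to equal
       FFI_TYPE_LONGDOUBLE: the case labels collide, so one of them
       previously had to be #ifdef'd away.  */
    switch (type)
      {
      case FFI_TYPE_DOUBLE:
        break;
      case FFI_TYPE_LONGDOUBLE:	/* error: duplicate case value */
        break;
      }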
---
 src/aarch64/ffi.c | 41 ++++++++++++++---------------------------
 1 file changed, 14 insertions(+), 27 deletions(-)

diff --git a/src/aarch64/ffi.c b/src/aarch64/ffi.c
index 0834614..f065be5 100644
--- a/src/aarch64/ffi.c
+++ b/src/aarch64/ffi.c
@@ -20,11 +20,20 @@ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
 
 #include <stdio.h>
-
+#include <stdlib.h>
 #include <ffi.h>
 #include <ffi_common.h>
 
-#include <stdlib.h>
+/* Force FFI_TYPE_LONGDOUBLE to be different than FFI_TYPE_DOUBLE;
+   all further uses in this file will refer to the 128-bit type.  */
+#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
+# if FFI_TYPE_LONGDOUBLE != 4
+#  error FFI_TYPE_LONGDOUBLE out of date
+# endif
+#else
+# undef FFI_TYPE_LONGDOUBLE
+# define FFI_TYPE_LONGDOUBLE 4
+#endif
 
 /* Stack alignment requirement in bytes */
 #if defined (__APPLE__)
@@ -115,10 +124,8 @@ get_basic_type_addr (unsigned short type, struct call_context *context,
       return get_s_addr (context, n);
     case FFI_TYPE_DOUBLE:
       return get_d_addr (context, n);
-#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
     case FFI_TYPE_LONGDOUBLE:
       return get_v_addr (context, n);
-#endif
     case FFI_TYPE_UINT8:
     case FFI_TYPE_SINT8:
     case FFI_TYPE_UINT16:
@@ -151,10 +158,8 @@ get_basic_type_alignment (unsigned short type)
 #endif
     case FFI_TYPE_DOUBLE:
       return sizeof (UINT64);
-#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
     case FFI_TYPE_LONGDOUBLE:
       return sizeof (long double);
-#endif
     case FFI_TYPE_UINT8:
     case FFI_TYPE_SINT8:
 #if defined (__APPLE__)
@@ -193,10 +198,8 @@ get_basic_type_size (unsigned short type)
       return sizeof (UINT32);
     case FFI_TYPE_DOUBLE:
       return sizeof (UINT64);
-#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
     case FFI_TYPE_LONGDOUBLE:
       return sizeof (long double);
-#endif
     case FFI_TYPE_UINT8:
       return sizeof (UINT8);
     case FFI_TYPE_SINT8:
@@ -390,9 +393,7 @@ is_register_candidate (ffi_type *ty)
     case FFI_TYPE_VOID:
     case FFI_TYPE_FLOAT:
     case FFI_TYPE_DOUBLE:
-#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
     case FFI_TYPE_LONGDOUBLE:
-#endif
     case FFI_TYPE_UINT8:
     case FFI_TYPE_UINT16:
     case FFI_TYPE_UINT32:
@@ -557,11 +558,9 @@ copy_basic_type (void *dest, void *source, unsigned short type)
     case FFI_TYPE_DOUBLE:
       *(double *) dest = *(double *) source;
       break;
-#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
     case FFI_TYPE_LONGDOUBLE:
       *(long double *) dest = *(long double *) source;
       break;
-#endif
     case FFI_TYPE_UINT8:
       *(ffi_arg *) dest = *(UINT8 *) source;
       break;
@@ -653,13 +652,11 @@ allocate_to_register_or_stack (struct call_context *context,
 	return allocate_to_d (context, state);
       state->nsrn = N_V_ARG_REG;
       break;
-#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
     case FFI_TYPE_LONGDOUBLE:
       if (state->nsrn < N_V_ARG_REG)
 	return allocate_to_v (context, state);
       state->nsrn = N_V_ARG_REG;
       break;
-#endif
     case FFI_TYPE_UINT8:
     case FFI_TYPE_SINT8:
     case FFI_TYPE_UINT16:
@@ -722,9 +719,7 @@ aarch64_prep_args (struct call_context *context, unsigned char *stack,
 	   appropriate register, or if none are available, to the stack.  */
 	case FFI_TYPE_FLOAT:
 	case FFI_TYPE_DOUBLE:
-#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
 	case FFI_TYPE_LONGDOUBLE:
-#endif
 	case FFI_TYPE_UINT8:
 	case FFI_TYPE_SINT8:
 	case FFI_TYPE_UINT16:
@@ -887,9 +882,7 @@ ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
               case FFI_TYPE_VOID:
               case FFI_TYPE_FLOAT:
               case FFI_TYPE_DOUBLE:
-#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
               case FFI_TYPE_LONGDOUBLE:
-#endif
               case FFI_TYPE_UINT8:
               case FFI_TYPE_SINT8:
               case FFI_TYPE_UINT16:
@@ -1040,14 +1033,12 @@ ffi_closure_SYSV_inner (ffi_closure *closure, struct call_context *context,
 	case FFI_TYPE_POINTER:
 	case FFI_TYPE_UINT64:
 	case FFI_TYPE_SINT64:
-	case  FFI_TYPE_FLOAT:
-	case  FFI_TYPE_DOUBLE:
-#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
-	case  FFI_TYPE_LONGDOUBLE:
+	case FFI_TYPE_FLOAT:
+	case FFI_TYPE_DOUBLE:
+	case FFI_TYPE_LONGDOUBLE:
 	  avalue[i] = allocate_to_register_or_stack (context, stack,
 						     &state, ty->type);
 	  break;
-#endif
 
 	case FFI_TYPE_STRUCT:
 	  h = is_hfa (ty);
@@ -1106,13 +1097,11 @@ ffi_closure_SYSV_inner (ffi_closure *closure, struct call_context *context,
 			break;
 		      }
 
-#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
 		    case FFI_TYPE_LONGDOUBLE:
 			  memcpy (&avalue[i],
 				  allocate_to_v (context, &state),
 				  sizeof (*avalue));
 		      break;
-#endif
 
 		    default:
 		      FFI_ASSERT (0);
@@ -1183,9 +1172,7 @@ ffi_closure_SYSV_inner (ffi_closure *closure, struct call_context *context,
         case FFI_TYPE_SINT64:
         case FFI_TYPE_FLOAT:
         case FFI_TYPE_DOUBLE:
-#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
         case FFI_TYPE_LONGDOUBLE:
-#endif
 	  {
 	    void *addr = get_basic_type_addr (cif->rtype->type, context, 0);
 	    copy_basic_type (addr, rvalue, cif->rtype->type);
-- 
1.9.3

* [PATCH 08/16] aarch64: Tidy up abi manipulation
  2014-10-28 18:53 [PATCH 00/16] Go closures for aarch64 Richard Henderson
                   ` (12 preceding siblings ...)
  2014-10-28 18:54 ` [PATCH 15/16] aarch64: Move x8 out of call_context Richard Henderson
@ 2014-10-28 18:54 ` Richard Henderson
  2017-09-17 14:24   ` Andreas Schwab
  2014-10-28 18:54 ` [PATCH 06/16] aarch64: Use correct return registers Richard Henderson
                   ` (2 subsequent siblings)
  16 siblings, 1 reply; 20+ messages in thread
From: Richard Henderson @ 2014-10-28 18:54 UTC
  To: libffi-discuss; +Cc: Richard Henderson

From: Richard Henderson <rth@redhat.com>

Avoid false abstractions like get_x_addr.  Avoid recomputing data
about the type being manipulated.  Use NEON instructions for HFA
manipulation.

Note that some of the inline assembly will go away in a subsequent patch.
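
For reference, a homogeneous floating-point aggregate (HFA) is a
struct of one to four members of the same floating-point type.  A
sketch (the type below is hypothetical; the encoding matches is_hfa(),
which packs the element count into the high byte and the element type
into the low byte):

    /* Passed in four consecutive S registers when enough are free;
       is_hfa() returns (4 << 8) | FFI_TYPE_FLOAT for this type.  */
    typedef struct { float x, y, z, w; } vec4;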
---
 src/aarch64/ffi.c | 932 +++++++++++++++++++++---------------------------------
 1 file changed, 367 insertions(+), 565 deletions(-)

diff --git a/src/aarch64/ffi.c b/src/aarch64/ffi.c
index 6c338e1..d19384b 100644
--- a/src/aarch64/ffi.c
+++ b/src/aarch64/ffi.c
@@ -71,152 +71,6 @@ ffi_clear_cache (void *start, void *end)
 #endif
 }
 
-static void *
-get_x_addr (struct call_context *context, unsigned n)
-{
-  return &context->x[n];
-}
-
-static void *
-get_s_addr (struct call_context *context, unsigned n)
-{
-#if defined __AARCH64EB__
-  return &context->v[n].d[1].s[1];
-#else
-  return &context->v[n].d[0].s[0];
-#endif
-}
-
-static void *
-get_d_addr (struct call_context *context, unsigned n)
-{
-#if defined __AARCH64EB__
-  return &context->v[n].d[1];
-#else
-  return &context->v[n].d[0];
-#endif
-}
-
-static void *
-get_v_addr (struct call_context *context, unsigned n)
-{
-  return &context->v[n];
-}
-
-/* Return the memory location at which a basic type would reside
-   were it to have been stored in register n.  */
-
-static void *
-get_basic_type_addr (unsigned short type, struct call_context *context,
-		     unsigned n)
-{
-  switch (type)
-    {
-    case FFI_TYPE_FLOAT:
-      return get_s_addr (context, n);
-    case FFI_TYPE_DOUBLE:
-      return get_d_addr (context, n);
-    case FFI_TYPE_LONGDOUBLE:
-      return get_v_addr (context, n);
-    case FFI_TYPE_UINT8:
-    case FFI_TYPE_SINT8:
-    case FFI_TYPE_UINT16:
-    case FFI_TYPE_SINT16:
-    case FFI_TYPE_UINT32:
-    case FFI_TYPE_SINT32:
-    case FFI_TYPE_INT:
-    case FFI_TYPE_POINTER:
-    case FFI_TYPE_UINT64:
-    case FFI_TYPE_SINT64:
-      return get_x_addr (context, n);
-    case FFI_TYPE_VOID:
-      return NULL;
-    default:
-      FFI_ASSERT (0);
-      return NULL;
-    }
-}
-
-/* Return the alignment width for each of the basic types.  */
-
-static size_t
-get_basic_type_alignment (unsigned short type)
-{
-  switch (type)
-    {
-    case FFI_TYPE_FLOAT:
-#if defined (__APPLE__)
-      return sizeof (UINT32);
-#endif
-    case FFI_TYPE_DOUBLE:
-      return sizeof (UINT64);
-    case FFI_TYPE_LONGDOUBLE:
-      return sizeof (long double);
-    case FFI_TYPE_UINT8:
-    case FFI_TYPE_SINT8:
-#if defined (__APPLE__)
-	  return sizeof (UINT8);
-#endif
-    case FFI_TYPE_UINT16:
-    case FFI_TYPE_SINT16:
-#if defined (__APPLE__)
-	  return sizeof (UINT16);
-#endif
-    case FFI_TYPE_UINT32:
-    case FFI_TYPE_INT:
-    case FFI_TYPE_SINT32:
-#if defined (__APPLE__)
-	  return sizeof (UINT32);
-#endif
-    case FFI_TYPE_POINTER:
-    case FFI_TYPE_UINT64:
-    case FFI_TYPE_SINT64:
-      return sizeof (UINT64);
-
-    default:
-      FFI_ASSERT (0);
-      return 0;
-    }
-}
-
-/* Return the size in bytes for each of the basic types.  */
-
-static size_t
-get_basic_type_size (unsigned short type)
-{
-  switch (type)
-    {
-    case FFI_TYPE_FLOAT:
-      return sizeof (UINT32);
-    case FFI_TYPE_DOUBLE:
-      return sizeof (UINT64);
-    case FFI_TYPE_LONGDOUBLE:
-      return sizeof (long double);
-    case FFI_TYPE_UINT8:
-      return sizeof (UINT8);
-    case FFI_TYPE_SINT8:
-      return sizeof (SINT8);
-    case FFI_TYPE_UINT16:
-      return sizeof (UINT16);
-    case FFI_TYPE_SINT16:
-      return sizeof (SINT16);
-    case FFI_TYPE_UINT32:
-      return sizeof (UINT32);
-    case FFI_TYPE_INT:
-    case FFI_TYPE_SINT32:
-      return sizeof (SINT32);
-    case FFI_TYPE_POINTER:
-    case FFI_TYPE_UINT64:
-      return sizeof (UINT64);
-    case FFI_TYPE_SINT64:
-      return sizeof (SINT64);
-
-    default:
-      FFI_ASSERT (0);
-      return 0;
-    }
-}
-
 extern void
 ffi_call_SYSV (unsigned (*)(struct call_context *context, unsigned char *,
 			    extended_cif *),
@@ -468,223 +322,211 @@ arg_init (struct arg_state *state, size_t call_frame_size)
 #endif
 }
 
-/* Return the number of available consecutive core argument
-   registers.  */
-
-static unsigned
-available_x (struct arg_state *state)
-{
-  return N_X_ARG_REG - state->ngrn;
-}
-
-/* Return the number of available consecutive vector argument
-   registers.  */
-
-static unsigned
-available_v (struct arg_state *state)
-{
-  return N_V_ARG_REG - state->nsrn;
-}
-
-static void *
-allocate_to_x (struct call_context *context, struct arg_state *state)
-{
-  FFI_ASSERT (state->ngrn < N_X_ARG_REG);
-  return get_x_addr (context, (state->ngrn)++);
-}
-
-static void *
-allocate_to_s (struct call_context *context, struct arg_state *state)
-{
-  FFI_ASSERT (state->nsrn < N_V_ARG_REG);
-  return get_s_addr (context, (state->nsrn)++);
-}
-
-static void *
-allocate_to_d (struct call_context *context, struct arg_state *state)
-{
-  FFI_ASSERT (state->nsrn < N_V_ARG_REG);
-  return get_d_addr (context, (state->nsrn)++);
-}
-
-static void *
-allocate_to_v (struct call_context *context, struct arg_state *state)
-{
-  FFI_ASSERT (state->nsrn < N_V_ARG_REG);
-  return get_v_addr (context, (state->nsrn)++);
-}
-
 /* Allocate an aligned slot on the stack and return a pointer to it.  */
 static void *
-allocate_to_stack (struct arg_state *state, void *stack, size_t alignment,
-		   size_t size)
+allocate_to_stack (struct arg_state *state, void *stack,
+		   size_t alignment, size_t size)
 {
-  void *allocation;
+  size_t nsaa = state->nsaa;
 
   /* Round up the NSAA to the larger of 8 or the natural
      alignment of the argument's type.  */
-  state->nsaa = ALIGN (state->nsaa, alignment);
 #if defined (__APPLE__)
-  if (state->allocating_variadic)
-    state->nsaa = ALIGN (state->nsaa, 8);
+  if (state->allocating_variadic && alignment < 8)
+    alignment = 8;
 #else
-  state->nsaa = ALIGN (state->nsaa, 8);
+  if (alignment < 8)
+    alignment = 8;
 #endif
+    
+  nsaa = ALIGN (nsaa, alignment);
+  state->nsaa = nsaa + size;
 
-  allocation = stack + state->nsaa;
-
-  state->nsaa += size;
-  return allocation;
+  return (char *)stack + nsaa;
 }
 
-static void
-copy_basic_type (void *dest, void *source, unsigned short type)
+static ffi_arg
+extend_integer_type (void *source, int type)
 {
-  /* This is necessary to ensure that basic types are copied
-     sign extended to 64-bits as libffi expects.  */
   switch (type)
     {
-    case FFI_TYPE_FLOAT:
-      *(float *) dest = *(float *) source;
-      break;
-    case FFI_TYPE_DOUBLE:
-      *(double *) dest = *(double *) source;
-      break;
-    case FFI_TYPE_LONGDOUBLE:
-      *(long double *) dest = *(long double *) source;
-      break;
     case FFI_TYPE_UINT8:
-      *(ffi_arg *) dest = *(UINT8 *) source;
-      break;
+      return *(UINT8 *) source;
     case FFI_TYPE_SINT8:
-      *(ffi_sarg *) dest = *(SINT8 *) source;
-      break;
+      return *(SINT8 *) source;
     case FFI_TYPE_UINT16:
-      *(ffi_arg *) dest = *(UINT16 *) source;
-      break;
+      return *(UINT16 *) source;
     case FFI_TYPE_SINT16:
-      *(ffi_sarg *) dest = *(SINT16 *) source;
-      break;
+      return *(SINT16 *) source;
     case FFI_TYPE_UINT32:
-      *(ffi_arg *) dest = *(UINT32 *) source;
-      break;
+      return *(UINT32 *) source;
     case FFI_TYPE_INT:
     case FFI_TYPE_SINT32:
-      *(ffi_sarg *) dest = *(SINT32 *) source;
-      break;
-    case FFI_TYPE_POINTER:
+      return *(SINT32 *) source;
     case FFI_TYPE_UINT64:
-      *(ffi_arg *) dest = *(UINT64 *) source;
-      break;
     case FFI_TYPE_SINT64:
-      *(ffi_sarg *) dest = *(SINT64 *) source;
-      break;
-    case FFI_TYPE_VOID:
+      return *(UINT64 *) source;
       break;
-
+    case FFI_TYPE_POINTER:
+      return *(uintptr_t *) source;
     default:
-      FFI_ASSERT (0);
+      abort();
     }
 }
 
 static void
-copy_hfa_to_reg_or_stack (void *memory,
-			  ffi_type *ty,
-			  struct call_context *context,
-			  unsigned char *stack,
-			  struct arg_state *state)
-{
-  int h = is_hfa (ty);
-  int type = h & 0xff;
-  unsigned elems = h >> 8;
-
-  if (available_v (state) < elems)
-    {
-      /* There are insufficient V registers. Further V register allocations
-	 are prevented, the NSAA is adjusted (by allocate_to_stack ())
-	 and the argument is copied to memory at the adjusted NSAA.  */
-      state->nsrn = N_V_ARG_REG;
-      memcpy (allocate_to_stack (state, stack, ty->alignment, ty->size),
-	      memory,
-	      ty->size);
-    }
-  else
-    {
-      int i;
-      for (i = 0; i < elems; i++)
-	{
-	  void *reg = allocate_to_v (context, state);
-	  copy_basic_type (reg, memory, type);
-	  memory += get_basic_type_size (type);
-	}
-    }
+extend_hfa_type (void *dest, void *src, int h)
+{
+  int n = (h >> 8);
+  int t = h & 0xff;
+  int f = (t - FFI_TYPE_FLOAT) * 4 + 4 - n;
+  void *x0;
+
+  asm volatile (
+	"adr	%0, 0f\n"
+"	add	%0, %0, %1\n"
+"	br	%0\n"
+"0:	ldp	s16, s17, [%3]\n"	/* S4 */
+"	ldp	s18, s19, [%3, #8]\n"
+"	b	4f\n"
+"	ldp	s16, s17, [%3]\n"	/* S3 */
+"	ldr	s18, [%3, #8]\n"
+"	b	3f\n"
+"	ldp	s16, s17, [%3]\n"	/* S2 */
+"	b	2f\n"
+"	nop\n"
+"	ldr	s16, [%3]\n"		/* S1 */
+"	b	1f\n"
+"	nop\n"
+"	ldp	d16, d17, [%3]\n"	/* D4 */
+"	ldp	d18, d19, [%3, #16]\n"
+"	b	4f\n"
+"	ldp	d16, d17, [%3]\n"	/* D3 */
+"	ldr	d18, [%3, #16]\n"
+"	b	3f\n"
+"	ldp	d16, d17, [%3]\n"	/* D2 */
+"	b	2f\n"
+"	nop\n"
+"	ldr	d16, [%3]\n"		/* D1 */
+"	b	1f\n"
+"	nop\n"
+"	ldp	q16, q17, [%3]\n"	/* Q4 */
+"	ldp	q18, q19, [%3, #16]\n"
+"	b	4f\n"
+"	ldp	q16, q17, [%3]\n"	/* Q3 */
+"	ldr	q18, [%3, #16]\n"
+"	b	3f\n"
+"	ldp	q16, q17, [%3]\n"	/* Q2 */
+"	b	2f\n"
+"	nop\n"
+"	ldr	q16, [%3]\n"		/* Q1 */
+"	b	1f\n"
+"4:	str	q19, [%2, #48]\n"
+"3:	str	q18, [%2, #32]\n"
+"2:	str	q17, [%2, #16]\n"
+"1:	str	q16, [%2]"
+    : "=&r"(x0)
+    : "r"(f * 12), "r"(dest), "r"(src)
+    : "memory", "v16", "v17", "v18", "v19");
 }
 
-/* Either allocate an appropriate register for the argument type, or if
-   none are available, allocate a stack slot and return a pointer
-   to the allocated space.  */
-
 static void *
-allocate_to_register_or_stack (struct call_context *context,
-			       unsigned char *stack,
-			       struct arg_state *state,
-			       unsigned short type)
+compress_hfa_type (void *dest, void *reg, int h)
 {
-  size_t alignment = get_basic_type_alignment (type);
-  size_t size = alignment;
-  switch (type)
+  int n = h >> 8;
+  switch (h & 0xff)
     {
     case FFI_TYPE_FLOAT:
-      /* This is the only case for which the allocated stack size
-	 should not match the alignment of the type.  */
-      size = sizeof (UINT32);
-      /* Fall through.  */
+      switch (n)
+	{
+	default:
+	  if (dest == reg)
+	    {
+#ifdef __AARCH64EB__
+	      dest += 12;
+#endif
+	    }
+	  else
+	    *(float *)dest = *(float *)reg;
+	  break;
+	case 2:
+	  asm("ldp q16, q17, [%1]\n\t"
+	      "st2 { v16.s, v17.s }[0], [%0]"
+	      : : "r"(dest), "r"(reg) : "memory", "v16", "v17");
+	  break;
+	case 3:
+	  asm("ldp q16, q17, [%1]\n\t"
+	      "ldr q18, [%1, #32]\n\t"
+	      "st3 { v16.s, v17.s, v18.s }[0], [%0]"
+	      : : "r"(dest), "r"(reg) : "memory", "v16", "v17", "v18");
+	  break;
+	case 4:
+	  asm("ldp q16, q17, [%1]\n\t"
+	      "ldp q18, q19, [%1, #32]\n\t"
+	      "st4 { v16.s, v17.s, v18.s, v19.s }[0], [%0]"
+	      : : "r"(dest), "r"(reg) : "memory", "v16", "v17", "v18", "v19");
+	  break;
+	}
+      break;
+
     case FFI_TYPE_DOUBLE:
-      if (state->nsrn < N_V_ARG_REG)
-	return allocate_to_d (context, state);
-      state->nsrn = N_V_ARG_REG;
+      switch (n)
+	{
+	default:
+	  if (dest == reg)
+	    {
+#ifdef __AARCH64EB__
+	      dest += 8;
+#endif
+	    }
+	  else
+	    *(double *)dest = *(double *)reg;
+	  break;
+	case 2:
+	  asm("ldp q16, q17, [%1]\n\t"
+	      "st2 { v16.d, v17.d }[0], [%0]"
+	      : : "r"(dest), "r"(reg) : "memory", "v16", "v17");
+	  break;
+	case 3:
+	  asm("ldp q16, q17, [%1]\n\t"
+	      "ldr q18, [%1, #32]\n\t"
+	      "st3 { v16.d, v17.d, v18.d }[0], [%0]"
+	      : : "r"(dest), "r"(reg) : "memory", "v16", "v17", "v18");
+	  break;
+	case 4:
+	  asm("ldp q16, q17, [%1]\n\t"
+	      "ldp q18, q19, [%1, #32]\n\t"
+	      "st4 { v16.d, v17.d, v18.d, v19.d }[0], [%0]"
+	      : : "r"(dest), "r"(reg) : "memory", "v16", "v17", "v18", "v19");
+	  break;
+	}
       break;
+
     case FFI_TYPE_LONGDOUBLE:
-      if (state->nsrn < N_V_ARG_REG)
-	return allocate_to_v (context, state);
-      state->nsrn = N_V_ARG_REG;
-      break;
-    case FFI_TYPE_UINT8:
-    case FFI_TYPE_SINT8:
-    case FFI_TYPE_UINT16:
-    case FFI_TYPE_SINT16:
-    case FFI_TYPE_UINT32:
-    case FFI_TYPE_SINT32:
-    case FFI_TYPE_INT:
-    case FFI_TYPE_POINTER:
-    case FFI_TYPE_UINT64:
-    case FFI_TYPE_SINT64:
-      if (state->ngrn < N_X_ARG_REG)
-	return allocate_to_x (context, state);
-      state->ngrn = N_X_ARG_REG;
+      if (dest != reg)
+	return memcpy (dest, reg, 16 * n);
       break;
+
     default:
       FFI_ASSERT (0);
     }
-
-    return allocate_to_stack (state, stack, alignment, size);
+  return dest;
 }
 
-/* Copy a value to an appropriate register, or if none are
-   available, to the stack.  */
+/* Either allocate an appropriate register for the argument type, or if
+   none are available, allocate a stack slot and return a pointer
+   to the allocated space.  */
 
-static void
-copy_to_register_or_stack (struct call_context *context,
-			   unsigned char *stack,
-			   struct arg_state *state,
-			   void *value,
-			   unsigned short type)
+static void *
+allocate_int_to_reg_or_stack (struct call_context *context,
+			      struct arg_state *state,
+			      void *stack, size_t size)
 {
-  copy_basic_type (
-	  allocate_to_register_or_stack (context, stack, state, type),
-	  value,
-	  type);
+  if (state->ngrn < N_X_ARG_REG)
+    return &context->x[state->ngrn++];
+
+  state->ngrn = N_X_ARG_REG;
+  return allocate_to_stack (state, stack, size, size);
 }
 
 /* Marshall the arguments from FFI representation to procedure call
@@ -694,15 +536,21 @@ static unsigned
 aarch64_prep_args (struct call_context *context, unsigned char *stack,
 		   extended_cif *ecif)
 {
-  int i;
+  ffi_cif *cif = ecif->cif;
+  void **avalue = ecif->avalue;
+  int i, nargs = cif->nargs;
   struct arg_state state;
 
-  arg_init (&state, ALIGN(ecif->cif->bytes, 16));
+  arg_init (&state, cif->bytes);
 
-  for (i = 0; i < ecif->cif->nargs; i++)
+  for (i = 0; i < nargs; i++)
     {
-      ffi_type *ty = ecif->cif->arg_types[i];
-      switch (ty->type)
+      ffi_type *ty = cif->arg_types[i];
+      size_t s = ty->size;
+      int h, t = ty->type;
+      void *a = avalue[i];
+
+      switch (t)
 	{
 	case FFI_TYPE_VOID:
 	  FFI_ASSERT (0);
@@ -710,82 +558,114 @@ aarch64_prep_args (struct call_context *context, unsigned char *stack,
 
 	/* If the argument is a basic type the argument is allocated to an
 	   appropriate register, or if none are available, to the stack.  */
-	case FFI_TYPE_FLOAT:
-	case FFI_TYPE_DOUBLE:
-	case FFI_TYPE_LONGDOUBLE:
+	case FFI_TYPE_INT:
 	case FFI_TYPE_UINT8:
 	case FFI_TYPE_SINT8:
 	case FFI_TYPE_UINT16:
 	case FFI_TYPE_SINT16:
 	case FFI_TYPE_UINT32:
-	case FFI_TYPE_INT:
 	case FFI_TYPE_SINT32:
-	case FFI_TYPE_POINTER:
 	case FFI_TYPE_UINT64:
 	case FFI_TYPE_SINT64:
-	  copy_to_register_or_stack (context, stack, &state,
-				     ecif->avalue[i], ty->type);
+	case FFI_TYPE_POINTER:
+	do_pointer:
+	  {
+	    ffi_arg ext = extend_integer_type (a, t);
+	    if (state.ngrn < N_X_ARG_REG)
+	      context->x[state.ngrn++] = ext;
+	    else
+	      {
+		void *d = allocate_to_stack (&state, stack, ty->alignment, s);
+		state.ngrn = N_X_ARG_REG;
+		/* Note that the default abi extends each argument
+		   to a full 64-bit slot, while the iOS abi allocates
+		   only enough space. */
+#ifdef __APPLE__
+		memcpy(d, a, s);
+#else
+		*(ffi_arg *)d = ext;
+#endif
+	      }
+	  }
 	  break;
 
-	case FFI_TYPE_STRUCT:
-	  if (is_hfa (ty))
-	    {
-	      copy_hfa_to_reg_or_stack (ecif->avalue[i], ty, context,
-					stack, &state);
-	    }
-	  else if (ty->size > 16)
-	    {
-	      /* If the argument is a composite type that is larger than 16
-		 bytes, then the argument has been copied to memory, and
-		 the argument is replaced by a pointer to the copy.  */
+	case FFI_TYPE_FLOAT:
+	case FFI_TYPE_DOUBLE:
+	case FFI_TYPE_LONGDOUBLE:
+	  /* Scalar float is a degenerate case of HFA.  */
+	  h = t + 0x100;
+	  goto do_hfa;
 
-	      copy_to_register_or_stack (context, stack, &state,
-					 &(ecif->avalue[i]), FFI_TYPE_POINTER);
-	    }
-	  else if (available_x (&state) >= (ty->size + 7) / 8)
-	    {
-	      /* If the argument is a composite type and the size in
-		 double-words is not more than the number of available
-		 X registers, then the argument is copied into consecutive
-		 X registers.  */
-	      int j;
-	      for (j = 0; j < (ty->size + 7) / 8; j++)
-		{
-		  memcpy (allocate_to_x (context, &state),
-			  &(((UINT64 *) ecif->avalue[i])[j]),
-			  sizeof (UINT64));
+	case FFI_TYPE_STRUCT:
+	  {
+	    void *dest;
+	    int elems;
+
+	    h = is_hfa (ty);
+	    if (h)
+	      {
+	    do_hfa:
+		elems = h >> 8;
+	        if (state.nsrn + elems <= N_V_ARG_REG)
+		  {
+		    dest = &context->v[state.nsrn];
+		    state.nsrn += elems;
+		    extend_hfa_type (dest, a, h);
+		    break;
+		  }
+		state.nsrn = N_V_ARG_REG;
+		dest = allocate_to_stack (&state, stack, ty->alignment, s);
+	      }
+	    else if (s > 16)
+	      {
+		/* If the argument is a composite type that is larger than 16
+		   bytes, then the argument has been copied to memory, and
+		   the argument is replaced by a pointer to the copy.  */
+		a = &avalue[i];
+		t = FFI_TYPE_POINTER;
+		goto do_pointer;
+	      }
+	    else
+	      {
+		size_t n = (s + 7) / 8;
+		if (state.ngrn + n <= N_X_ARG_REG)
+		  {
+		    /* If the argument is a composite type and the size in
+		       double-words is not more than the number of available
+		       X registers, then the argument is copied into
+		       consecutive X registers.  */
+		    dest = &context->x[state.ngrn];
+		    state.ngrn += n;
+		  }
+		else
+		  {
+		    /* Otherwise, there are insufficient X registers. Further
+		       X register allocations are prevented, the NSAA is
+		       adjusted and the argument is copied to memory at the
+		       adjusted NSAA.  */
+		    state.ngrn = N_X_ARG_REG;
+		    dest = allocate_to_stack (&state, stack, ty->alignment, s);
+		  }
 		}
-	    }
-	  else
-	    {
-	      /* Otherwise, there are insufficient X registers. Further X
-		 register allocations are prevented, the NSAA is adjusted
-		 (by allocate_to_stack ()) and the argument is copied to
-		 memory at the adjusted NSAA.  */
-	      state.ngrn = N_X_ARG_REG;
-
-	      memcpy (allocate_to_stack (&state, stack, ty->alignment,
-					 ty->size), ecif->avalue + i, ty->size);
+	      memcpy (dest, a, s);
 	    }
 	  break;
 
 	default:
-	  FFI_ASSERT (0);
-	  break;
+	  abort();
 	}
 
 #if defined (__APPLE__)
-      if (i + 1 == ecif->cif->aarch64_nfixedargs)
+      if (i + 1 == cif->aarch64_nfixedargs)
 	{
 	  state.ngrn = N_X_ARG_REG;
 	  state.nsrn = N_V_ARG_REG;
-
 	  state.allocating_variadic = 1;
 	}
 #endif
     }
 
-  return ecif->cif->aarch64_flags;
+  return cif->aarch64_flags;
 }
 
 ffi_status
@@ -846,94 +726,61 @@ void
 ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
 {
   extended_cif ecif;
-  int h;
+  struct call_context context;
+  size_t stack_bytes;
+  int h, t;
 
   ecif.cif = cif;
   ecif.avalue = avalue;
   ecif.rvalue = rvalue;
 
-  switch (cif->abi)
+  stack_bytes = cif->bytes;
+
+  memset (&context, 0, sizeof (context));
+  if (is_register_candidate (cif->rtype))
     {
-    case FFI_SYSV:
-      {
-        struct call_context context;
-	size_t stack_bytes;
+      ffi_call_SYSV (aarch64_prep_args, &context, &ecif, stack_bytes, fn);
 
-	/* Figure out the total amount of stack space we need, the
-	   above call frame space needs to be 16 bytes aligned to
-	   ensure correct alignment of the first object inserted in
-	   that space hence the ALIGN applied to cif->bytes.*/
-	stack_bytes = ALIGN(cif->bytes, 16);
+      t = cif->rtype->type;
+      switch (t)
+	{
+	case FFI_TYPE_INT:
+	case FFI_TYPE_UINT8:
+	case FFI_TYPE_SINT8:
+	case FFI_TYPE_UINT16:
+	case FFI_TYPE_SINT16:
+	case FFI_TYPE_UINT32:
+	case FFI_TYPE_SINT32:
+	case FFI_TYPE_POINTER:
+	case FFI_TYPE_UINT64:
+	case FFI_TYPE_SINT64:
+	  *(ffi_arg *)rvalue = extend_integer_type (&context.x[0], t);
+	  break;
 
-	memset (&context, 0, sizeof (context));
-        if (is_register_candidate (cif->rtype))
-          {
-            ffi_call_SYSV (aarch64_prep_args, &context, &ecif, stack_bytes, fn);
-            switch (cif->rtype->type)
-              {
-              case FFI_TYPE_VOID:
-              case FFI_TYPE_FLOAT:
-              case FFI_TYPE_DOUBLE:
-              case FFI_TYPE_LONGDOUBLE:
-              case FFI_TYPE_UINT8:
-              case FFI_TYPE_SINT8:
-              case FFI_TYPE_UINT16:
-              case FFI_TYPE_SINT16:
-              case FFI_TYPE_UINT32:
-              case FFI_TYPE_SINT32:
-              case FFI_TYPE_POINTER:
-              case FFI_TYPE_UINT64:
-              case FFI_TYPE_INT:
-              case FFI_TYPE_SINT64:
-		{
-		  void *addr = get_basic_type_addr (cif->rtype->type,
-						    &context, 0);
-		  copy_basic_type (rvalue, addr, cif->rtype->type);
-		  break;
-		}
+	case FFI_TYPE_FLOAT:
+	case FFI_TYPE_DOUBLE:
+	case FFI_TYPE_LONGDOUBLE:
+	  compress_hfa_type (rvalue, &context.v[0], 0x100 + t);
+	  break;
 
-              case FFI_TYPE_STRUCT:
-		h = is_hfa (cif->rtype);
-                if (h)
-		  {
-		    int j;
-		    int type = h & 0xff;
-		    int elems = h >> 8;
-		    for (j = 0; j < elems; j++)
-		      {
-			void *reg = get_basic_type_addr (type, &context, j);
-			copy_basic_type (rvalue, reg, type);
-			rvalue += get_basic_type_size (type);
-		      }
-		  }
-                else if ((cif->rtype->size + 7) / 8 < N_X_ARG_REG)
-                  {
-                    size_t size = ALIGN (cif->rtype->size, sizeof (UINT64));
-                    memcpy (rvalue, get_x_addr (&context, 0), size);
-                  }
-                else
-                  {
-                    FFI_ASSERT (0);
-                  }
-                break;
-
-              default:
-                FFI_ASSERT (0);
-                break;
-              }
-          }
-        else
-          {
-	    context.x8 = (uintptr_t)rvalue;
-            ffi_call_SYSV (aarch64_prep_args, &context, &ecif,
-			   stack_bytes, fn);
-          }
-        break;
-      }
+	case FFI_TYPE_STRUCT:
+	  h = is_hfa (cif->rtype);
+	  if (h)
+	    compress_hfa_type (rvalue, &context.v[0], h);
+	  else if ((cif->rtype->size + 7) / 8 < N_X_ARG_REG)
+	    memcpy (rvalue, &context.x[0], cif->rtype->size);
+	  else
+	    abort();
+	  break;
 
-    default:
-      FFI_ASSERT (0);
-      break;
+	default:
+	  abort();
+	}
+    }
+  else
+    {
+      context.x8 = (uintptr_t)rvalue;
+      ffi_call_SYSV (aarch64_prep_args, &context, &ecif, stack_bytes, fn);
     }
 }
 
@@ -1000,203 +847,158 @@ ffi_closure_SYSV_inner (ffi_closure *closure, struct call_context *context,
   ffi_cif *cif = closure->cif;
   void **avalue = (void**) alloca (cif->nargs * sizeof (void*));
   void *rvalue = NULL;
-  int i, h;
+  int i, h, nargs = cif->nargs;
   struct arg_state state;
+  ffi_type *rtype;
 
   arg_init (&state, ALIGN(cif->bytes, 16));
 
-  for (i = 0; i < cif->nargs; i++)
+  for (i = 0; i < nargs; i++)
     {
       ffi_type *ty = cif->arg_types[i];
+      int t = ty->type;
+      size_t n, s = ty->size;
 
-      switch (ty->type)
+      switch (t)
 	{
 	case FFI_TYPE_VOID:
 	  FFI_ASSERT (0);
 	  break;
 
+	case FFI_TYPE_INT:
 	case FFI_TYPE_UINT8:
 	case FFI_TYPE_SINT8:
 	case FFI_TYPE_UINT16:
 	case FFI_TYPE_SINT16:
 	case FFI_TYPE_UINT32:
 	case FFI_TYPE_SINT32:
-	case FFI_TYPE_INT:
-	case FFI_TYPE_POINTER:
 	case FFI_TYPE_UINT64:
 	case FFI_TYPE_SINT64:
+	case FFI_TYPE_POINTER:
+	  avalue[i] = allocate_int_to_reg_or_stack (context, &state, stack, s);
+	  break;
+
 	case FFI_TYPE_FLOAT:
 	case FFI_TYPE_DOUBLE:
 	case FFI_TYPE_LONGDOUBLE:
-	  avalue[i] = allocate_to_register_or_stack (context, stack,
-						     &state, ty->type);
-	  break;
+	  /* Scalar float is a degenerate case of HFA.  */
+	  h = t + 0x100;
+	  goto do_hfa;
 
 	case FFI_TYPE_STRUCT:
 	  h = is_hfa (ty);
 	  if (h)
 	    {
-	      unsigned n = h >> 8;
-	      if (available_v (&state) < n)
+	    do_hfa:
+	      n = h >> 8;
+	      if (state.nsrn + n <= N_V_ARG_REG)
 		{
-		  state.nsrn = N_V_ARG_REG;
-		  avalue[i] = allocate_to_stack (&state, stack, ty->alignment,
-						 ty->size);
+		  void *reg = &context->v[state.nsrn];
+		  state.nsrn += n;
+
+		  /* Eeek! We need a pointer to the structure, however the
+		     homogeneous float elements are being passed in individual
+		     registers, therefore for float and double the structure
+		     is not represented as a contiguous sequence of bytes in
+		     our saved register context.  We don't need the original
+		     contents of the register storage, so we reformat the
+		     structure into the same memory.  */
+		  avalue[i] = compress_hfa_type (reg, reg, h);
 		}
 	      else
 		{
-		  switch (h & 0xff)
-		    {
-		    case FFI_TYPE_FLOAT:
-		      {
-			/* Eeek! We need a pointer to the structure,
-			   however the homogeneous float elements are
-			   being passed in individual S registers,
-			   therefore the structure is not represented as
-			   a contiguous sequence of bytes in our saved
-			   register context. We need to fake up a copy
-			   of the structure laid out in memory
-			   correctly. The fake can be tossed once the
-			   closure function has returned hence alloca()
-			   is sufficient. */
-			unsigned j;
-			UINT32 *p = avalue[i] = alloca (ty->size);
-			for (j = 0; j < n; j++)
-			  memcpy (&p[j],
-				  allocate_to_s (context, &state),
-				  sizeof (*p));
-			break;
-		      }
-
-		    case FFI_TYPE_DOUBLE:
-		      {
-			/* Eeek! We need a pointer to the structure,
-			   however the homogeneous float elements are
-			   being passed in individual S registers,
-			   therefore the structure is not represented as
-			   a contiguous sequence of bytes in our saved
-			   register context. We need to fake up a copy
-			   of the structure laid out in memory
-			   correctly. The fake can be tossed once the
-			   closure function has returned hence alloca()
-			   is sufficient. */
-			unsigned j;
-			UINT64 *p = avalue[i] = alloca (ty->size);
-			for (j = 0; j < n; j++)
-			  memcpy (&p[j],
-				  allocate_to_d (context, &state),
-				  sizeof (*p));
-			break;
-		      }
-
-		    case FFI_TYPE_LONGDOUBLE:
-			  memcpy (&avalue[i],
-				  allocate_to_v (context, &state),
-				  sizeof (*avalue));
-		      break;
-
-		    default:
-		      FFI_ASSERT (0);
-		      break;
-		    }
+		  state.nsrn = N_V_ARG_REG;
+		  avalue[i] = allocate_to_stack (&state, stack,
+						 ty->alignment, s);
 		}
 	    }
-	  else if (ty->size > 16)
+	  else if (s > 16)
 	    {
 	      /* Replace Composite type of size greater than 16 with a
 		 pointer.  */
-	      memcpy (&avalue[i],
-		      allocate_to_register_or_stack (context, stack,
-						     &state, FFI_TYPE_POINTER),
-		      sizeof (avalue[i]));
-	    }
-	  else if (available_x (&state) >= (ty->size + 7) / 8)
-	    {
-	      avalue[i] = get_x_addr (context, state.ngrn);
-	      state.ngrn += (ty->size + 7) / 8;
+	      avalue[i] = *(void **)
+		allocate_int_to_reg_or_stack (context, &state, stack,
+					      sizeof (void *));
 	    }
 	  else
 	    {
-	      state.ngrn = N_X_ARG_REG;
-
-	      avalue[i] = allocate_to_stack (&state, stack, ty->alignment,
-					     ty->size);
+	      n = (s + 7) / 8;
+	      if (state.ngrn + n <= N_X_ARG_REG)
+		{
+		  avalue[i] = &context->x[state.ngrn];
+		  state.ngrn += n;
+		}
+	      else
+		{
+		  state.ngrn = N_X_ARG_REG;
+		  avalue[i] = allocate_to_stack (&state, stack,
+						 ty->alignment, s);
+		}
 	    }
 	  break;
 
 	default:
-	  FFI_ASSERT (0);
-	  break;
+	  abort();
 	}
     }
 
-  /* Figure out where the return value will be passed, either in
-     registers or in a memory block allocated by the caller and passed
-     in x8.  */
-
-  if (is_register_candidate (cif->rtype))
+  /* Figure out where the return value will be passed, either in registers
+     or in a memory block allocated by the caller and passed in x8.  */
+  rtype = cif->rtype;
+  if (is_register_candidate (rtype))
     {
+      size_t s = rtype->size;
+      int t;
+
       /* Register candidates are *always* returned in registers. */
 
       /* Allocate a scratchpad for the return value, we will let the
          callee scrible the result into the scratch pad then move the
          contents into the appropriate return value location for the
          call convention.  */
-      rvalue = alloca (cif->rtype->size);
+      rvalue = alloca (s);
       (closure->fun) (cif, rvalue, avalue, closure->user_data);
 
       /* Copy the return value into the call context so that it is returned
          as expected to our caller.  */
-      switch (cif->rtype->type)
+      t = rtype->type;
+      switch (t)
         {
         case FFI_TYPE_VOID:
           break;
 
+        case FFI_TYPE_INT:
         case FFI_TYPE_UINT8:
         case FFI_TYPE_UINT16:
         case FFI_TYPE_UINT32:
-        case FFI_TYPE_POINTER:
         case FFI_TYPE_UINT64:
         case FFI_TYPE_SINT8:
         case FFI_TYPE_SINT16:
-        case FFI_TYPE_INT:
         case FFI_TYPE_SINT32:
         case FFI_TYPE_SINT64:
+        case FFI_TYPE_POINTER:
+	  context->x[0] = extend_integer_type (rvalue, t);
+          break;
+
         case FFI_TYPE_FLOAT:
         case FFI_TYPE_DOUBLE:
         case FFI_TYPE_LONGDOUBLE:
-	  {
-	    void *addr = get_basic_type_addr (cif->rtype->type, context, 0);
-	    copy_basic_type (addr, rvalue, cif->rtype->type);
-            break;
-	  }
+	  extend_hfa_type (&context->v[0], rvalue, 0x100 + t);
+	  break;
+
         case FFI_TYPE_STRUCT:
 	  h = is_hfa (cif->rtype);
           if (h)
-	    {
-	      int j;
-	      int type = h & 0xff;
-	      int elems = h >> 8;
-	      for (j = 0; j < elems; j++)
-		{
-		  void *reg = get_basic_type_addr (type, context, j);
-		  copy_basic_type (reg, rvalue, type);
-		  rvalue += get_basic_type_size (type);
-		}
-	    }
-          else if ((cif->rtype->size + 7) / 8 < N_X_ARG_REG)
-            {
-              size_t size = ALIGN (cif->rtype->size, sizeof (UINT64)) ;
-              memcpy (get_x_addr (context, 0), rvalue, size);
-            }
+	    extend_hfa_type (&context->v[0], rvalue, h);
           else
-            {
-              FFI_ASSERT (0);
+	    {
+	      FFI_ASSERT (s <= 16);
+              memcpy (&context->x[0], rvalue, s);
             }
           break;
+
         default:
-          FFI_ASSERT (0);
-          break;
+          abort();
         }
     }
   else
-- 
1.9.3

* [PATCH 06/16] aarch64: Use correct return registers
  2014-10-28 18:53 [PATCH 00/16] Go closures for aarch64 Richard Henderson
                   ` (13 preceding siblings ...)
  2014-10-28 18:54 ` [PATCH 08/16] aarch64: Tidy up abi manipulation Richard Henderson
@ 2014-10-28 18:54 ` Richard Henderson
  2014-10-28 18:54 ` [PATCH 14/16] aarch64: Add support for complex types Richard Henderson
  2014-11-10 10:12 ` [PATCH 00/16] Go closures for aarch64 James Greenhalgh
  16 siblings, 0 replies; 20+ messages in thread
From: Richard Henderson @ 2014-10-28 18:54 UTC
  To: libffi-discuss; +Cc: Richard Henderson

From: Richard Henderson <rth@redhat.com>

There are fewer return registers than argument registers.
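
For context (AAPCS64 background, not part of the patch): integer
arguments arrive in x0-x7 and floating-point arguments in v0-v7, but
results only occupy x0-x1 (small composites) and v0-v3 (HFAs), so only
those need saving after the call.  A hypothetical illustration of the
largest register-returned types:

    typedef struct { double a, b, c, d; } hfa4;  /* returned in d0-d3 */
    typedef struct { long x, y; } pair;          /* returned in x0,x1 */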
---
 src/aarch64/sysv.S | 19 +++----------------
 1 file changed, 3 insertions(+), 16 deletions(-)

diff --git a/src/aarch64/sysv.S b/src/aarch64/sysv.S
index 70870db..fa7ff5b 100644
--- a/src/aarch64/sysv.S
+++ b/src/aarch64/sysv.S
@@ -160,22 +160,15 @@ CNAME(ffi_call_SYSV):
 
         blr     x24
 
-        /* Save the core argument passing registers.  */
-        stp     x0, x1, [x21, #16*N_V_ARG_REG + 0]
-        stp     x2, x3, [x21, #16*N_V_ARG_REG + 16]
-        stp     x4, x5, [x21, #16*N_V_ARG_REG + 32]
-        stp     x6, x7, [x21, #16*N_V_ARG_REG + 48]
-
-        /* Note nothing useful ever comes back in x8!  */
+        /* Save the core return registers.  */
+        stp     x0, x1, [x21, #16*N_V_ARG_REG]
 
         /* Figure out if we should touch the vector registers.  */
         tbz     x23, #AARCH64_FLAG_ARG_V_BIT, 1f
 
-        /* Save the vector argument passing registers.  */
+        /* Save the vector return registers.  */
         stp     q0, q1, [x21, #0]
         stp     q2, q3, [x21, #32]
-        stp     q4, q5, [x21, #64]
-        stp     q6, q7, [x21, #96]
 1:
         /* All done, unwind our stack frame.  */
         ldp     x21, x22, [x29,  # - ffi_call_SYSV_FS]
@@ -299,15 +292,9 @@ CNAME(ffi_closure_SYSV):
         /* Load the result passing vector registers.  */
         ldp     q0, q1, [x21, #0]
         ldp     q2, q3, [x21, #32]
-        ldp     q4, q5, [x21, #64]
-        ldp     q6, q7, [x21, #96]
 1:
         /* Load the result passing core registers.  */
         ldp     x0, x1, [x21, #16*N_V_ARG_REG + 0]
-        ldp     x2, x3, [x21, #16*N_V_ARG_REG + 16]
-        ldp     x4, x5, [x21, #16*N_V_ARG_REG + 32]
-        ldp     x6, x7, [x21, #16*N_V_ARG_REG + 48]
-        /* Note nothing useful is returned in x8.  */
 
         /* We are done, unwind our frame.  */
         ldp     x21, x22, [x29,  #-16]
-- 
1.9.3

* [PATCH 05/16] aarch64: Reduce the size of register_context
  2014-10-28 18:53 [PATCH 00/16] Go closures for aarch64 Richard Henderson
                   ` (8 preceding siblings ...)
  2014-10-28 18:54 ` [PATCH 13/16] aarch64: Remove aarch64_flags Richard Henderson
@ 2014-10-28 18:54 ` Richard Henderson
  2014-10-28 18:54 ` [PATCH 04/16] aarch64: Simplify AARCH64_STACK_ALIGN Richard Henderson
                   ` (6 subsequent siblings)
  16 siblings, 0 replies; 20+ messages in thread
From: Richard Henderson @ 2014-10-28 18:54 UTC
  To: libffi-discuss; +Cc: Richard Henderson

From: Richard Henderson <rth@redhat.com>

We don't need to store all 32 general and all 32 vector registers;
only 8 of each are used for parameter passing.
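
For scale, using the constants in the diff below: the old context
reserved AARCH64_N_XREG * 8 + AARCH64_N_VREG * 16 = 32*8 + 32*16 = 768
bytes, while the new CALL_CONTEXT_SIZE is N_V_ARG_REG * 16 +
N_X_ARG_REG * 8 + 16 = 128 + 64 + 16 = 208 bytes.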
---
 src/aarch64/ffi.c       |  35 ++++++++---------
 src/aarch64/ffitarget.h |   6 ---
 src/aarch64/internal.h  |  26 +++++++++++++
 src/aarch64/sysv.S      | 100 +++++++++++++++++++++++-------------------------
 4 files changed, 91 insertions(+), 76 deletions(-)
 create mode 100644 src/aarch64/internal.h

diff --git a/src/aarch64/ffi.c b/src/aarch64/ffi.c
index a6fcc11..58d088b 100644
--- a/src/aarch64/ffi.c
+++ b/src/aarch64/ffi.c
@@ -21,8 +21,10 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
 
 #include <stdio.h>
 #include <stdlib.h>
+#include <stdint.h>
 #include <ffi.h>
 #include <ffi_common.h>
+#include "internal.h"
 
 /* Force FFI_TYPE_LONGDOUBLE to be different than FFI_TYPE_DOUBLE;
    all further uses in this file will refer to the 128-bit type.  */
@@ -35,38 +37,35 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
 # define FFI_TYPE_LONGDOUBLE 4
 #endif
 
-#define N_X_ARG_REG 8
-#define N_V_ARG_REG 8
-
-#define AARCH64_FFI_WITH_V (1 << AARCH64_FFI_WITH_V_BIT)
-
 union _d
 {
   UINT64 d;
   UINT32 s[2];
 };
 
+struct _v
+{
+  union _d d[2] __attribute__((aligned(16)));
+};
+
 struct call_context
 {
-  UINT64 x [AARCH64_N_XREG];
-  struct
-  {
-    union _d d[2];
-  } v [AARCH64_N_VREG];
+  struct _v v[N_V_ARG_REG];
+  UINT64 x[N_X_ARG_REG];
+  UINT64 x8;
 };
 
 #if defined (__clang__) && defined (__APPLE__)
-extern void
-sys_icache_invalidate (void *start, size_t len);
+extern void sys_icache_invalidate (void *start, size_t len);
 #endif
 
 static inline void
 ffi_clear_cache (void *start, void *end)
 {
 #if defined (__clang__) && defined (__APPLE__)
-	sys_icache_invalidate (start, (char *)end - (char *)start);
+  sys_icache_invalidate (start, (char *)end - (char *)start);
 #elif defined (__GNUC__)
-	__builtin___clear_cache (start, end);
+  __builtin___clear_cache (start, end);
 #else
 #error "Missing builtin to flush instruction cache"
 #endif
@@ -802,7 +801,7 @@ ffi_prep_cif_machdep (ffi_cif *cif)
 
   if (is_v_register_candidate (cif->rtype))
     {
-      cif->aarch64_flags |= AARCH64_FFI_WITH_V;
+      cif->aarch64_flags |= AARCH64_FLAG_ARG_V;
     }
   else
     {
@@ -810,7 +809,7 @@ ffi_prep_cif_machdep (ffi_cif *cif)
       for (i = 0; i < cif->nargs; i++)
         if (is_v_register_candidate (cif->arg_types[i]))
           {
-            cif->aarch64_flags |= AARCH64_FFI_WITH_V;
+            cif->aarch64_flags |= AARCH64_FLAG_ARG_V;
             break;
           }
     }
@@ -924,7 +923,7 @@ ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
           }
         else
           {
-            memcpy (get_x_addr (&context, 8), &rvalue, sizeof (UINT64));
+	    context.x8 = (uintptr_t)rvalue;
             ffi_call_SYSV (aarch64_prep_args, &context, &ecif,
 			   stack_bytes, fn);
           }
@@ -1201,7 +1200,7 @@ ffi_closure_SYSV_inner (ffi_closure *closure, struct call_context *context,
     }
   else
     {
-      memcpy (&rvalue, get_x_addr (context, 8), sizeof (UINT64));
+      rvalue = (void *)(uintptr_t)context->x8;
       (closure->fun) (cif, rvalue, avalue, closure->user_data);
     }
 }
diff --git a/src/aarch64/ffitarget.h b/src/aarch64/ffitarget.h
index 4bbced2..336f28a 100644
--- a/src/aarch64/ffitarget.h
+++ b/src/aarch64/ffitarget.h
@@ -54,10 +54,4 @@ typedef enum ffi_abi
 #define FFI_EXTRA_CIF_FIELDS unsigned aarch64_flags
 #endif
 
-#define AARCH64_FFI_WITH_V_BIT 0
-
-#define AARCH64_N_XREG 32
-#define AARCH64_N_VREG 32
-#define AARCH64_CALL_CONTEXT_SIZE (AARCH64_N_XREG * 8 + AARCH64_N_VREG * 16)
-
 #endif
diff --git a/src/aarch64/internal.h b/src/aarch64/internal.h
new file mode 100644
index 0000000..b6b6104
--- /dev/null
+++ b/src/aarch64/internal.h
@@ -0,0 +1,26 @@
+/* 
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+``Software''), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
+
+#define AARCH64_FLAG_ARG_V_BIT	0
+#define AARCH64_FLAG_ARG_V	(1 << AARCH64_FLAG_ARG_V_BIT)
+
+#define N_X_ARG_REG		8
+#define N_V_ARG_REG		8
+#define CALL_CONTEXT_SIZE	(N_V_ARG_REG * 16 + N_X_ARG_REG * 8 + 16)
diff --git a/src/aarch64/sysv.S b/src/aarch64/sysv.S
index 169eab8..70870db 100644
--- a/src/aarch64/sysv.S
+++ b/src/aarch64/sysv.S
@@ -22,6 +22,7 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
 #define LIBFFI_ASM
 #include <fficonfig.h>
 #include <ffi.h>
+#include "internal.h"
 
 #ifdef HAVE_MACHINE_ASM_H
 #include <machine/asm.h>
@@ -43,13 +44,12 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
 #define cfi_def_cfa_register(reg)	.cfi_def_cfa_register reg
 
         .text
+        .align 2
+
         .globl CNAME(ffi_call_SYSV)
 #ifdef __ELF__
         .type CNAME(ffi_call_SYSV), #function
 #endif
-#ifdef __APPLE__
-        .align 2
-#endif
 
 /* ffi_call_SYSV()
 
@@ -142,42 +142,40 @@ CNAME(ffi_call_SYSV):
         mov     x23, x0
 
         /* Figure out if we should touch the vector registers.  */
-        tbz     x23, #AARCH64_FFI_WITH_V_BIT, 1f
+        tbz     x23, #AARCH64_FLAG_ARG_V_BIT, 1f
 
         /* Load the vector argument passing registers.  */
-        ldp     q0, q1, [x21, #8*32 +  0]
-        ldp     q2, q3, [x21, #8*32 + 32]
-        ldp     q4, q5, [x21, #8*32 + 64]
-        ldp     q6, q7, [x21, #8*32 + 96]
+        ldp     q0, q1, [x21, #0]
+        ldp     q2, q3, [x21, #32]
+        ldp     q4, q5, [x21, #64]
+        ldp     q6, q7, [x21, #96]
 1:
-        /* Load the core argument passing registers.  */
-        ldp     x0, x1, [x21,  #0]
-        ldp     x2, x3, [x21, #16]
-        ldp     x4, x5, [x21, #32]
-        ldp     x6, x7, [x21, #48]
-
-        /* Don't forget x8 which may be holding the address of a return buffer.
-	 */
-        ldr     x8,     [x21, #8*8]
+        /* Load the core argument passing registers, including
+	   the structure return pointer.  */
+        ldp     x0, x1, [x21, #16*N_V_ARG_REG + 0]
+        ldp     x2, x3, [x21, #16*N_V_ARG_REG + 16]
+        ldp     x4, x5, [x21, #16*N_V_ARG_REG + 32]
+        ldp     x6, x7, [x21, #16*N_V_ARG_REG + 48]
+        ldr     x8,     [x21, #16*N_V_ARG_REG + 64]
 
         blr     x24
 
         /* Save the core argument passing registers.  */
-        stp     x0, x1, [x21,  #0]
-        stp     x2, x3, [x21, #16]
-        stp     x4, x5, [x21, #32]
-        stp     x6, x7, [x21, #48]
+        stp     x0, x1, [x21, #16*N_V_ARG_REG + 0]
+        stp     x2, x3, [x21, #16*N_V_ARG_REG + 16]
+        stp     x4, x5, [x21, #16*N_V_ARG_REG + 32]
+        stp     x6, x7, [x21, #16*N_V_ARG_REG + 48]
 
         /* Note nothing useful ever comes back in x8!  */
 
         /* Figure out if we should touch the vector registers.  */
-        tbz     x23, #AARCH64_FFI_WITH_V_BIT, 1f
+        tbz     x23, #AARCH64_FLAG_ARG_V_BIT, 1f
 
         /* Save the vector argument passing registers.  */
-        stp     q0, q1, [x21, #8*32 + 0]
-        stp     q2, q3, [x21, #8*32 + 32]
-        stp     q4, q5, [x21, #8*32 + 64]
-        stp     q6, q7, [x21, #8*32 + 96]
+        stp     q0, q1, [x21, #0]
+        stp     q2, q3, [x21, #32]
+        stp     q4, q5, [x21, #64]
+        stp     q6, q7, [x21, #96]
 1:
         /* All done, unwind our stack frame.  */
         ldp     x21, x22, [x29,  # - ffi_call_SYSV_FS]
@@ -203,7 +201,7 @@ CNAME(ffi_call_SYSV):
         .size CNAME(ffi_call_SYSV), .-CNAME(ffi_call_SYSV)
 #endif
 
-#define ffi_closure_SYSV_FS (8 * 2 + AARCH64_CALL_CONTEXT_SIZE)
+#define ffi_closure_SYSV_FS (8 * 2 + CALL_CONTEXT_SIZE)
 
 /* ffi_closure_SYSV
 
@@ -243,10 +241,9 @@ CNAME(ffi_call_SYSV):
    Voila!  */
 
         .text
-        .globl CNAME(ffi_closure_SYSV)
-#ifdef __APPLE__
         .align 2
-#endif
+
+        .globl CNAME(ffi_closure_SYSV)
         .cfi_startproc
 CNAME(ffi_closure_SYSV):
         stp     x29, x30, [sp, #-16]!
@@ -268,24 +265,23 @@ CNAME(ffi_closure_SYSV):
         /* Preserve our struct trampoline_data *  */
         mov     x22, x17
 
-        /* Save the rest of the argument passing registers.  */
-        stp     x0, x1, [x21, #0]
-        stp     x2, x3, [x21, #16]
-        stp     x4, x5, [x21, #32]
-        stp     x6, x7, [x21, #48]
-        /* Don't forget we may have been given a result scratch pad address.
-	 */
-        str     x8,     [x21, #64]
+        /* Save the rest of the argument passing registers, including
+	   the structure return pointer.  */
+        stp     x0, x1, [x21, #16*N_V_ARG_REG + 0]
+        stp     x2, x3, [x21, #16*N_V_ARG_REG + 16]
+        stp     x4, x5, [x21, #16*N_V_ARG_REG + 32]
+        stp     x6, x7, [x21, #16*N_V_ARG_REG + 48]
+        str     x8,     [x21, #16*N_V_ARG_REG + 64]
 
         /* Figure out if we should touch the vector registers.  */
         ldr     x0, [x22, #8]
-        tbz     x0, #AARCH64_FFI_WITH_V_BIT, 1f
+        tbz     x0, #AARCH64_FLAG_ARG_V_BIT, 1f
 
         /* Save the argument passing vector registers.  */
-        stp     q0, q1, [x21, #8*32 + 0]
-        stp     q2, q3, [x21, #8*32 + 32]
-        stp     q4, q5, [x21, #8*32 + 64]
-        stp     q6, q7, [x21, #8*32 + 96]
+        stp     q0, q1, [x21, #0]
+        stp     q2, q3, [x21, #32]
+        stp     q4, q5, [x21, #64]
+        stp     q6, q7, [x21, #96]
 1:
         /* Load &ffi_closure..  */
         ldr     x0, [x22, #0]
@@ -298,19 +294,19 @@ CNAME(ffi_closure_SYSV):
 
         /* Figure out if we should touch the vector registers.  */
         ldr     x0, [x22, #8]
-        tbz     x0, #AARCH64_FFI_WITH_V_BIT, 1f
+        tbz     x0, #AARCH64_FLAG_ARG_V_BIT, 1f
 
         /* Load the result passing vector registers.  */
-        ldp     q0, q1, [x21, #8*32 + 0]
-        ldp     q2, q3, [x21, #8*32 + 32]
-        ldp     q4, q5, [x21, #8*32 + 64]
-        ldp     q6, q7, [x21, #8*32 + 96]
+        ldp     q0, q1, [x21, #0]
+        ldp     q2, q3, [x21, #32]
+        ldp     q4, q5, [x21, #64]
+        ldp     q6, q7, [x21, #96]
 1:
         /* Load the result passing core registers.  */
-        ldp     x0, x1, [x21,  #0]
-        ldp     x2, x3, [x21, #16]
-        ldp     x4, x5, [x21, #32]
-        ldp     x6, x7, [x21, #48]
+        ldp     x0, x1, [x21, #16*N_V_ARG_REG + 0]
+        ldp     x2, x3, [x21, #16*N_V_ARG_REG + 16]
+        ldp     x4, x5, [x21, #16*N_V_ARG_REG + 32]
+        ldp     x6, x7, [x21, #16*N_V_ARG_REG + 48]
         /* Note nothing useful is returned in x8.  */
 
         /* We are done, unwind our frame.  */
-- 
1.9.3

* [PATCH 11/16] aarch64: Move return value handling into ffi_closure_SYSV
  2014-10-28 18:53 [PATCH 00/16] Go closures for aarch64 Richard Henderson
  2014-10-28 18:53 ` [PATCH 03/16] aarch64: Always distinguish LONGDOUBLE Richard Henderson
  2014-10-28 18:53 ` [PATCH 01/16] aarch64: Fix non-apple compilation Richard Henderson
@ 2014-10-28 18:54 ` Richard Henderson
  2014-10-28 18:54 ` [PATCH 10/16] aarch64: Move return value handling into ffi_call_SYSV Richard Henderson
                   ` (13 subsequent siblings)
  16 siblings, 0 replies; 20+ messages in thread
From: Richard Henderson @ 2014-10-28 18:54 UTC
  To: libffi-discuss; +Cc: Richard Henderson

From: Richard Henderson <rth@redhat.com>

As with the change to ffi_call_SYSV, this avoids copying data
into a temporary buffer.
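
A rough sketch of the shape of the change (simplified and
hypothetical; the real dispatch lives in sysv.S): instead of having
the inner C function write the result to a scratch buffer and then
copy it into the saved register context, the inner function reports
the return type back to the assembly stub, which loads the result
registers directly.

    /* Hypothetical pseudo-code of the post-patch flow.  */
    flags = ffi_closure_SYSV_inner (closure, context, stack, rvalue);
    /* the stub then loads x0/x1 or q0-q3 from rvalue according to
       flags, with no intermediate copy.  */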
---
 src/aarch64/ffi.c       | 196 +++++++------------------------------
 src/aarch64/ffitarget.h |   2 +-
 src/aarch64/sysv.S      | 249 +++++++++++++++++++++++++++---------------------
 3 files changed, 176 insertions(+), 271 deletions(-)

diff --git a/src/aarch64/ffi.c b/src/aarch64/ffi.c
index ffa1363..c5a429a 100644
--- a/src/aarch64/ffi.c
+++ b/src/aarch64/ffi.c
@@ -71,9 +71,6 @@ ffi_clear_cache (void *start, void *end)
 #endif
 }
 
-extern void
-ffi_closure_SYSV (ffi_closure *);
-
 /* Test for an FFI floating point representation.  */
 
 static unsigned
@@ -211,69 +208,6 @@ is_hfa(const ffi_type *ty)
   return (ele_count << 8) | candidate;
 }
 
-/* Test if an ffi_type is a candidate for passing in a register.
-
-   This test does not check that sufficient registers of the
-   appropriate class are actually available, merely that IFF
-   sufficient registers are available then the argument will be passed
-   in register(s).
-
-   Note that an ffi_type that is deemed to be a register candidate
-   will always be returned in registers.
-
-   Returns 1 if a register candidate else 0.  */
-
-static int
-is_register_candidate (ffi_type *ty)
-{
-  switch (ty->type)
-    {
-    case FFI_TYPE_VOID:
-      return 0;
-    case FFI_TYPE_FLOAT:
-    case FFI_TYPE_DOUBLE:
-    case FFI_TYPE_LONGDOUBLE:
-    case FFI_TYPE_UINT8:
-    case FFI_TYPE_UINT16:
-    case FFI_TYPE_UINT32:
-    case FFI_TYPE_UINT64:
-    case FFI_TYPE_POINTER:
-    case FFI_TYPE_SINT8:
-    case FFI_TYPE_SINT16:
-    case FFI_TYPE_SINT32:
-    case FFI_TYPE_INT:
-    case FFI_TYPE_SINT64:
-      return 1;
-
-    case FFI_TYPE_STRUCT:
-      if (is_hfa (ty))
-        {
-          return 1;
-        }
-      else if (ty->size > 16)
-        {
-          /* Too large. Will be replaced with a pointer to memory. The
-             pointer MAY be passed in a register, but the value will
-             not. This test specifically fails since the argument will
-             never be passed by value in registers. */
-          return 0;
-        }
-      else
-        {
-          /* Might be passed in registers depending on the number of
-             registers required. */
-          return (ty->size + 7) / 8 < N_X_ARG_REG;
-        }
-      break;
-
-    default:
-      FFI_ASSERT (0);
-      break;
-    }
-
-  return 0;
-}
-
 /* Test if an ffi_type argument or result is a candidate for a vector
    register.  */
 
@@ -797,42 +731,42 @@ ffi_call (ffi_cif *cif, void (*fn)(void), void *orig_rvalue, void **avalue)
     memcpy (orig_rvalue, rvalue, rtype_size);
 }
 
-static unsigned char trampoline [] =
-{ 0x70, 0x00, 0x00, 0x58,	/* ldr	x16, 1f	*/
-  0x91, 0x00, 0x00, 0x10,	/* adr	x17, 2f	*/
-  0x00, 0x02, 0x1f, 0xd6	/* br	x16	*/
-};
-
 /* Build a trampoline.  */
 
-#define FFI_INIT_TRAMPOLINE(TRAMP,FUN,CTX,FLAGS)			\
-  ({unsigned char *__tramp = (unsigned char*)(TRAMP);			\
-    UINT64  __fun = (UINT64)(FUN);					\
-    UINT64  __ctx = (UINT64)(CTX);					\
-    UINT64  __flags = (UINT64)(FLAGS);					\
-    memcpy (__tramp, trampoline, sizeof (trampoline));			\
-    memcpy (__tramp + 12, &__fun, sizeof (__fun));			\
-    memcpy (__tramp + 20, &__ctx, sizeof (__ctx));			\
-    memcpy (__tramp + 28, &__flags, sizeof (__flags));			\
-    ffi_clear_cache(__tramp, __tramp + FFI_TRAMPOLINE_SIZE);		\
-  })
+extern void ffi_closure_SYSV (void) FFI_HIDDEN;
+extern void ffi_closure_SYSV_V (void) FFI_HIDDEN;
 
 ffi_status
-ffi_prep_closure_loc (ffi_closure* closure,
+ffi_prep_closure_loc (ffi_closure *closure,
                       ffi_cif* cif,
                       void (*fun)(ffi_cif*,void*,void**,void*),
                       void *user_data,
                       void *codeloc)
 {
+  static const unsigned char trampoline[16] = {
+    0x90, 0x00, 0x00, 0x58,	/* ldr	x16, tramp+16	*/
+    0xf1, 0xff, 0xff, 0x10,	/* adr	x17, tramp+0	*/
+    0x00, 0x02, 0x1f, 0xd6	/* br	x16		*/
+  };
+  char *tramp = closure->tramp;
+  void (*start)(void);
+
   if (cif->abi != FFI_SYSV)
     return FFI_BAD_ABI;
 
-  FFI_INIT_TRAMPOLINE (&closure->tramp[0], &ffi_closure_SYSV, codeloc,
-		       cif->aarch64_flags);
-
-  closure->cif  = cif;
+  closure->cif = cif;
+  closure->fun = fun;
   closure->user_data = user_data;
-  closure->fun  = fun;
+
+  memcpy (tramp, trampoline, sizeof(trampoline));
+
+  if (cif->flags & AARCH64_FLAG_ARG_V)
+    start = ffi_closure_SYSV_V;
+  else
+    start = ffi_closure_SYSV;
+  *(UINT64 *)(tramp + 16) = (uintptr_t)start;
+
+  ffi_clear_cache(tramp, tramp + FFI_TRAMPOLINE_SIZE);
 
   return FFI_OK;
 }
@@ -853,20 +787,20 @@ ffi_prep_closure_loc (ffi_closure* closure,
    descriptors, invokes the wrapped function, then marshalls the return
    value back into the call context.  */
 
-void FFI_HIDDEN
-ffi_closure_SYSV_inner (ffi_closure *closure, struct call_context *context,
-			void *stack)
+int FFI_HIDDEN
+ffi_closure_SYSV_inner (ffi_cif *cif,
+			void (*fun)(ffi_cif*,void*,void**,void*),
+			void *user_data,
+			struct call_context *context,
+			void *stack, void *rvalue)
 {
-  ffi_cif *cif = closure->cif;
   void **avalue = (void**) alloca (cif->nargs * sizeof (void*));
-  void *rvalue = NULL;
-  int i, h, nargs = cif->nargs;
+  int i, h, nargs, flags;
   struct arg_state state;
-  ffi_type *rtype;
 
   arg_init (&state);
 
-  for (i = 0; i < nargs; i++)
+  for (i = 0, nargs = cif->nargs; i < nargs; i++)
     {
       ffi_type *ty = cif->arg_types[i];
       int t = ty->type;
@@ -955,69 +889,11 @@ ffi_closure_SYSV_inner (ffi_closure *closure, struct call_context *context,
 	}
     }
 
-  /* Figure out where the return value will be passed, either in registers
-     or in a memory block allocated by the caller and passed in x8.  */
-  rtype = cif->rtype;
-  if (is_register_candidate (rtype))
-    {
-      size_t s = rtype->size;
-      int t;
-
-      /* Register candidates are *always* returned in registers. */
-
-      /* Allocate a scratchpad for the return value, we will let the
-         callee scrible the result into the scratch pad then move the
-         contents into the appropriate return value location for the
-         call convention.  */
-      rvalue = alloca (s);
-      (closure->fun) (cif, rvalue, avalue, closure->user_data);
-
-      /* Copy the return value into the call context so that it is returned
-         as expected to our caller.  */
-      t = rtype->type;
-      switch (t)
-        {
-        case FFI_TYPE_VOID:
-          break;
-
-        case FFI_TYPE_INT:
-        case FFI_TYPE_UINT8:
-        case FFI_TYPE_UINT16:
-        case FFI_TYPE_UINT32:
-        case FFI_TYPE_UINT64:
-        case FFI_TYPE_SINT8:
-        case FFI_TYPE_SINT16:
-        case FFI_TYPE_SINT32:
-        case FFI_TYPE_SINT64:
-        case FFI_TYPE_POINTER:
-	  context->x[0] = extend_integer_type (rvalue, t);
-          break;
-
-        case FFI_TYPE_FLOAT:
-        case FFI_TYPE_DOUBLE:
-        case FFI_TYPE_LONGDOUBLE:
-	  extend_hfa_type (&context->v[0], rvalue, 0x100 + t);
-	  break;
+  flags = cif->flags;
+  if (flags & AARCH64_RET_IN_MEM)
+    rvalue = (void *)(uintptr_t)context->x8;
 
-        case FFI_TYPE_STRUCT:
-	  h = is_hfa (cif->rtype);
-          if (h)
-	    extend_hfa_type (&context->v[0], rvalue, h);
-          else
-	    {
-	      FFI_ASSERT (s <= 16);
-              memcpy (&context->x[0], rvalue, s);
-            }
-          break;
+  fun (cif, rvalue, avalue, user_data);
 
-        default:
-          abort();
-        }
-    }
-  else
-    {
-      rvalue = (void *)(uintptr_t)context->x8;
-      (closure->fun) (cif, rvalue, avalue, closure->user_data);
-    }
+  return flags;
 }
-
diff --git a/src/aarch64/ffitarget.h b/src/aarch64/ffitarget.h
index 336f28a..b488bbe 100644
--- a/src/aarch64/ffitarget.h
+++ b/src/aarch64/ffitarget.h
@@ -42,7 +42,7 @@ typedef enum ffi_abi
 /* ---- Definitions for closures ----------------------------------------- */
 
 #define FFI_CLOSURES 1
-#define FFI_TRAMPOLINE_SIZE 36
+#define FFI_TRAMPOLINE_SIZE 24
 #define FFI_NATIVE_RAW_API 0
 
 /* ---- Internal ---- */
diff --git a/src/aarch64/sysv.S b/src/aarch64/sysv.S
index ba15663..abd848d 100644
--- a/src/aarch64/sysv.S
+++ b/src/aarch64/sysv.S
@@ -39,15 +39,15 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
 #endif
 #endif
 
+#ifdef __AARCH64EB__
+# define BE(X)	X
+#else
+# define BE(X)	0
+#endif
+
 	.text
 	.align 4
 
-	.globl	CNAME(ffi_call_SYSV)
-#ifdef __ELF__
-	.type	CNAME(ffi_call_SYSV), #function
-	.hidden	CNAME(ffi_call_SYSV)
-#endif
-
 /* ffi_call_SYSV
    extern void ffi_call_SYSV (void *stack, void *frame,
 			      void (*fn)(void), void *rvalue, int flags);
@@ -179,131 +179,160 @@ CNAME(ffi_call_SYSV):
 	nop
 
 	cfi_endproc
+
+	.globl	CNAME(ffi_call_SYSV)
 #ifdef __ELF__
-        .size CNAME(ffi_call_SYSV), .-CNAME(ffi_call_SYSV)
+	.type	CNAME(ffi_call_SYSV), #function
+	.hidden	CNAME(ffi_call_SYSV)
+	.size CNAME(ffi_call_SYSV), .-CNAME(ffi_call_SYSV)
 #endif
 
-#define ffi_closure_SYSV_FS (8 * 2 + CALL_CONTEXT_SIZE)
-
 /* ffi_closure_SYSV
 
   Closure invocation glue. This is the low-level code invoked directly by
   the closure trampoline to set up and call a closure.
 
-   On entry x17 points to a struct trampoline_data, x16 has been clobbered
+   On entry x17 points to a struct ffi_closure, x16 has been clobbered
    all other registers are preserved.
 
    We allocate a call context and save the argument passing registers,
   then invoke the generic C ffi_closure_SYSV_inner() function to do all
   the real work; on return we load the result passing registers back from
    the call context.
+*/
 
-   On entry
-
-   extern void
-   ffi_closure_SYSV (struct trampoline_data *);
-
-   struct trampoline_data
-   {
-        UINT64 *ffi_closure;
-        UINT64 flags;
-   };
-
-   This function uses the following stack frame layout:
-
-   ==
-                saved x30(lr)
-   x29(fp)->    saved x29(fp)
-                saved x22
-                saved x21
-                ...
-   sp     ->    call_context
-   ==
+#define ffi_closure_SYSV_FS (8*2 + CALL_CONTEXT_SIZE + 64)
 
-   Voila!  */
+	.align 4
+CNAME(ffi_closure_SYSV_V):
+	cfi_startproc
+	stp     x29, x30, [sp, #-ffi_closure_SYSV_FS]!
+	cfi_adjust_cfa_offset (ffi_closure_SYSV_FS)
+	cfi_rel_offset (x29, 0)
+	cfi_rel_offset (x30, 8)
 
-        .text
-        .align 4
+	/* Save the argument passing vector registers.  */
+	stp     q0, q1, [sp, #16 + 0]
+	stp     q2, q3, [sp, #16 + 32]
+	stp     q4, q5, [sp, #16 + 64]
+	stp     q6, q7, [sp, #16 + 96]
+	b	0f
+	cfi_endproc
 
-        .globl	CNAME(ffi_closure_SYSV)
+	.globl	CNAME(ffi_closure_SYSV_V)
 #ifdef __ELF__
-	.type	CNAME(ffi_closure_SYSV), #function
-	.hidden	CNAME(ffi_closure_SYSV)
+	.type	CNAME(ffi_closure_SYSV_V), #function
+	.hidden	CNAME(ffi_closure_SYSV_V)
+	.size	CNAME(ffi_closure_SYSV_V), . - CNAME(ffi_closure_SYSV_V)
 #endif
-        cfi_startproc
-CNAME(ffi_closure_SYSV):
-        stp     x29, x30, [sp, #-16]!
-	cfi_adjust_cfa_offset (16)
-        cfi_rel_offset (x29, 0)
-        cfi_rel_offset (x30, 8)
-
-        mov     x29, sp
-        cfi_def_cfa_register (x29)
-
-        sub     sp, sp, #ffi_closure_SYSV_FS
-
-        stp     x21, x22, [x29, #-16]
-        cfi_rel_offset (x21, -16)
-        cfi_rel_offset (x22, -8)
-
-        /* Load x21 with &call_context.  */
-        mov     x21, sp
-        /* Preserve our struct trampoline_data *  */
-        mov     x22, x17
-
-        /* Save the rest of the argument passing registers, including
-	   the structure return pointer.  */
-        stp     x0, x1, [x21, #16*N_V_ARG_REG + 0]
-        stp     x2, x3, [x21, #16*N_V_ARG_REG + 16]
-        stp     x4, x5, [x21, #16*N_V_ARG_REG + 32]
-        stp     x6, x7, [x21, #16*N_V_ARG_REG + 48]
-        str     x8,     [x21, #16*N_V_ARG_REG + 64]
-
-        /* Figure out if we should touch the vector registers.  */
-        ldr     x0, [x22, #8]
-        tbz     x0, #AARCH64_FLAG_ARG_V_BIT, 1f
-
-        /* Save the argument passing vector registers.  */
-        stp     q0, q1, [x21, #0]
-        stp     q2, q3, [x21, #32]
-        stp     q4, q5, [x21, #64]
-        stp     q6, q7, [x21, #96]
-1:
-        /* Load &ffi_closure..  */
-        ldr     x0, [x22, #0]
-        mov     x1, x21
-        /* Compute the location of the stack at the point that the
-           trampoline was called.  */
-        add     x2, x29, #16
-
-        bl      CNAME(ffi_closure_SYSV_inner)
-
-        /* Figure out if we should touch the vector registers.  */
-        ldr     x0, [x22, #8]
-        tbz     x0, #AARCH64_FLAG_ARG_V_BIT, 1f
-
-        /* Load the result passing vector registers.  */
-        ldp     q0, q1, [x21, #0]
-        ldp     q2, q3, [x21, #32]
-1:
-        /* Load the result passing core registers.  */
-        ldp     x0, x1, [x21, #16*N_V_ARG_REG + 0]
-
-        /* We are done, unwind our frame.  */
-        ldp     x21, x22, [x29,  #-16]
-        cfi_restore (x21)
-        cfi_restore (x22)
 
-        mov     sp, x29
-        cfi_def_cfa_register (sp)
-
-        ldp     x29, x30, [sp], #16
-	cfi_adjust_cfa_offset (-16)
-        cfi_restore (x29)
-        cfi_restore (x30)
-
-        ret
+	.align	4
+	cfi_startproc
+CNAME(ffi_closure_SYSV):
+	stp     x29, x30, [sp, #-ffi_closure_SYSV_FS]!
+	cfi_adjust_cfa_offset (ffi_closure_SYSV_FS)
+	cfi_rel_offset (x29, 0)
+	cfi_rel_offset (x30, 8)
+0:
+	mov     x29, sp
+
+	/* Save the argument passing core registers.  */
+	stp     x0, x1, [sp, #16 + 16*N_V_ARG_REG + 0]
+	stp     x2, x3, [sp, #16 + 16*N_V_ARG_REG + 16]
+	stp     x4, x5, [sp, #16 + 16*N_V_ARG_REG + 32]
+	stp     x6, x7, [sp, #16 + 16*N_V_ARG_REG + 48]
+	str     x8,     [sp, #16 + 16*N_V_ARG_REG + 64]
+
+	/* Load ffi_closure_inner arguments.  */
+	ldp	x0, x1, [x17, #FFI_TRAMPOLINE_SIZE]	/* load cif, fn */
+	ldr	x2, [x17, #FFI_TRAMPOLINE_SIZE+16]	/* load user_data */
+	add	x3, sp, #16				/* load context */
+	add	x4, sp, #ffi_closure_SYSV_FS		/* load stack */
+	add	x5, sp, #16+CALL_CONTEXT_SIZE		/* load rvalue */
+	bl      CNAME(ffi_closure_SYSV_inner)
+
+	/* Load the return value as directed.  */
+	adr	x1, 0f
+	and	w0, w0, #AARCH64_RET_MASK
+	add	x1, x1, x0, lsl #3
+	add	x3, sp, #16+CALL_CONTEXT_SIZE
+	br	x1
+
+	/* Note that each table entry is 2 insns, and thus 8 bytes.  */
+	.align	4
+0:	b	99f			/* VOID */
+	nop
+1:	ldr	x0, [x3]		/* INT64 */
+	b	99f
+2:	ldp	x0, x1, [x3]		/* INT128 */
+	b	99f
+3:	brk	#1000			/* UNUSED */
+	nop
+4:	brk	#1000			/* UNUSED */
+	nop
+5:	brk	#1000			/* UNUSED */
+	nop
+6:	brk	#1000			/* UNUSED */
+	nop
+7:	brk	#1000			/* UNUSED */
+	nop
+8:	ldr	s3, [x3, #12]		/* S4 */
+	nop
+9:	ldr	s2, [x3, #8]		/* S3 */
+	nop
+10:	ldp	s0, s1, [x3]		/* S2 */
+	b	99f
+11:	ldr	s0, [x3]		/* S1 */
+	b	99f
+12:	ldr	d3, [x3, #24]		/* D4 */
+	nop
+13:	ldr	d2, [x3, #16]		/* D3 */
+	nop
+14:	ldp	d0, d1, [x3]		/* D2 */
+	b	99f
+15:	ldr	d0, [x3]		/* D1 */
+	b	99f
+16:	ldr	q3, [x3, #48]		/* Q4 */
+	nop
+17:	ldr	q2, [x3, #32]		/* Q3 */
+	nop
+18:	ldp	q0, q1, [x3]		/* Q2 */
+	b	99f
+19:	ldr	q0, [x3]		/* Q1 */
+	b	99f
+20:	ldrb	w0, [x3, #BE(7)]	/* UINT8 */
+	b	99f
+21:	brk	#1000			/* reserved */
+	nop
+22:	ldrh	w0, [x3, #BE(6)]	/* UINT16 */
+	b	99f
+23:	brk	#1000			/* reserved */
+	nop
+24:	ldr	w0, [x3, #BE(4)]	/* UINT32 */
+	b	99f
+25:	brk	#1000			/* reserved */
+	nop
+26:	ldrsb	x0, [x3, #BE(7)]	/* SINT8 */
+	b	99f
+27:	brk	#1000			/* reserved */
+	nop
+28:	ldrsh	x0, [x3, #BE(6)]	/* SINT16 */
+	b	99f
+29:	brk	#1000			/* reserved */
+	nop
+30:	ldrsw	x0, [x3, #BE(4)]	/* SINT32 */
+	nop
+31:					/* reserved */
+99:	ldp     x29, x30, [sp], #ffi_closure_SYSV_FS
+	cfi_adjust_cfa_offset (-ffi_closure_SYSV_FS)
+	cfi_restore (x29)
+	cfi_restore (x30)
+	ret
 	cfi_endproc
+
+	.globl	CNAME(ffi_closure_SYSV)
 #ifdef __ELF__
-        .size CNAME(ffi_closure_SYSV), .-CNAME(ffi_closure_SYSV)
+	.type	CNAME(ffi_closure_SYSV), #function
+	.hidden	CNAME(ffi_closure_SYSV)
+	.size	CNAME(ffi_closure_SYSV), . - CNAME(ffi_closure_SYSV)
 #endif
-- 
1.9.3

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH 14/16] aarch64: Add support for complex types
  2014-10-28 18:53 [PATCH 00/16] Go closures for aarch64 Richard Henderson
                   ` (14 preceding siblings ...)
  2014-10-28 18:54 ` [PATCH 06/16] aarch64: Use correct return registers Richard Henderson
@ 2014-10-28 18:54 ` Richard Henderson
  2014-11-10 10:12 ` [PATCH 00/16] Go closures for aarch64 James Greenhalgh
  16 siblings, 0 replies; 20+ messages in thread
From: Richard Henderson @ 2014-10-28 18:54 UTC (permalink / raw)
  To: libffi-discuss; +Cc: Richard Henderson

From: Richard Henderson <rth@redhat.com>

---
 src/aarch64/ffi.c              | 34 +++++++++++++++++++++++++---------
 src/aarch64/ffitarget.h        |  2 ++
 testsuite/libffi.call/call.exp | 10 +++-------
 3 files changed, 30 insertions(+), 16 deletions(-)

diff --git a/src/aarch64/ffi.c b/src/aarch64/ffi.c
index b3e0b16..4f85140 100644
--- a/src/aarch64/ffi.c
+++ b/src/aarch64/ffi.c
@@ -85,7 +85,7 @@ is_hfa0 (const ffi_type *ty)
     for (i = 0; elements[i]; ++i)
       {
         ret = elements[i]->type;
-        if (ret == FFI_TYPE_STRUCT)
+        if (ret == FFI_TYPE_STRUCT || ret == FFI_TYPE_COMPLEX)
           {
             ret = is_hfa0 (elements[i]);
             if (ret < 0)
@@ -110,7 +110,7 @@ is_hfa1 (const ffi_type *ty, int candidate)
     for (i = 0; elements[i]; ++i)
       {
         int t = elements[i]->type;
-        if (t == FFI_TYPE_STRUCT)
+        if (t == FFI_TYPE_STRUCT || t == FFI_TYPE_COMPLEX)
           {
             if (!is_hfa1 (elements[i], candidate))
               return 0;
@@ -138,16 +138,27 @@ is_vfp_type (const ffi_type *ty)
   size_t size, ele_count;
 
   /* Quickest tests first.  */
-  switch (ty->type)
+  candidate = ty->type;
+  switch (candidate)
     {
     default:
       return 0;
     case FFI_TYPE_FLOAT:
-      return AARCH64_RET_S1;
     case FFI_TYPE_DOUBLE:
-      return AARCH64_RET_D1;
     case FFI_TYPE_LONGDOUBLE:
-      return AARCH64_RET_Q1;
+      ele_count = 1;
+      goto done;
+    case FFI_TYPE_COMPLEX:
+      candidate = ty->elements[0]->type;
+      switch (candidate)
+	{
+	case FFI_TYPE_FLOAT:
+	case FFI_TYPE_DOUBLE:
+	case FFI_TYPE_LONGDOUBLE:
+	  ele_count = 2;
+	  goto done;
+	}
+      return 0;
     case FFI_TYPE_STRUCT:
       break;
     }
@@ -160,7 +171,7 @@ is_vfp_type (const ffi_type *ty)
   /* Find the type of the first non-structure member.  */
   elements = ty->elements;
   candidate = elements[0]->type;
-  if (candidate == FFI_TYPE_STRUCT)
+  if (candidate == FFI_TYPE_STRUCT || candidate == FFI_TYPE_COMPLEX)
     {
       for (i = 0; ; ++i)
         {
@@ -198,16 +209,18 @@ is_vfp_type (const ffi_type *ty)
   /* Finally, make sure that all scalar elements are the same type.  */
   for (i = 0; elements[i]; ++i)
     {
-      if (elements[i]->type == FFI_TYPE_STRUCT)
+      int t = elements[i]->type;
+      if (t == FFI_TYPE_STRUCT || t == FFI_TYPE_COMPLEX)
         {
           if (!is_hfa1 (elements[i], candidate))
             return 0;
         }
-      else if (elements[i]->type != candidate)
+      else if (t != candidate)
         return 0;
     }
 
   /* All tests succeeded.  Encode the result.  */
+ done:
   return candidate * 4 + (4 - ele_count);
 }
 
@@ -474,6 +487,7 @@ ffi_prep_cif_machdep (ffi_cif *cif)
     case FFI_TYPE_DOUBLE:
     case FFI_TYPE_LONGDOUBLE:
     case FFI_TYPE_STRUCT:
+    case FFI_TYPE_COMPLEX:
       flags = is_vfp_type (rtype);
       if (flags == 0)
 	{
@@ -618,6 +632,7 @@ ffi_call (ffi_cif *cif, void (*fn)(void), void *orig_rvalue, void **avalue)
 	case FFI_TYPE_DOUBLE:
 	case FFI_TYPE_LONGDOUBLE:
 	case FFI_TYPE_STRUCT:
+	case FFI_TYPE_COMPLEX:
 	  {
 	    void *dest;
 
@@ -788,6 +803,7 @@ ffi_closure_SYSV_inner (ffi_cif *cif,
 	case FFI_TYPE_DOUBLE:
 	case FFI_TYPE_LONGDOUBLE:
 	case FFI_TYPE_STRUCT:
+	case FFI_TYPE_COMPLEX:
 	  h = is_vfp_type (ty);
 	  if (h)
 	    {
diff --git a/src/aarch64/ffitarget.h b/src/aarch64/ffitarget.h
index 6d6d3e6..7461386 100644
--- a/src/aarch64/ffitarget.h
+++ b/src/aarch64/ffitarget.h
@@ -52,4 +52,6 @@ typedef enum ffi_abi
 #define FFI_EXTRA_CIF_FIELDS unsigned aarch64_nfixedargs
 #endif
 
+#define FFI_TARGET_HAS_COMPLEX_TYPE
+
 #endif
diff --git a/testsuite/libffi.call/call.exp b/testsuite/libffi.call/call.exp
index 5177f07..ceacd49 100644
--- a/testsuite/libffi.call/call.exp
+++ b/testsuite/libffi.call/call.exp
@@ -24,16 +24,12 @@ set ctlist [lsearch -inline -all -glob [lsort [glob -nocomplain -- $srcdir/$subd
 
 run-many-tests $tlist ""
 
-if { ![istarget s390*] } {
-
+if { [istarget s390*] || [istarget aarch64*] } {
+    run-many-tests $ctlist ""
+} else {
     foreach test $ctlist {
 	unsupported "$test"
     }
-
-} else {
-
-  run-many-tests $ctlist ""
-
 }
 
 dg-finish
-- 
1.9.3

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH 16/16] aarch64: Add support for Go closures
  2014-10-28 18:53 [PATCH 00/16] Go closures for aarch64 Richard Henderson
                   ` (5 preceding siblings ...)
  2014-10-28 18:54 ` [PATCH 12/16] aarch64: Unify scalar fp and hfa handling Richard Henderson
@ 2014-10-28 18:54 ` Richard Henderson
  2014-10-28 18:54 ` [PATCH 02/16] aarch64: Improve is_hfa Richard Henderson
                   ` (9 subsequent siblings)
  16 siblings, 0 replies; 20+ messages in thread
From: Richard Henderson @ 2014-10-28 18:54 UTC (permalink / raw)
  To: libffi-discuss; +Cc: Richard Henderson

From: Richard Henderson <rth@redhat.com>

---
 src/aarch64/ffi.c       | 52 +++++++++++++++++++++++++++++++++++++----
 src/aarch64/ffitarget.h |  4 ++++
 src/aarch64/sysv.S      | 62 ++++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 112 insertions(+), 6 deletions(-)

diff --git a/src/aarch64/ffi.c b/src/aarch64/ffi.c
index f546ab2..0cace9d 100644
--- a/src/aarch64/ffi.c
+++ b/src/aarch64/ffi.c
@@ -539,13 +539,14 @@ ffi_status ffi_prep_cif_machdep_var(ffi_cif *cif,
 #endif /* __APPLE__ */
 
 extern void ffi_call_SYSV (struct call_context *context, void *frame,
-			   void (*fn)(void), void *rvalue, int flags)
-	FFI_HIDDEN;
+			   void (*fn)(void), void *rvalue, int flags,
+			   void *closure) FFI_HIDDEN;
 
 /* Call a function with the provided arguments and capture the return
    value.  */
-void
-ffi_call (ffi_cif *cif, void (*fn)(void), void *orig_rvalue, void **avalue)
+static void
+ffi_call_int (ffi_cif *cif, void (*fn)(void), void *orig_rvalue,
+	      void **avalue, void *closure)
 {
   struct call_context *context;
   void *stack, *frame, *rvalue;
@@ -698,12 +699,27 @@ ffi_call (ffi_cif *cif, void (*fn)(void), void *orig_rvalue, void **avalue)
 #endif
     }
 
-  ffi_call_SYSV (context, frame, fn, rvalue, flags);
+  ffi_call_SYSV (context, frame, fn, rvalue, flags, closure);
 
   if (flags & AARCH64_RET_NEED_COPY)
     memcpy (orig_rvalue, rvalue, rtype_size);
 }
 
+void
+ffi_call (ffi_cif *cif, void (*fn) (void), void *rvalue, void **avalue)
+{
+  ffi_call_int (cif, fn, rvalue, avalue, NULL);
+}
+
+#ifdef FFI_GO_CLOSURES
+void
+ffi_call_go (ffi_cif *cif, void (*fn) (void), void *rvalue,
+	     void **avalue, void *closure)
+{
+  ffi_call_int (cif, fn, rvalue, avalue, closure);
+}
+#endif /* FFI_GO_CLOSURES */
+
 /* Build a trampoline.  */
 
 extern void ffi_closure_SYSV (void) FFI_HIDDEN;
@@ -744,6 +760,32 @@ ffi_prep_closure_loc (ffi_closure *closure,
   return FFI_OK;
 }
 
+#ifdef FFI_GO_CLOSURES
+extern void ffi_go_closure_SYSV (void) FFI_HIDDEN;
+extern void ffi_go_closure_SYSV_V (void) FFI_HIDDEN;
+
+ffi_status
+ffi_prep_go_closure (ffi_go_closure *closure, ffi_cif* cif,
+                     void (*fun)(ffi_cif*,void*,void**,void*))
+{
+  void (*start)(void);
+
+  if (cif->abi != FFI_SYSV)
+    return FFI_BAD_ABI;
+
+  if (cif->flags & AARCH64_FLAG_ARG_V)
+    start = ffi_go_closure_SYSV_V;
+  else
+    start = ffi_go_closure_SYSV;
+
+  closure->tramp = start;
+  closure->cif = cif;
+  closure->fun = fun;
+
+  return FFI_OK;
+}
+#endif /* FFI_GO_CLOSURES */
+
 /* Primary handler to setup and invoke a function within a closure.
 
    A closure when invoked enters via the assembler wrapper
diff --git a/src/aarch64/ffitarget.h b/src/aarch64/ffitarget.h
index 7461386..80d09af 100644
--- a/src/aarch64/ffitarget.h
+++ b/src/aarch64/ffitarget.h
@@ -50,6 +50,10 @@ typedef enum ffi_abi
 #if defined (__APPLE__)
 #define FFI_TARGET_SPECIFIC_VARIADIC
 #define FFI_EXTRA_CIF_FIELDS unsigned aarch64_nfixedargs
+#else
+/* iOS reserves x18 for the system.  Disable Go closures until
+   a new static chain is chosen.  */
+#define FFI_GO_CLOSURES 1
 #endif
 
 #define FFI_TARGET_HAS_COMPLEX_TYPE
diff --git a/src/aarch64/sysv.S b/src/aarch64/sysv.S
index 7f00a3f..1fb68f2 100644
--- a/src/aarch64/sysv.S
+++ b/src/aarch64/sysv.S
@@ -50,7 +50,8 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
 
 /* ffi_call_SYSV
    extern void ffi_call_SYSV (void *stack, void *frame,
-			      void (*fn)(void), void *rvalue, int flags);
+			      void (*fn)(void), void *rvalue,
+			      int flags, void *closure);
 
    Therefore on entry we have:
 
@@ -59,6 +60,7 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
    x2 fn
    x3 rvalue
    x4 flags
+   x5 closure
 */
 
 	cfi_startproc
@@ -74,6 +76,9 @@ CNAME(ffi_call_SYSV):
 
 	mov	x9, x2			/* save fn */
 	mov	x8, x3			/* install structure return */
+#ifdef FFI_GO_CLOSURES
+	mov	x18, x5			/* install static chain */
+#endif
 	stp	x3, x4, [x29, #16]	/* save rvalue and flags */
 
 	/* Load the vector argument passing registers, if necessary.  */
@@ -245,6 +250,7 @@ CNAME(ffi_closure_SYSV):
 	/* Load ffi_closure_inner arguments.  */
 	ldp	x0, x1, [x17, #FFI_TRAMPOLINE_SIZE]	/* load cif, fn */
 	ldr	x2, [x17, #FFI_TRAMPOLINE_SIZE+16]	/* load user_data */
+.Ldo_closure:
 	add	x3, sp, #16				/* load context */
 	add	x4, sp, #ffi_closure_SYSV_FS		/* load stack */
 	add	x5, sp, #16+CALL_CONTEXT_SIZE		/* load rvalue */
@@ -336,3 +342,57 @@ CNAME(ffi_closure_SYSV):
 	.hidden	CNAME(ffi_closure_SYSV)
 	.size	CNAME(ffi_closure_SYSV), . - CNAME(ffi_closure_SYSV)
 #endif
+
+#ifdef FFI_GO_CLOSURES
+	.align 4
+CNAME(ffi_go_closure_SYSV_V):
+	cfi_startproc
+	stp     x29, x30, [sp, #-ffi_closure_SYSV_FS]!
+	cfi_adjust_cfa_offset (ffi_closure_SYSV_FS)
+	cfi_rel_offset (x29, 0)
+	cfi_rel_offset (x30, 8)
+
+	/* Save the argument passing vector registers.  */
+	stp     q0, q1, [sp, #16 + 0]
+	stp     q2, q3, [sp, #16 + 32]
+	stp     q4, q5, [sp, #16 + 64]
+	stp     q6, q7, [sp, #16 + 96]
+	b	0f
+	cfi_endproc
+
+	.globl	CNAME(ffi_go_closure_SYSV_V)
+#ifdef __ELF__
+	.type	CNAME(ffi_go_closure_SYSV_V), #function
+	.hidden	CNAME(ffi_go_closure_SYSV_V)
+	.size	CNAME(ffi_go_closure_SYSV_V), . - CNAME(ffi_go_closure_SYSV_V)
+#endif
+
+	.align	4
+	cfi_startproc
+CNAME(ffi_go_closure_SYSV):
+	stp     x29, x30, [sp, #-ffi_closure_SYSV_FS]!
+	cfi_adjust_cfa_offset (ffi_closure_SYSV_FS)
+	cfi_rel_offset (x29, 0)
+	cfi_rel_offset (x30, 8)
+0:
+	mov     x29, sp
+
+	/* Save the argument passing core registers.  */
+	stp     x0, x1, [sp, #16 + 16*N_V_ARG_REG + 0]
+	stp     x2, x3, [sp, #16 + 16*N_V_ARG_REG + 16]
+	stp     x4, x5, [sp, #16 + 16*N_V_ARG_REG + 32]
+	stp     x6, x7, [sp, #16 + 16*N_V_ARG_REG + 48]
+
+	/* Load ffi_closure_inner arguments.  */
+	ldp	x0, x1, [x18, #8]			/* load cif, fn */
+	mov	x2, x18					/* load user_data */
+	b	.Ldo_closure
+	cfi_endproc
+
+	.globl	CNAME(ffi_go_closure_SYSV)
+#ifdef __ELF__
+	.type	CNAME(ffi_go_closure_SYSV), #function
+	.hidden	CNAME(ffi_go_closure_SYSV)
+	.size	CNAME(ffi_go_closure_SYSV), . - CNAME(ffi_go_closure_SYSV)
+#endif
+#endif /* FFI_GO_CLOSURES */
-- 
1.9.3

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH 02/16] aarch64: Improve is_hfa
  2014-10-28 18:53 [PATCH 00/16] Go closures for aarch64 Richard Henderson
                   ` (6 preceding siblings ...)
  2014-10-28 18:54 ` [PATCH 16/16] aarch64: Add support for Go closures Richard Henderson
@ 2014-10-28 18:54 ` Richard Henderson
  2014-10-28 18:54 ` [PATCH 13/16] aarch64: Remove aarch64_flags Richard Henderson
                   ` (8 subsequent siblings)
  16 siblings, 0 replies; 20+ messages in thread
From: Richard Henderson @ 2014-10-28 18:54 UTC (permalink / raw)
  To: libffi-discuss; +Cc: Richard Henderson

From: Richard Henderson <rth@redhat.com>

The functions get_homogeneous_type, element_count, and is_hfa are
intertwined and recompute the same data.  Return a compound value
from is_hfa that carries all of that data and avoids the recomputation.
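
A self-contained sketch of the compound encoding, using the bit layout
documented in the new is_hfa comment below (the example type code is
only for illustration):

  /* Sketch: pack/unpack the is_hfa result.
     Bits 0-7 hold the element type code; bits 8-10 the count (1-4).  */
  static int hfa_encode (int count, int type) { return (count << 8) | type; }
  static int hfa_type (int h)  { return h & 0xff; }
  static int hfa_count (int h) { return h >> 8; }

  /* e.g. a struct of four floats: h = hfa_encode (4, FFI_TYPE_FLOAT),
     so hfa_count (h) == 4 and hfa_type (h) == FFI_TYPE_FLOAT.  */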
---
 src/aarch64/ffi.c | 212 +++++++++++++++++++++++++++++++++---------------------
 1 file changed, 131 insertions(+), 81 deletions(-)

diff --git a/src/aarch64/ffi.c b/src/aarch64/ffi.c
index cdb7816..0834614 100644
--- a/src/aarch64/ffi.c
+++ b/src/aarch64/ffi.c
@@ -242,88 +242,132 @@ is_floating_type (unsigned short type)
 	  || type == FFI_TYPE_LONGDOUBLE);
 }
 
-/* Test for a homogeneous structure.  */
+/* A subroutine of is_hfa.  Given a structure type, return the type code
+   of the first non-structure element.  Recurse for structure elements.
+   Return -1 if the structure is in fact empty, i.e. no nested elements.  */
 
-static unsigned short
-get_homogeneous_type (ffi_type *ty)
+static int
+is_hfa0 (const ffi_type *ty)
 {
-  if (ty->type == FFI_TYPE_STRUCT && ty->elements)
-    {
-      unsigned i;
-      unsigned short candidate_type
-	= get_homogeneous_type (ty->elements[0]);
-      for (i =1; ty->elements[i]; i++)
-	{
-	  unsigned short iteration_type = 0;
-	  /* If we have a nested struct, we must find its homogeneous type.
-	     If that fits with our candidate type, we are still
-	     homogeneous.  */
-	  if (ty->elements[i]->type == FFI_TYPE_STRUCT
-	      && ty->elements[i]->elements)
-	    {
-	      iteration_type = get_homogeneous_type (ty->elements[i]);
-	    }
-	  else
-	    {
-	      iteration_type = ty->elements[i]->type;
-	    }
+  ffi_type **elements = ty->elements;
+  int i, ret = -1;
 
-	  /* If we are not homogeneous, return FFI_TYPE_STRUCT.  */
-	  if (candidate_type != iteration_type)
-	    return FFI_TYPE_STRUCT;
-	}
-      return candidate_type;
-    }
+  if (elements != NULL)
+    for (i = 0; elements[i]; ++i)
+      {
+        ret = elements[i]->type;
+        if (ret == FFI_TYPE_STRUCT)
+          {
+            ret = is_hfa0 (elements[i]);
+            if (ret < 0)
+              continue;
+          }
+        break;
+      }
 
-  /* Base case, we have no more levels of nesting, so we
-     are a basic type, and so, trivially homogeneous in that type.  */
-  return ty->type;
+  return ret;
 }
 
-/* Determine the number of elements within a STRUCT.
+/* A subroutine of is_hfa.  Given a structure type, return true if all
+   of the non-structure elements are the same as CANDIDATE.  */
 
-   Note, we must handle nested structs.
+static int
+is_hfa1 (const ffi_type *ty, int candidate)
+{
+  ffi_type **elements = ty->elements;
+  int i;
 
-   If ty is not a STRUCT this function will return 0.  */
+  if (elements != NULL)
+    for (i = 0; elements[i]; ++i)
+      {
+        int t = elements[i]->type;
+        if (t == FFI_TYPE_STRUCT)
+          {
+            if (!is_hfa1 (elements[i], candidate))
+              return 0;
+          }
+        else if (t != candidate)
+          return 0;
+      }
 
-static unsigned
-element_count (ffi_type *ty)
-{
-  if (ty->type == FFI_TYPE_STRUCT && ty->elements)
-    {
-      unsigned n;
-      unsigned elems = 0;
-      for (n = 0; ty->elements[n]; n++)
-	{
-	  if (ty->elements[n]->type == FFI_TYPE_STRUCT
-	      && ty->elements[n]->elements)
-	    elems += element_count (ty->elements[n]);
-	  else
-	    elems++;
-	}
-      return elems;
-    }
-  return 0;
+  return 1;
 }
 
-/* Test for a homogeneous floating point aggregate.
+/* Determine if TY is a homogeneous floating point aggregate (HFA).
+   That is, a structure consisting of 1 to 4 members of all the same type,
+   where that type is a floating point scalar.
 
-   A homogeneous floating point aggregate is a homogeneous aggregate of
-   a half- single- or double- precision floating point type with one
-   to four elements.  Note that this includes nested structs of the
-   basic type.  */
+   Returns non-zero iff TY is an HFA.  The result is an encoded value where
+   bits 0-7 contain the type code, and bits 8-10 contain the element count.  */
 
 static int
-is_hfa (ffi_type *ty)
+is_hfa(const ffi_type *ty)
 {
-  if (ty->type == FFI_TYPE_STRUCT
-      && ty->elements[0]
-      && is_floating_type (get_homogeneous_type (ty)))
+  ffi_type **elements;
+  int candidate, i;
+  size_t size, ele_count;
+
+  /* Quickest tests first.  */
+  if (ty->type != FFI_TYPE_STRUCT)
+    return 0;
+
+  /* No HFA types are smaller than 4 bytes, or larger than 64 bytes.  */
+  size = ty->size;
+  if (size < 4 || size > 64)
+    return 0;
+
+  /* Find the type of the first non-structure member.  */
+  elements = ty->elements;
+  candidate = elements[0]->type;
+  if (candidate == FFI_TYPE_STRUCT)
     {
-      unsigned n = element_count (ty);
-      return n >= 1 && n <= 4;
+      for (i = 0; ; ++i)
+        {
+          candidate = is_hfa0 (elements[i]);
+          if (candidate >= 0)
+            break;
+        }
     }
-  return 0;
+
+  /* If the first member is not a floating point type, it's not an HFA.
+     Also quickly re-check the size of the structure.  */
+  switch (candidate)
+    {
+    case FFI_TYPE_FLOAT:
+      ele_count = size / sizeof(float);
+      if (size != ele_count * sizeof(float))
+        return 0;
+      break;
+    case FFI_TYPE_DOUBLE:
+      ele_count = size / sizeof(double);
+      if (size != ele_count * sizeof(double))
+        return 0;
+      break;
+    case FFI_TYPE_LONGDOUBLE:
+      ele_count = size / sizeof(long double);
+      if (size != ele_count * sizeof(long double))
+        return 0;
+      break;
+    default:
+      return 0;
+    }
+  if (ele_count > 4)
+    return 0;
+
+  /* Finally, make sure that all scalar elements are the same type.  */
+  for (i = 0; elements[i]; ++i)
+    {
+      if (elements[i]->type == FFI_TYPE_STRUCT)
+        {
+          if (!is_hfa1 (elements[i], candidate))
+            return 0;
+        }
+      else if (elements[i]->type != candidate)
+        return 0;
+    }
+
+  /* All tests succeeded.  Encode the result.  */
+  return (ele_count << 8) | candidate;
 }
 
 /* Test if an ffi_type is a candidate for passing in a register.
@@ -559,7 +603,10 @@ copy_hfa_to_reg_or_stack (void *memory,
 			  unsigned char *stack,
 			  struct arg_state *state)
 {
-  unsigned elems = element_count (ty);
+  int h = is_hfa (ty);
+  int type = h & 0xff;
+  unsigned elems = h >> 8;
+
   if (available_v (state) < elems)
     {
       /* There are insufficient V registers. Further V register allocations
@@ -573,7 +620,6 @@ copy_hfa_to_reg_or_stack (void *memory,
   else
     {
       int i;
-      unsigned short type = get_homogeneous_type (ty);
       for (i = 0; i < elems; i++)
 	{
 	  void *reg = allocate_to_v (context, state);
@@ -813,6 +859,7 @@ void
 ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
 {
   extended_cif ecif;
+  int h;
 
   ecif.cif = cif;
   ecif.avalue = avalue;
@@ -861,11 +908,12 @@ ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
 		}
 
               case FFI_TYPE_STRUCT:
-                if (is_hfa (cif->rtype))
+		h = is_hfa (cif->rtype);
+                if (h)
 		  {
 		    int j;
-		    unsigned short type = get_homogeneous_type (cif->rtype);
-		    unsigned elems = element_count (cif->rtype);
+		    int type = h & 0xff;
+		    int elems = h >> 8;
 		    for (j = 0; j < elems; j++)
 		      {
 			void *reg = get_basic_type_addr (type, &context, j);
@@ -967,7 +1015,7 @@ ffi_closure_SYSV_inner (ffi_closure *closure, struct call_context *context,
   ffi_cif *cif = closure->cif;
   void **avalue = (void**) alloca (cif->nargs * sizeof (void*));
   void *rvalue = NULL;
-  int i;
+  int i, h;
   struct arg_state state;
 
   arg_init (&state, ALIGN(cif->bytes, 16));
@@ -1002,9 +1050,10 @@ ffi_closure_SYSV_inner (ffi_closure *closure, struct call_context *context,
 #endif
 
 	case FFI_TYPE_STRUCT:
-	  if (is_hfa (ty))
+	  h = is_hfa (ty);
+	  if (h)
 	    {
-	      unsigned n = element_count (ty);
+	      unsigned n = h >> 8;
 	      if (available_v (&state) < n)
 		{
 		  state.nsrn = N_V_ARG_REG;
@@ -1013,7 +1062,7 @@ ffi_closure_SYSV_inner (ffi_closure *closure, struct call_context *context,
 		}
 	      else
 		{
-		  switch (get_homogeneous_type (ty))
+		  switch (h & 0xff)
 		    {
 		    case FFI_TYPE_FLOAT:
 		      {
@@ -1027,9 +1076,9 @@ ffi_closure_SYSV_inner (ffi_closure *closure, struct call_context *context,
 			   correctly. The fake can be tossed once the
 			   closure function has returned hence alloca()
 			   is sufficient. */
-			int j;
+			unsigned j;
 			UINT32 *p = avalue[i] = alloca (ty->size);
-			for (j = 0; j < element_count (ty); j++)
+			for (j = 0; j < n; j++)
 			  memcpy (&p[j],
 				  allocate_to_s (context, &state),
 				  sizeof (*p));
@@ -1048,9 +1097,9 @@ ffi_closure_SYSV_inner (ffi_closure *closure, struct call_context *context,
 			   correctly. The fake can be tossed once the
 			   closure function has returned hence alloca()
 			   is sufficient. */
-			int j;
+			unsigned j;
 			UINT64 *p = avalue[i] = alloca (ty->size);
-			for (j = 0; j < element_count (ty); j++)
+			for (j = 0; j < n; j++)
 			  memcpy (&p[j],
 				  allocate_to_d (context, &state),
 				  sizeof (*p));
@@ -1143,11 +1192,12 @@ ffi_closure_SYSV_inner (ffi_closure *closure, struct call_context *context,
             break;
 	  }
         case FFI_TYPE_STRUCT:
-          if (is_hfa (cif->rtype))
+	  h = is_hfa (cif->rtype);
+          if (h)
 	    {
 	      int j;
-	      unsigned short type = get_homogeneous_type (cif->rtype);
-	      unsigned elems = element_count (cif->rtype);
+	      int type = h & 0xff;
+	      int elems = h >> 8;
 	      for (j = 0; j < elems; j++)
 		{
 		  void *reg = get_basic_type_addr (type, context, j);
-- 
1.9.3

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH 12/16] aarch64: Unify scalar fp and hfa handling
  2014-10-28 18:53 [PATCH 00/16] Go closures for aarch64 Richard Henderson
                   ` (4 preceding siblings ...)
  2014-10-28 18:54 ` [PATCH 07/16] aarch64: Treat void return as not passed in registers Richard Henderson
@ 2014-10-28 18:54 ` Richard Henderson
  2014-10-28 18:54 ` [PATCH 16/16] aarch64: Add support for Go closures Richard Henderson
                   ` (10 subsequent siblings)
  16 siblings, 0 replies; 20+ messages in thread
From: Richard Henderson @ 2014-10-28 18:54 UTC (permalink / raw)
  To: libffi-discuss; +Cc: Richard Henderson

From: Richard Henderson <rth@redhat.com>

Since an HFA of a single element is exactly the same as a scalar,
this tidies things up a bit.
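
The unified encoding is arranged so that decoding needs only the low two
bits; a sketch, assuming the AARCH64_RET_S1..Q4 constants in this series'
internal.h are laid out as type * 4 + (4 - count):

  /* Sketch: the is_vfp_type() result for scalars and HFAs alike.
     candidate is FFI_TYPE_FLOAT/DOUBLE/LONGDOUBLE; count is 1..4.
     A scalar is simply the count == 1 case.  */
  static int
  vfp_code (int candidate, int count)
  {
    return candidate * 4 + (4 - count);
  }

  /* Decoding, as used throughout the patch: count = 4 - (h & 3).  */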
---
 src/aarch64/ffi.c | 225 ++++++++++++++++++++++--------------------------------
 1 file changed, 91 insertions(+), 134 deletions(-)

diff --git a/src/aarch64/ffi.c b/src/aarch64/ffi.c
index c5a429a..f69c350 100644
--- a/src/aarch64/ffi.c
+++ b/src/aarch64/ffi.c
@@ -71,16 +71,7 @@ ffi_clear_cache (void *start, void *end)
 #endif
 }
 
-/* Test for an FFI floating point representation.  */
-
-static unsigned
-is_floating_type (unsigned short type)
-{
-  return (type == FFI_TYPE_FLOAT || type == FFI_TYPE_DOUBLE
-	  || type == FFI_TYPE_LONGDOUBLE);
-}
-
-/* A subroutine of is_hfa.  Given a structure type, return the type code
+/* A subroutine of is_vfp_type.  Given a structure type, return the type code
    of the first non-structure element.  Recurse for structure elements.
    Return -1 if the structure is in fact empty, i.e. no nested elements.  */
 
@@ -106,7 +97,7 @@ is_hfa0 (const ffi_type *ty)
   return ret;
 }
 
-/* A subroutine of is_hfa.  Given a structure type, return true if all
+/* A subroutine of is_vfp_type.  Given a structure type, return true if all
    of the non-structure elements are the same as CANDIDATE.  */
 
 static int
@@ -131,23 +122,35 @@ is_hfa1 (const ffi_type *ty, int candidate)
   return 1;
 }
 
-/* Determine if TY is a homogeneous floating point aggregate (HFA).
+/* Determine if TY may be allocated to the FP registers.  This is either
+   an fp scalar type or a homogeneous floating point aggregate (HFA).
    That is, a structure consisting of 1 to 4 members of all the same type,
-   where that type is a floating point scalar.
+   where that type is an fp scalar.
 
-   Returns non-zero iff TY is an HFA.  The result is an encoded value where
-   bits 0-7 contain the type code, and bits 8-10 contain the element count.  */
+   Returns non-zero iff TY is an HFA.  The result is the AARCH64_RET_*
+   constant for the type.  */
 
 static int
-is_hfa(const ffi_type *ty)
+is_vfp_type (const ffi_type *ty)
 {
   ffi_type **elements;
   int candidate, i;
   size_t size, ele_count;
 
   /* Quickest tests first.  */
-  if (ty->type != FFI_TYPE_STRUCT)
-    return 0;
+  switch (ty->type)
+    {
+    default:
+      return 0;
+    case FFI_TYPE_FLOAT:
+      return AARCH64_RET_S1;
+    case FFI_TYPE_DOUBLE:
+      return AARCH64_RET_D1;
+    case FFI_TYPE_LONGDOUBLE:
+      return AARCH64_RET_Q1;
+    case FFI_TYPE_STRUCT:
+      break;
+    }
 
   /* No HFA types are smaller than 4 bytes, or larger than 64 bytes.  */
   size = ty->size;
@@ -205,17 +208,7 @@ is_hfa(const ffi_type *ty)
     }
 
   /* All tests succeeded.  Encode the result.  */
-  return (ele_count << 8) | candidate;
-}
-
-/* Test if an ffi_type argument or result is a candidate for a vector
-   register.  */
-
-static int
-is_v_register_candidate (ffi_type *ty)
-{
-  return is_floating_type (ty->type)
-	   || (ty->type == FFI_TYPE_STRUCT && is_hfa (ty));
+  return candidate * 4 + (4 - ele_count);
 }
 
 /* Representation of the procedure call argument marshalling
@@ -302,9 +295,7 @@ extend_integer_type (void *source, int type)
 static void
 extend_hfa_type (void *dest, void *src, int h)
 {
-  int n = (h >> 8);
-  int t = h & 0xff;
-  int f = (t - FFI_TYPE_FLOAT) * 4 + 4 - n;
+  int f = h - AARCH64_RET_S4;
   void *x0;
 
   asm volatile (
@@ -358,82 +349,68 @@ extend_hfa_type (void *dest, void *src, int h)
 static void *
 compress_hfa_type (void *dest, void *reg, int h)
 {
-  int n = h >> 8;
-  switch (h & 0xff)
+  switch (h)
     {
-    case FFI_TYPE_FLOAT:
-      switch (n)
+    case AARCH64_RET_S1:
+      if (dest == reg)
 	{
-	default:
-	  if (dest == reg)
-	    {
 #ifdef __AARCH64EB__
-	      dest += 12;
+	  dest += 12;
 #endif
-	    }
-	  else
-	    *(float *)dest = *(float *)reg;
-	  break;
-	case 2:
-	  asm("ldp q16, q17, [%1]\n\t"
-	      "st2 { v16.s, v17.s }[0], [%0]"
-	      : : "r"(dest), "r"(reg) : "memory", "v16", "v17");
-	  break;
-	case 3:
-	  asm("ldp q16, q17, [%1]\n\t"
-	      "ldr q18, [%1, #32]\n\t"
-	      "st3 { v16.s, v17.s, v18.s }[0], [%0]"
-	      : : "r"(dest), "r"(reg) : "memory", "v16", "v17", "v18");
-	  break;
-	case 4:
-	  asm("ldp q16, q17, [%1]\n\t"
-	      "ldp q18, q19, [%1, #32]\n\t"
-	      "st4 { v16.s, v17.s, v18.s, v19.s }[0], [%0]"
-	      : : "r"(dest), "r"(reg) : "memory", "v16", "v17", "v18", "v19");
-	  break;
 	}
+      else
+	*(float *)dest = *(float *)reg;
+      break;
+    case AARCH64_RET_S2:
+      asm ("ldp q16, q17, [%1]\n\t"
+	   "st2 { v16.s, v17.s }[0], [%0]"
+	   : : "r"(dest), "r"(reg) : "memory", "v16", "v17");
+      break;
+    case AARCH64_RET_S3:
+      asm ("ldp q16, q17, [%1]\n\t"
+	   "ldr q18, [%1, #32]\n\t"
+	   "st3 { v16.s, v17.s, v18.s }[0], [%0]"
+	   : : "r"(dest), "r"(reg) : "memory", "v16", "v17", "v18");
+      break;
+    case AARCH64_RET_S4:
+      asm ("ldp q16, q17, [%1]\n\t"
+	   "ldp q18, q19, [%1, #32]\n\t"
+	   "st4 { v16.s, v17.s, v18.s, v19.s }[0], [%0]"
+	   : : "r"(dest), "r"(reg) : "memory", "v16", "v17", "v18", "v19");
       break;
 
-    case FFI_TYPE_DOUBLE:
-      switch (n)
+    case AARCH64_RET_D1:
+      if (dest == reg)
 	{
-	default:
-	  if (dest == reg)
-	    {
 #ifdef __AARCH64EB__
-	      dest += 8;
+	  dest += 8;
 #endif
-	    }
-	  else
-	    *(double *)dest = *(double *)reg;
-	  break;
-	case 2:
-	  asm("ldp q16, q17, [%1]\n\t"
-	      "st2 { v16.d, v17.d }[0], [%0]"
-	      : : "r"(dest), "r"(reg) : "memory", "v16", "v17");
-	  break;
-	case 3:
-	  asm("ldp q16, q17, [%1]\n\t"
-	      "ldr q18, [%1, #32]\n\t"
-	      "st3 { v16.d, v17.d, v18.d }[0], [%0]"
-	      : : "r"(dest), "r"(reg) : "memory", "v16", "v17", "v18");
-	  break;
-	case 4:
-	  asm("ldp q16, q17, [%1]\n\t"
-	      "ldp q18, q19, [%1, #32]\n\t"
-	      "st4 { v16.d, v17.d, v18.d, v19.d }[0], [%0]"
-	      : : "r"(dest), "r"(reg) : "memory", "v16", "v17", "v18", "v19");
-	  break;
 	}
+      else
+	*(double *)dest = *(double *)reg;
       break;
-
-    case FFI_TYPE_LONGDOUBLE:
-      if (dest != reg)
-	return memcpy (dest, reg, 16 * n);
+    case AARCH64_RET_D2:
+      asm ("ldp q16, q17, [%1]\n\t"
+	   "st2 { v16.d, v17.d }[0], [%0]"
+	   : : "r"(dest), "r"(reg) : "memory", "v16", "v17");
+      break;
+    case AARCH64_RET_D3:
+      asm ("ldp q16, q17, [%1]\n\t"
+	   "ldr q18, [%1, #32]\n\t"
+	   "st3 { v16.d, v17.d, v18.d }[0], [%0]"
+	   : : "r"(dest), "r"(reg) : "memory", "v16", "v17", "v18");
+      break;
+    case AARCH64_RET_D4:
+      asm ("ldp q16, q17, [%1]\n\t"
+	   "ldp q18, q19, [%1, #32]\n\t"
+	   "st4 { v16.d, v17.d, v18.d, v19.d }[0], [%0]"
+	   : : "r"(dest), "r"(reg) : "memory", "v16", "v17", "v18", "v19");
       break;
 
     default:
-      FFI_ASSERT (0);
+      if (dest != reg)
+	return memcpy (dest, reg, 16 * (4 - (h & 3)));
+      break;
     }
   return dest;
 }
@@ -494,34 +471,25 @@ ffi_prep_cif_machdep (ffi_cif *cif)
       break;
 
     case FFI_TYPE_FLOAT:
-      flags = AARCH64_RET_S1;
-      break;
     case FFI_TYPE_DOUBLE:
-      flags = AARCH64_RET_D1;
-      break;
     case FFI_TYPE_LONGDOUBLE:
-      flags = AARCH64_RET_Q1;
-      break;
-
     case FFI_TYPE_STRUCT:
-      {
-	int h = is_hfa (rtype);
-	size_t s = rtype->size;
-
-	if (h)
-	  flags = (h & 0xff) * 4 + 4 - (h >> 8);
-	else if (s > 16)
-	  {
-	    flags = AARCH64_RET_VOID | AARCH64_RET_IN_MEM;
-	    bytes += 8;
-	  }
-	else if (s == 16)
-	  flags = AARCH64_RET_INT128;
-	else if (s == 8)
-	  flags = AARCH64_RET_INT64;
-	else
-	  flags = AARCH64_RET_INT128 | AARCH64_RET_NEED_COPY;
-      }
+      flags = is_vfp_type (rtype);
+      if (flags == 0)
+	{
+	  size_t s = rtype->size;
+	  if (s > 16)
+	    {
+	      flags = AARCH64_RET_VOID | AARCH64_RET_IN_MEM;
+	      bytes += 8;
+	    }
+	  else if (s == 16)
+	    flags = AARCH64_RET_INT128;
+	  else if (s == 8)
+	    flags = AARCH64_RET_INT64;
+	  else
+	    flags = AARCH64_RET_INT128 | AARCH64_RET_NEED_COPY;
+	}
       break;
 
     default:
@@ -530,7 +498,7 @@ ffi_prep_cif_machdep (ffi_cif *cif)
 
   aarch64_flags = 0;
   for (i = 0, n = cif->nargs; i < n; i++)
-    if (is_v_register_candidate (cif->arg_types[i]))
+    if (is_vfp_type (cif->arg_types[i]))
       {
 	aarch64_flags = AARCH64_FLAG_ARG_V;
 	flags |= AARCH64_FLAG_ARG_V;
@@ -652,20 +620,14 @@ ffi_call (ffi_cif *cif, void (*fn)(void), void *orig_rvalue, void **avalue)
 	case FFI_TYPE_FLOAT:
 	case FFI_TYPE_DOUBLE:
 	case FFI_TYPE_LONGDOUBLE:
-	  /* Scalar float is a degenerate case of HFA.  */
-	  h = t + 0x100;
-	  goto do_hfa;
-
 	case FFI_TYPE_STRUCT:
 	  {
 	    void *dest;
-	    int elems;
 
-	    h = is_hfa (ty);
+	    h = is_vfp_type (ty);
 	    if (h)
 	      {
-	    do_hfa:
-		elems = h >> 8;
+		int elems = 4 - (h & 3);
 	        if (state.nsrn + elems <= N_V_ARG_REG)
 		  {
 		    dest = &context->v[state.nsrn];
@@ -828,16 +790,11 @@ ffi_closure_SYSV_inner (ffi_cif *cif,
 	case FFI_TYPE_FLOAT:
 	case FFI_TYPE_DOUBLE:
 	case FFI_TYPE_LONGDOUBLE:
-	  /* Scalar float is a degenerate case of HFA.  */
-	  h = t + 0x100;
-	  goto do_hfa;
-
 	case FFI_TYPE_STRUCT:
-	  h = is_hfa (ty);
+	  h = is_vfp_type (ty);
 	  if (h)
 	    {
-	    do_hfa:
-	      n = h >> 8;
+	      n = 4 - (h & 3);
 	      if (state.nsrn + n <= N_V_ARG_REG)
 		{
 		  void *reg = &context->v[state.nsrn];
-- 
1.9.3

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH 15/16] aarch64: Move x8 out of call_context
  2014-10-28 18:53 [PATCH 00/16] Go closures for aarch64 Richard Henderson
                   ` (11 preceding siblings ...)
  2014-10-28 18:54 ` [PATCH 09/16] aarch64: Merge prep_args with ffi_call Richard Henderson
@ 2014-10-28 18:54 ` Richard Henderson
  2014-10-28 18:54 ` [PATCH 08/16] aarch64: Tidy up abi manipulation Richard Henderson
                   ` (3 subsequent siblings)
  16 siblings, 0 replies; 20+ messages in thread
From: Richard Henderson @ 2014-10-28 18:54 UTC (permalink / raw)
  To: libffi-discuss; +Cc: Richard Henderson

From: Richard Henderson <rth@redhat.com>

This reduces the stack size.  The x8 slot was only used by the closure
path, and there are argument registers available to carry it instead.
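
With the x8 slot gone, the struct-return pointer travels as an ordinary
seventh argument; consolidated from the diff below, the inner function
now reads:

  int ffi_closure_SYSV_inner (ffi_cif *cif,
			      void (*fun)(ffi_cif*,void*,void**,void*),
			      void *user_data,
			      struct call_context *context,
			      void *stack, void *rvalue,
			      void *struct_rvalue);	/* was context->x8 */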
---
 src/aarch64/ffi.c      | 5 ++---
 src/aarch64/internal.h | 2 +-
 src/aarch64/sysv.S     | 2 +-
 3 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/aarch64/ffi.c b/src/aarch64/ffi.c
index 4f85140..f546ab2 100644
--- a/src/aarch64/ffi.c
+++ b/src/aarch64/ffi.c
@@ -52,7 +52,6 @@ struct call_context
 {
   struct _v v[N_V_ARG_REG];
   UINT64 x[N_X_ARG_REG];
-  UINT64 x8;
 };
 
 #if defined (__clang__) && defined (__APPLE__)
@@ -766,7 +765,7 @@ ffi_closure_SYSV_inner (ffi_cif *cif,
 			void (*fun)(ffi_cif*,void*,void**,void*),
 			void *user_data,
 			struct call_context *context,
-			void *stack, void *rvalue)
+			void *stack, void *rvalue, void *struct_rvalue)
 {
   void **avalue = (void**) alloca (cif->nargs * sizeof (void*));
   int i, h, nargs, flags;
@@ -861,7 +860,7 @@ ffi_closure_SYSV_inner (ffi_cif *cif,
 
   flags = cif->flags;
   if (flags & AARCH64_RET_IN_MEM)
-    rvalue = (void *)(uintptr_t)context->x8;
+    rvalue = struct_rvalue;
 
   fun (cif, rvalue, avalue, user_data);
 
diff --git a/src/aarch64/internal.h b/src/aarch64/internal.h
index a3070db..9c3e077 100644
--- a/src/aarch64/internal.h
+++ b/src/aarch64/internal.h
@@ -64,4 +64,4 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
 
 #define N_X_ARG_REG		8
 #define N_V_ARG_REG		8
-#define CALL_CONTEXT_SIZE	(N_V_ARG_REG * 16 + N_X_ARG_REG * 8 + 16)
+#define CALL_CONTEXT_SIZE	(N_V_ARG_REG * 16 + N_X_ARG_REG * 8)
diff --git a/src/aarch64/sysv.S b/src/aarch64/sysv.S
index abd848d..7f00a3f 100644
--- a/src/aarch64/sysv.S
+++ b/src/aarch64/sysv.S
@@ -241,7 +241,6 @@ CNAME(ffi_closure_SYSV):
 	stp     x2, x3, [sp, #16 + 16*N_V_ARG_REG + 16]
 	stp     x4, x5, [sp, #16 + 16*N_V_ARG_REG + 32]
 	stp     x6, x7, [sp, #16 + 16*N_V_ARG_REG + 48]
-	str     x8,     [sp, #16 + 16*N_V_ARG_REG + 64]
 
 	/* Load ffi_closure_inner arguments.  */
 	ldp	x0, x1, [x17, #FFI_TRAMPOLINE_SIZE]	/* load cif, fn */
@@ -249,6 +248,7 @@ CNAME(ffi_closure_SYSV):
 	add	x3, sp, #16				/* load context */
 	add	x4, sp, #ffi_closure_SYSV_FS		/* load stack */
 	add	x5, sp, #16+CALL_CONTEXT_SIZE		/* load rvalue */
+	mov	x6, x8					/* load struct_rval */
 	bl      CNAME(ffi_closure_SYSV_inner)
 
 	/* Load the return value as directed.  */
-- 
1.9.3

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH 13/16] aarch64: Remove aarch64_flags
  2014-10-28 18:53 [PATCH 00/16] Go closures for aarch64 Richard Henderson
                   ` (7 preceding siblings ...)
  2014-10-28 18:54 ` [PATCH 02/16] aarch64: Improve is_hfa Richard Henderson
@ 2014-10-28 18:54 ` Richard Henderson
  2014-10-28 18:54 ` [PATCH 05/16] aarch64: Reduce the size of register_context Richard Henderson
                   ` (7 subsequent siblings)
  16 siblings, 0 replies; 20+ messages in thread
From: Richard Henderson @ 2014-10-28 18:54 UTC (permalink / raw)
  To: libffi-discuss; +Cc: Richard Henderson

From: Richard Henderson <rth@redhat.com>

This field was useless from the start, since the normal flags
field is available for backend use.
---
 src/aarch64/ffi.c       | 5 +----
 src/aarch64/ffitarget.h | 4 +---
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/src/aarch64/ffi.c b/src/aarch64/ffi.c
index f69c350..b3e0b16 100644
--- a/src/aarch64/ffi.c
+++ b/src/aarch64/ffi.c
@@ -436,7 +436,7 @@ ffi_prep_cif_machdep (ffi_cif *cif)
 {
   ffi_type *rtype = cif->rtype;
   size_t bytes = cif->bytes;
-  int flags, aarch64_flags, i, n;
+  int flags, i, n;
 
   switch (rtype->type)
     {
@@ -496,11 +496,9 @@ ffi_prep_cif_machdep (ffi_cif *cif)
       abort();
     }
 
-  aarch64_flags = 0;
   for (i = 0, n = cif->nargs; i < n; i++)
     if (is_vfp_type (cif->arg_types[i]))
       {
-	aarch64_flags = AARCH64_FLAG_ARG_V;
 	flags |= AARCH64_FLAG_ARG_V;
 	break;
       }
@@ -508,7 +506,6 @@ ffi_prep_cif_machdep (ffi_cif *cif)
   /* Round the stack up to a multiple of the stack alignment requirement. */
   cif->bytes = ALIGN(bytes, 16);
   cif->flags = flags;
-  cif->aarch64_flags = aarch64_flags;
 #if defined (__APPLE__)
   cif->aarch64_nfixedargs = 0;
 #endif
diff --git a/src/aarch64/ffitarget.h b/src/aarch64/ffitarget.h
index b488bbe..6d6d3e6 100644
--- a/src/aarch64/ffitarget.h
+++ b/src/aarch64/ffitarget.h
@@ -49,9 +49,7 @@ typedef enum ffi_abi
 
 #if defined (__APPLE__)
 #define FFI_TARGET_SPECIFIC_VARIADIC
-#define FFI_EXTRA_CIF_FIELDS unsigned aarch64_flags; unsigned aarch64_nfixedargs
-#else
-#define FFI_EXTRA_CIF_FIELDS unsigned aarch64_flags
+#define FFI_EXTRA_CIF_FIELDS unsigned aarch64_nfixedargs
 #endif
 
 #endif
-- 
1.9.3

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH 04/16] aarch64: Simplify AARCH64_STACK_ALIGN
  2014-10-28 18:53 [PATCH 00/16] Go closures for aarch64 Richard Henderson
                   ` (9 preceding siblings ...)
  2014-10-28 18:54 ` [PATCH 05/16] aarch64: Reduce the size of register_context Richard Henderson
@ 2014-10-28 18:54 ` Richard Henderson
  2014-10-28 18:54 ` [PATCH 09/16] aarch64: Merge prep_args with ffi_call Richard Henderson
                   ` (5 subsequent siblings)
  16 siblings, 0 replies; 20+ messages in thread
From: Richard Henderson @ 2014-10-28 18:54 UTC (permalink / raw)
  To: libffi-discuss; +Cc: Richard Henderson

From: Richard Henderson <rth@redhat.com>

The iOS ABI doesn't require padding between arguments, but
that's not what AARCH64_STACK_ALIGN meant.  The hardware will
in fact trap if the SP register is not 16-byte aligned.
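
For reference, the replacement rounds up to a power-of-two boundary;
a minimal sketch, assumed equivalent to libffi's ALIGN macro:

  #include <stddef.h>

  /* Sketch: round v up to a multiple of a, where a is a power of two.
     The hardware faults when SP is misaligned, hence ALIGN (bytes, 16).  */
  #define ALIGN(v, a)  (((size_t) (v) + (a) - 1) & ~((size_t) (a) - 1))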
---
 src/aarch64/ffi.c | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/src/aarch64/ffi.c b/src/aarch64/ffi.c
index f065be5..a6fcc11 100644
--- a/src/aarch64/ffi.c
+++ b/src/aarch64/ffi.c
@@ -35,13 +35,6 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
 # define FFI_TYPE_LONGDOUBLE 4
 #endif
 
-/* Stack alignment requirement in bytes */
-#if defined (__APPLE__)
-#define AARCH64_STACK_ALIGN 1
-#else
-#define AARCH64_STACK_ALIGN 16
-#endif
-
 #define N_X_ARG_REG 8
 #define N_V_ARG_REG 8
 
@@ -799,8 +792,7 @@ ffi_status
 ffi_prep_cif_machdep (ffi_cif *cif)
 {
   /* Round the stack up to a multiple of the stack alignment requirement. */
-  cif->bytes =
-    (cif->bytes + (AARCH64_STACK_ALIGN - 1)) & ~ (AARCH64_STACK_ALIGN - 1);
+  cif->bytes = ALIGN(cif->bytes, 16);
 
   /* Initialize our flags. We are interested if this CIF will touch a
      vector register, if so we will enable context save and load to
-- 
1.9.3

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH 10/16] aarch64: Move return value handling into ffi_call_SYSV
  2014-10-28 18:53 [PATCH 00/16] Go closures for aarch64 Richard Henderson
                   ` (2 preceding siblings ...)
  2014-10-28 18:54 ` [PATCH 11/16] aarch64: Move return value handling into ffi_closure_SYSV Richard Henderson
@ 2014-10-28 18:54 ` Richard Henderson
  2014-10-28 18:54 ` [PATCH 07/16] aarch64: Treat void return as not passed in registers Richard Henderson
                   ` (12 subsequent siblings)
  16 siblings, 0 replies; 20+ messages in thread
From: Richard Henderson @ 2014-10-28 18:54 UTC (permalink / raw)
  To: libffi-discuss; +Cc: Richard Henderson

From: Richard Henderson <rth@redhat.com>

This lets us pass return data directly to the caller of ffi_call
in most cases, rather than staging it in temporary storage first.
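
A hedged sketch of the buffer-selection rule ffi_call now implements
(pick_rvalue and scratch are illustrative names, not part of the patch):

  /* Sketch only: choose where the callee's return value lands.
     need_copy corresponds to AARCH64_RET_NEED_COPY, which marks small
     struct returns that come back as a full register pair.  */
  static void *
  pick_rvalue (int need_copy, void *orig_rvalue, void *scratch)
  {
    if (need_copy)
      return scratch;		/* memcpy rtype->size bytes back afterward */
    if (orig_rvalue != NULL)
      return orig_rvalue;	/* common case: write straight through */
    return scratch;		/* NULL rvalue: result is discarded */
  }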
---
 src/aarch64/ffi.c      | 202 ++++++++++++++++++++++++++++---------------------
 src/aarch64/internal.h |  43 ++++++++++-
 src/aarch64/sysv.S     | 127 ++++++++++++++++++++++++-------
 3 files changed, 258 insertions(+), 114 deletions(-)

diff --git a/src/aarch64/ffi.c b/src/aarch64/ffi.c
index a067303..ffa1363 100644
--- a/src/aarch64/ffi.c
+++ b/src/aarch64/ffi.c
@@ -523,30 +523,90 @@ allocate_int_to_reg_or_stack (struct call_context *context,
 ffi_status
 ffi_prep_cif_machdep (ffi_cif *cif)
 {
-  /* Round the stack up to a multiple of the stack alignment requirement. */
-  cif->bytes = ALIGN(cif->bytes, 16);
-
-  /* Initialize our flags. We are interested if this CIF will touch a
-     vector register, if so we will enable context save and load to
-     those registers, otherwise not. This is intended to be friendly
-     to lazy float context switching in the kernel.  */
-  cif->aarch64_flags = 0;
+  ffi_type *rtype = cif->rtype;
+  size_t bytes = cif->bytes;
+  int flags, aarch64_flags, i, n;
 
-  if (is_v_register_candidate (cif->rtype))
+  switch (rtype->type)
     {
-      cif->aarch64_flags |= AARCH64_FLAG_ARG_V;
-    }
-  else
-    {
-      int i;
-      for (i = 0; i < cif->nargs; i++)
-        if (is_v_register_candidate (cif->arg_types[i]))
-          {
-            cif->aarch64_flags |= AARCH64_FLAG_ARG_V;
-            break;
-          }
+    case FFI_TYPE_VOID:
+      flags = AARCH64_RET_VOID;
+      break;
+    case FFI_TYPE_UINT8:
+      flags = AARCH64_RET_UINT8;
+      break;
+    case FFI_TYPE_UINT16:
+      flags = AARCH64_RET_UINT16;
+      break;
+    case FFI_TYPE_UINT32:
+      flags = AARCH64_RET_UINT32;
+      break;
+    case FFI_TYPE_SINT8:
+      flags = AARCH64_RET_SINT8;
+      break;
+    case FFI_TYPE_SINT16:
+      flags = AARCH64_RET_SINT16;
+      break;
+    case FFI_TYPE_INT:
+    case FFI_TYPE_SINT32:
+      flags = AARCH64_RET_SINT32;
+      break;
+    case FFI_TYPE_SINT64:
+    case FFI_TYPE_UINT64:
+      flags = AARCH64_RET_INT64;
+      break;
+    case FFI_TYPE_POINTER:
+      flags = (sizeof(void *) == 4 ? AARCH64_RET_UINT32 : AARCH64_RET_INT64);
+      break;
+
+    case FFI_TYPE_FLOAT:
+      flags = AARCH64_RET_S1;
+      break;
+    case FFI_TYPE_DOUBLE:
+      flags = AARCH64_RET_D1;
+      break;
+    case FFI_TYPE_LONGDOUBLE:
+      flags = AARCH64_RET_Q1;
+      break;
+
+    case FFI_TYPE_STRUCT:
+      {
+	int h = is_hfa (rtype);
+	size_t s = rtype->size;
+
+	if (h)
+	  flags = (h & 0xff) * 4 + 4 - (h >> 8);
+	else if (s > 16)
+	  {
+	    flags = AARCH64_RET_VOID | AARCH64_RET_IN_MEM;
+	    bytes += 8;
+	  }
+	else if (s == 16)
+	  flags = AARCH64_RET_INT128;
+	else if (s == 8)
+	  flags = AARCH64_RET_INT64;
+	else
+	  flags = AARCH64_RET_INT128 | AARCH64_RET_NEED_COPY;
+      }
+      break;
+
+    default:
+      abort();
     }
 
+  aarch64_flags = 0;
+  for (i = 0, n = cif->nargs; i < n; i++)
+    if (is_v_register_candidate (cif->arg_types[i]))
+      {
+	aarch64_flags = AARCH64_FLAG_ARG_V;
+	flags |= AARCH64_FLAG_ARG_V;
+	break;
+      }
+
+  /* Round the stack up to a multiple of the stack alignment requirement. */
+  cif->bytes = ALIGN(bytes, 16);
+  cif->flags = flags;
+  cif->aarch64_flags = aarch64_flags;
 #if defined (__APPLE__)
   cif->aarch64_nfixedargs = 0;
 #endif
@@ -555,51 +615,65 @@ ffi_prep_cif_machdep (ffi_cif *cif)
 }
 
 #if defined (__APPLE__)
-
 /* Perform Apple-specific cif processing for variadic calls */
 ffi_status ffi_prep_cif_machdep_var(ffi_cif *cif,
 				    unsigned int nfixedargs,
 				    unsigned int ntotalargs)
 {
-  ffi_status status;
-
-  status = ffi_prep_cif_machdep (cif);
-
+  ffi_status status = ffi_prep_cif_machdep (cif);
   cif->aarch64_nfixedargs = nfixedargs;
-
   return status;
 }
+#endif /* __APPLE__ */
 
-#endif
-
-extern void ffi_call_SYSV (void *stack, void *frame,
-			   void (*fn)(void), int flags) FFI_HIDDEN;
+extern void ffi_call_SYSV (struct call_context *context, void *frame,
+			   void (*fn)(void), void *rvalue, int flags)
+	FFI_HIDDEN;
 
 /* Call a function with the provided arguments and capture the return
    value.  */
 void
-ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
+ffi_call (ffi_cif *cif, void (*fn)(void), void *orig_rvalue, void **avalue)
 {
   struct call_context *context;
-  void *stack, *frame;
+  void *stack, *frame, *rvalue;
   struct arg_state state;
-  size_t stack_bytes;
-  int i, nargs = cif->nargs;
-  int h, t;
+  size_t stack_bytes, rtype_size, rsize;
+  int i, nargs, flags;
   ffi_type *rtype;
 
-  /* Allocate consecutive stack for everything we'll need.  */
+  flags = cif->flags;
+  rtype = cif->rtype;
+  rtype_size = rtype->size;
   stack_bytes = cif->bytes;
-  stack = alloca (stack_bytes + 32 + sizeof(struct call_context));
+
+  /* If the target function returns a structure via hidden pointer,
+     then we cannot allow a null rvalue.  Otherwise, mash a null
+     rvalue to void return type.  */
+  rsize = 0;
+  if (flags & AARCH64_RET_IN_MEM)
+    {
+      if (orig_rvalue == NULL)
+	rsize = rtype_size;
+    }
+  else if (orig_rvalue == NULL)
+    flags &= AARCH64_FLAG_ARG_V;
+  else if (flags & AARCH64_RET_NEED_COPY)
+    rsize = 16;
+
+  /* Allocate consecutive stack for everything we'll need.  */
+  context = alloca (sizeof(struct call_context) + stack_bytes + 32 + rsize);
+  stack = context + 1;
   frame = stack + stack_bytes;
-  context = frame + 32;
+  rvalue = (rsize ? frame + 32 : orig_rvalue);
 
   arg_init (&state);
-  for (i = 0; i < nargs; i++)
+  for (i = 0, nargs = cif->nargs; i < nargs; i++)
     {
       ffi_type *ty = cif->arg_types[i];
       size_t s = ty->size;
       void *a = avalue[i];
+      int h, t;
 
       t = ty->type;
       switch (t)
@@ -717,54 +791,10 @@ ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
 #endif
     }
 
-  rtype = cif->rtype;
-  if (is_register_candidate (rtype))
-    {
-      ffi_call_SYSV (stack, frame, fn, cif->aarch64_flags);
+  ffi_call_SYSV (context, frame, fn, rvalue, flags);
 
-      t = rtype->type;
-      switch (t)
-	{
-	case FFI_TYPE_INT:
-	case FFI_TYPE_UINT8:
-	case FFI_TYPE_SINT8:
-	case FFI_TYPE_UINT16:
-	case FFI_TYPE_SINT16:
-	case FFI_TYPE_UINT32:
-	case FFI_TYPE_SINT32:
-	case FFI_TYPE_POINTER:
-	case FFI_TYPE_UINT64:
-	case FFI_TYPE_SINT64:
-	  *(ffi_arg *)rvalue = extend_integer_type (&context->x[0], t);
-	  break;
-
-	case FFI_TYPE_FLOAT:
-	case FFI_TYPE_DOUBLE:
-	case FFI_TYPE_LONGDOUBLE:
-	  compress_hfa_type (rvalue, &context->v[0], 0x100 + t);
-	  break;
-
-	case FFI_TYPE_STRUCT:
-	  h = is_hfa (cif->rtype);
-	  if (h)
-	    compress_hfa_type (rvalue, &context->v[0], h);
-	  else
-	    {
-	      FFI_ASSERT (rtype->size <= 16);
-	      memcpy (rvalue, &context->x[0], rtype->size);
-	    }
-	  break;
-
-	default:
-	  FFI_ASSERT (0);
-	  break;
-	}
-    }
-  else
-    {
-      context->x8 = (uintptr_t)rvalue;
-      ffi_call_SYSV (stack, frame, fn, cif->aarch64_flags);
-    }
+  if (flags & AARCH64_RET_NEED_COPY)
+    memcpy (orig_rvalue, rvalue, rtype_size);
 }
 
 static unsigned char trampoline [] =
diff --git a/src/aarch64/internal.h b/src/aarch64/internal.h
index b6b6104..a3070db 100644
--- a/src/aarch64/internal.h
+++ b/src/aarch64/internal.h
@@ -18,7 +18,48 @@ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
 
-#define AARCH64_FLAG_ARG_V_BIT	0
+#define AARCH64_RET_VOID	0
+#define AARCH64_RET_INT64	1
+#define AARCH64_RET_INT128	2
+
+#define AARCH64_RET_UNUSED3	3
+#define AARCH64_RET_UNUSED4	4
+#define AARCH64_RET_UNUSED5	5
+#define AARCH64_RET_UNUSED6	6
+#define AARCH64_RET_UNUSED7	7
+
+/* Note that FFI_TYPE_FLOAT == 2, _DOUBLE == 3, _LONGDOUBLE == 4,
+   so _S4 through _Q1 are laid out as (TYPE * 4) + (4 - COUNT).  */
+#define AARCH64_RET_S4		8
+#define AARCH64_RET_S3		9
+#define AARCH64_RET_S2		10
+#define AARCH64_RET_S1		11
+
+#define AARCH64_RET_D4		12
+#define AARCH64_RET_D3		13
+#define AARCH64_RET_D2		14
+#define AARCH64_RET_D1		15
+
+#define AARCH64_RET_Q4		16
+#define AARCH64_RET_Q3		17
+#define AARCH64_RET_Q2		18
+#define AARCH64_RET_Q1		19
+
+/* Note that each of the sub-64-bit integers gets two entries.  */
+#define AARCH64_RET_UINT8	20
+#define AARCH64_RET_UINT16	22
+#define AARCH64_RET_UINT32	24
+
+#define AARCH64_RET_SINT8	26
+#define AARCH64_RET_SINT16	28
+#define AARCH64_RET_SINT32	30
+
+#define AARCH64_RET_MASK	31
+
+#define AARCH64_RET_IN_MEM	(1 << 5)
+#define AARCH64_RET_NEED_COPY	(1 << 6)
+
+#define AARCH64_FLAG_ARG_V_BIT	7
 #define AARCH64_FLAG_ARG_V	(1 << AARCH64_FLAG_ARG_V_BIT)
 
 #define N_X_ARG_REG		8
diff --git a/src/aarch64/sysv.S b/src/aarch64/sysv.S
index a5f636a..ba15663 100644
--- a/src/aarch64/sysv.S
+++ b/src/aarch64/sysv.S
@@ -40,9 +40,9 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
 #endif
 
 	.text
-	.align 2
+	.align 4
 
-	.globl CNAME(ffi_call_SYSV)
+	.globl	CNAME(ffi_call_SYSV)
 #ifdef __ELF__
 	.type	CNAME(ffi_call_SYSV), #function
 	.hidden	CNAME(ffi_call_SYSV)
@@ -50,14 +50,15 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
 
 /* ffi_call_SYSV
    extern void ffi_call_SYSV (void *stack, void *frame,
-			      void (*fn)(void), int flags);
+			      void (*fn)(void), void *rvalue, int flags);
 
    Therefore on entry we have:
 
    x0 stack
    x1 frame
    x2 fn
-   x3 flags
+   x3 rvalue
+   x4 flags
 */
 
 	cfi_startproc
@@ -71,43 +72,111 @@ CNAME(ffi_call_SYSV):
 	cfi_rel_offset (x29, 0)
 	cfi_rel_offset (x30, 8)
 
-	str	w3, [x29, #16]		/* save flags */
 	mov	x9, x2			/* save fn */
+	mov	x8, x3			/* install structure return */
+	stp	x3, x4, [x29, #16]	/* save rvalue and flags */
 
 	/* Load the vector argument passing registers, if necessary.  */
-	tbz	w3, #AARCH64_FLAG_ARG_V_BIT, 1f
-	ldp     q0, q1, [x29, #32 + 0]
-	ldp     q2, q3, [x29, #32 + 32]
-	ldp     q4, q5, [x29, #32 + 64]
-	ldp     q6, q7, [x29, #32 + 96]
+	tbz	w4, #AARCH64_FLAG_ARG_V_BIT, 1f
+	ldp     q0, q1, [sp, #0]
+	ldp     q2, q3, [sp, #32]
+	ldp     q4, q5, [sp, #64]
+	ldp     q6, q7, [sp, #96]
 1:
 	/* Load the core argument passing registers, including
 	   the structure return pointer.  */
-	ldp     x0, x1, [x29, #32 + 16*N_V_ARG_REG + 0]
-	ldp     x2, x3, [x29, #32 + 16*N_V_ARG_REG + 16]
-	ldp     x4, x5, [x29, #32 + 16*N_V_ARG_REG + 32]
-	ldp     x6, x7, [x29, #32 + 16*N_V_ARG_REG + 48]
-	ldr     x8,     [x29, #32 + 16*N_V_ARG_REG + 64]
+	ldp     x0, x1, [sp, #16*N_V_ARG_REG + 0]
+	ldp     x2, x3, [sp, #16*N_V_ARG_REG + 16]
+	ldp     x4, x5, [sp, #16*N_V_ARG_REG + 32]
+	ldp     x6, x7, [sp, #16*N_V_ARG_REG + 48]
+
+	/* Deallocate the context, leaving the stacked arguments.  */
+	add	sp, sp, #CALL_CONTEXT_SIZE
 
 	blr     x9			/* call fn */
 
-	ldr	w3, [x29, #16]		/* reload flags */
+	ldp	x3, x4, [x29, #16]	/* reload rvalue and flags */
 
 	/* Partially deconstruct the stack frame.  */
 	mov     sp, x29
 	cfi_def_cfa_register (sp)
 	ldp     x29, x30, [x29]
 
-	/* Save the core return registers.  */
-	stp     x0, x1, [sp, #32 + 16*N_V_ARG_REG]
-
-	/* Save the vector return registers, if necessary.  */
-	tbz     w3, #AARCH64_FLAG_ARG_V_BIT, 1f
-	stp     q0, q1, [sp, #32 + 0]
-	stp     q2, q3, [sp, #32 + 32]
-1:
-	/* All done.  */
+	/* Save the return value as directed.  */
+	adr	x5, 0f
+	and	w4, w4, #AARCH64_RET_MASK
+	add	x5, x5, x4, lsl #3
+	br	x5
+
+	/* Note that each table entry is 2 insns, and thus 8 bytes.
+	   For integer data, note that we're storing into ffi_arg
+	   and therefore we want to extend to 64 bits; these types
+	   have two consecutive entries allocated for them.  */
+	.align	4
+0:	ret				/* VOID */
+	nop
+1:	str	x0, [x3]		/* INT64 */
+	ret
+2:	stp	x0, x1, [x3]		/* INT128 */
+	ret
+3:	brk	#1000			/* UNUSED */
+	ret
+4:	brk	#1000			/* UNUSED */
+	ret
+5:	brk	#1000			/* UNUSED */
+	ret
+6:	brk	#1000			/* UNUSED */
+	ret
+7:	brk	#1000			/* UNUSED */
+	ret
+8:	st4	{ v0.s-v3.s }[0], [x3]	/* S4 */
+	ret
+9:	st3	{ v0.s-v2.s }[0], [x3]	/* S3 */
 	ret
+10:	stp	s0, s1, [x3]		/* S2 */
+	ret
+11:	str	s0, [x3]		/* S1 */
+	ret
+12:	st4	{ v0.d-v3.d }[0], [x3]	/* D4 */
+	ret
+13:	st3	{ v0.d-v2.d }[0], [x3]	/* D3 */
+	ret
+14:	stp	d0, d1, [x3]		/* D2 */
+	ret
+15:	str	d0, [x3]		/* D1 */
+	ret
+16:	str	q3, [x3, #48]		/* Q4 */
+	nop
+17:	str	q2, [x3, #32]		/* Q3 */
+	nop
+18:	stp	q0, q1, [x3]		/* Q2 */
+	ret
+19:	str	q0, [x3]		/* Q1 */
+	ret
+20:	uxtb	w0, w0			/* UINT8 */
+	str	x0, [x3]
+21:	ret				/* reserved */
+	nop
+22:	uxth	w0, w0			/* UINT16 */
+	str	x0, [x3]
+23:	ret				/* reserved */
+	nop
+24:	mov	w0, w0			/* UINT32 */
+	str	x0, [x3]
+25:	ret				/* reserved */
+	nop
+26:	sxtb	x0, w0			/* SINT8 */
+	str	x0, [x3]
+27:	ret				/* reserved */
+	nop
+28:	sxth	x0, w0			/* SINT16 */
+	str	x0, [x3]
+29:	ret				/* reserved */
+	nop
+30:	sxtw	x0, w0			/* SINT32 */
+	str	x0, [x3]
+31:	ret				/* reserved */
+	nop
 
 	cfi_endproc
 #ifdef __ELF__
@@ -154,9 +223,13 @@ CNAME(ffi_call_SYSV):
    Voila!  */
 
         .text
-        .align 2
+        .align 4
 
-        .globl CNAME(ffi_closure_SYSV)
+        .globl	CNAME(ffi_closure_SYSV)
+#ifdef __ELF__
+	.type	CNAME(ffi_closure_SYSV), #function
+	.hidden	CNAME(ffi_closure_SYSV)
+#endif
         cfi_startproc
 CNAME(ffi_closure_SYSV):
         stp     x29, x30, [sp, #-16]!
-- 
1.9.3

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH 09/16] aarch64: Merge prep_args with ffi_call
  2014-10-28 18:53 [PATCH 00/16] Go closures for aarch64 Richard Henderson
                   ` (10 preceding siblings ...)
  2014-10-28 18:54 ` [PATCH 04/16] aarch64: Simplify AARCH64_STACK_ALIGN Richard Henderson
@ 2014-10-28 18:54 ` Richard Henderson
  2014-10-28 18:54 ` [PATCH 15/16] aarch64: Move x8 out of call_context Richard Henderson
                   ` (4 subsequent siblings)
  16 siblings, 0 replies; 20+ messages in thread
From: Richard Henderson @ 2014-10-28 18:54 UTC (permalink / raw)
  To: libffi-discuss; +Cc: Richard Henderson

From: Richard Henderson <rth@redhat.com>

Use the trick of allocating the stack frame for ffi_call_SYSV
within ffi_call itself via alloca, so the assembly no longer needs
a separate prep_args callback.
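(Schematically, the single allocation and its layout, as set up in
ffi_call in the hunk below:

    stack   = alloca (stack_bytes + 32 + sizeof (struct call_context));
    frame   = stack + stack_bytes;   /* 32-byte x29/x30/flags save area */
    context = frame + 32;            /* x0-x7 / q0-q7 / x8 register file */

ffi_call_SYSV then switches SP to this block instead of carving out
its own frame.)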
---
 src/aarch64/ffi.c  | 193 ++++++++++++++++++++++++-----------------------------
 src/aarch64/sysv.S | 192 ++++++++++++++++------------------------------------
 2 files changed, 144 insertions(+), 241 deletions(-)

diff --git a/src/aarch64/ffi.c b/src/aarch64/ffi.c
index d19384b..a067303 100644
--- a/src/aarch64/ffi.c
+++ b/src/aarch64/ffi.c
@@ -72,14 +72,6 @@ ffi_clear_cache (void *start, void *end)
 }
 
 extern void
-ffi_call_SYSV (unsigned (*)(struct call_context *context, unsigned char *,
-			    extended_cif *),
-               struct call_context *context,
-               extended_cif *,
-               size_t,
-               void (*fn)(void));
-
-extern void
 ffi_closure_SYSV (ffi_closure *);
 
 /* Test for an FFI floating point representation.  */
@@ -311,12 +303,11 @@ struct arg_state
 
 /* Initialize a procedure call argument marshalling state.  */
 static void
-arg_init (struct arg_state *state, size_t call_frame_size)
+arg_init (struct arg_state *state)
 {
   state->ngrn = 0;
   state->nsrn = 0;
   state->nsaa = 0;
-
 #if defined (__APPLE__)
   state->allocating_variadic = 0;
 #endif
@@ -529,27 +520,88 @@ allocate_int_to_reg_or_stack (struct call_context *context,
   return allocate_to_stack (state, stack, size, size);
 }
 
-/* Marshall the arguments from FFI representation to procedure call
-   context and stack.  */
+ffi_status
+ffi_prep_cif_machdep (ffi_cif *cif)
+{
+  /* Round the stack up to a multiple of the stack alignment requirement. */
+  cif->bytes = ALIGN(cif->bytes, 16);
 
-static unsigned
-aarch64_prep_args (struct call_context *context, unsigned char *stack,
-		   extended_cif *ecif)
+  /* Initialize our flags. We are interested if this CIF will touch a
+     vector register, if so we will enable context save and load to
+     those registers, otherwise not. This is intended to be friendly
+     to lazy float context switching in the kernel.  */
+  cif->aarch64_flags = 0;
+
+  if (is_v_register_candidate (cif->rtype))
+    {
+      cif->aarch64_flags |= AARCH64_FLAG_ARG_V;
+    }
+  else
+    {
+      int i;
+      for (i = 0; i < cif->nargs; i++)
+        if (is_v_register_candidate (cif->arg_types[i]))
+          {
+            cif->aarch64_flags |= AARCH64_FLAG_ARG_V;
+            break;
+          }
+    }
+
+#if defined (__APPLE__)
+  cif->aarch64_nfixedargs = 0;
+#endif
+
+  return FFI_OK;
+}
+
+#if defined (__APPLE__)
+
+/* Perform Apple-specific cif processing for variadic calls */
+ffi_status ffi_prep_cif_machdep_var(ffi_cif *cif,
+				    unsigned int nfixedargs,
+				    unsigned int ntotalargs)
 {
-  ffi_cif *cif = ecif->cif;
-  void **avalue = ecif->avalue;
-  int i, nargs = cif->nargs;
+  ffi_status status;
+
+  status = ffi_prep_cif_machdep (cif);
+
+  cif->aarch64_nfixedargs = nfixedargs;
+
+  return status;
+}
+
+#endif
+
+extern void ffi_call_SYSV (void *stack, void *frame,
+			   void (*fn)(void), int flags) FFI_HIDDEN;
+
+/* Call a function with the provided arguments and capture the return
+   value.  */
+void
+ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
+{
+  struct call_context *context;
+  void *stack, *frame;
   struct arg_state state;
+  size_t stack_bytes;
+  int i, nargs = cif->nargs;
+  int h, t;
+  ffi_type *rtype;
 
-  arg_init (&state, cif->bytes);
+  /* Allocate consecutive stack for everything we'll need.  */
+  stack_bytes = cif->bytes;
+  stack = alloca (stack_bytes + 32 + sizeof(struct call_context));
+  frame = stack + stack_bytes;
+  context = frame + 32;
 
+  arg_init (&state);
   for (i = 0; i < nargs; i++)
     {
       ffi_type *ty = cif->arg_types[i];
       size_t s = ty->size;
-      int h, t = ty->type;
       void *a = avalue[i];
 
+      t = ty->type;
       switch (t)
 	{
 	case FFI_TYPE_VOID:
@@ -665,83 +717,12 @@ aarch64_prep_args (struct call_context *context, unsigned char *stack,
 #endif
     }
 
-  return cif->aarch64_flags;
-}
-
-ffi_status
-ffi_prep_cif_machdep (ffi_cif *cif)
-{
-  /* Round the stack up to a multiple of the stack alignment requirement. */
-  cif->bytes = ALIGN(cif->bytes, 16);
-
-  /* Initialize our flags. We are interested if this CIF will touch a
-     vector register, if so we will enable context save and load to
-     those registers, otherwise not. This is intended to be friendly
-     to lazy float context switching in the kernel.  */
-  cif->aarch64_flags = 0;
-
-  if (is_v_register_candidate (cif->rtype))
-    {
-      cif->aarch64_flags |= AARCH64_FLAG_ARG_V;
-    }
-  else
-    {
-      int i;
-      for (i = 0; i < cif->nargs; i++)
-        if (is_v_register_candidate (cif->arg_types[i]))
-          {
-            cif->aarch64_flags |= AARCH64_FLAG_ARG_V;
-            break;
-          }
-    }
-
-#if defined (__APPLE__)
-  cif->aarch64_nfixedargs = 0;
-#endif
-
-  return FFI_OK;
-}
-
-#if defined (__APPLE__)
-
-/* Perform Apple-specific cif processing for variadic calls */
-ffi_status ffi_prep_cif_machdep_var(ffi_cif *cif,
-				    unsigned int nfixedargs,
-				    unsigned int ntotalargs)
-{
-  ffi_status status;
-
-  status = ffi_prep_cif_machdep (cif);
-
-  cif->aarch64_nfixedargs = nfixedargs;
-
-  return status;
-}
-
-#endif
-
-/* Call a function with the provided arguments and capture the return
-   value.  */
-void
-ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
-{
-  extended_cif ecif;
-  struct call_context context;
-  size_t stack_bytes;
-  int h, t;
-
-  ecif.cif = cif;
-  ecif.avalue = avalue;
-  ecif.rvalue = rvalue;
-
-  stack_bytes = cif->bytes;
-
-  memset (&context, 0, sizeof (context));
-  if (is_register_candidate (cif->rtype))
+  rtype = cif->rtype;
+  if (is_register_candidate (rtype))
     {
-      ffi_call_SYSV (aarch64_prep_args, &context, &ecif, stack_bytes, fn);
+      ffi_call_SYSV (stack, frame, fn, cif->aarch64_flags);
 
-      t = cif->rtype->type;
+      t = rtype->type;
       switch (t)
 	{
 	case FFI_TYPE_INT:
@@ -754,33 +735,35 @@ ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
 	case FFI_TYPE_POINTER:
 	case FFI_TYPE_UINT64:
 	case FFI_TYPE_SINT64:
-	  *(ffi_arg *)rvalue = extend_integer_type (&context.x[0], t);
+	  *(ffi_arg *)rvalue = extend_integer_type (&context->x[0], t);
 	  break;
 
 	case FFI_TYPE_FLOAT:
 	case FFI_TYPE_DOUBLE:
 	case FFI_TYPE_LONGDOUBLE:
-	  compress_hfa_type (rvalue, &context.v[0], 0x100 + t);
+	  compress_hfa_type (rvalue, &context->v[0], 0x100 + t);
 	  break;
 
 	case FFI_TYPE_STRUCT:
 	  h = is_hfa (cif->rtype);
 	  if (h)
-	    compress_hfa_type (rvalue, &context.v[0], h);
-	  else if ((cif->rtype->size + 7) / 8 < N_X_ARG_REG)
-	    memcpy (rvalue, &context.x[0], cif->rtype->size);
+	    compress_hfa_type (rvalue, &context->v[0], h);
 	  else
-	    abort();
+	    {
+	      FFI_ASSERT (rtype->size <= 16);
+	      memcpy (rvalue, &context->x[0], rtype->size);
+	    }
 	  break;
 
 	default:
-	  abort();
+	  FFI_ASSERT (0);
+	  break;
 	}
     }
   else
     {
-      context.x8 = (uintptr_t)rvalue;
-      ffi_call_SYSV (aarch64_prep_args, &context, &ecif, stack_bytes, fn);
+      context->x8 = (uintptr_t)rvalue;
+      ffi_call_SYSV (stack, frame, fn, cif->aarch64_flags);
     }
 }
 
@@ -851,7 +834,7 @@ ffi_closure_SYSV_inner (ffi_closure *closure, struct call_context *context,
   struct arg_state state;
   ffi_type *rtype;
 
-  arg_init (&state, ALIGN(cif->bytes, 16));
+  arg_init (&state);
 
   for (i = 0; i < nargs; i++)
     {
diff --git a/src/aarch64/sysv.S b/src/aarch64/sysv.S
index fa7ff5b..a5f636a 100644
--- a/src/aarch64/sysv.S
+++ b/src/aarch64/sysv.S
@@ -22,6 +22,7 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
 #define LIBFFI_ASM
 #include <fficonfig.h>
 #include <ffi.h>
+#include <ffi_cfi.h>
 #include "internal.h"
 
 #ifdef HAVE_MACHINE_ASM_H
@@ -38,158 +39,77 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
 #endif
 #endif
 
-#define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
-#define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
-#define cfi_restore(reg)		.cfi_restore reg
-#define cfi_def_cfa_register(reg)	.cfi_def_cfa_register reg
+	.text
+	.align 2
 
-        .text
-        .align 2
-
-        .globl CNAME(ffi_call_SYSV)
+	.globl CNAME(ffi_call_SYSV)
 #ifdef __ELF__
-        .type CNAME(ffi_call_SYSV), #function
+	.type	CNAME(ffi_call_SYSV), #function
+	.hidden	CNAME(ffi_call_SYSV)
 #endif
 
-/* ffi_call_SYSV()
-
-   Create a stack frame, setup an argument context, call the callee
-   and extract the result.
-
-   The maximum required argument stack size is provided,
-   ffi_call_SYSV() allocates that stack space then calls the
-   prepare_fn to populate register context and stack.  The
-   argument passing registers are loaded from the register
-   context and the callee called, on return the register passing
-   register are saved back to the context.  Our caller will
-   extract the return value from the final state of the saved
-   register context.
-
-   Prototype:
-
-   extern unsigned
-   ffi_call_SYSV (void (*)(struct call_context *context, unsigned char *,
-			   extended_cif *),
-                  struct call_context *context,
-                  extended_cif *,
-                  size_t required_stack_size,
-                  void (*fn)(void));
+/* ffi_call_SYSV
+   extern void ffi_call_SYSV (void *stack, void *frame,
+			      void (*fn)(void), int flags);
 
    Therefore on entry we have:
 
-   x0 prepare_fn
-   x1 &context
-   x2 &ecif
-   x3 bytes
-   x4 fn
-
-   This function uses the following stack frame layout:
+   x0 stack
+   x1 frame
+   x2 fn
+   x3 flags
+*/
 
-   ==
-                saved x30(lr)
-   x29(fp)->    saved x29(fp)
-                saved x24
-                saved x23
-                saved x22
-   sp'    ->    saved x21
-                ...
-   sp     ->    (constructed callee stack arguments)
-   ==
-
-   Voila! */
-
-#define ffi_call_SYSV_FS (8 * 4)
-
-        .cfi_startproc
+	cfi_startproc
 CNAME(ffi_call_SYSV):
-        stp     x29, x30, [sp, #-16]!
-	cfi_adjust_cfa_offset (16)
-        cfi_rel_offset (x29, 0)
-        cfi_rel_offset (x30, 8)
-
-        mov     x29, sp
-	cfi_def_cfa_register (x29)
-        sub     sp, sp, #ffi_call_SYSV_FS
-
-        stp     x21, x22, [sp, #0]
-        cfi_rel_offset (x21, 0 - ffi_call_SYSV_FS)
-        cfi_rel_offset (x22, 8 - ffi_call_SYSV_FS)
-
-        stp     x23, x24, [sp, #16]
-        cfi_rel_offset (x23, 16 - ffi_call_SYSV_FS)
-        cfi_rel_offset (x24, 24 - ffi_call_SYSV_FS)
-
-        mov     x21, x1
-        mov     x22, x2
-        mov     x24, x4
-
-        /* Allocate the stack space for the actual arguments, many
-           arguments will be passed in registers, but we assume
-           worst case and allocate sufficient stack for ALL of
-           the arguments.  */
-        sub     sp, sp, x3
-
-        /* unsigned (*prepare_fn) (struct call_context *context,
-				   unsigned char *stack, extended_cif *ecif);
-	 */
-        mov     x23, x0
-        mov     x0, x1
-        mov     x1, sp
-        /* x2 already in place */
-        blr     x23
-
-        /* Preserve the flags returned.  */
-        mov     x23, x0
-
-        /* Figure out if we should touch the vector registers.  */
-        tbz     x23, #AARCH64_FLAG_ARG_V_BIT, 1f
-
-        /* Load the vector argument passing registers.  */
-        ldp     q0, q1, [x21, #0]
-        ldp     q2, q3, [x21, #32]
-        ldp     q4, q5, [x21, #64]
-        ldp     q6, q7, [x21, #96]
+	/* Use a stack frame allocated by our caller.  */
+	cfi_def_cfa(x1, 32);
+	stp	x29, x30, [x1]
+	mov	x29, x1
+	mov	sp, x0
+	cfi_def_cfa_register(x29)
+	cfi_rel_offset (x29, 0)
+	cfi_rel_offset (x30, 8)
+
+	str	w3, [x29, #16]		/* save flags */
+	mov	x9, x2			/* save fn */
+
+	/* Load the vector argument passing registers, if necessary.  */
+	tbz	w3, #AARCH64_FLAG_ARG_V_BIT, 1f
+	ldp     q0, q1, [x29, #32 + 0]
+	ldp     q2, q3, [x29, #32 + 32]
+	ldp     q4, q5, [x29, #32 + 64]
+	ldp     q6, q7, [x29, #32 + 96]
 1:
-        /* Load the core argument passing registers, including
+	/* Load the core argument passing registers, including
 	   the structure return pointer.  */
-        ldp     x0, x1, [x21, #16*N_V_ARG_REG + 0]
-        ldp     x2, x3, [x21, #16*N_V_ARG_REG + 16]
-        ldp     x4, x5, [x21, #16*N_V_ARG_REG + 32]
-        ldp     x6, x7, [x21, #16*N_V_ARG_REG + 48]
-        ldr     x8,     [x21, #16*N_V_ARG_REG + 64]
-
-        blr     x24
+	ldp     x0, x1, [x29, #32 + 16*N_V_ARG_REG + 0]
+	ldp     x2, x3, [x29, #32 + 16*N_V_ARG_REG + 16]
+	ldp     x4, x5, [x29, #32 + 16*N_V_ARG_REG + 32]
+	ldp     x6, x7, [x29, #32 + 16*N_V_ARG_REG + 48]
+	ldr     x8,     [x29, #32 + 16*N_V_ARG_REG + 64]
 
-        /* Save the core return registers.  */
-        stp     x0, x1, [x21, #16*N_V_ARG_REG]
+	blr     x9			/* call fn */
 
-        /* Figure out if we should touch the vector registers.  */
-        tbz     x23, #AARCH64_FLAG_ARG_V_BIT, 1f
+	ldr	w3, [x29, #16]		/* reload flags */
 
-        /* Save the vector return registers.  */
-        stp     q0, q1, [x21, #0]
-        stp     q2, q3, [x21, #32]
-1:
-        /* All done, unwind our stack frame.  */
-        ldp     x21, x22, [x29,  # - ffi_call_SYSV_FS]
-        cfi_restore (x21)
-        cfi_restore (x22)
-
-        ldp     x23, x24, [x29,  # - ffi_call_SYSV_FS + 16]
-        cfi_restore (x23)
-        cfi_restore (x24)
-
-        mov     sp, x29
+	/* Partially deconstruct the stack frame.  */
+	mov     sp, x29
 	cfi_def_cfa_register (sp)
+	ldp     x29, x30, [x29]
 
-        ldp     x29, x30, [sp], #16
-	cfi_adjust_cfa_offset (-16)
-        cfi_restore (x29)
-        cfi_restore (x30)
+	/* Save the core return registers.  */
+	stp     x0, x1, [sp, #32 + 16*N_V_ARG_REG]
 
-        ret
+	/* Save the vector return registers, if necessary.  */
+	tbz     w3, #AARCH64_FLAG_ARG_V_BIT, 1f
+	stp     q0, q1, [sp, #32 + 0]
+	stp     q2, q3, [sp, #32 + 32]
+1:
+	/* All done.  */
+	ret
 
-        .cfi_endproc
+	cfi_endproc
 #ifdef __ELF__
         .size CNAME(ffi_call_SYSV), .-CNAME(ffi_call_SYSV)
 #endif
@@ -237,7 +157,7 @@ CNAME(ffi_call_SYSV):
         .align 2
 
         .globl CNAME(ffi_closure_SYSV)
-        .cfi_startproc
+        cfi_startproc
 CNAME(ffi_closure_SYSV):
         stp     x29, x30, [sp, #-16]!
 	cfi_adjust_cfa_offset (16)
@@ -310,7 +230,7 @@ CNAME(ffi_closure_SYSV):
         cfi_restore (x30)
 
         ret
-        .cfi_endproc
+	cfi_endproc
 #ifdef __ELF__
         .size CNAME(ffi_closure_SYSV), .-CNAME(ffi_closure_SYSV)
 #endif
-- 
1.9.3

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH 07/16] aarch64: Treat void return as not passed in registers
  2014-10-28 18:53 [PATCH 00/16] Go closures for aarch64 Richard Henderson
                   ` (3 preceding siblings ...)
  2014-10-28 18:54 ` [PATCH 10/16] aarch64: Move return value handling into ffi_call_SYSV Richard Henderson
@ 2014-10-28 18:54 ` Richard Henderson
  2014-10-28 18:54 ` [PATCH 12/16] aarch64: Unify scalar fp and hfa handling Richard Henderson
                   ` (11 subsequent siblings)
  16 siblings, 0 replies; 20+ messages in thread
From: Richard Henderson @ 2014-10-28 18:54 UTC (permalink / raw)
  To: libffi-discuss; +Cc: Richard Henderson

From: Richard Henderson <rth@redhat.com>

This lets us do less post-processing when there's no return value.
---
 src/aarch64/ffi.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/aarch64/ffi.c b/src/aarch64/ffi.c
index 58d088b..6c338e1 100644
--- a/src/aarch64/ffi.c
+++ b/src/aarch64/ffi.c
@@ -383,6 +383,7 @@ is_register_candidate (ffi_type *ty)
   switch (ty->type)
     {
     case FFI_TYPE_VOID:
+      return 0;
     case FFI_TYPE_FLOAT:
     case FFI_TYPE_DOUBLE:
     case FFI_TYPE_LONGDOUBLE:
-- 
1.9.3

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 00/16] Go closures for aarch64
  2014-10-28 18:53 [PATCH 00/16] Go closures for aarch64 Richard Henderson
                   ` (15 preceding siblings ...)
  2014-10-28 18:54 ` [PATCH 14/16] aarch64: Add support for complex types Richard Henderson
@ 2014-11-10 10:12 ` James Greenhalgh
  2014-11-10 13:22   ` Richard Henderson
  16 siblings, 1 reply; 20+ messages in thread
From: James Greenhalgh @ 2014-11-10 10:12 UTC (permalink / raw)
  To: Richard Henderson; +Cc: libffi-discuss

On Tue, Oct 28, 2014 at 06:52:57PM +0000, Richard Henderson wrote:
> This patch set fixes a compilation error since the iOS merge,
> tidies up the port significantly, and finally adds support for
> complex and Go closures.

Hi Richard,

Possibly an irrelevant comment for this patch series, but while rewriting
the world, did you consider Jakub's comments in this thread on
gcc-patches regarding .note.GNU-stack notes?
  https://gcc.gnu.org/ml/gcc-patches/2014-09/msg00820.html

"I've noticed that on 4.8 branch libgo recently (in the last few months)
  started being linked with
    GNU_STACK      0x000000 0x00000000 0x00000000 0x00000 0x00000 RWE 0x10
  i.e. requiring executable stack on powerpc-linux (32-bit).

  The problem is that we link into libffi linux64.o and linux64_closure.o
  unconditionally, both for 32-bit and 64-bit compilations, just for 32-bit
  ones all the assembly is ifdefed out, so they have just empty sections.
  The .note.GNU-stack section isn't emitted in that case either, which means
  that the linker conservatively treats those as possibly needing executable
  stack.

  The following patch should fix that, ok for trunk/4.9/4.8?

  BTW, I wonder if e.g. libffi/src/arm/trampoline.S or
  libffi/src/aarch64/sysv.S shouldn't have those notes too (note, both of
  those were added after 2008 when most of the *.S files were marked that
  way)."

If it doesn't belong in this series, I'll propose a patch adding it once
your patches have gone in.

Thanks,
James

> Richard Henderson (16):
>   aarch64: Fix non-apple compilation
>   aarch64: Improve is_hfa
>   aarch64: Always distinguish LONGDOUBLE
>   aarch64: Simplify AARCH64_STACK_ALIGN
>   aarch64: Reduce the size of register_context
>   aarch64: Use correct return registers
>   aarch64: Treat void return as not passed in registers
>   aarch64: Tidy up abi manipulation
>   aarch64: Merge prep_args with ffi_call
>   aarch64: Move return value handling into ffi_call_SYSV
>   aarch64: Move return value handling into ffi_closure_SYSV
>   aarch64: Unify scalar fp and hfa handling
>   aarch64: Remove aarch64_flags
>   aarch64: Add support for complex types
>   aarch64: Move x8 out of call_context
>   aarch64: Add support for Go closures
> 
>  src/aarch64/ffi.c              | 1477 ++++++++++++++++------------------------
>  src/aarch64/ffitarget.h        |   14 +-
>  src/aarch64/internal.h         |   67 ++
>  src/aarch64/sysv.S             |  589 +++++++++-------
>  testsuite/libffi.call/call.exp |   10 +-
>  5 files changed, 1008 insertions(+), 1149 deletions(-)
>  create mode 100644 src/aarch64/internal.h
> 
> -- 
> 1.9.3
> 
> 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 00/16] Go closures for aarch64
  2014-11-10 10:12 ` [PATCH 00/16] Go closures for aarch64 James Greenhalgh
@ 2014-11-10 13:22   ` Richard Henderson
  0 siblings, 0 replies; 20+ messages in thread
From: Richard Henderson @ 2014-11-10 13:22 UTC (permalink / raw)
  To: James Greenhalgh; +Cc: libffi-discuss

On 11/10/2014 11:12 AM, James Greenhalgh wrote:
> If it doesn't belong in this series, I'll propose a patch adding it once
> your patches have gone in.

I hadn't considered missing GNU stack markers.
Fixing any that are still missing after my patch set would be most appreciated.


r~

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 08/16] aarch64: Tidy up abi manipulation
  2014-10-28 18:54 ` [PATCH 08/16] aarch64: Tidy up abi manipulation Richard Henderson
@ 2017-09-17 14:24   ` Andreas Schwab
  0 siblings, 0 replies; 20+ messages in thread
From: Andreas Schwab @ 2017-09-17 14:24 UTC (permalink / raw)
  To: Richard Henderson; +Cc: libffi-discuss, Richard Henderson

On Oct 28 2014, Richard Henderson <rth@twiddle.net> wrote:

> +	case FFI_TYPE_STRUCT:
> +	  {
> +	    void *dest;
> +	    int elems;
> +
> +	    h = is_hfa (ty);
> +	    if (h)
> +	      {
> +	    do_hfa:
> +		elems = h >> 8;
> +	        if (state.nsrn + elems <= N_V_ARG_REG)
> +		  {
> +		    dest = &context->v[state.nsrn];
> +		    state.nsrn += elems;
> +		    extend_hfa_type (dest, a, h);
> +		    break;
> +		  }
> +		state.nsrn = N_V_ARG_REG;
> +		dest = allocate_to_stack (&state, stack, ty->alignment, s);
> +	      }
> +	    else if (s > 16)
> +	      {
> +		/* If the argument is a composite type that is larger than 16
> +		   bytes, then the argument has been copied to memory, and
> +		   the argument is replaced by a pointer to the copy.  */
> +		a = &avalue[i];
> +		t = FFI_TYPE_POINTER;
> +		goto do_pointer;

I don't see where the argument has been copied to memory.  Doesn't that
need to call allocate_to_stack here?
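
Something along these lines is what I would have expected here
(sketch only, reusing the allocate_to_stack call from the
surrounding code; not a tested fix):

    void *copy = allocate_to_stack (&state, stack, ty->alignment, s);
    memcpy (copy, avalue[i], s);
    a = &copy;			/* pass a pointer to the copy */
    t = FFI_TYPE_POINTER;
    goto do_pointer;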

Andreas.

-- 
Andreas Schwab, schwab@linux-m68k.org
GPG Key fingerprint = 58CA 54C7 6D53 942B 1756  01D3 44D5 214B 8276 4ED5
"And now for something completely different."

^ permalink raw reply	[flat|nested] 20+ messages in thread
