From: Richard Henderson <rth@redhat.com>
To: gcc-patches@gcc.gnu.org
Cc: libffi-discuss@sourceware.org, gofrontend-dev@googlegroups.com
Subject: [PATCH 10/13] libffi: Rewrite aarch64
Date: Fri, 10 Oct 2014 20:43:00 -0000
Message-Id: <1412973773-3942-11-git-send-email-rth@redhat.com>
In-Reply-To: <1412973773-3942-1-git-send-email-rth@redhat.com>
References: <1412973773-3942-1-git-send-email-rth@redhat.com>

(1) Invent a new "internal.h" rather than polluting the public ffitarget.h with stuff that ought not be exposed.

(2) Rewrite is_hfa so that it is not so horribly computationally expensive, and, more to the point, so that it does not require us to re-compute the same data in order to actually do anything with the type.

(3) Don't use the outdated prep_args callback form for ffi_call. The x86_64 port has shown for years how to do this with a single alloca, but new ports keep copying i386, which still does it the inefficient way.
---
 libffi/src/aarch64/ffi.c       | 1362 +++++++++++++++-------------------
 libffi/src/aarch64/ffitarget.h |   17 +-
 libffi/src/aarch64/internal.h  |   43 ++
 libffi/src/aarch64/sysv.S      |  499 ++++++++-------
 4 files changed, 816 insertions(+), 1105 deletions(-)
 create mode 100644 libffi/src/aarch64/internal.h

diff --git a/libffi/src/aarch64/ffi.c b/libffi/src/aarch64/ffi.c
index 1405665..c409c0c 100644
--- a/libffi/src/aarch64/ffi.c
+++ b/libffi/src/aarch64/ffi.c
@@ -20,42 +20,37 @@
 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
 #include <stdio.h>
-
+#include <stdlib.h>
+#include <stdint.h>
 #include <ffi.h>
 #include <ffi_common.h>
+#include "internal.h"
-#include <stdlib.h>
-
-/* Stack alignment requirement in bytes */
+/* Stack alignment requirement in bytes. */
 #define AARCH64_STACK_ALIGN 16
+/* Number of X and V argument registers.
*/ #define N_X_ARG_REG 8 #define N_V_ARG_REG 8 -#define AARCH64_FFI_WITH_V (1 << AARCH64_FFI_WITH_V_BIT) - union _d { UINT64 d; UINT32 s[2]; }; -struct call_context +struct _v { - UINT64 x [AARCH64_N_XREG]; - struct - { - union _d d[2]; - } v [AARCH64_N_VREG]; + union _d d[2] __attribute__((aligned(16))); }; -static void * -get_x_addr (struct call_context *context, unsigned n) +struct call_context { - return &context->x[n]; -} + UINT64 x[N_X_ARG_REG]; + struct _v v[N_V_ARG_REG]; +}; -static void * +static inline UINT32 * get_s_addr (struct call_context *context, unsigned n) { #if defined __AARCH64EB__ @@ -65,557 +60,371 @@ get_s_addr (struct call_context *context, unsigned n) #endif } -static void * +static inline UINT64 * get_d_addr (struct call_context *context, unsigned n) { #if defined __AARCH64EB__ - return &context->v[n].d[1]; + return &context->v[n].d[1].d; #else - return &context->v[n].d[0]; + return &context->v[n].d[0].d; #endif } -static void * -get_v_addr (struct call_context *context, unsigned n) -{ - return &context->v[n]; -} - -/* Return the memory location at which a basic type would reside - were it to have been stored in register n. */ - -static void * -get_basic_type_addr (unsigned short type, struct call_context *context, - unsigned n) -{ - switch (type) - { - case FFI_TYPE_FLOAT: - return get_s_addr (context, n); - case FFI_TYPE_DOUBLE: - return get_d_addr (context, n); - case FFI_TYPE_LONGDOUBLE: - return get_v_addr (context, n); - case FFI_TYPE_UINT8: - case FFI_TYPE_SINT8: - case FFI_TYPE_UINT16: - case FFI_TYPE_SINT16: - case FFI_TYPE_UINT32: - case FFI_TYPE_SINT32: - case FFI_TYPE_INT: - case FFI_TYPE_POINTER: - case FFI_TYPE_UINT64: - case FFI_TYPE_SINT64: - return get_x_addr (context, n); - default: - FFI_ASSERT (0); - return NULL; - } -} - -/* Return the alignment width for each of the basic types. */ - -static size_t -get_basic_type_alignment (unsigned short type) -{ - switch (type) - { - case FFI_TYPE_FLOAT: - case FFI_TYPE_DOUBLE: - return sizeof (UINT64); - case FFI_TYPE_LONGDOUBLE: - return sizeof (long double); - case FFI_TYPE_UINT8: - case FFI_TYPE_SINT8: - case FFI_TYPE_UINT16: - case FFI_TYPE_SINT16: - case FFI_TYPE_UINT32: - case FFI_TYPE_INT: - case FFI_TYPE_SINT32: - case FFI_TYPE_POINTER: - case FFI_TYPE_UINT64: - case FFI_TYPE_SINT64: - return sizeof (UINT64); +extern void ffi_call_SYSV (void *frame, void *rvalue, + struct call_context *context, + unsigned flags, void (*fn)(void)) FFI_HIDDEN; - default: - FFI_ASSERT (0); - return 0; - } -} +extern void ffi_closure_SYSV (void) FFI_HIDDEN; +extern void ffi_closure_SYSV_V (void) FFI_HIDDEN; -/* Return the size in bytes for each of the basic types. */ +/* A subroutine of is_hfa. Given a structure type, return the type code + of the first non-structure element. Recurse for structure elements. + Return -1 if the structure is in fact empty, i.e. no nested elements. 
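(Illustration, not part of the patch: for an ffi_type describing struct { struct { float f; } a; float b; }, the first non-structure element is the nested float, so is_hfa0 yields FFI_TYPE_FLOAT; if the first member were itself an empty structure, its -1 result makes the scan continue on to the next element rather than break.)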
*/ -static size_t -get_basic_type_size (unsigned short type) +static int +is_hfa0 (const ffi_type *ty) { - switch (type) - { - case FFI_TYPE_FLOAT: - return sizeof (UINT32); - case FFI_TYPE_DOUBLE: - return sizeof (UINT64); - case FFI_TYPE_LONGDOUBLE: - return sizeof (long double); - case FFI_TYPE_UINT8: - return sizeof (UINT8); - case FFI_TYPE_SINT8: - return sizeof (SINT8); - case FFI_TYPE_UINT16: - return sizeof (UINT16); - case FFI_TYPE_SINT16: - return sizeof (SINT16); - case FFI_TYPE_UINT32: - return sizeof (UINT32); - case FFI_TYPE_INT: - case FFI_TYPE_SINT32: - return sizeof (SINT32); - case FFI_TYPE_POINTER: - case FFI_TYPE_UINT64: - return sizeof (UINT64); - case FFI_TYPE_SINT64: - return sizeof (SINT64); - - default: - FFI_ASSERT (0); - return 0; - } -} + ffi_type **elements = ty->elements; + int i, ret = -1; -extern void -ffi_call_SYSV (unsigned (*)(struct call_context *context, unsigned char *, - extended_cif *), - struct call_context *context, - extended_cif *, - unsigned, - void (*fn)(void)); - -extern void -ffi_closure_SYSV (ffi_closure *); - -/* Test for an FFI floating point representation. */ + if (elements != NULL) + for (i = 0; elements[i]; ++i) + { + ret = elements[i]->type; + if (ret == FFI_TYPE_STRUCT) + { + ret = is_hfa0 (elements[i]); + if (ret < 0) + continue; + } + break; + } -static unsigned -is_floating_type (unsigned short type) -{ - return (type == FFI_TYPE_FLOAT || type == FFI_TYPE_DOUBLE - || type == FFI_TYPE_LONGDOUBLE); + return ret; } -/* Test for a homogeneous structure. */ +/* A subroutine of is_hfa. Given a structure type, return true if all + of the non-structure elements are the same as CANDIDATE. */ -static unsigned short -get_homogeneous_type (ffi_type *ty) +static int +is_hfa1 (const ffi_type *ty, int candidate) { - if (ty->type == FFI_TYPE_STRUCT && ty->elements) - { - unsigned i; - unsigned short candidate_type - = get_homogeneous_type (ty->elements[0]); - for (i =1; ty->elements[i]; i++) - { - unsigned short iteration_type = 0; - /* If we have a nested struct, we must find its homogeneous type. - If that fits with our candidate type, we are still - homogeneous. */ - if (ty->elements[i]->type == FFI_TYPE_STRUCT - && ty->elements[i]->elements) - { - iteration_type = get_homogeneous_type (ty->elements[i]); - } - else - { - iteration_type = ty->elements[i]->type; - } + ffi_type **elements = ty->elements; + int i; - /* If we are not homogeneous, return FFI_TYPE_STRUCT. */ - if (candidate_type != iteration_type) - return FFI_TYPE_STRUCT; - } - return candidate_type; - } + if (elements != NULL) + for (i = 0; elements[i]; ++i) + { + int t = elements[i]->type; + if (t == FFI_TYPE_STRUCT) + { + if (!is_hfa1 (elements[i], candidate)) + return 0; + } + else if (t != candidate) + return 0; + } - /* Base case, we have no more levels of nesting, so we - are a basic type, and so, trivially homogeneous in that type. */ - return ty->type; + return 1; } -/* Determine the number of elements within a STRUCT. +/* Determine if TY is a homogeneous floating point aggregate (HFA). + That is, a structure consisting of 1 to 4 members, all of the same type, + where that type is a floating point scalar. - Note, we must handle nested structs. + Returns non-zero iff TY is an HFA. The result is an encoded value where + bits 0-7 contain the type code, and bits 8-10 contain the element count. - If ty is not a STRUCT this function will return 0. 
*/ - -static unsigned -element_count (ffi_type *ty) +static int +is_hfa(const ffi_type *ty) { - if (ty->type == FFI_TYPE_STRUCT && ty->elements) - { - unsigned n; - unsigned elems = 0; - for (n = 0; ty->elements[n]; n++) - { - if (ty->elements[n]->type == FFI_TYPE_STRUCT - && ty->elements[n]->elements) - elems += element_count (ty->elements[n]); - else - elems++; - } - return elems; - } - return 0; -} + ffi_type **elements; + int candidate, i; + size_t size, ele_count; -/* Test for a homogeneous floating point aggregate. + /* Quickest tests first. */ + if (ty->type != FFI_TYPE_STRUCT) + return 0; - A homogeneous floating point aggregate is a homogeneous aggregate of - a half- single- or double- precision floating point type with one - to four elements. Note that this includes nested structs of the - basic type. */ + /* No HFA types are smaller than 4 bytes, or larger than 64 bytes. */ + size = ty->size; + if (size < 4 || size > 64) + return 0; -static int -is_hfa (ffi_type *ty) -{ - if (ty->type == FFI_TYPE_STRUCT - && ty->elements[0] - && is_floating_type (get_homogeneous_type (ty))) + /* Find the type of the first non-structure member. */ + elements = ty->elements; + candidate = elements[0]->type; + if (candidate == FFI_TYPE_STRUCT) { - unsigned n = element_count (ty); - return n >= 1 && n <= 4; + for (i = 0; ; ++i) + { + candidate = is_hfa0 (elements[i]); + if (candidate >= 0) + break; + } } - return 0; -} - -/* Test if an ffi_type is a candidate for passing in a register. - - This test does not check that sufficient registers of the - appropriate class are actually available, merely that IFF - sufficient registers are available then the argument will be passed - in register(s). - - Note that an ffi_type that is deemed to be a register candidate - will always be returned in registers. - Returns 1 if a register candidate else 0. */ - -static int -is_register_candidate (ffi_type *ty) -{ - switch (ty->type) + /* If the first member is not a floating point type, it's not an HFA. + Also quickly re-check the size of the structure. */ + switch (candidate) { - case FFI_TYPE_VOID: case FFI_TYPE_FLOAT: + ele_count = size / sizeof(float); + if (size != ele_count * sizeof(float)) + return 0; + break; case FFI_TYPE_DOUBLE: + ele_count = size / sizeof(double); + if (size != ele_count * sizeof(double)) + return 0; + break; case FFI_TYPE_LONGDOUBLE: - case FFI_TYPE_UINT8: - case FFI_TYPE_UINT16: - case FFI_TYPE_UINT32: - case FFI_TYPE_UINT64: - case FFI_TYPE_POINTER: - case FFI_TYPE_SINT8: - case FFI_TYPE_SINT16: - case FFI_TYPE_SINT32: - case FFI_TYPE_INT: - case FFI_TYPE_SINT64: - return 1; - - case FFI_TYPE_STRUCT: - if (is_hfa (ty)) - { - return 1; - } - else if (ty->size > 16) - { - /* Too large. Will be replaced with a pointer to memory. The - pointer MAY be passed in a register, but the value will - not. This test specifically fails since the argument will - never be passed by value in registers. */ - return 0; - } - else - { - /* Might be passed in registers depending on the number of - registers required. */ - return (ty->size + 7) / 8 < N_X_ARG_REG; - } + ele_count = size / sizeof(long double); + if (size != ele_count * sizeof(long double)) + return 0; break; - default: - FFI_ASSERT (0); - break; + return 0; } + if (ele_count > 4) + return 0; - return 0; -} - -/* Test if an ffi_type argument or result is a candidate for a vector - register. 
*/ - -static int -is_v_register_candidate (ffi_type *ty) -{ - return is_floating_type (ty->type) - || (ty->type == FFI_TYPE_STRUCT && is_hfa (ty)); -} - -/* Representation of the procedure call argument marshalling - state. - - The terse state variable names match the names used in the AARCH64 - PCS. */ - -struct arg_state -{ - unsigned ngrn; /* Next general-purpose register number. */ - unsigned nsrn; /* Next vector register number. */ - unsigned nsaa; /* Next stack offset. */ -}; - -/* Initialize a procedure call argument marshalling state. */ -static void -arg_init (struct arg_state *state, unsigned call_frame_size) -{ - state->ngrn = 0; - state->nsrn = 0; - state->nsaa = 0; -} - -/* Return the number of available consecutive core argument - registers. */ - -static unsigned -available_x (struct arg_state *state) -{ - return N_X_ARG_REG - state->ngrn; -} - -/* Return the number of available consecutive vector argument - registers. */ - -static unsigned -available_v (struct arg_state *state) -{ - return N_V_ARG_REG - state->nsrn; -} - -static void * -allocate_to_x (struct call_context *context, struct arg_state *state) -{ - FFI_ASSERT (state->ngrn < N_X_ARG_REG) - return get_x_addr (context, (state->ngrn)++); -} - -static void * -allocate_to_s (struct call_context *context, struct arg_state *state) -{ - FFI_ASSERT (state->nsrn < N_V_ARG_REG) - return get_s_addr (context, (state->nsrn)++); -} - -static void * -allocate_to_d (struct call_context *context, struct arg_state *state) -{ - FFI_ASSERT (state->nsrn < N_V_ARG_REG) - return get_d_addr (context, (state->nsrn)++); -} - -static void * -allocate_to_v (struct call_context *context, struct arg_state *state) -{ - FFI_ASSERT (state->nsrn < N_V_ARG_REG) - return get_v_addr (context, (state->nsrn)++); -} - -/* Allocate an aligned slot on the stack and return a pointer to it. */ -static void * -allocate_to_stack (struct arg_state *state, void *stack, unsigned alignment, - unsigned size) -{ - void *allocation; - - /* Round up the NSAA to the larger of 8 or the natural - alignment of the argument's type. */ - state->nsaa = ALIGN (state->nsaa, alignment); - state->nsaa = ALIGN (state->nsaa, alignment); - state->nsaa = ALIGN (state->nsaa, 8); - - allocation = stack + state->nsaa; + /* Finally, make sure that all scalar elements are the same type. */ + for (i = 0; elements[i]; ++i) + { + if (elements[i]->type == FFI_TYPE_STRUCT) + { + if (!is_hfa1 (elements[i], candidate)) + return 0; + } + else if (elements[i]->type != candidate) + return 0; + } - state->nsaa += size; - return allocation; + /* All tests succeeded. Encode the result. */ + return (ele_count << 8) | candidate; } -static void -copy_basic_type (void *dest, void *source, unsigned short type) +/* Extend a basic type to fill a 64-bit slot. */ +static UINT64 +extend_basic_type (UINT64 ret, unsigned short type) { - /* This is neccessary to ensure that basic types are copied - sign extended to 64-bits as libffi expects. 
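(Illustration, not part of the patch: read back through a 64-bit ffi_arg, a SINT8 holding -1 must appear as 0xffffffffffffffff, while a UINT8 holding 0xff must appear as 0x00000000000000ff; the new extend_basic_type below performs the same widening on a value already loaded into a 64-bit slot.)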
*/ switch (type) { case FFI_TYPE_FLOAT: - *(float *) dest = *(float *) source; - break; - case FFI_TYPE_DOUBLE: - *(double *) dest = *(double *) source; - break; - case FFI_TYPE_LONGDOUBLE: - *(long double *) dest = *(long double *) source; + ret = (UINT32)ret; +#if defined __AARCH64EB__ + ret <<= 32; +#endif break; case FFI_TYPE_UINT8: - *(ffi_arg *) dest = *(UINT8 *) source; + ret = (UINT8)ret; break; case FFI_TYPE_SINT8: - *(ffi_sarg *) dest = *(SINT8 *) source; + ret = (SINT8)ret; break; case FFI_TYPE_UINT16: - *(ffi_arg *) dest = *(UINT16 *) source; + ret = (UINT16)ret; break; case FFI_TYPE_SINT16: - *(ffi_sarg *) dest = *(SINT16 *) source; + ret = (SINT16)ret; break; case FFI_TYPE_UINT32: - *(ffi_arg *) dest = *(UINT32 *) source; + ret = (UINT32)ret; break; case FFI_TYPE_INT: case FFI_TYPE_SINT32: - *(ffi_sarg *) dest = *(SINT32 *) source; + ret = (SINT32)ret; break; - case FFI_TYPE_POINTER: + case FFI_TYPE_DOUBLE: case FFI_TYPE_UINT64: - *(ffi_arg *) dest = *(UINT64 *) source; - break; case FFI_TYPE_SINT64: - *(ffi_sarg *) dest = *(SINT64 *) source; break; - + case FFI_TYPE_POINTER: + ret = (uintptr_t)ret; + break; default: - FFI_ASSERT (0); + abort (); } + return ret; } -static void -copy_hfa_to_reg_or_stack (void *memory, - ffi_type *ty, - struct call_context *context, - unsigned char *stack, - struct arg_state *state) +ffi_status FFI_HIDDEN +ffi_prep_cif_machdep (ffi_cif *cif) { - unsigned elems = element_count (ty); - if (available_v (state) < elems) - { - /* There are insufficient V registers. Further V register allocations - are prevented, the NSAA is adjusted (by allocate_to_stack ()) - and the argument is copied to memory at the adjusted NSAA. */ - state->nsrn = N_V_ARG_REG; - memcpy (allocate_to_stack (state, stack, ty->alignment, ty->size), - memory, - ty->size); - } - else - { - int i; - unsigned short type = get_homogeneous_type (ty); - unsigned elems = element_count (ty); - for (i = 0; i < elems; i++) - { - void *reg = allocate_to_v (context, state); - copy_basic_type (reg, memory, type); - memory += get_basic_type_size (type); - } - } -} + int flags, h, i; + ffi_type *rtype; -/* Either allocate an appropriate register for the argument type, or if - none are available, allocate a stack slot and return a pointer - to the allocated space. */ + /* Round the stack up to a multiple of the stack alignment requirement. */ + cif->bytes = ALIGN (cif->bytes, AARCH64_STACK_ALIGN); -static void * -allocate_to_register_or_stack (struct call_context *context, - unsigned char *stack, - struct arg_state *state, - unsigned short type) -{ - size_t alignment = get_basic_type_alignment (type); - size_t size = alignment; - switch (type) + rtype = cif->rtype; + switch (rtype->type) { - case FFI_TYPE_FLOAT: - /* This is the only case for which the allocated stack size - should not match the alignment of the type. */ - size = sizeof (UINT32); - /* Fall through. 
*/ - case FFI_TYPE_DOUBLE: - if (state->nsrn < N_V_ARG_REG) - return allocate_to_d (context, state); - state->nsrn = N_V_ARG_REG; - break; - case FFI_TYPE_LONGDOUBLE: - if (state->nsrn < N_V_ARG_REG) - return allocate_to_v (context, state); - state->nsrn = N_V_ARG_REG; + case FFI_TYPE_VOID: + flags = AARCH64_RET_VOID; break; case FFI_TYPE_UINT8: - case FFI_TYPE_SINT8: case FFI_TYPE_UINT16: - case FFI_TYPE_SINT16: case FFI_TYPE_UINT32: - case FFI_TYPE_SINT32: + flags = AARCH64_RET_UINT32; + break; case FFI_TYPE_INT: - case FFI_TYPE_POINTER: + case FFI_TYPE_SINT8: + case FFI_TYPE_SINT16: + case FFI_TYPE_SINT32: + flags = AARCH64_RET_SINT32; + break; case FFI_TYPE_UINT64: case FFI_TYPE_SINT64: - if (state->ngrn < N_X_ARG_REG) - return allocate_to_x (context, state); - state->ngrn = N_X_ARG_REG; + flags = AARCH64_RET_INT64; + break; + case FFI_TYPE_POINTER: + flags = (sizeof(void *) == 8 ? AARCH64_RET_INT64 : AARCH64_RET_UINT32); + break; + case FFI_TYPE_FLOAT: + flags = AARCH64_RET_FLOAT; + break; + case FFI_TYPE_DOUBLE: + flags = AARCH64_RET_DOUBLE; + break; + case FFI_TYPE_LONGDOUBLE: + flags = AARCH64_RET_LDOUBLE; + break; + case FFI_TYPE_STRUCT: + h = is_hfa (rtype); + switch (h & 0xff) + { + case FFI_TYPE_FLOAT: + flags = AARCH64_RET_HFA_FLOAT; + break; + case FFI_TYPE_DOUBLE: + flags = AARCH64_RET_HFA_DOUBLE; + break; + case FFI_TYPE_LONGDOUBLE: + flags = AARCH64_RET_HFA_LDOUBLE; + break; + default: + flags = (rtype->size > 16 + ? AARCH64_RET_LG_STRUCT + : AARCH64_RET_SM_STRUCT); + break; + } break; default: - FFI_ASSERT (0); + abort (); } - return allocate_to_stack (state, stack, alignment, size); -} - -/* Copy a value to an appropriate register, or if none are - available, to the stack. */ + /* Note if any argument requires fp registers. */ + for (i = 0; i < cif->nargs; i++) + { + ffi_type *ty = cif->arg_types[i]; + int tt = ty->type; + if (tt == FFI_TYPE_FLOAT + || tt == FFI_TYPE_DOUBLE + || tt == FFI_TYPE_LONGDOUBLE + || is_hfa (ty)) + { + flags |= AARCH64_FLAG_ARG_V; + break; + } + } -static void -copy_to_register_or_stack (struct call_context *context, - unsigned char *stack, - struct arg_state *state, - void *value, - unsigned short type) -{ - copy_basic_type ( - allocate_to_register_or_stack (context, stack, state, type), - value, - type); + cif->flags = flags; + return FFI_OK; } -/* Marshall the arguments from FFI representation to procedure call - context and stack. */ - -static unsigned -aarch64_prep_args (struct call_context *context, unsigned char *stack, - extended_cif *ecif) +/* Call a function with the provided arguments and capture the return + value. */ +void +ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue) { - int i; - struct arg_state state; + struct call_context *context; + UINT64 *stack, *slot; + void *frame, *local_rvalue; + ffi_type **arg_types; + int i, h, nargs, ngrn, nsrn, nsaa; + size_t size, stack_space, ret_space; - arg_init (&state, ALIGN(ecif->cif->bytes, 16)); + FFI_ASSERT (cif->abi == FFI_SYSV); - for (i = 0; i < ecif->cif->nargs; i++) + ret_space = 0; + h = cif->flags & AARCH64_FLAG_RET_MASK; + switch (h) { - ffi_type *ty = ecif->cif->arg_types[i]; - switch (ty->type) - { - case FFI_TYPE_VOID: - FFI_ASSERT (0); - break; + case AARCH64_RET_HFA_FLOAT: + /* The assembly always writes 4 elements. 
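(Illustration, not part of the patch: a two-float HFA occupies only 8 bytes, yet the st4 store in sysv.S writes all four lanes, i.e. 4 * sizeof(float) = 16 bytes; bouncing through this oversized temporary keeps the callee's store from overrunning a caller-supplied 8-byte rvalue.)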
*/ + ret_space = 4 * sizeof(float); + break; + case AARCH64_RET_HFA_DOUBLE: + ret_space = 4 * sizeof(double); + break; + case AARCH64_RET_HFA_LDOUBLE: + ret_space = 4 * sizeof(long double); + break; + case AARCH64_RET_SM_STRUCT: + ret_space = 16; + break; + case AARCH64_RET_LG_STRUCT: + if (rvalue == NULL) + ret_space = cif->rtype->size; + break; + } + /* Allocate the space for all of the arguments, the context, the local + stack frame for ffi_call_SYSV, and (possibly) the return value. */ + stack_space = ALIGN (cif->bytes, 16); + context = alloca (sizeof(struct call_context) + + stack_space + + 4 * sizeof(UINT64) + + ret_space); + stack = (UINT64 *)(context + 1); + frame = (char *)stack + stack_space; + + local_rvalue = rvalue; + if (ret_space) + local_rvalue = (char *)frame + 4 * sizeof(UINT64); + + ngrn = nsrn = nsaa = 0; + arg_types = cif->arg_types; + nargs = cif->nargs; + + for (i = 0; i < nargs; i++) + { + ffi_type *ty = arg_types[i]; + unsigned short t = ty->type; + + switch (t) + { /* If the argument is a basic type the argument is allocated to an appropriate register, or if none are available, to the stack. */ case FFI_TYPE_FLOAT: + if (nsrn < N_V_ARG_REG) + slot = get_d_addr (context, nsrn++); + else + slot = &stack[nsaa++]; + *slot = extend_basic_type (*(UINT32 *)avalue[i], t); + break; case FFI_TYPE_DOUBLE: + if (nsrn < N_V_ARG_REG) + slot = get_d_addr (context, nsrn++); + else + slot = &stack[nsaa++]; + *slot = extend_basic_type (*(UINT64 *)avalue[i], t); + break; + case FFI_TYPE_LONGDOUBLE: + if (nsrn < N_V_ARG_REG) + slot = &context->v[nsrn++].d[0].d; + else + { + nsaa = ALIGN (nsaa, 2); + slot = &stack[nsaa]; + nsaa += 2; + } + memcpy (slot, avalue[i], sizeof(long double)); + break; + case FFI_TYPE_UINT8: case FFI_TYPE_SINT8: case FFI_TYPE_UINT16: @@ -626,207 +435,111 @@ aarch64_prep_args (struct call_context *context, unsigned char *stack, case FFI_TYPE_POINTER: case FFI_TYPE_UINT64: case FFI_TYPE_SINT64: - copy_to_register_or_stack (context, stack, &state, - ecif->avalue[i], ty->type); + if (ngrn < N_X_ARG_REG) + slot = &context->x[ngrn++]; + else + slot = &stack[nsaa++]; + *slot = extend_basic_type (*(ffi_arg *)avalue[i], t); break; + case FFI_TYPE_VOID: + /* Note that libgo passes void as a parameter for a + struct with no fields. */ case FFI_TYPE_STRUCT: - if (is_hfa (ty)) - { - copy_hfa_to_reg_or_stack (ecif->avalue[i], ty, context, - stack, &state); - } - else if (ty->size > 16) - { - /* If the argument is a composite type that is larger than 16 - bytes, then the argument has been copied to memory, and - the argument is replaced by a pointer to the copy. */ + { + size_t slot_count; - copy_to_register_or_stack (context, stack, &state, - &(ecif->avalue[i]), FFI_TYPE_POINTER); - } - else if (available_x (&state) >= (ty->size + 7) / 8) - { - /* If the argument is a composite type and the size in - double-words is not more than the number of available - X registers, then the argument is copied into consecutive - X registers. */ - int j; - for (j = 0; j < (ty->size + 7) / 8; j++) - { - memcpy (allocate_to_x (context, &state), - &(((UINT64 *) ecif->avalue[i])[j]), - sizeof (UINT64)); - } - } - else - { - /* Otherwise, there are insufficient X registers. Further X - register allocations are prevented, the NSAA is adjusted - (by allocate_to_stack ()) and the argument is copied to - memory at the adjusted NSAA. 
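(Illustration, not part of the patch: with ngrn == 7, a 16-byte struct would need two X registers; under the AAPCS64 an aggregate is never split between registers and the stack, so the whole struct goes to memory and ngrn is pinned at 8 to block any later X allocation.)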
*/ - state.ngrn = N_X_ARG_REG; - - memcpy (allocate_to_stack (&state, stack, ty->alignment, - ty->size), ecif->avalue + i, ty->size); - } + size = ty->size; + slot_count = (size + 7) / 8; + h = is_hfa (ty); + if (h) + { + int j, reg_count = h >> 8, tt = h & 0xff; + + if (nsrn + reg_count <= N_V_ARG_REG) + { + switch (tt) + { + case FFI_TYPE_FLOAT: + { + UINT32 *src = avalue[i]; + for (j = 0; j < reg_count; ++j) + *get_s_addr (context, nsrn + j) = src[j]; + } + break; + + case FFI_TYPE_DOUBLE: + { + UINT64 *src = avalue[i]; + for (j = 0; j < reg_count; ++j) + *get_d_addr (context, nsrn + j) = src[j]; + } + break; + + case FFI_TYPE_LONGDOUBLE: + memcpy(&context->v[nsrn], avalue[i], size); + break; + + default: + abort (); + } + nsrn += reg_count; + break; + } + /* All out of fp registers. Copy to the stack. */ + nsrn = N_V_ARG_REG; + } + else if (size > 16) + { + /* If the argument is a composite type that is larger than + 16 bytes, then the argument has been copied to memory, + and the argument is replaced by a pointer. */ + if (ngrn < N_X_ARG_REG) + slot = &context->x[ngrn++]; + else + slot = &stack[nsaa++]; + *slot = (uintptr_t)avalue[i]; + break; + } + else + { + if (ty->alignment == 16) + ngrn = ALIGN (ngrn, 2); + + if (ngrn + slot_count <= N_X_ARG_REG) + { + slot = &context->x[ngrn]; + ngrn += slot_count; + memcpy (slot, avalue[i], size); + break; + } + /* All out of general registers. Copy to the stack. */ + ngrn = N_X_ARG_REG; + } + if (ty->alignment > 8) + { + int a = ty->alignment / 8; + nsaa = ALIGN (nsaa, a); + } + memcpy (&stack[nsaa], avalue[i], size); + nsaa += slot_count; + } break; default: - FFI_ASSERT (0); + abort (); break; } } - return ecif->cif->aarch64_flags; + size = cif->rtype->size; + ffi_call_SYSV (frame, local_rvalue, context, cif->flags, fn); + if (local_rvalue != rvalue && rvalue != NULL) + memcpy (rvalue, local_rvalue, size); } -ffi_status -ffi_prep_cif_machdep (ffi_cif *cif) -{ - /* Round the stack up to a multiple of the stack alignment requirement. */ - cif->bytes = - (cif->bytes + (AARCH64_STACK_ALIGN - 1)) & ~ (AARCH64_STACK_ALIGN - 1); - - /* Initialize our flags. We are interested if this CIF will touch a - vector register, if so we will enable context save and load to - those registers, otherwise not. This is intended to be friendly - to lazy float context switching in the kernel. */ - cif->aarch64_flags = 0; - - if (is_v_register_candidate (cif->rtype)) - { - cif->aarch64_flags |= AARCH64_FFI_WITH_V; - } - else - { - int i; - for (i = 0; i < cif->nargs; i++) - if (is_v_register_candidate (cif->arg_types[i])) - { - cif->aarch64_flags |= AARCH64_FFI_WITH_V; - break; - } - } - - return FFI_OK; -} - -/* Call a function with the provided arguments and capture the return - value. 
*/ -void -ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue) -{ - extended_cif ecif; - - ecif.cif = cif; - ecif.avalue = avalue; - ecif.rvalue = rvalue; - - switch (cif->abi) - { - case FFI_SYSV: - { - struct call_context context; - unsigned stack_bytes; - - /* Figure out the total amount of stack space we need, the - above call frame space needs to be 16 bytes aligned to - ensure correct alignment of the first object inserted in - that space hence the ALIGN applied to cif->bytes.*/ - stack_bytes = ALIGN(cif->bytes, 16); - - memset (&context, 0, sizeof (context)); - if (is_register_candidate (cif->rtype)) - { - ffi_call_SYSV (aarch64_prep_args, &context, &ecif, stack_bytes, fn); - switch (cif->rtype->type) - { - case FFI_TYPE_VOID: - case FFI_TYPE_FLOAT: - case FFI_TYPE_DOUBLE: - case FFI_TYPE_LONGDOUBLE: - case FFI_TYPE_UINT8: - case FFI_TYPE_SINT8: - case FFI_TYPE_UINT16: - case FFI_TYPE_SINT16: - case FFI_TYPE_UINT32: - case FFI_TYPE_SINT32: - case FFI_TYPE_POINTER: - case FFI_TYPE_UINT64: - case FFI_TYPE_INT: - case FFI_TYPE_SINT64: - { - void *addr = get_basic_type_addr (cif->rtype->type, - &context, 0); - copy_basic_type (rvalue, addr, cif->rtype->type); - break; - } - - case FFI_TYPE_STRUCT: - if (is_hfa (cif->rtype)) - { - int j; - unsigned short type = get_homogeneous_type (cif->rtype); - unsigned elems = element_count (cif->rtype); - for (j = 0; j < elems; j++) - { - void *reg = get_basic_type_addr (type, &context, j); - copy_basic_type (rvalue, reg, type); - rvalue += get_basic_type_size (type); - } - } - else if ((cif->rtype->size + 7) / 8 < N_X_ARG_REG) - { - unsigned size = ALIGN (cif->rtype->size, sizeof (UINT64)); - memcpy (rvalue, get_x_addr (&context, 0), size); - } - else - { - FFI_ASSERT (0); - } - break; - - default: - FFI_ASSERT (0); - break; - } - } - else - { - memcpy (get_x_addr (&context, 8), &rvalue, sizeof (UINT64)); - ffi_call_SYSV (aarch64_prep_args, &context, &ecif, - stack_bytes, fn); - } - break; - } - - default: - FFI_ASSERT (0); - break; - } -} - -static unsigned char trampoline [] = -{ 0x70, 0x00, 0x00, 0x58, /* ldr x16, 1f */ - 0x91, 0x00, 0x00, 0x10, /* adr x17, 2f */ - 0x00, 0x02, 0x1f, 0xd6 /* br x16 */ -}; - /* Build a trampoline. */ -#define FFI_INIT_TRAMPOLINE(TRAMP,FUN,CTX,FLAGS) \ - ({unsigned char *__tramp = (unsigned char*)(TRAMP); \ - UINT64 __fun = (UINT64)(FUN); \ - UINT64 __ctx = (UINT64)(CTX); \ - UINT64 __flags = (UINT64)(FLAGS); \ - memcpy (__tramp, trampoline, sizeof (trampoline)); \ - memcpy (__tramp + 12, &__fun, sizeof (__fun)); \ - memcpy (__tramp + 20, &__ctx, sizeof (__ctx)); \ - memcpy (__tramp + 28, &__flags, sizeof (__flags)); \ - __clear_cache(__tramp, __tramp + FFI_TRAMPOLINE_SIZE); \ - }) - ffi_status ffi_prep_closure_loc (ffi_closure* closure, ffi_cif* cif, @@ -834,15 +547,29 @@ ffi_prep_closure_loc (ffi_closure* closure, void *user_data, void *codeloc) { + static const unsigned char trampoline[16] = { + 0x90, 0x00, 0x00, 0x58, /* ldr x16, 16 */ + 0xf1, 0xff, 0xff, 0x10, /* adr x17, 0 */ + 0x00, 0x02, 0x1f, 0xd6, /* br x16 */ + }; + char *tramp = &closure->tramp[0]; + void (*entry)(void); + if (cif->abi != FFI_SYSV) return FFI_BAD_ABI; - FFI_INIT_TRAMPOLINE (&closure->tramp[0], &ffi_closure_SYSV, codeloc, - cif->aarch64_flags); + entry = (cif->flags & AARCH64_FLAG_ARG_V + ? 
ffi_closure_SYSV_V : ffi_closure_SYSV); + + memcpy (tramp, trampoline, sizeof(trampoline)); + + *(UINT64 *)(tramp + 16) = (uintptr_t)entry; - closure->cif = cif; + closure->cif = cif; + closure->fun = fun; closure->user_data = user_data; - closure->fun = fun; + + __clear_cache (tramp, tramp + sizeof(trampoline)); return FFI_OK; } @@ -863,26 +590,33 @@ ffi_prep_closure_loc (ffi_closure* closure, desriptors, invokes the wrapped function, then marshalls the return value back into the call context. */ -void -ffi_closure_SYSV_inner (ffi_closure *closure, struct call_context *context, - void *stack) +UINT64 FFI_HIDDEN +ffi_closure_SYSV_inner (ffi_cif *cif, + void (*fun)(ffi_cif*,void*,void**,void*), + void *user_data, + struct call_context *context, + UINT64 *stack, void *rvalue) { - ffi_cif *cif = closure->cif; void **avalue = (void**) alloca (cif->nargs * sizeof (void*)); - void *rvalue = NULL; - int i; - struct arg_state state; - - arg_init (&state, ALIGN(cif->bytes, 16)); - - for (i = 0; i < cif->nargs; i++) + ffi_type **arg_types; + int i, nargs, h, ngrn, nsrn, nsaa; + size_t size; + + ngrn = nsrn = nsaa = 0; + arg_types = cif->arg_types; + nargs = cif->nargs; + + for (i = 0; i < nargs; i++) { - ffi_type *ty = cif->arg_types[i]; + ffi_type *ty = arg_types[i]; + int t = ty->type; + void *slot; - switch (ty->type) + switch (t) { case FFI_TYPE_VOID: - FFI_ASSERT (0); + /* ??? abort */ + slot = NULL; break; case FFI_TYPE_UINT8: @@ -895,182 +629,128 @@ ffi_closure_SYSV_inner (ffi_closure *closure, struct call_context *context, case FFI_TYPE_POINTER: case FFI_TYPE_UINT64: case FFI_TYPE_SINT64: - case FFI_TYPE_FLOAT: - case FFI_TYPE_DOUBLE: - case FFI_TYPE_LONGDOUBLE: - avalue[i] = allocate_to_register_or_stack (context, stack, - &state, ty->type); + if (ngrn < N_X_ARG_REG) + slot = &context->x[ngrn++]; + else + slot = &stack[nsaa++]; + *(ffi_arg *)slot = extend_basic_type (*(UINT64 *)slot, t); break; - case FFI_TYPE_STRUCT: - if (is_hfa (ty)) - { - unsigned n = element_count (ty); - if (available_v (&state) < n) - { - state.nsrn = N_V_ARG_REG; - avalue[i] = allocate_to_stack (&state, stack, ty->alignment, - ty->size); - } - else - { - switch (get_homogeneous_type (ty)) - { - case FFI_TYPE_FLOAT: - { - /* Eeek! We need a pointer to the structure, - however the homogeneous float elements are - being passed in individual S registers, - therefore the structure is not represented as - a contiguous sequence of bytes in our saved - register context. We need to fake up a copy - of the structure layed out in memory - correctly. The fake can be tossed once the - closure function has returned hence alloca() - is sufficient. */ - int j; - UINT32 *p = avalue[i] = alloca (ty->size); - for (j = 0; j < element_count (ty); j++) - memcpy (&p[j], - allocate_to_s (context, &state), - sizeof (*p)); - break; - } - - case FFI_TYPE_DOUBLE: - { - /* Eeek! We need a pointer to the structure, - however the homogeneous float elements are - being passed in individual S registers, - therefore the structure is not represented as - a contiguous sequence of bytes in our saved - register context. We need to fake up a copy - of the structure layed out in memory - correctly. The fake can be tossed once the - closure function has returned hence alloca() - is sufficient. 
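(Illustration, not part of the patch: a struct of two doubles arrives in d0 and d1, which lie 16 bytes apart in the saved V-register context, so the two 8-byte values must be re-packed into 16 contiguous bytes before the closure body can read the struct through a normal pointer.)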
*/ - int j; - UINT64 *p = avalue[i] = alloca (ty->size); - for (j = 0; j < element_count (ty); j++) - memcpy (&p[j], - allocate_to_d (context, &state), - sizeof (*p)); - break; - } + case FFI_TYPE_FLOAT: + if (nsrn < N_V_ARG_REG) + slot = get_s_addr (context, nsrn++); + else + slot = &stack[nsaa++]; + break; - case FFI_TYPE_LONGDOUBLE: - memcpy (&avalue[i], - allocate_to_v (context, &state), - sizeof (*avalue)); - break; + case FFI_TYPE_DOUBLE: + if (nsrn < N_V_ARG_REG) + slot = get_d_addr (context, nsrn++); + else + slot = &stack[nsaa++]; + break; - default: - FFI_ASSERT (0); - break; - } - } - } - else if (ty->size > 16) - { - /* Replace Composite type of size greater than 16 with a - pointer. */ - memcpy (&avalue[i], - allocate_to_register_or_stack (context, stack, - &state, FFI_TYPE_POINTER), - sizeof (avalue[i])); - } - else if (available_x (&state) >= (ty->size + 7) / 8) - { - avalue[i] = get_x_addr (context, state.ngrn); - state.ngrn += (ty->size + 7) / 8; - } + case FFI_TYPE_LONGDOUBLE: + if (nsrn < N_V_ARG_REG) + slot = &context->v[nsrn++]; else { - state.ngrn = N_X_ARG_REG; - - avalue[i] = allocate_to_stack (&state, stack, ty->alignment, - ty->size); + nsaa = ALIGN (nsaa, 2); + slot = &stack[nsaa]; + nsaa += 2; } break; - default: - FFI_ASSERT (0); + case FFI_TYPE_STRUCT: + { + size_t slot_count; + + size = ty->size; + slot_count = (size + 7) / 8; + h = is_hfa (ty); + if (h) + { + int reg_count = h >> 8; + int tt = h & 0xff; + int j; + + if (nsrn + reg_count <= N_V_ARG_REG) + { + switch (tt) + { + case FFI_TYPE_FLOAT: + { + UINT32 *dst = alloca (size); + for (j = 0; j < reg_count; ++j) + dst[j] = *get_s_addr(context, nsrn + j); + slot = dst; + } + break; + case FFI_TYPE_DOUBLE: + { + UINT64 *dst = alloca (size); + for (j = 0; j < reg_count; ++j) + dst[j] = *get_d_addr(context, nsrn + j); + slot = dst; + } + break; + case FFI_TYPE_LONGDOUBLE: + slot = &context->v[nsrn]; + break; + default: + abort (); + } + nsrn += reg_count; + break; + } + /* All out of fp registers. It's on the stack. */ + nsrn = N_V_ARG_REG; + } + else if (size > 16) + { + /* The argument is passed by indirection. */ + if (ngrn < N_X_ARG_REG) + slot = (void *)(uintptr_t)context->x[ngrn++]; + else + slot = (void *)(uintptr_t)stack[nsaa++]; + break; + } + else + { + if (ty->alignment == 16) + ngrn = ALIGN (ngrn, 2); + + if (ngrn + slot_count <= N_X_ARG_REG) + { + slot = &context->x[ngrn]; + ngrn += slot_count; + break; + } + /* All out of general registers. Copy to the stack. */ + ngrn = N_X_ARG_REG; + } + if (ty->alignment > 8) + { + int a = ty->alignment / 8; + nsaa = ALIGN (nsaa, a); + } + slot = &stack[nsaa]; + nsaa += slot_count; + } break; + + default: + abort (); } + + avalue[i] = slot; } - /* Figure out where the return value will be passed, either in - registers or in a memory block allocated by the caller and passed - in x8. */ + h = cif->flags & AARCH64_FLAG_RET_MASK; + if (h != AARCH64_RET_LG_STRUCT) + rvalue = context + 1; - if (is_register_candidate (cif->rtype)) - { - /* Register candidates are *always* returned in registers. */ - - /* Allocate a scratchpad for the return value, we will let the - callee scrible the result into the scratch pad then move the - contents into the appropriate return value location for the - call convention. */ - rvalue = alloca (cif->rtype->size); - (closure->fun) (cif, rvalue, avalue, closure->user_data); - - /* Copy the return value into the call context so that it is returned - as expected to our caller. 
*/ - switch (cif->rtype->type) - { - case FFI_TYPE_VOID: - break; + fun (cif, rvalue, avalue, user_data); - case FFI_TYPE_UINT8: - case FFI_TYPE_UINT16: - case FFI_TYPE_UINT32: - case FFI_TYPE_POINTER: - case FFI_TYPE_UINT64: - case FFI_TYPE_SINT8: - case FFI_TYPE_SINT16: - case FFI_TYPE_INT: - case FFI_TYPE_SINT32: - case FFI_TYPE_SINT64: - case FFI_TYPE_FLOAT: - case FFI_TYPE_DOUBLE: - case FFI_TYPE_LONGDOUBLE: - { - void *addr = get_basic_type_addr (cif->rtype->type, context, 0); - copy_basic_type (addr, rvalue, cif->rtype->type); - break; - } - case FFI_TYPE_STRUCT: - if (is_hfa (cif->rtype)) - { - int i; - unsigned short type = get_homogeneous_type (cif->rtype); - unsigned elems = element_count (cif->rtype); - for (i = 0; i < elems; i++) - { - void *reg = get_basic_type_addr (type, context, i); - copy_basic_type (reg, rvalue, type); - rvalue += get_basic_type_size (type); - } - } - else if ((cif->rtype->size + 7) / 8 < N_X_ARG_REG) - { - unsigned size = ALIGN (cif->rtype->size, sizeof (UINT64)) ; - memcpy (get_x_addr (context, 0), rvalue, size); - } - else - { - FFI_ASSERT (0); - } - break; - default: - FFI_ASSERT (0); - break; - } - } - else - { - memcpy (&rvalue, get_x_addr (context, 8), sizeof (UINT64)); - (closure->fun) (cif, rvalue, avalue, closure->user_data); - } + return h; } - diff --git a/libffi/src/aarch64/ffitarget.h b/libffi/src/aarch64/ffitarget.h index 6f1a348..ecfa159 100644 --- a/libffi/src/aarch64/ffitarget.h +++ b/libffi/src/aarch64/ffitarget.h @@ -27,8 +27,8 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #endif #ifndef LIBFFI_ASM -typedef unsigned long ffi_arg; -typedef signed long ffi_sarg; +typedef unsigned long long ffi_arg; +typedef signed long long ffi_sarg; typedef enum ffi_abi { @@ -42,18 +42,7 @@ typedef enum ffi_abi /* ---- Definitions for closures ----------------------------------------- */ #define FFI_CLOSURES 1 -#define FFI_TRAMPOLINE_SIZE 36 +#define FFI_TRAMPOLINE_SIZE 24 #define FFI_NATIVE_RAW_API 0 -/* ---- Internal ---- */ - - -#define FFI_EXTRA_CIF_FIELDS unsigned aarch64_flags - -#define AARCH64_FFI_WITH_V_BIT 0 - -#define AARCH64_N_XREG 32 -#define AARCH64_N_VREG 32 -#define AARCH64_CALL_CONTEXT_SIZE (AARCH64_N_XREG * 8 + AARCH64_N_VREG * 16) - #endif diff --git a/libffi/src/aarch64/internal.h b/libffi/src/aarch64/internal.h new file mode 100644 index 0000000..63cf683 --- /dev/null +++ b/libffi/src/aarch64/internal.h @@ -0,0 +1,43 @@ +/* +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +``Software''), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ + +/* ---- Internal ---- */ + +#define AARCH64_RET_UINT32 0 +#define AARCH64_RET_SINT32 1 +#define AARCH64_RET_INT64 2 +#define AARCH64_RET_SM_STRUCT 3 +#define AARCH64_RET_FLOAT 4 +#define AARCH64_RET_DOUBLE 5 +#define AARCH64_RET_LDOUBLE 6 +#define AARCH64_RET_HFA_FLOAT 7 +#define AARCH64_RET_HFA_DOUBLE 8 +#define AARCH64_RET_HFA_LDOUBLE 13 +#define AARCH64_RET_LG_STRUCT 14 +#define AARCH64_RET_VOID 15 +#define AARCH64_FLAG_RET_MASK 15 + +#define AARCH64_FLAG_ARG_V_BIT 4 +#define AARCH64_FLAG_ARG_V (1 << AARCH64_FLAG_ARG_V_BIT) + +#define AARCH64_N_VREG 8 +#define AARCH64_N_XREG 8 +#define AARCH64_CALL_CONTEXT_SIZE (AARCH64_N_VREG * 16 + AARCH64_N_XREG * 8) + diff --git a/libffi/src/aarch64/sysv.S b/libffi/src/aarch64/sysv.S index ffb16f8..126c527 100644 --- a/libffi/src/aarch64/sysv.S +++ b/libffi/src/aarch64/sysv.S @@ -22,286 +22,285 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #define LIBFFI_ASM #include <fficonfig.h> #include <ffi.h> +#include "internal.h" -#define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off -#define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off -#define cfi_restore(reg) .cfi_restore reg -#define cfi_def_cfa_register(reg) .cfi_def_cfa_register reg - - .text - .globl ffi_call_SYSV - .type ffi_call_SYSV, #function + .text /* ffi_call_SYSV() - Create a stack frame, setup an argument context, call the callee - and extract the result. - - The maximum required argument stack size is provided, - ffi_call_SYSV() allocates that stack space then calls the - prepare_fn to populate register context and stack. The - argument passing registers are loaded from the register - context and the callee called, on return the register passing - register are saved back to the context. Our caller will - extract the return value from the final state of the saved - register context. + Install an argument context and a stack frame. + Call the callee and extract the result. Prototype: - extern unsigned - ffi_call_SYSV (void (*)(struct call_context *context, unsigned char *, - extended_cif *), - struct call_context *context, - extended_cif *, - unsigned required_stack_size, - void (*fn)(void)); - - Therefore on entry we have: - - x0 prepare_fn - x1 &context - x2 &ecif - x3 bytes - x4 fn - - This function uses the following stack frame layout: - - == - saved x30(lr) - x29(fp)-> saved x29(fp) - saved x24 - saved x23 - saved x22 - sp' -> saved x21 - ... - sp -> (constructed callee stack arguments) - == + extern void + ffi_call_SYSV (void *frame, void *rvalue, struct call_context *context, + unsigned flags, void (*fn)(void)) - Voila! */ + This function uses an unusual stack layout. Our local frame has + been allocated by the caller in FRAME with the outgoing arguments + in CONTEXT, and the outgoing stack arguments above CONTEXT. */ -#define ffi_call_SYSV_FS (8 * 4) + .globl ffi_call_SYSV + .hidden ffi_call_SYSV + .type ffi_call_SYSV, %function + .balign 32 - .cfi_startproc ffi_call_SYSV: - stp x29, x30, [sp, #-16]! 
- cfi_adjust_cfa_offset (16) - cfi_rel_offset (x29, 0) - cfi_rel_offset (x30, 8) - - mov x29, sp - cfi_def_cfa_register (x29) - sub sp, sp, #ffi_call_SYSV_FS - - stp x21, x22, [sp, 0] - cfi_rel_offset (x21, 0 - ffi_call_SYSV_FS) - cfi_rel_offset (x22, 8 - ffi_call_SYSV_FS) - - stp x23, x24, [sp, 16] - cfi_rel_offset (x23, 16 - ffi_call_SYSV_FS) - cfi_rel_offset (x24, 24 - ffi_call_SYSV_FS) - - mov x21, x1 - mov x22, x2 - mov x24, x4 - - /* Allocate the stack space for the actual arguments, many - arguments will be passed in registers, but we assume - worst case and allocate sufficient stack for ALL of - the arguments. */ - sub sp, sp, x3 - - /* unsigned (*prepare_fn) (struct call_context *context, - unsigned char *stack, extended_cif *ecif); - */ - mov x23, x0 - mov x0, x1 - mov x1, sp - /* x2 already in place */ - blr x23 - - /* Preserve the flags returned. */ - mov x23, x0 - - /* Figure out if we should touch the vector registers. */ - tbz x23, #AARCH64_FFI_WITH_V_BIT, 1f - - /* Load the vector argument passing registers. */ - ldp q0, q1, [x21, #8*32 + 0] - ldp q2, q3, [x21, #8*32 + 32] - ldp q4, q5, [x21, #8*32 + 64] - ldp q6, q7, [x21, #8*32 + 96] -1: - /* Load the core argument passing registers. */ - ldp x0, x1, [x21, #0] - ldp x2, x3, [x21, #16] - ldp x4, x5, [x21, #32] - ldp x6, x7, [x21, #48] - - /* Don't forget x8 which may be holding the address of a return buffer. - */ - ldr x8, [x21, #8*8] - - blr x24 - - /* Save the core argument passing registers. */ - stp x0, x1, [x21, #0] - stp x2, x3, [x21, #16] - stp x4, x5, [x21, #32] - stp x6, x7, [x21, #48] - - /* Note nothing useful ever comes back in x8! */ - - /* Figure out if we should touch the vector registers. */ - tbz x23, #AARCH64_FFI_WITH_V_BIT, 1f - - /* Save the vector argument passing registers. */ - stp q0, q1, [x21, #8*32 + 0] - stp q2, q3, [x21, #8*32 + 32] - stp q4, q5, [x21, #8*32 + 64] - stp q6, q7, [x21, #8*32 + 96] + .cfi_startproc + .cfi_def_cfa x0, 32 + stp x29, x30, [x0] /* Save fp, lr in our frame. */ + mov x29, x0 /* Set up our new frame. */ + .cfi_def_cfa_register x29 + .cfi_rel_offset x29, 0 + .cfi_rel_offset x30, 8 + + /* Move parameters out of the way. */ + stp x3, x1, [x0, #16] /* flags, rvalue */ + mov x8, x1 /* rvalue into place */ + mov x10, x2 /* context */ + mov x11, x4 /* fn */ + + /* Load the vector argument passing registers, if needed. */ + tbz w3, #AARCH64_FLAG_ARG_V_BIT, 1f + ldp q0, q1, [x10, #8*AARCH64_N_XREG + 0] + ldp q2, q3, [x10, #8*AARCH64_N_XREG + 32] + ldp q4, q5, [x10, #8*AARCH64_N_XREG + 64] + ldp q6, q7, [x10, #8*AARCH64_N_XREG + 96] 1: - /* All done, unwind our stack frame. */ - ldp x21, x22, [x29, # - ffi_call_SYSV_FS] - cfi_restore (x21) - cfi_restore (x22) - - ldp x23, x24, [x29, # - ffi_call_SYSV_FS + 16] - cfi_restore (x23) - cfi_restore (x24) - - mov sp, x29 - cfi_def_cfa_register (sp) - - ldp x29, x30, [sp], #16 - cfi_adjust_cfa_offset (-16) - cfi_restore (x29) - cfi_restore (x30) - - ret - - .cfi_endproc - .size ffi_call_SYSV, .-ffi_call_SYSV - -#define ffi_closure_SYSV_FS (8 * 2 + AARCH64_CALL_CONTEXT_SIZE) + /* Load the core argument passing registers. */ + ldp x0, x1, [x10, #16*0] + ldp x2, x3, [x10, #16*1] + ldp x4, x5, [x10, #16*2] + ldp x6, x7, [x10, #16*3] + + /* Setup SP for the stacked arguments. */ + add sp, x10, #AARCH64_CALL_CONTEXT_SIZE + + /* Call fn. */ + blr x11 + + /* Recover the flags value and result address. */ + ldp x3, x8, [x29, #16] + + /* Store the return type. + Each case uses 8 bytes, so compute it directly. 
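(Illustration, not part of the patch: the branch target is 3f + 8 * (flags & AARCH64_FLAG_RET_MASK), so e.g. AARCH64_RET_DOUBLE, value 5, lands on the str d0 entry at 3f + 40; this is why every case below must be exactly two instructions, 8 bytes, wide.)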
*/ + adr x2, 3f + and w3, w3, #AARCH64_FLAG_RET_MASK + add x2, x2, x3, lsl #3 + br x2 + + /* Store results into the rvalue. Note that for most integer + cases this is actually ffi_arg, aka a 64-bit result. + For the HFA cases, and the (small) struct case, we've arranged + for temporary storage, so store the largest possible. + For the large struct case, we've remapped to VOID, since + the callee has already done the store via x8. */ + .balign 8 +/* 0: AARCH64_RET_UINT32 */ +3: mov w0, w0 + b 4f +/* 1: AARCH64_RET_SINT32 */ + sxtw x0, w0 + nop +/* 2: AARCH64_RET_INT64 */ +4: str x0, [x8] + b 9f +/* 3: AARCH64_RET_SM_STRUCT */ + stp x0, x1, [x8] + b 9f +/* 4: AARCH64_RET_FLOAT */ + str s0, [x8] + b 9f +/* 5: AARCH64_RET_DOUBLE */ + str d0, [x8] + b 9f +/* 6: AARCH64_RET_LONGDOUBLE */ + str q0, [x8] + b 9f +/* 7: AARCH64_RET_HFA_FLOAT */ + st4 { v0.s, v1.s, v2.s, v3.s }[0], [x8] + b 9f +/* 8: AARCH64_RET_HFA_DOUBLE */ + st4 { v0.d, v1.d, v2.d, v3.d }[0], [x8] + b 9f +/* 9: invalid */ + brk #1000 + nop +/* A: invalid */ + brk #1000 + nop +/* B: invalid */ + brk #1000 + nop +/* C: invalid */ + brk #1000 + nop +/* D: AARCH64_RET_HFA_LDOUBLE */ + stp q0, q1, [x8] + stp q2, q3, [x8, #32] +/* E: AARCH64_RET_LG_STRUCT */ + nop + nop +/* F: AARCH64_RET_VOID */ +9: ldp x29, x30, [x29] + .cfi_def_cfa sp, 0 + .cfi_restore x29 + .cfi_restore x30 + ret + .cfi_endproc + .size ffi_call_SYSV, .-ffi_call_SYSV /* ffi_closure_SYSV Closure invocation glue. This is the low level code invoked directly by the closure trampoline to setup and call a closure. - On entry x17 points to a struct trampoline_data, x16 has been clobbered - all other registers are preserved. + On entry x17 points to a ffi_closure, x16 has been clobbered, + and all other registers are preserved. We allocate a call context and save the argument passing registers, then invoked the generic C ffi_closure_SYSV_inner() function to do all the real work, on return we load the result passing registers back from the call context. - On entry - - extern void - ffi_closure_SYSV (struct trampoline_data *); - - struct trampoline_data - { - UINT64 *ffi_closure; - UINT64 flags; - }; + We use two separate entry points, depending on whether there are + any vector argument registers. This function uses the following stack frame layout: == - saved x30(lr) - x29(fp)-> saved x29(fp) - saved x22 - saved x21 - ... - sp -> call_context + temporary return slot + call_context + saved x30(lr) + sp, x29-> saved x29(fp) == Voila! */ - .text - .globl ffi_closure_SYSV - .cfi_startproc +#define ffi_closure_FS (16 + AARCH64_CALL_CONTEXT_SIZE + 64) + + .globl ffi_closure_SYSV_V + .hidden ffi_closure_SYSV_V + .type ffi_closure_SYSV_V, %function + .balign 32 + +ffi_closure_SYSV_V: + .cfi_startproc + stp x29, x30, [sp, #-ffi_closure_FS]! + .cfi_adjust_cfa_offset ffi_closure_FS + .cfi_rel_offset x29, 0 + .cfi_rel_offset x30, 8 + mov x29, sp + + /* Save the argument passing vector registers. */ + stp q0, q1, [sp, #16 + 8*AARCH64_N_XREG + 0] + stp q2, q3, [sp, #16 + 8*AARCH64_N_XREG + 32] + stp q4, q5, [sp, #16 + 8*AARCH64_N_XREG + 64] + stp q6, q7, [sp, #16 + 8*AARCH64_N_XREG + 96] + b 0f + + .cfi_endproc + .size ffi_closure_SYSV_V, . - ffi_closure_SYSV_V + + .globl ffi_closure_SYSV + .hidden ffi_closure_SYSV + .type ffi_closure_SYSV, %function + .balign 32 + ffi_closure_SYSV: - stp x29, x30, [sp, #-16]! 
- cfi_adjust_cfa_offset (16) - cfi_rel_offset (x29, 0) - cfi_rel_offset (x30, 8) - - mov x29, sp - cfi_def_cfa_register (x29) - - sub sp, sp, #ffi_closure_SYSV_FS - - stp x21, x22, [x29, #-16] - cfi_rel_offset (x21, -16) - cfi_rel_offset (x22, -8) - - /* Load x21 with &call_context. */ - mov x21, sp - /* Preserve our struct trampoline_data * */ - mov x22, x17 - - /* Save the rest of the argument passing registers. */ - stp x0, x1, [x21, #0] - stp x2, x3, [x21, #16] - stp x4, x5, [x21, #32] - stp x6, x7, [x21, #48] - /* Don't forget we may have been given a result scratch pad address. - */ - str x8, [x21, #64] - - /* Figure out if we should touch the vector registers. */ - ldr x0, [x22, #8] - tbz x0, #AARCH64_FFI_WITH_V_BIT, 1f - - /* Save the argument passing vector registers. */ - stp q0, q1, [x21, #8*32 + 0] - stp q2, q3, [x21, #8*32 + 32] - stp q4, q5, [x21, #8*32 + 64] - stp q6, q7, [x21, #8*32 + 96] -1: - /* Load &ffi_closure.. */ - ldr x0, [x22, #0] - mov x1, x21 - /* Compute the location of the stack at the point that the - trampoline was called. */ - add x2, x29, #16 - - bl ffi_closure_SYSV_inner - - /* Figure out if we should touch the vector registers. */ - ldr x0, [x22, #8] - tbz x0, #AARCH64_FFI_WITH_V_BIT, 1f - - /* Load the result passing vector registers. */ - ldp q0, q1, [x21, #8*32 + 0] - ldp q2, q3, [x21, #8*32 + 32] - ldp q4, q5, [x21, #8*32 + 64] - ldp q6, q7, [x21, #8*32 + 96] -1: - /* Load the result passing core registers. */ - ldp x0, x1, [x21, #0] - ldp x2, x3, [x21, #16] - ldp x4, x5, [x21, #32] - ldp x6, x7, [x21, #48] - /* Note nothing usefull is returned in x8. */ - - /* We are done, unwind our frame. */ - ldp x21, x22, [x29, #-16] - cfi_restore (x21) - cfi_restore (x22) - - mov sp, x29 - cfi_def_cfa_register (sp) - - ldp x29, x30, [sp], #16 - cfi_adjust_cfa_offset (-16) - cfi_restore (x29) - cfi_restore (x30) - - ret - .cfi_endproc - .size ffi_closure_SYSV, .-ffi_closure_SYSV + .cfi_startproc + stp x29, x30, [sp, #-ffi_closure_FS]! + .cfi_adjust_cfa_offset ffi_closure_FS + .cfi_rel_offset x29, 0 + .cfi_rel_offset x30, 8 + mov x29, sp + + /* Save the argument passing core registers. */ +0: stp x0, x1, [sp, #16 + 0] + stp x2, x3, [sp, #16 + 16] + stp x4, x5, [sp, #16 + 32] + stp x6, x7, [sp, #16 + 48] + + ldp x0, x1, [x17, #FFI_TRAMPOLINE_SIZE] /* Load cif, fun */ + ldr x2, [x17, #FFI_TRAMPOLINE_SIZE + 16] /* Load user_data */ + +.Ldo_closure: + add x3, sp, #16 /* Load &call_context. */ + add x4, sp, #ffi_closure_FS /* Load incoming sp value. */ + mov x5, x8 /* Load struct return. */ + bl ffi_closure_SYSV_inner + + /* Load the return type. Each case uses 8 bytes, so compute it + directly. Load x8 with address of the temporary return slot. */ + adr x1, 3f + and w0, w0, #AARCH64_FLAG_RET_MASK + add x1, x1, x0, lsl #3 + add x8, sp, #16 + AARCH64_CALL_CONTEXT_SIZE + br x1 + + /* Load results from temporary storage. Note that for most integer + cases this is actually ffi_arg, aka a 64-bit result. For the HFA + cases and the (small) struct case, we can load the maximum width. + For the large struct case, we've remapped to VOID. 
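(Illustration, not part of the patch: x8 is aimed at the 64-byte temporary slot above the call context, the trailing + 64 in ffi_closure_FS; ffi_closure_SYSV_inner redirected rvalue there for every case except AARCH64_RET_LG_STRUCT, so e.g. an HFA of four long doubles is reloaded into q0-q3 from the 64 bytes written to that slot.)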
*/ +#if defined __AARCH64EB__ +# define INT32OFS 4 +#else +# define INT32OFS 0 +#endif + + .balign 8 +/* 0: AARCH64_RET_UINT32 */ +3: ldr w0, [x8, #INT32OFS] + b 9f +/* 1: AARCH64_RET_SINT32 */ + ldrsw x0, [x8, #INT32OFS] + b 9f +/* 2: AARCH64_RET_INT64 */ + ldr x0, [x8] + b 9f +/* 3: AARCH64_RET_SM_STRUCT */ + ldp x0, x1, [x8] + b 9f +/* 4: AARCH64_RET_FLOAT */ + ldr s0, [x8] + b 9f +/* 5: AARCH64_RET_DOUBLE */ + ldr d0, [x8] + b 9f +/* 6: AARCH64_RET_LONGDOUBLE */ + ldr q0, [x8] + b 9f +/* 7: AARCH64_RET_HFA_FLOAT */ + ld4 { v0.s, v1.s, v2.s, v3.s }[0], [x8] + b 9f +/* 8: AARCH64_RET_HFA_DOUBLE */ + ld1 { v0.1d, v1.1d, v2.1d, v3.1d }, [x8] + b 9f +/* 9: invalid */ + brk #1000 + nop +/* A: invalid */ + brk #1000 + nop +/* B: invalid */ + brk #1000 + nop +/* C: invalid */ + brk #1000 + nop +/* D: AARCH64_RET_HFA_LDOUBLE */ + ldp q0, q1, [x8] + ldp q2, q3, [x8, #32] +/* E: AARCH64_RET_LG_STRUCT */ + nop + nop +/* F: AARCH64_RET_VOID */ +9: /* We are done, unwind our frame. */ + ldp x29, x30, [sp], #ffi_closure_FS + .cfi_adjust_cfa_offset -ffi_closure_FS + .cfi_restore x29 + .cfi_restore x30 + ret + .cfi_endproc + .size ffi_closure_SYSV, .-ffi_closure_SYSV -- 1.9.3
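As a closing illustration (not part of the patch, and using the hypothetical names point and midpoint; only the ffi_* names are libffi's public API), here is a minimal caller-side sketch of the kind of HFA call the rewritten marshalling handles:

#include <stdio.h>
#include <ffi.h>

typedef struct { float x, y; } point;  /* an HFA of two floats */

static point midpoint (point a, point b)
{
  point m = { (a.x + b.x) / 2, (a.y + b.y) / 2 };
  return m;
}

int main (void)
{
  ffi_type *elems[] = { &ffi_type_float, &ffi_type_float, NULL };
  ffi_type point_type = { 0, 0, FFI_TYPE_STRUCT, elems };
  ffi_type *args[] = { &point_type, &point_type };
  point a = { 0, 2 }, b = { 4, 6 }, r;
  void *values[] = { &a, &b };
  ffi_cif cif;

  /* size and alignment of point_type are computed by ffi_prep_cif.  */
  if (ffi_prep_cif (&cif, FFI_DEFAULT_ABI, 2, &point_type, args) != FFI_OK)
    return 1;

  /* Both arguments travel in s0-s3; the result comes back in s0/s1.  */
  ffi_call (&cif, FFI_FN (midpoint), &r, values);
  printf ("(%g, %g)\n", r.x, r.y);  /* prints (2, 4) */
  return 0;
}

The two-float result also shows why ffi_call above reserves a four-element bounce buffer before copying the final 8 bytes into r.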