From: Richard Henderson
To: libffi-discuss@sourceware.org
Cc: Richard Henderson
Subject: [PATCH 10/16] aarch64: Move return value handling into ffi_call_SYSV
Date: Tue, 28 Oct 2014 18:54:00 -0000
Message-Id: <1414522393-19169-11-git-send-email-rth@twiddle.net>
In-Reply-To: <1414522393-19169-1-git-send-email-rth@twiddle.net>
References: <1414522393-19169-1-git-send-email-rth@twiddle.net>

From: Richard Henderson

This lets us pass return data directly to the caller of ffi_call in
most cases, rather than storing it into temporary storage first.
---
 src/aarch64/ffi.c      | 202 ++++++++++++++++++++++++++++---------------------
 src/aarch64/internal.h |  43 ++++++++++-
 src/aarch64/sysv.S     | 127 ++++++++++++++++++++++++-------
 3 files changed, 258 insertions(+), 114 deletions(-)

diff --git a/src/aarch64/ffi.c b/src/aarch64/ffi.c
index a067303..ffa1363 100644
--- a/src/aarch64/ffi.c
+++ b/src/aarch64/ffi.c
@@ -523,30 +523,90 @@ allocate_int_to_reg_or_stack (struct call_context *context,
 ffi_status
 ffi_prep_cif_machdep (ffi_cif *cif)
 {
-  /* Round the stack up to a multiple of the stack alignment requirement. */
-  cif->bytes = ALIGN(cif->bytes, 16);
-
-  /* Initialize our flags.  We are interested if this CIF will touch a
-     vector register, if so we will enable context save and load to
-     those registers, otherwise not.  This is intended to be friendly
-     to lazy float context switching in the kernel.  */
-  cif->aarch64_flags = 0;
+  ffi_type *rtype = cif->rtype;
+  size_t bytes = cif->bytes;
+  int flags, aarch64_flags, i, n;
 
-  if (is_v_register_candidate (cif->rtype))
+  switch (rtype->type)
     {
-      cif->aarch64_flags |= AARCH64_FLAG_ARG_V;
-    }
-  else
-    {
-      int i;
-      for (i = 0; i < cif->nargs; i++)
-        if (is_v_register_candidate (cif->arg_types[i]))
-          {
-            cif->aarch64_flags |= AARCH64_FLAG_ARG_V;
-            break;
-          }
+    case FFI_TYPE_VOID:
+      flags = AARCH64_RET_VOID;
+      break;
+    case FFI_TYPE_UINT8:
+      flags = AARCH64_RET_UINT8;
+      break;
+    case FFI_TYPE_UINT16:
+      flags = AARCH64_RET_UINT16;
+      break;
+    case FFI_TYPE_UINT32:
+      flags = AARCH64_RET_UINT32;
+      break;
+    case FFI_TYPE_SINT8:
+      flags = AARCH64_RET_SINT8;
+      break;
+    case FFI_TYPE_SINT16:
+      flags = AARCH64_RET_SINT16;
+      break;
+    case FFI_TYPE_INT:
+    case FFI_TYPE_SINT32:
+      flags = AARCH64_RET_SINT32;
+      break;
+    case FFI_TYPE_SINT64:
+    case FFI_TYPE_UINT64:
+      flags = AARCH64_RET_INT64;
+      break;
+    case FFI_TYPE_POINTER:
+      flags = (sizeof(void *) == 4 ? AARCH64_RET_UINT32 : AARCH64_RET_INT64);
+      break;
+
+    case FFI_TYPE_FLOAT:
+      flags = AARCH64_RET_S1;
+      break;
+    case FFI_TYPE_DOUBLE:
+      flags = AARCH64_RET_D1;
+      break;
+    case FFI_TYPE_LONGDOUBLE:
+      flags = AARCH64_RET_Q1;
+      break;
+
+    case FFI_TYPE_STRUCT:
+      {
+        int h = is_hfa (rtype);
+        size_t s = rtype->size;
+
+        if (h)
+          flags = (h & 0xff) * 4 + 4 - (h >> 8);
+        else if (s > 16)
+          {
+            flags = AARCH64_RET_VOID | AARCH64_RET_IN_MEM;
+            bytes += 8;
+          }
+        else if (s == 16)
+          flags = AARCH64_RET_INT128;
+        else if (s == 8)
+          flags = AARCH64_RET_INT64;
+        else
+          flags = AARCH64_RET_INT128 | AARCH64_RET_NEED_COPY;
+      }
+      break;
+
+    default:
+      abort();
     }
 
+  aarch64_flags = 0;
+  for (i = 0, n = cif->nargs; i < n; i++)
+    if (is_v_register_candidate (cif->arg_types[i]))
+      {
+        aarch64_flags = AARCH64_FLAG_ARG_V;
+        flags |= AARCH64_FLAG_ARG_V;
+        break;
+      }
+
+  /* Round the stack up to a multiple of the stack alignment requirement. */
+  cif->bytes = ALIGN(bytes, 16);
+  cif->flags = flags;
+  cif->aarch64_flags = aarch64_flags;
+
 #if defined (__APPLE__)
   cif->aarch64_nfixedargs = 0;
 #endif
@@ -555,51 +615,65 @@ ffi_prep_cif_machdep (ffi_cif *cif)
 }
 
 #if defined (__APPLE__)
-/* Perform Apple-specific cif processing for variadic calls */
 ffi_status ffi_prep_cif_machdep_var(ffi_cif *cif,
                                     unsigned int nfixedargs,
                                     unsigned int ntotalargs)
 {
-  ffi_status status;
-
-  status = ffi_prep_cif_machdep (cif);
-
+  ffi_status status = ffi_prep_cif_machdep (cif);
   cif->aarch64_nfixedargs = nfixedargs;
-
   return status;
 }
+#endif /* __APPLE__ */
 
-#endif
-
-extern void ffi_call_SYSV (void *stack, void *frame,
-                           void (*fn)(void), int flags) FFI_HIDDEN;
+extern void ffi_call_SYSV (struct call_context *context, void *frame,
+                           void (*fn)(void), void *rvalue, int flags)
+        FFI_HIDDEN;
 
 /* Call a function with the provided arguments and capture the return
    value.  */
 void
-ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
+ffi_call (ffi_cif *cif, void (*fn)(void), void *orig_rvalue, void **avalue)
 {
   struct call_context *context;
-  void *stack, *frame;
+  void *stack, *frame, *rvalue;
   struct arg_state state;
-  size_t stack_bytes;
-  int i, nargs = cif->nargs;
-  int h, t;
+  size_t stack_bytes, rtype_size, rsize;
+  int i, nargs, flags;
   ffi_type *rtype;
 
-  /* Allocate consectutive stack for everything we'll need.  */
+  flags = cif->flags;
+  rtype = cif->rtype;
+  rtype_size = rtype->size;
   stack_bytes = cif->bytes;
-  stack = alloca (stack_bytes + 32 + sizeof(struct call_context));
+
+  /* If the target function returns a structure via hidden pointer,
+     then we cannot allow a null rvalue.  Otherwise, mash a null
+     rvalue to void return type.  */
+  rsize = 0;
+  if (flags & AARCH64_RET_IN_MEM)
+    {
+      if (orig_rvalue == NULL)
+        rsize = rtype_size;
+    }
+  else if (orig_rvalue == NULL)
+    flags &= AARCH64_FLAG_ARG_V;
+  else if (flags & AARCH64_RET_NEED_COPY)
+    rsize = 16;
+
+  /* Allocate consectutive stack for everything we'll need.  */
+  context = alloca (sizeof(struct call_context) + stack_bytes + 32 + rsize);
+  stack = context + 1;
   frame = stack + stack_bytes;
-  context = frame + 32;
+  rvalue = (rsize ? frame + 32 : orig_rvalue);
 
   arg_init (&state);
-  for (i = 0; i < nargs; i++)
+  for (i = 0, nargs = cif->nargs; i < nargs; i++)
     {
       ffi_type *ty = cif->arg_types[i];
       size_t s = ty->size;
       void *a = avalue[i];
+      int h, t;
 
       t = ty->type;
       switch (t)
@@ -717,54 +791,10 @@ ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
 #endif
     }
 
-  rtype = cif->rtype;
-  if (is_register_candidate (rtype))
-    {
-      ffi_call_SYSV (stack, frame, fn, cif->aarch64_flags);
+  ffi_call_SYSV (context, frame, fn, rvalue, flags);
 
-      t = rtype->type;
-      switch (t)
-        {
-        case FFI_TYPE_INT:
-        case FFI_TYPE_UINT8:
-        case FFI_TYPE_SINT8:
-        case FFI_TYPE_UINT16:
-        case FFI_TYPE_SINT16:
-        case FFI_TYPE_UINT32:
-        case FFI_TYPE_SINT32:
-        case FFI_TYPE_POINTER:
-        case FFI_TYPE_UINT64:
-        case FFI_TYPE_SINT64:
-          *(ffi_arg *)rvalue = extend_integer_type (&context->x[0], t);
-          break;
-
-        case FFI_TYPE_FLOAT:
-        case FFI_TYPE_DOUBLE:
-        case FFI_TYPE_LONGDOUBLE:
-          compress_hfa_type (rvalue, &context->v[0], 0x100 + t);
-          break;
-
-        case FFI_TYPE_STRUCT:
-          h = is_hfa (cif->rtype);
-          if (h)
-            compress_hfa_type (rvalue, &context->v[0], h);
-          else
-            {
-              FFI_ASSERT (rtype->size <= 16);
-              memcpy (rvalue, &context->x[0], rtype->size);
-            }
-          break;
-
-        default:
-          FFI_ASSERT (0);
-          break;
-        }
-    }
-  else
-    {
-      context->x8 = (uintptr_t)rvalue;
-      ffi_call_SYSV (stack, frame, fn, cif->aarch64_flags);
-    }
+  if (flags & AARCH64_RET_NEED_COPY)
+    memcpy (orig_rvalue, rvalue, rtype_size);
 }
 
 static unsigned char trampoline [] =
diff --git a/src/aarch64/internal.h b/src/aarch64/internal.h
index b6b6104..a3070db 100644
--- a/src/aarch64/internal.h
+++ b/src/aarch64/internal.h
@@ -18,7 +18,48 @@
    CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
    TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
    SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
 
-#define AARCH64_FLAG_ARG_V_BIT  0
+#define AARCH64_RET_VOID        0
+#define AARCH64_RET_INT64       1
+#define AARCH64_RET_INT128      2
+
+#define AARCH64_RET_UNUSED3     3
+#define AARCH64_RET_UNUSED4     4
+#define AARCH64_RET_UNUSED5     5
+#define AARCH64_RET_UNUSED6     6
+#define AARCH64_RET_UNUSED7     7
+
+/* Note that FFI_TYPE_FLOAT == 2, _DOUBLE == 3, _LONGDOUBLE == 4,
+   so _S4 through _Q1 are layed out as (TYPE * 4) + (4 - COUNT).  */
+#define AARCH64_RET_S4          8
+#define AARCH64_RET_S3          9
+#define AARCH64_RET_S2          10
+#define AARCH64_RET_S1          11
+
+#define AARCH64_RET_D4          12
+#define AARCH64_RET_D3          13
+#define AARCH64_RET_D2          14
+#define AARCH64_RET_D1          15
+
+#define AARCH64_RET_Q4          16
+#define AARCH64_RET_Q3          17
+#define AARCH64_RET_Q2          18
+#define AARCH64_RET_Q1          19
+
+/* Note that each of the sub-64-bit integers gets two entries.  */
+#define AARCH64_RET_UINT8       20
+#define AARCH64_RET_UINT16      22
+#define AARCH64_RET_UINT32      24
+
+#define AARCH64_RET_SINT8       26
+#define AARCH64_RET_SINT16      28
+#define AARCH64_RET_SINT32      30
+
+#define AARCH64_RET_MASK        31
+
+#define AARCH64_RET_IN_MEM      (1 << 5)
+#define AARCH64_RET_NEED_COPY   (1 << 6)
+
+#define AARCH64_FLAG_ARG_V_BIT  7
 #define AARCH64_FLAG_ARG_V      (1 << AARCH64_FLAG_ARG_V_BIT)
 
 #define N_X_ARG_REG             8
diff --git a/src/aarch64/sysv.S b/src/aarch64/sysv.S
index a5f636a..ba15663 100644
--- a/src/aarch64/sysv.S
+++ b/src/aarch64/sysv.S
@@ -40,9 +40,9 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
 #endif
 
         .text
-        .align 2
+        .align 4
 
-        .globl CNAME(ffi_call_SYSV)
+        .globl  CNAME(ffi_call_SYSV)
 #ifdef __ELF__
         .type   CNAME(ffi_call_SYSV), #function
         .hidden CNAME(ffi_call_SYSV)
@@ -50,14 +50,15 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
 /* ffi_call_SYSV
    extern void ffi_call_SYSV (void *stack, void *frame,
-                              void (*fn)(void), int flags);
+                              void (*fn)(void), void *rvalue, int flags);
 
    Therefore on entry we have:
 
    x0 stack
    x1 frame
    x2 fn
-   x3 flags
+   x3 rvalue
+   x4 flags
 */
 
         cfi_startproc
@@ -71,43 +72,111 @@ CNAME(ffi_call_SYSV):
         cfi_rel_offset (x29, 0)
         cfi_rel_offset (x30, 8)
 
-        str     w3, [x29, #16]                  /* save flags */
         mov     x9, x2                          /* save fn */
+        mov     x8, x3                          /* install structure return */
+        stp     x3, x4, [x29, #16]              /* save rvalue and flags */
 
         /* Load the vector argument passing registers, if necessary.  */
-        tbz     w3, #AARCH64_FLAG_ARG_V_BIT, 1f
-        ldp     q0, q1, [x29, #32 + 0]
-        ldp     q2, q3, [x29, #32 + 32]
-        ldp     q4, q5, [x29, #32 + 64]
-        ldp     q6, q7, [x29, #32 + 96]
+        tbz     w4, #AARCH64_FLAG_ARG_V_BIT, 1f
+        ldp     q0, q1, [sp, #0]
+        ldp     q2, q3, [sp, #32]
+        ldp     q4, q5, [sp, #64]
+        ldp     q6, q7, [sp, #96]
 1:
         /* Load the core argument passing registers, including
            the structure return pointer.  */
-        ldp     x0, x1, [x29, #32 + 16*N_V_ARG_REG + 0]
-        ldp     x2, x3, [x29, #32 + 16*N_V_ARG_REG + 16]
-        ldp     x4, x5, [x29, #32 + 16*N_V_ARG_REG + 32]
-        ldp     x6, x7, [x29, #32 + 16*N_V_ARG_REG + 48]
-        ldr     x8,     [x29, #32 + 16*N_V_ARG_REG + 64]
+        ldp     x0, x1, [sp, #16*N_V_ARG_REG + 0]
+        ldp     x2, x3, [sp, #16*N_V_ARG_REG + 16]
+        ldp     x4, x5, [sp, #16*N_V_ARG_REG + 32]
+        ldp     x6, x7, [sp, #16*N_V_ARG_REG + 48]
+
+        /* Deallocate the context, leaving the stacked arguments.  */
+        add     sp, sp, #CALL_CONTEXT_SIZE
 
         blr     x9                      /* call fn */
 
-        ldr     w3, [x29, #16]          /* reload flags */
+        ldp     x3, x4, [x29, #16]      /* reload rvalue and flags */
 
         /* Partially deconstruct the stack frame.  */
         mov     sp, x29
         cfi_def_cfa_register (sp)
         ldp     x29, x30, [x29]
 
-        /* Save the core return registers.  */
-        stp     x0, x1, [sp, #32 + 16*N_V_ARG_REG]
-
-        /* Save the vector return registers, if necessary.  */
-        tbz     w3, #AARCH64_FLAG_ARG_V_BIT, 1f
-        stp     q0, q1, [sp, #32 + 0]
-        stp     q2, q3, [sp, #32 + 32]
-1:
-        /* All done.  */
+        /* Save the return value as directed.  */
+        adr     x5, 0f
+        and     w4, w4, #AARCH64_RET_MASK
+        add     x5, x5, x4, lsl #3
+        br      x5
+
+        /* Note that each table entry is 2 insns, and thus 8 bytes.
+           For integer data, note that we're storing into ffi_arg
+           and therefore we want to extend to 64 bits; these types
+           have two consecutive entries allocated for them.  */
+        .align  4
+0:      ret                             /* VOID */
+        nop
+1:      str     x0, [x3]                /* INT64 */
+        ret
+2:      stp     x0, x1, [x3]            /* INT128 */
+        ret
+3:      brk     #1000                   /* UNUSED */
+        ret
+4:      brk     #1000                   /* UNUSED */
+        ret
+5:      brk     #1000                   /* UNUSED */
+        ret
+6:      brk     #1000                   /* UNUSED */
+        ret
+7:      brk     #1000                   /* UNUSED */
+        ret
+8:      st4     { v0.s-v3.s }[0], [x3]  /* S4 */
+        ret
+9:      st3     { v0.s-v2.s }[0], [x3]  /* S3 */
         ret
+10:     stp     s0, s1, [x3]            /* S2 */
+        ret
+11:     str     s0, [x3]                /* S1 */
+        ret
+12:     st4     { v0.d-v3.d }[0], [x3]  /* D4 */
+        ret
+13:     st3     { v0.d-v2.d }[0], [x3]  /* D3 */
+        ret
+14:     stp     d0, d1, [x3]            /* D2 */
+        ret
+15:     str     d0, [x3]                /* D1 */
+        ret
+16:     str     q3, [x3, #48]           /* Q4 */
+        nop
+17:     str     q2, [x3, #32]           /* Q3 */
+        nop
+18:     stp     q0, q1, [x3]            /* Q2 */
+        ret
+19:     str     q0, [x3]                /* Q1 */
+        ret
+20:     uxtb    w0, w0                  /* UINT8 */
+        str     x0, [x3]
+21:     ret                             /* reserved */
+        nop
+22:     uxth    w0, w0                  /* UINT16 */
+        str     x0, [x3]
+23:     ret                             /* reserved */
+        nop
+24:     mov     w0, w0                  /* UINT32 */
+        str     x0, [x3]
+25:     ret                             /* reserved */
+        nop
+26:     sxtb    x0, w0                  /* SINT8 */
+        str     x0, [x3]
+27:     ret                             /* reserved */
+        nop
+28:     sxth    x0, w0                  /* SINT16 */
+        str     x0, [x3]
+29:     ret                             /* reserved */
+        nop
+30:     sxtw    x0, w0                  /* SINT32 */
+        str     x0, [x3]
+31:     ret                             /* reserved */
+        nop
 
         cfi_endproc
 #ifdef __ELF__
@@ -154,9 +223,13 @@ CNAME(ffi_call_SYSV):
    Voila!  */
 
         .text
-        .align 2
+        .align 4
 
-        .globl CNAME(ffi_closure_SYSV)
+        .globl  CNAME(ffi_closure_SYSV)
+#ifdef __ELF__
+        .type   CNAME(ffi_closure_SYSV), #function
+        .hidden CNAME(ffi_closure_SYSV)
+#endif
         cfi_startproc
CNAME(ffi_closure_SYSV):
         stp     x29, x30, [sp, #-16]!
-- 
1.9.3
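
Two small standalone sketches, not part of the patch itself, may help illustrate what it does.

First, the jump-table index encoding described in the new internal.h: an HFA return whose base type is FFI_TYPE_FLOAT (2), _DOUBLE (3) or _LONGDOUBLE (4) with COUNT elements lands on entry (TYPE * 4) + (4 - COUNT), which is what the new FFI_TYPE_STRUCT case computes as (h & 0xff) * 4 + 4 - (h >> 8). The AARCH64_RET_* constants below are copied from the patch; the hfa_pack() helper is a made-up stand-in for is_hfa(), assumed only to put the element count in the high byte, as that formula implies.

  #include <assert.h>
  #include <stdio.h>

  /* Values copied from the new src/aarch64/internal.h.  */
  #define AARCH64_RET_S4   8
  #define AARCH64_RET_S1  11
  #define AARCH64_RET_D4  12
  #define AARCH64_RET_D1  15
  #define AARCH64_RET_Q1  19

  /* FFI_TYPE_* values quoted in the internal.h comment.  */
  #define FFI_TYPE_FLOAT       2
  #define FFI_TYPE_DOUBLE      3
  #define FFI_TYPE_LONGDOUBLE  4

  /* Hypothetical stand-in for is_hfa(): base type in the low byte,
     element count in the high byte.  */
  static int hfa_pack (int base_type, int count)
  {
    return base_type | (count << 8);
  }

  /* The HFA branch of the new FFI_TYPE_STRUCT case in ffi_prep_cif_machdep.  */
  static int hfa_ret_flag (int h)
  {
    return (h & 0xff) * 4 + 4 - (h >> 8);
  }

  int main (void)
  {
    assert (hfa_ret_flag (hfa_pack (FFI_TYPE_FLOAT, 1)) == AARCH64_RET_S1);
    assert (hfa_ret_flag (hfa_pack (FFI_TYPE_FLOAT, 4)) == AARCH64_RET_S4);
    assert (hfa_ret_flag (hfa_pack (FFI_TYPE_DOUBLE, 1)) == AARCH64_RET_D1);
    assert (hfa_ret_flag (hfa_pack (FFI_TYPE_DOUBLE, 4)) == AARCH64_RET_D4);
    assert (hfa_ret_flag (hfa_pack (FFI_TYPE_LONGDOUBLE, 1)) == AARCH64_RET_Q1);
    puts ("HFA return-flag encoding checks out");
    return 0;
  }

Second, the caller-visible effect the commit message describes. For a double return, for instance, ffi_call_SYSV now stores d0 straight through the caller's rvalue pointer via the D1 jump-table entry, instead of saving the whole register context and copying it afterwards. A minimal libffi caller exercising that path:

  #include <ffi.h>
  #include <stdio.h>

  static double add_doubles (double a, double b)
  {
    return a + b;
  }

  int main (void)
  {
    ffi_cif cif;
    ffi_type *args[2] = { &ffi_type_double, &ffi_type_double };
    double a = 1.5, b = 2.25, result = 0.0;
    void *values[2] = { &a, &b };

    if (ffi_prep_cif (&cif, FFI_DEFAULT_ABI, 2, &ffi_type_double, args) != FFI_OK)
      return 1;

    /* With this patch, the aarch64 ffi_call_SYSV writes d0 directly
       into &result rather than into temporary storage first.  */
    ffi_call (&cif, FFI_FN (add_doubles), &result, values);

    printf ("%g\n", result);
    return 0;
  }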