public inbox for libffi-discuss@sourceware.org
 help / color / mirror / Atom feed
* [PATCH 1/4] s390: Kill trailing whitespace
  2014-12-18 21:33 [PATCH 0/4] s390 improvements Richard Henderson
@ 2014-12-18 21:33 ` Richard Henderson
  2014-12-18 21:33 ` [PATCH 4/4] s390: Use pc-relative insns in 31-bit mode Richard Henderson
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 16+ messages in thread
From: Richard Henderson @ 2014-12-18 21:33 UTC (permalink / raw)
  To: libffi-discuss; +Cc: Ulrich.Weigand, vogt, krebbel, Richard Henderson

From: Richard Henderson <rth@redhat.com>

---
 src/s390/ffi.c  | 130 ++++++++++++++++++++++++++++----------------------------
 src/s390/sysv.S |  30 ++++++-------
 2 files changed, 80 insertions(+), 80 deletions(-)

diff --git a/src/s390/ffi.c b/src/s390/ffi.c
index 477b85e..b0c3092 100644
--- a/src/s390/ffi.c
+++ b/src/s390/ffi.c
@@ -1,9 +1,9 @@
 /* -----------------------------------------------------------------------
    ffi.c - Copyright (c) 2000, 2007 Software AG
            Copyright (c) 2008 Red Hat, Inc
- 
+
    S390 Foreign Function Interface
- 
+
    Permission is hereby granted, free of charge, to any person obtaining
    a copy of this software and associated documentation files (the
    ``Software''), to deal in the Software without restriction, including
@@ -11,10 +11,10 @@
    distribute, sublicense, and/or sell copies of the Software, and to
    permit persons to whom the Software is furnished to do so, subject to
    the following conditions:
- 
+
    The above copyright notice and this permission notice shall be included
    in all copies or substantial portions of the Software.
- 
+
    THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND, EXPRESS
    OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
@@ -27,24 +27,24 @@
 /*                          Includes                                  */
 /*                          --------                                  */
 /*====================================================================*/
- 
+
 #include <ffi.h>
 #include <ffi_common.h>
- 
+
 #include <stdlib.h>
 #include <stdio.h>
- 
+
 /*====================== End of Includes =============================*/
- 
+
 /*====================================================================*/
 /*                           Defines                                  */
 /*                           -------                                  */
 /*====================================================================*/
 
-/* Maximum number of GPRs available for argument passing.  */ 
+/* Maximum number of GPRs available for argument passing.  */
 #define MAX_GPRARGS 5
 
-/* Maximum number of FPRs available for argument passing.  */ 
+/* Maximum number of FPRs available for argument passing.  */
 #ifdef __s390x__
 #define MAX_FPRARGS 4
 #else
@@ -63,12 +63,12 @@
 #define FFI390_RET_INT64	5
 
 /*===================== End of Defines ===============================*/
- 
+
 /*====================================================================*/
 /*                          Externals                                 */
 /*                          ---------                                 */
 /*====================================================================*/
- 
+
 extern void ffi_call_SYSV(unsigned,
 			  extended_cif *,
 			  void (*)(unsigned char *, extended_cif *),
@@ -78,9 +78,9 @@ extern void ffi_call_SYSV(unsigned,
 
 extern void ffi_closure_SYSV(void);
 extern void ffi_go_closure_SYSV(void);
- 
+
 /*====================== End of Externals ============================*/
- 
+
 /*====================================================================*/
 /*                                                                    */
 /* Name     - ffi_check_struct_type.                                  */
@@ -89,7 +89,7 @@ extern void ffi_go_closure_SYSV(void);
 /*            general purpose or floating point register.             */
 /*                                                                    */
 /*====================================================================*/
- 
+
 static int
 ffi_check_struct_type (ffi_type *arg)
 {
@@ -97,7 +97,7 @@ ffi_check_struct_type (ffi_type *arg)
 
   /* If the struct has just one element, look at that element
      to find out whether to consider the struct as floating point.  */
-  while (arg->type == FFI_TYPE_STRUCT 
+  while (arg->type == FFI_TYPE_STRUCT
          && arg->elements[0] && !arg->elements[1])
     arg = arg->elements[0];
 
@@ -130,9 +130,9 @@ ffi_check_struct_type (ffi_type *arg)
   /* Other structs are passed via a pointer to the data.  */
   return FFI_TYPE_POINTER;
 }
- 
+
 /*======================== End of Routine ============================*/
- 
+
 /*====================================================================*/
 /*                                                                    */
 /* Name     - ffi_prep_args.                                          */
@@ -143,7 +143,7 @@ ffi_check_struct_type (ffi_type *arg)
 /* has been allocated for the function's arguments.                   */
 /*                                                                    */
 /*====================================================================*/
- 
+
 static void
 ffi_prep_args (unsigned char *stack, extended_cif *ecif)
 {
@@ -178,7 +178,7 @@ ffi_prep_args (unsigned char *stack, extended_cif *ecif)
   ffi_type **ptr;
   void **p_argv = ecif->avalue;
   int i;
- 
+
   /* If we returning a structure then we set the first parameter register
      to the address of where we are returning this structure.  */
 
@@ -186,7 +186,7 @@ ffi_prep_args (unsigned char *stack, extended_cif *ecif)
     p_gpr[n_gpr++] = (unsigned long) ecif->rvalue;
 
   /* Now for the arguments.  */
- 
+
   for (ptr = ecif->cif->arg_types, i = ecif->cif->nargs;
        i > 0;
        i--, ptr++, p_argv++)
@@ -218,7 +218,7 @@ ffi_prep_args (unsigned char *stack, extended_cif *ecif)
 	}
 
       /* Now handle all primitive int/pointer/float data types.  */
-      switch (type) 
+      switch (type)
 	{
 	  case FFI_TYPE_DOUBLE:
 	    if (n_fpr < MAX_FPRARGS)
@@ -231,7 +231,7 @@ ffi_prep_args (unsigned char *stack, extended_cif *ecif)
 	      p_ov[n_ov++] = ((unsigned long *) arg)[1];
 #endif
 	    break;
-	
+
 	  case FFI_TYPE_FLOAT:
 	    if (n_fpr < MAX_FPRARGS)
 	      p_fpr[n_fpr++] = (long long) *(unsigned int *) arg << 32;
@@ -245,7 +245,7 @@ ffi_prep_args (unsigned char *stack, extended_cif *ecif)
 	    else
 	      p_ov[n_ov++] = (unsigned long)*(unsigned char **) arg;
 	    break;
- 
+
 	  case FFI_TYPE_UINT64:
 	  case FFI_TYPE_SINT64:
 #ifdef __s390x__
@@ -264,14 +264,14 @@ ffi_prep_args (unsigned char *stack, extended_cif *ecif)
 	      p_ov[n_ov++] = ((unsigned long *) arg)[1];
 #endif
 	    break;
- 
+
 	  case FFI_TYPE_UINT32:
 	    if (n_gpr < MAX_GPRARGS)
 	      p_gpr[n_gpr++] = *(unsigned int *) arg;
 	    else
 	      p_ov[n_ov++] = *(unsigned int *) arg;
 	    break;
- 
+
 	  case FFI_TYPE_INT:
 	  case FFI_TYPE_SINT32:
 	    if (n_gpr < MAX_GPRARGS)
@@ -279,14 +279,14 @@ ffi_prep_args (unsigned char *stack, extended_cif *ecif)
 	    else
 	      p_ov[n_ov++] = *(signed int *) arg;
 	    break;
- 
+
 	  case FFI_TYPE_UINT16:
 	    if (n_gpr < MAX_GPRARGS)
 	      p_gpr[n_gpr++] = *(unsigned short *) arg;
 	    else
 	      p_ov[n_ov++] = *(unsigned short *) arg;
 	    break;
- 
+
 	  case FFI_TYPE_SINT16:
 	    if (n_gpr < MAX_GPRARGS)
 	      p_gpr[n_gpr++] = *(signed short *) arg;
@@ -300,14 +300,14 @@ ffi_prep_args (unsigned char *stack, extended_cif *ecif)
 	    else
 	      p_ov[n_ov++] = *(unsigned char *) arg;
 	    break;
- 
+
 	  case FFI_TYPE_SINT8:
 	    if (n_gpr < MAX_GPRARGS)
 	      p_gpr[n_gpr++] = *(signed char *) arg;
 	    else
 	      p_ov[n_ov++] = *(signed char *) arg;
 	    break;
- 
+
 	  default:
 	    FFI_ASSERT (0);
 	    break;
@@ -316,7 +316,7 @@ ffi_prep_args (unsigned char *stack, extended_cif *ecif)
 }
 
 /*======================== End of Routine ============================*/
- 
+
 /*====================================================================*/
 /*                                                                    */
 /* Name     - ffi_prep_cif_machdep.                                   */
@@ -324,7 +324,7 @@ ffi_prep_args (unsigned char *stack, extended_cif *ecif)
 /* Function - Perform machine dependent CIF processing.               */
 /*                                                                    */
 /*====================================================================*/
- 
+
 ffi_status
 ffi_prep_cif_machdep(ffi_cif *cif)
 {
@@ -336,7 +336,7 @@ ffi_prep_cif_machdep(ffi_cif *cif)
   ffi_type **ptr;
   int i;
 
-  /* Determine return value handling.  */ 
+  /* Determine return value handling.  */
 
   switch (cif->rtype->type)
     {
@@ -350,7 +350,7 @@ ffi_prep_cif_machdep(ffi_cif *cif)
       case FFI_TYPE_COMPLEX:
 	cif->flags = FFI390_RET_STRUCT;
 	n_gpr++;  /* We need one GPR to pass the pointer.  */
-	break; 
+	break;
 
       /* Floating point values are returned in fpr 0.  */
       case FFI_TYPE_FLOAT:
@@ -389,14 +389,14 @@ ffi_prep_cif_machdep(ffi_cif *cif)
 	cif->flags = FFI390_RET_INT32;
 #endif
 	break;
- 
+
       default:
         FFI_ASSERT (0);
         break;
     }
 
   /* Now for the arguments.  */
- 
+
   for (ptr = cif->arg_types, i = cif->nargs;
        i > 0;
        i--, ptr++)
@@ -424,7 +424,7 @@ ffi_prep_cif_machdep(ffi_cif *cif)
 	}
 
       /* Now handle all primitive int/float data types.  */
-      switch (type) 
+      switch (type)
 	{
 	  /* The first MAX_FPRARGS floating point arguments
 	     go in FPRs, the rest overflow to the stack.  */
@@ -435,7 +435,7 @@ ffi_prep_cif_machdep(ffi_cif *cif)
 	    else
 	      n_ov += sizeof (double) / sizeof (long);
 	    break;
-	
+
 	  case FFI_TYPE_FLOAT:
 	    if (n_fpr < MAX_FPRARGS)
 	      n_fpr++;
@@ -445,9 +445,9 @@ ffi_prep_cif_machdep(ffi_cif *cif)
 
 	  /* On 31-bit machines, 64-bit integers are passed in GPR pairs,
 	     if one is still available, or else on the stack.  If only one
-	     register is free, skip the register (it won't be used for any 
+	     register is free, skip the register (it won't be used for any
 	     subsequent argument either).  */
-	      
+
 #ifndef __s390x__
 	  case FFI_TYPE_UINT64:
 	  case FFI_TYPE_SINT64:
@@ -463,7 +463,7 @@ ffi_prep_cif_machdep(ffi_cif *cif)
 	  /* Everything else is passed in GPRs (until MAX_GPRARGS
 	     have been used) or overflows to the stack.  */
 
-	  default: 
+	  default:
 	    if (n_gpr < MAX_GPRARGS)
 	      n_gpr++;
 	    else
@@ -476,12 +476,12 @@ ffi_prep_cif_machdep(ffi_cif *cif)
      and temporary structure copies.  */
 
   cif->bytes = ROUND_SIZE (n_ov * sizeof (long)) + struct_size;
- 
+
   return FFI_OK;
 }
- 
+
 /*======================== End of Routine ============================*/
- 
+
 /*====================================================================*/
 /*                                                                    */
 /* Name     - ffi_call.                                               */
@@ -489,7 +489,7 @@ ffi_prep_cif_machdep(ffi_cif *cif)
 /* Function - Call the FFI routine.                                   */
 /*                                                                    */
 /*====================================================================*/
- 
+
 static void
 ffi_call_int(ffi_cif *cif,
 	     void (*fn)(void),
@@ -499,7 +499,7 @@ ffi_call_int(ffi_cif *cif,
 {
   int ret_type = cif->flags;
   extended_cif ecif;
- 
+
   ecif.cif    = cif;
   ecif.avalue = avalue;
   ecif.rvalue = rvalue;
@@ -511,7 +511,7 @@ ffi_call_int(ffi_cif *cif,
 	ecif.rvalue = alloca (cif->rtype->size);
       else
 	ret_type = FFI390_RET_VOID;
-    } 
+    }
 
   switch (cif->abi)
     {
@@ -519,7 +519,7 @@ ffi_call_int(ffi_cif *cif,
         ffi_call_SYSV (cif->bytes, &ecif, ffi_prep_args,
 		       ret_type, ecif.rvalue, fn, closure);
         break;
- 
+
       default:
         FFI_ASSERT (0);
         break;
@@ -538,7 +538,7 @@ ffi_call_go (ffi_cif *cif, void (*fn)(void), void *rvalue,
 {
   ffi_call_int(cif, fn, rvalue, avalue, closure);
 }
- 
+
 /*======================== End of Routine ============================*/
 
 /*====================================================================*/
@@ -575,8 +575,8 @@ ffi_closure_helper_SYSV (ffi_cif *cif,
 
   p_arg = avalue = alloca (cif->nargs * sizeof (void *));
 
-  /* If we returning a structure, pass the structure address 
-     directly to the target function.  Otherwise, have the target 
+  /* If we returning a structure, pass the structure address
+     directly to the target function.  Otherwise, have the target
      function store the return value to the GPR save area.  */
 
   if (cif->flags == FFI390_RET_STRUCT)
@@ -603,7 +603,7 @@ ffi_closure_helper_SYSV (ffi_cif *cif,
 	  else
 	    type = ffi_check_struct_type (*ptr);
 
-	  /* If we pass the struct via pointer, remember to 
+	  /* If we pass the struct via pointer, remember to
 	     retrieve the pointer later.  */
 	  if (type == FFI_TYPE_POINTER)
 	    deref_struct_pointer = 1;
@@ -618,23 +618,23 @@ ffi_closure_helper_SYSV (ffi_cif *cif,
 #endif
 
       /* Now handle all primitive int/float data types.  */
-      switch (type) 
+      switch (type)
 	{
 	  case FFI_TYPE_DOUBLE:
 	    if (n_fpr < MAX_FPRARGS)
 	      *p_arg = &p_fpr[n_fpr++];
 	    else
-	      *p_arg = &p_ov[n_ov], 
+	      *p_arg = &p_ov[n_ov],
 	      n_ov += sizeof (double) / sizeof (long);
 	    break;
-	
+
 	  case FFI_TYPE_FLOAT:
 	    if (n_fpr < MAX_FPRARGS)
 	      *p_arg = &p_fpr[n_fpr++];
 	    else
 	      *p_arg = (char *)&p_ov[n_ov++] + sizeof (long) - 4;
 	    break;
- 
+
 	  case FFI_TYPE_UINT64:
 	  case FFI_TYPE_SINT64:
 #ifdef __s390x__
@@ -651,7 +651,7 @@ ffi_closure_helper_SYSV (ffi_cif *cif,
 	      *p_arg = &p_ov[n_ov], n_ov += 2;
 #endif
 	    break;
- 
+
 	  case FFI_TYPE_INT:
 	  case FFI_TYPE_UINT32:
 	  case FFI_TYPE_SINT32:
@@ -660,7 +660,7 @@ ffi_closure_helper_SYSV (ffi_cif *cif,
 	    else
 	      *p_arg = (char *)&p_ov[n_ov++] + sizeof (long) - 4;
 	    break;
- 
+
 	  case FFI_TYPE_UINT16:
 	  case FFI_TYPE_SINT16:
 	    if (n_gpr < MAX_GPRARGS)
@@ -676,7 +676,7 @@ ffi_closure_helper_SYSV (ffi_cif *cif,
 	    else
 	      *p_arg = (char *)&p_ov[n_ov++] + sizeof (long) - 1;
 	    break;
- 
+
 	  default:
 	    FFI_ASSERT (0);
 	    break;
@@ -744,7 +744,7 @@ ffi_closure_helper_SYSV (ffi_cif *cif,
         break;
     }
 }
- 
+
 /*======================== End of Routine ============================*/
 
 /*====================================================================*/
@@ -754,7 +754,7 @@ ffi_closure_helper_SYSV (ffi_cif *cif,
 /* Function - Prepare a FFI closure.                                  */
 /*                                                                    */
 /*====================================================================*/
- 
+
 ffi_status
 ffi_prep_closure_loc (ffi_closure *closure,
 		      ffi_cif *cif,
@@ -780,17 +780,17 @@ ffi_prep_closure_loc (ffi_closure *closure,
   *(short *)&closure->tramp [8] = 0x07f1;   /* br %r1 */
   *(long  *)&closure->tramp[16] = (long)codeloc;
   *(long  *)&closure->tramp[24] = (long)&ffi_closure_SYSV;
-#endif 
- 
+#endif
+
   closure->cif = cif;
   closure->user_data = user_data;
   closure->fun = fun;
- 
+
   return FFI_OK;
 }
 
 /*======================== End of Routine ============================*/
- 
+
 /* Build a Go language closure.  */
 
 ffi_status
diff --git a/src/s390/sysv.S b/src/s390/sysv.S
index 14672ac..b01a7eb 100644
--- a/src/s390/sysv.S
+++ b/src/s390/sysv.S
@@ -1,9 +1,9 @@
 /* -----------------------------------------------------------------------
    sysv.S - Copyright (c) 2000 Software AG
             Copyright (c) 2008 Red Hat, Inc.
- 
+
    S390 Foreign Function Interface
- 
+
    Permission is hereby granted, free of charge, to any person obtaining
    a copy of this software and associated documentation files (the
    ``Software''), to deal in the Software without restriction, including
@@ -11,10 +11,10 @@
    distribute, sublicense, and/or sell copies of the Software, and to
    permit persons to whom the Software is furnished to do so, subject to
    the following conditions:
- 
+
    The above copyright notice and this permission notice shall be included
    in all copies or substantial portions of the Software.
- 
+
    THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
@@ -30,7 +30,7 @@
 #include <ffi.h>
 
 #ifndef __s390x__
- 
+
 .text
 
 	# r2:	cif->bytes
@@ -38,7 +38,7 @@
 	# r4:	ffi_prep_args
 	# r5:	ret_type
 	# r6:	ecif.rvalue
-	# ov:	fn 
+	# ov:	fn
 	# ov+8:	closure
 
 	# This assumes we are using gas.
@@ -125,7 +125,7 @@ ffi_call_SYSV:
 	.cfi_restore_state
 	# See comment on the nopr above.
 	nopr
- 
+
 .LretDouble:
 	l	%r4,48+56(%r11)
 	std	%f0,0(%r8)			# Return double
@@ -167,7 +167,7 @@ ffi_call_SYSV:
 	.cfi_restore_state
 	# See comment on the nopr above.
 	nopr
- 
+
 .LretInt64:
 	l	%r4,48+56(%r11)
 	stm	%r2,%r3,0(%r8)			# Return long long
@@ -186,7 +186,7 @@ ffi_call_SYSV:
 	.cfi_def_cfa r15, 96
 	br	%r4
 	.cfi_endproc
- 
+
 .Ltable:
 	.byte	.LretNone-.Lbase		# FFI390_RET_VOID
 	.byte	.LretNone-.Lbase		# FFI390_RET_STRUCT
@@ -269,15 +269,15 @@ ffi_go_closure_SYSV:
 	.cfi_endproc
 
 #else
- 
+
 .text
- 
+
 	# r2:	cif->bytes
 	# r3:	&ecif
 	# r4:	ffi_prep_args
 	# r5:	ret_type
 	# r6:	ecif.rvalue
-	# ov:	fn 
+	# ov:	fn
 	# ov+8:	closure
 
 	# This assumes we are using gas.
@@ -365,7 +365,7 @@ ffi_call_SYSV:
 	.cfi_restore_state
 	# See comment on the nopr above.
 	nopr
- 
+
 .LretDouble:
 	lg	%r4,80+112(%r11)
 	std	%f0,0(%r8)			# Return double
@@ -404,7 +404,7 @@ ffi_call_SYSV:
 	.cfi_def_cfa r15, 160
 	br	%r4
 	.cfi_endproc
- 
+
 .Ltable:
 	.byte	.LretNone-.Lbase		# FFI390_RET_VOID
 	.byte	.LretNone-.Lbase		# FFI390_RET_STRUCT
@@ -461,7 +461,7 @@ ffi_closure_SYSV:
 .ffi_closure_SYSV_end:
 	.size	 ffi_closure_SYSV,.ffi_closure_SYSV_end-ffi_closure_SYSV
 
-	
+
 	.globl	ffi_go_closure_SYSV
 	FFI_HIDDEN(ffi_go_closure_SYSV)
 	.type	ffi_go_closure_SYSV,%function
-- 
2.1.0

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 3/4] s390: Reorganize assembly
  2014-12-18 21:33 [PATCH 0/4] s390 improvements Richard Henderson
  2014-12-18 21:33 ` [PATCH 1/4] s390: Kill trailing whitespace Richard Henderson
  2014-12-18 21:33 ` [PATCH 4/4] s390: Use pc-relative insns in 31-bit mode Richard Henderson
@ 2014-12-18 21:33 ` Richard Henderson
  2014-12-22 12:12   ` Dominik Vogt
  2014-12-18 21:33 ` [PATCH 2/4] s390: Avoid aliasing warnings Richard Henderson
                   ` (3 subsequent siblings)
  6 siblings, 1 reply; 16+ messages in thread
From: Richard Henderson @ 2014-12-18 21:33 UTC (permalink / raw)
  To: libffi-discuss; +Cc: Ulrich.Weigand, vogt, krebbel, Richard Henderson

From: Richard Henderson <rth@redhat.com>

Avoid using ffi_prep_args as a callback; do all the work setting
up the frame within ffi_call_int directly.  Save fewer registers
in ffi_closure_SYSV.
---
 src/s390/ffi.c      | 153 +++++++-------
 src/s390/internal.h |  11 +
 src/s390/sysv.S     | 578 ++++++++++++++++++----------------------------------
 3 files changed, 286 insertions(+), 456 deletions(-)
 create mode 100644 src/s390/internal.h

diff --git a/src/s390/ffi.c b/src/s390/ffi.c
index c06b79e..1189f7b 100644
--- a/src/s390/ffi.c
+++ b/src/s390/ffi.c
@@ -30,9 +30,7 @@
 
 #include <ffi.h>
 #include <ffi_common.h>
-
-#include <stdlib.h>
-#include <stdio.h>
+#include "internal.h"
 
 /*====================== End of Includes =============================*/
 
@@ -54,14 +52,6 @@
 /* Round to multiple of 16.  */
 #define ROUND_SIZE(size) (((size) + 15) & ~15)
 
-/* If these values change, sysv.S must be adapted!  */
-#define FFI390_RET_VOID		0
-#define FFI390_RET_STRUCT	1
-#define FFI390_RET_FLOAT	2
-#define FFI390_RET_DOUBLE	3
-#define FFI390_RET_INT32	4
-#define FFI390_RET_INT64	5
-
 /*===================== End of Defines ===============================*/
 
 /*====================================================================*/
@@ -69,12 +59,17 @@
 /*                          ---------                                 */
 /*====================================================================*/
 
-extern void ffi_call_SYSV(unsigned,
-			  extended_cif *,
-			  void (*)(unsigned char *, extended_cif *),
-			  unsigned,
-			  void *,
-			  void (*fn)(void), void *);
+struct call_frame
+{
+  void *back_chain;
+  void *eos;
+  unsigned long gpr_args[5];
+  unsigned long gpr_save[9];
+  unsigned long long fpr_args[4];
+};
+
+extern void FFI_HIDDEN ffi_call_SYSV(struct call_frame *, unsigned, void *,
+			             void (*fn)(void), void *);
 
 extern void ffi_closure_SYSV(void);
 extern void ffi_go_closure_SYSV(void);
@@ -145,54 +140,29 @@ ffi_check_struct_type (ffi_type *arg)
 /*====================================================================*/
 
 static void
-ffi_prep_args (unsigned char *stack, extended_cif *ecif)
+ffi_prep_args (ffi_cif *cif, void *rvalue, void **p_argv,
+	       unsigned long *p_ov, struct call_frame *p_frame)
 {
-  /* The stack space will be filled with those areas:
-
-	FPR argument register save area     (highest addresses)
-	GPR argument register save area
-	temporary struct copies
-	overflow argument area              (lowest addresses)
-
-     We set up the following pointers:
-
-        p_fpr: bottom of the FPR area (growing upwards)
-	p_gpr: bottom of the GPR area (growing upwards)
-	p_ov: bottom of the overflow area (growing upwards)
-	p_struct: top of the struct copy area (growing downwards)
-
-     All areas are kept aligned to twice the word size.  */
-
-  int gpr_off = ecif->cif->bytes;
-  int fpr_off = gpr_off + ROUND_SIZE (MAX_GPRARGS * sizeof (long));
-
-  unsigned long long *p_fpr = (unsigned long long *)(stack + fpr_off);
-  unsigned long *p_gpr = (unsigned long *)(stack + gpr_off);
-  unsigned char *p_struct = (unsigned char *)p_gpr;
-  unsigned long *p_ov = (unsigned long *)stack;
-
+  unsigned char *p_struct = (unsigned char *)p_frame;
+  unsigned long *p_gpr = p_frame->gpr_args;
+  unsigned long long *p_fpr = p_frame->fpr_args;
   int n_fpr = 0;
   int n_gpr = 0;
   int n_ov = 0;
-
   ffi_type **ptr;
-  void **p_argv = ecif->avalue;
   int i;
 
   /* If we returning a structure then we set the first parameter register
      to the address of where we are returning this structure.  */
-
-  if (ecif->cif->flags == FFI390_RET_STRUCT)
-    p_gpr[n_gpr++] = (unsigned long) ecif->rvalue;
+  if (cif->flags & FFI390_RET_IN_MEM)
+    p_gpr[n_gpr++] = (unsigned long) rvalue;
 
   /* Now for the arguments.  */
-
-  for (ptr = ecif->cif->arg_types, i = ecif->cif->nargs;
-       i > 0;
-       i--, ptr++, p_argv++)
+  for (ptr = cif->arg_types, i = cif->nargs; i > 0; i--, ptr++, p_argv++)
     {
+      ffi_type *ty = *ptr;
       void *arg = *p_argv;
-      int type = (*ptr)->type;
+      int type = ty->type;
 
 #if FFI_TYPE_LONGDOUBLE != FFI_TYPE_DOUBLE
       /* 16-byte long double is passed like a struct.  */
@@ -206,13 +176,13 @@ ffi_prep_args (unsigned char *stack, extended_cif *ecif)
 	  if (type == FFI_TYPE_COMPLEX)
 	    type = FFI_TYPE_POINTER;
 	  else
-	    type = ffi_check_struct_type (*ptr);
+	    type = ffi_check_struct_type (ty);
 
 	  /* If we pass the struct via pointer, copy the data.  */
 	  if (type == FFI_TYPE_POINTER)
 	    {
-	      p_struct -= ROUND_SIZE ((*ptr)->size);
-	      memcpy (p_struct, (char *)arg, (*ptr)->size);
+	      p_struct -= ROUND_SIZE (ty->size);
+	      memcpy (p_struct, (char *)arg, ty->size);
 	      arg = &p_struct;
 	    }
 	}
@@ -234,7 +204,7 @@ ffi_prep_args (unsigned char *stack, extended_cif *ecif)
 
 	  case FFI_TYPE_FLOAT:
 	    if (n_fpr < MAX_FPRARGS)
-	      p_fpr[n_fpr++] = (long long) *(unsigned int *) arg << 32;
+	      p_fpr[n_fpr++] = (unsigned long long)*(unsigned int *) arg << 32;
 	    else
 	      p_ov[n_ov++] = *(unsigned int *) arg;
 	    break;
@@ -325,7 +295,7 @@ ffi_prep_args (unsigned char *stack, extended_cif *ecif)
 /*                                                                    */
 /*====================================================================*/
 
-ffi_status
+ffi_status FFI_HIDDEN
 ffi_prep_cif_machdep(ffi_cif *cif)
 {
   size_t struct_size = 0;
@@ -498,32 +468,55 @@ ffi_call_int(ffi_cif *cif,
 	     void *closure)
 {
   int ret_type = cif->flags;
-  extended_cif ecif;
+  size_t rsize = 0, bytes = cif->bytes;
+  unsigned char *stack;
+  struct call_frame *frame;
 
-  ecif.cif    = cif;
-  ecif.avalue = avalue;
-  ecif.rvalue = rvalue;
+  FFI_ASSERT (cif->abi == FFI_SYSV);
 
   /* If we don't have a return value, we need to fake one.  */
   if (rvalue == NULL)
     {
-      if (ret_type == FFI390_RET_STRUCT)
-	ecif.rvalue = alloca (cif->rtype->size);
+      if (ret_type & FFI390_RET_IN_MEM)
+	rsize = cif->rtype->size;
       else
 	ret_type = FFI390_RET_VOID;
     }
 
-  switch (cif->abi)
-    {
-      case FFI_SYSV:
-        ffi_call_SYSV (cif->bytes, &ecif, ffi_prep_args,
-		       ret_type, ecif.rvalue, fn, closure);
-        break;
+  /* The stack space will be filled with those areas:
 
-      default:
-        FFI_ASSERT (0);
-        break;
-    }
+	dummy structure return		    (highest addresses)
+	  FPR argument register save area
+	  GPR argument register save area
+	stack frame for ffi_call_SYSV
+	temporary struct copies
+	overflow argument area              (lowest addresses)
+
+     We set up the following pointers:
+
+        p_fpr: bottom of the FPR area (growing upwards)
+	p_gpr: bottom of the GPR area (growing upwards)
+	p_ov: bottom of the overflow area (growing upwards)
+	p_struct: top of the struct copy area (growing downwards)
+
+     All areas are kept aligned to twice the word size.  */
+
+  stack = alloca (bytes + sizeof(struct call_frame) + rsize);
+  frame = (struct call_frame *)(stack + bytes);
+  if (rsize)
+    rvalue = frame + 1;
+
+  /* Assuming that the current function has the standard call frame,
+     we can maintain the linked list like so.  */
+  frame->back_chain = __builtin_dwarf_cfa() - sizeof(struct call_frame);
+
+  /* Pass the outgoing stack frame in the r15 save slot.  */
+  frame->gpr_save[8] = (unsigned long)(stack - sizeof(struct call_frame));
+
+  /* Fill in all of the argument stuff.  */
+  ffi_prep_args (cif, rvalue, avalue, (unsigned long *)stack, frame);
+
+  ffi_call_SYSV (frame, ret_type & FFI360_RET_MASK, rvalue, fn, closure);
 }
 
 void
@@ -549,8 +542,7 @@ ffi_call_go (ffi_cif *cif, void (*fn)(void), void *rvalue,
 /*                                                                    */
 /*====================================================================*/
 
-FFI_HIDDEN
-void
+void FFI_HIDDEN
 ffi_closure_helper_SYSV (ffi_cif *cif,
 			 void (*fun)(ffi_cif*,void*,void**,void*),
 			 void *user_data,
@@ -572,18 +564,15 @@ ffi_closure_helper_SYSV (ffi_cif *cif,
   int i;
 
   /* Allocate buffer for argument list pointers.  */
-
   p_arg = avalue = alloca (cif->nargs * sizeof (void *));
 
   /* If we returning a structure, pass the structure address
      directly to the target function.  Otherwise, have the target
      function store the return value to the GPR save area.  */
-
-  if (cif->flags == FFI390_RET_STRUCT)
+  if (cif->flags & FFI390_RET_IN_MEM)
     rvalue = (void *) p_gpr[n_gpr++];
 
   /* Now for the arguments.  */
-
   for (ptr = cif->arg_types, i = cif->nargs; i > 0; i--, p_arg++, ptr++)
     {
       int deref_struct_pointer = 0;
@@ -611,11 +600,13 @@ ffi_closure_helper_SYSV (ffi_cif *cif,
 
       /* Pointers are passed like UINTs of the same size.  */
       if (type == FFI_TYPE_POINTER)
+	{
 #ifdef __s390x__
-	type = FFI_TYPE_UINT64;
+	  type = FFI_TYPE_UINT64;
 #else
-	type = FFI_TYPE_UINT32;
+	  type = FFI_TYPE_UINT32;
 #endif
+	}
 
       /* Now handle all primitive int/float data types.  */
       switch (type)
diff --git a/src/s390/internal.h b/src/s390/internal.h
new file mode 100644
index 0000000..b875578
--- /dev/null
+++ b/src/s390/internal.h
@@ -0,0 +1,11 @@
+/* If these values change, sysv.S must be adapted!  */
+#define FFI390_RET_DOUBLE	0
+#define FFI390_RET_FLOAT	1
+#define FFI390_RET_INT64	2
+#define FFI390_RET_INT32	3
+#define FFI390_RET_VOID		4
+
+#define FFI360_RET_MASK		7
+#define FFI390_RET_IN_MEM	8
+
+#define FFI390_RET_STRUCT	(FFI390_RET_VOID | FFI390_RET_IN_MEM)
diff --git a/src/s390/sysv.S b/src/s390/sysv.S
index b01a7eb..df9083e 100644
--- a/src/s390/sysv.S
+++ b/src/s390/sysv.S
@@ -29,221 +29,140 @@
 #include <fficonfig.h>
 #include <ffi.h>
 
-#ifndef __s390x__
+	.text
 
-.text
+#ifndef __s390x__
 
-	# r2:	cif->bytes
-	# r3:	&ecif
-	# r4:	ffi_prep_args
-	# r5:	ret_type
-	# r6:	ecif.rvalue
-	# ov:	fn
-	# ov+8:	closure
+	# r2:	frame
+	# r3:	ret_type
+	# r4:	ret_addr
+	# r5:	fun
+	# r6:	closure
 
 	# This assumes we are using gas.
+	.balign	8
 	.globl	ffi_call_SYSV
 	FFI_HIDDEN(ffi_call_SYSV)
 	.type	ffi_call_SYSV,%function
 ffi_call_SYSV:
 	.cfi_startproc
-	stm	%r6,%r15,24(%r15)		# Save registers
-	.cfi_offset r6, -72
-	.cfi_offset r7, -68
-	.cfi_offset r8, -64
-	.cfi_offset r9, -60
-	.cfi_offset r10, -56
-	.cfi_offset r11, -52
-	.cfi_offset r12, -48
-	.cfi_offset r13, -44
-	.cfi_offset r14, -40
-	.cfi_offset r15, -36
-	basr	%r13,0				# Set up base register
+	st	%r6,44(%r2)			# Save registers
+	stm	%r12,%r14,48(%r2)
+	lr	%r13,%r2			# Install frame pointer
+	.cfi_rel_offset r6, 44
+	.cfi_rel_offset r12, 48
+	.cfi_rel_offset r13, 52
+	.cfi_rel_offset r14, 56
+	.cfi_def_cfa_register r13
+	l	%r15,60(%r2)			# Set up outgoing stack
+	basr	%r14,0				# Set up base register
 .Lbase:
-	lr	%r11,%r15			# Set up frame pointer
-	.cfi_def_cfa_register r11
-	sr	%r15,%r2
-	ahi	%r15,-96-48			# Allocate stack
-	lr	%r8,%r6				# Save ecif.rvalue
-	sr	%r9,%r9
-	ic	%r9,.Ltable-.Lbase(%r13,%r5)	# Load epilog address
-	l	%r7,96(%r11)			# Load function address
-	st	%r11,0(%r15)			# Set up back chain
-	ahi	%r11,-48			# Register save area
-	.cfi_adjust_cfa_offset 48
-
-	la	%r2,96(%r15)			# Save area
-						# r3 already holds &ecif
-	basr	%r14,%r4			# Call ffi_prep_args
-
-	l	%r0,96+48+4(%r11)		# Go closure -> static chain
-	lm	%r2,%r6,0(%r11)			# Load arguments
-	ld	%f0,32(%r11)
-	ld	%f2,40(%r11)
-	la	%r14,0(%r13,%r9)		# Set return address
-	br	%r7				# ... and call function
-
-.LretNone:					# Return void
-	l	%r4,48+56(%r11)
-	lm	%r6,%r15,48+24(%r11)
-	.cfi_remember_state
-	.cfi_restore 15
-	.cfi_restore 14
-	.cfi_restore 13
-	.cfi_restore 12
-	.cfi_restore 11
-	.cfi_restore 10
-	.cfi_restore 9
-	.cfi_restore 8
-	.cfi_restore 7
-	.cfi_restore 6
-	.cfi_def_cfa r15, 96
-	br	%r4
-	.cfi_restore_state
-	# This nopr is necessary so that the .cfi instructions between the br
-	# above and the label below get executed.  See execute_cfa_program() in
-	# the Gcc source code, libgcc/unwind-dw2.c.
-	nopr
-
-.LretFloat:
-	l	%r4,48+56(%r11)
-	ste	%f0,0(%r8)			# Return float
-	lm	%r6,%r15,48+24(%r11)
-	.cfi_remember_state
-	.cfi_restore 15
-	.cfi_restore 14
-	.cfi_restore 13
-	.cfi_restore 12
-	.cfi_restore 11
-	.cfi_restore 10
-	.cfi_restore 9
-	.cfi_restore 8
-	.cfi_restore 7
-	.cfi_restore 6
-	.cfi_def_cfa r15, 96
-	br	%r4
-	.cfi_restore_state
-	# See comment on the nopr above.
-	nopr
-
-.LretDouble:
-	l	%r4,48+56(%r11)
-	std	%f0,0(%r8)			# Return double
-	lm	%r6,%r15,48+24(%r11)
-	.cfi_remember_state
-	.cfi_restore 15
-	.cfi_restore 14
-	.cfi_restore 13
-	.cfi_restore 12
-	.cfi_restore 11
-	.cfi_restore 10
-	.cfi_restore 9
-	.cfi_restore 8
-	.cfi_restore 7
-	.cfi_restore 6
-	.cfi_def_cfa r15, 96
-	br	%r4
-	.cfi_restore_state
-	# See comment on the nopr above.
-	nopr
-
-.LretInt32:
-	l	%r4,48+56(%r11)
-	st	%r2,0(%r8)			# Return int
-	lm	%r6,%r15,48+24(%r11)
-	.cfi_remember_state
-	.cfi_restore 15
-	.cfi_restore 14
-	.cfi_restore 13
-	.cfi_restore 12
-	.cfi_restore 11
-	.cfi_restore 10
-	.cfi_restore 9
-	.cfi_restore 8
-	.cfi_restore 7
-	.cfi_restore 6
-	.cfi_def_cfa r15, 96
-	br	%r4
-	.cfi_restore_state
-	# See comment on the nopr above.
-	nopr
-
-.LretInt64:
-	l	%r4,48+56(%r11)
-	stm	%r2,%r3,0(%r8)			# Return long long
-	lm	%r6,%r15,48+24(%r11)
-	.cfi_remember_state
-	.cfi_restore 15
+	sla	%r3,3				# ret_type *= 8
+	lr	%r12,%r4			# Save ret_addr
+	lr	%r1,%r5				# Save fun
+	lr	%r0,%r6				# Install static chain
+	la	%r14,.Ltable-.Lbase(%r14,%r3)	# Set return address
+	lm	%r2,%r6,8(%r13)			# Load arguments
+	ld	%f0,64(%r13)
+	ld	%f2,72(%r13)
+	st	%r13,0(%r15)			# Set up back chain
+	br	%r1				# ... and call function
+
+	.balign	8
+.Ltable:
+# FFI390_RET_DOUBLE
+	std	%f0,0(%r12)
+	j	.Ldone
+
+	.balign	8
+# FFI390_RET_FLOAT
+	ste	%f0,0(%r12)
+	j	.Ldone
+
+	.balign	8
+# FFI390_RET_INT64
+	st	%r3,4(%r12)
+	nop
+	# fallthru
+
+	.balign	8
+# FFI390_RET_INT32
+	st	%r2,0(%r12)
+	nop
+	# fallthru
+
+	.balign	8
+# FFI390_RET_VOID
+.Ldone:
+	l	%r14,56(%r13)
+	l	%r12,48(%r13)
+	l	%r6,44(%r13)
+	l	%r13,52(%r13)
 	.cfi_restore 14
 	.cfi_restore 13
 	.cfi_restore 12
-	.cfi_restore 11
-	.cfi_restore 10
-	.cfi_restore 9
-	.cfi_restore 8
-	.cfi_restore 7
 	.cfi_restore 6
 	.cfi_def_cfa r15, 96
-	br	%r4
+	br	%r14
 	.cfi_endproc
+	.size	 ffi_call_SYSV,.-ffi_call_SYSV
 
-.Ltable:
-	.byte	.LretNone-.Lbase		# FFI390_RET_VOID
-	.byte	.LretNone-.Lbase		# FFI390_RET_STRUCT
-	.byte	.LretFloat-.Lbase		# FFI390_RET_FLOAT
-	.byte	.LretDouble-.Lbase		# FFI390_RET_DOUBLE
-	.byte	.LretInt32-.Lbase		# FFI390_RET_INT32
-	.byte	.LretInt64-.Lbase		# FFI390_RET_INT64
-
-.ffi_call_SYSV_end:
-	.size	 ffi_call_SYSV,.ffi_call_SYSV_end-ffi_call_SYSV
 
+	.balign	8
+	.globl	ffi_go_closure_SYSV
+	FFI_HIDDEN(ffi_go_closure_SYSV)
+	.type	ffi_go_closure_SYSV,%function
+ffi_go_closure_SYSV:
+	.cfi_startproc
+	stm	%r2,%r6,8(%r15)			# Save arguments
+	lr	%r4,%r0				# Load closure -> user_data
+	l	%r2,4(%r4)			#   ->cif
+	l	%r3,8(%r4)			#   ->fun
+	j	.Ldoclosure
+	.cfi_endproc
 
+	.balign	8
 	.globl	ffi_closure_SYSV
 	FFI_HIDDEN(ffi_closure_SYSV)
 	.type	ffi_closure_SYSV,%function
 ffi_closure_SYSV:
 	.cfi_startproc
 	stm	%r2,%r6,8(%r15)			# Save arguments
-	.cfi_offset r6, -72
 	lr	%r4,%r0				# Closure
 	l	%r2,16(%r4)			#   ->cif
 	l	%r3,20(%r4)			#   ->fun
 	l	%r4,24(%r4)			#   ->user_data
 .Ldoclosure:
 	stm	%r12,%r15,48(%r15)		# Save registers
-	.cfi_offset r12, -48
-	.cfi_offset r13, -44
-	.cfi_offset r14, -40
-	.cfi_offset r15, -36
+	lr	%r12,%r15
+	.cfi_def_cfa_register r12
+	.cfi_rel_offset r6, 24
+	.cfi_rel_offset r12, 48
+	.cfi_rel_offset r13, 52
+	.cfi_rel_offset r14, 56
+	.cfi_rel_offset r15, 60
 	basr	%r13,0				# Set up base register
 .Lcbase:
-	std	%f0,64(%r15)
-	std	%f2,72(%r15)
-	lr	%r1,%r15			# Set up stack frame
-	ahi	%r15,-104
-	.cfi_adjust_cfa_offset 104
-	l	%r12,.Lchelper-.Lcbase(%r13)	# Get helper function
-	la	%r5,96(%r1)
-	st	%r5,96(%r15)			# Overflow
-	la	%r5,8(%r1)			# GPRs
-	la	%r6,64(%r1)			# FPRs
-	st	%r1,0(%r15)			# Set up back chain
-
-	bas	%r14,0(%r12,%r13)		# Call helper
-
-	l	%r4,104+56(%r15)
-	ld	%f0,104+64(%r15)		# Load return registers
-	lm	%r2,%r3,104+8(%r15)
-	l	%r6,104+24(%r15)		# Restore saved registers
-	.cfi_restore r6
-	lm	%r12,%r15,104+48(%r15)
-	.cfi_adjust_cfa_offset -104
-	.cfi_restore r12
-	.cfi_restore r13
-	.cfi_restore r14
-	.cfi_restore r15
-	br	%r4
+	ahi	%r15,-96-8			# Set up stack frame
+	l	%r1,.Lchelper-.Lcbase(%r13)	# Get helper function
+	st	%r12,0(%r15)			# Set up back chain
+
+	std	%f0,64(%r12)			# Save fp arguments
+	std	%f2,72(%r12)
+
+	la	%r5,96(%r12)			# Overflow
+	st	%r5,96(%r15)
+	la	%r6,64(%r12)			# FPRs
+	la	%r5,8(%r12)			# GPRs
+	bas	%r14,0(%r1,%r13)		# Call helper
+
+	lr	%r15,%r12
+	.cfi_def_cfa_register r15
+	lm	%r12,%r14,48(%r12)		# Restore saved registers
+	l	%r6,24(%r15)
+	ld	%f0,64(%r15)			# Load return registers
+	lm	%r2,%r3,8(%r15)
+	br	%r14
 	.cfi_endproc
 
 	.align 4
@@ -251,234 +170,143 @@ ffi_closure_SYSV:
 	.long	ffi_closure_helper_SYSV-.Lcbase
 
 
-.ffi_closure_SYSV_end:
-	.size	 ffi_closure_SYSV,.ffi_closure_SYSV_end-ffi_closure_SYSV
-
-
-	.globl	ffi_go_closure_SYSV
-	FFI_HIDDEN(ffi_go_closure_SYSV)
-	.type	ffi_go_closure_SYSV,%function
-ffi_go_closure_SYSV:
-	.cfi_startproc
-	stm	%r2,%r6,8(%r15)			# Save arguments
-	.cfi_offset r6, -72
-	lr	%r4,%r0				# Load closure -> user_data
-	l	%r2,4(%r4)			#   ->cif
-	l	%r3,8(%r4)			#   ->fun
-	j	.Ldoclosure
-	.cfi_endproc
+	.size	 ffi_closure_SYSV,.-ffi_closure_SYSV
 
 #else
 
-.text
-
-	# r2:	cif->bytes
-	# r3:	&ecif
-	# r4:	ffi_prep_args
-	# r5:	ret_type
-	# r6:	ecif.rvalue
-	# ov:	fn
-	# ov+8:	closure
+	# r2:	frame
+	# r3:	ret_type
+	# r4:	ret_addr
+	# r5:	fun
+	# r6:	closure
 
 	# This assumes we are using gas.
+	.balign	8
 	.globl	ffi_call_SYSV
 	FFI_HIDDEN(ffi_call_SYSV)
 	.type	ffi_call_SYSV,%function
 ffi_call_SYSV:
 	.cfi_startproc
-	stmg	%r6,%r15,48(%r15)		# Save registers
-	.cfi_offset r6, -112
-	.cfi_offset r7, -104
-	.cfi_offset r8, -96
-	.cfi_offset r9, -88
-	.cfi_offset r10, -80
-	.cfi_offset r11, -72
-	.cfi_offset r12, -64
-	.cfi_offset r13, -56
-	.cfi_offset r14, -48
-	.cfi_offset r15, -40
-	larl	%r13,.Lbase			# Set up base register
-	lgr	%r11,%r15			# Set up frame pointer
-	.cfi_def_cfa_register r11
-	sgr	%r15,%r2
-	aghi	%r15,-160-80			# Allocate stack
-	lgr	%r8,%r6				# Save ecif.rvalue
-	llgc	%r9,.Ltable-.Lbase(%r13,%r5)	# Load epilog address
-	lg	%r7,160(%r11)			# Load function address
-	stg	%r11,0(%r15)			# Set up back chain
-	aghi	%r11,-80			# Register save area
-	.cfi_adjust_cfa_offset 80
-
-	la	%r2,160(%r15)			# Save area
-						# r3 already holds &ecif
-	basr	%r14,%r4			# Call ffi_prep_args
-
-	lg	%r0,160+80+8(%r11)		# Go closure -> static chain
-	lmg	%r2,%r6,0(%r11)			# Load arguments
-	ld	%f0,48(%r11)
-	ld	%f2,56(%r11)
-	ld	%f4,64(%r11)
-	ld	%f6,72(%r11)
-	la	%r14,0(%r13,%r9)		# Set return address
-	br	%r7				# ... and call function
-
-.Lbase:
-.LretNone:					# Return void
-	lg	%r4,80+112(%r11)
-	lmg	%r6,%r15,80+48(%r11)
-	.cfi_remember_state
-	.cfi_restore r15
-	.cfi_restore r14
-	.cfi_restore r13
-	.cfi_restore r12
-	.cfi_restore r11
-	.cfi_restore r10
-	.cfi_restore r9
-	.cfi_restore r8
-	.cfi_restore r7
-	.cfi_restore r6
-	.cfi_def_cfa r15, 160
-	br	%r4
-	.cfi_restore_state
-	# This nopr is necessary so that the .cfi instructions between the br
-	# above and the label below get executed.  See execute_cfa_program() in
-	# the Gcc source code, libgcc/unwind-dw2.c.
-	nopr
-
-.LretFloat:
-	lg	%r4,80+112(%r11)
-	ste	%f0,0(%r8)			# Return float
-	lmg	%r6,%r15,80+48(%r11)
-	.cfi_remember_state
-	.cfi_restore r6
-	.cfi_restore r7
-	.cfi_restore r8
-	.cfi_restore r9
-	.cfi_restore r10
-	.cfi_restore r11
-	.cfi_restore r12
-	.cfi_restore r13
-	.cfi_restore r14
-	.cfi_restore r15
-	.cfi_def_cfa r15, 160
-	br	%r4
-	.cfi_restore_state
-	# See comment on the nopr above.
-	nopr
-
-.LretDouble:
-	lg	%r4,80+112(%r11)
-	std	%f0,0(%r8)			# Return double
-	lmg	%r6,%r15,80+48(%r11)
-	.cfi_remember_state
-	.cfi_restore r15
-	.cfi_restore r14
-	.cfi_restore r13
-	.cfi_restore r12
-	.cfi_restore r11
-	.cfi_restore r10
-	.cfi_restore r9
-	.cfi_restore r8
-	.cfi_restore r7
-	.cfi_restore r6
-	.cfi_def_cfa r15, 160
-	br	%r4
-	.cfi_restore_state
-	# See comment on the nopr above.
-	nopr
-
-.LretInt64:
-	lg	%r4,80+112(%r11)
-	stg	%r2,0(%r8)			# Return long
-	lmg	%r6,%r15,80+48(%r11)
-	.cfi_restore r15
+	stg	%r6,88(%r2)			# Save registers
+	stmg	%r12,%r14,96(%r2)
+	lgr	%r13,%r2			# Install frame pointer
+	.cfi_rel_offset r6, 88
+	.cfi_rel_offset r12, 96
+	.cfi_rel_offset r13, 104
+	.cfi_rel_offset r14, 112
+	.cfi_def_cfa_register r13
+	lg	%r15,120(%r2)			# Set up outgoing stack
+	larl	%r14,.Ltable			# Set up return address
+	slag	%r3,%r3,3			# ret_type *= 8
+	lgr	%r12,%r4			# Save ret_addr
+	lgr	%r1,%r5				# Save fun
+	lgr	%r0,%r6				# Install static chain
+	agr	%r14,%r3
+	lmg	%r2,%r6,16(%r13)		# Load arguments
+	ld	%f0,128(%r13)
+	ld	%f2,136(%r13)
+	ld	%f4,144(%r13)
+	ld	%f6,152(%r13)
+	stg	%r13,0(%r15)			# Set up back chain
+	br	%r1				# ... and call function
+
+	.balign	8
+.Ltable:
+# FFI390_RET_DOUBLE
+	std	%f0,0(%r12)
+	j	.Ldone
+
+	.balign	8
+# FFI390_RET_DOUBLE
+	ste	%f0,0(%r12)
+	j	.Ldone
+
+	.balign	8
+# FFI390_RET_INT64
+	stg	%r2,0(%r12)
+
+	.balign	8
+# FFI390_RET_INT32
+	# Never used, as we always store type ffi_arg.
+	# But the stg above is 6 bytes and we cannot
+	# jump around this case, so fall through.
+	nop
+	nop
+
+	.balign	8
+# FFI390_RET_VOID
+.Ldone:
+	lg	%r14,112(%r13)
+	lg	%r12,96(%r13)
+	lg	%r6,88(%r13)
+	lg	%r13,104(%r13)
 	.cfi_restore r14
 	.cfi_restore r13
 	.cfi_restore r12
-	.cfi_restore r11
-	.cfi_restore r10
-	.cfi_restore r9
-	.cfi_restore r8
-	.cfi_restore r7
 	.cfi_restore r6
 	.cfi_def_cfa r15, 160
-	br	%r4
+	br	%r14
 	.cfi_endproc
+	.size	 ffi_call_SYSV,.-ffi_call_SYSV
 
-.Ltable:
-	.byte	.LretNone-.Lbase		# FFI390_RET_VOID
-	.byte	.LretNone-.Lbase		# FFI390_RET_STRUCT
-	.byte	.LretFloat-.Lbase		# FFI390_RET_FLOAT
-	.byte	.LretDouble-.Lbase		# FFI390_RET_DOUBLE
-	.byte	0				# int32 retval not supported
-	.byte	.LretInt64-.Lbase		# FFI390_RET_INT64
 
-.ffi_call_SYSV_end:
-	.size	 ffi_call_SYSV,.ffi_call_SYSV_end-ffi_call_SYSV
+	.balign	8
+	.globl	ffi_go_closure_SYSV
+	FFI_HIDDEN(ffi_go_closure_SYSV)
+	.type	ffi_go_closure_SYSV,%function
+ffi_go_closure_SYSV:
+	.cfi_startproc
+	stmg	%r2,%r6,16(%r15)		# Save arguments
+	lgr	%r4,%r0				# Load closure -> user_data
+	lg	%r2,8(%r4)			#   ->cif
+	lg	%r3,16(%r4)			#   ->fun
+	j	.Ldoclosure
+	.cfi_endproc
+	.size	 ffi_go_closure_SYSV,.-ffi_go_closure_SYSV
 
 
+	.balign	8
 	.globl	ffi_closure_SYSV
 	FFI_HIDDEN(ffi_closure_SYSV)
 	.type	ffi_closure_SYSV,%function
 ffi_closure_SYSV:
 	.cfi_startproc
 	stmg	%r2,%r6,16(%r15)		# Save arguments
-	.cfi_offset r6, -112
 	lgr	%r4,%r0				# Load closure
 	lg	%r2,32(%r4)			#   ->cif
 	lg	%r3,40(%r4)			#   ->fun
 	lg	%r4,48(%r4)			#   ->user_data
 .Ldoclosure:
-	stmg	%r14,%r15,112(%r15)		# Save registers
-	.cfi_offset r14, -48
-	.cfi_offset r15, -40
-	std	%f0,128(%r15)			# Save arguments
-	std	%f2,136(%r15)
-	std	%f4,144(%r15)
-	std	%f6,152(%r15)
-	lgr	%r1,%r15			# Set up stack frame
-	aghi	%r15,-168
-	.cfi_adjust_cfa_offset 168
-	la	%r5,160(%r1)
-	stg	%r5,160(%r15)			# Overflow
-	la	%r5,16(%r1)			# GPRs
-	la	%r6,128(%r1)			# FPRs
-	stg	%r1,0(%r15)			# Set up back chain
-
+	stmg	%r13,%r15,104(%r15)		# Save registers
+	lgr	%r13,%r15
+	.cfi_def_cfa_register r13
+	.cfi_rel_offset r6, 48
+	.cfi_rel_offset r13, 104
+	.cfi_rel_offset r14, 112
+	.cfi_rel_offset r15, 120
+	aghi	%r15,-160-16			# Set up stack frame
+	stg	%r13,0(%r15)			# Set up back chain
+
+	std	%f0,128(%r13)			# Save fp arguments
+	std	%f2,136(%r13)
+	std	%f4,144(%r13)
+	std	%f6,152(%r13)
+	la	%r5,160(%r13)			# Overflow
+	stg	%r5,160(%r15)
+	la	%r6,128(%r13)			# FPRs
+	la	%r5,16(%r13)			# GPRs
 	brasl	%r14,ffi_closure_helper_SYSV	# Call helper
 
-	ld	%f0,168+128(%r15)		# Load return registers
-	lg	%r2,168+16(%r15)
-	lg	%r6,168+48(%r15)		# Restore saved registers
-	.cfi_restore r6
-	lmg	%r14,%r15,168+112(%r15)
-	.cfi_restore r14
-	.cfi_restore r15
-	.cfi_adjust_cfa_offset -168
+	lgr	%r15,%r13
+	.cfi_def_cfa_register r15
+	lmg	%r13,%r14,104(%r13)		# Restore saved registers
+	lg	%r6,48(%r15)
+	ld	%f0,128(%r15)			# Load return registers
+	lg	%r2,16(%r15)
 	br	%r14
 	.cfi_endproc
-
-.ffi_closure_SYSV_end:
-	.size	 ffi_closure_SYSV,.ffi_closure_SYSV_end-ffi_closure_SYSV
-
-
-	.globl	ffi_go_closure_SYSV
-	FFI_HIDDEN(ffi_go_closure_SYSV)
-	.type	ffi_go_closure_SYSV,%function
-ffi_go_closure_SYSV:
-	.cfi_startproc
-	stmg	%r2,%r6,16(%r15)		# Save arguments
-	.cfi_offset r6, -112
-	lgr	%r4,%r0				# Load closure -> user_data
-	lg	%r2,8(%r4)			#   ->cif
-	lg	%r3,16(%r4)			#   ->fun
-	j	.Ldoclosure
-	.cfi_endproc
-
-.ffi_go_closure_SYSV_end:
-	.size	 ffi_go_closure_SYSV,.ffi_go_closure_SYSV_end-ffi_go_closure_SYSV
-
-#endif
+	.size	 ffi_closure_SYSV,.-ffi_closure_SYSV
+#endif /* !s390x */
 
 #if defined __ELF__ && defined __linux__
 	.section	.note.GNU-stack,"",@progbits
-- 
2.1.0

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 0/4] s390 improvements
@ 2014-12-18 21:33 Richard Henderson
  2014-12-18 21:33 ` [PATCH 1/4] s390: Kill trailing whitespace Richard Henderson
                   ` (6 more replies)
  0 siblings, 7 replies; 16+ messages in thread
From: Richard Henderson @ 2014-12-18 21:33 UTC (permalink / raw)
  To: libffi-discuss; +Cc: Ulrich.Weigand, vogt, krebbel

This is relative to Dominik's patch from the 16th.  The complete
tree can be found at

	git://github.com/rth7680/libffi.git s390

Mostly relevant is patch 3, which converts the s390 port to the
more modern arrangement where there's no callback into ffi_prep_args.

If everyone's happy with this, I'll give Anthony the pull request.


r~


Richard Henderson (4):
  s390: Kill trailing whitespace
  s390: Avoid aliasing warnings
  s390: Reorganize assembly
  s390: Use pc-relative insns in 31-bit mode

 configure.ac        |  17 ++
 src/s390/ffi.c      | 310 +++++++++++++--------------
 src/s390/internal.h |  11 +
 src/s390/sysv.S     | 607 +++++++++++++++++++---------------------------------
 4 files changed, 403 insertions(+), 542 deletions(-)
 create mode 100644 src/s390/internal.h

-- 
2.1.0

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 2/4] s390: Avoid aliasing warnings
  2014-12-18 21:33 [PATCH 0/4] s390 improvements Richard Henderson
                   ` (2 preceding siblings ...)
  2014-12-18 21:33 ` [PATCH 3/4] s390: Reorganize assembly Richard Henderson
@ 2014-12-18 21:33 ` Richard Henderson
  2014-12-19 15:07 ` [PATCH 0/4] s390 improvements Ulrich Weigand
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 16+ messages in thread
From: Richard Henderson @ 2014-12-18 21:33 UTC (permalink / raw)
  To: libffi-discuss; +Cc: Ulrich.Weigand, vogt, krebbel, Richard Henderson

From: Richard Henderson <rth@redhat.com>

---
 src/s390/ffi.c | 33 ++++++++++++++++-----------------
 1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/src/s390/ffi.c b/src/s390/ffi.c
index b0c3092..c06b79e 100644
--- a/src/s390/ffi.c
+++ b/src/s390/ffi.c
@@ -762,29 +762,28 @@ ffi_prep_closure_loc (ffi_closure *closure,
 		      void *user_data,
 		      void *codeloc)
 {
-  if (cif->abi != FFI_SYSV)
-    return FFI_BAD_ABI;
-
+  static unsigned short const template[] = {
+    0x0d10,			/* basr %r1,0 */
 #ifndef __s390x__
-  *(short *)&closure->tramp [0] = 0x0d10;   /* basr %r1,0 */
-  *(short *)&closure->tramp [2] = 0x9801;   /* lm %r0,%r1,6(%r1) */
-  *(short *)&closure->tramp [4] = 0x1006;
-  *(short *)&closure->tramp [6] = 0x07f1;   /* br %r1 */
-  *(long  *)&closure->tramp [8] = (long)codeloc;
-  *(long  *)&closure->tramp[12] = (long)&ffi_closure_SYSV;
+    0x9801, 0x1006,		/* lm %r0,%r1,6(%r1) */
 #else
-  *(short *)&closure->tramp [0] = 0x0d10;   /* basr %r1,0 */
-  *(short *)&closure->tramp [2] = 0xeb01;   /* lmg %r0,%r1,14(%r1) */
-  *(short *)&closure->tramp [4] = 0x100e;
-  *(short *)&closure->tramp [6] = 0x0004;
-  *(short *)&closure->tramp [8] = 0x07f1;   /* br %r1 */
-  *(long  *)&closure->tramp[16] = (long)codeloc;
-  *(long  *)&closure->tramp[24] = (long)&ffi_closure_SYSV;
+    0xeb01, 0x100e, 0x0004,	/* lmg %r0,%r1,14(%r1) */
 #endif
+    0x07f1			/* br %r1 */
+  };
+
+  unsigned long *tramp = (unsigned long *)&closure->tramp;
+
+  if (cif->abi != FFI_SYSV)
+    return FFI_BAD_ABI;
+
+  memcpy (tramp, template, sizeof(template));
+  tramp[2] = (unsigned long)codeloc;
+  tramp[3] = (unsigned long)&ffi_closure_SYSV;
 
   closure->cif = cif;
-  closure->user_data = user_data;
   closure->fun = fun;
+  closure->user_data = user_data;
 
   return FFI_OK;
 }
-- 
2.1.0

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 4/4] s390: Use pc-relative insns in 31-bit mode
  2014-12-18 21:33 [PATCH 0/4] s390 improvements Richard Henderson
  2014-12-18 21:33 ` [PATCH 1/4] s390: Kill trailing whitespace Richard Henderson
@ 2014-12-18 21:33 ` Richard Henderson
  2014-12-18 21:33 ` [PATCH 3/4] s390: Reorganize assembly Richard Henderson
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 16+ messages in thread
From: Richard Henderson @ 2014-12-18 21:33 UTC (permalink / raw)
  To: libffi-discuss; +Cc: Ulrich.Weigand, vogt, krebbel, Richard Henderson

From: Richard Henderson <rth@redhat.com>

It's silly to stick to esa/390 features when the compiler won't.
Detect when brasl and larl are used by the compiler and then use
them in the assembly.
---
 configure.ac    | 17 +++++++++++++++++
 src/s390/sysv.S | 19 +++++++++++++++++--
 2 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/configure.ac b/configure.ac
index d5414f9..2941cd7 100644
--- a/configure.ac
+++ b/configure.ac
@@ -183,6 +183,23 @@ if test x$TARGET = xX86 || test x$TARGET = xX86_WIN32 || test x$TARGET = xX86_64
     fi
 fi
 
+if test x$TARGET = xS390; then
+    AC_CACHE_CHECK([compiler uses zarch features],
+	libffi_cv_as_s390_zarch, [
+	libffi_cv_as_s390_zarch=no
+	echo 'void foo(void) { bar(); bar(); }' > conftest.c
+	if $CC $CFLAGS -S conftest.c > /dev/null 2>&1; then
+	    if grep -q brasl conftest.s; then
+	        libffi_cv_as_s390_zarch=yes
+	    fi
+	fi
+	])
+    if test "x$libffi_cv_as_s390_zarch" = xyes; then
+	AC_DEFINE(HAVE_AS_S390_ZARCH, 1,
+		  [Define if the compiler uses zarch features.])
+    fi
+fi
+
 # On PaX enable kernels that have MPROTECT enable we can't use PROT_EXEC.
 AC_ARG_ENABLE(pax_emutramp,
   [  --enable-pax_emutramp       enable pax emulated trampolines, for we can't use PROT_EXEC],
diff --git a/src/s390/sysv.S b/src/s390/sysv.S
index df9083e..1869269 100644
--- a/src/s390/sysv.S
+++ b/src/s390/sysv.S
@@ -55,13 +55,21 @@ ffi_call_SYSV:
 	.cfi_rel_offset r14, 56
 	.cfi_def_cfa_register r13
 	l	%r15,60(%r2)			# Set up outgoing stack
+#ifdef HAVE_AS_S390_ZARCH
+	larl	%r14,.Ltable
+#else
 	basr	%r14,0				# Set up base register
 .Lbase:
+#endif
 	sla	%r3,3				# ret_type *= 8
 	lr	%r12,%r4			# Save ret_addr
 	lr	%r1,%r5				# Save fun
 	lr	%r0,%r6				# Install static chain
+#ifdef HAVE_AS_S390_ZARCH
+	la	%r14,0(%r14,%r3)		# Set return address
+#else
 	la	%r14,.Ltable-.Lbase(%r14,%r3)	# Set return address
+#endif
 	lm	%r2,%r6,8(%r13)			# Load arguments
 	ld	%f0,64(%r13)
 	ld	%f2,72(%r13)
@@ -141,10 +149,12 @@ ffi_closure_SYSV:
 	.cfi_rel_offset r13, 52
 	.cfi_rel_offset r14, 56
 	.cfi_rel_offset r15, 60
+#ifndef HAVE_AS_S390_ZARCH
 	basr	%r13,0				# Set up base register
 .Lcbase:
-	ahi	%r15,-96-8			# Set up stack frame
 	l	%r1,.Lchelper-.Lcbase(%r13)	# Get helper function
+#endif
+	ahi	%r15,-96-8			# Set up stack frame
 	st	%r12,0(%r15)			# Set up back chain
 
 	std	%f0,64(%r12)			# Save fp arguments
@@ -154,7 +164,11 @@ ffi_closure_SYSV:
 	st	%r5,96(%r15)
 	la	%r6,64(%r12)			# FPRs
 	la	%r5,8(%r12)			# GPRs
+#ifdef HAVE_AS_S390_ZARCH
+	brasl	%r14,ffi_closure_helper_SYSV
+#else
 	bas	%r14,0(%r1,%r13)		# Call helper
+#endif
 
 	lr	%r15,%r12
 	.cfi_def_cfa_register r15
@@ -165,10 +179,11 @@ ffi_closure_SYSV:
 	br	%r14
 	.cfi_endproc
 
+#ifndef HAVE_AS_S390_ZARCH
 	.align 4
 .Lchelper:
 	.long	ffi_closure_helper_SYSV-.Lcbase
-
+#endif
 
 	.size	 ffi_closure_SYSV,.-ffi_closure_SYSV
 
-- 
2.1.0

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/4] s390 improvements
  2014-12-18 21:33 [PATCH 0/4] s390 improvements Richard Henderson
                   ` (3 preceding siblings ...)
  2014-12-18 21:33 ` [PATCH 2/4] s390: Avoid aliasing warnings Richard Henderson
@ 2014-12-19 15:07 ` Ulrich Weigand
  2014-12-19 15:33   ` Richard Henderson
  2014-12-19 16:43 ` [PATCH 5/4] s390: Inline and tidy ffi_prep_args Richard Henderson
  2014-12-22 11:34 ` [PATCH 0/4] s390 improvements Dominik Vogt
  6 siblings, 1 reply; 16+ messages in thread
From: Ulrich Weigand @ 2014-12-19 15:07 UTC (permalink / raw)
  To: Richard Henderson; +Cc: libffi-discuss, Ulrich.Weigand, vogt, krebbel

Richard Henderson wrote:

> This is relative to Dominik's patch from the 16th.  The complete
> tree can be found at
> 
> 	git://github.com/rth7680/libffi.git s390
> 
> Mostly relevant is patch 3, which converts the s390 port to the
> more modern arrangement where there's no callback into ffi_prep_args.

This is a bit confusing to me.  The assembler routine now does:

        lg      %r15,120(%r2)                   # Set up outgoing stack

without ever restoring the initial stack pointer before returning
to its caller.  This probably works right now since the value loaded
here is determined like that:

  /* Pass the outgoing stack frame in the r15 save slot.  */
  frame->gpr_save[8] = (unsigned long)(stack - sizeof(struct call_frame));

and since "stack" was allocated via alloca and ffi_call_int does not
require an argument save area when calling any of its subroutines,
the stack pointer value computed here should always in fact be
identical to the value %r15 already has at the above location.

Using the 160 bytes below "stack" as register save area for use of the
target function called by ffi_call_SYSV is also only safe if those bytes
are in fact the register save area ffi_call_int provides for its caller,
e.g. again if the value is already identical to %r15.  (If this were
any other value, we might clobber parts of ffi_call_int's stack frame
that it conceivably might still access.)

However, if the procedure only works if the "lg" is a nop, why is it
even done?   Also, the whole setup seems a bit fragile since changes
to ffi_call_int might cause it to need an argument save area ...

Bye,
Ulrich

-- 
  Dr. Ulrich Weigand
  GNU/Linux compilers and toolchain
  Ulrich.Weigand@de.ibm.com

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/4] s390 improvements
  2014-12-19 15:07 ` [PATCH 0/4] s390 improvements Ulrich Weigand
@ 2014-12-19 15:33   ` Richard Henderson
  2014-12-19 16:15     ` Ulrich Weigand
  0 siblings, 1 reply; 16+ messages in thread
From: Richard Henderson @ 2014-12-19 15:33 UTC (permalink / raw)
  To: Ulrich Weigand, Richard Henderson
  Cc: libffi-discuss, Ulrich.Weigand, vogt, krebbel

On 12/19/2014 09:06 AM, Ulrich Weigand wrote:
> Richard Henderson wrote:
> 
>> This is relative to Dominik's patch from the 16th.  The complete
>> tree can be found at
>>
>> 	git://github.com/rth7680/libffi.git s390
>>
>> Mostly relevant is patch 3, which converts the s390 port to the
>> more modern arrangement where there's no callback into ffi_prep_args.
> 
> This is a bit confusing to me.  The assembler routine now does:
> 
>         lg      %r15,120(%r2)                   # Set up outgoing stack
> 
> without ever restoring the initial stack pointer before returning
> to its caller.  This probably works right now since the value loaded
> here is determined like that:
> 
>   /* Pass the outgoing stack frame in the r15 save slot.  */
>   frame->gpr_save[8] = (unsigned long)(stack - sizeof(struct call_frame));
> 
> and since "stack" was allocated via alloca and ffi_call_int does not
> require an argument save area when calling any of its subroutines,
> the stack pointer value computed here should always in fact be
> identical to the value %r15 already has at the above location.
> 
> Using the 160 bytes below "stack" as register save area for use of the
> target function called by ffi_call_SYSV is also only safe if those bytes
> are in fact the register save area ffi_call_int provides for its caller,
> e.g. again if the value is already identical to %r15.  (If this were
> any other value, we might clobber parts of ffi_call_int's stack frame
> that it conceivably might still access.)
> 
> However, if the procedure only works if the "lg" is a nop, why is it
> even done?   Also, the whole setup seems a bit fragile since changes
> to ffi_call_int might cause it to need an argument save area ...

The stack frame we install is created with alloca, and so we know for a fact
that ffi_call_int must be using a frame pointer to hold its own frame.  Since
we do not adjust %r15 on the way out of ffi_call_SYSV, we leave the stack frame
chain intact.  If there were another function for ffi_call_int to call after
ffi_call_SYSV (but there's not), the outgoing 160 byte save area would be present.

It's true that the load of %r15 is now a nop.  It hadn't been at one point in
my development; ffi_prep_args had had more than 5 parameters, and so there was
extra stack allocated.  I suppose if ffi_prep_args were inlined, one could be
certain of this (since there will be no function calls) and document it as such.


r~

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/4] s390 improvements
  2014-12-19 15:33   ` Richard Henderson
@ 2014-12-19 16:15     ` Ulrich Weigand
  2014-12-19 16:37       ` Richard Henderson
  0 siblings, 1 reply; 16+ messages in thread
From: Ulrich Weigand @ 2014-12-19 16:15 UTC (permalink / raw)
  To: Richard Henderson
  Cc: Richard Henderson, libffi-discuss, Ulrich.Weigand, vogt, krebbel

Richard Henderson wrote:

> The stack frame we install is created with alloca, and so we know for a fact
> that ffi_call_int must be using a frame pointer to hold its own frame.  Since
> we do not adjust %r15 on the way out of ffi_call_SYSV, we leave the stack frame
> chain intact.  If there were another function for ffi_call_int to call after
> ffi_call_SYSV (but there's not), the outgoing 160 byte save area would be present.

Well, even in a function using a frame pointer, we still occasionally
rely on %r15.  One obvious case would be calling a function that needs
an argument save area, after ffi_call_SYSV returned.  (Or if we had
another alloca etc.)

Now, we can probably exclude all those cases by simply not using certain
source-level features in ffi_call_int; that at least needs to be clearly
documented in comments there.   There is still the theoretical possibilty
of a compiler introducing dependencies on %r15 that aren't obvious from
the source.  (One example: some ancient gcc 2.95 compilers would actually
use the backchain value to restore %r15 on function return in some cases.
These would now get the wrong value, since ffi_call_SYSV has installed a
different backchain.)

In the end, it's probably OK for low-level code like libffi to make certain
assumptions on the behavior of the toolchain.  I'm not quite sure whether
this actually gets us any significant benefit in this case.  Does it really
matter whether ffi_prep_args is called from ffi_call_int vs. ffi_call_SYSV?

> It's true that the load of %r15 is now a nop.  It hadn't been at one point in
> my development; ffi_prep_args had had more than 5 parameters, and so there was
> extra stack allocated.  I suppose if ffi_prep_args were inlined, one could be
> certain of this (since there will be no function calls) and document it as such.

If we do use such tricks, this version may actually be preferable.

Bye,
Ulrich

-- 
  Dr. Ulrich Weigand
  GNU/Linux compilers and toolchain
  Ulrich.Weigand@de.ibm.com

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/4] s390 improvements
  2014-12-19 16:15     ` Ulrich Weigand
@ 2014-12-19 16:37       ` Richard Henderson
  2014-12-19 17:08         ` Ulrich Weigand
  0 siblings, 1 reply; 16+ messages in thread
From: Richard Henderson @ 2014-12-19 16:37 UTC (permalink / raw)
  To: Ulrich Weigand
  Cc: Richard Henderson, libffi-discuss, Ulrich.Weigand, vogt, krebbel

On 12/19/2014 10:14 AM, Ulrich Weigand wrote:
> In the end, it's probably OK for low-level code like libffi to make certain
> assumptions on the behavior of the toolchain.  I'm not quite sure whether
> this actually gets us any significant benefit in this case.  Does it really
> matter whether ffi_prep_args is called from ffi_call_int vs. ffi_call_SYSV?

I think it's the indirect call back to ffi_prep_args that I find ugliest,
and for most targets, totally unnecessary.

>> It's true that the load of %r15 is now a nop.  It hadn't been at one point in
>> my development; ffi_prep_args had had more than 5 parameters, and so there was
>> extra stack allocated.  I suppose if ffi_prep_args were inlined, one could be
>> certain of this (since there will be no function calls) and document it as such.
> 
> If we do use such tricks, this version may actually be preferable.

I'll post that version shortly.  If you still don't like it, then I'll
try another tack whereby we simply avoid the indirect call.


r~

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 5/4] s390: Inline and tidy ffi_prep_args
  2014-12-18 21:33 [PATCH 0/4] s390 improvements Richard Henderson
                   ` (4 preceding siblings ...)
  2014-12-19 15:07 ` [PATCH 0/4] s390 improvements Ulrich Weigand
@ 2014-12-19 16:43 ` Richard Henderson
  2014-12-22 11:34 ` [PATCH 0/4] s390 improvements Dominik Vogt
  6 siblings, 0 replies; 16+ messages in thread
From: Richard Henderson @ 2014-12-19 16:43 UTC (permalink / raw)
  To: libffi-discuss; +Cc: Ulrich.Weigand, vogt, krebbel, Richard Henderson

From: Richard Henderson <rth@redhat.com>

As per discussion with Ulrich Weigand, document the restrictions
on the code within ffi_call_int as we simultaneously prepare
stack frames for ffi_call_SYSV and the target function.
---
 src/s390/ffi.c  | 294 ++++++++++++++++++++++++--------------------------------
 src/s390/sysv.S |  21 ++--
 2 files changed, 135 insertions(+), 180 deletions(-)

diff --git a/src/s390/ffi.c b/src/s390/ffi.c
index 1189f7b..4035b6e 100644
--- a/src/s390/ffi.c
+++ b/src/s390/ffi.c
@@ -30,6 +30,7 @@
 
 #include <ffi.h>
 #include <ffi_common.h>
+#include <stdint.h>
 #include "internal.h"
 
 /*====================== End of Includes =============================*/
@@ -130,165 +131,6 @@ ffi_check_struct_type (ffi_type *arg)
 
 /*====================================================================*/
 /*                                                                    */
-/* Name     - ffi_prep_args.                                          */
-/*                                                                    */
-/* Function - Prepare parameters for call to function.                */
-/*                                                                    */
-/* ffi_prep_args is called by the assembly routine once stack space   */
-/* has been allocated for the function's arguments.                   */
-/*                                                                    */
-/*====================================================================*/
-
-static void
-ffi_prep_args (ffi_cif *cif, void *rvalue, void **p_argv,
-	       unsigned long *p_ov, struct call_frame *p_frame)
-{
-  unsigned char *p_struct = (unsigned char *)p_frame;
-  unsigned long *p_gpr = p_frame->gpr_args;
-  unsigned long long *p_fpr = p_frame->fpr_args;
-  int n_fpr = 0;
-  int n_gpr = 0;
-  int n_ov = 0;
-  ffi_type **ptr;
-  int i;
-
-  /* If we returning a structure then we set the first parameter register
-     to the address of where we are returning this structure.  */
-  if (cif->flags & FFI390_RET_IN_MEM)
-    p_gpr[n_gpr++] = (unsigned long) rvalue;
-
-  /* Now for the arguments.  */
-  for (ptr = cif->arg_types, i = cif->nargs; i > 0; i--, ptr++, p_argv++)
-    {
-      ffi_type *ty = *ptr;
-      void *arg = *p_argv;
-      int type = ty->type;
-
-#if FFI_TYPE_LONGDOUBLE != FFI_TYPE_DOUBLE
-      /* 16-byte long double is passed like a struct.  */
-      if (type == FFI_TYPE_LONGDOUBLE)
-	type = FFI_TYPE_STRUCT;
-#endif
-
-      /* Check how a structure type is passed.  */
-      if (type == FFI_TYPE_STRUCT || type == FFI_TYPE_COMPLEX)
-	{
-	  if (type == FFI_TYPE_COMPLEX)
-	    type = FFI_TYPE_POINTER;
-	  else
-	    type = ffi_check_struct_type (ty);
-
-	  /* If we pass the struct via pointer, copy the data.  */
-	  if (type == FFI_TYPE_POINTER)
-	    {
-	      p_struct -= ROUND_SIZE (ty->size);
-	      memcpy (p_struct, (char *)arg, ty->size);
-	      arg = &p_struct;
-	    }
-	}
-
-      /* Now handle all primitive int/pointer/float data types.  */
-      switch (type)
-	{
-	  case FFI_TYPE_DOUBLE:
-	    if (n_fpr < MAX_FPRARGS)
-	      p_fpr[n_fpr++] = *(unsigned long long *) arg;
-	    else
-#ifdef __s390x__
-	      p_ov[n_ov++] = *(unsigned long *) arg;
-#else
-	      p_ov[n_ov++] = ((unsigned long *) arg)[0],
-	      p_ov[n_ov++] = ((unsigned long *) arg)[1];
-#endif
-	    break;
-
-	  case FFI_TYPE_FLOAT:
-	    if (n_fpr < MAX_FPRARGS)
-	      p_fpr[n_fpr++] = (unsigned long long)*(unsigned int *) arg << 32;
-	    else
-	      p_ov[n_ov++] = *(unsigned int *) arg;
-	    break;
-
-	  case FFI_TYPE_POINTER:
-	    if (n_gpr < MAX_GPRARGS)
-	      p_gpr[n_gpr++] = (unsigned long)*(unsigned char **) arg;
-	    else
-	      p_ov[n_ov++] = (unsigned long)*(unsigned char **) arg;
-	    break;
-
-	  case FFI_TYPE_UINT64:
-	  case FFI_TYPE_SINT64:
-#ifdef __s390x__
-	    if (n_gpr < MAX_GPRARGS)
-	      p_gpr[n_gpr++] = *(unsigned long *) arg;
-	    else
-	      p_ov[n_ov++] = *(unsigned long *) arg;
-#else
-	    if (n_gpr == MAX_GPRARGS-1)
-	      n_gpr = MAX_GPRARGS;
-	    if (n_gpr < MAX_GPRARGS)
-	      p_gpr[n_gpr++] = ((unsigned long *) arg)[0],
-	      p_gpr[n_gpr++] = ((unsigned long *) arg)[1];
-	    else
-	      p_ov[n_ov++] = ((unsigned long *) arg)[0],
-	      p_ov[n_ov++] = ((unsigned long *) arg)[1];
-#endif
-	    break;
-
-	  case FFI_TYPE_UINT32:
-	    if (n_gpr < MAX_GPRARGS)
-	      p_gpr[n_gpr++] = *(unsigned int *) arg;
-	    else
-	      p_ov[n_ov++] = *(unsigned int *) arg;
-	    break;
-
-	  case FFI_TYPE_INT:
-	  case FFI_TYPE_SINT32:
-	    if (n_gpr < MAX_GPRARGS)
-	      p_gpr[n_gpr++] = *(signed int *) arg;
-	    else
-	      p_ov[n_ov++] = *(signed int *) arg;
-	    break;
-
-	  case FFI_TYPE_UINT16:
-	    if (n_gpr < MAX_GPRARGS)
-	      p_gpr[n_gpr++] = *(unsigned short *) arg;
-	    else
-	      p_ov[n_ov++] = *(unsigned short *) arg;
-	    break;
-
-	  case FFI_TYPE_SINT16:
-	    if (n_gpr < MAX_GPRARGS)
-	      p_gpr[n_gpr++] = *(signed short *) arg;
-	    else
-	      p_ov[n_ov++] = *(signed short *) arg;
-	    break;
-
-	  case FFI_TYPE_UINT8:
-	    if (n_gpr < MAX_GPRARGS)
-	      p_gpr[n_gpr++] = *(unsigned char *) arg;
-	    else
-	      p_ov[n_ov++] = *(unsigned char *) arg;
-	    break;
-
-	  case FFI_TYPE_SINT8:
-	    if (n_gpr < MAX_GPRARGS)
-	      p_gpr[n_gpr++] = *(signed char *) arg;
-	    else
-	      p_ov[n_ov++] = *(signed char *) arg;
-	    break;
-
-	  default:
-	    FFI_ASSERT (0);
-	    break;
-        }
-    }
-}
-
-/*======================== End of Routine ============================*/
-
-/*====================================================================*/
-/*                                                                    */
 /* Name     - ffi_prep_cif_machdep.                                   */
 /*                                                                    */
 /* Function - Perform machine dependent CIF processing.               */
@@ -469,8 +311,12 @@ ffi_call_int(ffi_cif *cif,
 {
   int ret_type = cif->flags;
   size_t rsize = 0, bytes = cif->bytes;
-  unsigned char *stack;
+  unsigned char *stack, *p_struct;
   struct call_frame *frame;
+  unsigned long *p_ov, *p_gpr;
+  unsigned long long *p_fpr;
+  int n_fpr, n_gpr, n_ov, i, n;
+  ffi_type **arg_types;
 
   FFI_ASSERT (cif->abi == FFI_SYSV);
 
@@ -499,22 +345,134 @@ ffi_call_int(ffi_cif *cif,
 	p_ov: bottom of the overflow area (growing upwards)
 	p_struct: top of the struct copy area (growing downwards)
 
-     All areas are kept aligned to twice the word size.  */
+     All areas are kept aligned to twice the word size.
+
+     Note that we're going to create the stack frame for both
+     ffi_call_SYSV _and_ the target function right here.  This
+     works because we don't make any function calls with more
+     than 5 arguments (indeed only memcpy and ffi_call_SYSV),
+     and thus we don't have any stacked outgoing parameters.  */
 
   stack = alloca (bytes + sizeof(struct call_frame) + rsize);
   frame = (struct call_frame *)(stack + bytes);
   if (rsize)
     rvalue = frame + 1;
 
-  /* Assuming that the current function has the standard call frame,
-     we can maintain the linked list like so.  */
-  frame->back_chain = __builtin_dwarf_cfa() - sizeof(struct call_frame);
-
-  /* Pass the outgoing stack frame in the r15 save slot.  */
-  frame->gpr_save[8] = (unsigned long)(stack - sizeof(struct call_frame));
+  /* Link the new frame back to the one from this function.  */
+  frame->back_chain = __builtin_frame_address (0);
 
   /* Fill in all of the argument stuff.  */
-  ffi_prep_args (cif, rvalue, avalue, (unsigned long *)stack, frame);
+  p_ov = (unsigned long *)stack;
+  p_struct = (unsigned char *)frame;
+  p_gpr = frame->gpr_args;
+  p_fpr = frame->fpr_args;
+  n_fpr = n_gpr = n_ov = 0;
+
+  /* If we returning a structure then we set the first parameter register
+     to the address of where we are returning this structure.  */
+  if (cif->flags & FFI390_RET_IN_MEM)
+    p_gpr[n_gpr++] = (uintptr_t) rvalue;
+
+  /* Now for the arguments.  */
+  arg_types = cif->arg_types;
+  for (i = 0, n = cif->nargs; i < n; ++i)
+    {
+      ffi_type *ty = arg_types[i];
+      void *arg = avalue[i];
+      int type = ty->type;
+      ffi_arg val;
+
+    restart:
+      switch (type)
+	{
+	case FFI_TYPE_SINT8:
+	  val = *(SINT8 *)arg;
+	  goto do_int;
+	case FFI_TYPE_UINT8:
+	  val = *(UINT8 *)arg;
+	  goto do_int;
+	case FFI_TYPE_SINT16:
+	  val = *(SINT16 *)arg;
+	  goto do_int;
+	case FFI_TYPE_UINT16:
+	  val = *(UINT16 *)arg;
+	  goto do_int;
+	case FFI_TYPE_INT:
+	case FFI_TYPE_SINT32:
+	  val = *(SINT32 *)arg;
+	  goto do_int;
+	case FFI_TYPE_UINT32:
+	  val = *(UINT32 *)arg;
+	  goto do_int;
+	case FFI_TYPE_POINTER:
+	  val = *(uintptr_t *)arg;
+	do_int:
+	  *(n_gpr < MAX_GPRARGS ? p_gpr + n_gpr++ : p_ov + n_ov++) = val;
+	  break;
+
+	case FFI_TYPE_UINT64:
+	case FFI_TYPE_SINT64:
+#ifdef __s390x__
+	  val = *(UINT64 *)arg;
+	  goto do_int;
+#else
+	  if (n_gpr == MAX_GPRARGS-1)
+	    n_gpr = MAX_GPRARGS;
+	  if (n_gpr < MAX_GPRARGS)
+	    p_gpr[n_gpr++] = ((UINT32 *) arg)[0],
+	    p_gpr[n_gpr++] = ((UINT32 *) arg)[1];
+	  else
+	    p_ov[n_ov++] = ((UINT32 *) arg)[0],
+	    p_ov[n_ov++] = ((UINT32 *) arg)[1];
+#endif
+	  break;
+
+	case FFI_TYPE_DOUBLE:
+	  if (n_fpr < MAX_FPRARGS)
+	    p_fpr[n_fpr++] = *(UINT64 *) arg;
+	  else
+	    {
+#ifdef __s390x__
+	      p_ov[n_ov++] = *(UINT64 *) arg;
+#else
+	      p_ov[n_ov++] = ((UINT32 *) arg)[0],
+	      p_ov[n_ov++] = ((UINT32 *) arg)[1];
+#endif
+	    }
+	  break;
+
+	case FFI_TYPE_FLOAT:
+	  val = *(UINT32 *)arg;
+	  if (n_fpr < MAX_FPRARGS)
+	    p_fpr[n_fpr++] = (UINT64)val << 32;
+	  else
+	    p_ov[n_ov++] = val;
+	  break;
+
+	case FFI_TYPE_STRUCT:
+          /* Check how a structure type is passed.  */
+	  type = ffi_check_struct_type (ty);
+	  /* Some structures are passed via a type they contain.  */
+	  if (type != FFI_TYPE_POINTER)
+	    goto restart;
+	  /* ... otherwise, passed by reference.  fallthru.  */
+
+#if FFI_TYPE_LONGDOUBLE != FFI_TYPE_DOUBLE
+	case FFI_TYPE_LONGDOUBLE:
+	  /* 16-byte long double is passed via reference.  */
+#endif
+	case FFI_TYPE_COMPLEX:
+	  /* Complex types are passed via reference.  */
+	  p_struct -= ROUND_SIZE (ty->size);
+	  memcpy (p_struct, arg, ty->size);
+	  val = (uintptr_t)p_struct;
+	  goto do_int;
+
+	default:
+	  FFI_ASSERT (0);
+	  break;
+        }
+    }
 
   ffi_call_SYSV (frame, ret_type & FFI360_RET_MASK, rvalue, fn, closure);
 }
diff --git a/src/s390/sysv.S b/src/s390/sysv.S
index 1869269..c4b5006 100644
--- a/src/s390/sysv.S
+++ b/src/s390/sysv.S
@@ -54,26 +54,24 @@ ffi_call_SYSV:
 	.cfi_rel_offset r13, 52
 	.cfi_rel_offset r14, 56
 	.cfi_def_cfa_register r13
-	l	%r15,60(%r2)			# Set up outgoing stack
-#ifdef HAVE_AS_S390_ZARCH
-	larl	%r14,.Ltable
-#else
-	basr	%r14,0				# Set up base register
-.Lbase:
-#endif
+	st	%r2,0(%r15)			# Set up back chain
 	sla	%r3,3				# ret_type *= 8
 	lr	%r12,%r4			# Save ret_addr
 	lr	%r1,%r5				# Save fun
 	lr	%r0,%r6				# Install static chain
+
+	# Set return address, so that there is only one indirect jump.
 #ifdef HAVE_AS_S390_ZARCH
-	la	%r14,0(%r14,%r3)		# Set return address
+	larl	%r14,.Ltable
+	ar	%r14,%r3
 #else
-	la	%r14,.Ltable-.Lbase(%r14,%r3)	# Set return address
+	basr	%r14,0
+0:	la	%r14,.Ltable-0b(%r14,%r3)
 #endif
+
 	lm	%r2,%r6,8(%r13)			# Load arguments
 	ld	%f0,64(%r13)
 	ld	%f2,72(%r13)
-	st	%r13,0(%r15)			# Set up back chain
 	br	%r1				# ... and call function
 
 	.balign	8
@@ -210,7 +208,7 @@ ffi_call_SYSV:
 	.cfi_rel_offset r13, 104
 	.cfi_rel_offset r14, 112
 	.cfi_def_cfa_register r13
-	lg	%r15,120(%r2)			# Set up outgoing stack
+	stg	%r2,0(%r15)			# Set up back chain
 	larl	%r14,.Ltable			# Set up return address
 	slag	%r3,%r3,3			# ret_type *= 8
 	lgr	%r12,%r4			# Save ret_addr
@@ -222,7 +220,6 @@ ffi_call_SYSV:
 	ld	%f2,136(%r13)
 	ld	%f4,144(%r13)
 	ld	%f6,152(%r13)
-	stg	%r13,0(%r15)			# Set up back chain
 	br	%r1				# ... and call function
 
 	.balign	8
-- 
2.1.0

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/4] s390 improvements
  2014-12-19 16:37       ` Richard Henderson
@ 2014-12-19 17:08         ` Ulrich Weigand
  0 siblings, 0 replies; 16+ messages in thread
From: Ulrich Weigand @ 2014-12-19 17:08 UTC (permalink / raw)
  To: Richard Henderson
  Cc: Richard Henderson, libffi-discuss, Ulrich.Weigand, vogt, krebbel

Richard Henderson wrote:
> On 12/19/2014 10:14 AM, Ulrich Weigand wrote:
> > In the end, it's probably OK for low-level code like libffi to make certain
> > assumptions on the behavior of the toolchain.  I'm not quite sure whether
> > this actually gets us any significant benefit in this case.  Does it really
> > matter whether ffi_prep_args is called from ffi_call_int vs. ffi_call_SYSV?
> 
> I think it's the indirect call back to ffi_prep_args that I find ugliest,
> and for most targets, totally unnecessary.

Ah, OK.  Yes, there's no need for the call to be indirect.  I guess in the
original implementation on old S/390, there's no real difference between a
direct and an indirect call, but where we have brasl, we should really use
it and do a direct call.

> >> It's true that the load of %r15 is now a nop.  It hadn't been at one point in
> >> my development; ffi_prep_args had had more than 5 parameters, and so there was
> >> extra stack allocated.  I suppose if ffi_prep_args were inlined, one could be
> >> certain of this (since there will be no function calls) and document it as such.
> > 
> > If we do use such tricks, this version may actually be preferable.
> 
> I'll post that version shortly.  If you still don't like it, then I'll
> try another tack whereby we simply avoid the indirect call.

I guess the new version is fine with me.  Thanks for working on this!

Bye,
Ulrich

-- 
  Dr. Ulrich Weigand
  GNU/Linux compilers and toolchain
  Ulrich.Weigand@de.ibm.com

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/4] s390 improvements
  2014-12-18 21:33 [PATCH 0/4] s390 improvements Richard Henderson
                   ` (5 preceding siblings ...)
  2014-12-19 16:43 ` [PATCH 5/4] s390: Inline and tidy ffi_prep_args Richard Henderson
@ 2014-12-22 11:34 ` Dominik Vogt
  6 siblings, 0 replies; 16+ messages in thread
From: Dominik Vogt @ 2014-12-22 11:34 UTC (permalink / raw)
  To: libffi-discuss

On Thu, Dec 18, 2014 at 03:33:19PM -0600, Richard Henderson wrote:
> This is relative to Dominik's patch from the 16th.  The complete
> tree can be found at
> 
> 	git://github.com/rth7680/libffi.git s390

The test suite runs fine on this branch on s390 and s390x.

Ciao

Dominik ^_^  ^_^

-- 

Dominik Vogt
IBM Germany

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 3/4] s390: Reorganize assembly
  2014-12-18 21:33 ` [PATCH 3/4] s390: Reorganize assembly Richard Henderson
@ 2014-12-22 12:12   ` Dominik Vogt
  2014-12-22 12:25     ` Dominik Vogt
  0 siblings, 1 reply; 16+ messages in thread
From: Dominik Vogt @ 2014-12-22 12:12 UTC (permalink / raw)
  To: libffi-discuss

[-- Attachment #1: Type: text/plain, Size: 994 bytes --]

On Thu, Dec 18, 2014 at 03:33:22PM -0600, Richard Henderson wrote:
> -#define FFI390_RET_VOID		0
> -#define FFI390_RET_STRUCT	1
> -#define FFI390_RET_FLOAT	2
> -#define FFI390_RET_DOUBLE	3
> -#define FFI390_RET_INT32	4
> -#define FFI390_RET_INT64	5

> +#define FFI390_RET_DOUBLE	0
> +#define FFI390_RET_FLOAT	1
> +#define FFI390_RET_INT64	2
> +#define FFI390_RET_INT32	3
> +#define FFI390_RET_VOID		4

While we're at renumbering this list of values, if
FFI390_RET_INT32 had the value 0, this bit of assembly code would
not be necessary for s390x:

> +	.balign	8
> +# FFI390_RET_INT32
> +	# Never used, as we always store type ffi_arg.
> +	# But the stg above is 6 bytes and we cannot
> +	# jump around this case, so fall through.
> +	nop
> +	nop

(See attached patch.)

> +  ffi_call_SYSV (frame, ret_type & FFI360_RET_MASK, rvalue, fn, closure);
...
> +#define FFI360_RET_MASK                7
              ^^^

Shouldn't that be "390"?

Ciao

Dominik ^_^  ^_^

-- 

Dominik Vogt
IBM Germany

[-- Attachment #2: 0001-s390-Optimize-return-type-table-in-ffi_call_SYSV.patch --]
[-- Type: text/x-diff, Size: 2214 bytes --]

From 28c7949be357e6912add914c0574b107d3464e78 Mon Sep 17 00:00:00 2001
From: Dominik Vogt <vogt@linux.vnet.ibm.com>
Date: Mon, 22 Dec 2014 12:51:59 +0100
Subject: [PATCH] s390: Optimize return type table in ffi_call_SYSV.

---
 src/s390/internal.h |  8 ++++----
 src/s390/sysv.S     | 22 +++++++---------------
 2 files changed, 11 insertions(+), 19 deletions(-)

diff --git a/src/s390/internal.h b/src/s390/internal.h
index b875578..33246e9 100644
--- a/src/s390/internal.h
+++ b/src/s390/internal.h
@@ -1,8 +1,8 @@
 /* If these values change, sysv.S must be adapted!  */
-#define FFI390_RET_DOUBLE	0
-#define FFI390_RET_FLOAT	1
-#define FFI390_RET_INT64	2
-#define FFI390_RET_INT32	3
+#define FFI390_RET_INT32	0
+#define FFI390_RET_DOUBLE	1
+#define FFI390_RET_FLOAT	2
+#define FFI390_RET_INT64	3
 #define FFI390_RET_VOID		4
 
 #define FFI360_RET_MASK		7
diff --git a/src/s390/sysv.S b/src/s390/sysv.S
index c4b5006..6015404 100644
--- a/src/s390/sysv.S
+++ b/src/s390/sysv.S
@@ -76,6 +76,10 @@ ffi_call_SYSV:
 
 	.balign	8
 .Ltable:
+# FFI390_RET_INT32
+	st	%r2,0(%r12)
+	j	.Ldone
+
 # FFI390_RET_DOUBLE
 	std	%f0,0(%r12)
 	j	.Ldone
@@ -87,16 +91,11 @@ ffi_call_SYSV:
 
 	.balign	8
 # FFI390_RET_INT64
-	st	%r3,4(%r12)
+	stm	%r2,%r3,0(%r12)
 	nop
 	# fallthru
 
 	.balign	8
-# FFI390_RET_INT32
-	st	%r2,0(%r12)
-	nop
-	# fallthru
-
 	.balign	8
 # FFI390_RET_VOID
 .Ldone:
@@ -209,7 +208,7 @@ ffi_call_SYSV:
 	.cfi_rel_offset r14, 112
 	.cfi_def_cfa_register r13
 	stg	%r2,0(%r15)			# Set up back chain
-	larl	%r14,.Ltable			# Set up return address
+	larl	%r14,.Ltable-8			# Set up return address
 	slag	%r3,%r3,3			# ret_type *= 8
 	lgr	%r12,%r4			# Save ret_addr
 	lgr	%r1,%r5				# Save fun
@@ -223,6 +222,7 @@ ffi_call_SYSV:
 	br	%r1				# ... and call function
 
 	.balign	8
+# FFI390_RET_INT32 is not used, table starts at entry 1, not entry 0
 .Ltable:
 # FFI390_RET_DOUBLE
 	std	%f0,0(%r12)
@@ -238,14 +238,6 @@ ffi_call_SYSV:
 	stg	%r2,0(%r12)
 
 	.balign	8
-# FFI390_RET_INT32
-	# Never used, as we always store type ffi_arg.
-	# But the stg above is 6 bytes and we cannot
-	# jump around this case, so fall through.
-	nop
-	nop
-
-	.balign	8
 # FFI390_RET_VOID
 .Ldone:
 	lg	%r14,112(%r13)
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 3/4] s390: Reorganize assembly
  2014-12-22 12:12   ` Dominik Vogt
@ 2014-12-22 12:25     ` Dominik Vogt
  2014-12-22 16:30       ` Richard Henderson
  0 siblings, 1 reply; 16+ messages in thread
From: Dominik Vogt @ 2014-12-22 12:25 UTC (permalink / raw)
  To: libffi-discuss

[-- Attachment #1: Type: text/plain, Size: 168 bytes --]

Or rather the attached patch stat replaces

  stm %r2,%r3,0(%r12)
  nop

with

  st %r2,0(%r12)
  st %r3,4(%r12)

Ciao

Dominik ^_^  ^_^

-- 

Dominik Vogt
IBM Germany

[-- Attachment #2: 0001-s390-Optimize-return-type-table-in-ffi_call_SYSV.patch --]
[-- Type: text/x-diff, Size: 2209 bytes --]

From 08ef4df7343ae07bfc8c032387ac57a19bfa9820 Mon Sep 17 00:00:00 2001
From: Dominik Vogt <vogt@linux.vnet.ibm.com>
Date: Mon, 22 Dec 2014 12:51:59 +0100
Subject: [PATCH] s390: Optimize return type table in ffi_call_SYSV.

---
 src/s390/internal.h |  8 ++++----
 src/s390/sysv.S     | 22 +++++++---------------
 2 files changed, 11 insertions(+), 19 deletions(-)

diff --git a/src/s390/internal.h b/src/s390/internal.h
index b875578..33246e9 100644
--- a/src/s390/internal.h
+++ b/src/s390/internal.h
@@ -1,8 +1,8 @@
 /* If these values change, sysv.S must be adapted!  */
-#define FFI390_RET_DOUBLE	0
-#define FFI390_RET_FLOAT	1
-#define FFI390_RET_INT64	2
-#define FFI390_RET_INT32	3
+#define FFI390_RET_INT32	0
+#define FFI390_RET_DOUBLE	1
+#define FFI390_RET_FLOAT	2
+#define FFI390_RET_INT64	3
 #define FFI390_RET_VOID		4
 
 #define FFI360_RET_MASK		7
diff --git a/src/s390/sysv.S b/src/s390/sysv.S
index c4b5006..9e3c2a7 100644
--- a/src/s390/sysv.S
+++ b/src/s390/sysv.S
@@ -76,6 +76,10 @@ ffi_call_SYSV:
 
 	.balign	8
 .Ltable:
+# FFI390_RET_INT32
+	st	%r2,0(%r12)
+	j	.Ldone
+
 # FFI390_RET_DOUBLE
 	std	%f0,0(%r12)
 	j	.Ldone
@@ -87,16 +91,11 @@ ffi_call_SYSV:
 
 	.balign	8
 # FFI390_RET_INT64
+	st	%r2,0(%r12)
 	st	%r3,4(%r12)
-	nop
 	# fallthru
 
 	.balign	8
-# FFI390_RET_INT32
-	st	%r2,0(%r12)
-	nop
-	# fallthru
-
 	.balign	8
 # FFI390_RET_VOID
 .Ldone:
@@ -209,7 +208,7 @@ ffi_call_SYSV:
 	.cfi_rel_offset r14, 112
 	.cfi_def_cfa_register r13
 	stg	%r2,0(%r15)			# Set up back chain
-	larl	%r14,.Ltable			# Set up return address
+	larl	%r14,.Ltable-8			# Set up return address
 	slag	%r3,%r3,3			# ret_type *= 8
 	lgr	%r12,%r4			# Save ret_addr
 	lgr	%r1,%r5				# Save fun
@@ -223,6 +222,7 @@ ffi_call_SYSV:
 	br	%r1				# ... and call function
 
 	.balign	8
+# FFI390_RET_INT32 is not used, table starts at entry 1, not entry 0
 .Ltable:
 # FFI390_RET_DOUBLE
 	std	%f0,0(%r12)
@@ -238,14 +238,6 @@ ffi_call_SYSV:
 	stg	%r2,0(%r12)
 
 	.balign	8
-# FFI390_RET_INT32
-	# Never used, as we always store type ffi_arg.
-	# But the stg above is 6 bytes and we cannot
-	# jump around this case, so fall through.
-	nop
-	nop
-
-	.balign	8
 # FFI390_RET_VOID
 .Ldone:
 	lg	%r14,112(%r13)
-- 
1.8.4.2


^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 3/4] s390: Reorganize assembly
  2014-12-22 12:25     ` Dominik Vogt
@ 2014-12-22 16:30       ` Richard Henderson
  2014-12-23  9:54         ` Dominik Vogt
  0 siblings, 1 reply; 16+ messages in thread
From: Richard Henderson @ 2014-12-22 16:30 UTC (permalink / raw)
  To: libffi-discuss

On 12/22/2014 04:25 AM, Dominik Vogt wrote:
> Or rather the attached patch stat replaces
> 
>   stm %r2,%r3,0(%r12)
>   nop
> 
> with
> 
>   st %r2,0(%r12)
>   st %r3,4(%r12)

Is that really an improvement?

(1) You now need a branch for the (presumed) normal "int" case.
(2) Is stm really that much faster than two st?  I would have
    thought the reverse, actually.


r~

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 3/4] s390: Reorganize assembly
  2014-12-22 16:30       ` Richard Henderson
@ 2014-12-23  9:54         ` Dominik Vogt
  0 siblings, 0 replies; 16+ messages in thread
From: Dominik Vogt @ 2014-12-23  9:54 UTC (permalink / raw)
  To: libffi-discuss

On Mon, Dec 22, 2014 at 08:30:10AM -0800, Richard Henderson wrote:
> On 12/22/2014 04:25 AM, Dominik Vogt wrote:
> > Or rather the attached patch stat replaces
> > 
> >   stm %r2,%r3,0(%r12)
> >   nop
> > 
> > with
> > 
> >   st %r2,0(%r12)
> >   st %r3,4(%r12)
> 
> Is that really an improvement?
> 
> (1) You now need a branch for the (presumed) normal "int" case.

Well, depends on whether we talk about 64 bit or 31 bit:

64 bit
------

 + saves decoding two nop instructions for int64_t
 + saves four bytes

31 bit
------

 + saves decoding two nop instructions for int64_t
 - replaces a nop with a jump instruction for int32_t

Of course one could have a different order of types depending on
the platform to eliminate the disadvantage of the 31 bit case.

> (2) Is stm really that much faster than two st?  I would have
>     thought the reverse, actually.

No, stm+nop is potentially slower than two st.  That's why the
patch uses the two st instructions.

P.S.:  Just noticed there's a duplicate ".balign" in the patched
code.

Ciao

Dominik ^_^  ^_^

-- 

Dominik Vogt
IBM Germany

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2014-12-23  9:54 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-12-18 21:33 [PATCH 0/4] s390 improvements Richard Henderson
2014-12-18 21:33 ` [PATCH 1/4] s390: Kill trailing whitespace Richard Henderson
2014-12-18 21:33 ` [PATCH 4/4] s390: Use pc-relative insns in 31-bit mode Richard Henderson
2014-12-18 21:33 ` [PATCH 3/4] s390: Reorganize assembly Richard Henderson
2014-12-22 12:12   ` Dominik Vogt
2014-12-22 12:25     ` Dominik Vogt
2014-12-22 16:30       ` Richard Henderson
2014-12-23  9:54         ` Dominik Vogt
2014-12-18 21:33 ` [PATCH 2/4] s390: Avoid aliasing warnings Richard Henderson
2014-12-19 15:07 ` [PATCH 0/4] s390 improvements Ulrich Weigand
2014-12-19 15:33   ` Richard Henderson
2014-12-19 16:15     ` Ulrich Weigand
2014-12-19 16:37       ` Richard Henderson
2014-12-19 17:08         ` Ulrich Weigand
2014-12-19 16:43 ` [PATCH 5/4] s390: Inline and tidy ffi_prep_args Richard Henderson
2014-12-22 11:34 ` [PATCH 0/4] s390 improvements Dominik Vogt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).