public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* PATCH: PR target/40838: gcc shouldn't assume that the stack is  aligned
@ 2009-08-06 21:42 H.J. Lu
  2009-08-06 22:26 ` Jakub Jelinek
  2009-10-15 15:58 ` H.J. Lu
  0 siblings, 2 replies; 59+ messages in thread
From: H.J. Lu @ 2009-08-06 21:42 UTC (permalink / raw)
  To: gcc-patches; +Cc: ubizjak

Hi,

In 32bit, the incoming stack may not be 16 byte aligned.  This patch
assumes the incoming stack is 4 byte aligned and realigns stack if any
SSE variable is put on stack. Any comments?

Thanks.


H.J.
---
gcc/

2009-08-06  H.J. Lu  <hongjiu.lu@intel.com>

	PR target/40838
	* config/i386/i386.c (ix86_update_stack_boundary): Use
	STACK_BOUNDARY if use_stack_boundary_for_incoming_stack_boundary
	is set.
	(VALID_SSE_VECTOR_MODE): New.
	(ix86_minimum_alignment): In 32bit, set
	use_stack_boundary_for_incoming_stack_boundary if any SSE
	variables are put on stack.

	* config/i386/i386.h (machine_function): Add
	use_stack_boundary_for_incoming_stack_boundary.

gcc/testsuite/

2009-08-06  H.J. Lu  <hongjiu.lu@intel.com>

	PR target/40838
	* gcc.target/i386/incoming-6.c: New.
	* gcc.target/i386/incoming-7.c: Likewise.
	* gcc.target/i386/incoming-8.c: Likewise.
	* gcc.target/i386/incoming-9.c: Likewise.

Index: gcc/testsuite/gcc.target/i386/incoming-7.c
===================================================================
--- gcc/testsuite/gcc.target/i386/incoming-7.c	(revision 0)
+++ gcc/testsuite/gcc.target/i386/incoming-7.c	(revision 0)
@@ -0,0 +1,16 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -O2 -msse2 -mpreferred-stack-boundary=4" } */
+
+typedef int v4si __attribute__ ((vector_size (16)));
+
+extern v4si y(v4si, v4si, v4si, v4si, v4si);
+
+extern v4si s1, s2;
+
+v4si x(void)
+{
+  return y(s1, s2, s1, s2, s2);
+}
+
+/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
Index: gcc/testsuite/gcc.target/i386/incoming-9.c
===================================================================
--- gcc/testsuite/gcc.target/i386/incoming-9.c	(revision 0)
+++ gcc/testsuite/gcc.target/i386/incoming-9.c	(revision 0)
@@ -0,0 +1,18 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -O3 -mno-sse -mpreferred-stack-boundary=4" } */
+
+float
+foo (float f)
+{
+  float array[128];
+  float x;
+  int i;
+  for (i = 0; i < sizeof(array) / sizeof(*array); i++)
+    array[i] = f;
+  for (i = 0; i < sizeof(array) / sizeof(*array); i++)
+    x += array[i];
+  return x;
+}
+
+/* { dg-final { scan-assembler-not "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
Index: gcc/testsuite/gcc.target/i386/incoming-6.c
===================================================================
--- gcc/testsuite/gcc.target/i386/incoming-6.c	(revision 0)
+++ gcc/testsuite/gcc.target/i386/incoming-6.c	(revision 0)
@@ -0,0 +1,17 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -O2 -msse2 -mpreferred-stack-boundary=4" } */
+
+typedef int v4si __attribute__ ((vector_size (16)));
+
+extern v4si y(v4si *s3);
+
+extern v4si s1, s2;
+
+v4si x(void)
+{
+  v4si s3 = s1 + s2;
+  return y(&s3);
+}
+
+/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
Index: gcc/testsuite/gcc.target/i386/incoming-8.c
===================================================================
--- gcc/testsuite/gcc.target/i386/incoming-8.c	(revision 0)
+++ gcc/testsuite/gcc.target/i386/incoming-8.c	(revision 0)
@@ -0,0 +1,18 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -O3 -msse2 -mpreferred-stack-boundary=4" } */
+
+float
+foo (float f)
+{
+  float array[128];
+  float x;
+  int i;
+  for (i = 0; i < sizeof(array) / sizeof(*array); i++)
+    array[i] = f;
+  for (i = 0; i < sizeof(array) / sizeof(*array); i++)
+    x += array[i];
+  return x;
+}
+
+/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
Index: gcc/config/i386/i386.h
===================================================================
--- gcc/config/i386/i386.h	(revision 150532)
+++ gcc/config/i386/i386.h	(working copy)
@@ -2389,6 +2389,8 @@ struct GTY(()) machine_function {
   /* This value is used for amd64 targets and specifies the current abi
      to be used. MS_ABI means ms abi. Otherwise SYSV_ABI means sysv abi.  */
   enum calling_abi call_abi;
+  /* Use STACK_BOUNDARY for incoming stack boundary.  */
+  int use_stack_boundary_for_incoming_stack_boundary;
   struct machine_cfa_state cfa;
 };
 #endif
Index: gcc/config/i386/i386.c
===================================================================
--- gcc/config/i386/i386.c	(revision 150532)
+++ gcc/config/i386/i386.c	(working copy)
@@ -8230,11 +8230,19 @@ find_drap_reg (void)
 static void
 ix86_update_stack_boundary (void)
 {
+  /* Should we use STACK_BOUNDARY for incoming stack boundary?  */
+  unsigned int incoming_stack_boundary;
+
+  if (cfun->machine->use_stack_boundary_for_incoming_stack_boundary)
+    incoming_stack_boundary = STACK_BOUNDARY;
+  else
+    incoming_stack_boundary = ix86_default_incoming_stack_boundary;
+
   /* Prefer the one specified at command line. */
   ix86_incoming_stack_boundary 
     = (ix86_user_incoming_stack_boundary
        ? ix86_user_incoming_stack_boundary
-       : ix86_default_incoming_stack_boundary);
+       : incoming_stack_boundary);
 
   /* Incoming stack alignment can be changed on individual functions
      via force_align_arg_pointer attribute.  We use the smallest
@@ -20069,6 +20077,10 @@ ix86_local_alignment (tree exp, enum mac
   return align;
 }
 
+#define VALID_SSE_VECTOR_MODE(MODE) \
+  ((MODE) == V4SFmode || (MODE) == V4SImode || (MODE) == V2DFmode \
+   || (MODE) == V16QImode || (MODE) == V8HImode || (MODE) == V2DImode)
+
 /* Compute the minimum required alignment for dynamic stack realignment
    purposes for a local variable, parameter or a stack slot.  EXP is
    the data type or decl itself, MODE is its mode and ALIGN is the
@@ -20080,7 +20092,7 @@ ix86_minimum_alignment (tree exp, enum m
 {
   tree type, decl;
 
-  if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
+  if (TARGET_64BIT)
     return align;
 
   if (exp && DECL_P (exp))
@@ -20094,6 +20106,15 @@ ix86_minimum_alignment (tree exp, enum m
       decl = NULL;
     }
 
+  /* In 32bit, use STACK_BOUNDARY for incoming stack boundary if any
+     SSE variables are put on stack.  */
+  if (VALID_SSE_VECTOR_MODE (mode)
+      || (type && VALID_SSE_VECTOR_MODE (TYPE_MODE (type))))
+     cfun->machine->use_stack_boundary_for_incoming_stack_boundary = 1;
+
+  if (align != 64 || ix86_preferred_stack_boundary >= 64)
+    return align;
+
   /* Don't do dynamic stack realignment for long long objects with
      -mpreferred-stack-boundary=2.  */
   if ((mode == DImode || (type && TYPE_MODE (type) == DImode))

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is  aligned
  2009-08-06 21:42 PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned H.J. Lu
@ 2009-08-06 22:26 ` Jakub Jelinek
  2009-08-06 22:52   ` H.J. Lu
  2009-10-15 15:58 ` H.J. Lu
  1 sibling, 1 reply; 59+ messages in thread
From: Jakub Jelinek @ 2009-08-06 22:26 UTC (permalink / raw)
  To: H.J. Lu; +Cc: gcc-patches, ubizjak

On Thu, Aug 06, 2009 at 02:42:16PM -0700, H.J. Lu wrote:
> In 32bit, the incoming stack may not be 16 byte aligned.  This patch
> assumes the incoming stack is 4 byte aligned and realigns stack if any
> SSE variable is put on stack. Any comments?

IMHO this is wrong, I could live with a non-default option for those who
don't care about performance and think a SCO document from 1996 has any
relevance to Linux these days.  In reality a Linux ABI for years assumes 16
byte stack alignment for 32-bit code.

	Jakub

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is   aligned
  2009-08-06 22:26 ` Jakub Jelinek
@ 2009-08-06 22:52   ` H.J. Lu
  0 siblings, 0 replies; 59+ messages in thread
From: H.J. Lu @ 2009-08-06 22:52 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: gcc-patches, ubizjak

On Thu, Aug 6, 2009 at 3:26 PM, Jakub Jelinek<jakub@redhat.com> wrote:
> On Thu, Aug 06, 2009 at 02:42:16PM -0700, H.J. Lu wrote:
>> In 32bit, the incoming stack may not be 16 byte aligned.  This patch
>> assumes the incoming stack is 4 byte aligned and realigns stack if any
>> SSE variable is put on stack. Any comments?
>
> IMHO this is wrong, I could live with a non-default option for those who
> don't care about performance and think a SCO document from 1996 has any
> relevance to Linux these days.  In reality a Linux ABI for years assumes 16
> byte stack alignment for 32-bit code.
>

According to the current LSB, the stack is still aligned at 4 byte on ia32.
Even  gcc 4.1.0 aligns stack to 4 byte with -Os.

-- 
H.J.

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is  aligned
  2009-08-06 21:42 PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned H.J. Lu
  2009-08-06 22:26 ` Jakub Jelinek
@ 2009-10-15 15:58 ` H.J. Lu
  2009-10-15 18:45   ` Uros Bizjak
  1 sibling, 1 reply; 59+ messages in thread
From: H.J. Lu @ 2009-10-15 15:58 UTC (permalink / raw)
  To: H.J. Lu; +Cc: gcc-patches, ubizjak

On Thu, Aug 06, 2009 at 02:42:16PM -0700, H.J. Lu wrote:
> Hi,
> 
> In 32bit, the incoming stack may not be 16 byte aligned.  This patch
> assumes the incoming stack is 4 byte aligned and realigns stack if any
> SSE variable is put on stack. Any comments?
> 

For i386, gcc, up to gcc 3.4, had been generating 4byte stack alignment.
4byte stack alignment is a problem unless SSE registers are used, which
may be generated by vectorizer or intrinsics. This patch accommodates
the binaries generated by previous versions of gcc if SSE registers are
generated by vectorizer or intrinsics. The performance impact on SPEC
CPU 2006 is minimum:

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=41156#c7

OK for trunk?

Thanks.


H.J.
---
gcc/

2009-09-26  H.J. Lu  <hongjiu.lu@intel.com>

	PR target/40838
	* function.c (allocate_struct_function): Initialize
	forced_stack_alignment and forced_stack_mode.

	* function.h (function): Add forced_stack_alignment and
	forced_stack_mode.

	* tree-vect-data-refs.c	(vect_verify_datarefs_alignment):
	Update forced_stack_alignment and forced_stack_mode.

	* config/i386/i386.c (VALID_SSE_VECTOR_MODE): New.
	(ix86_update_stack_boundary): In 32bit, use STACK_BOUNDARY if
	any SSE variables are forced on stack.
	(ix86_minimum_alignment): In 32bit, update
	forced_stack_alignment and forced_stack_mode if any SSE
	variables are put on stack.

gcc/testsuite/

2009-09-26  H.J. Lu  <hongjiu.lu@intel.com>

	PR target/40838
	* gcc.target/i386/incoming-6.c: New.
	* gcc.target/i386/incoming-7.c: Likewise.
	* gcc.target/i386/incoming-8.c: Likewise.
	* gcc.target/i386/incoming-9.c: Likewise.
	* gcc.target/i386/incoming-10.c: Likewise.
	* gcc.target/i386/incoming-11.c: Likewise.

Index: gcc/testsuite/gcc.target/i386/incoming-11.c
===================================================================
--- gcc/testsuite/gcc.target/i386/incoming-11.c	(revision 0)
+++ gcc/testsuite/gcc.target/i386/incoming-11.c	(revision 0)
@@ -0,0 +1,16 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -fomit-frame-pointer -O3 -march=barcelona -mpreferred-stack-boundary=4" } */
+
+void g();
+
+int p[100];
+int q[100];
+
+void f()
+{
+	int i;
+	for (i = 0; i < 100; i++) p[i] = 0;
+	g();
+	for (i = 0; i < 100; i++) q[i] = 0;
+}
Index: gcc/testsuite/gcc.target/i386/incoming-7.c
===================================================================
--- gcc/testsuite/gcc.target/i386/incoming-7.c	(revision 0)
+++ gcc/testsuite/gcc.target/i386/incoming-7.c	(revision 0)
@@ -0,0 +1,16 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -O2 -msse2 -mpreferred-stack-boundary=4" } */
+
+typedef int v4si __attribute__ ((vector_size (16)));
+
+extern v4si y(v4si, v4si, v4si, v4si, v4si);
+
+extern v4si s1, s2;
+
+v4si x(void)
+{
+  return y(s1, s2, s1, s2, s2);
+}
+
+/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
Index: gcc/testsuite/gcc.target/i386/incoming-9.c
===================================================================
--- gcc/testsuite/gcc.target/i386/incoming-9.c	(revision 0)
+++ gcc/testsuite/gcc.target/i386/incoming-9.c	(revision 0)
@@ -0,0 +1,18 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -O3 -mno-sse -mpreferred-stack-boundary=4" } */
+
+float
+foo (float f)
+{
+  float array[128];
+  float x;
+  int i;
+  for (i = 0; i < sizeof(array) / sizeof(*array); i++)
+    array[i] = f;
+  for (i = 0; i < sizeof(array) / sizeof(*array); i++)
+    x += array[i];
+  return x;
+}
+
+/* { dg-final { scan-assembler-not "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
Index: gcc/testsuite/gcc.target/i386/incoming-10.c
===================================================================
--- gcc/testsuite/gcc.target/i386/incoming-10.c	(revision 0)
+++ gcc/testsuite/gcc.target/i386/incoming-10.c	(revision 0)
@@ -0,0 +1,19 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -fomit-frame-pointer -O3 -march=barcelona -mpreferred-stack-boundary=4" } */
+
+struct s {
+	int x[8];
+};
+
+void g(struct s *);
+
+void f()
+{
+	int i;
+	struct s s;
+	for (i = 0; i < sizeof(s.x) / sizeof(*s.x); i++) s.x[i] = 0;
+	g(&s);
+}
+
+/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
Index: gcc/testsuite/gcc.target/i386/incoming-6.c
===================================================================
--- gcc/testsuite/gcc.target/i386/incoming-6.c	(revision 0)
+++ gcc/testsuite/gcc.target/i386/incoming-6.c	(revision 0)
@@ -0,0 +1,17 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -O2 -msse2 -mpreferred-stack-boundary=4" } */
+
+typedef int v4si __attribute__ ((vector_size (16)));
+
+extern v4si y(v4si *s3);
+
+extern v4si s1, s2;
+
+v4si x(void)
+{
+  v4si s3 = s1 + s2;
+  return y(&s3);
+}
+
+/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
Index: gcc/testsuite/gcc.target/i386/incoming-8.c
===================================================================
--- gcc/testsuite/gcc.target/i386/incoming-8.c	(revision 0)
+++ gcc/testsuite/gcc.target/i386/incoming-8.c	(revision 0)
@@ -0,0 +1,18 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -O3 -msse2 -mpreferred-stack-boundary=4" } */
+
+float
+foo (float f)
+{
+  float array[128];
+  float x;
+  int i;
+  for (i = 0; i < sizeof(array) / sizeof(*array); i++)
+    array[i] = f;
+  for (i = 0; i < sizeof(array) / sizeof(*array); i++)
+    x += array[i];
+  return x;
+}
+
+/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
Index: gcc/function.c
===================================================================
--- gcc/function.c	(revision 152202)
+++ gcc/function.c	(working copy)
@@ -4139,6 +4139,9 @@ allocate_struct_function (tree fndecl, b
       /* Assume all registers in stdarg functions need to be saved.  */
       cfun->va_list_gpr_size = VA_LIST_MAX_GPR_SIZE;
       cfun->va_list_fpr_size = VA_LIST_MAX_FPR_SIZE;
+
+      cfun->forced_stack_alignment = 0;
+      cfun->forced_stack_mode = VOIDmode;
     }
 }
 
Index: gcc/function.h
===================================================================
--- gcc/function.h	(revision 152202)
+++ gcc/function.h	(working copy)
@@ -532,6 +532,12 @@ struct GTY(()) function {
      a string describing the reason for failure.  */
   const char * GTY((skip)) cannot_be_copied_reason;
 
+  /* The largest alignment forced on the stack.  */
+  unsigned int forced_stack_alignment;
+
+  /* The mode of the largest alignment forced on the stack.  */
+  enum machine_mode forced_stack_mode;
+
   /* Collected bit flags.  */
 
   /* Number of units of general registers that need saving in stdarg
Index: gcc/tree-vect-data-refs.c
===================================================================
--- gcc/tree-vect-data-refs.c	(revision 152202)
+++ gcc/tree-vect-data-refs.c	(working copy)
@@ -927,6 +927,8 @@ vect_verify_datarefs_alignment (loop_vec
     {
       gimple stmt = DR_STMT (dr);
       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+      enum machine_mode mode;
+      unsigned int align;
 
       /* For interleaving, only the alignment of the first access matters.  */
       if (STMT_VINFO_STRIDED_ACCESS (stmt_info)
@@ -947,6 +949,15 @@ vect_verify_datarefs_alignment (loop_vec
             }
           return false;
         }
+
+      mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
+      align = GET_MODE_ALIGNMENT (mode);
+      if (cfun->forced_stack_alignment < align)
+	{
+	  cfun->forced_stack_alignment = align;
+	  cfun->forced_stack_mode = mode;
+	}
+
       if (supportable_dr_alignment != dr_aligned
           && vect_print_dump_info (REPORT_ALIGNMENT))
         fprintf (vect_dump, "Vectorizing an unaligned access.");
Index: gcc/config/i386/i386.c
===================================================================
--- gcc/config/i386/i386.c	(revision 152202)
+++ gcc/config/i386/i386.c	(working copy)
@@ -8151,16 +8151,31 @@ find_drap_reg (void)
     }
 }
 
+#define VALID_SSE_VECTOR_MODE(MODE) \
+  ((MODE) == V4SFmode || (MODE) == V4SImode || (MODE) == V2DFmode \
+   || (MODE) == V16QImode || (MODE) == V8HImode || (MODE) == V2DImode)
+
 /* Update incoming stack boundary and estimated stack alignment.  */
 
 static void
 ix86_update_stack_boundary (void)
 {
+  /* Should we use STACK_BOUNDARY for incoming stack boundary?  */
+  unsigned int incoming_stack_boundary;
+
+  /* In 32bit, use STACK_BOUNDARY for incoming stack boundary if any
+     SSE variables are put on stack. */
+  if (!TARGET_64BIT
+      && VALID_SSE_VECTOR_MODE (cfun->forced_stack_mode))
+    incoming_stack_boundary = STACK_BOUNDARY;
+  else
+    incoming_stack_boundary = ix86_default_incoming_stack_boundary;
+
   /* Prefer the one specified at command line. */
   ix86_incoming_stack_boundary 
     = (ix86_user_incoming_stack_boundary
        ? ix86_user_incoming_stack_boundary
-       : ix86_default_incoming_stack_boundary);
+       : incoming_stack_boundary);
 
   /* Incoming stack alignment can be changed on individual functions
      via force_align_arg_pointer attribute.  We use the smallest
@@ -19773,7 +19788,7 @@ ix86_minimum_alignment (tree exp, enum m
 {
   tree type, decl;
 
-  if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
+  if (TARGET_64BIT)
     return align;
 
   if (exp && DECL_P (exp))
@@ -19787,6 +19802,25 @@ ix86_minimum_alignment (tree exp, enum m
       decl = NULL;
     }
 
+  /* In 32bit, update forced_stack_mode if any SSE variables are put
+     on stack.  */
+  if (cfun->forced_stack_alignment < 128)
+    {
+      if (VALID_SSE_VECTOR_MODE (mode))
+	{
+	  cfun->forced_stack_alignment = 128;
+	  cfun->forced_stack_mode = mode;
+	}
+      else if ((type && VALID_SSE_VECTOR_MODE (TYPE_MODE (type))))
+	{
+	  cfun->forced_stack_alignment = 128;
+	  cfun->forced_stack_mode = TYPE_MODE (type);
+	}
+    }
+
+  if (align != 64 || ix86_preferred_stack_boundary >= 64)
+    return align;
+
   /* Don't do dynamic stack realignment for long long objects with
      -mpreferred-stack-boundary=2.  */
   if ((mode == DImode || (type && TYPE_MODE (type) == DImode))

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is  aligned
  2009-10-15 15:58 ` H.J. Lu
@ 2009-10-15 18:45   ` Uros Bizjak
  2009-10-15 19:22     ` H.J. Lu
  2009-10-16 20:27     ` H.J. Lu
  0 siblings, 2 replies; 59+ messages in thread
From: Uros Bizjak @ 2009-10-15 18:45 UTC (permalink / raw)
  To: H.J. Lu; +Cc: gcc-patches, Jakub Jelinek

On 10/15/2009 05:48 PM, H.J. Lu wrote:
> On Thu, Aug 06, 2009 at 02:42:16PM -0700, H.J. Lu wrote:
>    
>> Hi,
>>
>> In 32bit, the incoming stack may not be 16 byte aligned.  This patch
>> assumes the incoming stack is 4 byte aligned and realigns stack if any
>> SSE variable is put on stack. Any comments?
>>
>>      
> For i386, gcc, up to gcc 3.4, had been generating 4byte stack alignment.
> 4byte stack alignment is a problem unless SSE registers are used, which
> may be generated by vectorizer or intrinsics. This patch accommodates
> the binaries generated by previous versions of gcc if SSE registers are
> generated by vectorizer or intrinsics. The performance impact on SPEC
> CPU 2006 is minimum:
>
> http://gcc.gnu.org/bugzilla/show_bug.cgi?id=41156#c7
>
> OK for trunk?
>    

Comments bellow.

> ---
> gcc/
>
> 2009-09-26  H.J. Lu<hongjiu.lu@intel.com>
>
> 	PR target/40838
> 	* function.c (allocate_struct_function): Initialize
> 	forced_stack_alignment and forced_stack_mode.
>
> 	* function.h (function): Add forced_stack_alignment and
> 	forced_stack_mode.
>
> 	* tree-vect-data-refs.c	(vect_verify_datarefs_alignment):
> 	Update forced_stack_alignment and forced_stack_mode.
>
> 	* config/i386/i386.c (VALID_SSE_VECTOR_MODE): New.
> 	(ix86_update_stack_boundary): In 32bit, use STACK_BOUNDARY if
> 	any SSE variables are forced on stack.
> 	(ix86_minimum_alignment): In 32bit, update
> 	forced_stack_alignment and forced_stack_mode if any SSE
> 	variables are put on stack.
>
> gcc/testsuite/
>
>
>    
...

>
> Index: gcc/function.h
> ===================================================================
> --- gcc/function.h	(revision 152202)
> +++ gcc/function.h	(working copy)
> @@ -532,6 +532,12 @@ struct GTY(()) function {
>        a string describing the reason for failure.  */
>     const char * GTY((skip)) cannot_be_copied_reason;
>
> +  /* The largest alignment forced on the stack.  */
> +  unsigned int forced_stack_alignment;
> +
> +  /* The mode of the largest alignment forced on the stack.  */
> +  enum machine_mode forced_stack_mode;
> +
>     /* Collected bit flags.  */
>
>     /* Number of units of general registers that need saving in stdarg
> Index: gcc/tree-vect-data-refs.c
> ===================================================================
> --- gcc/tree-vect-data-refs.c	(revision 152202)
> +++ gcc/tree-vect-data-refs.c	(working copy)
> @@ -927,6 +927,8 @@ vect_verify_datarefs_alignment (loop_vec
>       {
>         gimple stmt = DR_STMT (dr);
>         stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
> +      enum machine_mode mode;
> +      unsigned int align;
>
>         /* For interleaving, only the alignment of the first access matters.  */
>         if (STMT_VINFO_STRIDED_ACCESS (stmt_info)
> @@ -947,6 +949,15 @@ vect_verify_datarefs_alignment (loop_vec
>               }
>             return false;
>           }
> +
> +      mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
> +      align = GET_MODE_ALIGNMENT (mode);
> +      if (cfun->forced_stack_alignment<  align)
> +	{
> +	  cfun->forced_stack_alignment = align;
> +	  cfun->forced_stack_mode = mode;
> +	}
> +
>         if (supportable_dr_alignment != dr_aligned
>             &&  vect_print_dump_info (REPORT_ALIGNMENT))
>           fprintf (vect_dump, "Vectorizing an unaligned access.");
> Index: gcc/config/i386/i386.c
> ===================================================================
> --- gcc/config/i386/i386.c	(revision 152202)
> +++ gcc/config/i386/i386.c	(working copy)
> @@ -8151,16 +8151,31 @@ find_drap_reg (void)
>       }
>   }
>
> +#define VALID_SSE_VECTOR_MODE(MODE) \
> +  ((MODE) == V4SFmode || (MODE) == V4SImode || (MODE) == V2DFmode \
> +   || (MODE) == V16QImode || (MODE) == V8HImode || (MODE) == V2DImode)
> +
>   /* Update incoming stack boundary and estimated stack alignment.  */
>
>   static void
>   ix86_update_stack_boundary (void)
>   {
> +  /* Should we use STACK_BOUNDARY for incoming stack boundary?  */
> +  unsigned int incoming_stack_boundary;
> +
> +  /* In 32bit, use STACK_BOUNDARY for incoming stack boundary if any
> +     SSE variables are put on stack. */
> +  if (!TARGET_64BIT
> +&&  VALID_SSE_VECTOR_MODE (cfun->forced_stack_mode))
> +    incoming_stack_boundary = STACK_BOUNDARY;
> +  else
> +    incoming_stack_boundary = ix86_default_incoming_stack_boundary;
> +
>     /* Prefer the one specified at command line. */
>     ix86_incoming_stack_boundary
>       = (ix86_user_incoming_stack_boundary
>          ? ix86_user_incoming_stack_boundary
> -       : ix86_default_incoming_stack_boundary);
> +       : incoming_stack_boundary);
>
>     /* Incoming stack alignment can be changed on individual functions
>        via force_align_arg_pointer attribute.  We use the smallest
> @@ -19773,7 +19788,7 @@ ix86_minimum_alignment (tree exp, enum m
>   {
>     tree type, decl;
>
> -  if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary>= 64)
> +  if (TARGET_64BIT)
>       return align;
>
>     if (exp&&  DECL_P (exp))
> @@ -19787,6 +19802,25 @@ ix86_minimum_alignment (tree exp, enum m
>         decl = NULL;
>       }
>
> +  /* In 32bit, update forced_stack_mode if any SSE variables are put
> +     on stack.  */
> +  if (cfun->forced_stack_alignment<  128)
> +    {
> +      if (VALID_SSE_VECTOR_MODE (mode))
> +	{
> +	  cfun->forced_stack_alignment = 128;
> +	  cfun->forced_stack_mode = mode;
> +	}
> +      else if ((type&&  VALID_SSE_VECTOR_MODE (TYPE_MODE (type))))
> +	{
> +	  cfun->forced_stack_alignment = 128;
> +	  cfun->forced_stack_mode = TYPE_MODE (type);
> +	}
> +    }
> +
> +  if (align != 64 || ix86_preferred_stack_boundary>= 64)
> +    return align;
> +
>    

We should use PREFERRED_STACK_BOUNDARY_DEFAULT instead of hardcoded 128 
here. At least the comment for P_S_B_D says:

/* It should be MIN_STACK_BOUNDARY.  But we set it to 128 bits for
    both 32bit and 64bit, to support codes that need 128 bit stack
    alignment for SSE instructions, but can't realign the stack.  */
#define PREFERRED_STACK_BOUNDARY_DEFAULT 128

OTOH, I think that this whole stuff should depend on -mstackrealing 
somehow, according to the comment:

/* 1 if -mstackrealign should be turned on by default.  It will
    generate an alternate prologue and epilogue that realigns the
    runtime stack if nessary.  This supports mixing codes that keep a
    4-byte aligned stack, as specified by i386 psABI, with codes that
    need a 16-byte aligned stack, as required by SSE instructions.  If
    STACK_REALIGN_DEFAULT is 1 and PREFERRED_STACK_BOUNDARY_DEFAULT is
    128, stacks for all functions may be realigned.  */
#define STACK_REALIGN_DEFAULT 0

As far as I understand from many comments from the PR40838 trail 
(especially comment #51), -mstackrealing is not effective in some cases 
involving automatic SSE variables on the stack. I think that 
-mstackrealign should be fixed in the way your patch outlines, so 
old/broken sources that assume 4byte alignment can be compiled using 
this option without penalizing new/fixed code that assumes 16byte alignment.

Also, Jakub has its opinion here, I have also CC'd him.
Uros.

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is   aligned
  2009-10-15 18:45   ` Uros Bizjak
@ 2009-10-15 19:22     ` H.J. Lu
  2009-10-15 19:32       ` Uros Bizjak
  2009-10-16 20:27     ` H.J. Lu
  1 sibling, 1 reply; 59+ messages in thread
From: H.J. Lu @ 2009-10-15 19:22 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: gcc-patches, Jakub Jelinek

On Thu, Oct 15, 2009 at 11:40 AM, Uros Bizjak <ubizjak@gmail.com> wrote:
> On 10/15/2009 05:48 PM, H.J. Lu wrote:
>>
>> On Thu, Aug 06, 2009 at 02:42:16PM -0700, H.J. Lu wrote:
>>
>>>
>>> Hi,
>>>
>>> In 32bit, the incoming stack may not be 16 byte aligned.  This patch
>>> assumes the incoming stack is 4 byte aligned and realigns stack if any
>>> SSE variable is put on stack. Any comments?
>>>
>>>
>>
>> For i386, gcc, up to gcc 3.4, had been generating 4byte stack alignment.
>> 4byte stack alignment is a problem unless SSE registers are used, which
>> may be generated by vectorizer or intrinsics. This patch accommodates
>> the binaries generated by previous versions of gcc if SSE registers are
>> generated by vectorizer or intrinsics. The performance impact on SPEC
>> CPU 2006 is minimum:
>>
>> http://gcc.gnu.org/bugzilla/show_bug.cgi?id=41156#c7
>>
>> OK for trunk?
>>
>
> Comments bellow.
>
>> ---
>> gcc/
>>
>> 2009-09-26  H.J. Lu<hongjiu.lu@intel.com>
>>
>>        PR target/40838
>>        * function.c (allocate_struct_function): Initialize
>>        forced_stack_alignment and forced_stack_mode.
>>
>>        * function.h (function): Add forced_stack_alignment and
>>        forced_stack_mode.
>>
>>        * tree-vect-data-refs.c (vect_verify_datarefs_alignment):
>>        Update forced_stack_alignment and forced_stack_mode.
>>
>>        * config/i386/i386.c (VALID_SSE_VECTOR_MODE): New.
>>        (ix86_update_stack_boundary): In 32bit, use STACK_BOUNDARY if
>>        any SSE variables are forced on stack.
>>        (ix86_minimum_alignment): In 32bit, update
>>        forced_stack_alignment and forced_stack_mode if any SSE
>>        variables are put on stack.
>>
>> gcc/testsuite/
>>
>>
>>
>
> ...
>
>>
>> Index: gcc/function.h
>> ===================================================================
>> --- gcc/function.h      (revision 152202)
>> +++ gcc/function.h      (working copy)
>> @@ -532,6 +532,12 @@ struct GTY(()) function {
>>       a string describing the reason for failure.  */
>>    const char * GTY((skip)) cannot_be_copied_reason;
>>
>> +  /* The largest alignment forced on the stack.  */
>> +  unsigned int forced_stack_alignment;
>> +
>> +  /* The mode of the largest alignment forced on the stack.  */
>> +  enum machine_mode forced_stack_mode;
>> +
>>    /* Collected bit flags.  */
>>
>>    /* Number of units of general registers that need saving in stdarg
>> Index: gcc/tree-vect-data-refs.c
>> ===================================================================
>> --- gcc/tree-vect-data-refs.c   (revision 152202)
>> +++ gcc/tree-vect-data-refs.c   (working copy)
>> @@ -927,6 +927,8 @@ vect_verify_datarefs_alignment (loop_vec
>>      {
>>        gimple stmt = DR_STMT (dr);
>>        stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
>> +      enum machine_mode mode;
>> +      unsigned int align;
>>
>>        /* For interleaving, only the alignment of the first access
>> matters.  */
>>        if (STMT_VINFO_STRIDED_ACCESS (stmt_info)
>> @@ -947,6 +949,15 @@ vect_verify_datarefs_alignment (loop_vec
>>              }
>>            return false;
>>          }
>> +
>> +      mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
>> +      align = GET_MODE_ALIGNMENT (mode);
>> +      if (cfun->forced_stack_alignment<  align)
>> +       {
>> +         cfun->forced_stack_alignment = align;
>> +         cfun->forced_stack_mode = mode;
>> +       }
>> +
>>        if (supportable_dr_alignment != dr_aligned
>>            &&  vect_print_dump_info (REPORT_ALIGNMENT))
>>          fprintf (vect_dump, "Vectorizing an unaligned access.");
>> Index: gcc/config/i386/i386.c
>> ===================================================================
>> --- gcc/config/i386/i386.c      (revision 152202)
>> +++ gcc/config/i386/i386.c      (working copy)
>> @@ -8151,16 +8151,31 @@ find_drap_reg (void)
>>      }
>>  }
>>
>> +#define VALID_SSE_VECTOR_MODE(MODE) \
>> +  ((MODE) == V4SFmode || (MODE) == V4SImode || (MODE) == V2DFmode \
>> +   || (MODE) == V16QImode || (MODE) == V8HImode || (MODE) == V2DImode)
>> +
>>  /* Update incoming stack boundary and estimated stack alignment.  */
>>
>>  static void
>>  ix86_update_stack_boundary (void)
>>  {
>> +  /* Should we use STACK_BOUNDARY for incoming stack boundary?  */
>> +  unsigned int incoming_stack_boundary;
>> +
>> +  /* In 32bit, use STACK_BOUNDARY for incoming stack boundary if any
>> +     SSE variables are put on stack. */
>> +  if (!TARGET_64BIT
>> +&&  VALID_SSE_VECTOR_MODE (cfun->forced_stack_mode))
>> +    incoming_stack_boundary = STACK_BOUNDARY;
>> +  else
>> +    incoming_stack_boundary = ix86_default_incoming_stack_boundary;
>> +
>>    /* Prefer the one specified at command line. */
>>    ix86_incoming_stack_boundary
>>      = (ix86_user_incoming_stack_boundary
>>         ? ix86_user_incoming_stack_boundary
>> -       : ix86_default_incoming_stack_boundary);
>> +       : incoming_stack_boundary);
>>
>>    /* Incoming stack alignment can be changed on individual functions
>>       via force_align_arg_pointer attribute.  We use the smallest
>> @@ -19773,7 +19788,7 @@ ix86_minimum_alignment (tree exp, enum m
>>  {
>>    tree type, decl;
>>
>> -  if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary>= 64)
>> +  if (TARGET_64BIT)
>>      return align;
>>
>>    if (exp&&  DECL_P (exp))
>> @@ -19787,6 +19802,25 @@ ix86_minimum_alignment (tree exp, enum m
>>        decl = NULL;
>>      }
>>
>> +  /* In 32bit, update forced_stack_mode if any SSE variables are put
>> +     on stack.  */
>> +  if (cfun->forced_stack_alignment<  128)
>> +    {
>> +      if (VALID_SSE_VECTOR_MODE (mode))
>> +       {
>> +         cfun->forced_stack_alignment = 128;
>> +         cfun->forced_stack_mode = mode;
>> +       }
>> +      else if ((type&&  VALID_SSE_VECTOR_MODE (TYPE_MODE (type))))
>> +       {
>> +         cfun->forced_stack_alignment = 128;
>> +         cfun->forced_stack_mode = TYPE_MODE (type);
>> +       }
>> +    }
>> +
>> +  if (align != 64 || ix86_preferred_stack_boundary>= 64)
>> +    return align;
>> +
>>
>
> We should use PREFERRED_STACK_BOUNDARY_DEFAULT instead of hardcoded 128
> here. At least the comment for P_S_B_D says:
>
> /* It should be MIN_STACK_BOUNDARY.  But we set it to 128 bits for
>   both 32bit and 64bit, to support codes that need 128 bit stack
>   alignment for SSE instructions, but can't realign the stack.  */
> #define PREFERRED_STACK_BOUNDARY_DEFAULT 128

In forced_stack_alignment, we collect the hard alignment requirement
on stack. The decision if stack should be aligned depends on the
incoming stack alignment and the hard stack alignment. It is separate
from PREFERRED_STACK_BOUNDARY_DEFAULT.

> OTOH, I think that this whole stuff should depend on -mstackrealing somehow,
> according to the comment:
>
> /* 1 if -mstackrealign should be turned on by default.  It will
>   generate an alternate prologue and epilogue that realigns the
>   runtime stack if nessary.  This supports mixing codes that keep a
>   4-byte aligned stack, as specified by i386 psABI, with codes that
>   need a 16-byte aligned stack, as required by SSE instructions.  If
>   STACK_REALIGN_DEFAULT is 1 and PREFERRED_STACK_BOUNDARY_DEFAULT is
>   128, stacks for all functions may be realigned.  */
> #define STACK_REALIGN_DEFAULT 0
>
> As far as I understand from many comments from the PR40838 trail (especially
> comment #51), -mstackrealing is not effective in some cases involving
> automatic SSE variables on the stack. I think that -mstackrealign should be
> fixed in the way your patch outlines, so old/broken sources that assume
> 4byte alignment can be compiled using this option without penalizing
> new/fixed code that assumes 16byte alignment.
>

-mstackrealign is very effective. If we turn it on by default, it works fine:

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=40838#c45

It is just that we don't want to make it the default since we will realign
the stack for almost all functions. In most cases, misaligned stack won't
cause any problems.

What my patch does is to assume 4byte incoming stack alignment
for functions containing SSE instructions which require hard 16byte
stack alignment.

After my patch is applied, we can update -mstackrealign to make it an
no-op since all the problems it tries to solve:

http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00854.html
http://gcc.gnu.org/ml/gcc-patches/2006-05/msg00526.html

disappeared with my patch.

Thanks.


-- 
H.J.

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is   aligned
  2009-10-15 19:22     ` H.J. Lu
@ 2009-10-15 19:32       ` Uros Bizjak
  2009-10-15 19:43         ` H.J. Lu
  0 siblings, 1 reply; 59+ messages in thread
From: Uros Bizjak @ 2009-10-15 19:32 UTC (permalink / raw)
  To: H.J. Lu; +Cc: gcc-patches, Jakub Jelinek

On 10/15/2009 09:11 PM, H.J. Lu wrote:
>> We should use PREFERRED_STACK_BOUNDARY_DEFAULT instead of hardcoded 128
>> here. At least the comment for P_S_B_D says:
>>
>> /* It should be MIN_STACK_BOUNDARY. �But we set it to 128 bits for
>> � both 32bit and 64bit, to support codes that need 128 bit stack
>> � alignment for SSE instructions, but can't realign the stack. �*/
>> #define PREFERRED_STACK_BOUNDARY_DEFAULT 128
>>      
> In forced_stack_alignment, we collect the hard alignment requirement
> on stack. The decision if stack should be aligned depends on the
> incoming stack alignment and the hard stack alignment. It is separate
> from PREFERRED_STACK_BOUNDARY_DEFAULT.
>
>    
>> OTOH, I think that this whole stuff should depend on -mstackrealing somehow,
>> according to the comment:
>>
>> /* 1 if -mstackrealign should be turned on by default. �It will
>> � generate an alternate prologue and epilogue that realigns the
>> � runtime stack if nessary. �This supports mixing codes that keep a
>> � 4-byte aligned stack, as specified by i386 psABI, with codes that
>> � need a 16-byte aligned stack, as required by SSE instructions. �If
>> � STACK_REALIGN_DEFAULT is 1 and PREFERRED_STACK_BOUNDARY_DEFAULT is
>> � 128, stacks for all functions may be realigned. �*/
>> #define STACK_REALIGN_DEFAULT 0
>>
>> As far as I understand from many comments from the PR40838 trail (especially
>> comment #51), -mstackrealing is not effective in some cases involving
>> automatic SSE variables on the stack. I think that -mstackrealign should be
>> fixed in the way your patch outlines, so old/broken sources that assume
>> 4byte alignment can be compiled using this option without penalizing
>> new/fixed code that assumes 16byte alignment.
>>
>>      
> -mstackrealign is very effective. If we turn it on by default, it works fine:
>
> http://gcc.gnu.org/bugzilla/show_bug.cgi?id=40838#c45
>
> It is just that we don't want to make it the default since we will realign
> the stack for almost all functions. In most cases, misaligned stack won't
> cause any problems.
>
> What my patch does is to assume 4byte incoming stack alignment
> for functions containing SSE instructions which require hard 16byte
> stack alignment.
>
> After my patch is applied, we can update -mstackrealign to make it an
> no-op since all the problems it tries to solve:
>
> http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00854.html
> http://gcc.gnu.org/ml/gcc-patches/2006-05/msg00526.html
>
> disappeared with my patch.
>    

Then we should do realignment the other way around: instead of using 
-mstackrealing for all the code (including where it has no effect), 
let's use -mstackrealign to activate realignment functionality that is 
introduced by your patch.

IOW, lightweight -mstackrealign, firing up only when there is the 
possibility of unaligned access in the code it precedes.

Uros.

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is   aligned
  2009-10-15 19:32       ` Uros Bizjak
@ 2009-10-15 19:43         ` H.J. Lu
  2009-10-15 19:48           ` Jakub Jelinek
  2009-10-15 19:53           ` Uros Bizjak
  0 siblings, 2 replies; 59+ messages in thread
From: H.J. Lu @ 2009-10-15 19:43 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: gcc-patches, Jakub Jelinek

On Thu, Oct 15, 2009 at 12:27 PM, Uros Bizjak <ubizjak@gmail.com> wrote:
> On 10/15/2009 09:11 PM, H.J. Lu wrote:
>>>
>>> We should use PREFERRED_STACK_BOUNDARY_DEFAULT instead of hardcoded 128
>>> here. At least the comment for P_S_B_D says:
>>>
>>> /* It should be MIN_STACK_BOUNDARY. �But we set it to 128 bits for
>>> � both 32bit and 64bit, to support codes that need 128 bit stack
>>> � alignment for SSE instructions, but can't realign the stack. �*/
>>> #define PREFERRED_STACK_BOUNDARY_DEFAULT 128
>>>
>>
>> In forced_stack_alignment, we collect the hard alignment requirement
>> on stack. The decision if stack should be aligned depends on the
>> incoming stack alignment and the hard stack alignment. It is separate
>> from PREFERRED_STACK_BOUNDARY_DEFAULT.
>>
>>
>>>
>>> OTOH, I think that this whole stuff should depend on -mstackrealing
>>> somehow,
>>> according to the comment:
>>>
>>> /* 1 if -mstackrealign should be turned on by default. �It will
>>> � generate an alternate prologue and epilogue that realigns the
>>> � runtime stack if nessary. �This supports mixing codes that keep a
>>> � 4-byte aligned stack, as specified by i386 psABI, with codes that
>>> � need a 16-byte aligned stack, as required by SSE instructions. �If
>>> � STACK_REALIGN_DEFAULT is 1 and PREFERRED_STACK_BOUNDARY_DEFAULT is
>>> � 128, stacks for all functions may be realigned. �*/
>>> #define STACK_REALIGN_DEFAULT 0
>>>
>>> As far as I understand from many comments from the PR40838 trail
>>> (especially
>>> comment #51), -mstackrealing is not effective in some cases involving
>>> automatic SSE variables on the stack. I think that -mstackrealign should
>>> be
>>> fixed in the way your patch outlines, so old/broken sources that assume
>>> 4byte alignment can be compiled using this option without penalizing
>>> new/fixed code that assumes 16byte alignment.
>>>
>>>
>>
>> -mstackrealign is very effective. If we turn it on by default, it works
>> fine:
>>
>> http://gcc.gnu.org/bugzilla/show_bug.cgi?id=40838#c45
>>
>> It is just that we don't want to make it the default since we will realign
>> the stack for almost all functions. In most cases, misaligned stack won't
>> cause any problems.
>>
>> What my patch does is to assume 4byte incoming stack alignment
>> for functions containing SSE instructions which require hard 16byte
>> stack alignment.
>>
>> After my patch is applied, we can update -mstackrealign to make it an
>> no-op since all the problems it tries to solve:
>>
>> http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00854.html
>> http://gcc.gnu.org/ml/gcc-patches/2006-05/msg00526.html
>>
>> disappeared with my patch.
>>
>
> Then we should do realignment the other way around: instead of using
> -mstackrealing for all the code (including where it has no effect), let's
> use -mstackrealign to activate realignment functionality that is introduced
> by your patch.

That defeats the whole purpose of my patch, which automatically
realigns the stack when there is a hard alignment requirement. If it
isn't turned on by default, it is not very useful.

> IOW, lightweight -mstackrealign, firing up only when there is the
> possibility of unaligned access in the code it precedes.
>

That is what my patch does, but turned it on by default.

-- 
H.J.

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is  aligned
  2009-10-15 19:43         ` H.J. Lu
@ 2009-10-15 19:48           ` Jakub Jelinek
  2009-10-15 20:11             ` H.J. Lu
  2009-10-15 19:53           ` Uros Bizjak
  1 sibling, 1 reply; 59+ messages in thread
From: Jakub Jelinek @ 2009-10-15 19:48 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Uros Bizjak, gcc-patches

On Thu, Oct 15, 2009 at 12:32:12PM -0700, H.J. Lu wrote:
> > Then we should do realignment the other way around: instead of using
> > -mstackrealing for all the code (including where it has no effect), let's
> > use -mstackrealign to activate realignment functionality that is introduced
> > by your patch.
> 
> That defeats the whole purpose of my patch, which automatically
> realigns the stack when there is a hard alignment requirement. If it
> isn't turned on by default, it is not very useful.
> 
> > IOW, lightweight -mstackrealign, firing up only when there is the
> > possibility of unaligned access in the code it precedes.
> >
> 
> That is what my patch does, but turned it on by default.

I agree with Uros, I have nothing against a lightweight -mstackrealign,
but forcing this upon everybody just because a few people insist on compiling
code with -mpreferred-stack-boundary=2 is IMHO a very bad idea and I object
against it.  -mpreferred-stack-boundary=2 is fine for the kernel where you
basically define your own ABI, like many other ABI changing options, and for
people who know what they are doing and bear the consequences (e.g. that
they need to -mstackrealign when calling outside code), but we shouldn't
penalize because of that rare option all the code by default.  At least not
on Linux.

	Jakub

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is   aligned
  2009-10-15 19:43         ` H.J. Lu
  2009-10-15 19:48           ` Jakub Jelinek
@ 2009-10-15 19:53           ` Uros Bizjak
  2009-10-15 21:01             ` H.J. Lu
  1 sibling, 1 reply; 59+ messages in thread
From: Uros Bizjak @ 2009-10-15 19:53 UTC (permalink / raw)
  To: H.J. Lu; +Cc: gcc-patches, Jakub Jelinek

On 10/15/2009 09:32 PM, H.J. Lu wrote:
>>>> We should use PREFERRED_STACK_BOUNDARY_DEFAULT instead of hardcoded 128
>>>> here. At least the comment for P_S_B_D says:
>>>>
>>>> /* It should be MIN_STACK_BOUNDARY. �But we set it to 128 bits for
>>>> � both 32bit and 64bit, to support codes that need 128 bit stack
>>>> � alignment for SSE instructions, but can't realign the stack. �*/
>>>> #define PREFERRED_STACK_BOUNDARY_DEFAULT 128
>>>>
>>>>          
>>> In forced_stack_alignment, we collect the hard alignment requirement
>>> on stack. The decision if stack should be aligned depends on the
>>> incoming stack alignment and the hard stack alignment. It is separate
>>> from PREFERRED_STACK_BOUNDARY_DEFAULT.
>>>
>>>
>>>        
>>>> OTOH, I think that this whole stuff should depend on -mstackrealing
>>>> somehow,
>>>> according to the comment:
>>>>
>>>> /* 1 if -mstackrealign should be turned on by default. �It will
>>>> � generate an alternate prologue and epilogue that realigns the
>>>> � runtime stack if nessary. �This supports mixing codes that keep a
>>>> � 4-byte aligned stack, as specified by i386 psABI, with codes that
>>>> � need a 16-byte aligned stack, as required by SSE instructions. �If
>>>> � STACK_REALIGN_DEFAULT is 1 and PREFERRED_STACK_BOUNDARY_DEFAULT is
>>>> � 128, stacks for all functions may be realigned. �*/
>>>> #define STACK_REALIGN_DEFAULT 0
>>>>
>>>> As far as I understand from many comments from the PR40838 trail
>>>> (especially
>>>> comment #51), -mstackrealing is not effective in some cases involving
>>>> automatic SSE variables on the stack. I think that -mstackrealign should
>>>> be
>>>> fixed in the way your patch outlines, so old/broken sources that assume
>>>> 4byte alignment can be compiled using this option without penalizing
>>>> new/fixed code that assumes 16byte alignment.
>>>>
>>>>
>>>>          
>>> -mstackrealign is very effective. If we turn it on by default, it works
>>> fine:
>>>
>>> http://gcc.gnu.org/bugzilla/show_bug.cgi?id=40838#c45
>>>
>>> It is just that we don't want to make it the default since we will realign
>>> the stack for almost all functions. In most cases, misaligned stack won't
>>> cause any problems.
>>>
>>> What my patch does is to assume 4byte incoming stack alignment
>>> for functions containing SSE instructions which require hard 16byte
>>> stack alignment.
>>>
>>> After my patch is applied, we can update -mstackrealign to make it an
>>> no-op since all the problems it tries to solve:
>>>
>>> http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00854.html
>>> http://gcc.gnu.org/ml/gcc-patches/2006-05/msg00526.html
>>>
>>> disappeared with my patch.
>>>
>>>        
>> Then we should do realignment the other way around: instead of using
>> -mstackrealing for all the code (including where it has no effect), let's
>> use -mstackrealign to activate realignment functionality that is introduced
>> by your patch.
>>      
> That defeats the whole purpose of my patch, which automatically
> realigns the stack when there is a hard alignment requirement. If it
> isn't turned on by default, it is not very useful.
>
>    
>> IOW, lightweight -mstackrealign, firing up only when there is the
>> possibility of unaligned access in the code it precedes.
>>
>>      
> That is what my patch does, but turned it on by default.
>    

I think that we have the same situation here as was with the infamous 
-mcld option. A specific functionality is needed for compatibility with 
some [broken] code and this functionality has non-negligible impact on 
the performance. AFAICS, there will always be opinions for this 
functionality as well as opinions against it.

I propose that we implement new option -mhard-stackrealign [we can 
bikeshed about this option name a bit ;) ] with corresponding 
--enable-hard-stackrealign as a configure option. This way, both groups 
can have whatever they prefer - compatibility vs. performance.

Looking at -mcld discussions, is this acceptable solution?

Uros.

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is   aligned
  2009-10-15 19:48           ` Jakub Jelinek
@ 2009-10-15 20:11             ` H.J. Lu
  0 siblings, 0 replies; 59+ messages in thread
From: H.J. Lu @ 2009-10-15 20:11 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Uros Bizjak, gcc-patches

On Thu, Oct 15, 2009 at 12:43 PM, Jakub Jelinek <jakub@redhat.com> wrote:
> On Thu, Oct 15, 2009 at 12:32:12PM -0700, H.J. Lu wrote:
>> > Then we should do realignment the other way around: instead of using
>> > -mstackrealing for all the code (including where it has no effect), let's
>> > use -mstackrealign to activate realignment functionality that is introduced
>> > by your patch.
>>
>> That defeats the whole purpose of my patch, which automatically
>> realigns the stack when there is a hard alignment requirement. If it
>> isn't turned on by default, it is not very useful.
>>
>> > IOW, lightweight -mstackrealign, firing up only when there is the
>> > possibility of unaligned access in the code it precedes.
>> >
>>
>> That is what my patch does, but turned it on by default.
>
> I agree with Uros, I have nothing against a lightweight -mstackrealign,
> but forcing this upon everybody just because a few people insist on compiling
> code with -mpreferred-stack-boundary=2 is IMHO a very bad idea and I object
> against it.  -mpreferred-stack-boundary=2 is fine for the kernel where you
> basically define your own ABI, like many other ABI changing options, and for
> people who know what they are doing and bear the consequences (e.g. that
> they need to -mstackrealign when calling outside code), but we shouldn't
> penalize because of that rare option all the code by default.  At least not
> on Linux.

Gcc 3.4 and older may align stack to 4byte. Do we declare that
mixing object files compiled with different versions of gcc is
unsupported on Linux/i386?

As for penalty, it only happens at -O3 and its performance impact is
minimum.  We can provide an option to turn it off. It is a correctness
issue.

-- 
H.J.

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is   aligned
  2009-10-15 19:53           ` Uros Bizjak
@ 2009-10-15 21:01             ` H.J. Lu
  2009-10-15 21:41               ` Uros Bizjak
  0 siblings, 1 reply; 59+ messages in thread
From: H.J. Lu @ 2009-10-15 21:01 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: gcc-patches, Jakub Jelinek

On Thu, Oct 15, 2009 at 12:51 PM, Uros Bizjak <ubizjak@gmail.com> wrote:
> On 10/15/2009 09:32 PM, H.J. Lu wrote:
>>>>>
>>>>> We should use PREFERRED_STACK_BOUNDARY_DEFAULT instead of hardcoded 128
>>>>> here. At least the comment for P_S_B_D says:
>>>>>
>>>>> /* It should be MIN_STACK_BOUNDARY. �But we set it to 128 bits for
>>>>> � both 32bit and 64bit, to support codes that need 128 bit stack
>>>>> � alignment for SSE instructions, but can't realign the stack. �*/
>>>>> #define PREFERRED_STACK_BOUNDARY_DEFAULT 128
>>>>>
>>>>>
>>>>
>>>> In forced_stack_alignment, we collect the hard alignment requirement
>>>> on stack. The decision if stack should be aligned depends on the
>>>> incoming stack alignment and the hard stack alignment. It is separate
>>>> from PREFERRED_STACK_BOUNDARY_DEFAULT.
>>>>
>>>>
>>>>
>>>>>
>>>>> OTOH, I think that this whole stuff should depend on -mstackrealing
>>>>> somehow,
>>>>> according to the comment:
>>>>>
>>>>> /* 1 if -mstackrealign should be turned on by default. �It will
>>>>> � generate an alternate prologue and epilogue that realigns the
>>>>> � runtime stack if nessary. �This supports mixing codes that keep a
>>>>> � 4-byte aligned stack, as specified by i386 psABI, with codes that
>>>>> � need a 16-byte aligned stack, as required by SSE instructions. �If
>>>>> � STACK_REALIGN_DEFAULT is 1 and PREFERRED_STACK_BOUNDARY_DEFAULT is
>>>>> � 128, stacks for all functions may be realigned. �*/
>>>>> #define STACK_REALIGN_DEFAULT 0
>>>>>
>>>>> As far as I understand from many comments from the PR40838 trail
>>>>> (especially
>>>>> comment #51), -mstackrealing is not effective in some cases involving
>>>>> automatic SSE variables on the stack. I think that -mstackrealign
>>>>> should
>>>>> be
>>>>> fixed in the way your patch outlines, so old/broken sources that assume
>>>>> 4byte alignment can be compiled using this option without penalizing
>>>>> new/fixed code that assumes 16byte alignment.
>>>>>
>>>>>
>>>>>
>>>>
>>>> -mstackrealign is very effective. If we turn it on by default, it works
>>>> fine:
>>>>
>>>> http://gcc.gnu.org/bugzilla/show_bug.cgi?id=40838#c45
>>>>
>>>> It is just that we don't want to make it the default since we will
>>>> realign
>>>> the stack for almost all functions. In most cases, misaligned stack
>>>> won't
>>>> cause any problems.
>>>>
>>>> What my patch does is to assume 4byte incoming stack alignment
>>>> for functions containing SSE instructions which require hard 16byte
>>>> stack alignment.
>>>>
>>>> After my patch is applied, we can update -mstackrealign to make it an
>>>> no-op since all the problems it tries to solve:
>>>>
>>>> http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00854.html
>>>> http://gcc.gnu.org/ml/gcc-patches/2006-05/msg00526.html
>>>>
>>>> disappeared with my patch.
>>>>
>>>>
>>>
>>> Then we should do realignment the other way around: instead of using
>>> -mstackrealing for all the code (including where it has no effect), let's
>>> use -mstackrealign to activate realignment functionality that is
>>> introduced
>>> by your patch.
>>>
>>
>> That defeats the whole purpose of my patch, which automatically
>> realigns the stack when there is a hard alignment requirement. If it
>> isn't turned on by default, it is not very useful.
>>
>>
>>>
>>> IOW, lightweight -mstackrealign, firing up only when there is the
>>> possibility of unaligned access in the code it precedes.
>>>
>>>
>>
>> That is what my patch does, but turned it on by default.
>>
>
> I think that we have the same situation here as was with the infamous -mcld
> option. A specific functionality is needed for compatibility with some
> [broken] code and this functionality has non-negligible impact on the
> performance. AFAICS, there will always be opinions for this functionality as

Well, performance impact of my patch on SPEC CPU 2006 is pretty much
in the noise.

> well as opinions against it.
>
> I propose that we implement new option -mhard-stackrealign [we can bikeshed
> about this option name a bit ;) ] with corresponding
> --enable-hard-stackrealign as a configure option. This way, both groups can
> have whatever they prefer - compatibility vs. performance.
>
> Looking at -mcld discussions, is this acceptable solution?
>

Is --enable-hard-stackrealign enabled by default. If not, it doesn't
buy much.


-- 
H.J.

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is   aligned
  2009-10-15 21:01             ` H.J. Lu
@ 2009-10-15 21:41               ` Uros Bizjak
  0 siblings, 0 replies; 59+ messages in thread
From: Uros Bizjak @ 2009-10-15 21:41 UTC (permalink / raw)
  To: H.J. Lu; +Cc: gcc-patches, Jakub Jelinek

On 10/15/2009 10:59 PM, H.J. Lu wrote:
>> I think that we have the same situation here as was with the infamous -mcld
>> option. A specific functionality is needed for compatibility with some
>> [broken] code and this functionality has non-negligible impact on the
>> performance. AFAICS, there will always be opinions for this functionality as
>>      
> Well, performance impact of my patch on SPEC CPU 2006 is pretty much
> in the noise.
>
>    
>> well as opinions against it.
>>
>> I propose that we implement new option -mhard-stackrealign [we can bikeshed
>> about this option name a bit ;) ] with corresponding
>> --enable-hard-stackrealign as a configure option. This way, both groups can
>> have whatever they prefer - compatibility vs. performance.
>>
>> Looking at -mcld discussions, is this acceptable solution?
>>
>>      
> Is --enable-hard-stackrealign enabled by default. If not, it doesn't
> buy much.
>    

No, it won't be.

Uros.


^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is   aligned
  2009-10-15 18:45   ` Uros Bizjak
  2009-10-15 19:22     ` H.J. Lu
@ 2009-10-16 20:27     ` H.J. Lu
  2009-10-17  1:03       ` Ian Lance Taylor
  2009-10-17  7:09       ` Uros Bizjak
  1 sibling, 2 replies; 59+ messages in thread
From: H.J. Lu @ 2009-10-16 20:27 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: gcc-patches, Jakub Jelinek

[-- Attachment #1: Type: text/plain, Size: 3467 bytes --]

On Thu, Oct 15, 2009 at 11:40 AM, Uros Bizjak <ubizjak@gmail.com> wrote:
> On 10/15/2009 05:48 PM, H.J. Lu wrote:
>>
>> On Thu, Aug 06, 2009 at 02:42:16PM -0700, H.J. Lu wrote:
>>
>>>
>>> Hi,
>>>
>>> In 32bit, the incoming stack may not be 16 byte aligned.  This patch
>>> assumes the incoming stack is 4 byte aligned and realigns stack if any
>>> SSE variable is put on stack. Any comments?
>>>
>>>
>>
>> For i386, gcc, up to gcc 3.4, had been generating 4byte stack alignment.
>> 4byte stack alignment is a problem unless SSE registers are used, which
>> may be generated by vectorizer or intrinsics. This patch accommodates
>> the binaries generated by previous versions of gcc if SSE registers are
>> generated by vectorizer or intrinsics. The performance impact on SPEC
>> CPU 2006 is minimum:
>>
>> http://gcc.gnu.org/bugzilla/show_bug.cgi?id=41156#c7
>>
>> OK for trunk?
>>
>
> Comments bellow.
>

>
> OTOH, I think that this whole stuff should depend on -mstackrealing somehow,
> according to the comment:
>
> /* 1 if -mstackrealign should be turned on by default.  It will
>   generate an alternate prologue and epilogue that realigns the
>   runtime stack if nessary.  This supports mixing codes that keep a
>   4-byte aligned stack, as specified by i386 psABI, with codes that
>   need a 16-byte aligned stack, as required by SSE instructions.  If
>   STACK_REALIGN_DEFAULT is 1 and PREFERRED_STACK_BOUNDARY_DEFAULT is
>   128, stacks for all functions may be realigned.  */
> #define STACK_REALIGN_DEFAULT 0
>
> As far as I understand from many comments from the PR40838 trail (especially
> comment #51), -mstackrealing is not effective in some cases involving
> automatic SSE variables on the stack. I think that -mstackrealign should be
> fixed in the way your patch outlines, so old/broken sources that assume
> 4byte alignment can be compiled using this option without penalizing
> new/fixed code that assumes 16byte alignment.
>
> Also, Jakub has its opinion here, I have also CC'd him.
> Uros.
>

Here is the patch to make -mstackrealing only align stack when
SSE registers are used. People who care about mixing binaries
compiled by the older gcc can use it without aligning stack for
every function. OK for trunk if regression passes?

Thanks.


-- 
H.J.
---
gcc/

2009-10-16  H.J. Lu  <hongjiu.lu@intel.com>

	PR target/40838
	* function.c (allocate_struct_function): Initialize
	hard_stack_alignment.

	* function.h (function): Add hard_stack_alignment.

	* tree-vect-data-refs.c (vect_verify_datarefs_alignment): Update
	hard_stack_alignment.

	* config/i386/i386.c (verride_options): Don't check
	ix86_force_align_arg_pointer here.
	(ix86_update_stack_boundary): In 32bit, use MIN_STACK_BOUNDARY
	for incoming stack boundary if -mstackrealign is used and hard
	stack alignment is 128bit.
	(ix86_minimum_alignment): In 32bit, update hard_stack_alignment
	if alignment is 128bit.

gcc/testsuite/

2009-10-16  H.J. Lu  <hongjiu.lu@intel.com>

	PR target/40838
	* gcc.target/i386/incoming-6.c: New.
	* gcc.target/i386/incoming-7.c: Likewise.
	* gcc.target/i386/incoming-8.c: Likewise.
	* gcc.target/i386/incoming-9.c: Likewise.
	* gcc.target/i386/incoming-10.c: Likewise.
	* gcc.target/i386/incoming-11.c: Likewise.
	* gcc.target/i386/incoming-12.c: Likewise.
	* gcc.target/i386/incoming-13.c: Likewise.
	* gcc.target/i386/incoming-14.c: Likewise.

[-- Attachment #2: gcc-pr40838-6.patch --]
[-- Type: text/plain, Size: 11831 bytes --]

gcc/

2009-10-16  H.J. Lu  <hongjiu.lu@intel.com>

	PR target/40838
	* function.c (allocate_struct_function): Initialize
	hard_stack_alignment.

	* function.h (function): Add hard_stack_alignment.

	* tree-vect-data-refs.c (vect_verify_datarefs_alignment): Update
	hard_stack_alignment.

	* config/i386/i386.c (verride_options): Don't check
	ix86_force_align_arg_pointer here.
	(ix86_update_stack_boundary): In 32bit, use MIN_STACK_BOUNDARY
	for incoming stack boundary if -mstackrealign is used and hard
	stack alignment is 128bit.
	(ix86_minimum_alignment): In 32bit, update hard_stack_alignment
	if alignment is 128bit.

gcc/testsuite/

2009-10-16  H.J. Lu  <hongjiu.lu@intel.com>

	PR target/40838
	* gcc.target/i386/incoming-6.c: New.
	* gcc.target/i386/incoming-7.c: Likewise.
	* gcc.target/i386/incoming-8.c: Likewise.
	* gcc.target/i386/incoming-9.c: Likewise.
	* gcc.target/i386/incoming-10.c: Likewise.
	* gcc.target/i386/incoming-11.c: Likewise.
	* gcc.target/i386/incoming-12.c: Likewise.
	* gcc.target/i386/incoming-13.c: Likewise.
	* gcc.target/i386/incoming-14.c: Likewise.

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 73913b8..ac9d2ad 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -3239,12 +3239,10 @@ override_options (bool main_args_p)
   if (ix86_force_align_arg_pointer == -1)
     ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
 
+  ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
+
   /* Validate -mincoming-stack-boundary= value or default it to
      MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY.  */
-  if (ix86_force_align_arg_pointer)
-    ix86_default_incoming_stack_boundary = MIN_STACK_BOUNDARY;
-  else
-    ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
   ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
   if (ix86_incoming_stack_boundary_string)
     {
@@ -8201,11 +8199,23 @@ find_drap_reg (void)
 static void
 ix86_update_stack_boundary (void)
 {
+  /* Should we use MIN_STACK_BOUNDARY for incoming stack boundary?  */
+  unsigned int incoming_stack_boundary;
+
+  /* In 32bit, use MIN_STACK_BOUNDARY for incoming stack boundary if
+     -mstackrealign is used and hard stack alignment is 128bit.  */
+  if (!TARGET_64BIT
+      && ix86_force_align_arg_pointer
+      && cfun->hard_stack_alignment == 128)
+    incoming_stack_boundary = MIN_STACK_BOUNDARY;
+  else
+    incoming_stack_boundary = ix86_default_incoming_stack_boundary;
+
   /* Prefer the one specified at command line. */
   ix86_incoming_stack_boundary 
     = (ix86_user_incoming_stack_boundary
        ? ix86_user_incoming_stack_boundary
-       : ix86_default_incoming_stack_boundary);
+       : incoming_stack_boundary);
 
   /* Incoming stack alignment can be changed on individual functions
      via force_align_arg_pointer attribute.  We use the smallest
@@ -19863,7 +19873,14 @@ ix86_minimum_alignment (tree exp, enum machine_mode mode,
 {
   tree type, decl;
 
-  if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
+  if (TARGET_64BIT)
+    return align;
+
+  /* In 32bit, update hard_stack_mode if ALIGN is 128.  */
+  if (align == 128 && cfun->hard_stack_alignment < 128)
+    cfun->hard_stack_alignment = align;
+
+  if (align != 64 || ix86_preferred_stack_boundary >= 64)
     return align;
 
   if (exp && DECL_P (exp))
diff --git a/gcc/function.c b/gcc/function.c
index 35c0cfd..1071183 100644
--- a/gcc/function.c
+++ b/gcc/function.c
@@ -4139,6 +4139,8 @@ allocate_struct_function (tree fndecl, bool abstract_p)
       /* Assume all registers in stdarg functions need to be saved.  */
       cfun->va_list_gpr_size = VA_LIST_MAX_GPR_SIZE;
       cfun->va_list_fpr_size = VA_LIST_MAX_FPR_SIZE;
+
+      cfun->hard_stack_alignment = 0;
     }
 }
 
diff --git a/gcc/function.h b/gcc/function.h
index 4825d16..457d5d1 100644
--- a/gcc/function.h
+++ b/gcc/function.h
@@ -532,6 +532,9 @@ struct GTY(()) function {
      a string describing the reason for failure.  */
   const char * GTY((skip)) cannot_be_copied_reason;
 
+  /* The largest hard alignment on the stack.  */
+  unsigned int hard_stack_alignment;
+
   /* Collected bit flags.  */
 
   /* Number of units of general registers that need saving in stdarg
diff --git a/gcc/testsuite/gcc.target/i386/incoming-10.c b/gcc/testsuite/gcc.target/i386/incoming-10.c
new file mode 100644
index 0000000..31d9e61
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/incoming-10.c
@@ -0,0 +1,19 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -mstackrealign -fomit-frame-pointer -O3 -march=barcelona -mpreferred-stack-boundary=4" } */
+
+struct s {
+	int x[8];
+};
+
+void g(struct s *);
+
+void f()
+{
+	int i;
+	struct s s;
+	for (i = 0; i < sizeof(s.x) / sizeof(*s.x); i++) s.x[i] = 0;
+	g(&s);
+}
+
+/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/incoming-11.c b/gcc/testsuite/gcc.target/i386/incoming-11.c
new file mode 100644
index 0000000..e5787af
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/incoming-11.c
@@ -0,0 +1,18 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -mstackrealign -fomit-frame-pointer -O3 -march=barcelona -mpreferred-stack-boundary=4" } */
+
+void g();
+
+int p[100];
+int q[100];
+
+void f()
+{
+	int i;
+	for (i = 0; i < 100; i++) p[i] = 0;
+	g();
+	for (i = 0; i < 100; i++) q[i] = 0;
+}
+
+/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/incoming-12.c b/gcc/testsuite/gcc.target/i386/incoming-12.c
new file mode 100644
index 0000000..d7ef103
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/incoming-12.c
@@ -0,0 +1,20 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -mstackrealign -O2 -msse2 -mpreferred-stack-boundary=4" } */
+
+typedef int v4si __attribute__ ((vector_size (16)));
+
+struct x {
+       v4si v;
+       v4si w;
+};
+
+void y(void *);
+
+v4si x(void)
+{
+       struct x x;
+       y(&x);
+}
+
+/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/incoming-13.c b/gcc/testsuite/gcc.target/i386/incoming-13.c
new file mode 100644
index 0000000..bbc8993
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/incoming-13.c
@@ -0,0 +1,15 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -mstackrealign -O2 -mpreferred-stack-boundary=4" } */
+
+extern double y(double *s3);
+
+extern double s1, s2;
+
+double x(void)
+{
+  double s3 = s1 + s2;
+  return y(&s3);
+}
+
+/* { dg-final { scan-assembler-not "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/incoming-14.c b/gcc/testsuite/gcc.target/i386/incoming-14.c
new file mode 100644
index 0000000..d27179d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/incoming-14.c
@@ -0,0 +1,15 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -mstackrealign -O2 -mpreferred-stack-boundary=4" } */
+
+extern int y(int *s3);
+
+extern int s1, s2;
+
+int x(void)
+{
+  int s3 = s1 + s2;
+  return y(&s3);
+}
+
+/* { dg-final { scan-assembler-not "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/incoming-15.c b/gcc/testsuite/gcc.target/i386/incoming-15.c
new file mode 100644
index 0000000..e6a1749
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/incoming-15.c
@@ -0,0 +1,15 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -mstackrealign -O2 -mpreferred-stack-boundary=4" } */
+
+extern long long y(long long *s3);
+
+extern long long s1, s2;
+
+long long x(void)
+{
+  long long s3 = s1 + s2;
+  return y(&s3);
+}
+
+/* { dg-final { scan-assembler-not "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/incoming-6.c b/gcc/testsuite/gcc.target/i386/incoming-6.c
new file mode 100644
index 0000000..5cc4ab3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/incoming-6.c
@@ -0,0 +1,17 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -mstackrealign -O2 -msse2 -mpreferred-stack-boundary=4" } */
+
+typedef int v4si __attribute__ ((vector_size (16)));
+
+extern v4si y(v4si *s3);
+
+extern v4si s1, s2;
+
+v4si x(void)
+{
+  v4si s3 = s1 + s2;
+  return y(&s3);
+}
+
+/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/incoming-7.c b/gcc/testsuite/gcc.target/i386/incoming-7.c
new file mode 100644
index 0000000..cdd6037
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/incoming-7.c
@@ -0,0 +1,16 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -mstackrealign -O2 -msse2 -mpreferred-stack-boundary=4" } */
+
+typedef int v4si __attribute__ ((vector_size (16)));
+
+extern v4si y(v4si, v4si, v4si, v4si, v4si);
+
+extern v4si s1, s2;
+
+v4si x(void)
+{
+  return y(s1, s2, s1, s2, s2);
+}
+
+/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/incoming-8.c b/gcc/testsuite/gcc.target/i386/incoming-8.c
new file mode 100644
index 0000000..2dd8800
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/incoming-8.c
@@ -0,0 +1,18 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -mstackrealign -O3 -msse2 -mpreferred-stack-boundary=4" } */
+
+float
+foo (float f)
+{
+  float array[128];
+  float x;
+  int i;
+  for (i = 0; i < sizeof(array) / sizeof(*array); i++)
+    array[i] = f;
+  for (i = 0; i < sizeof(array) / sizeof(*array); i++)
+    x += array[i];
+  return x;
+}
+
+/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/incoming-9.c b/gcc/testsuite/gcc.target/i386/incoming-9.c
new file mode 100644
index 0000000..e43cbd6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/incoming-9.c
@@ -0,0 +1,18 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -mstackrealign -O3 -mno-sse -mpreferred-stack-boundary=4" } */
+
+float
+foo (float f)
+{
+  float array[128];
+  float x;
+  int i;
+  for (i = 0; i < sizeof(array) / sizeof(*array); i++)
+    array[i] = f;
+  for (i = 0; i < sizeof(array) / sizeof(*array); i++)
+    x += array[i];
+  return x;
+}
+
+/* { dg-final { scan-assembler-not "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index c3570d3..0daaa70 100644
--- a/gcc/tree-vect-data-refs.c
+++ b/gcc/tree-vect-data-refs.c
@@ -927,6 +927,8 @@ vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
     {
       gimple stmt = DR_STMT (dr);
       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+      enum machine_mode mode;
+      unsigned int align;
 
       /* For interleaving, only the alignment of the first access matters.  */
       if (STMT_VINFO_STRIDED_ACCESS (stmt_info)
@@ -947,6 +949,12 @@ vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
             }
           return false;
         }
+
+      mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
+      align = GET_MODE_ALIGNMENT (mode);
+      if (cfun->hard_stack_alignment < align)
+	cfun->hard_stack_alignment = align;
+
       if (supportable_dr_alignment != dr_aligned
           && vect_print_dump_info (REPORT_ALIGNMENT))
         fprintf (vect_dump, "Vectorizing an unaligned access.");

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is   aligned
  2009-10-16 20:27     ` H.J. Lu
@ 2009-10-17  1:03       ` Ian Lance Taylor
  2009-10-17 18:22         ` H.J. Lu
  2009-10-17  7:09       ` Uros Bizjak
  1 sibling, 1 reply; 59+ messages in thread
From: Ian Lance Taylor @ 2009-10-17  1:03 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Uros Bizjak, gcc-patches, Jakub Jelinek

"H.J. Lu" <hjl.tools@gmail.com> writes:

> @@ -947,6 +949,12 @@ vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
>              }
>            return false;
>          }
> +
> +      mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
> +      align = GET_MODE_ALIGNMENT (mode);
> +      if (cfun->hard_stack_alignment < align)
> +	cfun->hard_stack_alignment = align;
> +
>        if (supportable_dr_alignment != dr_aligned
>            && vect_print_dump_info (REPORT_ALIGNMENT))
>          fprintf (vect_dump, "Vectorizing an unaligned access.");

I do not understand why this is the right place to set your new cfun
field.  Your patch is about a misaligned stack.  The code you are
changing here does not appear to have anything to do with a misaligned
stack.  This code is about the correct alignment of data.  The data
may be on the stack but it need not be.  If the data is on the stack,
then the compiler should be able to determine the alignment available
on the stack.

Ian

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is   aligned
  2009-10-16 20:27     ` H.J. Lu
  2009-10-17  1:03       ` Ian Lance Taylor
@ 2009-10-17  7:09       ` Uros Bizjak
  1 sibling, 0 replies; 59+ messages in thread
From: Uros Bizjak @ 2009-10-17  7:09 UTC (permalink / raw)
  To: H.J. Lu; +Cc: gcc-patches, Jakub Jelinek

On 10/16/2009 10:12 PM, H.J. Lu wrote:

>> OTOH, I think that this whole stuff should depend on -mstackrealing somehow,
>> according to the comment:
>>
>> /* 1 if -mstackrealign should be turned on by default. �It will
>> � generate an alternate prologue and epilogue that realigns the
>> � runtime stack if nessary. �This supports mixing codes that keep a
>> � 4-byte aligned stack, as specified by i386 psABI, with codes that
>> � need a 16-byte aligned stack, as required by SSE instructions. �If
>> � STACK_REALIGN_DEFAULT is 1 and PREFERRED_STACK_BOUNDARY_DEFAULT is
>> � 128, stacks for all functions may be realigned. �*/
>> #define STACK_REALIGN_DEFAULT 0
>>
>> As far as I understand from many comments from the PR40838 trail (especially
>> comment #51), -mstackrealing is not effective in some cases involving
>> automatic SSE variables on the stack. I think that -mstackrealign should be
>> fixed in the way your patch outlines, so old/broken sources that assume
>> 4byte alignment can be compiled using this option without penalizing
>> new/fixed code that assumes 16byte alignment.
>>
>> Also, Jakub has its opinion here, I have also CC'd him.
>> Uros.
>>
>>      
> Here is the patch to make -mstackrealing only align stack when
> SSE registers are used. People who care about mixing binaries
> compiled by the older gcc can use it without aligning stack for
> every function. OK for trunk if regression passes?
>    


Please also update the comment to STACK_REALIGN_DEFAULT in i386.h where 
it still says that stacks for _all_ functions may be realigned.

Otherwise, this patch OK as far as x86 is concerned (please note that 
middle-end parts need another approval).

IMO, this patch makes -mstackrealign really usable. Also, it is in line 
with -mstackrealign documentation that says that this option aligns the 
stack only if necessary [...] to support mixing of 4 and 16byte aligned 
code.

Thanks,
Uros.

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is   aligned
  2009-10-17  1:03       ` Ian Lance Taylor
@ 2009-10-17 18:22         ` H.J. Lu
  2009-10-17 19:02           ` Richard Guenther
  0 siblings, 1 reply; 59+ messages in thread
From: H.J. Lu @ 2009-10-17 18:22 UTC (permalink / raw)
  To: Ian Lance Taylor; +Cc: Uros Bizjak, gcc-patches, Jakub Jelinek

On Fri, Oct 16, 2009 at 5:59 PM, Ian Lance Taylor <iant@google.com> wrote:
> "H.J. Lu" <hjl.tools@gmail.com> writes:
>
>> @@ -947,6 +949,12 @@ vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
>>              }
>>            return false;
>>          }
>> +
>> +      mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
>> +      align = GET_MODE_ALIGNMENT (mode);
>> +      if (cfun->hard_stack_alignment < align)
>> +     cfun->hard_stack_alignment = align;
>> +
>>        if (supportable_dr_alignment != dr_aligned
>>            && vect_print_dump_info (REPORT_ALIGNMENT))
>>          fprintf (vect_dump, "Vectorizing an unaligned access.");
>
> I do not understand why this is the right place to set your new cfun
> field.  Your patch is about a misaligned stack.  The code you are
> changing here does not appear to have anything to do with a misaligned
> stack.  This code is about the correct alignment of data.  The data
> may be on the stack but it need not be.  If the data is on the stack,
> then the compiler should be able to determine the alignment available
> on the stack.

The issue here is to set proper incoming stack alignment. We have
to do it before RTL expansion which uses this information to check
if stack alignment is needed. To do that, we need to know what the
minimum stack alignment required by hardware. We collect hard
stack alignment when we put variables on stack and check the alignment
generated by vectorizer.


-- 
H.J.

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is   aligned
  2009-10-17 18:22         ` H.J. Lu
@ 2009-10-17 19:02           ` Richard Guenther
  2009-10-17 19:21             ` H.J. Lu
  0 siblings, 1 reply; 59+ messages in thread
From: Richard Guenther @ 2009-10-17 19:02 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Ian Lance Taylor, Uros Bizjak, gcc-patches, Jakub Jelinek

On Sat, Oct 17, 2009 at 8:14 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Fri, Oct 16, 2009 at 5:59 PM, Ian Lance Taylor <iant@google.com> wrote:
>> "H.J. Lu" <hjl.tools@gmail.com> writes:
>>
>>> @@ -947,6 +949,12 @@ vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
>>>              }
>>>            return false;
>>>          }
>>> +
>>> +      mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
>>> +      align = GET_MODE_ALIGNMENT (mode);
>>> +      if (cfun->hard_stack_alignment < align)
>>> +     cfun->hard_stack_alignment = align;
>>> +
>>>        if (supportable_dr_alignment != dr_aligned
>>>            && vect_print_dump_info (REPORT_ALIGNMENT))
>>>          fprintf (vect_dump, "Vectorizing an unaligned access.");
>>
>> I do not understand why this is the right place to set your new cfun
>> field.  Your patch is about a misaligned stack.  The code you are
>> changing here does not appear to have anything to do with a misaligned
>> stack.  This code is about the correct alignment of data.  The data
>> may be on the stack but it need not be.  If the data is on the stack,
>> then the compiler should be able to determine the alignment available
>> on the stack.
>
> The issue here is to set proper incoming stack alignment. We have
> to do it before RTL expansion which uses this information to check
> if stack alignment is needed. To do that, we need to know what the
> minimum stack alignment required by hardware. We collect hard
> stack alignment when we put variables on stack and check the alignment
> generated by vectorizer.

It's surely not the right function to do this.  Nor is it a proper
interface IMHO.
Why don't you need to check whether the access is to stack memory at all?

I think you want to instead see if any user-alignment on automatic
variables requires stack realignment.  Thus, wherever the vectorizer
sets DECL_USER_ALIGN on automatic storage.

Richard.

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is   aligned
  2009-10-17 19:02           ` Richard Guenther
@ 2009-10-17 19:21             ` H.J. Lu
  2009-10-17 19:29               ` Richard Guenther
  0 siblings, 1 reply; 59+ messages in thread
From: H.J. Lu @ 2009-10-17 19:21 UTC (permalink / raw)
  To: Richard Guenther
  Cc: Ian Lance Taylor, Uros Bizjak, gcc-patches, Jakub Jelinek

On Sat, Oct 17, 2009 at 11:51 AM, Richard Guenther
<richard.guenther@gmail.com> wrote:
> On Sat, Oct 17, 2009 at 8:14 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
>> On Fri, Oct 16, 2009 at 5:59 PM, Ian Lance Taylor <iant@google.com> wrote:
>>> "H.J. Lu" <hjl.tools@gmail.com> writes:
>>>
>>>> @@ -947,6 +949,12 @@ vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
>>>>              }
>>>>            return false;
>>>>          }
>>>> +
>>>> +      mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
>>>> +      align = GET_MODE_ALIGNMENT (mode);
>>>> +      if (cfun->hard_stack_alignment < align)
>>>> +     cfun->hard_stack_alignment = align;
>>>> +
>>>>        if (supportable_dr_alignment != dr_aligned
>>>>            && vect_print_dump_info (REPORT_ALIGNMENT))
>>>>          fprintf (vect_dump, "Vectorizing an unaligned access.");
>>>
>>> I do not understand why this is the right place to set your new cfun
>>> field.  Your patch is about a misaligned stack.  The code you are
>>> changing here does not appear to have anything to do with a misaligned
>>> stack.  This code is about the correct alignment of data.  The data
>>> may be on the stack but it need not be.  If the data is on the stack,
>>> then the compiler should be able to determine the alignment available
>>> on the stack.
>>
>> The issue here is to set proper incoming stack alignment. We have
>> to do it before RTL expansion which uses this information to check
>> if stack alignment is needed. To do that, we need to know what the
>> minimum stack alignment required by hardware. We collect hard
>> stack alignment when we put variables on stack and check the alignment
>> generated by vectorizer.
>
> It's surely not the right function to do this.  Nor is it a proper
> interface IMHO.
> Why don't you need to check whether the access is to stack memory at all?
>
> I think you want to instead see if any user-alignment on automatic
> variables requires stack realignment.  Thus, wherever the vectorizer
> sets DECL_USER_ALIGN on automatic storage.
>

RTL expansion may generate automatic variables from vectorizer
statements while vectorizer doesn't generate any automatic variables
at all. How should I handle this case?


-- 
H.J.

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is   aligned
  2009-10-17 19:21             ` H.J. Lu
@ 2009-10-17 19:29               ` Richard Guenther
  2009-10-17 19:35                 ` H.J. Lu
  0 siblings, 1 reply; 59+ messages in thread
From: Richard Guenther @ 2009-10-17 19:29 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Ian Lance Taylor, Uros Bizjak, gcc-patches, Jakub Jelinek

On Sat, Oct 17, 2009 at 9:02 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Sat, Oct 17, 2009 at 11:51 AM, Richard Guenther
> <richard.guenther@gmail.com> wrote:
>> On Sat, Oct 17, 2009 at 8:14 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
>>> On Fri, Oct 16, 2009 at 5:59 PM, Ian Lance Taylor <iant@google.com> wrote:
>>>> "H.J. Lu" <hjl.tools@gmail.com> writes:
>>>>
>>>>> @@ -947,6 +949,12 @@ vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
>>>>>              }
>>>>>            return false;
>>>>>          }
>>>>> +
>>>>> +      mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
>>>>> +      align = GET_MODE_ALIGNMENT (mode);
>>>>> +      if (cfun->hard_stack_alignment < align)
>>>>> +     cfun->hard_stack_alignment = align;
>>>>> +
>>>>>        if (supportable_dr_alignment != dr_aligned
>>>>>            && vect_print_dump_info (REPORT_ALIGNMENT))
>>>>>          fprintf (vect_dump, "Vectorizing an unaligned access.");
>>>>
>>>> I do not understand why this is the right place to set your new cfun
>>>> field.  Your patch is about a misaligned stack.  The code you are
>>>> changing here does not appear to have anything to do with a misaligned
>>>> stack.  This code is about the correct alignment of data.  The data
>>>> may be on the stack but it need not be.  If the data is on the stack,
>>>> then the compiler should be able to determine the alignment available
>>>> on the stack.
>>>
>>> The issue here is to set proper incoming stack alignment. We have
>>> to do it before RTL expansion which uses this information to check
>>> if stack alignment is needed. To do that, we need to know what the
>>> minimum stack alignment required by hardware. We collect hard
>>> stack alignment when we put variables on stack and check the alignment
>>> generated by vectorizer.
>>
>> It's surely not the right function to do this.  Nor is it a proper
>> interface IMHO.
>> Why don't you need to check whether the access is to stack memory at all?
>>
>> I think you want to instead see if any user-alignment on automatic
>> variables requires stack realignment.  Thus, wherever the vectorizer
>> sets DECL_USER_ALIGN on automatic storage.
>>
>
> RTL expansion may generate automatic variables from vectorizer
> statements while vectorizer doesn't generate any automatic variables
> at all. How should I handle this case?

The same way you would handle spills or you would handle user code
using intrinsics.  I don't see how the vectorizer is special.

Richard.

>
> --
> H.J.
>

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is   aligned
  2009-10-17 19:29               ` Richard Guenther
@ 2009-10-17 19:35                 ` H.J. Lu
  2009-10-17 19:46                   ` Richard Guenther
  0 siblings, 1 reply; 59+ messages in thread
From: H.J. Lu @ 2009-10-17 19:35 UTC (permalink / raw)
  To: Richard Guenther
  Cc: Ian Lance Taylor, Uros Bizjak, gcc-patches, Jakub Jelinek

On Sat, Oct 17, 2009 at 12:20 PM, Richard Guenther
<richard.guenther@gmail.com> wrote:
> On Sat, Oct 17, 2009 at 9:02 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
>> On Sat, Oct 17, 2009 at 11:51 AM, Richard Guenther
>> <richard.guenther@gmail.com> wrote:
>>> On Sat, Oct 17, 2009 at 8:14 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
>>>> On Fri, Oct 16, 2009 at 5:59 PM, Ian Lance Taylor <iant@google.com> wrote:
>>>>> "H.J. Lu" <hjl.tools@gmail.com> writes:
>>>>>
>>>>>> @@ -947,6 +949,12 @@ vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
>>>>>>              }
>>>>>>            return false;
>>>>>>          }
>>>>>> +
>>>>>> +      mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
>>>>>> +      align = GET_MODE_ALIGNMENT (mode);
>>>>>> +      if (cfun->hard_stack_alignment < align)
>>>>>> +     cfun->hard_stack_alignment = align;
>>>>>> +
>>>>>>        if (supportable_dr_alignment != dr_aligned
>>>>>>            && vect_print_dump_info (REPORT_ALIGNMENT))
>>>>>>          fprintf (vect_dump, "Vectorizing an unaligned access.");
>>>>>
>>>>> I do not understand why this is the right place to set your new cfun
>>>>> field.  Your patch is about a misaligned stack.  The code you are
>>>>> changing here does not appear to have anything to do with a misaligned
>>>>> stack.  This code is about the correct alignment of data.  The data
>>>>> may be on the stack but it need not be.  If the data is on the stack,
>>>>> then the compiler should be able to determine the alignment available
>>>>> on the stack.
>>>>
>>>> The issue here is to set proper incoming stack alignment. We have
>>>> to do it before RTL expansion which uses this information to check
>>>> if stack alignment is needed. To do that, we need to know what the
>>>> minimum stack alignment required by hardware. We collect hard
>>>> stack alignment when we put variables on stack and check the alignment
>>>> generated by vectorizer.
>>>
>>> It's surely not the right function to do this.  Nor is it a proper
>>> interface IMHO.
>>> Why don't you need to check whether the access is to stack memory at all?
>>>
>>> I think you want to instead see if any user-alignment on automatic
>>> variables requires stack realignment.  Thus, wherever the vectorizer
>>> sets DECL_USER_ALIGN on automatic storage.
>>>
>>
>> RTL expansion may generate automatic variables from vectorizer
>> statements while vectorizer doesn't generate any automatic variables
>> at all. How should I handle this case?
>
> The same way you would handle spills or you would handle user code
> using intrinsics.  I don't see how the vectorizer is special.
>

For

---
void g();

int p[100];
int q[100];

void f()
{
        int i;
        for (i = 0; i < 100; i++) p[i] = 0;
        g();
        for (i = 0; i < 100; i++) q[i] = 0;
}
---

Vectorizor sets DECL_USER_ALIGN on p and g. But it may not generate
any automatic variables in function f. How do I know function f
needs stack alignment before RTL expansion?


-- 
H.J.

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is   aligned
  2009-10-17 19:35                 ` H.J. Lu
@ 2009-10-17 19:46                   ` Richard Guenther
  2009-10-17 20:01                     ` H.J. Lu
  0 siblings, 1 reply; 59+ messages in thread
From: Richard Guenther @ 2009-10-17 19:46 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Ian Lance Taylor, Uros Bizjak, gcc-patches, Jakub Jelinek

On Sat, Oct 17, 2009 at 9:29 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Sat, Oct 17, 2009 at 12:20 PM, Richard Guenther
> <richard.guenther@gmail.com> wrote:
>> On Sat, Oct 17, 2009 at 9:02 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
>>> On Sat, Oct 17, 2009 at 11:51 AM, Richard Guenther
>>> <richard.guenther@gmail.com> wrote:
>>>> On Sat, Oct 17, 2009 at 8:14 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
>>>>> On Fri, Oct 16, 2009 at 5:59 PM, Ian Lance Taylor <iant@google.com> wrote:
>>>>>> "H.J. Lu" <hjl.tools@gmail.com> writes:
>>>>>>
>>>>>>> @@ -947,6 +949,12 @@ vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
>>>>>>>              }
>>>>>>>            return false;
>>>>>>>          }
>>>>>>> +
>>>>>>> +      mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
>>>>>>> +      align = GET_MODE_ALIGNMENT (mode);
>>>>>>> +      if (cfun->hard_stack_alignment < align)
>>>>>>> +     cfun->hard_stack_alignment = align;
>>>>>>> +
>>>>>>>        if (supportable_dr_alignment != dr_aligned
>>>>>>>            && vect_print_dump_info (REPORT_ALIGNMENT))
>>>>>>>          fprintf (vect_dump, "Vectorizing an unaligned access.");
>>>>>>
>>>>>> I do not understand why this is the right place to set your new cfun
>>>>>> field.  Your patch is about a misaligned stack.  The code you are
>>>>>> changing here does not appear to have anything to do with a misaligned
>>>>>> stack.  This code is about the correct alignment of data.  The data
>>>>>> may be on the stack but it need not be.  If the data is on the stack,
>>>>>> then the compiler should be able to determine the alignment available
>>>>>> on the stack.
>>>>>
>>>>> The issue here is to set proper incoming stack alignment. We have
>>>>> to do it before RTL expansion which uses this information to check
>>>>> if stack alignment is needed. To do that, we need to know what the
>>>>> minimum stack alignment required by hardware. We collect hard
>>>>> stack alignment when we put variables on stack and check the alignment
>>>>> generated by vectorizer.
>>>>
>>>> It's surely not the right function to do this.  Nor is it a proper
>>>> interface IMHO.
>>>> Why don't you need to check whether the access is to stack memory at all?
>>>>
>>>> I think you want to instead see if any user-alignment on automatic
>>>> variables requires stack realignment.  Thus, wherever the vectorizer
>>>> sets DECL_USER_ALIGN on automatic storage.
>>>>
>>>
>>> RTL expansion may generate automatic variables from vectorizer
>>> statements while vectorizer doesn't generate any automatic variables
>>> at all. How should I handle this case?
>>
>> The same way you would handle spills or you would handle user code
>> using intrinsics.  I don't see how the vectorizer is special.
>>
>
> For
>
> ---
> void g();
>
> int p[100];
> int q[100];
>
> void f()
> {
>        int i;
>        for (i = 0; i < 100; i++) p[i] = 0;
>        g();
>        for (i = 0; i < 100; i++) q[i] = 0;
> }
> ---
>
> Vectorizor sets DECL_USER_ALIGN on p and g. But it may not generate
> any automatic variables in function f.

Right.  What matters is if the vectorizer sets that on automatic
variables of course.

> How do I know function f
> needs stack alignment before RTL expansion?

Well, you simply decide.  Then you have to cope with the consequences
of the decision - properly set the alignment of any stack temporaries (or
spill slots) you generate.

Richard.

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is   aligned
  2009-10-17 19:46                   ` Richard Guenther
@ 2009-10-17 20:01                     ` H.J. Lu
  2009-10-17 20:59                       ` Richard Guenther
  0 siblings, 1 reply; 59+ messages in thread
From: H.J. Lu @ 2009-10-17 20:01 UTC (permalink / raw)
  To: Richard Guenther
  Cc: Ian Lance Taylor, Uros Bizjak, gcc-patches, Jakub Jelinek

On Sat, Oct 17, 2009 at 12:35 PM, Richard Guenther
<richard.guenther@gmail.com> wrote:
> On Sat, Oct 17, 2009 at 9:29 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
>> On Sat, Oct 17, 2009 at 12:20 PM, Richard Guenther
>> <richard.guenther@gmail.com> wrote:
>>> On Sat, Oct 17, 2009 at 9:02 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
>>>> On Sat, Oct 17, 2009 at 11:51 AM, Richard Guenther
>>>> <richard.guenther@gmail.com> wrote:
>>>>> On Sat, Oct 17, 2009 at 8:14 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
>>>>>> On Fri, Oct 16, 2009 at 5:59 PM, Ian Lance Taylor <iant@google.com> wrote:
>>>>>>> "H.J. Lu" <hjl.tools@gmail.com> writes:
>>>>>>>
>>>>>>>> @@ -947,6 +949,12 @@ vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
>>>>>>>>              }
>>>>>>>>            return false;
>>>>>>>>          }
>>>>>>>> +
>>>>>>>> +      mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
>>>>>>>> +      align = GET_MODE_ALIGNMENT (mode);
>>>>>>>> +      if (cfun->hard_stack_alignment < align)
>>>>>>>> +     cfun->hard_stack_alignment = align;
>>>>>>>> +
>>>>>>>>        if (supportable_dr_alignment != dr_aligned
>>>>>>>>            && vect_print_dump_info (REPORT_ALIGNMENT))
>>>>>>>>          fprintf (vect_dump, "Vectorizing an unaligned access.");
>>>>>>>
>>>>>>> I do not understand why this is the right place to set your new cfun
>>>>>>> field.  Your patch is about a misaligned stack.  The code you are
>>>>>>> changing here does not appear to have anything to do with a misaligned
>>>>>>> stack.  This code is about the correct alignment of data.  The data
>>>>>>> may be on the stack but it need not be.  If the data is on the stack,
>>>>>>> then the compiler should be able to determine the alignment available
>>>>>>> on the stack.
>>>>>>
>>>>>> The issue here is to set proper incoming stack alignment. We have
>>>>>> to do it before RTL expansion which uses this information to check
>>>>>> if stack alignment is needed. To do that, we need to know what the
>>>>>> minimum stack alignment required by hardware. We collect hard
>>>>>> stack alignment when we put variables on stack and check the alignment
>>>>>> generated by vectorizer.
>>>>>
>>>>> It's surely not the right function to do this.  Nor is it a proper
>>>>> interface IMHO.
>>>>> Why don't you need to check whether the access is to stack memory at all?
>>>>>
>>>>> I think you want to instead see if any user-alignment on automatic
>>>>> variables requires stack realignment.  Thus, wherever the vectorizer
>>>>> sets DECL_USER_ALIGN on automatic storage.
>>>>>
>>>>
>>>> RTL expansion may generate automatic variables from vectorizer
>>>> statements while vectorizer doesn't generate any automatic variables
>>>> at all. How should I handle this case?
>>>
>>> The same way you would handle spills or you would handle user code
>>> using intrinsics.  I don't see how the vectorizer is special.
>>>
>>
>> For
>>
>> ---
>> void g();
>>
>> int p[100];
>> int q[100];
>>
>> void f()
>> {
>>        int i;
>>        for (i = 0; i < 100; i++) p[i] = 0;
>>        g();
>>        for (i = 0; i < 100; i++) q[i] = 0;
>> }
>> ---
>>
>> Vectorizor sets DECL_USER_ALIGN on p and g. But it may not generate
>> any automatic variables in function f.
>
> Right.  What matters is if the vectorizer sets that on automatic
> variables of course.

The problem I am trying to deal with is vectorizer does not
generate any automatic variables. Instead, it generates vector
statements on static array directly. RTL expansion will generate
automatic variables from those vector statements, which is too late.
I can't rely on automatic variables for vectorizer when I need
the information before RTL expansion.

-- 
H.J.

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is   aligned
  2009-10-17 20:01                     ` H.J. Lu
@ 2009-10-17 20:59                       ` Richard Guenther
  2009-10-18 19:21                         ` Michael Matz
  0 siblings, 1 reply; 59+ messages in thread
From: Richard Guenther @ 2009-10-17 20:59 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Ian Lance Taylor, Uros Bizjak, gcc-patches, Jakub Jelinek

On Sat, Oct 17, 2009 at 9:46 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Sat, Oct 17, 2009 at 12:35 PM, Richard Guenther
> <richard.guenther@gmail.com> wrote:
>> On Sat, Oct 17, 2009 at 9:29 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
>>> On Sat, Oct 17, 2009 at 12:20 PM, Richard Guenther
>>> <richard.guenther@gmail.com> wrote:
>>>> On Sat, Oct 17, 2009 at 9:02 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
>>>>> On Sat, Oct 17, 2009 at 11:51 AM, Richard Guenther
>>>>> <richard.guenther@gmail.com> wrote:
>>>>>> On Sat, Oct 17, 2009 at 8:14 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
>>>>>>> On Fri, Oct 16, 2009 at 5:59 PM, Ian Lance Taylor <iant@google.com> wrote:
>>>>>>>> "H.J. Lu" <hjl.tools@gmail.com> writes:
>>>>>>>>
>>>>>>>>> @@ -947,6 +949,12 @@ vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
>>>>>>>>>              }
>>>>>>>>>            return false;
>>>>>>>>>          }
>>>>>>>>> +
>>>>>>>>> +      mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
>>>>>>>>> +      align = GET_MODE_ALIGNMENT (mode);
>>>>>>>>> +      if (cfun->hard_stack_alignment < align)
>>>>>>>>> +     cfun->hard_stack_alignment = align;
>>>>>>>>> +
>>>>>>>>>        if (supportable_dr_alignment != dr_aligned
>>>>>>>>>            && vect_print_dump_info (REPORT_ALIGNMENT))
>>>>>>>>>          fprintf (vect_dump, "Vectorizing an unaligned access.");
>>>>>>>>
>>>>>>>> I do not understand why this is the right place to set your new cfun
>>>>>>>> field.  Your patch is about a misaligned stack.  The code you are
>>>>>>>> changing here does not appear to have anything to do with a misaligned
>>>>>>>> stack.  This code is about the correct alignment of data.  The data
>>>>>>>> may be on the stack but it need not be.  If the data is on the stack,
>>>>>>>> then the compiler should be able to determine the alignment available
>>>>>>>> on the stack.
>>>>>>>
>>>>>>> The issue here is to set proper incoming stack alignment. We have
>>>>>>> to do it before RTL expansion which uses this information to check
>>>>>>> if stack alignment is needed. To do that, we need to know what the
>>>>>>> minimum stack alignment required by hardware. We collect hard
>>>>>>> stack alignment when we put variables on stack and check the alignment
>>>>>>> generated by vectorizer.
>>>>>>
>>>>>> It's surely not the right function to do this.  Nor is it a proper
>>>>>> interface IMHO.
>>>>>> Why don't you need to check whether the access is to stack memory at all?
>>>>>>
>>>>>> I think you want to instead see if any user-alignment on automatic
>>>>>> variables requires stack realignment.  Thus, wherever the vectorizer
>>>>>> sets DECL_USER_ALIGN on automatic storage.
>>>>>>
>>>>>
>>>>> RTL expansion may generate automatic variables from vectorizer
>>>>> statements while vectorizer doesn't generate any automatic variables
>>>>> at all. How should I handle this case?
>>>>
>>>> The same way you would handle spills or you would handle user code
>>>> using intrinsics.  I don't see how the vectorizer is special.
>>>>
>>>
>>> For
>>>
>>> ---
>>> void g();
>>>
>>> int p[100];
>>> int q[100];
>>>
>>> void f()
>>> {
>>>        int i;
>>>        for (i = 0; i < 100; i++) p[i] = 0;
>>>        g();
>>>        for (i = 0; i < 100; i++) q[i] = 0;
>>> }
>>> ---
>>>
>>> Vectorizor sets DECL_USER_ALIGN on p and g. But it may not generate
>>> any automatic variables in function f.
>>
>> Right.  What matters is if the vectorizer sets that on automatic
>> variables of course.
>
> The problem I am trying to deal with is vectorizer does not
> generate any automatic variables. Instead, it generates vector
> statements on static array directly. RTL expansion will generate
> automatic variables from those vector statements, which is too late.
> I can't rely on automatic variables for vectorizer when I need
> the information before RTL expansion.

I don't see how this is too late (I also don't see where expand creates
automatic variables, but well ... I can imagine reload creating spill
slots).  If non-aligned automatic variables are generated you simply
use unaligned moves - what's the problem?

Richard.

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is    aligned
  2009-10-17 20:59                       ` Richard Guenther
@ 2009-10-18 19:21                         ` Michael Matz
  2009-10-18 19:45                           ` Richard Guenther
  2009-10-19 16:38                           ` H.J. Lu
  0 siblings, 2 replies; 59+ messages in thread
From: Michael Matz @ 2009-10-18 19:21 UTC (permalink / raw)
  To: Richard Guenther
  Cc: H.J. Lu, Ian Lance Taylor, Uros Bizjak, gcc-patches, Jakub Jelinek

Hi,

On Sat, 17 Oct 2009, Richard Guenther wrote:

> > I can't rely on automatic variables for vectorizer when I need
> > the information before RTL expansion.
> 
> I don't see how this is too late (I also don't see where expand creates 
> automatic variables,

assign_temp/assign_stack_temp, all over.

> but well ... I can imagine reload creating spill slots).  If non-aligned 
> automatic variables are generated you simply use unaligned moves - 
> what's the problem?

It's obvious: we don't want to generate unaligned moves.  That's the whole 
point of H.J. patches.  He has a phase ordering problem:
(a) alignment of generated temporaries depends on known stack alignment
(b) known stack alignment depends on what is put on stack (including late 
    generated temporaries)

He tries to solve this by pessimistically assuming that potentially 
everything imaginable could go on stack.  What I don't understand is why 
we don't instead track hard_stack_alignment in assign_*_temp (where we 
then assume that the stack will be aligned perfectly), and expand stack 
realignment code _after_ having expanded everything else (plus examined 
local variables for the possibility of generating spill slots).


Ciao,
Michael.

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is   aligned
  2009-10-18 19:21                         ` Michael Matz
@ 2009-10-18 19:45                           ` Richard Guenther
  2009-10-19 16:36                             ` H.J. Lu
  2009-10-19 16:38                           ` H.J. Lu
  1 sibling, 1 reply; 59+ messages in thread
From: Richard Guenther @ 2009-10-18 19:45 UTC (permalink / raw)
  To: Michael Matz
  Cc: H.J. Lu, Ian Lance Taylor, Uros Bizjak, gcc-patches, Jakub Jelinek

On Sun, Oct 18, 2009 at 9:19 PM, Michael Matz <matz@suse.de> wrote:
> Hi,
>
> On Sat, 17 Oct 2009, Richard Guenther wrote:
>
>> > I can't rely on automatic variables for vectorizer when I need
>> > the information before RTL expansion.
>>
>> I don't see how this is too late (I also don't see where expand creates
>> automatic variables,
>
> assign_temp/assign_stack_temp, all over.
>
>> but well ... I can imagine reload creating spill slots).  If non-aligned
>> automatic variables are generated you simply use unaligned moves -
>> what's the problem?
>
> It's obvious: we don't want to generate unaligned moves.  That's the whole
> point of H.J. patches.  He has a phase ordering problem:
> (a) alignment of generated temporaries depends on known stack alignment
> (b) known stack alignment depends on what is put on stack (including late
>    generated temporaries)
>
> He tries to solve this by pessimistically assuming that potentially
> everything imaginable could go on stack.  What I don't understand is why
> we don't instead track hard_stack_alignment in assign_*_temp (where we
> then assume that the stack will be aligned perfectly), and expand stack
> realignment code _after_ having expanded everything else (plus examined
> local variables for the possibility of generating spill slots).

I also don't understand how this problem can be solved in the vectorizer
and how this problem cannot occur the same way when using
intrinsics.

Richard.

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is   aligned
  2009-10-18 19:45                           ` Richard Guenther
@ 2009-10-19 16:36                             ` H.J. Lu
  2009-10-20  1:12                               ` Michael Matz
  0 siblings, 1 reply; 59+ messages in thread
From: H.J. Lu @ 2009-10-19 16:36 UTC (permalink / raw)
  To: Richard Guenther
  Cc: Michael Matz, Ian Lance Taylor, Uros Bizjak, gcc-patches, Jakub Jelinek

On Sun, Oct 18, 2009 at 12:44 PM, Richard Guenther
<richard.guenther@gmail.com> wrote:
> On Sun, Oct 18, 2009 at 9:19 PM, Michael Matz <matz@suse.de> wrote:
>> Hi,
>>
>> On Sat, 17 Oct 2009, Richard Guenther wrote:
>>
>>> > I can't rely on automatic variables for vectorizer when I need
>>> > the information before RTL expansion.
>>>
>>> I don't see how this is too late (I also don't see where expand creates
>>> automatic variables,
>>
>> assign_temp/assign_stack_temp, all over.
>>
>>> but well ... I can imagine reload creating spill slots).  If non-aligned
>>> automatic variables are generated you simply use unaligned moves -
>>> what's the problem?
>>
>> It's obvious: we don't want to generate unaligned moves.  That's the whole
>> point of H.J. patches.  He has a phase ordering problem:
>> (a) alignment of generated temporaries depends on known stack alignment
>> (b) known stack alignment depends on what is put on stack (including late
>>    generated temporaries)
>>
>> He tries to solve this by pessimistically assuming that potentially
>> everything imaginable could go on stack.  What I don't understand is why
>> we don't instead track hard_stack_alignment in assign_*_temp (where we
>> then assume that the stack will be aligned perfectly), and expand stack
>> realignment code _after_ having expanded everything else (plus examined
>> local variables for the possibility of generating spill slots).
>
> I also don't understand how this problem can be solved in the vectorizer
> and how this problem cannot occur the same way when using
> intrinsics.
>

The issues are

1. The incoming stack alignment can't be changed after RTL expansion
starts.
2. When -mstackrealign is used, we want to use 4 byte incoming stack
alignment on functions which use SSE vector insns.
3. SSE vector insns may be generated by intrinsics, vectorizer and
vector operations.

Both intrinsics and and vector operations will always use automatic
variables, which are handled by ix86_minimum_alignment.  Since
vectorizer may not always use automatic variables, I modified
vectorizer to update hard_stack_alignment when vector statements
are generated.

Thanks.


-- 
H.J.

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is   aligned
  2009-10-18 19:21                         ` Michael Matz
  2009-10-18 19:45                           ` Richard Guenther
@ 2009-10-19 16:38                           ` H.J. Lu
  2009-10-19 17:08                             ` Ian Lance Taylor
  2009-10-20  1:53                             ` Michael Matz
  1 sibling, 2 replies; 59+ messages in thread
From: H.J. Lu @ 2009-10-19 16:38 UTC (permalink / raw)
  To: Michael Matz
  Cc: Richard Guenther, Ian Lance Taylor, Uros Bizjak, gcc-patches,
	Jakub Jelinek

On Sun, Oct 18, 2009 at 12:19 PM, Michael Matz <matz@suse.de> wrote:
> Hi,
>
> On Sat, 17 Oct 2009, Richard Guenther wrote:
>
>> > I can't rely on automatic variables for vectorizer when I need
>> > the information before RTL expansion.
>>
>> I don't see how this is too late (I also don't see where expand creates
>> automatic variables,
>
> assign_temp/assign_stack_temp, all over.
>
>> but well ... I can imagine reload creating spill slots).  If non-aligned
>> automatic variables are generated you simply use unaligned moves -
>> what's the problem?
>
> It's obvious: we don't want to generate unaligned moves.  That's the whole
> point of H.J. patches.  He has a phase ordering problem:
> (a) alignment of generated temporaries depends on known stack alignment
> (b) known stack alignment depends on what is put on stack (including late
>    generated temporaries)
>
> He tries to solve this by pessimistically assuming that potentially
> everything imaginable could go on stack.  What I don't understand is why
> we don't instead track hard_stack_alignment in assign_*_temp (where we
> then assume that the stack will be aligned perfectly), and expand stack
> realignment code _after_ having expanded everything else (plus examined
> local variables for the possibility of generating spill slots).
>

Vectorizer may not call assign_*_temp at all. Instead, x86 backend
may call gen_reg_rtx to generate pseudo registers when expanding
vector statement.

-- 
H.J.

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is  aligned
  2009-10-19 16:38                           ` H.J. Lu
@ 2009-10-19 17:08                             ` Ian Lance Taylor
  2009-10-19 17:26                               ` H.J. Lu
  2009-10-20  1:53                             ` Michael Matz
  1 sibling, 1 reply; 59+ messages in thread
From: Ian Lance Taylor @ 2009-10-19 17:08 UTC (permalink / raw)
  To: H.J. Lu
  Cc: Michael Matz, Richard Guenther, Uros Bizjak, gcc-patches, Jakub Jelinek

"H.J. Lu" <hjl.tools@gmail.com> writes:

> Vectorizer may not call assign_*_temp at all. Instead, x86 backend
> may call gen_reg_rtx to generate pseudo registers when expanding
> vector statement.

It simply does not make sense to change the vectorizer, of all things,
to set the stack alignment.  Stack alignment depends upon the
types/modes of automatic variables stored on the stack.  Therefore,
you should set the required stack alignment based on the creation of
automatic variables.  Ideally you would set the required stack
alignment for automatic variables stored on the stack, but apparently
you can't change the stack alignment after expand (though I don't see
why not).  So if you can't set the stack alignment based on automatic
variables stored on the stack, then you should set it based on the
creation of automatic variables.

Ian

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is   aligned
  2009-10-19 17:08                             ` Ian Lance Taylor
@ 2009-10-19 17:26                               ` H.J. Lu
  2009-10-19 17:33                                 ` Ian Lance Taylor
  0 siblings, 1 reply; 59+ messages in thread
From: H.J. Lu @ 2009-10-19 17:26 UTC (permalink / raw)
  To: Ian Lance Taylor
  Cc: Michael Matz, Richard Guenther, Uros Bizjak, gcc-patches, Jakub Jelinek

On Mon, Oct 19, 2009 at 10:05 AM, Ian Lance Taylor <iant@google.com> wrote:
> "H.J. Lu" <hjl.tools@gmail.com> writes:
>
>> Vectorizer may not call assign_*_temp at all. Instead, x86 backend
>> may call gen_reg_rtx to generate pseudo registers when expanding
>> vector statement.
>
> It simply does not make sense to change the vectorizer, of all things,
> to set the stack alignment.  Stack alignment depends upon the
> types/modes of automatic variables stored on the stack.  Therefore,
> you should set the required stack alignment based on the creation of
> automatic variables.  Ideally you would set the required stack
> alignment for automatic variables stored on the stack, but apparently
> you can't change the stack alignment after expand (though I don't see
> why not).  So if you can't set the stack alignment based on automatic
> variables stored on the stack, then you should set it based on the
> creation of automatic variables.
>

It is about setting the incoming stack alignment, which has to be
done before RTL expansion. Vectorizer may not use any automatic
variables. But the RTL expander may generate pseudo vector registers
based on vector statement, at which time, it is too late to go back to
change incoming stack alignment.  I only modified vectorizer to
record what it does. I didn't change any statements generated by
vectorizer. The x86 backend uses this information to set the
incoming stack alignment before RTL expansion.


-- 
H.J.

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is  aligned
  2009-10-19 17:26                               ` H.J. Lu
@ 2009-10-19 17:33                                 ` Ian Lance Taylor
  2009-10-19 17:46                                   ` H.J. Lu
  0 siblings, 1 reply; 59+ messages in thread
From: Ian Lance Taylor @ 2009-10-19 17:33 UTC (permalink / raw)
  To: H.J. Lu
  Cc: Michael Matz, Richard Guenther, Uros Bizjak, gcc-patches, Jakub Jelinek

"H.J. Lu" <hjl.tools@gmail.com> writes:

> On Mon, Oct 19, 2009 at 10:05 AM, Ian Lance Taylor <iant@google.com> wrote:
>> "H.J. Lu" <hjl.tools@gmail.com> writes:
>>
>>> Vectorizer may not call assign_*_temp at all. Instead, x86 backend
>>> may call gen_reg_rtx to generate pseudo registers when expanding
>>> vector statement.
>>
>> It simply does not make sense to change the vectorizer, of all things,
>> to set the stack alignment.  Stack alignment depends upon the
>> types/modes of automatic variables stored on the stack.  Therefore,
>> you should set the required stack alignment based on the creation of
>> automatic variables.  Ideally you would set the required stack
>> alignment for automatic variables stored on the stack, but apparently
>> you can't change the stack alignment after expand (though I don't see
>> why not).  So if you can't set the stack alignment based on automatic
>> variables stored on the stack, then you should set it based on the
>> creation of automatic variables.
>>
>
> It is about setting the incoming stack alignment, which has to be
> done before RTL expansion. Vectorizer may not use any automatic
> variables. But the RTL expander may generate pseudo vector registers
> based on vector statement, at which time, it is too late to go back to
> change incoming stack alignment.  I only modified vectorizer to
> record what it does. I didn't change any statements generated by
> vectorizer. The x86 backend uses this information to set the
> incoming stack alignment before RTL expansion.

If somebody asked you "where does gcc set the required stack
alignment?" would you expect the answer to be "in the vectorizer
code?"

Is there any way we can fix incoming stack alignment so that it can be
controlled by automatic stack variables?  I don't understand why this
is not done by the prologue and epilogue code.

Ian

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is   aligned
  2009-10-19 17:33                                 ` Ian Lance Taylor
@ 2009-10-19 17:46                                   ` H.J. Lu
  2009-10-19 17:55                                     ` Ian Lance Taylor
  0 siblings, 1 reply; 59+ messages in thread
From: H.J. Lu @ 2009-10-19 17:46 UTC (permalink / raw)
  To: Ian Lance Taylor
  Cc: Michael Matz, Richard Guenther, Uros Bizjak, gcc-patches, Jakub Jelinek

On Mon, Oct 19, 2009 at 10:30 AM, Ian Lance Taylor <iant@google.com> wrote:
> "H.J. Lu" <hjl.tools@gmail.com> writes:
>
>> On Mon, Oct 19, 2009 at 10:05 AM, Ian Lance Taylor <iant@google.com> wrote:
>>> "H.J. Lu" <hjl.tools@gmail.com> writes:
>>>
>>>> Vectorizer may not call assign_*_temp at all. Instead, x86 backend
>>>> may call gen_reg_rtx to generate pseudo registers when expanding
>>>> vector statement.
>>>
>>> It simply does not make sense to change the vectorizer, of all things,
>>> to set the stack alignment.  Stack alignment depends upon the
>>> types/modes of automatic variables stored on the stack.  Therefore,
>>> you should set the required stack alignment based on the creation of
>>> automatic variables.  Ideally you would set the required stack
>>> alignment for automatic variables stored on the stack, but apparently
>>> you can't change the stack alignment after expand (though I don't see
>>> why not).  So if you can't set the stack alignment based on automatic
>>> variables stored on the stack, then you should set it based on the
>>> creation of automatic variables.
>>>
>>
>> It is about setting the incoming stack alignment, which has to be
>> done before RTL expansion. Vectorizer may not use any automatic
>> variables. But the RTL expander may generate pseudo vector registers
>> based on vector statement, at which time, it is too late to go back to
>> change incoming stack alignment.  I only modified vectorizer to
>> record what it does. I didn't change any statements generated by
>> vectorizer. The x86 backend uses this information to set the
>> incoming stack alignment before RTL expansion.
>
> If somebody asked you "where does gcc set the required stack
> alignment?" would you expect the answer to be "in the vectorizer
> code?"

Yes, vectorizer may put requirement on stack alignment. But it is up
to backend to decide and it isn't the only place gcc may require
certain stack alignment.

> Is there any way we can fix incoming stack alignment so that it can be
> controlled by automatic stack variables?  I don't understand why this

The incoming stack alignment can be controlled by automatic stack
variables. But it can't be controlled by pseudo registers.

> is not done by the prologue and epilogue code.
>

Because the prologue and epilogue code is generated after
RTL expansion starts?


-- 
H.J.

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is  aligned
  2009-10-19 17:46                                   ` H.J. Lu
@ 2009-10-19 17:55                                     ` Ian Lance Taylor
  2009-10-19 19:16                                       ` H.J. Lu
  0 siblings, 1 reply; 59+ messages in thread
From: Ian Lance Taylor @ 2009-10-19 17:55 UTC (permalink / raw)
  To: H.J. Lu
  Cc: Michael Matz, Richard Guenther, Uros Bizjak, gcc-patches, Jakub Jelinek

"H.J. Lu" <hjl.tools@gmail.com> writes:

>> If somebody asked you "where does gcc set the required stack
>> alignment?" would you expect the answer to be "in the vectorizer
>> code?"
>
> Yes, vectorizer may put requirement on stack alignment. But it is up
> to backend to decide and it isn't the only place gcc may require
> certain stack alignment.

It's the only place your patch changes.


>> Is there any way we can fix incoming stack alignment so that it can be
>> controlled by automatic stack variables?  I don't understand why this
>
> The incoming stack alignment can be controlled by automatic stack
> variables. But it can't be controlled by pseudo registers.
>
>> is not done by the prologue and epilogue code.
>>
>
> Because the prologue and epilogue code is generated after
> RTL expansion starts?

That just restates my question without answering it.

Ian

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is   aligned
  2009-10-19 17:55                                     ` Ian Lance Taylor
@ 2009-10-19 19:16                                       ` H.J. Lu
  2009-10-19 21:15                                         ` Ian Lance Taylor
  2009-10-20  1:23                                         ` Michael Matz
  0 siblings, 2 replies; 59+ messages in thread
From: H.J. Lu @ 2009-10-19 19:16 UTC (permalink / raw)
  To: Ian Lance Taylor
  Cc: Michael Matz, Richard Guenther, Uros Bizjak, gcc-patches, Jakub Jelinek

On Mon, Oct 19, 2009 at 10:47 AM, Ian Lance Taylor <iant@google.com> wrote:
> "H.J. Lu" <hjl.tools@gmail.com> writes:
>
>>> If somebody asked you "where does gcc set the required stack
>>> alignment?" would you expect the answer to be "in the vectorizer
>>> code?"
>>
>> Yes, vectorizer may put requirement on stack alignment. But it is up
>> to backend to decide and it isn't the only place gcc may require
>> certain stack alignment.
>
> It's the only place your patch changes.

I also changed ix86_minimum_alignment to handle
automatic stack variables.

>
>>> Is there any way we can fix incoming stack alignment so that it can be
>>> controlled by automatic stack variables?  I don't understand why this
>>
>> The incoming stack alignment can be controlled by automatic stack
>> variables. But it can't be controlled by pseudo registers.
>>
>>> is not done by the prologue and epilogue code.
>>>
>>
>> Because the prologue and epilogue code is generated after
>> RTL expansion starts?
>
> That just restates my question without answering it.
>

We don't "fix" the incoming stack alignment. We just need to
know what value it will be. Normally the incoming stack alignment
is a constant specified by psABI. I don't think we should
change the way we do RTL expansion just to work around a
quirk in gcc implementation of i386 psABI.


-- 
H.J.

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is  aligned
  2009-10-19 19:16                                       ` H.J. Lu
@ 2009-10-19 21:15                                         ` Ian Lance Taylor
  2009-10-20 19:00                                           ` H.J. Lu
  2009-10-20  1:23                                         ` Michael Matz
  1 sibling, 1 reply; 59+ messages in thread
From: Ian Lance Taylor @ 2009-10-19 21:15 UTC (permalink / raw)
  To: H.J. Lu
  Cc: Michael Matz, Richard Guenther, Uros Bizjak, gcc-patches, Jakub Jelinek

"H.J. Lu" <hjl.tools@gmail.com> writes:

> On Mon, Oct 19, 2009 at 10:47 AM, Ian Lance Taylor <iant@google.com> wrote:
>> "H.J. Lu" <hjl.tools@gmail.com> writes:
>>
>>>> If somebody asked you "where does gcc set the required stack
>>>> alignment?" would you expect the answer to be "in the vectorizer
>>>> code?"
>>>
>>> Yes, vectorizer may put requirement on stack alignment. But it is up
>>> to backend to decide and it isn't the only place gcc may require
>>> certain stack alignment.
>>
>> It's the only place your patch changes.
>
> I also changed ix86_minimum_alignment to handle
> automatic stack variables.

OK, the only middle-end place.


>>>> Is there any way we can fix incoming stack alignment so that it can be
>>>> controlled by automatic stack variables?  I don't understand why this
>>>
>>> The incoming stack alignment can be controlled by automatic stack
>>> variables. But it can't be controlled by pseudo registers.
>>>
>>>> is not done by the prologue and epilogue code.
>>>>
>>>
>>> Because the prologue and epilogue code is generated after
>>> RTL expansion starts?
>>
>> That just restates my question without answering it.
>>
>
> We don't "fix" the incoming stack alignment. We just need to
> know what value it will be. Normally the incoming stack alignment
> is a constant specified by psABI. I don't think we should
> change the way we do RTL expansion just to work around a
> quirk in gcc implementation of i386 psABI.

I'm not sure I understand what you mean here.  The incoming stack
alignment is still a constant, isn't it?  Does using a vector somehow
change the incoming stack alignment?

Ian

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is   aligned
  2009-10-19 16:36                             ` H.J. Lu
@ 2009-10-20  1:12                               ` Michael Matz
  2009-10-20 19:10                                 ` H.J. Lu
  0 siblings, 1 reply; 59+ messages in thread
From: Michael Matz @ 2009-10-20  1:12 UTC (permalink / raw)
  To: H.J. Lu
  Cc: Richard Guenther, Ian Lance Taylor, Uros Bizjak, gcc-patches,
	Jakub Jelinek

Hi,

On Mon, 19 Oct 2009, H.J. Lu wrote:

> The issues are
> 
> 1. The incoming stack alignment can't be changed after RTL expansion
> starts.

And that's the thing.  Just move the prologue expansion to after the body 
expansion (see cfgexpand.c) and you should be set.  No doubt you'll hit 
some ugly obstacles, but that should be the way forward.

> 2. When -mstackrealign is used, we want to use 4 byte incoming stack
> alignment on functions which use SSE vector insns.
> 3. SSE vector insns may be generated by intrinsics, vectorizer and
> vector operations.

All of these will be automatically taken care off when (1) is solved.  I 
really think it would be worth it.  Fiddling with stack alignment in the 
vectorizer (though that's the first alignment requirers you'll hit) is not 
going to fly in the long run.


Ciao,
Michael.

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is   aligned
  2009-10-19 19:16                                       ` H.J. Lu
  2009-10-19 21:15                                         ` Ian Lance Taylor
@ 2009-10-20  1:23                                         ` Michael Matz
  2009-10-20 19:12                                           ` H.J. Lu
  1 sibling, 1 reply; 59+ messages in thread
From: Michael Matz @ 2009-10-20  1:23 UTC (permalink / raw)
  To: H.J. Lu
  Cc: Ian Lance Taylor, Richard Guenther, Uros Bizjak, gcc-patches,
	Jakub Jelinek

[-- Attachment #1: Type: TEXT/PLAIN, Size: 2839 bytes --]

Hi,

On Mon, 19 Oct 2009, H.J. Lu wrote:

> On Mon, Oct 19, 2009 at 10:47 AM, Ian Lance Taylor <iant@google.com> wrote:
> > "H.J. Lu" <hjl.tools@gmail.com> writes:
> >
> >>> If somebody asked you "where does gcc set the required stack
> >>> alignment?" would you expect the answer to be "in the vectorizer
> >>> code?"

For sake of completeness: obviously not.  Any implementation where this 
answer would be true is broken (on one or the other level).

> >>> Is there any way we can fix incoming stack alignment so that it can 
> >>> be controlled by automatic stack variables?  I don't understand why 
> >>> this
> >>
> >> The incoming stack alignment can be controlled by automatic stack 
> >> variables. But it can't be controlled by pseudo registers.
> >>
> >>> is not done by the prologue and epilogue code.
> >>>
> >>
> >> Because the prologue and epilogue code is generated after RTL 
> >> expansion starts?
> >
> > That just restates my question without answering it.
> >
> 
> We don't "fix" the incoming stack alignment. We just need to know what 
> value it will be. Normally the incoming stack alignment is a constant 
> specified by psABI. I don't think we should change the way we do RTL 
> expansion just to work around a quirk in gcc implementation of i386 
> psABI.

Actually I think that we should change whatever we need to change for the 
change (ahem) to make sense.  You said:

> >> The incoming stack alignment can be controlled by automatic stack
> >> variables. But it can't be controlled by pseudo registers.

That doesn't make sense verbatim.  Why should the _incoming_ alignment (I 
understand this as the guarantee that the caller makes, "incoming" from 
the perspective of the called function) be influenced by anything the 
given function does or doesn't do to the stack.  I guess there's some 
confusion with the terms, so please clarify.

You also said:

[Ian]
> > > I don't understand why this
> > > is not done by the prologue and epilogue code.

[H.J.]
> > Because the prologue and epilogue code is generated after RTL
> > expansion starts?

Somewhere we're going downhill with understanding.  I assume that stack 
alignment work (if necessary) is done in the prologue.  I further assume 
that required stack alignment depends on emitted calls and stack slots in 
the body.

So if (as you say right above) prologue is emitted after the expansion of 
all real instructions (which then includes arbitrary temporaries, but not 
spill slots), it should be very well able to emit whatever insns necessary 
to guarantee the alignment required depending on the guaranteed incoming 
alignment.

(Ignore for a moment that currently the function start is not emitted 
after everything else, but before.  For discussion purpose assume that the 
function start is emitted after everything else.)


Ciao,
Michael.

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is   aligned
  2009-10-19 16:38                           ` H.J. Lu
  2009-10-19 17:08                             ` Ian Lance Taylor
@ 2009-10-20  1:53                             ` Michael Matz
  2009-10-20 21:15                               ` H.J. Lu
  1 sibling, 1 reply; 59+ messages in thread
From: Michael Matz @ 2009-10-20  1:53 UTC (permalink / raw)
  To: H.J. Lu
  Cc: Richard Guenther, Ian Lance Taylor, Uros Bizjak, gcc-patches,
	Jakub Jelinek

[-- Attachment #1: Type: TEXT/PLAIN, Size: 1745 bytes --]

Hi,

On Mon, 19 Oct 2009, H.J. Lu wrote:

> > He tries to solve this by pessimistically assuming that potentially 
> > everything imaginable could go on stack.  What I don't understand is 
> > why we don't instead track hard_stack_alignment in assign_*_temp 
> > (where we then assume that the stack will be aligned perfectly), and 
> > expand stack realignment code _after_ having expanded everything else 
> > (plus examined local variables for the possibility of generating spill 
> > slots).
> 
> Vectorizer may not call assign_*_temp at all. Instead, x86 backend may 
> call gen_reg_rtx to generate pseudo registers when expanding vector 
> statement.

If that (and not assign_*_temp) is the problem, then it's obvious that 
fiddling with the vectorizer doesn't solve it.  The expanders can create 
new pseudos for whatever they see fit.  For expanding vector statements, 
for expanding block moves, for expanding string compares, for expanding 
additions, for anything.  Mucking around with the vectorizer won't solve 
the problem.

On the outset it's very easy: if anything (call it X) goes to the stack it 
requires some alignment for optimality.  That alignment can only be 
generated in the prologue.  It follows trivially that such prologue can 
only be generated after every X is done and emitted.  You seem to want to 
work around this by guessing what might possibly be required from stack 
alignment.  That's sort of fine, but if you go down that path you can't 
just change the vectorizer, you have to look at all possible sources of 
stack temporaries, and that simply includes the pseudos.

Given all this: remind me why we don't simply adjust the prologue after 
reload, when all stack slots are allocated?


Ciao,
Michael.

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is   aligned
  2009-10-19 21:15                                         ` Ian Lance Taylor
@ 2009-10-20 19:00                                           ` H.J. Lu
  0 siblings, 0 replies; 59+ messages in thread
From: H.J. Lu @ 2009-10-20 19:00 UTC (permalink / raw)
  To: Ian Lance Taylor
  Cc: Michael Matz, Richard Guenther, Uros Bizjak, gcc-patches, Jakub Jelinek

On Mon, Oct 19, 2009 at 2:09 PM, Ian Lance Taylor <iant@google.com> wrote:
>>
>> We don't "fix" the incoming stack alignment. We just need to
>> know what value it will be. Normally the incoming stack alignment
>> is a constant specified by psABI. I don't think we should
>> change the way we do RTL expansion just to work around a
>> quirk in gcc implementation of i386 psABI.
>
> I'm not sure I understand what you mean here.  The incoming stack
> alignment is still a constant, isn't it?  Does using a vector somehow
> change the incoming stack alignment?
>

Using a vector doesn't change the incoming stack alignment,
which is set by caller. But it may change the minimum value
of the coming stack alignment, which leads to different code
generation if stack alignment is required by hardware. The
incoming stack alignment should be a constant when RTL
expansion starts, especially before calling expand_call. In
cfgexpand.c, there are comments:

      /* Call update_stack_boundary here to update incoming stack
         boundary before TARGET_FUNCTION_OK_FOR_SIBCALL is called.
         TARGET_FUNCTION_OK_FOR_SIBCALL needs to know the accurate
         incoming stack alignment to check if it is OK to perform
         sibcall optimization since sibcall optimization will only
         align the outgoing stack to incoming stack boundary.  */
      if (targetm.calls.update_stack_boundary)
        targetm.calls.update_stack_boundary ();

In i386, there are

  /* If we need to align the outgoing stack, then sibcalling would
     unalign the stack, which may break the called function.  */
  if (ix86_incoming_stack_boundary < PREFERRED_STACK_BOUNDARY)
    return false;

Before calling expand_stack_alignment in gimple_expand_cfg,
we can generate any alignment on stack. expand_stack_alignment
has

 crtl->stack_realign_needed
    = INCOMING_STACK_BOUNDARY < crtl->stack_alignment_estimated;

At that point, we make the decision if we need to align the stack.
We can always not align the stack at some later time. But we can't
align the stack if stack_realign_needed is false since it will impact
how stack pointer and frame pointer are used in IRA and reload.



-- 
H.J.

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is   aligned
  2009-10-20  1:12                               ` Michael Matz
@ 2009-10-20 19:10                                 ` H.J. Lu
  0 siblings, 0 replies; 59+ messages in thread
From: H.J. Lu @ 2009-10-20 19:10 UTC (permalink / raw)
  To: Michael Matz
  Cc: Richard Guenther, Ian Lance Taylor, Uros Bizjak, gcc-patches,
	Jakub Jelinek

On Mon, Oct 19, 2009 at 5:48 PM, Michael Matz <matz@suse.de> wrote:
> Hi,
>
> On Mon, 19 Oct 2009, H.J. Lu wrote:
>
>> The issues are
>>
>> 1. The incoming stack alignment can't be changed after RTL expansion
>> starts.
>
> And that's the thing.  Just move the prologue expansion to after the body
> expansion (see cfgexpand.c) and you should be set.  No doubt you'll hit
> some ugly obstacles, but that should be the way forward.

Currently, pass_thread_prologue_and_epilogue is processed well after
the body expansion, IRA and some reload passes. I am not familiar
with middle-end to move it that far.

>> 2. When -mstackrealign is used, we want to use 4 byte incoming stack
>> alignment on functions which use SSE vector insns.
>> 3. SSE vector insns may be generated by intrinsics, vectorizer and
>> vector operations.
>
> All of these will be automatically taken care off when (1) is solved.  I
> really think it would be worth it.  Fiddling with stack alignment in the
> vectorizer (though that's the first alignment requirers you'll hit) is not
> going to fly in the long run.
>

The whole thing is to deal with a quirk in gcc implementation of i386
psABI when -mstackrealign is used. Sure, we may have to deal some
more cases. But I don't expect any long term issue.

-- 
H.J.

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is   aligned
  2009-10-20  1:23                                         ` Michael Matz
@ 2009-10-20 19:12                                           ` H.J. Lu
  0 siblings, 0 replies; 59+ messages in thread
From: H.J. Lu @ 2009-10-20 19:12 UTC (permalink / raw)
  To: Michael Matz
  Cc: Ian Lance Taylor, Richard Guenther, Uros Bizjak, gcc-patches,
	Jakub Jelinek

On Mon, Oct 19, 2009 at 6:12 PM, Michael Matz <matz@suse.de> wrote:

>
> Somewhere we're going downhill with understanding.  I assume that stack
> alignment work (if necessary) is done in the prologue.  I further assume
> that required stack alignment depends on emitted calls and stack slots in
> the body.

Stack alignment work is done all over middle-end, RA, reload as well
as backend.  prologue is just one part of it.

> So if (as you say right above) prologue is emitted after the expansion of
> all real instructions (which then includes arbitrary temporaries, but not
> spill slots), it should be very well able to emit whatever insns necessary
> to guarantee the alignment required depending on the guaranteed incoming
> alignment.
>

The problem is we want to know what "the guaranteed incoming alignment"
is. 4byte is the safest. But we have to realign the stack for every non-leaf
function in order to generate 16byte outgoing stack alignment. When is it
safe to assuming 16byte incoming stack alignment when the actual incoming
stack alignment is really 4byte? We want to identify functions which can't
tolerate misaligned stack. So far my list is:

1. Has any SSE automatic variable.
2. Has any vector statement.


-- 
H.J.

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is   aligned
  2009-10-20  1:53                             ` Michael Matz
@ 2009-10-20 21:15                               ` H.J. Lu
  2009-10-21  1:10                                 ` H.J. Lu
  0 siblings, 1 reply; 59+ messages in thread
From: H.J. Lu @ 2009-10-20 21:15 UTC (permalink / raw)
  To: Michael Matz
  Cc: Richard Guenther, Ian Lance Taylor, Uros Bizjak, gcc-patches,
	Jakub Jelinek

[-- Attachment #1: Type: text/plain, Size: 2706 bytes --]

On Mon, Oct 19, 2009 at 6:25 PM, Michael Matz <matz@suse.de> wrote:
> Hi,
>
> On Mon, 19 Oct 2009, H.J. Lu wrote:
>
>> > He tries to solve this by pessimistically assuming that potentially
>> > everything imaginable could go on stack.  What I don't understand is
>> > why we don't instead track hard_stack_alignment in assign_*_temp
>> > (where we then assume that the stack will be aligned perfectly), and
>> > expand stack realignment code _after_ having expanded everything else
>> > (plus examined local variables for the possibility of generating spill
>> > slots).
>>
>> Vectorizer may not call assign_*_temp at all. Instead, x86 backend may
>> call gen_reg_rtx to generate pseudo registers when expanding vector
>> statement.
>
> If that (and not assign_*_temp) is the problem, then it's obvious that
> fiddling with the vectorizer doesn't solve it.  The expanders can create
> new pseudos for whatever they see fit.  For expanding vector statements,
> for expanding block moves, for expanding string compares, for expanding
> additions, for anything.  Mucking around with the vectorizer won't solve
> the problem.
>

Here is a new patch. I added hard_stack_alignment, moved
update_stack_boundary after RTL expansion and updated
ix86_function_ok_for_sibcall to deal with it.  Any comments?

Thanks.


-- 
H.J.
---
gcc/

2009-10-20  H.J. Lu  <hongjiu.lu@intel.com>

	PR target/40836
	* cfgexpand.c (get_decl_align_unit): Update hard_stack_alignment.
	(expand_one_var): Likewise.
	(gimple_expand_cfg): Initialize hard_stack_alignment to 0. Move
	update_stack_boundary call to ...
	(expand_stack_alignment): Here.

	* emit-rtl.c (gen_reg_rtx): Update hard_stack_alignment.
	* function.c (assign_stack_local_1): Likewise.
	(assign_parms): Likewise.
	(locate_and_pad_parm): Likewise.

	* function.h (rtl_data): Add hard_stack_alignment.

	* config/i386/i386.c (ix86_minimum_incoming_stack_boundary): New.
	(verride_options): Don't check ix86_force_align_arg_pointer here.
	(ix86_function_ok_for_sibcall): Use it.
	(ix86_update_stack_boundary): Likewise.

	* config/i386/i386.h (STACK_REALIGN_DEFAULT): Update comments.

gcc/testsuite/

2009-10-20  H.J. Lu  <hongjiu.lu@intel.com>

	PR target/40838
	* gcc.target/i386/incoming-6.c: New.
	* gcc.target/i386/incoming-7.c: Likewise.
	* gcc.target/i386/incoming-8.c: Likewise.
	* gcc.target/i386/incoming-9.c: Likewise.
	* gcc.target/i386/incoming-10.c: Likewise.
	* gcc.target/i386/incoming-11.c: Likewise.
	* gcc.target/i386/incoming-12.c: Likewise.
	* gcc.target/i386/incoming-13.c: Likewise.
	* gcc.target/i386/incoming-14.c: Likewise.
	* gcc.target/i386/incoming-15.c: Likewise.

[-- Attachment #2: gcc-pr40838-8.patch --]
[-- Type: text/plain, Size: 18213 bytes --]

gcc/

2009-10-20  H.J. Lu  <hongjiu.lu@intel.com>

	PR target/40836
	* cfgexpand.c (get_decl_align_unit): Update hard_stack_alignment.
	(expand_one_var): Likewise.
	(gimple_expand_cfg): Initialize hard_stack_alignment to 0. Move
	update_stack_boundary call to ...
	(expand_stack_alignment): Here.

	* emit-rtl.c (gen_reg_rtx): Update hard_stack_alignment.
	* function.c (assign_stack_local_1): Likewise.
	(assign_parms): Likewise.
	(locate_and_pad_parm): Likewise.

	* function.h (rtl_data): Add hard_stack_alignment.

	* config/i386/i386.c (ix86_minimum_incoming_stack_boundary): New.
	(verride_options): Don't check ix86_force_align_arg_pointer here.
	(ix86_function_ok_for_sibcall): Use it.
	(ix86_update_stack_boundary): Likewise.

	* config/i386/i386.h (STACK_REALIGN_DEFAULT): Update comments.

gcc/testsuite/

2009-10-20  H.J. Lu  <hongjiu.lu@intel.com>

	PR target/40838
	* gcc.target/i386/incoming-6.c: New.
	* gcc.target/i386/incoming-7.c: Likewise.
	* gcc.target/i386/incoming-8.c: Likewise.
	* gcc.target/i386/incoming-9.c: Likewise.
	* gcc.target/i386/incoming-10.c: Likewise.
	* gcc.target/i386/incoming-11.c: Likewise.
	* gcc.target/i386/incoming-12.c: Likewise.
	* gcc.target/i386/incoming-13.c: Likewise.
	* gcc.target/i386/incoming-14.c: Likewise.
	* gcc.target/i386/incoming-15.c: Likewise.

diff --git a/gcc/cfgexpand.c b/gcc/cfgexpand.c
index acd70c1..05df1fd 100644
--- a/gcc/cfgexpand.c
+++ b/gcc/cfgexpand.c
@@ -247,6 +247,9 @@ get_decl_align_unit (tree decl)
 	  gcc_assert(!crtl->stack_realign_processed);
           crtl->stack_alignment_estimated = align;
 	}
+
+      if (crtl->hard_stack_alignment < align)
+	crtl->hard_stack_alignment = align;
     }
 
   /* stack_alignment_needed > PREFERRED_STACK_BOUNDARY is permitted.
@@ -994,6 +997,9 @@ expand_one_var (tree var, bool toplevel, bool really_expand)
           gcc_assert(!crtl->stack_realign_processed);
 	  crtl->stack_alignment_estimated = align;
 	}
+
+      if (crtl->hard_stack_alignment < align)
+	crtl->hard_stack_alignment = align;
     }
 
   if (TREE_CODE (origvar) == SSA_NAME)
@@ -3472,6 +3478,19 @@ expand_stack_alignment (void)
   gcc_assert (crtl->stack_alignment_needed
 	      <= crtl->stack_alignment_estimated);
 
+  /* Call update_stack_boundary here again to update incoming stack
+     boundary.  It may set incoming stack alignment to a different
+     value after RTL expansion.  TARGET_FUNCTION_OK_FOR_SIBCALL may
+     use the minimum incoming stack alignment to check if it is OK
+     to perform sibcall optimization since sibcall optimization will
+     only align the outgoing stack to incoming stack boundary.  */
+  if (targetm.calls.update_stack_boundary)
+    targetm.calls.update_stack_boundary ();
+
+  /* The incoming stack frame has to be aligned at least at
+     parm_stack_boundary.  */
+  gcc_assert (crtl->parm_stack_boundary <= INCOMING_STACK_BOUNDARY);
+
   /* Update crtl->stack_alignment_estimated and use it later to align
      stack.  We check PREFERRED_STACK_BOUNDARY if there may be non-call
      exceptions since callgraph doesn't collect incoming stack alignment
@@ -3564,6 +3583,7 @@ gimple_expand_cfg (void)
   crtl->max_used_stack_slot_alignment = STACK_BOUNDARY;
   crtl->stack_alignment_estimated = STACK_BOUNDARY;
   crtl->preferred_stack_boundary = STACK_BOUNDARY;
+  crtl->hard_stack_alignment = 0;
   cfun->cfg->max_jumptable_ents = 0;
 
 
@@ -3626,23 +3646,6 @@ gimple_expand_cfg (void)
   if (crtl->stack_protect_guard)
     stack_protect_prologue ();
 
-  /* Update stack boundary if needed.  */
-  if (SUPPORTS_STACK_ALIGNMENT)
-    {
-      /* Call update_stack_boundary here to update incoming stack
-	 boundary before TARGET_FUNCTION_OK_FOR_SIBCALL is called.
-	 TARGET_FUNCTION_OK_FOR_SIBCALL needs to know the accurate
-	 incoming stack alignment to check if it is OK to perform
-	 sibcall optimization since sibcall optimization will only
-	 align the outgoing stack to incoming stack boundary.  */
-      if (targetm.calls.update_stack_boundary)
-	targetm.calls.update_stack_boundary ();
-      
-      /* The incoming stack frame has to be aligned at least at
-	 parm_stack_boundary.  */
-      gcc_assert (crtl->parm_stack_boundary <= INCOMING_STACK_BOUNDARY);
-    }
-
   expand_phi_nodes (&SA);
 
   /* Register rtl specific functions for cfg.  */
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 73913b8..3db9485 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -1905,6 +1905,7 @@ static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
 static bool ix86_valid_target_attribute_inner_p (tree, char *[]);
 static bool ix86_can_inline_p (tree, tree);
 static void ix86_set_current_function (tree);
+static unsigned int ix86_minimum_incoming_stack_boundary (bool);
 
 static enum calling_abi ix86_function_abi (const_tree);
 
@@ -3239,12 +3240,10 @@ override_options (bool main_args_p)
   if (ix86_force_align_arg_pointer == -1)
     ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
 
+  ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
+
   /* Validate -mincoming-stack-boundary= value or default it to
      MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY.  */
-  if (ix86_force_align_arg_pointer)
-    ix86_default_incoming_stack_boundary = MIN_STACK_BOUNDARY;
-  else
-    ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
   ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
   if (ix86_incoming_stack_boundary_string)
     {
@@ -4277,7 +4276,8 @@ ix86_function_ok_for_sibcall (tree decl, tree exp)
 
   /* If we need to align the outgoing stack, then sibcalling would
      unalign the stack, which may break the called function.  */
-  if (ix86_incoming_stack_boundary < PREFERRED_STACK_BOUNDARY)
+  if (ix86_minimum_incoming_stack_boundary (true)
+      < PREFERRED_STACK_BOUNDARY)
     return false;
 
   if (decl)
@@ -8196,37 +8196,57 @@ find_drap_reg (void)
     }
 }
 
-/* Update incoming stack boundary and estimated stack alignment.  */
+/* Return minimum incoming stack alignment.  */
 
-static void
-ix86_update_stack_boundary (void)
+static unsigned int
+ix86_minimum_incoming_stack_boundary (bool sibcall)
 {
+  unsigned int incoming_stack_boundary;
+
   /* Prefer the one specified at command line. */
-  ix86_incoming_stack_boundary 
-    = (ix86_user_incoming_stack_boundary
-       ? ix86_user_incoming_stack_boundary
-       : ix86_default_incoming_stack_boundary);
+  if (ix86_user_incoming_stack_boundary)
+    incoming_stack_boundary = ix86_user_incoming_stack_boundary;
+  /* In 32bit, use MIN_STACK_BOUNDARY for incoming stack boundary if
+     -mstackrealign is used and it is called to check if sibcall is
+     OK or hard stack alignment is 128bit.  */
+  else if (!TARGET_64BIT
+	   && ix86_force_align_arg_pointer
+	   && (sibcall || crtl->hard_stack_alignment == 128))
+    incoming_stack_boundary = MIN_STACK_BOUNDARY;
+  else
+    incoming_stack_boundary = ix86_default_incoming_stack_boundary;
 
   /* Incoming stack alignment can be changed on individual functions
      via force_align_arg_pointer attribute.  We use the smallest
      incoming stack boundary.  */
-  if (ix86_incoming_stack_boundary > MIN_STACK_BOUNDARY
+  if (incoming_stack_boundary > MIN_STACK_BOUNDARY
       && lookup_attribute (ix86_force_align_arg_pointer_string,
 			   TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
-    ix86_incoming_stack_boundary = MIN_STACK_BOUNDARY;
+    incoming_stack_boundary = MIN_STACK_BOUNDARY;
 
   /* The incoming stack frame has to be aligned at least at
      parm_stack_boundary.  */
-  if (ix86_incoming_stack_boundary < crtl->parm_stack_boundary)
-    ix86_incoming_stack_boundary = crtl->parm_stack_boundary;
+  if (incoming_stack_boundary < crtl->parm_stack_boundary)
+    incoming_stack_boundary = crtl->parm_stack_boundary;
 
   /* Stack at entrance of main is aligned by runtime.  We use the
      smallest incoming stack boundary. */
-  if (ix86_incoming_stack_boundary > MAIN_STACK_BOUNDARY
+  if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
       && DECL_NAME (current_function_decl)
       && MAIN_NAME_P (DECL_NAME (current_function_decl))
       && DECL_FILE_SCOPE_P (current_function_decl))
-    ix86_incoming_stack_boundary = MAIN_STACK_BOUNDARY;
+    incoming_stack_boundary = MAIN_STACK_BOUNDARY;
+
+  return incoming_stack_boundary;
+}
+
+/* Update incoming stack boundary and estimated stack alignment.  */
+
+static void
+ix86_update_stack_boundary (void)
+{
+  ix86_incoming_stack_boundary
+    = ix86_minimum_incoming_stack_boundary (false);
 
   /* x86_64 vararg needs 16byte stack alignment for register save
      area.  */
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 33a5077..22187a9 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -706,9 +706,7 @@ enum target_cpu_default
    generate an alternate prologue and epilogue that realigns the
    runtime stack if nessary.  This supports mixing codes that keep a
    4-byte aligned stack, as specified by i386 psABI, with codes that
-   need a 16-byte aligned stack, as required by SSE instructions.  If
-   STACK_REALIGN_DEFAULT is 1 and PREFERRED_STACK_BOUNDARY_DEFAULT is
-   128, stacks for all functions may be realigned.  */
+   need a 16-byte aligned stack, as required by SSE instructions.  */
 #define STACK_REALIGN_DEFAULT 0
 
 /* Boundary (in *bits*) on which the incoming stack is aligned.  */
diff --git a/gcc/emit-rtl.c b/gcc/emit-rtl.c
index b868298..c1ca40e 100644
--- a/gcc/emit-rtl.c
+++ b/gcc/emit-rtl.c
@@ -872,6 +872,9 @@ gen_reg_rtx (enum machine_mode mode)
       unsigned int min_align = MINIMUM_ALIGNMENT (NULL, mode, align);
       if (crtl->stack_alignment_estimated < min_align)
 	crtl->stack_alignment_estimated = min_align;
+
+      if (crtl->hard_stack_alignment < min_align)
+	crtl->hard_stack_alignment = min_align;
     }
 
   if (generating_concat_p
diff --git a/gcc/function.c b/gcc/function.c
index 35c0cfd..eb8ecfe 100644
--- a/gcc/function.c
+++ b/gcc/function.c
@@ -356,6 +356,10 @@ assign_stack_local_1 (enum machine_mode mode, HOST_WIDE_INT size,
 		}
 	    }
 	}
+
+      if (!crtl->stack_realign_processed
+	  && crtl->hard_stack_alignment < alignment_in_bits)
+	crtl->hard_stack_alignment = alignment_in_bits;
     }
 
   if (crtl->stack_alignment_needed < alignment_in_bits)
@@ -3166,6 +3170,9 @@ assign_parms (tree fndecl)
 	      gcc_assert (!crtl->stack_realign_processed);
 	      crtl->stack_alignment_estimated = align;
 	    }
+
+	  if (crtl->hard_stack_alignment < align)
+	    crtl->hard_stack_alignment = align;
 	}
 	
       if (cfun->stdarg && !TREE_CHAIN (parm))
@@ -3223,6 +3230,9 @@ assign_parms (tree fndecl)
 		  gcc_assert (!crtl->stack_realign_processed);
 		  crtl->stack_alignment_estimated = align;
 		}
+
+	      if (crtl->hard_stack_alignment < align)
+		crtl->hard_stack_alignment = align;
 	    }
 	} 
     }
@@ -3538,6 +3548,10 @@ locate_and_pad_parm (enum machine_mode passed_mode, tree type, int in_regs,
 			  && crtl->stack_realign_needed);
 	    }
 	}
+
+      if (!crtl->stack_realign_processed
+	  && crtl->hard_stack_alignment < boundary)
+	crtl->hard_stack_alignment = boundary;
     }
 
   /* Remember if the outgoing parameter requires extra alignment on the
diff --git a/gcc/function.h b/gcc/function.h
index 4825d16..8b57c9a 100644
--- a/gcc/function.h
+++ b/gcc/function.h
@@ -341,6 +341,9 @@ struct GTY(()) rtl_data {
         local stack.  */
   unsigned int stack_alignment_estimated;
 
+  /* The largest hard alignment on the stack.  */
+  unsigned int hard_stack_alignment;
+
   /* For reorg.  */
 
   /* If some insns can be deferred to the delay slots of the epilogue, the
diff --git a/gcc/testsuite/gcc.target/i386/incoming-10.c b/gcc/testsuite/gcc.target/i386/incoming-10.c
new file mode 100644
index 0000000..31d9e61
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/incoming-10.c
@@ -0,0 +1,19 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -mstackrealign -fomit-frame-pointer -O3 -march=barcelona -mpreferred-stack-boundary=4" } */
+
+struct s {
+	int x[8];
+};
+
+void g(struct s *);
+
+void f()
+{
+	int i;
+	struct s s;
+	for (i = 0; i < sizeof(s.x) / sizeof(*s.x); i++) s.x[i] = 0;
+	g(&s);
+}
+
+/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/incoming-11.c b/gcc/testsuite/gcc.target/i386/incoming-11.c
new file mode 100644
index 0000000..e5787af
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/incoming-11.c
@@ -0,0 +1,18 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -mstackrealign -fomit-frame-pointer -O3 -march=barcelona -mpreferred-stack-boundary=4" } */
+
+void g();
+
+int p[100];
+int q[100];
+
+void f()
+{
+	int i;
+	for (i = 0; i < 100; i++) p[i] = 0;
+	g();
+	for (i = 0; i < 100; i++) q[i] = 0;
+}
+
+/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/incoming-12.c b/gcc/testsuite/gcc.target/i386/incoming-12.c
new file mode 100644
index 0000000..d7ef103
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/incoming-12.c
@@ -0,0 +1,20 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -mstackrealign -O2 -msse2 -mpreferred-stack-boundary=4" } */
+
+typedef int v4si __attribute__ ((vector_size (16)));
+
+struct x {
+       v4si v;
+       v4si w;
+};
+
+void y(void *);
+
+v4si x(void)
+{
+       struct x x;
+       y(&x);
+}
+
+/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/incoming-13.c b/gcc/testsuite/gcc.target/i386/incoming-13.c
new file mode 100644
index 0000000..bbc8993
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/incoming-13.c
@@ -0,0 +1,15 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -mstackrealign -O2 -mpreferred-stack-boundary=4" } */
+
+extern double y(double *s3);
+
+extern double s1, s2;
+
+double x(void)
+{
+  double s3 = s1 + s2;
+  return y(&s3);
+}
+
+/* { dg-final { scan-assembler-not "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/incoming-14.c b/gcc/testsuite/gcc.target/i386/incoming-14.c
new file mode 100644
index 0000000..d27179d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/incoming-14.c
@@ -0,0 +1,15 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -mstackrealign -O2 -mpreferred-stack-boundary=4" } */
+
+extern int y(int *s3);
+
+extern int s1, s2;
+
+int x(void)
+{
+  int s3 = s1 + s2;
+  return y(&s3);
+}
+
+/* { dg-final { scan-assembler-not "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/incoming-15.c b/gcc/testsuite/gcc.target/i386/incoming-15.c
new file mode 100644
index 0000000..e6a1749
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/incoming-15.c
@@ -0,0 +1,15 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -mstackrealign -O2 -mpreferred-stack-boundary=4" } */
+
+extern long long y(long long *s3);
+
+extern long long s1, s2;
+
+long long x(void)
+{
+  long long s3 = s1 + s2;
+  return y(&s3);
+}
+
+/* { dg-final { scan-assembler-not "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/incoming-6.c b/gcc/testsuite/gcc.target/i386/incoming-6.c
new file mode 100644
index 0000000..5cc4ab3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/incoming-6.c
@@ -0,0 +1,17 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -mstackrealign -O2 -msse2 -mpreferred-stack-boundary=4" } */
+
+typedef int v4si __attribute__ ((vector_size (16)));
+
+extern v4si y(v4si *s3);
+
+extern v4si s1, s2;
+
+v4si x(void)
+{
+  v4si s3 = s1 + s2;
+  return y(&s3);
+}
+
+/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/incoming-7.c b/gcc/testsuite/gcc.target/i386/incoming-7.c
new file mode 100644
index 0000000..cdd6037
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/incoming-7.c
@@ -0,0 +1,16 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -mstackrealign -O2 -msse2 -mpreferred-stack-boundary=4" } */
+
+typedef int v4si __attribute__ ((vector_size (16)));
+
+extern v4si y(v4si, v4si, v4si, v4si, v4si);
+
+extern v4si s1, s2;
+
+v4si x(void)
+{
+  return y(s1, s2, s1, s2, s2);
+}
+
+/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/incoming-8.c b/gcc/testsuite/gcc.target/i386/incoming-8.c
new file mode 100644
index 0000000..2dd8800
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/incoming-8.c
@@ -0,0 +1,18 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -mstackrealign -O3 -msse2 -mpreferred-stack-boundary=4" } */
+
+float
+foo (float f)
+{
+  float array[128];
+  float x;
+  int i;
+  for (i = 0; i < sizeof(array) / sizeof(*array); i++)
+    array[i] = f;
+  for (i = 0; i < sizeof(array) / sizeof(*array); i++)
+    x += array[i];
+  return x;
+}
+
+/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/incoming-9.c b/gcc/testsuite/gcc.target/i386/incoming-9.c
new file mode 100644
index 0000000..e43cbd6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/incoming-9.c
@@ -0,0 +1,18 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -mstackrealign -O3 -mno-sse -mpreferred-stack-boundary=4" } */
+
+float
+foo (float f)
+{
+  float array[128];
+  float x;
+  int i;
+  for (i = 0; i < sizeof(array) / sizeof(*array); i++)
+    array[i] = f;
+  for (i = 0; i < sizeof(array) / sizeof(*array); i++)
+    x += array[i];
+  return x;
+}
+
+/* { dg-final { scan-assembler-not "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is   aligned
  2009-10-20 21:15                               ` H.J. Lu
@ 2009-10-21  1:10                                 ` H.J. Lu
  2009-10-21  9:54                                   ` Michael Matz
  0 siblings, 1 reply; 59+ messages in thread
From: H.J. Lu @ 2009-10-21  1:10 UTC (permalink / raw)
  To: Michael Matz
  Cc: Richard Guenther, Ian Lance Taylor, Uros Bizjak, gcc-patches,
	Jakub Jelinek

[-- Attachment #1: Type: text/plain, Size: 3054 bytes --]

On Tue, Oct 20, 2009 at 2:07 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Mon, Oct 19, 2009 at 6:25 PM, Michael Matz <matz@suse.de> wrote:
>> Hi,
>>
>> On Mon, 19 Oct 2009, H.J. Lu wrote:
>>
>>> > He tries to solve this by pessimistically assuming that potentially
>>> > everything imaginable could go on stack.  What I don't understand is
>>> > why we don't instead track hard_stack_alignment in assign_*_temp
>>> > (where we then assume that the stack will be aligned perfectly), and
>>> > expand stack realignment code _after_ having expanded everything else
>>> > (plus examined local variables for the possibility of generating spill
>>> > slots).
>>>
>>> Vectorizer may not call assign_*_temp at all. Instead, x86 backend may
>>> call gen_reg_rtx to generate pseudo registers when expanding vector
>>> statement.
>>
>> If that (and not assign_*_temp) is the problem, then it's obvious that
>> fiddling with the vectorizer doesn't solve it.  The expanders can create
>> new pseudos for whatever they see fit.  For expanding vector statements,
>> for expanding block moves, for expanding string compares, for expanding
>> additions, for anything.  Mucking around with the vectorizer won't solve
>> the problem.
>>
>
> Here is a new patch. I added hard_stack_alignment, moved
> update_stack_boundary after RTL expansion and updated
> ix86_function_ok_for_sibcall to deal with it.  Any comments?
>
Here is the updated patch.  For -mstackrealign, we don't
need to check MIN_STACK_BOUNDARY for sibcall.
Callee of the sibcall will get the same incoming stack
alignment.  I added a new testcase to check it.


-- 
H.J.
---
gcc/

2009-10-20  H.J. Lu  <hongjiu.lu@intel.com>

	PR target/40836
	* cfgexpand.c (get_decl_align_unit): Update hard_stack_alignment.
	(expand_one_var): Likewise.
	(gimple_expand_cfg): Initialize hard_stack_alignment to 0. Move
	update_stack_boundary call to ...
	(expand_stack_alignment): Here.

	* emit-rtl.c (gen_reg_rtx): Update hard_stack_alignment.
	* function.c (assign_stack_local_1): Likewise.
	(assign_parms): Likewise.
	(locate_and_pad_parm): Likewise.

	* function.h (rtl_data): Add hard_stack_alignment.

	* config/i386/i386.c (ix86_minimum_incoming_stack_boundary): New.
	(verride_options): Don't check ix86_force_align_arg_pointer here.
	(ix86_function_ok_for_sibcall): Use it.
	(ix86_update_stack_boundary): Likewise.

	* config/i386/i386.h (STACK_REALIGN_DEFAULT): Update comments.

gcc/testsuite/

2009-10-20  H.J. Lu  <hongjiu.lu@intel.com>

	PR target/40838
	* gcc.target/i386/incoming-6.c: New.
	* gcc.target/i386/incoming-7.c: Likewise.
	* gcc.target/i386/incoming-8.c: Likewise.
	* gcc.target/i386/incoming-9.c: Likewise.
	* gcc.target/i386/incoming-10.c: Likewise.
	* gcc.target/i386/incoming-11.c: Likewise.
	* gcc.target/i386/incoming-12.c: Likewise.
	* gcc.target/i386/incoming-13.c: Likewise.
	* gcc.target/i386/incoming-14.c: Likewise.
	* gcc.target/i386/incoming-15.c: Likewise.
	* gcc.target/i386/pr37843-4.c: Likewise.

[-- Attachment #2: gcc-pr40838-9.patch --]
[-- Type: text/plain, Size: 18940 bytes --]

gcc/

2009-10-20  H.J. Lu  <hongjiu.lu@intel.com>

	PR target/40836
	* cfgexpand.c (get_decl_align_unit): Update hard_stack_alignment.
	(expand_one_var): Likewise.
	(gimple_expand_cfg): Initialize hard_stack_alignment to 0. Move
	update_stack_boundary call to ...
	(expand_stack_alignment): Here.

	* emit-rtl.c (gen_reg_rtx): Update hard_stack_alignment.
	* function.c (assign_stack_local_1): Likewise.
	(assign_parms): Likewise.
	(locate_and_pad_parm): Likewise.

	* function.h (rtl_data): Add hard_stack_alignment.

	* config/i386/i386.c (ix86_minimum_incoming_stack_boundary): New.
	(verride_options): Don't check ix86_force_align_arg_pointer here.
	(ix86_function_ok_for_sibcall): Use it.
	(ix86_update_stack_boundary): Likewise.

	* config/i386/i386.h (STACK_REALIGN_DEFAULT): Update comments.

gcc/testsuite/

2009-10-20  H.J. Lu  <hongjiu.lu@intel.com>

	PR target/40838
	* gcc.target/i386/incoming-6.c: New.
	* gcc.target/i386/incoming-7.c: Likewise.
	* gcc.target/i386/incoming-8.c: Likewise.
	* gcc.target/i386/incoming-9.c: Likewise.
	* gcc.target/i386/incoming-10.c: Likewise.
	* gcc.target/i386/incoming-11.c: Likewise.
	* gcc.target/i386/incoming-12.c: Likewise.
	* gcc.target/i386/incoming-13.c: Likewise.
	* gcc.target/i386/incoming-14.c: Likewise.
	* gcc.target/i386/incoming-15.c: Likewise.
	* gcc.target/i386/pr37843-4.c: Likewise.

diff --git a/gcc/cfgexpand.c b/gcc/cfgexpand.c
index 2678d7e..27475d7 100644
--- a/gcc/cfgexpand.c
+++ b/gcc/cfgexpand.c
@@ -247,6 +247,9 @@ get_decl_align_unit (tree decl)
 	  gcc_assert(!crtl->stack_realign_processed);
           crtl->stack_alignment_estimated = align;
 	}
+
+      if (crtl->hard_stack_alignment < align)
+	crtl->hard_stack_alignment = align;
     }
 
   /* stack_alignment_needed > PREFERRED_STACK_BOUNDARY is permitted.
@@ -994,6 +997,9 @@ expand_one_var (tree var, bool toplevel, bool really_expand)
           gcc_assert(!crtl->stack_realign_processed);
 	  crtl->stack_alignment_estimated = align;
 	}
+
+      if (crtl->hard_stack_alignment < align)
+	crtl->hard_stack_alignment = align;
     }
 
   if (TREE_CODE (origvar) == SSA_NAME)
@@ -3433,6 +3439,19 @@ expand_stack_alignment (void)
   gcc_assert (crtl->stack_alignment_needed
 	      <= crtl->stack_alignment_estimated);
 
+  /* Call update_stack_boundary here again to update incoming stack
+     boundary.  It may set incoming stack alignment to a different
+     value after RTL expansion.  TARGET_FUNCTION_OK_FOR_SIBCALL may
+     use the minimum incoming stack alignment to check if it is OK
+     to perform sibcall optimization since sibcall optimization will
+     only align the outgoing stack to incoming stack boundary.  */
+  if (targetm.calls.update_stack_boundary)
+    targetm.calls.update_stack_boundary ();
+
+  /* The incoming stack frame has to be aligned at least at
+     parm_stack_boundary.  */
+  gcc_assert (crtl->parm_stack_boundary <= INCOMING_STACK_BOUNDARY);
+
   /* Update crtl->stack_alignment_estimated and use it later to align
      stack.  We check PREFERRED_STACK_BOUNDARY if there may be non-call
      exceptions since callgraph doesn't collect incoming stack alignment
@@ -3525,6 +3544,7 @@ gimple_expand_cfg (void)
   crtl->max_used_stack_slot_alignment = STACK_BOUNDARY;
   crtl->stack_alignment_estimated = STACK_BOUNDARY;
   crtl->preferred_stack_boundary = STACK_BOUNDARY;
+  crtl->hard_stack_alignment = 0;
   cfun->cfg->max_jumptable_ents = 0;
 
 
@@ -3587,23 +3607,6 @@ gimple_expand_cfg (void)
   if (crtl->stack_protect_guard)
     stack_protect_prologue ();
 
-  /* Update stack boundary if needed.  */
-  if (SUPPORTS_STACK_ALIGNMENT)
-    {
-      /* Call update_stack_boundary here to update incoming stack
-	 boundary before TARGET_FUNCTION_OK_FOR_SIBCALL is called.
-	 TARGET_FUNCTION_OK_FOR_SIBCALL needs to know the accurate
-	 incoming stack alignment to check if it is OK to perform
-	 sibcall optimization since sibcall optimization will only
-	 align the outgoing stack to incoming stack boundary.  */
-      if (targetm.calls.update_stack_boundary)
-	targetm.calls.update_stack_boundary ();
-      
-      /* The incoming stack frame has to be aligned at least at
-	 parm_stack_boundary.  */
-      gcc_assert (crtl->parm_stack_boundary <= INCOMING_STACK_BOUNDARY);
-    }
-
   expand_phi_nodes (&SA);
 
   /* Register rtl specific functions for cfg.  */
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 6065f49..6603250 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -1905,6 +1905,7 @@ static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
 static bool ix86_valid_target_attribute_inner_p (tree, char *[]);
 static bool ix86_can_inline_p (tree, tree);
 static void ix86_set_current_function (tree);
+static unsigned int ix86_minimum_incoming_stack_boundary (bool);
 
 static enum calling_abi ix86_function_abi (const_tree);
 
@@ -3239,12 +3240,10 @@ override_options (bool main_args_p)
   if (ix86_force_align_arg_pointer == -1)
     ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
 
+  ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
+
   /* Validate -mincoming-stack-boundary= value or default it to
      MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY.  */
-  if (ix86_force_align_arg_pointer)
-    ix86_default_incoming_stack_boundary = MIN_STACK_BOUNDARY;
-  else
-    ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
   ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
   if (ix86_incoming_stack_boundary_string)
     {
@@ -4277,7 +4276,8 @@ ix86_function_ok_for_sibcall (tree decl, tree exp)
 
   /* If we need to align the outgoing stack, then sibcalling would
      unalign the stack, which may break the called function.  */
-  if (ix86_incoming_stack_boundary < PREFERRED_STACK_BOUNDARY)
+  if (ix86_minimum_incoming_stack_boundary (true)
+      < PREFERRED_STACK_BOUNDARY)
     return false;
 
   if (decl)
@@ -8196,37 +8196,58 @@ find_drap_reg (void)
     }
 }
 
-/* Update incoming stack boundary and estimated stack alignment.  */
+/* Return minimum incoming stack alignment.  */
 
-static void
-ix86_update_stack_boundary (void)
+static unsigned int
+ix86_minimum_incoming_stack_boundary (bool sibcall)
 {
+  unsigned int incoming_stack_boundary;
+
   /* Prefer the one specified at command line. */
-  ix86_incoming_stack_boundary 
-    = (ix86_user_incoming_stack_boundary
-       ? ix86_user_incoming_stack_boundary
-       : ix86_default_incoming_stack_boundary);
+  if (ix86_user_incoming_stack_boundary)
+    incoming_stack_boundary = ix86_user_incoming_stack_boundary;
+  /* In 32bit, use MIN_STACK_BOUNDARY for incoming stack boundary if
+     -mstackrealign is used, it isn't used for sibcall check and hard
+     stack alignment is 128bit.  */
+  else if (!sibcall
+	   && !TARGET_64BIT
+	   && ix86_force_align_arg_pointer
+	   && crtl->hard_stack_alignment == 128)
+    incoming_stack_boundary = MIN_STACK_BOUNDARY;
+  else
+    incoming_stack_boundary = ix86_default_incoming_stack_boundary;
 
   /* Incoming stack alignment can be changed on individual functions
      via force_align_arg_pointer attribute.  We use the smallest
      incoming stack boundary.  */
-  if (ix86_incoming_stack_boundary > MIN_STACK_BOUNDARY
+  if (incoming_stack_boundary > MIN_STACK_BOUNDARY
       && lookup_attribute (ix86_force_align_arg_pointer_string,
 			   TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
-    ix86_incoming_stack_boundary = MIN_STACK_BOUNDARY;
+    incoming_stack_boundary = MIN_STACK_BOUNDARY;
 
   /* The incoming stack frame has to be aligned at least at
      parm_stack_boundary.  */
-  if (ix86_incoming_stack_boundary < crtl->parm_stack_boundary)
-    ix86_incoming_stack_boundary = crtl->parm_stack_boundary;
+  if (incoming_stack_boundary < crtl->parm_stack_boundary)
+    incoming_stack_boundary = crtl->parm_stack_boundary;
 
   /* Stack at entrance of main is aligned by runtime.  We use the
      smallest incoming stack boundary. */
-  if (ix86_incoming_stack_boundary > MAIN_STACK_BOUNDARY
+  if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
       && DECL_NAME (current_function_decl)
       && MAIN_NAME_P (DECL_NAME (current_function_decl))
       && DECL_FILE_SCOPE_P (current_function_decl))
-    ix86_incoming_stack_boundary = MAIN_STACK_BOUNDARY;
+    incoming_stack_boundary = MAIN_STACK_BOUNDARY;
+
+  return incoming_stack_boundary;
+}
+
+/* Update incoming stack boundary and estimated stack alignment.  */
+
+static void
+ix86_update_stack_boundary (void)
+{
+  ix86_incoming_stack_boundary
+    = ix86_minimum_incoming_stack_boundary (false);
 
   /* x86_64 vararg needs 16byte stack alignment for register save
      area.  */
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 33a5077..22187a9 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -706,9 +706,7 @@ enum target_cpu_default
    generate an alternate prologue and epilogue that realigns the
    runtime stack if nessary.  This supports mixing codes that keep a
    4-byte aligned stack, as specified by i386 psABI, with codes that
-   need a 16-byte aligned stack, as required by SSE instructions.  If
-   STACK_REALIGN_DEFAULT is 1 and PREFERRED_STACK_BOUNDARY_DEFAULT is
-   128, stacks for all functions may be realigned.  */
+   need a 16-byte aligned stack, as required by SSE instructions.  */
 #define STACK_REALIGN_DEFAULT 0
 
 /* Boundary (in *bits*) on which the incoming stack is aligned.  */
diff --git a/gcc/emit-rtl.c b/gcc/emit-rtl.c
index b868298..c1ca40e 100644
--- a/gcc/emit-rtl.c
+++ b/gcc/emit-rtl.c
@@ -872,6 +872,9 @@ gen_reg_rtx (enum machine_mode mode)
       unsigned int min_align = MINIMUM_ALIGNMENT (NULL, mode, align);
       if (crtl->stack_alignment_estimated < min_align)
 	crtl->stack_alignment_estimated = min_align;
+
+      if (crtl->hard_stack_alignment < min_align)
+	crtl->hard_stack_alignment = min_align;
     }
 
   if (generating_concat_p
diff --git a/gcc/function.c b/gcc/function.c
index 35c0cfd..eb8ecfe 100644
--- a/gcc/function.c
+++ b/gcc/function.c
@@ -356,6 +356,10 @@ assign_stack_local_1 (enum machine_mode mode, HOST_WIDE_INT size,
 		}
 	    }
 	}
+
+      if (!crtl->stack_realign_processed
+	  && crtl->hard_stack_alignment < alignment_in_bits)
+	crtl->hard_stack_alignment = alignment_in_bits;
     }
 
   if (crtl->stack_alignment_needed < alignment_in_bits)
@@ -3166,6 +3170,9 @@ assign_parms (tree fndecl)
 	      gcc_assert (!crtl->stack_realign_processed);
 	      crtl->stack_alignment_estimated = align;
 	    }
+
+	  if (crtl->hard_stack_alignment < align)
+	    crtl->hard_stack_alignment = align;
 	}
 	
       if (cfun->stdarg && !TREE_CHAIN (parm))
@@ -3223,6 +3230,9 @@ assign_parms (tree fndecl)
 		  gcc_assert (!crtl->stack_realign_processed);
 		  crtl->stack_alignment_estimated = align;
 		}
+
+	      if (crtl->hard_stack_alignment < align)
+		crtl->hard_stack_alignment = align;
 	    }
 	} 
     }
@@ -3538,6 +3548,10 @@ locate_and_pad_parm (enum machine_mode passed_mode, tree type, int in_regs,
 			  && crtl->stack_realign_needed);
 	    }
 	}
+
+      if (!crtl->stack_realign_processed
+	  && crtl->hard_stack_alignment < boundary)
+	crtl->hard_stack_alignment = boundary;
     }
 
   /* Remember if the outgoing parameter requires extra alignment on the
diff --git a/gcc/function.h b/gcc/function.h
index 4825d16..8b57c9a 100644
--- a/gcc/function.h
+++ b/gcc/function.h
@@ -341,6 +341,9 @@ struct GTY(()) rtl_data {
         local stack.  */
   unsigned int stack_alignment_estimated;
 
+  /* The largest hard alignment on the stack.  */
+  unsigned int hard_stack_alignment;
+
   /* For reorg.  */
 
   /* If some insns can be deferred to the delay slots of the epilogue, the
diff --git a/gcc/testsuite/gcc.target/i386/incoming-10.c b/gcc/testsuite/gcc.target/i386/incoming-10.c
new file mode 100644
index 0000000..31d9e61
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/incoming-10.c
@@ -0,0 +1,19 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -mstackrealign -fomit-frame-pointer -O3 -march=barcelona -mpreferred-stack-boundary=4" } */
+
+struct s {
+	int x[8];
+};
+
+void g(struct s *);
+
+void f()
+{
+	int i;
+	struct s s;
+	for (i = 0; i < sizeof(s.x) / sizeof(*s.x); i++) s.x[i] = 0;
+	g(&s);
+}
+
+/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/incoming-11.c b/gcc/testsuite/gcc.target/i386/incoming-11.c
new file mode 100644
index 0000000..e5787af
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/incoming-11.c
@@ -0,0 +1,18 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -mstackrealign -fomit-frame-pointer -O3 -march=barcelona -mpreferred-stack-boundary=4" } */
+
+void g();
+
+int p[100];
+int q[100];
+
+void f()
+{
+	int i;
+	for (i = 0; i < 100; i++) p[i] = 0;
+	g();
+	for (i = 0; i < 100; i++) q[i] = 0;
+}
+
+/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/incoming-12.c b/gcc/testsuite/gcc.target/i386/incoming-12.c
new file mode 100644
index 0000000..d7ef103
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/incoming-12.c
@@ -0,0 +1,20 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -mstackrealign -O2 -msse2 -mpreferred-stack-boundary=4" } */
+
+typedef int v4si __attribute__ ((vector_size (16)));
+
+struct x {
+       v4si v;
+       v4si w;
+};
+
+void y(void *);
+
+v4si x(void)
+{
+       struct x x;
+       y(&x);
+}
+
+/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/incoming-13.c b/gcc/testsuite/gcc.target/i386/incoming-13.c
new file mode 100644
index 0000000..bbc8993
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/incoming-13.c
@@ -0,0 +1,15 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -mstackrealign -O2 -mpreferred-stack-boundary=4" } */
+
+extern double y(double *s3);
+
+extern double s1, s2;
+
+double x(void)
+{
+  double s3 = s1 + s2;
+  return y(&s3);
+}
+
+/* { dg-final { scan-assembler-not "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/incoming-14.c b/gcc/testsuite/gcc.target/i386/incoming-14.c
new file mode 100644
index 0000000..d27179d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/incoming-14.c
@@ -0,0 +1,15 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -mstackrealign -O2 -mpreferred-stack-boundary=4" } */
+
+extern int y(int *s3);
+
+extern int s1, s2;
+
+int x(void)
+{
+  int s3 = s1 + s2;
+  return y(&s3);
+}
+
+/* { dg-final { scan-assembler-not "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/incoming-15.c b/gcc/testsuite/gcc.target/i386/incoming-15.c
new file mode 100644
index 0000000..e6a1749
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/incoming-15.c
@@ -0,0 +1,15 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -mstackrealign -O2 -mpreferred-stack-boundary=4" } */
+
+extern long long y(long long *s3);
+
+extern long long s1, s2;
+
+long long x(void)
+{
+  long long s3 = s1 + s2;
+  return y(&s3);
+}
+
+/* { dg-final { scan-assembler-not "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/incoming-6.c b/gcc/testsuite/gcc.target/i386/incoming-6.c
new file mode 100644
index 0000000..5cc4ab3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/incoming-6.c
@@ -0,0 +1,17 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -mstackrealign -O2 -msse2 -mpreferred-stack-boundary=4" } */
+
+typedef int v4si __attribute__ ((vector_size (16)));
+
+extern v4si y(v4si *s3);
+
+extern v4si s1, s2;
+
+v4si x(void)
+{
+  v4si s3 = s1 + s2;
+  return y(&s3);
+}
+
+/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/incoming-7.c b/gcc/testsuite/gcc.target/i386/incoming-7.c
new file mode 100644
index 0000000..cdd6037
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/incoming-7.c
@@ -0,0 +1,16 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -mstackrealign -O2 -msse2 -mpreferred-stack-boundary=4" } */
+
+typedef int v4si __attribute__ ((vector_size (16)));
+
+extern v4si y(v4si, v4si, v4si, v4si, v4si);
+
+extern v4si s1, s2;
+
+v4si x(void)
+{
+  return y(s1, s2, s1, s2, s2);
+}
+
+/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/incoming-8.c b/gcc/testsuite/gcc.target/i386/incoming-8.c
new file mode 100644
index 0000000..2dd8800
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/incoming-8.c
@@ -0,0 +1,18 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -mstackrealign -O3 -msse2 -mpreferred-stack-boundary=4" } */
+
+float
+foo (float f)
+{
+  float array[128];
+  float x;
+  int i;
+  for (i = 0; i < sizeof(array) / sizeof(*array); i++)
+    array[i] = f;
+  for (i = 0; i < sizeof(array) / sizeof(*array); i++)
+    x += array[i];
+  return x;
+}
+
+/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/incoming-9.c b/gcc/testsuite/gcc.target/i386/incoming-9.c
new file mode 100644
index 0000000..e43cbd6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/incoming-9.c
@@ -0,0 +1,18 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -mstackrealign -O3 -mno-sse -mpreferred-stack-boundary=4" } */
+
+float
+foo (float f)
+{
+  float array[128];
+  float x;
+  int i;
+  for (i = 0; i < sizeof(array) / sizeof(*array); i++)
+    array[i] = f;
+  for (i = 0; i < sizeof(array) / sizeof(*array); i++)
+    x += array[i];
+  return x;
+}
+
+/* { dg-final { scan-assembler-not "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr37843-4.c b/gcc/testsuite/gcc.target/i386/pr37843-4.c
new file mode 100644
index 0000000..8e5f51f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr37843-4.c
@@ -0,0 +1,13 @@
+/* Test for stack alignment with sibcall optimization.  */
+/* { dg-do compile { target { ilp32 && nonpic } } } */
+/* { dg-options "-O2 -msse2 -mpreferred-stack-boundary=4 -mstackrealign" } */
+/* { dg-final { scan-assembler-not "andl\[\\t \]*\\$-16,\[\\t \]*%\[re\]?sp" } } */
+/* { dg-final { scan-assembler-not "call\[\\t \]*foo" } } */
+/* { dg-final { scan-assembler "jmp\[\\t \]*foo" } } */
+
+extern int foo (void);
+
+int bar (void)
+{
+    return foo();
+}

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is   aligned
  2009-10-21  1:10                                 ` H.J. Lu
@ 2009-10-21  9:54                                   ` Michael Matz
  2009-10-21 16:56                                     ` H.J. Lu
  0 siblings, 1 reply; 59+ messages in thread
From: Michael Matz @ 2009-10-21  9:54 UTC (permalink / raw)
  To: H.J. Lu
  Cc: Richard Guenther, Ian Lance Taylor, Uros Bizjak, gcc-patches,
	Jakub Jelinek

[-- Attachment #1: Type: TEXT/PLAIN, Size: 358 bytes --]

Hi,

On Tue, 20 Oct 2009, H.J. Lu wrote:

> > Here is a new patch. I added hard_stack_alignment, moved 
> > update_stack_boundary after RTL expansion and updated 
> > ix86_function_ok_for_sibcall to deal with it.  Any comments?

Well, I think this approach of tracking the required alignment (in 
gen_reg_rtx especially) is indeed better.


Ciao,
Michael.

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is   aligned
  2009-10-21  9:54                                   ` Michael Matz
@ 2009-10-21 16:56                                     ` H.J. Lu
  2009-10-30 10:08                                       ` Richard Guenther
  0 siblings, 1 reply; 59+ messages in thread
From: H.J. Lu @ 2009-10-21 16:56 UTC (permalink / raw)
  To: Michael Matz
  Cc: Richard Guenther, Ian Lance Taylor, Uros Bizjak, gcc-patches,
	Jakub Jelinek

[-- Attachment #1: Type: text/plain, Size: 1859 bytes --]

On Wed, Oct 21, 2009 at 2:05 AM, Michael Matz <matz@suse.de> wrote:
> Hi,
>
> On Tue, 20 Oct 2009, H.J. Lu wrote:
>
>> > Here is a new patch. I added hard_stack_alignment, moved
>> > update_stack_boundary after RTL expansion and updated
>> > ix86_function_ok_for_sibcall to deal with it.  Any comments?
>
> Well, I think this approach of tracking the required alignment (in
> gen_reg_rtx especially) is indeed better.
>

Here is the updated patch. We don't need to add
hard_stack_alignment.  We can initialize
stack_alignment_estimated to 0 and check it instead.
Tested it on Linux/x86-64 with -m32. OK for trunk?

Thanks.


-- 
H.J.
---
gcc/

2009-10-21  H.J. Lu  <hongjiu.lu@intel.com>

	PR target/40836
	* cfgexpand.c (expand_stack_alignment): Call update_stack_boundary
	first.  Move assert on stack_alignment_estimated just before
	setting stack_realign_needed.
	(gimple_expand_cfg): Initialize stack_alignment_estimated to 0.
	Don't call update_stack_boundary.

	* config/i386/i386.c (ix86_minimum_incoming_stack_boundary): New.
	(verride_options): Don't check ix86_force_align_arg_pointer here.
	(ix86_function_ok_for_sibcall): Use it.
	(ix86_update_stack_boundary): Likewise.

	* config/i386/i386.h (STACK_REALIGN_DEFAULT): Update comments.

gcc/testsuite/

2009-10-21  H.J. Lu  <hongjiu.lu@intel.com>

	PR target/40838
	* gcc.target/i386/incoming-6.c: New.
	* gcc.target/i386/incoming-7.c: Likewise.
	* gcc.target/i386/incoming-8.c: Likewise.
	* gcc.target/i386/incoming-9.c: Likewise.
	* gcc.target/i386/incoming-10.c: Likewise.
	* gcc.target/i386/incoming-11.c: Likewise.
	* gcc.target/i386/incoming-12.c: Likewise.
	* gcc.target/i386/incoming-13.c: Likewise.
	* gcc.target/i386/incoming-14.c: Likewise.
	* gcc.target/i386/incoming-15.c: Likewise.
	* gcc.target/i386/pr37843-4.c: Likewise.

[-- Attachment #2: gcc-pr40838-10.patch --]
[-- Type: text/plain, Size: 16361 bytes --]

gcc/

2009-10-21  H.J. Lu  <hongjiu.lu@intel.com>

	PR target/40836
	* cfgexpand.c (expand_stack_alignment): Call update_stack_boundary
	first.  Move assert on stack_alignment_estimated just before
	setting stack_realign_needed.
	(gimple_expand_cfg): Initialize stack_alignment_estimated to 0.
	Don't call update_stack_boundary.

	* config/i386/i386.c (ix86_minimum_incoming_stack_boundary): New.
	(verride_options): Don't check ix86_force_align_arg_pointer here.
	(ix86_function_ok_for_sibcall): Use it.
	(ix86_update_stack_boundary): Likewise.

	* config/i386/i386.h (STACK_REALIGN_DEFAULT): Update comments.

gcc/testsuite/

2009-10-21  H.J. Lu  <hongjiu.lu@intel.com>

	PR target/40838
	* gcc.target/i386/incoming-6.c: New.
	* gcc.target/i386/incoming-7.c: Likewise.
	* gcc.target/i386/incoming-8.c: Likewise.
	* gcc.target/i386/incoming-9.c: Likewise.
	* gcc.target/i386/incoming-10.c: Likewise.
	* gcc.target/i386/incoming-11.c: Likewise.
	* gcc.target/i386/incoming-12.c: Likewise.
	* gcc.target/i386/incoming-13.c: Likewise.
	* gcc.target/i386/incoming-14.c: Likewise.
	* gcc.target/i386/incoming-15.c: Likewise.
	* gcc.target/i386/pr37843-4.c: Likewise.

diff --git a/gcc/cfgexpand.c b/gcc/cfgexpand.c
index 2678d7e..a4b8e4f 100644
--- a/gcc/cfgexpand.c
+++ b/gcc/cfgexpand.c
@@ -3430,8 +3430,18 @@ expand_stack_alignment (void)
       || crtl->has_nonlocal_goto)
     crtl->need_drap = true;
 
-  gcc_assert (crtl->stack_alignment_needed
-	      <= crtl->stack_alignment_estimated);
+  /* Call update_stack_boundary here again to update incoming stack
+     boundary.  It may set incoming stack alignment to a different
+     value after RTL expansion.  TARGET_FUNCTION_OK_FOR_SIBCALL may
+     use the minimum incoming stack alignment to check if it is OK
+     to perform sibcall optimization since sibcall optimization will
+     only align the outgoing stack to incoming stack boundary.  */
+  if (targetm.calls.update_stack_boundary)
+    targetm.calls.update_stack_boundary ();
+
+  /* The incoming stack frame has to be aligned at least at
+     parm_stack_boundary.  */
+  gcc_assert (crtl->parm_stack_boundary <= INCOMING_STACK_BOUNDARY);
 
   /* Update crtl->stack_alignment_estimated and use it later to align
      stack.  We check PREFERRED_STACK_BOUNDARY if there may be non-call
@@ -3447,6 +3457,9 @@ expand_stack_alignment (void)
   if (preferred_stack_boundary > crtl->stack_alignment_needed)
     crtl->stack_alignment_needed = preferred_stack_boundary;
 
+  gcc_assert (crtl->stack_alignment_needed
+	      <= crtl->stack_alignment_estimated);
+
   crtl->stack_realign_needed
     = INCOMING_STACK_BOUNDARY < crtl->stack_alignment_estimated;
   crtl->stack_realign_tried = crtl->stack_realign_needed;
@@ -3523,7 +3536,7 @@ gimple_expand_cfg (void)
   targetm.expand_to_rtl_hook ();
   crtl->stack_alignment_needed = STACK_BOUNDARY;
   crtl->max_used_stack_slot_alignment = STACK_BOUNDARY;
-  crtl->stack_alignment_estimated = STACK_BOUNDARY;
+  crtl->stack_alignment_estimated = 0;
   crtl->preferred_stack_boundary = STACK_BOUNDARY;
   cfun->cfg->max_jumptable_ents = 0;
 
@@ -3587,23 +3600,6 @@ gimple_expand_cfg (void)
   if (crtl->stack_protect_guard)
     stack_protect_prologue ();
 
-  /* Update stack boundary if needed.  */
-  if (SUPPORTS_STACK_ALIGNMENT)
-    {
-      /* Call update_stack_boundary here to update incoming stack
-	 boundary before TARGET_FUNCTION_OK_FOR_SIBCALL is called.
-	 TARGET_FUNCTION_OK_FOR_SIBCALL needs to know the accurate
-	 incoming stack alignment to check if it is OK to perform
-	 sibcall optimization since sibcall optimization will only
-	 align the outgoing stack to incoming stack boundary.  */
-      if (targetm.calls.update_stack_boundary)
-	targetm.calls.update_stack_boundary ();
-      
-      /* The incoming stack frame has to be aligned at least at
-	 parm_stack_boundary.  */
-      gcc_assert (crtl->parm_stack_boundary <= INCOMING_STACK_BOUNDARY);
-    }
-
   expand_phi_nodes (&SA);
 
   /* Register rtl specific functions for cfg.  */
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 6065f49..e222f72 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -1905,6 +1905,7 @@ static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
 static bool ix86_valid_target_attribute_inner_p (tree, char *[]);
 static bool ix86_can_inline_p (tree, tree);
 static void ix86_set_current_function (tree);
+static unsigned int ix86_minimum_incoming_stack_boundary (bool);
 
 static enum calling_abi ix86_function_abi (const_tree);
 
@@ -3239,12 +3240,10 @@ override_options (bool main_args_p)
   if (ix86_force_align_arg_pointer == -1)
     ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
 
+  ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
+
   /* Validate -mincoming-stack-boundary= value or default it to
      MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY.  */
-  if (ix86_force_align_arg_pointer)
-    ix86_default_incoming_stack_boundary = MIN_STACK_BOUNDARY;
-  else
-    ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
   ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
   if (ix86_incoming_stack_boundary_string)
     {
@@ -4277,7 +4276,8 @@ ix86_function_ok_for_sibcall (tree decl, tree exp)
 
   /* If we need to align the outgoing stack, then sibcalling would
      unalign the stack, which may break the called function.  */
-  if (ix86_incoming_stack_boundary < PREFERRED_STACK_BOUNDARY)
+  if (ix86_minimum_incoming_stack_boundary (true)
+      < PREFERRED_STACK_BOUNDARY)
     return false;
 
   if (decl)
@@ -8196,37 +8196,58 @@ find_drap_reg (void)
     }
 }
 
-/* Update incoming stack boundary and estimated stack alignment.  */
+/* Return minimum incoming stack alignment.  */
 
-static void
-ix86_update_stack_boundary (void)
+static unsigned int
+ix86_minimum_incoming_stack_boundary (bool sibcall)
 {
+  unsigned int incoming_stack_boundary;
+
   /* Prefer the one specified at command line. */
-  ix86_incoming_stack_boundary 
-    = (ix86_user_incoming_stack_boundary
-       ? ix86_user_incoming_stack_boundary
-       : ix86_default_incoming_stack_boundary);
+  if (ix86_user_incoming_stack_boundary)
+    incoming_stack_boundary = ix86_user_incoming_stack_boundary;
+  /* In 32bit, use MIN_STACK_BOUNDARY for incoming stack boundary
+     if -mstackrealign is used, it isn't used for sibcall check and 
+     estimated stack alignment is 128bit.  */
+  else if (!sibcall
+	   && !TARGET_64BIT
+	   && ix86_force_align_arg_pointer
+	   && crtl->stack_alignment_estimated == 128)
+    incoming_stack_boundary = MIN_STACK_BOUNDARY;
+  else
+    incoming_stack_boundary = ix86_default_incoming_stack_boundary;
 
   /* Incoming stack alignment can be changed on individual functions
      via force_align_arg_pointer attribute.  We use the smallest
      incoming stack boundary.  */
-  if (ix86_incoming_stack_boundary > MIN_STACK_BOUNDARY
+  if (incoming_stack_boundary > MIN_STACK_BOUNDARY
       && lookup_attribute (ix86_force_align_arg_pointer_string,
 			   TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
-    ix86_incoming_stack_boundary = MIN_STACK_BOUNDARY;
+    incoming_stack_boundary = MIN_STACK_BOUNDARY;
 
   /* The incoming stack frame has to be aligned at least at
      parm_stack_boundary.  */
-  if (ix86_incoming_stack_boundary < crtl->parm_stack_boundary)
-    ix86_incoming_stack_boundary = crtl->parm_stack_boundary;
+  if (incoming_stack_boundary < crtl->parm_stack_boundary)
+    incoming_stack_boundary = crtl->parm_stack_boundary;
 
   /* Stack at entrance of main is aligned by runtime.  We use the
      smallest incoming stack boundary. */
-  if (ix86_incoming_stack_boundary > MAIN_STACK_BOUNDARY
+  if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
       && DECL_NAME (current_function_decl)
       && MAIN_NAME_P (DECL_NAME (current_function_decl))
       && DECL_FILE_SCOPE_P (current_function_decl))
-    ix86_incoming_stack_boundary = MAIN_STACK_BOUNDARY;
+    incoming_stack_boundary = MAIN_STACK_BOUNDARY;
+
+  return incoming_stack_boundary;
+}
+
+/* Update incoming stack boundary and estimated stack alignment.  */
+
+static void
+ix86_update_stack_boundary (void)
+{
+  ix86_incoming_stack_boundary
+    = ix86_minimum_incoming_stack_boundary (false);
 
   /* x86_64 vararg needs 16byte stack alignment for register save
      area.  */
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 33a5077..22187a9 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -706,9 +706,7 @@ enum target_cpu_default
    generate an alternate prologue and epilogue that realigns the
    runtime stack if nessary.  This supports mixing codes that keep a
    4-byte aligned stack, as specified by i386 psABI, with codes that
-   need a 16-byte aligned stack, as required by SSE instructions.  If
-   STACK_REALIGN_DEFAULT is 1 and PREFERRED_STACK_BOUNDARY_DEFAULT is
-   128, stacks for all functions may be realigned.  */
+   need a 16-byte aligned stack, as required by SSE instructions.  */
 #define STACK_REALIGN_DEFAULT 0
 
 /* Boundary (in *bits*) on which the incoming stack is aligned.  */
diff --git a/gcc/testsuite/gcc.target/i386/incoming-10.c b/gcc/testsuite/gcc.target/i386/incoming-10.c
new file mode 100644
index 0000000..31d9e61
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/incoming-10.c
@@ -0,0 +1,19 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -mstackrealign -fomit-frame-pointer -O3 -march=barcelona -mpreferred-stack-boundary=4" } */
+
+struct s {
+	int x[8];
+};
+
+void g(struct s *);
+
+void f()
+{
+	int i;
+	struct s s;
+	for (i = 0; i < sizeof(s.x) / sizeof(*s.x); i++) s.x[i] = 0;
+	g(&s);
+}
+
+/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/incoming-11.c b/gcc/testsuite/gcc.target/i386/incoming-11.c
new file mode 100644
index 0000000..e5787af
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/incoming-11.c
@@ -0,0 +1,18 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -mstackrealign -fomit-frame-pointer -O3 -march=barcelona -mpreferred-stack-boundary=4" } */
+
+void g();
+
+int p[100];
+int q[100];
+
+void f()
+{
+	int i;
+	for (i = 0; i < 100; i++) p[i] = 0;
+	g();
+	for (i = 0; i < 100; i++) q[i] = 0;
+}
+
+/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/incoming-12.c b/gcc/testsuite/gcc.target/i386/incoming-12.c
new file mode 100644
index 0000000..d7ef103
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/incoming-12.c
@@ -0,0 +1,20 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -mstackrealign -O2 -msse2 -mpreferred-stack-boundary=4" } */
+
+typedef int v4si __attribute__ ((vector_size (16)));
+
+struct x {
+       v4si v;
+       v4si w;
+};
+
+void y(void *);
+
+v4si x(void)
+{
+       struct x x;
+       y(&x);
+}
+
+/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/incoming-13.c b/gcc/testsuite/gcc.target/i386/incoming-13.c
new file mode 100644
index 0000000..bbc8993
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/incoming-13.c
@@ -0,0 +1,15 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -mstackrealign -O2 -mpreferred-stack-boundary=4" } */
+
+extern double y(double *s3);
+
+extern double s1, s2;
+
+double x(void)
+{
+  double s3 = s1 + s2;
+  return y(&s3);
+}
+
+/* { dg-final { scan-assembler-not "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/incoming-14.c b/gcc/testsuite/gcc.target/i386/incoming-14.c
new file mode 100644
index 0000000..d27179d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/incoming-14.c
@@ -0,0 +1,15 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -mstackrealign -O2 -mpreferred-stack-boundary=4" } */
+
+extern int y(int *s3);
+
+extern int s1, s2;
+
+int x(void)
+{
+  int s3 = s1 + s2;
+  return y(&s3);
+}
+
+/* { dg-final { scan-assembler-not "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/incoming-15.c b/gcc/testsuite/gcc.target/i386/incoming-15.c
new file mode 100644
index 0000000..e6a1749
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/incoming-15.c
@@ -0,0 +1,15 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -mstackrealign -O2 -mpreferred-stack-boundary=4" } */
+
+extern long long y(long long *s3);
+
+extern long long s1, s2;
+
+long long x(void)
+{
+  long long s3 = s1 + s2;
+  return y(&s3);
+}
+
+/* { dg-final { scan-assembler-not "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/incoming-6.c b/gcc/testsuite/gcc.target/i386/incoming-6.c
new file mode 100644
index 0000000..5cc4ab3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/incoming-6.c
@@ -0,0 +1,17 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -mstackrealign -O2 -msse2 -mpreferred-stack-boundary=4" } */
+
+typedef int v4si __attribute__ ((vector_size (16)));
+
+extern v4si y(v4si *s3);
+
+extern v4si s1, s2;
+
+v4si x(void)
+{
+  v4si s3 = s1 + s2;
+  return y(&s3);
+}
+
+/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/incoming-7.c b/gcc/testsuite/gcc.target/i386/incoming-7.c
new file mode 100644
index 0000000..cdd6037
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/incoming-7.c
@@ -0,0 +1,16 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -mstackrealign -O2 -msse2 -mpreferred-stack-boundary=4" } */
+
+typedef int v4si __attribute__ ((vector_size (16)));
+
+extern v4si y(v4si, v4si, v4si, v4si, v4si);
+
+extern v4si s1, s2;
+
+v4si x(void)
+{
+  return y(s1, s2, s1, s2, s2);
+}
+
+/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/incoming-8.c b/gcc/testsuite/gcc.target/i386/incoming-8.c
new file mode 100644
index 0000000..2dd8800
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/incoming-8.c
@@ -0,0 +1,18 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -mstackrealign -O3 -msse2 -mpreferred-stack-boundary=4" } */
+
+float
+foo (float f)
+{
+  float array[128];
+  float x;
+  int i;
+  for (i = 0; i < sizeof(array) / sizeof(*array); i++)
+    array[i] = f;
+  for (i = 0; i < sizeof(array) / sizeof(*array); i++)
+    x += array[i];
+  return x;
+}
+
+/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/incoming-9.c b/gcc/testsuite/gcc.target/i386/incoming-9.c
new file mode 100644
index 0000000..e43cbd6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/incoming-9.c
@@ -0,0 +1,18 @@
+/* PR target/40838 */
+/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */
+/* { dg-options "-w -mstackrealign -O3 -mno-sse -mpreferred-stack-boundary=4" } */
+
+float
+foo (float f)
+{
+  float array[128];
+  float x;
+  int i;
+  for (i = 0; i < sizeof(array) / sizeof(*array); i++)
+    array[i] = f;
+  for (i = 0; i < sizeof(array) / sizeof(*array); i++)
+    x += array[i];
+  return x;
+}
+
+/* { dg-final { scan-assembler-not "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr37843-4.c b/gcc/testsuite/gcc.target/i386/pr37843-4.c
new file mode 100644
index 0000000..8e5f51f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr37843-4.c
@@ -0,0 +1,13 @@
+/* Test for stack alignment with sibcall optimization.  */
+/* { dg-do compile { target { ilp32 && nonpic } } } */
+/* { dg-options "-O2 -msse2 -mpreferred-stack-boundary=4 -mstackrealign" } */
+/* { dg-final { scan-assembler-not "andl\[\\t \]*\\$-16,\[\\t \]*%\[re\]?sp" } } */
+/* { dg-final { scan-assembler-not "call\[\\t \]*foo" } } */
+/* { dg-final { scan-assembler "jmp\[\\t \]*foo" } } */
+
+extern int foo (void);
+
+int bar (void)
+{
+    return foo();
+}

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is   aligned
  2009-10-21 16:56                                     ` H.J. Lu
@ 2009-10-30 10:08                                       ` Richard Guenther
  0 siblings, 0 replies; 59+ messages in thread
From: Richard Guenther @ 2009-10-30 10:08 UTC (permalink / raw)
  To: H.J. Lu
  Cc: Michael Matz, Ian Lance Taylor, Uros Bizjak, gcc-patches, Jakub Jelinek

On Wed, Oct 21, 2009 at 5:53 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Wed, Oct 21, 2009 at 2:05 AM, Michael Matz <matz@suse.de> wrote:
>> Hi,
>>
>> On Tue, 20 Oct 2009, H.J. Lu wrote:
>>
>>> > Here is a new patch. I added hard_stack_alignment, moved
>>> > update_stack_boundary after RTL expansion and updated
>>> > ix86_function_ok_for_sibcall to deal with it.  Any comments?
>>
>> Well, I think this approach of tracking the required alignment (in
>> gen_reg_rtx especially) is indeed better.
>>
>
> Here is the updated patch. We don't need to add
> hard_stack_alignment.  We can initialize
> stack_alignment_estimated to 0 and check it instead.
> Tested it on Linux/x86-64 with -m32. OK for trunk?

Ok.

Thanks,
Richard.

> Thanks.
>
>
> --
> H.J.
> ---
> gcc/
>
> 2009-10-21  H.J. Lu  <hongjiu.lu@intel.com>
>
>        PR target/40836
>        * cfgexpand.c (expand_stack_alignment): Call update_stack_boundary
>        first.  Move assert on stack_alignment_estimated just before
>        setting stack_realign_needed.
>        (gimple_expand_cfg): Initialize stack_alignment_estimated to 0.
>        Don't call update_stack_boundary.
>
>        * config/i386/i386.c (ix86_minimum_incoming_stack_boundary): New.
>        (verride_options): Don't check ix86_force_align_arg_pointer here.
>        (ix86_function_ok_for_sibcall): Use it.
>        (ix86_update_stack_boundary): Likewise.
>
>        * config/i386/i386.h (STACK_REALIGN_DEFAULT): Update comments.
>
> gcc/testsuite/
>
> 2009-10-21  H.J. Lu  <hongjiu.lu@intel.com>
>
>        PR target/40838
>        * gcc.target/i386/incoming-6.c: New.
>        * gcc.target/i386/incoming-7.c: Likewise.
>        * gcc.target/i386/incoming-8.c: Likewise.
>        * gcc.target/i386/incoming-9.c: Likewise.
>        * gcc.target/i386/incoming-10.c: Likewise.
>        * gcc.target/i386/incoming-11.c: Likewise.
>        * gcc.target/i386/incoming-12.c: Likewise.
>        * gcc.target/i386/incoming-13.c: Likewise.
>        * gcc.target/i386/incoming-14.c: Likewise.
>        * gcc.target/i386/incoming-15.c: Likewise.
>        * gcc.target/i386/pr37843-4.c: Likewise.
>

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is   aligned
  2009-09-13  1:55           ` H.J. Lu
@ 2009-09-13 14:10             ` Mikulas Patocka
  0 siblings, 0 replies; 59+ messages in thread
From: Mikulas Patocka @ 2009-09-13 14:10 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Jakub Jelinek, gcc-patches, ubizjak

On Sat, 12 Sep 2009, H.J. Lu wrote:

> 2009/9/12 Mikulas Patocka <mikulas@artax.karlin.mff.cuni.cz>:
> >
> > I tried this patch for 4.4.1 (that one posted in bugzilla 40838) with
> > seamonkey 1.1.17 and it relly misses alignment in some parts of it. I used
> > CFLAGS and CXXFLAGS "-O3 -fomit-frame-pointer -frename-registers
> > -march=barcelona"
> >
> > I found the functions that miscompile, the trivial way to find them is to
> > compile seamonkey, do find . -name "*.o"|xargs objdump -d|less, search for
> > "movdqa.*esp" or "movaps.*esp" (or even "esp.*xmm" and "xmm.*esp", but it
> > gives more false positives) and check the function that it has aligment
> > code. There are a few functions that don't. Please look at these
> > functions, the idea of your patch is good, it just needs to be fixed.
> >
> > BTW. gentoo documentation lists -O3 as problematic flag that leads to
> > instability with gcc 4 (they don't way why, but the most likely reason is
> > this alignment issue), it would be good if we could fix it and allow
> > people to use SSE.
> >
> 
> Please find a small testcase and upload it to PR 40838.
> 
> Thank.s
> 
> Thanks.

OK. The misgenerated examples were similar to one of the two issues, so I 
posted two examples for them to bugzilla. So you can find the bug in your 
patch.

There is another bug: The compiler shouldn't assume that arrays in the 
common section are aligned. They may be initialized in a different module 
compiled with different compiler that doesn't do alignment.

Mikulas

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is   aligned
  2009-09-12 23:32         ` Mikulas Patocka
  2009-09-12 23:42           ` Mikulas Patocka
@ 2009-09-13  1:55           ` H.J. Lu
  2009-09-13 14:10             ` Mikulas Patocka
  1 sibling, 1 reply; 59+ messages in thread
From: H.J. Lu @ 2009-09-13  1:55 UTC (permalink / raw)
  To: Mikulas Patocka; +Cc: Jakub Jelinek, gcc-patches, ubizjak

2009/9/12 Mikulas Patocka <mikulas@artax.karlin.mff.cuni.cz>:
>
> I tried this patch for 4.4.1 (that one posted in bugzilla 40838) with
> seamonkey 1.1.17 and it relly misses alignment in some parts of it. I used
> CFLAGS and CXXFLAGS "-O3 -fomit-frame-pointer -frename-registers
> -march=barcelona"
>
> I found the functions that miscompile, the trivial way to find them is to
> compile seamonkey, do find . -name "*.o"|xargs objdump -d|less, search for
> "movdqa.*esp" or "movaps.*esp" (or even "esp.*xmm" and "xmm.*esp", but it
> gives more false positives) and check the function that it has aligment
> code. There are a few functions that don't. Please look at these
> functions, the idea of your patch is good, it just needs to be fixed.
>
> BTW. gentoo documentation lists -O3 as problematic flag that leads to
> instability with gcc 4 (they don't way why, but the most likely reason is
> this alignment issue), it would be good if we could fix it and allow
> people to use SSE.
>

Please find a small testcase and upload it to PR 40838.

Thank.s

Thanks.


-- 
H.J.

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is   aligned
  2009-09-12 23:32         ` Mikulas Patocka
@ 2009-09-12 23:42           ` Mikulas Patocka
  2009-09-13  1:55           ` H.J. Lu
  1 sibling, 0 replies; 59+ messages in thread
From: Mikulas Patocka @ 2009-09-12 23:42 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Jakub Jelinek, gcc-patches, ubizjak

> _ZL18wait_for_retrievalP13_GtkClipboardP17retrieval_context:
> 560:       55                      push   %ebp
> 561:       57                      push   %edi
> 562:       56                      push   %esi
> 563:       89 d6                   mov    %edx,%esi
> 565:       53                      push   %ebx
> 566:       81 ec bc 01 00 00       sub    $0x1bc,%esp
> ...
> 5b0:       0f 29 44 24 20          movaps %xmm0,0x20(%esp)

^^^ BTW. this is the place where it really crashed when I tried to run it.

Mikulas

> 5b5:       0f 29 44 24 30          movaps %xmm0,0x30(%esp)
> 5ba:       0f 29 44 24 40          movaps %xmm0,0x40(%esp)
> 5bf:       0f 29 44 24 50          movaps %xmm0,0x50(%esp)
> 5c4:       0f 29 44 24 60          movaps %xmm0,0x60(%esp)
> 5c9:       0f 29 44 24 70          movaps %xmm0,0x70(%esp)
> 5ce:       0f 29 84 24 80 00 00    movaps %xmm0,0x80(%esp)
> 5d5:       00
> 5d6:       0f 29 84 24 90 00 00    movaps %xmm0,0x90(%esp)
> 5dd:       00

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is   aligned
  2009-08-24 17:39       ` H.J. Lu
@ 2009-09-12 23:32         ` Mikulas Patocka
  2009-09-12 23:42           ` Mikulas Patocka
  2009-09-13  1:55           ` H.J. Lu
  0 siblings, 2 replies; 59+ messages in thread
From: Mikulas Patocka @ 2009-09-12 23:32 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Jakub Jelinek, gcc-patches, ubizjak

[-- Attachment #1: Type: TEXT/PLAIN, Size: 8848 bytes --]



On Mon, 24 Aug 2009, H.J. Lu wrote:

> On Fri, Aug 7, 2009 at 3:30 PM, H.J. Lu<hjl.tools@gmail.com> wrote:
> > On Fri, Aug 7, 2009 at 5:53 AM, H.J. Lu<hjl.tools@gmail.com> wrote:
> >> On Fri, Aug 7, 2009 at 12:13 AM, Jakub Jelinek<jakub@redhat.com> wrote:
> >>> On Fri, Aug 07, 2009 at 02:54:46AM +0200, Mikulas Patocka wrote:
> >>>> > > In 32bit, the incoming stack may not be 16 byte aligned.  This patch
> >>>> > > assumes the incoming stack is 4 byte aligned and realigns stack if any
> >>>> > > SSE variable is put on stack. Any comments?
> >>>> >
> >>>> > IMHO this is wrong, I could live with a non-default option for those who
> >>>> > don't care about performance and think a SCO document from 1996 has any
> >>>> > relevance to Linux these days.  In reality a Linux ABI for years assumes
> >>>> > 16 byte stack alignment for 32-bit code.
> >>>>
> >>>> Tell me which Linux distribution did you run with 16-byte stack alignment
> >>>> checking (as proposed in bug 40838) and what was the result?
> >>>>
> >>>> For me, the result was that 75% of binaries in /bin in Debian Lenny do not
> >>>> align the stack on 16-byte boundary.
> >>>
> >>> Besides the obstack glibc bug which has been fixed since then you haven't
> >>> reported anything particular.  It is true that parts of i?86 glibc is
> >>> compiled with -mpreferered-stack-boundary=2, but only parts that don't call
> >>> callbacks.  Async signals AFAIK will align the stack properly.
> >>>
> >>> I simply don't trust your 75% claim, lots of stuff would break if things
> >>> weren't aligned properly.
> >>>
> >>
> >> From gcc 3.4:
> >>
> >>  /* Validate -mpreferred-stack-boundary= value, or provide default.
> >>     The default of 128 bits is for Pentium III's SSE __m128, but we
> >>     don't want additional code to keep the stack aligned when
> >>     optimizing for code size.  */
> >>  ix86_preferred_stack_boundary = (optimize_size
> >>                                   ? TARGET_64BIT ? 128 : 32
> >>                                   : 128);
> >>
> >> If you compile code with -Os, you will get 4 byte stack alignment.
> >> Just step back, we changed stack alignment from 4 byte to 16byte
> >> for SSE since we couldn't realign stack at the time. Now we can
> >> realign the stack very efficiently. I think we should do it for SSE
> >> to support the existing Linux binaries which have 4 byte stack
> >> alignment. If it helps, I can compare -m32 -O3 -msse2 -mfp-math=sse
> >> results with SPEC CPU 2006, before and after my patch.
> >>
> >
> > Here are the differences of -m32 -O3 -msse2 -mfpmath=sse -ffast-math
> > -funroll-loops
> > before and after my patch:
> >
> > 400.perlbench                    -0.384615%
> > 401.bzip2                        0%
> > 403.gcc                          -0.362319%
> > 429.mcf                          -0.813008%
> > 445.gobmk                        0.921659%
> > 456.hmmer                        0.549451%
> > 458.sjeng                        -0.438596%
> > 462.libquantum                   0%
> > 464.h264ref                      0%
> > 471.omnetpp                      -0.478469%
> > 473.astar                        -0.645161%
> > 483.xalancbmk                    -0.727273%
> > SPECint(R)_base2006                      -0.411523%
> > 410.bwaves                       -0.406504%
> > 416.gamess                       0%
> > 433.milc                         -1.36986%
> > 434.zeusmp                       -0.44843%
> > 435.gromacs                      0%
> > 436.cactusADM                    0%
> > 437.leslie3d                     -0.888889%
> > 444.namd                         1.20482%
> > 447.dealII                       -0.350877%
> > 450.soplex                       -0.31746%
> > 453.povray                       0.458716%
> > 454.calculix                     0%
> > 459.GemsFDTD                     0%
> > 465.tonto                        0%
> > 470.lbm                          0%
> > 481.wrf                          0.480769%
> > 482.sphinx3                      0.940439%
> > SPECfp(R)_base2006                       0%
> >
> > I think we should align stack if SSE variables are put on stack.
> >
> 
> Darwin ia32 psABI specifies 16byte stack alignment and enforces it
> with
> 
> #define PREFERRED_STACK_BOUNDARY                        \
>   MAX (STACK_BOUNDARY, ix86_preferred_stack_boundary)
> 
> On other ia32 targets, 4byte outgoing stack alignment is
> correct and allowed. My patch assumes 4 byte incoming
> stack alignment only when SSE variables are put on stack.
> Automatic stack alignment implementation is quite efficient.
> Its performance impact is very limited as show in SPEC CPU
> 2006 results. It also fixed a regression:
> 
> http://gcc.gnu.org/bugzilla/show_bug.cgi?id=41156
> 
> OK for trunk?
> 
> Thanks.
> 
> -- 
> H.J.

I tried this patch for 4.4.1 (that one posted in bugzilla 40838) with 
seamonkey 1.1.17 and it relly misses alignment in some parts of it. I used 
CFLAGS and CXXFLAGS "-O3 -fomit-frame-pointer -frename-registers 
-march=barcelona"

I found the functions that miscompile, the trivial way to find them is to 
compile seamonkey, do find . -name "*.o"|xargs objdump -d|less, search for 
"movdqa.*esp" or "movaps.*esp" (or even "esp.*xmm" and "xmm.*esp", but it 
gives more false positives) and check the function that it has aligment 
code. There are a few functions that don't. Please look at these 
functions, the idea of your patch is good, it just needs to be fixed.

BTW. gentoo documentation lists -O3 as problematic flag that leads to 
instability with gcc 4 (they don't way why, but the most likely reason is 
this alignment issue), it would be good if we could fix it and allow 
people to use SSE.

Mikulas


Some of the miscompiled functions are:

pow5mult:
a00:       55                      push   %ebp
a01:       57                      push   %edi
a02:       56                      push   %esi
a03:       53                      push   %ebx
a04:       89 d3                   mov    %edx,%ebx
a06:       83 ec 6c                sub    $0x6c,%esp
...
a7d:       0f 29 44 24 10          movaps %xmm0,0x10(%esp)
a82:       e8 79 f8 ff ff          call   300 <Balloc>
a87:       85 c0                   test   %eax,%eax
a89:       89 44 24 4c             mov    %eax,0x4c(%esp)
a8d:       66 0f 6f 44 24 10       movdqa 0x10(%esp),%xmm0

PR_strtod:
10a0:       55                      push   %ebp
10a1:       57                      push   %edi
10a2:       56                      push   %esi
10a3:       53                      push   %ebx
10a4:       81 ec fc 00 00 00       sub    $0xfc,%esp
...
172c:       66 0f 6f 44 24 20       movdqa 0x20(%esp),%xmm0

PR_select:
3da0:       55                      push   %ebp
3da1:       57                      push   %edi
3da2:       56                      push   %esi
3da3:       89 ce                   mov    %ecx,%esi
3da5:       53                      push   %ebx
3da6:       89 d3                   mov    %edx,%ebx
3da8:       81 ec ec 01 00 00       sub    $0x1ec,%esp
...
3dcb:       0f 29 84 24 50 01 00    movaps %xmm0,0x150(%esp)
3dd2:       00
3dd3:       0f 29 84 24 60 01 00    movaps %xmm0,0x160(%esp)
3dda:       00
3ddb:       0f 29 84 24 70 01 00    movaps %xmm0,0x170(%esp)
3de2:       00
3de3:       0f 29 84 24 80 01 00    movaps %xmm0,0x180(%esp)
3dea:       00
3deb:       0f 29 84 24 90 01 00    movaps %xmm0,0x190(%esp)
3df2:       00

_ZL18wait_for_retrievalP13_GtkClipboardP17retrieval_context:
560:       55                      push   %ebp
561:       57                      push   %edi
562:       56                      push   %esi
563:       89 d6                   mov    %edx,%esi
565:       53                      push   %ebx
566:       81 ec bc 01 00 00       sub    $0x1bc,%esp
...
5b0:       0f 29 44 24 20          movaps %xmm0,0x20(%esp)
5b5:       0f 29 44 24 30          movaps %xmm0,0x30(%esp)
5ba:       0f 29 44 24 40          movaps %xmm0,0x40(%esp)
5bf:       0f 29 44 24 50          movaps %xmm0,0x50(%esp)
5c4:       0f 29 44 24 60          movaps %xmm0,0x60(%esp)
5c9:       0f 29 44 24 70          movaps %xmm0,0x70(%esp)
5ce:       0f 29 84 24 80 00 00    movaps %xmm0,0x80(%esp)
5d5:       00
5d6:       0f 29 84 24 90 00 00    movaps %xmm0,0x90(%esp)
5dd:       00

_ZN13XRemoteClient7GetLockEmPi:
680:       55                      push   %ebp
681:       57                      push   %edi
682:       56                      push   %esi
683:       53                      push   %ebx
684:       89 c3                   mov    %eax,%ebx
686:       81 ec 0c 02 00 00       sub    $0x20c,%esp
...
6ee:       0f 29 44 24 30          movaps %xmm0,0x30(%esp)

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is   aligned
  2009-08-07 22:30     ` H.J. Lu
  2009-08-08 17:35       ` Mikulas Patocka
@ 2009-08-24 17:39       ` H.J. Lu
  2009-09-12 23:32         ` Mikulas Patocka
  1 sibling, 1 reply; 59+ messages in thread
From: H.J. Lu @ 2009-08-24 17:39 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Mikulas Patocka, gcc-patches, ubizjak

On Fri, Aug 7, 2009 at 3:30 PM, H.J. Lu<hjl.tools@gmail.com> wrote:
> On Fri, Aug 7, 2009 at 5:53 AM, H.J. Lu<hjl.tools@gmail.com> wrote:
>> On Fri, Aug 7, 2009 at 12:13 AM, Jakub Jelinek<jakub@redhat.com> wrote:
>>> On Fri, Aug 07, 2009 at 02:54:46AM +0200, Mikulas Patocka wrote:
>>>> > > In 32bit, the incoming stack may not be 16 byte aligned.  This patch
>>>> > > assumes the incoming stack is 4 byte aligned and realigns stack if any
>>>> > > SSE variable is put on stack. Any comments?
>>>> >
>>>> > IMHO this is wrong, I could live with a non-default option for those who
>>>> > don't care about performance and think a SCO document from 1996 has any
>>>> > relevance to Linux these days.  In reality a Linux ABI for years assumes
>>>> > 16 byte stack alignment for 32-bit code.
>>>>
>>>> Tell me which Linux distribution did you run with 16-byte stack alignment
>>>> checking (as proposed in bug 40838) and what was the result?
>>>>
>>>> For me, the result was that 75% of binaries in /bin in Debian Lenny do not
>>>> align the stack on 16-byte boundary.
>>>
>>> Besides the obstack glibc bug which has been fixed since then you haven't
>>> reported anything particular.  It is true that parts of i?86 glibc is
>>> compiled with -mpreferered-stack-boundary=2, but only parts that don't call
>>> callbacks.  Async signals AFAIK will align the stack properly.
>>>
>>> I simply don't trust your 75% claim, lots of stuff would break if things
>>> weren't aligned properly.
>>>
>>
>> From gcc 3.4:
>>
>>  /* Validate -mpreferred-stack-boundary= value, or provide default.
>>     The default of 128 bits is for Pentium III's SSE __m128, but we
>>     don't want additional code to keep the stack aligned when
>>     optimizing for code size.  */
>>  ix86_preferred_stack_boundary = (optimize_size
>>                                   ? TARGET_64BIT ? 128 : 32
>>                                   : 128);
>>
>> If you compile code with -Os, you will get 4 byte stack alignment.
>> Just step back, we changed stack alignment from 4 byte to 16byte
>> for SSE since we couldn't realign stack at the time. Now we can
>> realign the stack very efficiently. I think we should do it for SSE
>> to support the existing Linux binaries which have 4 byte stack
>> alignment. If it helps, I can compare -m32 -O3 -msse2 -mfp-math=sse
>> results with SPEC CPU 2006, before and after my patch.
>>
>
> Here are the differences of -m32 -O3 -msse2 -mfpmath=sse -ffast-math
> -funroll-loops
> before and after my patch:
>
> 400.perlbench                    -0.384615%
> 401.bzip2                        0%
> 403.gcc                          -0.362319%
> 429.mcf                          -0.813008%
> 445.gobmk                        0.921659%
> 456.hmmer                        0.549451%
> 458.sjeng                        -0.438596%
> 462.libquantum                   0%
> 464.h264ref                      0%
> 471.omnetpp                      -0.478469%
> 473.astar                        -0.645161%
> 483.xalancbmk                    -0.727273%
> SPECint(R)_base2006                      -0.411523%
> 410.bwaves                       -0.406504%
> 416.gamess                       0%
> 433.milc                         -1.36986%
> 434.zeusmp                       -0.44843%
> 435.gromacs                      0%
> 436.cactusADM                    0%
> 437.leslie3d                     -0.888889%
> 444.namd                         1.20482%
> 447.dealII                       -0.350877%
> 450.soplex                       -0.31746%
> 453.povray                       0.458716%
> 454.calculix                     0%
> 459.GemsFDTD                     0%
> 465.tonto                        0%
> 470.lbm                          0%
> 481.wrf                          0.480769%
> 482.sphinx3                      0.940439%
> SPECfp(R)_base2006                       0%
>
> I think we should align stack if SSE variables are put on stack.
>

Darwin ia32 psABI specifies 16byte stack alignment and enforces it
with

#define PREFERRED_STACK_BOUNDARY                        \
  MAX (STACK_BOUNDARY, ix86_preferred_stack_boundary)

On other ia32 targets, 4byte outgoing stack alignment is
correct and allowed. My patch assumes 4 byte incoming
stack alignment only when SSE variables are put on stack.
Automatic stack alignment implementation is quite efficient.
Its performance impact is very limited as show in SPEC CPU
2006 results. It also fixed a regression:

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=41156

OK for trunk?

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is   aligned
  2009-08-08 17:35       ` Mikulas Patocka
@ 2009-08-16 21:25         ` H.J. Lu
  0 siblings, 0 replies; 59+ messages in thread
From: H.J. Lu @ 2009-08-16 21:25 UTC (permalink / raw)
  To: Mikulas Patocka; +Cc: Jakub Jelinek, gcc-patches, ubizjak

2009/8/8 Mikulas Patocka <mikulas@artax.karlin.mff.cuni.cz>:
>> > From gcc 3.4:
>> >
>> >  /* Validate -mpreferred-stack-boundary= value, or provide default.
>> >     The default of 128 bits is for Pentium III's SSE __m128, but we
>> >     don't want additional code to keep the stack aligned when
>> >     optimizing for code size.  */
>> >  ix86_preferred_stack_boundary = (optimize_size
>> >                                   ? TARGET_64BIT ? 128 : 32
>> >                                   : 128);
>> >
>> > If you compile code with -Os, you will get 4 byte stack alignment.
>> > Just step back, we changed stack alignment from 4 byte to 16byte
>> > for SSE since we couldn't realign stack at the time. Now we can
>> > realign the stack very efficiently. I think we should do it for SSE
>> > to support the existing Linux binaries which have 4 byte stack
>> > alignment. If it helps, I can compare -m32 -O3 -msse2 -mfp-math=sse
>> > results with SPEC CPU 2006, before and after my patch.
>> >
>>
>> Here are the differences of -m32 -O3 -msse2 -mfpmath=sse -ffast-math
>> -funroll-loops
>> before and after my patch:
>>
>> 400.perlbench                    -0.384615%
>> 401.bzip2                        0%
>> 403.gcc                          -0.362319%
>> 429.mcf                          -0.813008%
>> 445.gobmk                        0.921659%
>> 456.hmmer                        0.549451%
>> 458.sjeng                        -0.438596%
>> 462.libquantum                   0%
>> 464.h264ref                      0%
>> 471.omnetpp                      -0.478469%
>> 473.astar                        -0.645161%
>> 483.xalancbmk                    -0.727273%
>> SPECint(R)_base2006                      -0.411523%
>> 410.bwaves                       -0.406504%
>> 416.gamess                       0%
>> 433.milc                         -1.36986%
>> 434.zeusmp                       -0.44843%
>> 435.gromacs                      0%
>> 436.cactusADM                    0%
>> 437.leslie3d                     -0.888889%
>> 444.namd                         1.20482%
>> 447.dealII                       -0.350877%
>> 450.soplex                       -0.31746%
>> 453.povray                       0.458716%
>> 454.calculix                     0%
>> 459.GemsFDTD                     0%
>> 465.tonto                        0%
>> 470.lbm                          0%
>> 481.wrf                          0.480769%
>> 482.sphinx3                      0.940439%
>> SPECfp(R)_base2006                       0%
>>
>> I think we should align stack if SSE variables are put on stack.
>>
>> --
>> H.J.
>
> Your patch is buggy, it aligns the stack in functions with vector types
> but doesn't align if the autovectorizer creates 16-bit SSE instructions
> (that is even more dangerous than vector types because it may make crashes
> at random places). You must put that test somewhere else.
>

Please show the assembly output with my patch applied.

Thanks.


-- 
H.J.

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is   aligned
  2009-08-07 22:30     ` H.J. Lu
@ 2009-08-08 17:35       ` Mikulas Patocka
  2009-08-16 21:25         ` H.J. Lu
  2009-08-24 17:39       ` H.J. Lu
  1 sibling, 1 reply; 59+ messages in thread
From: Mikulas Patocka @ 2009-08-08 17:35 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Jakub Jelinek, gcc-patches, ubizjak

[-- Attachment #1: Type: TEXT/PLAIN, Size: 2825 bytes --]

> > From gcc 3.4:
> >
> >  /* Validate -mpreferred-stack-boundary= value, or provide default.
> >     The default of 128 bits is for Pentium III's SSE __m128, but we
> >     don't want additional code to keep the stack aligned when
> >     optimizing for code size.  */
> >  ix86_preferred_stack_boundary = (optimize_size
> >                                   ? TARGET_64BIT ? 128 : 32
> >                                   : 128);
> >
> > If you compile code with -Os, you will get 4 byte stack alignment.
> > Just step back, we changed stack alignment from 4 byte to 16byte
> > for SSE since we couldn't realign stack at the time. Now we can
> > realign the stack very efficiently. I think we should do it for SSE
> > to support the existing Linux binaries which have 4 byte stack
> > alignment. If it helps, I can compare -m32 -O3 -msse2 -mfp-math=sse
> > results with SPEC CPU 2006, before and after my patch.
> >
> 
> Here are the differences of -m32 -O3 -msse2 -mfpmath=sse -ffast-math
> -funroll-loops
> before and after my patch:
> 
> 400.perlbench                    -0.384615%
> 401.bzip2                        0%
> 403.gcc                          -0.362319%
> 429.mcf                          -0.813008%
> 445.gobmk                        0.921659%
> 456.hmmer                        0.549451%
> 458.sjeng                        -0.438596%
> 462.libquantum                   0%
> 464.h264ref                      0%
> 471.omnetpp                      -0.478469%
> 473.astar                        -0.645161%
> 483.xalancbmk                    -0.727273%
> SPECint(R)_base2006                      -0.411523%
> 410.bwaves                       -0.406504%
> 416.gamess                       0%
> 433.milc                         -1.36986%
> 434.zeusmp                       -0.44843%
> 435.gromacs                      0%
> 436.cactusADM                    0%
> 437.leslie3d                     -0.888889%
> 444.namd                         1.20482%
> 447.dealII                       -0.350877%
> 450.soplex                       -0.31746%
> 453.povray                       0.458716%
> 454.calculix                     0%
> 459.GemsFDTD                     0%
> 465.tonto                        0%
> 470.lbm                          0%
> 481.wrf                          0.480769%
> 482.sphinx3                      0.940439%
> SPECfp(R)_base2006                       0%
> 
> I think we should align stack if SSE variables are put on stack.
> 
> -- 
> H.J.

Your patch is buggy, it aligns the stack in functions with vector types 
but doesn't align if the autovectorizer creates 16-bit SSE instructions 
(that is even more dangerous than vector types because it may make crashes 
at random places). You must put that test somewhere else.

Mikulas

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is   aligned
  2009-08-07 12:53   ` H.J. Lu
@ 2009-08-07 22:30     ` H.J. Lu
  2009-08-08 17:35       ` Mikulas Patocka
  2009-08-24 17:39       ` H.J. Lu
  0 siblings, 2 replies; 59+ messages in thread
From: H.J. Lu @ 2009-08-07 22:30 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Mikulas Patocka, gcc-patches, ubizjak

On Fri, Aug 7, 2009 at 5:53 AM, H.J. Lu<hjl.tools@gmail.com> wrote:
> On Fri, Aug 7, 2009 at 12:13 AM, Jakub Jelinek<jakub@redhat.com> wrote:
>> On Fri, Aug 07, 2009 at 02:54:46AM +0200, Mikulas Patocka wrote:
>>> > > In 32bit, the incoming stack may not be 16 byte aligned.  This patch
>>> > > assumes the incoming stack is 4 byte aligned and realigns stack if any
>>> > > SSE variable is put on stack. Any comments?
>>> >
>>> > IMHO this is wrong, I could live with a non-default option for those who
>>> > don't care about performance and think a SCO document from 1996 has any
>>> > relevance to Linux these days.  In reality a Linux ABI for years assumes
>>> > 16 byte stack alignment for 32-bit code.
>>>
>>> Tell me which Linux distribution did you run with 16-byte stack alignment
>>> checking (as proposed in bug 40838) and what was the result?
>>>
>>> For me, the result was that 75% of binaries in /bin in Debian Lenny do not
>>> align the stack on 16-byte boundary.
>>
>> Besides the obstack glibc bug which has been fixed since then you haven't
>> reported anything particular.  It is true that parts of i?86 glibc is
>> compiled with -mpreferered-stack-boundary=2, but only parts that don't call
>> callbacks.  Async signals AFAIK will align the stack properly.
>>
>> I simply don't trust your 75% claim, lots of stuff would break if things
>> weren't aligned properly.
>>
>
> From gcc 3.4:
>
>  /* Validate -mpreferred-stack-boundary= value, or provide default.
>     The default of 128 bits is for Pentium III's SSE __m128, but we
>     don't want additional code to keep the stack aligned when
>     optimizing for code size.  */
>  ix86_preferred_stack_boundary = (optimize_size
>                                   ? TARGET_64BIT ? 128 : 32
>                                   : 128);
>
> If you compile code with -Os, you will get 4 byte stack alignment.
> Just step back, we changed stack alignment from 4 byte to 16byte
> for SSE since we couldn't realign stack at the time. Now we can
> realign the stack very efficiently. I think we should do it for SSE
> to support the existing Linux binaries which have 4 byte stack
> alignment. If it helps, I can compare -m32 -O3 -msse2 -mfp-math=sse
> results with SPEC CPU 2006, before and after my patch.
>

Here are the differences of -m32 -O3 -msse2 -mfpmath=sse -ffast-math
-funroll-loops
before and after my patch:

400.perlbench                    -0.384615%
401.bzip2                        0%
403.gcc                          -0.362319%
429.mcf                          -0.813008%
445.gobmk                        0.921659%
456.hmmer                        0.549451%
458.sjeng                        -0.438596%
462.libquantum                   0%
464.h264ref                      0%
471.omnetpp                      -0.478469%
473.astar                        -0.645161%
483.xalancbmk                    -0.727273%
SPECint(R)_base2006                      -0.411523%
410.bwaves                       -0.406504%
416.gamess                       0%
433.milc                         -1.36986%
434.zeusmp                       -0.44843%
435.gromacs                      0%
436.cactusADM                    0%
437.leslie3d                     -0.888889%
444.namd                         1.20482%
447.dealII                       -0.350877%
450.soplex                       -0.31746%
453.povray                       0.458716%
454.calculix                     0%
459.GemsFDTD                     0%
465.tonto                        0%
470.lbm                          0%
481.wrf                          0.480769%
482.sphinx3                      0.940439%
SPECfp(R)_base2006                       0%

I think we should align stack if SSE variables are put on stack.

-- 
H.J.

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is   aligned
  2009-08-07 21:08   ` Mikulas Patocka
@ 2009-08-07 21:25     ` Richard Guenther
  0 siblings, 0 replies; 59+ messages in thread
From: Richard Guenther @ 2009-08-07 21:25 UTC (permalink / raw)
  To: Mikulas Patocka; +Cc: Jakub Jelinek, gcc-patches, ubizjak

On Fri, Aug 7, 2009 at 11:08 PM, Mikulas
Patocka<mikulas@artax.karlin.mff.cuni.cz> wrote:
>> > For me, the result was that 75% of binaries in /bin in Debian Lenny do not
>> > align the stack on 16-byte boundary.
>>
>> Besides the obstack glibc bug which has been fixed since then you haven't
>> reported anything particular.  It is true that parts of i?86 glibc is
>> compiled with -mpreferered-stack-boundary=2, but only parts that don't call
>> callbacks.  Async signals AFAIK will align the stack properly.
>>
>> I simply don't trust your 75% claim, lots of stuff would break if things
>> weren't aligned properly.
>>
>>       Jakub
>
> The reason for that '75%' is that integer-only binaries are usually
> compiled with -mpreferred-stack-boundary=2. People used the switch
> "-mpreferred-stack-boundary=2" to tell the compiler "I don't care about
> performance of floating point".
>
> If gcc developers made that "-mpreferred-stack-boundary" switch, people
> used it. It is not surprising that distributors built packages that don't
> do much floating point calculations with it.
>
> Now it's hard to say that anyone who used the switch was wrong. It very
> much ressembles the situatino when mad man cries that he is the only one
> normal and all the other people are crazy because they can't tolerate his
> behavior.
>
>
> If you don't believe what I'm saying, just run the test on your own.
>
>
> Other things that misalign the stack (found during source review):
>
> - Mozilla (has assembler stuff for XPCOM calls that aligns only to
> 8-bytes, based on the reasoning that it uses only doubles)
> - Python (has a FFI library that does callbacks to C, it aligns only for
> Darwin ... could be trivially fixed by using the Darwin code for Linux)
> - Dosbox (generates machine code that calls back to C, doesn't align)
> - Mplayer+Wine (link Windows libraries that call back to C)
> - Intel C (aligns only parts that need it, doesn't align in integer
> functions)
> - Zip (uses some assembler routine that calls to C and doesn't align)

The only openSUSE packages that specify -mprefered-stack-boundary are

fftw3:4
fftw:4
glibc.i686:2
glibc.i686:4
glibc:2
glibc:4
quickcam:2
valgrind:2
virtualbox-ose:2

where the value after the colon is the specified stack boundary.

Richard.

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is  aligned
  2009-08-07  7:13 ` Jakub Jelinek
  2009-08-07 12:53   ` H.J. Lu
@ 2009-08-07 21:08   ` Mikulas Patocka
  2009-08-07 21:25     ` Richard Guenther
  1 sibling, 1 reply; 59+ messages in thread
From: Mikulas Patocka @ 2009-08-07 21:08 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: gcc-patches, ubizjak

> > For me, the result was that 75% of binaries in /bin in Debian Lenny do not 
> > align the stack on 16-byte boundary.
> 
> Besides the obstack glibc bug which has been fixed since then you haven't
> reported anything particular.  It is true that parts of i?86 glibc is
> compiled with -mpreferered-stack-boundary=2, but only parts that don't call
> callbacks.  Async signals AFAIK will align the stack properly.
> 
> I simply don't trust your 75% claim, lots of stuff would break if things
> weren't aligned properly.
> 
> 	Jakub

The reason for that '75%' is that integer-only binaries are usually 
compiled with -mpreferred-stack-boundary=2. People used the switch 
"-mpreferred-stack-boundary=2" to tell the compiler "I don't care about 
performance of floating point".

If gcc developers made that "-mpreferred-stack-boundary" switch, people 
used it. It is not surprising that distributors built packages that don't 
do much floating point calculations with it.

Now it's hard to say that anyone who used the switch was wrong. It very 
much ressembles the situatino when mad man cries that he is the only one 
normal and all the other people are crazy because they can't tolerate his 
behavior.


If you don't believe what I'm saying, just run the test on your own.


Other things that misalign the stack (found during source review):

- Mozilla (has assembler stuff for XPCOM calls that aligns only to 
8-bytes, based on the reasoning that it uses only doubles)
- Python (has a FFI library that does callbacks to C, it aligns only for 
Darwin ... could be trivially fixed by using the Darwin code for Linux)
- Dosbox (generates machine code that calls back to C, doesn't align)
- Mplayer+Wine (link Windows libraries that call back to C)
- Intel C (aligns only parts that need it, doesn't align in integer 
functions)
- Zip (uses some assembler routine that calls to C and doesn't align)

Mikulas

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is   aligned
  2009-08-07  7:13 ` Jakub Jelinek
@ 2009-08-07 12:53   ` H.J. Lu
  2009-08-07 22:30     ` H.J. Lu
  2009-08-07 21:08   ` Mikulas Patocka
  1 sibling, 1 reply; 59+ messages in thread
From: H.J. Lu @ 2009-08-07 12:53 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Mikulas Patocka, gcc-patches, ubizjak

On Fri, Aug 7, 2009 at 12:13 AM, Jakub Jelinek<jakub@redhat.com> wrote:
> On Fri, Aug 07, 2009 at 02:54:46AM +0200, Mikulas Patocka wrote:
>> > > In 32bit, the incoming stack may not be 16 byte aligned.  This patch
>> > > assumes the incoming stack is 4 byte aligned and realigns stack if any
>> > > SSE variable is put on stack. Any comments?
>> >
>> > IMHO this is wrong, I could live with a non-default option for those who
>> > don't care about performance and think a SCO document from 1996 has any
>> > relevance to Linux these days.  In reality a Linux ABI for years assumes
>> > 16 byte stack alignment for 32-bit code.
>>
>> Tell me which Linux distribution did you run with 16-byte stack alignment
>> checking (as proposed in bug 40838) and what was the result?
>>
>> For me, the result was that 75% of binaries in /bin in Debian Lenny do not
>> align the stack on 16-byte boundary.
>
> Besides the obstack glibc bug which has been fixed since then you haven't
> reported anything particular.  It is true that parts of i?86 glibc is
> compiled with -mpreferered-stack-boundary=2, but only parts that don't call
> callbacks.  Async signals AFAIK will align the stack properly.
>
> I simply don't trust your 75% claim, lots of stuff would break if things
> weren't aligned properly.
>

From gcc 3.4:

  /* Validate -mpreferred-stack-boundary= value, or provide default.
     The default of 128 bits is for Pentium III's SSE __m128, but we
     don't want additional code to keep the stack aligned when
     optimizing for code size.  */
  ix86_preferred_stack_boundary = (optimize_size
                                   ? TARGET_64BIT ? 128 : 32
                                   : 128);

If you compile code with -Os, you will get 4 byte stack alignment.
Just step back, we changed stack alignment from 4 byte to 16byte
for SSE since we couldn't realign stack at the time. Now we can
realign the stack very efficiently. I think we should do it for SSE
to support the existing Linux binaries which have 4 byte stack
alignment. If it helps, I can compare -m32 -O3 -msse2 -mfp-math=sse
results with SPEC CPU 2006, before and after my patch.

Thanks.


-- 
H.J.

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is  aligned
  2009-08-07  0:54 Mikulas Patocka
@ 2009-08-07  7:13 ` Jakub Jelinek
  2009-08-07 12:53   ` H.J. Lu
  2009-08-07 21:08   ` Mikulas Patocka
  0 siblings, 2 replies; 59+ messages in thread
From: Jakub Jelinek @ 2009-08-07  7:13 UTC (permalink / raw)
  To: Mikulas Patocka; +Cc: gcc-patches, ubizjak

On Fri, Aug 07, 2009 at 02:54:46AM +0200, Mikulas Patocka wrote:
> > > In 32bit, the incoming stack may not be 16 byte aligned.  This patch
> > > assumes the incoming stack is 4 byte aligned and realigns stack if any
> > > SSE variable is put on stack. Any comments?
> >
> > IMHO this is wrong, I could live with a non-default option for those who 
> > don't care about performance and think a SCO document from 1996 has any 
> > relevance to Linux these days.  In reality a Linux ABI for years assumes 
> > 16 byte stack alignment for 32-bit code.
> 
> Tell me which Linux distribution did you run with 16-byte stack alignment 
> checking (as proposed in bug 40838) and what was the result?
> 
> For me, the result was that 75% of binaries in /bin in Debian Lenny do not 
> align the stack on 16-byte boundary.

Besides the obstack glibc bug which has been fixed since then you haven't
reported anything particular.  It is true that parts of i?86 glibc is
compiled with -mpreferered-stack-boundary=2, but only parts that don't call
callbacks.  Async signals AFAIK will align the stack properly.

I simply don't trust your 75% claim, lots of stuff would break if things
weren't aligned properly.

	Jakub

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is  aligned
@ 2009-08-07  0:54 Mikulas Patocka
  2009-08-07  7:13 ` Jakub Jelinek
  0 siblings, 1 reply; 59+ messages in thread
From: Mikulas Patocka @ 2009-08-07  0:54 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: gcc-patches, ubizjak

> > In 32bit, the incoming stack may not be 16 byte aligned.  This patch
> > assumes the incoming stack is 4 byte aligned and realigns stack if any
> > SSE variable is put on stack. Any comments?
>
> IMHO this is wrong, I could live with a non-default option for those who 
> don't care about performance and think a SCO document from 1996 has any 
> relevance to Linux these days.  In reality a Linux ABI for years assumes 
> 16 byte stack alignment for 32-bit code.
> 
>        Jakub

Tell me which Linux distribution did you run with 16-byte stack alignment 
checking (as proposed in bug 40838) and what was the result?

For me, the result was that 75% of binaries in /bin in Debian Lenny do not 
align the stack on 16-byte boundary.

Mikulas

^ permalink raw reply	[flat|nested] 59+ messages in thread

end of thread, other threads:[~2009-10-30  9:51 UTC | newest]

Thread overview: 59+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-08-06 21:42 PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned H.J. Lu
2009-08-06 22:26 ` Jakub Jelinek
2009-08-06 22:52   ` H.J. Lu
2009-10-15 15:58 ` H.J. Lu
2009-10-15 18:45   ` Uros Bizjak
2009-10-15 19:22     ` H.J. Lu
2009-10-15 19:32       ` Uros Bizjak
2009-10-15 19:43         ` H.J. Lu
2009-10-15 19:48           ` Jakub Jelinek
2009-10-15 20:11             ` H.J. Lu
2009-10-15 19:53           ` Uros Bizjak
2009-10-15 21:01             ` H.J. Lu
2009-10-15 21:41               ` Uros Bizjak
2009-10-16 20:27     ` H.J. Lu
2009-10-17  1:03       ` Ian Lance Taylor
2009-10-17 18:22         ` H.J. Lu
2009-10-17 19:02           ` Richard Guenther
2009-10-17 19:21             ` H.J. Lu
2009-10-17 19:29               ` Richard Guenther
2009-10-17 19:35                 ` H.J. Lu
2009-10-17 19:46                   ` Richard Guenther
2009-10-17 20:01                     ` H.J. Lu
2009-10-17 20:59                       ` Richard Guenther
2009-10-18 19:21                         ` Michael Matz
2009-10-18 19:45                           ` Richard Guenther
2009-10-19 16:36                             ` H.J. Lu
2009-10-20  1:12                               ` Michael Matz
2009-10-20 19:10                                 ` H.J. Lu
2009-10-19 16:38                           ` H.J. Lu
2009-10-19 17:08                             ` Ian Lance Taylor
2009-10-19 17:26                               ` H.J. Lu
2009-10-19 17:33                                 ` Ian Lance Taylor
2009-10-19 17:46                                   ` H.J. Lu
2009-10-19 17:55                                     ` Ian Lance Taylor
2009-10-19 19:16                                       ` H.J. Lu
2009-10-19 21:15                                         ` Ian Lance Taylor
2009-10-20 19:00                                           ` H.J. Lu
2009-10-20  1:23                                         ` Michael Matz
2009-10-20 19:12                                           ` H.J. Lu
2009-10-20  1:53                             ` Michael Matz
2009-10-20 21:15                               ` H.J. Lu
2009-10-21  1:10                                 ` H.J. Lu
2009-10-21  9:54                                   ` Michael Matz
2009-10-21 16:56                                     ` H.J. Lu
2009-10-30 10:08                                       ` Richard Guenther
2009-10-17  7:09       ` Uros Bizjak
2009-08-07  0:54 Mikulas Patocka
2009-08-07  7:13 ` Jakub Jelinek
2009-08-07 12:53   ` H.J. Lu
2009-08-07 22:30     ` H.J. Lu
2009-08-08 17:35       ` Mikulas Patocka
2009-08-16 21:25         ` H.J. Lu
2009-08-24 17:39       ` H.J. Lu
2009-09-12 23:32         ` Mikulas Patocka
2009-09-12 23:42           ` Mikulas Patocka
2009-09-13  1:55           ` H.J. Lu
2009-09-13 14:10             ` Mikulas Patocka
2009-08-07 21:08   ` Mikulas Patocka
2009-08-07 21:25     ` Richard Guenther

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).