* PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned @ 2009-08-06 21:42 H.J. Lu 2009-08-06 22:26 ` Jakub Jelinek 2009-10-15 15:58 ` H.J. Lu 0 siblings, 2 replies; 59+ messages in thread From: H.J. Lu @ 2009-08-06 21:42 UTC (permalink / raw) To: gcc-patches; +Cc: ubizjak Hi, In 32bit, the incoming stack may not be 16 byte aligned. This patch assumes the incoming stack is 4 byte aligned and realigns stack if any SSE variable is put on stack. Any comments? Thanks. H.J. --- gcc/ 2009-08-06 H.J. Lu <hongjiu.lu@intel.com> PR target/40838 * config/i386/i386.c (ix86_update_stack_boundary): Use STACK_BOUNDARY if use_stack_boundary_for_incoming_stack_boundary is set. (VALID_SSE_VECTOR_MODE): New. (ix86_minimum_alignment): In 32bit, set use_stack_boundary_for_incoming_stack_boundary if any SSE variables are put on stack. * config/i386/i386.h (machine_function): Add use_stack_boundary_for_incoming_stack_boundary. gcc/testsuite/ 2009-08-06 H.J. Lu <hongjiu.lu@intel.com> PR target/40838 * gcc.target/i386/incoming-6.c: New. * gcc.target/i386/incoming-7.c: Likewise. * gcc.target/i386/incoming-8.c: Likewise. * gcc.target/i386/incoming-9.c: Likewise. Index: gcc/testsuite/gcc.target/i386/incoming-7.c =================================================================== --- gcc/testsuite/gcc.target/i386/incoming-7.c (revision 0) +++ gcc/testsuite/gcc.target/i386/incoming-7.c (revision 0) @@ -0,0 +1,16 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -O2 -msse2 -mpreferred-stack-boundary=4" } */ + +typedef int v4si __attribute__ ((vector_size (16))); + +extern v4si y(v4si, v4si, v4si, v4si, v4si); + +extern v4si s1, s2; + +v4si x(void) +{ + return y(s1, s2, s1, s2, s2); +} + +/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ Index: gcc/testsuite/gcc.target/i386/incoming-9.c =================================================================== --- gcc/testsuite/gcc.target/i386/incoming-9.c (revision 0) +++ gcc/testsuite/gcc.target/i386/incoming-9.c (revision 0) @@ -0,0 +1,18 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -O3 -mno-sse -mpreferred-stack-boundary=4" } */ + +float +foo (float f) +{ + float array[128]; + float x; + int i; + for (i = 0; i < sizeof(array) / sizeof(*array); i++) + array[i] = f; + for (i = 0; i < sizeof(array) / sizeof(*array); i++) + x += array[i]; + return x; +} + +/* { dg-final { scan-assembler-not "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ Index: gcc/testsuite/gcc.target/i386/incoming-6.c =================================================================== --- gcc/testsuite/gcc.target/i386/incoming-6.c (revision 0) +++ gcc/testsuite/gcc.target/i386/incoming-6.c (revision 0) @@ -0,0 +1,17 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -O2 -msse2 -mpreferred-stack-boundary=4" } */ + +typedef int v4si __attribute__ ((vector_size (16))); + +extern v4si y(v4si *s3); + +extern v4si s1, s2; + +v4si x(void) +{ + v4si s3 = s1 + s2; + return y(&s3); +} + +/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ Index: gcc/testsuite/gcc.target/i386/incoming-8.c =================================================================== --- gcc/testsuite/gcc.target/i386/incoming-8.c (revision 0) +++ gcc/testsuite/gcc.target/i386/incoming-8.c (revision 0) @@ -0,0 +1,18 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -O3 -msse2 -mpreferred-stack-boundary=4" } */ + +float +foo (float f) +{ + float array[128]; + float x; + int i; + for (i = 0; i < sizeof(array) / sizeof(*array); i++) + array[i] = f; + for (i = 0; i < sizeof(array) / sizeof(*array); i++) + x += array[i]; + return x; +} + +/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ Index: gcc/config/i386/i386.h =================================================================== --- gcc/config/i386/i386.h (revision 150532) +++ gcc/config/i386/i386.h (working copy) @@ -2389,6 +2389,8 @@ struct GTY(()) machine_function { /* This value is used for amd64 targets and specifies the current abi to be used. MS_ABI means ms abi. Otherwise SYSV_ABI means sysv abi. */ enum calling_abi call_abi; + /* Use STACK_BOUNDARY for incoming stack boundary. */ + int use_stack_boundary_for_incoming_stack_boundary; struct machine_cfa_state cfa; }; #endif Index: gcc/config/i386/i386.c =================================================================== --- gcc/config/i386/i386.c (revision 150532) +++ gcc/config/i386/i386.c (working copy) @@ -8230,11 +8230,19 @@ find_drap_reg (void) static void ix86_update_stack_boundary (void) { + /* Should we use STACK_BOUNDARY for incoming stack boundary? */ + unsigned int incoming_stack_boundary; + + if (cfun->machine->use_stack_boundary_for_incoming_stack_boundary) + incoming_stack_boundary = STACK_BOUNDARY; + else + incoming_stack_boundary = ix86_default_incoming_stack_boundary; + /* Prefer the one specified at command line. */ ix86_incoming_stack_boundary = (ix86_user_incoming_stack_boundary ? ix86_user_incoming_stack_boundary - : ix86_default_incoming_stack_boundary); + : incoming_stack_boundary); /* Incoming stack alignment can be changed on individual functions via force_align_arg_pointer attribute. We use the smallest @@ -20069,6 +20077,10 @@ ix86_local_alignment (tree exp, enum mac return align; } +#define VALID_SSE_VECTOR_MODE(MODE) \ + ((MODE) == V4SFmode || (MODE) == V4SImode || (MODE) == V2DFmode \ + || (MODE) == V16QImode || (MODE) == V8HImode || (MODE) == V2DImode) + /* Compute the minimum required alignment for dynamic stack realignment purposes for a local variable, parameter or a stack slot. EXP is the data type or decl itself, MODE is its mode and ALIGN is the @@ -20080,7 +20092,7 @@ ix86_minimum_alignment (tree exp, enum m { tree type, decl; - if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64) + if (TARGET_64BIT) return align; if (exp && DECL_P (exp)) @@ -20094,6 +20106,15 @@ ix86_minimum_alignment (tree exp, enum m decl = NULL; } + /* In 32bit, use STACK_BOUNDARY for incoming stack boundary if any + SSE variables are put on stack. */ + if (VALID_SSE_VECTOR_MODE (mode) + || (type && VALID_SSE_VECTOR_MODE (TYPE_MODE (type)))) + cfun->machine->use_stack_boundary_for_incoming_stack_boundary = 1; + + if (align != 64 || ix86_preferred_stack_boundary >= 64) + return align; + /* Don't do dynamic stack realignment for long long objects with -mpreferred-stack-boundary=2. */ if ((mode == DImode || (type && TYPE_MODE (type) == DImode)) ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-08-06 21:42 PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned H.J. Lu @ 2009-08-06 22:26 ` Jakub Jelinek 2009-08-06 22:52 ` H.J. Lu 2009-10-15 15:58 ` H.J. Lu 1 sibling, 1 reply; 59+ messages in thread From: Jakub Jelinek @ 2009-08-06 22:26 UTC (permalink / raw) To: H.J. Lu; +Cc: gcc-patches, ubizjak On Thu, Aug 06, 2009 at 02:42:16PM -0700, H.J. Lu wrote: > In 32bit, the incoming stack may not be 16 byte aligned. This patch > assumes the incoming stack is 4 byte aligned and realigns stack if any > SSE variable is put on stack. Any comments? IMHO this is wrong, I could live with a non-default option for those who don't care about performance and think a SCO document from 1996 has any relevance to Linux these days. In reality a Linux ABI for years assumes 16 byte stack alignment for 32-bit code. Jakub ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-08-06 22:26 ` Jakub Jelinek @ 2009-08-06 22:52 ` H.J. Lu 0 siblings, 0 replies; 59+ messages in thread From: H.J. Lu @ 2009-08-06 22:52 UTC (permalink / raw) To: Jakub Jelinek; +Cc: gcc-patches, ubizjak On Thu, Aug 6, 2009 at 3:26 PM, Jakub Jelinek<jakub@redhat.com> wrote: > On Thu, Aug 06, 2009 at 02:42:16PM -0700, H.J. Lu wrote: >> In 32bit, the incoming stack may not be 16 byte aligned. This patch >> assumes the incoming stack is 4 byte aligned and realigns stack if any >> SSE variable is put on stack. Any comments? > > IMHO this is wrong, I could live with a non-default option for those who > don't care about performance and think a SCO document from 1996 has any > relevance to Linux these days. In reality a Linux ABI for years assumes 16 > byte stack alignment for 32-bit code. > According to the current LSB, the stack is still aligned at 4 byte on ia32. Even gcc 4.1.0 aligns stack to 4 byte with -Os. -- H.J. ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-08-06 21:42 PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned H.J. Lu 2009-08-06 22:26 ` Jakub Jelinek @ 2009-10-15 15:58 ` H.J. Lu 2009-10-15 18:45 ` Uros Bizjak 1 sibling, 1 reply; 59+ messages in thread From: H.J. Lu @ 2009-10-15 15:58 UTC (permalink / raw) To: H.J. Lu; +Cc: gcc-patches, ubizjak On Thu, Aug 06, 2009 at 02:42:16PM -0700, H.J. Lu wrote: > Hi, > > In 32bit, the incoming stack may not be 16 byte aligned. This patch > assumes the incoming stack is 4 byte aligned and realigns stack if any > SSE variable is put on stack. Any comments? > For i386, gcc, up to gcc 3.4, had been generating 4byte stack alignment. 4byte stack alignment is a problem unless SSE registers are used, which may be generated by vectorizer or intrinsics. This patch accommodates the binaries generated by previous versions of gcc if SSE registers are generated by vectorizer or intrinsics. The performance impact on SPEC CPU 2006 is minimum: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=41156#c7 OK for trunk? Thanks. H.J. --- gcc/ 2009-09-26 H.J. Lu <hongjiu.lu@intel.com> PR target/40838 * function.c (allocate_struct_function): Initialize forced_stack_alignment and forced_stack_mode. * function.h (function): Add forced_stack_alignment and forced_stack_mode. * tree-vect-data-refs.c (vect_verify_datarefs_alignment): Update forced_stack_alignment and forced_stack_mode. * config/i386/i386.c (VALID_SSE_VECTOR_MODE): New. (ix86_update_stack_boundary): In 32bit, use STACK_BOUNDARY if any SSE variables are forced on stack. (ix86_minimum_alignment): In 32bit, update forced_stack_alignment and forced_stack_mode if any SSE variables are put on stack. gcc/testsuite/ 2009-09-26 H.J. Lu <hongjiu.lu@intel.com> PR target/40838 * gcc.target/i386/incoming-6.c: New. * gcc.target/i386/incoming-7.c: Likewise. * gcc.target/i386/incoming-8.c: Likewise. * gcc.target/i386/incoming-9.c: Likewise. * gcc.target/i386/incoming-10.c: Likewise. * gcc.target/i386/incoming-11.c: Likewise. Index: gcc/testsuite/gcc.target/i386/incoming-11.c =================================================================== --- gcc/testsuite/gcc.target/i386/incoming-11.c (revision 0) +++ gcc/testsuite/gcc.target/i386/incoming-11.c (revision 0) @@ -0,0 +1,16 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -fomit-frame-pointer -O3 -march=barcelona -mpreferred-stack-boundary=4" } */ + +void g(); + +int p[100]; +int q[100]; + +void f() +{ + int i; + for (i = 0; i < 100; i++) p[i] = 0; + g(); + for (i = 0; i < 100; i++) q[i] = 0; +} Index: gcc/testsuite/gcc.target/i386/incoming-7.c =================================================================== --- gcc/testsuite/gcc.target/i386/incoming-7.c (revision 0) +++ gcc/testsuite/gcc.target/i386/incoming-7.c (revision 0) @@ -0,0 +1,16 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -O2 -msse2 -mpreferred-stack-boundary=4" } */ + +typedef int v4si __attribute__ ((vector_size (16))); + +extern v4si y(v4si, v4si, v4si, v4si, v4si); + +extern v4si s1, s2; + +v4si x(void) +{ + return y(s1, s2, s1, s2, s2); +} + +/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ Index: gcc/testsuite/gcc.target/i386/incoming-9.c =================================================================== --- gcc/testsuite/gcc.target/i386/incoming-9.c (revision 0) +++ gcc/testsuite/gcc.target/i386/incoming-9.c (revision 0) @@ -0,0 +1,18 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -O3 -mno-sse -mpreferred-stack-boundary=4" } */ + +float +foo (float f) +{ + float array[128]; + float x; + int i; + for (i = 0; i < sizeof(array) / sizeof(*array); i++) + array[i] = f; + for (i = 0; i < sizeof(array) / sizeof(*array); i++) + x += array[i]; + return x; +} + +/* { dg-final { scan-assembler-not "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ Index: gcc/testsuite/gcc.target/i386/incoming-10.c =================================================================== --- gcc/testsuite/gcc.target/i386/incoming-10.c (revision 0) +++ gcc/testsuite/gcc.target/i386/incoming-10.c (revision 0) @@ -0,0 +1,19 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -fomit-frame-pointer -O3 -march=barcelona -mpreferred-stack-boundary=4" } */ + +struct s { + int x[8]; +}; + +void g(struct s *); + +void f() +{ + int i; + struct s s; + for (i = 0; i < sizeof(s.x) / sizeof(*s.x); i++) s.x[i] = 0; + g(&s); +} + +/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ Index: gcc/testsuite/gcc.target/i386/incoming-6.c =================================================================== --- gcc/testsuite/gcc.target/i386/incoming-6.c (revision 0) +++ gcc/testsuite/gcc.target/i386/incoming-6.c (revision 0) @@ -0,0 +1,17 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -O2 -msse2 -mpreferred-stack-boundary=4" } */ + +typedef int v4si __attribute__ ((vector_size (16))); + +extern v4si y(v4si *s3); + +extern v4si s1, s2; + +v4si x(void) +{ + v4si s3 = s1 + s2; + return y(&s3); +} + +/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ Index: gcc/testsuite/gcc.target/i386/incoming-8.c =================================================================== --- gcc/testsuite/gcc.target/i386/incoming-8.c (revision 0) +++ gcc/testsuite/gcc.target/i386/incoming-8.c (revision 0) @@ -0,0 +1,18 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -O3 -msse2 -mpreferred-stack-boundary=4" } */ + +float +foo (float f) +{ + float array[128]; + float x; + int i; + for (i = 0; i < sizeof(array) / sizeof(*array); i++) + array[i] = f; + for (i = 0; i < sizeof(array) / sizeof(*array); i++) + x += array[i]; + return x; +} + +/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ Index: gcc/function.c =================================================================== --- gcc/function.c (revision 152202) +++ gcc/function.c (working copy) @@ -4139,6 +4139,9 @@ allocate_struct_function (tree fndecl, b /* Assume all registers in stdarg functions need to be saved. */ cfun->va_list_gpr_size = VA_LIST_MAX_GPR_SIZE; cfun->va_list_fpr_size = VA_LIST_MAX_FPR_SIZE; + + cfun->forced_stack_alignment = 0; + cfun->forced_stack_mode = VOIDmode; } } Index: gcc/function.h =================================================================== --- gcc/function.h (revision 152202) +++ gcc/function.h (working copy) @@ -532,6 +532,12 @@ struct GTY(()) function { a string describing the reason for failure. */ const char * GTY((skip)) cannot_be_copied_reason; + /* The largest alignment forced on the stack. */ + unsigned int forced_stack_alignment; + + /* The mode of the largest alignment forced on the stack. */ + enum machine_mode forced_stack_mode; + /* Collected bit flags. */ /* Number of units of general registers that need saving in stdarg Index: gcc/tree-vect-data-refs.c =================================================================== --- gcc/tree-vect-data-refs.c (revision 152202) +++ gcc/tree-vect-data-refs.c (working copy) @@ -927,6 +927,8 @@ vect_verify_datarefs_alignment (loop_vec { gimple stmt = DR_STMT (dr); stmt_vec_info stmt_info = vinfo_for_stmt (stmt); + enum machine_mode mode; + unsigned int align; /* For interleaving, only the alignment of the first access matters. */ if (STMT_VINFO_STRIDED_ACCESS (stmt_info) @@ -947,6 +949,15 @@ vect_verify_datarefs_alignment (loop_vec } return false; } + + mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info)); + align = GET_MODE_ALIGNMENT (mode); + if (cfun->forced_stack_alignment < align) + { + cfun->forced_stack_alignment = align; + cfun->forced_stack_mode = mode; + } + if (supportable_dr_alignment != dr_aligned && vect_print_dump_info (REPORT_ALIGNMENT)) fprintf (vect_dump, "Vectorizing an unaligned access."); Index: gcc/config/i386/i386.c =================================================================== --- gcc/config/i386/i386.c (revision 152202) +++ gcc/config/i386/i386.c (working copy) @@ -8151,16 +8151,31 @@ find_drap_reg (void) } } +#define VALID_SSE_VECTOR_MODE(MODE) \ + ((MODE) == V4SFmode || (MODE) == V4SImode || (MODE) == V2DFmode \ + || (MODE) == V16QImode || (MODE) == V8HImode || (MODE) == V2DImode) + /* Update incoming stack boundary and estimated stack alignment. */ static void ix86_update_stack_boundary (void) { + /* Should we use STACK_BOUNDARY for incoming stack boundary? */ + unsigned int incoming_stack_boundary; + + /* In 32bit, use STACK_BOUNDARY for incoming stack boundary if any + SSE variables are put on stack. */ + if (!TARGET_64BIT + && VALID_SSE_VECTOR_MODE (cfun->forced_stack_mode)) + incoming_stack_boundary = STACK_BOUNDARY; + else + incoming_stack_boundary = ix86_default_incoming_stack_boundary; + /* Prefer the one specified at command line. */ ix86_incoming_stack_boundary = (ix86_user_incoming_stack_boundary ? ix86_user_incoming_stack_boundary - : ix86_default_incoming_stack_boundary); + : incoming_stack_boundary); /* Incoming stack alignment can be changed on individual functions via force_align_arg_pointer attribute. We use the smallest @@ -19773,7 +19788,7 @@ ix86_minimum_alignment (tree exp, enum m { tree type, decl; - if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64) + if (TARGET_64BIT) return align; if (exp && DECL_P (exp)) @@ -19787,6 +19802,25 @@ ix86_minimum_alignment (tree exp, enum m decl = NULL; } + /* In 32bit, update forced_stack_mode if any SSE variables are put + on stack. */ + if (cfun->forced_stack_alignment < 128) + { + if (VALID_SSE_VECTOR_MODE (mode)) + { + cfun->forced_stack_alignment = 128; + cfun->forced_stack_mode = mode; + } + else if ((type && VALID_SSE_VECTOR_MODE (TYPE_MODE (type)))) + { + cfun->forced_stack_alignment = 128; + cfun->forced_stack_mode = TYPE_MODE (type); + } + } + + if (align != 64 || ix86_preferred_stack_boundary >= 64) + return align; + /* Don't do dynamic stack realignment for long long objects with -mpreferred-stack-boundary=2. */ if ((mode == DImode || (type && TYPE_MODE (type) == DImode)) ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-10-15 15:58 ` H.J. Lu @ 2009-10-15 18:45 ` Uros Bizjak 2009-10-15 19:22 ` H.J. Lu 2009-10-16 20:27 ` H.J. Lu 0 siblings, 2 replies; 59+ messages in thread From: Uros Bizjak @ 2009-10-15 18:45 UTC (permalink / raw) To: H.J. Lu; +Cc: gcc-patches, Jakub Jelinek On 10/15/2009 05:48 PM, H.J. Lu wrote: > On Thu, Aug 06, 2009 at 02:42:16PM -0700, H.J. Lu wrote: > >> Hi, >> >> In 32bit, the incoming stack may not be 16 byte aligned. This patch >> assumes the incoming stack is 4 byte aligned and realigns stack if any >> SSE variable is put on stack. Any comments? >> >> > For i386, gcc, up to gcc 3.4, had been generating 4byte stack alignment. > 4byte stack alignment is a problem unless SSE registers are used, which > may be generated by vectorizer or intrinsics. This patch accommodates > the binaries generated by previous versions of gcc if SSE registers are > generated by vectorizer or intrinsics. The performance impact on SPEC > CPU 2006 is minimum: > > http://gcc.gnu.org/bugzilla/show_bug.cgi?id=41156#c7 > > OK for trunk? > Comments bellow. > --- > gcc/ > > 2009-09-26 H.J. Lu<hongjiu.lu@intel.com> > > PR target/40838 > * function.c (allocate_struct_function): Initialize > forced_stack_alignment and forced_stack_mode. > > * function.h (function): Add forced_stack_alignment and > forced_stack_mode. > > * tree-vect-data-refs.c (vect_verify_datarefs_alignment): > Update forced_stack_alignment and forced_stack_mode. > > * config/i386/i386.c (VALID_SSE_VECTOR_MODE): New. > (ix86_update_stack_boundary): In 32bit, use STACK_BOUNDARY if > any SSE variables are forced on stack. > (ix86_minimum_alignment): In 32bit, update > forced_stack_alignment and forced_stack_mode if any SSE > variables are put on stack. > > gcc/testsuite/ > > > ... > > Index: gcc/function.h > =================================================================== > --- gcc/function.h (revision 152202) > +++ gcc/function.h (working copy) > @@ -532,6 +532,12 @@ struct GTY(()) function { > a string describing the reason for failure. */ > const char * GTY((skip)) cannot_be_copied_reason; > > + /* The largest alignment forced on the stack. */ > + unsigned int forced_stack_alignment; > + > + /* The mode of the largest alignment forced on the stack. */ > + enum machine_mode forced_stack_mode; > + > /* Collected bit flags. */ > > /* Number of units of general registers that need saving in stdarg > Index: gcc/tree-vect-data-refs.c > =================================================================== > --- gcc/tree-vect-data-refs.c (revision 152202) > +++ gcc/tree-vect-data-refs.c (working copy) > @@ -927,6 +927,8 @@ vect_verify_datarefs_alignment (loop_vec > { > gimple stmt = DR_STMT (dr); > stmt_vec_info stmt_info = vinfo_for_stmt (stmt); > + enum machine_mode mode; > + unsigned int align; > > /* For interleaving, only the alignment of the first access matters. */ > if (STMT_VINFO_STRIDED_ACCESS (stmt_info) > @@ -947,6 +949,15 @@ vect_verify_datarefs_alignment (loop_vec > } > return false; > } > + > + mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info)); > + align = GET_MODE_ALIGNMENT (mode); > + if (cfun->forced_stack_alignment< align) > + { > + cfun->forced_stack_alignment = align; > + cfun->forced_stack_mode = mode; > + } > + > if (supportable_dr_alignment != dr_aligned > && vect_print_dump_info (REPORT_ALIGNMENT)) > fprintf (vect_dump, "Vectorizing an unaligned access."); > Index: gcc/config/i386/i386.c > =================================================================== > --- gcc/config/i386/i386.c (revision 152202) > +++ gcc/config/i386/i386.c (working copy) > @@ -8151,16 +8151,31 @@ find_drap_reg (void) > } > } > > +#define VALID_SSE_VECTOR_MODE(MODE) \ > + ((MODE) == V4SFmode || (MODE) == V4SImode || (MODE) == V2DFmode \ > + || (MODE) == V16QImode || (MODE) == V8HImode || (MODE) == V2DImode) > + > /* Update incoming stack boundary and estimated stack alignment. */ > > static void > ix86_update_stack_boundary (void) > { > + /* Should we use STACK_BOUNDARY for incoming stack boundary? */ > + unsigned int incoming_stack_boundary; > + > + /* In 32bit, use STACK_BOUNDARY for incoming stack boundary if any > + SSE variables are put on stack. */ > + if (!TARGET_64BIT > +&& VALID_SSE_VECTOR_MODE (cfun->forced_stack_mode)) > + incoming_stack_boundary = STACK_BOUNDARY; > + else > + incoming_stack_boundary = ix86_default_incoming_stack_boundary; > + > /* Prefer the one specified at command line. */ > ix86_incoming_stack_boundary > = (ix86_user_incoming_stack_boundary > ? ix86_user_incoming_stack_boundary > - : ix86_default_incoming_stack_boundary); > + : incoming_stack_boundary); > > /* Incoming stack alignment can be changed on individual functions > via force_align_arg_pointer attribute. We use the smallest > @@ -19773,7 +19788,7 @@ ix86_minimum_alignment (tree exp, enum m > { > tree type, decl; > > - if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary>= 64) > + if (TARGET_64BIT) > return align; > > if (exp&& DECL_P (exp)) > @@ -19787,6 +19802,25 @@ ix86_minimum_alignment (tree exp, enum m > decl = NULL; > } > > + /* In 32bit, update forced_stack_mode if any SSE variables are put > + on stack. */ > + if (cfun->forced_stack_alignment< 128) > + { > + if (VALID_SSE_VECTOR_MODE (mode)) > + { > + cfun->forced_stack_alignment = 128; > + cfun->forced_stack_mode = mode; > + } > + else if ((type&& VALID_SSE_VECTOR_MODE (TYPE_MODE (type)))) > + { > + cfun->forced_stack_alignment = 128; > + cfun->forced_stack_mode = TYPE_MODE (type); > + } > + } > + > + if (align != 64 || ix86_preferred_stack_boundary>= 64) > + return align; > + > We should use PREFERRED_STACK_BOUNDARY_DEFAULT instead of hardcoded 128 here. At least the comment for P_S_B_D says: /* It should be MIN_STACK_BOUNDARY. But we set it to 128 bits for both 32bit and 64bit, to support codes that need 128 bit stack alignment for SSE instructions, but can't realign the stack. */ #define PREFERRED_STACK_BOUNDARY_DEFAULT 128 OTOH, I think that this whole stuff should depend on -mstackrealing somehow, according to the comment: /* 1 if -mstackrealign should be turned on by default. It will generate an alternate prologue and epilogue that realigns the runtime stack if nessary. This supports mixing codes that keep a 4-byte aligned stack, as specified by i386 psABI, with codes that need a 16-byte aligned stack, as required by SSE instructions. If STACK_REALIGN_DEFAULT is 1 and PREFERRED_STACK_BOUNDARY_DEFAULT is 128, stacks for all functions may be realigned. */ #define STACK_REALIGN_DEFAULT 0 As far as I understand from many comments from the PR40838 trail (especially comment #51), -mstackrealing is not effective in some cases involving automatic SSE variables on the stack. I think that -mstackrealign should be fixed in the way your patch outlines, so old/broken sources that assume 4byte alignment can be compiled using this option without penalizing new/fixed code that assumes 16byte alignment. Also, Jakub has its opinion here, I have also CC'd him. Uros. ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-10-15 18:45 ` Uros Bizjak @ 2009-10-15 19:22 ` H.J. Lu 2009-10-15 19:32 ` Uros Bizjak 2009-10-16 20:27 ` H.J. Lu 1 sibling, 1 reply; 59+ messages in thread From: H.J. Lu @ 2009-10-15 19:22 UTC (permalink / raw) To: Uros Bizjak; +Cc: gcc-patches, Jakub Jelinek On Thu, Oct 15, 2009 at 11:40 AM, Uros Bizjak <ubizjak@gmail.com> wrote: > On 10/15/2009 05:48 PM, H.J. Lu wrote: >> >> On Thu, Aug 06, 2009 at 02:42:16PM -0700, H.J. Lu wrote: >> >>> >>> Hi, >>> >>> In 32bit, the incoming stack may not be 16 byte aligned. This patch >>> assumes the incoming stack is 4 byte aligned and realigns stack if any >>> SSE variable is put on stack. Any comments? >>> >>> >> >> For i386, gcc, up to gcc 3.4, had been generating 4byte stack alignment. >> 4byte stack alignment is a problem unless SSE registers are used, which >> may be generated by vectorizer or intrinsics. This patch accommodates >> the binaries generated by previous versions of gcc if SSE registers are >> generated by vectorizer or intrinsics. The performance impact on SPEC >> CPU 2006 is minimum: >> >> http://gcc.gnu.org/bugzilla/show_bug.cgi?id=41156#c7 >> >> OK for trunk? >> > > Comments bellow. > >> --- >> gcc/ >> >> 2009-09-26 H.J. Lu<hongjiu.lu@intel.com> >> >> PR target/40838 >> * function.c (allocate_struct_function): Initialize >> forced_stack_alignment and forced_stack_mode. >> >> * function.h (function): Add forced_stack_alignment and >> forced_stack_mode. >> >> * tree-vect-data-refs.c (vect_verify_datarefs_alignment): >> Update forced_stack_alignment and forced_stack_mode. >> >> * config/i386/i386.c (VALID_SSE_VECTOR_MODE): New. >> (ix86_update_stack_boundary): In 32bit, use STACK_BOUNDARY if >> any SSE variables are forced on stack. >> (ix86_minimum_alignment): In 32bit, update >> forced_stack_alignment and forced_stack_mode if any SSE >> variables are put on stack. >> >> gcc/testsuite/ >> >> >> > > ... > >> >> Index: gcc/function.h >> =================================================================== >> --- gcc/function.h (revision 152202) >> +++ gcc/function.h (working copy) >> @@ -532,6 +532,12 @@ struct GTY(()) function { >> a string describing the reason for failure. */ >> const char * GTY((skip)) cannot_be_copied_reason; >> >> + /* The largest alignment forced on the stack. */ >> + unsigned int forced_stack_alignment; >> + >> + /* The mode of the largest alignment forced on the stack. */ >> + enum machine_mode forced_stack_mode; >> + >> /* Collected bit flags. */ >> >> /* Number of units of general registers that need saving in stdarg >> Index: gcc/tree-vect-data-refs.c >> =================================================================== >> --- gcc/tree-vect-data-refs.c (revision 152202) >> +++ gcc/tree-vect-data-refs.c (working copy) >> @@ -927,6 +927,8 @@ vect_verify_datarefs_alignment (loop_vec >> { >> gimple stmt = DR_STMT (dr); >> stmt_vec_info stmt_info = vinfo_for_stmt (stmt); >> + enum machine_mode mode; >> + unsigned int align; >> >> /* For interleaving, only the alignment of the first access >> matters. */ >> if (STMT_VINFO_STRIDED_ACCESS (stmt_info) >> @@ -947,6 +949,15 @@ vect_verify_datarefs_alignment (loop_vec >> } >> return false; >> } >> + >> + mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info)); >> + align = GET_MODE_ALIGNMENT (mode); >> + if (cfun->forced_stack_alignment< align) >> + { >> + cfun->forced_stack_alignment = align; >> + cfun->forced_stack_mode = mode; >> + } >> + >> if (supportable_dr_alignment != dr_aligned >> && vect_print_dump_info (REPORT_ALIGNMENT)) >> fprintf (vect_dump, "Vectorizing an unaligned access."); >> Index: gcc/config/i386/i386.c >> =================================================================== >> --- gcc/config/i386/i386.c (revision 152202) >> +++ gcc/config/i386/i386.c (working copy) >> @@ -8151,16 +8151,31 @@ find_drap_reg (void) >> } >> } >> >> +#define VALID_SSE_VECTOR_MODE(MODE) \ >> + ((MODE) == V4SFmode || (MODE) == V4SImode || (MODE) == V2DFmode \ >> + || (MODE) == V16QImode || (MODE) == V8HImode || (MODE) == V2DImode) >> + >> /* Update incoming stack boundary and estimated stack alignment. */ >> >> static void >> ix86_update_stack_boundary (void) >> { >> + /* Should we use STACK_BOUNDARY for incoming stack boundary? */ >> + unsigned int incoming_stack_boundary; >> + >> + /* In 32bit, use STACK_BOUNDARY for incoming stack boundary if any >> + SSE variables are put on stack. */ >> + if (!TARGET_64BIT >> +&& VALID_SSE_VECTOR_MODE (cfun->forced_stack_mode)) >> + incoming_stack_boundary = STACK_BOUNDARY; >> + else >> + incoming_stack_boundary = ix86_default_incoming_stack_boundary; >> + >> /* Prefer the one specified at command line. */ >> ix86_incoming_stack_boundary >> = (ix86_user_incoming_stack_boundary >> ? ix86_user_incoming_stack_boundary >> - : ix86_default_incoming_stack_boundary); >> + : incoming_stack_boundary); >> >> /* Incoming stack alignment can be changed on individual functions >> via force_align_arg_pointer attribute. We use the smallest >> @@ -19773,7 +19788,7 @@ ix86_minimum_alignment (tree exp, enum m >> { >> tree type, decl; >> >> - if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary>= 64) >> + if (TARGET_64BIT) >> return align; >> >> if (exp&& DECL_P (exp)) >> @@ -19787,6 +19802,25 @@ ix86_minimum_alignment (tree exp, enum m >> decl = NULL; >> } >> >> + /* In 32bit, update forced_stack_mode if any SSE variables are put >> + on stack. */ >> + if (cfun->forced_stack_alignment< 128) >> + { >> + if (VALID_SSE_VECTOR_MODE (mode)) >> + { >> + cfun->forced_stack_alignment = 128; >> + cfun->forced_stack_mode = mode; >> + } >> + else if ((type&& VALID_SSE_VECTOR_MODE (TYPE_MODE (type)))) >> + { >> + cfun->forced_stack_alignment = 128; >> + cfun->forced_stack_mode = TYPE_MODE (type); >> + } >> + } >> + >> + if (align != 64 || ix86_preferred_stack_boundary>= 64) >> + return align; >> + >> > > We should use PREFERRED_STACK_BOUNDARY_DEFAULT instead of hardcoded 128 > here. At least the comment for P_S_B_D says: > > /* It should be MIN_STACK_BOUNDARY. But we set it to 128 bits for > both 32bit and 64bit, to support codes that need 128 bit stack > alignment for SSE instructions, but can't realign the stack. */ > #define PREFERRED_STACK_BOUNDARY_DEFAULT 128 In forced_stack_alignment, we collect the hard alignment requirement on stack. The decision if stack should be aligned depends on the incoming stack alignment and the hard stack alignment. It is separate from PREFERRED_STACK_BOUNDARY_DEFAULT. > OTOH, I think that this whole stuff should depend on -mstackrealing somehow, > according to the comment: > > /* 1 if -mstackrealign should be turned on by default. It will > generate an alternate prologue and epilogue that realigns the > runtime stack if nessary. This supports mixing codes that keep a > 4-byte aligned stack, as specified by i386 psABI, with codes that > need a 16-byte aligned stack, as required by SSE instructions. If > STACK_REALIGN_DEFAULT is 1 and PREFERRED_STACK_BOUNDARY_DEFAULT is > 128, stacks for all functions may be realigned. */ > #define STACK_REALIGN_DEFAULT 0 > > As far as I understand from many comments from the PR40838 trail (especially > comment #51), -mstackrealing is not effective in some cases involving > automatic SSE variables on the stack. I think that -mstackrealign should be > fixed in the way your patch outlines, so old/broken sources that assume > 4byte alignment can be compiled using this option without penalizing > new/fixed code that assumes 16byte alignment. > -mstackrealign is very effective. If we turn it on by default, it works fine: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=40838#c45 It is just that we don't want to make it the default since we will realign the stack for almost all functions. In most cases, misaligned stack won't cause any problems. What my patch does is to assume 4byte incoming stack alignment for functions containing SSE instructions which require hard 16byte stack alignment. After my patch is applied, we can update -mstackrealign to make it an no-op since all the problems it tries to solve: http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00854.html http://gcc.gnu.org/ml/gcc-patches/2006-05/msg00526.html disappeared with my patch. Thanks. -- H.J. ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-10-15 19:22 ` H.J. Lu @ 2009-10-15 19:32 ` Uros Bizjak 2009-10-15 19:43 ` H.J. Lu 0 siblings, 1 reply; 59+ messages in thread From: Uros Bizjak @ 2009-10-15 19:32 UTC (permalink / raw) To: H.J. Lu; +Cc: gcc-patches, Jakub Jelinek On 10/15/2009 09:11 PM, H.J. Lu wrote: >> We should use PREFERRED_STACK_BOUNDARY_DEFAULT instead of hardcoded 128 >> here. At least the comment for P_S_B_D says: >> >> /* It should be MIN_STACK_BOUNDARY. �But we set it to 128 bits for >> � both 32bit and 64bit, to support codes that need 128 bit stack >> � alignment for SSE instructions, but can't realign the stack. �*/ >> #define PREFERRED_STACK_BOUNDARY_DEFAULT 128 >> > In forced_stack_alignment, we collect the hard alignment requirement > on stack. The decision if stack should be aligned depends on the > incoming stack alignment and the hard stack alignment. It is separate > from PREFERRED_STACK_BOUNDARY_DEFAULT. > > >> OTOH, I think that this whole stuff should depend on -mstackrealing somehow, >> according to the comment: >> >> /* 1 if -mstackrealign should be turned on by default. �It will >> � generate an alternate prologue and epilogue that realigns the >> � runtime stack if nessary. �This supports mixing codes that keep a >> � 4-byte aligned stack, as specified by i386 psABI, with codes that >> � need a 16-byte aligned stack, as required by SSE instructions. �If >> � STACK_REALIGN_DEFAULT is 1 and PREFERRED_STACK_BOUNDARY_DEFAULT is >> � 128, stacks for all functions may be realigned. �*/ >> #define STACK_REALIGN_DEFAULT 0 >> >> As far as I understand from many comments from the PR40838 trail (especially >> comment #51), -mstackrealing is not effective in some cases involving >> automatic SSE variables on the stack. I think that -mstackrealign should be >> fixed in the way your patch outlines, so old/broken sources that assume >> 4byte alignment can be compiled using this option without penalizing >> new/fixed code that assumes 16byte alignment. >> >> > -mstackrealign is very effective. If we turn it on by default, it works fine: > > http://gcc.gnu.org/bugzilla/show_bug.cgi?id=40838#c45 > > It is just that we don't want to make it the default since we will realign > the stack for almost all functions. In most cases, misaligned stack won't > cause any problems. > > What my patch does is to assume 4byte incoming stack alignment > for functions containing SSE instructions which require hard 16byte > stack alignment. > > After my patch is applied, we can update -mstackrealign to make it an > no-op since all the problems it tries to solve: > > http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00854.html > http://gcc.gnu.org/ml/gcc-patches/2006-05/msg00526.html > > disappeared with my patch. > Then we should do realignment the other way around: instead of using -mstackrealing for all the code (including where it has no effect), let's use -mstackrealign to activate realignment functionality that is introduced by your patch. IOW, lightweight -mstackrealign, firing up only when there is the possibility of unaligned access in the code it precedes. Uros. ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-10-15 19:32 ` Uros Bizjak @ 2009-10-15 19:43 ` H.J. Lu 2009-10-15 19:48 ` Jakub Jelinek 2009-10-15 19:53 ` Uros Bizjak 0 siblings, 2 replies; 59+ messages in thread From: H.J. Lu @ 2009-10-15 19:43 UTC (permalink / raw) To: Uros Bizjak; +Cc: gcc-patches, Jakub Jelinek On Thu, Oct 15, 2009 at 12:27 PM, Uros Bizjak <ubizjak@gmail.com> wrote: > On 10/15/2009 09:11 PM, H.J. Lu wrote: >>> >>> We should use PREFERRED_STACK_BOUNDARY_DEFAULT instead of hardcoded 128 >>> here. At least the comment for P_S_B_D says: >>> >>> /* It should be MIN_STACK_BOUNDARY. �But we set it to 128 bits for >>> � both 32bit and 64bit, to support codes that need 128 bit stack >>> � alignment for SSE instructions, but can't realign the stack. �*/ >>> #define PREFERRED_STACK_BOUNDARY_DEFAULT 128 >>> >> >> In forced_stack_alignment, we collect the hard alignment requirement >> on stack. The decision if stack should be aligned depends on the >> incoming stack alignment and the hard stack alignment. It is separate >> from PREFERRED_STACK_BOUNDARY_DEFAULT. >> >> >>> >>> OTOH, I think that this whole stuff should depend on -mstackrealing >>> somehow, >>> according to the comment: >>> >>> /* 1 if -mstackrealign should be turned on by default. �It will >>> � generate an alternate prologue and epilogue that realigns the >>> � runtime stack if nessary. �This supports mixing codes that keep a >>> � 4-byte aligned stack, as specified by i386 psABI, with codes that >>> � need a 16-byte aligned stack, as required by SSE instructions. �If >>> � STACK_REALIGN_DEFAULT is 1 and PREFERRED_STACK_BOUNDARY_DEFAULT is >>> � 128, stacks for all functions may be realigned. �*/ >>> #define STACK_REALIGN_DEFAULT 0 >>> >>> As far as I understand from many comments from the PR40838 trail >>> (especially >>> comment #51), -mstackrealing is not effective in some cases involving >>> automatic SSE variables on the stack. I think that -mstackrealign should >>> be >>> fixed in the way your patch outlines, so old/broken sources that assume >>> 4byte alignment can be compiled using this option without penalizing >>> new/fixed code that assumes 16byte alignment. >>> >>> >> >> -mstackrealign is very effective. If we turn it on by default, it works >> fine: >> >> http://gcc.gnu.org/bugzilla/show_bug.cgi?id=40838#c45 >> >> It is just that we don't want to make it the default since we will realign >> the stack for almost all functions. In most cases, misaligned stack won't >> cause any problems. >> >> What my patch does is to assume 4byte incoming stack alignment >> for functions containing SSE instructions which require hard 16byte >> stack alignment. >> >> After my patch is applied, we can update -mstackrealign to make it an >> no-op since all the problems it tries to solve: >> >> http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00854.html >> http://gcc.gnu.org/ml/gcc-patches/2006-05/msg00526.html >> >> disappeared with my patch. >> > > Then we should do realignment the other way around: instead of using > -mstackrealing for all the code (including where it has no effect), let's > use -mstackrealign to activate realignment functionality that is introduced > by your patch. That defeats the whole purpose of my patch, which automatically realigns the stack when there is a hard alignment requirement. If it isn't turned on by default, it is not very useful. > IOW, lightweight -mstackrealign, firing up only when there is the > possibility of unaligned access in the code it precedes. > That is what my patch does, but turned it on by default. -- H.J. ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-10-15 19:43 ` H.J. Lu @ 2009-10-15 19:48 ` Jakub Jelinek 2009-10-15 20:11 ` H.J. Lu 2009-10-15 19:53 ` Uros Bizjak 1 sibling, 1 reply; 59+ messages in thread From: Jakub Jelinek @ 2009-10-15 19:48 UTC (permalink / raw) To: H.J. Lu; +Cc: Uros Bizjak, gcc-patches On Thu, Oct 15, 2009 at 12:32:12PM -0700, H.J. Lu wrote: > > Then we should do realignment the other way around: instead of using > > -mstackrealing for all the code (including where it has no effect), let's > > use -mstackrealign to activate realignment functionality that is introduced > > by your patch. > > That defeats the whole purpose of my patch, which automatically > realigns the stack when there is a hard alignment requirement. If it > isn't turned on by default, it is not very useful. > > > IOW, lightweight -mstackrealign, firing up only when there is the > > possibility of unaligned access in the code it precedes. > > > > That is what my patch does, but turned it on by default. I agree with Uros, I have nothing against a lightweight -mstackrealign, but forcing this upon everybody just because a few people insist on compiling code with -mpreferred-stack-boundary=2 is IMHO a very bad idea and I object against it. -mpreferred-stack-boundary=2 is fine for the kernel where you basically define your own ABI, like many other ABI changing options, and for people who know what they are doing and bear the consequences (e.g. that they need to -mstackrealign when calling outside code), but we shouldn't penalize because of that rare option all the code by default. At least not on Linux. Jakub ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-10-15 19:48 ` Jakub Jelinek @ 2009-10-15 20:11 ` H.J. Lu 0 siblings, 0 replies; 59+ messages in thread From: H.J. Lu @ 2009-10-15 20:11 UTC (permalink / raw) To: Jakub Jelinek; +Cc: Uros Bizjak, gcc-patches On Thu, Oct 15, 2009 at 12:43 PM, Jakub Jelinek <jakub@redhat.com> wrote: > On Thu, Oct 15, 2009 at 12:32:12PM -0700, H.J. Lu wrote: >> > Then we should do realignment the other way around: instead of using >> > -mstackrealing for all the code (including where it has no effect), let's >> > use -mstackrealign to activate realignment functionality that is introduced >> > by your patch. >> >> That defeats the whole purpose of my patch, which automatically >> realigns the stack when there is a hard alignment requirement. If it >> isn't turned on by default, it is not very useful. >> >> > IOW, lightweight -mstackrealign, firing up only when there is the >> > possibility of unaligned access in the code it precedes. >> > >> >> That is what my patch does, but turned it on by default. > > I agree with Uros, I have nothing against a lightweight -mstackrealign, > but forcing this upon everybody just because a few people insist on compiling > code with -mpreferred-stack-boundary=2 is IMHO a very bad idea and I object > against it. -mpreferred-stack-boundary=2 is fine for the kernel where you > basically define your own ABI, like many other ABI changing options, and for > people who know what they are doing and bear the consequences (e.g. that > they need to -mstackrealign when calling outside code), but we shouldn't > penalize because of that rare option all the code by default. At least not > on Linux. Gcc 3.4 and older may align stack to 4byte. Do we declare that mixing object files compiled with different versions of gcc is unsupported on Linux/i386? As for penalty, it only happens at -O3 and its performance impact is minimum. We can provide an option to turn it off. It is a correctness issue. -- H.J. ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-10-15 19:43 ` H.J. Lu 2009-10-15 19:48 ` Jakub Jelinek @ 2009-10-15 19:53 ` Uros Bizjak 2009-10-15 21:01 ` H.J. Lu 1 sibling, 1 reply; 59+ messages in thread From: Uros Bizjak @ 2009-10-15 19:53 UTC (permalink / raw) To: H.J. Lu; +Cc: gcc-patches, Jakub Jelinek On 10/15/2009 09:32 PM, H.J. Lu wrote: >>>> We should use PREFERRED_STACK_BOUNDARY_DEFAULT instead of hardcoded 128 >>>> here. At least the comment for P_S_B_D says: >>>> >>>> /* It should be MIN_STACK_BOUNDARY. �But we set it to 128 bits for >>>> � both 32bit and 64bit, to support codes that need 128 bit stack >>>> � alignment for SSE instructions, but can't realign the stack. �*/ >>>> #define PREFERRED_STACK_BOUNDARY_DEFAULT 128 >>>> >>>> >>> In forced_stack_alignment, we collect the hard alignment requirement >>> on stack. The decision if stack should be aligned depends on the >>> incoming stack alignment and the hard stack alignment. It is separate >>> from PREFERRED_STACK_BOUNDARY_DEFAULT. >>> >>> >>> >>>> OTOH, I think that this whole stuff should depend on -mstackrealing >>>> somehow, >>>> according to the comment: >>>> >>>> /* 1 if -mstackrealign should be turned on by default. �It will >>>> � generate an alternate prologue and epilogue that realigns the >>>> � runtime stack if nessary. �This supports mixing codes that keep a >>>> � 4-byte aligned stack, as specified by i386 psABI, with codes that >>>> � need a 16-byte aligned stack, as required by SSE instructions. �If >>>> � STACK_REALIGN_DEFAULT is 1 and PREFERRED_STACK_BOUNDARY_DEFAULT is >>>> � 128, stacks for all functions may be realigned. �*/ >>>> #define STACK_REALIGN_DEFAULT 0 >>>> >>>> As far as I understand from many comments from the PR40838 trail >>>> (especially >>>> comment #51), -mstackrealing is not effective in some cases involving >>>> automatic SSE variables on the stack. I think that -mstackrealign should >>>> be >>>> fixed in the way your patch outlines, so old/broken sources that assume >>>> 4byte alignment can be compiled using this option without penalizing >>>> new/fixed code that assumes 16byte alignment. >>>> >>>> >>>> >>> -mstackrealign is very effective. If we turn it on by default, it works >>> fine: >>> >>> http://gcc.gnu.org/bugzilla/show_bug.cgi?id=40838#c45 >>> >>> It is just that we don't want to make it the default since we will realign >>> the stack for almost all functions. In most cases, misaligned stack won't >>> cause any problems. >>> >>> What my patch does is to assume 4byte incoming stack alignment >>> for functions containing SSE instructions which require hard 16byte >>> stack alignment. >>> >>> After my patch is applied, we can update -mstackrealign to make it an >>> no-op since all the problems it tries to solve: >>> >>> http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00854.html >>> http://gcc.gnu.org/ml/gcc-patches/2006-05/msg00526.html >>> >>> disappeared with my patch. >>> >>> >> Then we should do realignment the other way around: instead of using >> -mstackrealing for all the code (including where it has no effect), let's >> use -mstackrealign to activate realignment functionality that is introduced >> by your patch. >> > That defeats the whole purpose of my patch, which automatically > realigns the stack when there is a hard alignment requirement. If it > isn't turned on by default, it is not very useful. > > >> IOW, lightweight -mstackrealign, firing up only when there is the >> possibility of unaligned access in the code it precedes. >> >> > That is what my patch does, but turned it on by default. > I think that we have the same situation here as was with the infamous -mcld option. A specific functionality is needed for compatibility with some [broken] code and this functionality has non-negligible impact on the performance. AFAICS, there will always be opinions for this functionality as well as opinions against it. I propose that we implement new option -mhard-stackrealign [we can bikeshed about this option name a bit ;) ] with corresponding --enable-hard-stackrealign as a configure option. This way, both groups can have whatever they prefer - compatibility vs. performance. Looking at -mcld discussions, is this acceptable solution? Uros. ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-10-15 19:53 ` Uros Bizjak @ 2009-10-15 21:01 ` H.J. Lu 2009-10-15 21:41 ` Uros Bizjak 0 siblings, 1 reply; 59+ messages in thread From: H.J. Lu @ 2009-10-15 21:01 UTC (permalink / raw) To: Uros Bizjak; +Cc: gcc-patches, Jakub Jelinek On Thu, Oct 15, 2009 at 12:51 PM, Uros Bizjak <ubizjak@gmail.com> wrote: > On 10/15/2009 09:32 PM, H.J. Lu wrote: >>>>> >>>>> We should use PREFERRED_STACK_BOUNDARY_DEFAULT instead of hardcoded 128 >>>>> here. At least the comment for P_S_B_D says: >>>>> >>>>> /* It should be MIN_STACK_BOUNDARY. �But we set it to 128 bits for >>>>> � both 32bit and 64bit, to support codes that need 128 bit stack >>>>> � alignment for SSE instructions, but can't realign the stack. �*/ >>>>> #define PREFERRED_STACK_BOUNDARY_DEFAULT 128 >>>>> >>>>> >>>> >>>> In forced_stack_alignment, we collect the hard alignment requirement >>>> on stack. The decision if stack should be aligned depends on the >>>> incoming stack alignment and the hard stack alignment. It is separate >>>> from PREFERRED_STACK_BOUNDARY_DEFAULT. >>>> >>>> >>>> >>>>> >>>>> OTOH, I think that this whole stuff should depend on -mstackrealing >>>>> somehow, >>>>> according to the comment: >>>>> >>>>> /* 1 if -mstackrealign should be turned on by default. �It will >>>>> � generate an alternate prologue and epilogue that realigns the >>>>> � runtime stack if nessary. �This supports mixing codes that keep a >>>>> � 4-byte aligned stack, as specified by i386 psABI, with codes that >>>>> � need a 16-byte aligned stack, as required by SSE instructions. �If >>>>> � STACK_REALIGN_DEFAULT is 1 and PREFERRED_STACK_BOUNDARY_DEFAULT is >>>>> � 128, stacks for all functions may be realigned. �*/ >>>>> #define STACK_REALIGN_DEFAULT 0 >>>>> >>>>> As far as I understand from many comments from the PR40838 trail >>>>> (especially >>>>> comment #51), -mstackrealing is not effective in some cases involving >>>>> automatic SSE variables on the stack. I think that -mstackrealign >>>>> should >>>>> be >>>>> fixed in the way your patch outlines, so old/broken sources that assume >>>>> 4byte alignment can be compiled using this option without penalizing >>>>> new/fixed code that assumes 16byte alignment. >>>>> >>>>> >>>>> >>>> >>>> -mstackrealign is very effective. If we turn it on by default, it works >>>> fine: >>>> >>>> http://gcc.gnu.org/bugzilla/show_bug.cgi?id=40838#c45 >>>> >>>> It is just that we don't want to make it the default since we will >>>> realign >>>> the stack for almost all functions. In most cases, misaligned stack >>>> won't >>>> cause any problems. >>>> >>>> What my patch does is to assume 4byte incoming stack alignment >>>> for functions containing SSE instructions which require hard 16byte >>>> stack alignment. >>>> >>>> After my patch is applied, we can update -mstackrealign to make it an >>>> no-op since all the problems it tries to solve: >>>> >>>> http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00854.html >>>> http://gcc.gnu.org/ml/gcc-patches/2006-05/msg00526.html >>>> >>>> disappeared with my patch. >>>> >>>> >>> >>> Then we should do realignment the other way around: instead of using >>> -mstackrealing for all the code (including where it has no effect), let's >>> use -mstackrealign to activate realignment functionality that is >>> introduced >>> by your patch. >>> >> >> That defeats the whole purpose of my patch, which automatically >> realigns the stack when there is a hard alignment requirement. If it >> isn't turned on by default, it is not very useful. >> >> >>> >>> IOW, lightweight -mstackrealign, firing up only when there is the >>> possibility of unaligned access in the code it precedes. >>> >>> >> >> That is what my patch does, but turned it on by default. >> > > I think that we have the same situation here as was with the infamous -mcld > option. A specific functionality is needed for compatibility with some > [broken] code and this functionality has non-negligible impact on the > performance. AFAICS, there will always be opinions for this functionality as Well, performance impact of my patch on SPEC CPU 2006 is pretty much in the noise. > well as opinions against it. > > I propose that we implement new option -mhard-stackrealign [we can bikeshed > about this option name a bit ;) ] with corresponding > --enable-hard-stackrealign as a configure option. This way, both groups can > have whatever they prefer - compatibility vs. performance. > > Looking at -mcld discussions, is this acceptable solution? > Is --enable-hard-stackrealign enabled by default. If not, it doesn't buy much. -- H.J. ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-10-15 21:01 ` H.J. Lu @ 2009-10-15 21:41 ` Uros Bizjak 0 siblings, 0 replies; 59+ messages in thread From: Uros Bizjak @ 2009-10-15 21:41 UTC (permalink / raw) To: H.J. Lu; +Cc: gcc-patches, Jakub Jelinek On 10/15/2009 10:59 PM, H.J. Lu wrote: >> I think that we have the same situation here as was with the infamous -mcld >> option. A specific functionality is needed for compatibility with some >> [broken] code and this functionality has non-negligible impact on the >> performance. AFAICS, there will always be opinions for this functionality as >> > Well, performance impact of my patch on SPEC CPU 2006 is pretty much > in the noise. > > >> well as opinions against it. >> >> I propose that we implement new option -mhard-stackrealign [we can bikeshed >> about this option name a bit ;) ] with corresponding >> --enable-hard-stackrealign as a configure option. This way, both groups can >> have whatever they prefer - compatibility vs. performance. >> >> Looking at -mcld discussions, is this acceptable solution? >> >> > Is --enable-hard-stackrealign enabled by default. If not, it doesn't > buy much. > No, it won't be. Uros. ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-10-15 18:45 ` Uros Bizjak 2009-10-15 19:22 ` H.J. Lu @ 2009-10-16 20:27 ` H.J. Lu 2009-10-17 1:03 ` Ian Lance Taylor 2009-10-17 7:09 ` Uros Bizjak 1 sibling, 2 replies; 59+ messages in thread From: H.J. Lu @ 2009-10-16 20:27 UTC (permalink / raw) To: Uros Bizjak; +Cc: gcc-patches, Jakub Jelinek [-- Attachment #1: Type: text/plain, Size: 3467 bytes --] On Thu, Oct 15, 2009 at 11:40 AM, Uros Bizjak <ubizjak@gmail.com> wrote: > On 10/15/2009 05:48 PM, H.J. Lu wrote: >> >> On Thu, Aug 06, 2009 at 02:42:16PM -0700, H.J. Lu wrote: >> >>> >>> Hi, >>> >>> In 32bit, the incoming stack may not be 16 byte aligned. This patch >>> assumes the incoming stack is 4 byte aligned and realigns stack if any >>> SSE variable is put on stack. Any comments? >>> >>> >> >> For i386, gcc, up to gcc 3.4, had been generating 4byte stack alignment. >> 4byte stack alignment is a problem unless SSE registers are used, which >> may be generated by vectorizer or intrinsics. This patch accommodates >> the binaries generated by previous versions of gcc if SSE registers are >> generated by vectorizer or intrinsics. The performance impact on SPEC >> CPU 2006 is minimum: >> >> http://gcc.gnu.org/bugzilla/show_bug.cgi?id=41156#c7 >> >> OK for trunk? >> > > Comments bellow. > > > OTOH, I think that this whole stuff should depend on -mstackrealing somehow, > according to the comment: > > /* 1 if -mstackrealign should be turned on by default. It will > generate an alternate prologue and epilogue that realigns the > runtime stack if nessary. This supports mixing codes that keep a > 4-byte aligned stack, as specified by i386 psABI, with codes that > need a 16-byte aligned stack, as required by SSE instructions. If > STACK_REALIGN_DEFAULT is 1 and PREFERRED_STACK_BOUNDARY_DEFAULT is > 128, stacks for all functions may be realigned. */ > #define STACK_REALIGN_DEFAULT 0 > > As far as I understand from many comments from the PR40838 trail (especially > comment #51), -mstackrealing is not effective in some cases involving > automatic SSE variables on the stack. I think that -mstackrealign should be > fixed in the way your patch outlines, so old/broken sources that assume > 4byte alignment can be compiled using this option without penalizing > new/fixed code that assumes 16byte alignment. > > Also, Jakub has its opinion here, I have also CC'd him. > Uros. > Here is the patch to make -mstackrealing only align stack when SSE registers are used. People who care about mixing binaries compiled by the older gcc can use it without aligning stack for every function. OK for trunk if regression passes? Thanks. -- H.J. --- gcc/ 2009-10-16 H.J. Lu <hongjiu.lu@intel.com> PR target/40838 * function.c (allocate_struct_function): Initialize hard_stack_alignment. * function.h (function): Add hard_stack_alignment. * tree-vect-data-refs.c (vect_verify_datarefs_alignment): Update hard_stack_alignment. * config/i386/i386.c (verride_options): Don't check ix86_force_align_arg_pointer here. (ix86_update_stack_boundary): In 32bit, use MIN_STACK_BOUNDARY for incoming stack boundary if -mstackrealign is used and hard stack alignment is 128bit. (ix86_minimum_alignment): In 32bit, update hard_stack_alignment if alignment is 128bit. gcc/testsuite/ 2009-10-16 H.J. Lu <hongjiu.lu@intel.com> PR target/40838 * gcc.target/i386/incoming-6.c: New. * gcc.target/i386/incoming-7.c: Likewise. * gcc.target/i386/incoming-8.c: Likewise. * gcc.target/i386/incoming-9.c: Likewise. * gcc.target/i386/incoming-10.c: Likewise. * gcc.target/i386/incoming-11.c: Likewise. * gcc.target/i386/incoming-12.c: Likewise. * gcc.target/i386/incoming-13.c: Likewise. * gcc.target/i386/incoming-14.c: Likewise. [-- Attachment #2: gcc-pr40838-6.patch --] [-- Type: text/plain, Size: 11831 bytes --] gcc/ 2009-10-16 H.J. Lu <hongjiu.lu@intel.com> PR target/40838 * function.c (allocate_struct_function): Initialize hard_stack_alignment. * function.h (function): Add hard_stack_alignment. * tree-vect-data-refs.c (vect_verify_datarefs_alignment): Update hard_stack_alignment. * config/i386/i386.c (verride_options): Don't check ix86_force_align_arg_pointer here. (ix86_update_stack_boundary): In 32bit, use MIN_STACK_BOUNDARY for incoming stack boundary if -mstackrealign is used and hard stack alignment is 128bit. (ix86_minimum_alignment): In 32bit, update hard_stack_alignment if alignment is 128bit. gcc/testsuite/ 2009-10-16 H.J. Lu <hongjiu.lu@intel.com> PR target/40838 * gcc.target/i386/incoming-6.c: New. * gcc.target/i386/incoming-7.c: Likewise. * gcc.target/i386/incoming-8.c: Likewise. * gcc.target/i386/incoming-9.c: Likewise. * gcc.target/i386/incoming-10.c: Likewise. * gcc.target/i386/incoming-11.c: Likewise. * gcc.target/i386/incoming-12.c: Likewise. * gcc.target/i386/incoming-13.c: Likewise. * gcc.target/i386/incoming-14.c: Likewise. diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 73913b8..ac9d2ad 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -3239,12 +3239,10 @@ override_options (bool main_args_p) if (ix86_force_align_arg_pointer == -1) ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT; + ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY; + /* Validate -mincoming-stack-boundary= value or default it to MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */ - if (ix86_force_align_arg_pointer) - ix86_default_incoming_stack_boundary = MIN_STACK_BOUNDARY; - else - ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY; ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary; if (ix86_incoming_stack_boundary_string) { @@ -8201,11 +8199,23 @@ find_drap_reg (void) static void ix86_update_stack_boundary (void) { + /* Should we use MIN_STACK_BOUNDARY for incoming stack boundary? */ + unsigned int incoming_stack_boundary; + + /* In 32bit, use MIN_STACK_BOUNDARY for incoming stack boundary if + -mstackrealign is used and hard stack alignment is 128bit. */ + if (!TARGET_64BIT + && ix86_force_align_arg_pointer + && cfun->hard_stack_alignment == 128) + incoming_stack_boundary = MIN_STACK_BOUNDARY; + else + incoming_stack_boundary = ix86_default_incoming_stack_boundary; + /* Prefer the one specified at command line. */ ix86_incoming_stack_boundary = (ix86_user_incoming_stack_boundary ? ix86_user_incoming_stack_boundary - : ix86_default_incoming_stack_boundary); + : incoming_stack_boundary); /* Incoming stack alignment can be changed on individual functions via force_align_arg_pointer attribute. We use the smallest @@ -19863,7 +19873,14 @@ ix86_minimum_alignment (tree exp, enum machine_mode mode, { tree type, decl; - if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64) + if (TARGET_64BIT) + return align; + + /* In 32bit, update hard_stack_mode if ALIGN is 128. */ + if (align == 128 && cfun->hard_stack_alignment < 128) + cfun->hard_stack_alignment = align; + + if (align != 64 || ix86_preferred_stack_boundary >= 64) return align; if (exp && DECL_P (exp)) diff --git a/gcc/function.c b/gcc/function.c index 35c0cfd..1071183 100644 --- a/gcc/function.c +++ b/gcc/function.c @@ -4139,6 +4139,8 @@ allocate_struct_function (tree fndecl, bool abstract_p) /* Assume all registers in stdarg functions need to be saved. */ cfun->va_list_gpr_size = VA_LIST_MAX_GPR_SIZE; cfun->va_list_fpr_size = VA_LIST_MAX_FPR_SIZE; + + cfun->hard_stack_alignment = 0; } } diff --git a/gcc/function.h b/gcc/function.h index 4825d16..457d5d1 100644 --- a/gcc/function.h +++ b/gcc/function.h @@ -532,6 +532,9 @@ struct GTY(()) function { a string describing the reason for failure. */ const char * GTY((skip)) cannot_be_copied_reason; + /* The largest hard alignment on the stack. */ + unsigned int hard_stack_alignment; + /* Collected bit flags. */ /* Number of units of general registers that need saving in stdarg diff --git a/gcc/testsuite/gcc.target/i386/incoming-10.c b/gcc/testsuite/gcc.target/i386/incoming-10.c new file mode 100644 index 0000000..31d9e61 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/incoming-10.c @@ -0,0 +1,19 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -mstackrealign -fomit-frame-pointer -O3 -march=barcelona -mpreferred-stack-boundary=4" } */ + +struct s { + int x[8]; +}; + +void g(struct s *); + +void f() +{ + int i; + struct s s; + for (i = 0; i < sizeof(s.x) / sizeof(*s.x); i++) s.x[i] = 0; + g(&s); +} + +/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/incoming-11.c b/gcc/testsuite/gcc.target/i386/incoming-11.c new file mode 100644 index 0000000..e5787af --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/incoming-11.c @@ -0,0 +1,18 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -mstackrealign -fomit-frame-pointer -O3 -march=barcelona -mpreferred-stack-boundary=4" } */ + +void g(); + +int p[100]; +int q[100]; + +void f() +{ + int i; + for (i = 0; i < 100; i++) p[i] = 0; + g(); + for (i = 0; i < 100; i++) q[i] = 0; +} + +/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/incoming-12.c b/gcc/testsuite/gcc.target/i386/incoming-12.c new file mode 100644 index 0000000..d7ef103 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/incoming-12.c @@ -0,0 +1,20 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -mstackrealign -O2 -msse2 -mpreferred-stack-boundary=4" } */ + +typedef int v4si __attribute__ ((vector_size (16))); + +struct x { + v4si v; + v4si w; +}; + +void y(void *); + +v4si x(void) +{ + struct x x; + y(&x); +} + +/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/incoming-13.c b/gcc/testsuite/gcc.target/i386/incoming-13.c new file mode 100644 index 0000000..bbc8993 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/incoming-13.c @@ -0,0 +1,15 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -mstackrealign -O2 -mpreferred-stack-boundary=4" } */ + +extern double y(double *s3); + +extern double s1, s2; + +double x(void) +{ + double s3 = s1 + s2; + return y(&s3); +} + +/* { dg-final { scan-assembler-not "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/incoming-14.c b/gcc/testsuite/gcc.target/i386/incoming-14.c new file mode 100644 index 0000000..d27179d --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/incoming-14.c @@ -0,0 +1,15 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -mstackrealign -O2 -mpreferred-stack-boundary=4" } */ + +extern int y(int *s3); + +extern int s1, s2; + +int x(void) +{ + int s3 = s1 + s2; + return y(&s3); +} + +/* { dg-final { scan-assembler-not "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/incoming-15.c b/gcc/testsuite/gcc.target/i386/incoming-15.c new file mode 100644 index 0000000..e6a1749 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/incoming-15.c @@ -0,0 +1,15 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -mstackrealign -O2 -mpreferred-stack-boundary=4" } */ + +extern long long y(long long *s3); + +extern long long s1, s2; + +long long x(void) +{ + long long s3 = s1 + s2; + return y(&s3); +} + +/* { dg-final { scan-assembler-not "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/incoming-6.c b/gcc/testsuite/gcc.target/i386/incoming-6.c new file mode 100644 index 0000000..5cc4ab3 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/incoming-6.c @@ -0,0 +1,17 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -mstackrealign -O2 -msse2 -mpreferred-stack-boundary=4" } */ + +typedef int v4si __attribute__ ((vector_size (16))); + +extern v4si y(v4si *s3); + +extern v4si s1, s2; + +v4si x(void) +{ + v4si s3 = s1 + s2; + return y(&s3); +} + +/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/incoming-7.c b/gcc/testsuite/gcc.target/i386/incoming-7.c new file mode 100644 index 0000000..cdd6037 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/incoming-7.c @@ -0,0 +1,16 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -mstackrealign -O2 -msse2 -mpreferred-stack-boundary=4" } */ + +typedef int v4si __attribute__ ((vector_size (16))); + +extern v4si y(v4si, v4si, v4si, v4si, v4si); + +extern v4si s1, s2; + +v4si x(void) +{ + return y(s1, s2, s1, s2, s2); +} + +/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/incoming-8.c b/gcc/testsuite/gcc.target/i386/incoming-8.c new file mode 100644 index 0000000..2dd8800 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/incoming-8.c @@ -0,0 +1,18 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -mstackrealign -O3 -msse2 -mpreferred-stack-boundary=4" } */ + +float +foo (float f) +{ + float array[128]; + float x; + int i; + for (i = 0; i < sizeof(array) / sizeof(*array); i++) + array[i] = f; + for (i = 0; i < sizeof(array) / sizeof(*array); i++) + x += array[i]; + return x; +} + +/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/incoming-9.c b/gcc/testsuite/gcc.target/i386/incoming-9.c new file mode 100644 index 0000000..e43cbd6 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/incoming-9.c @@ -0,0 +1,18 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -mstackrealign -O3 -mno-sse -mpreferred-stack-boundary=4" } */ + +float +foo (float f) +{ + float array[128]; + float x; + int i; + for (i = 0; i < sizeof(array) / sizeof(*array); i++) + array[i] = f; + for (i = 0; i < sizeof(array) / sizeof(*array); i++) + x += array[i]; + return x; +} + +/* { dg-final { scan-assembler-not "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c index c3570d3..0daaa70 100644 --- a/gcc/tree-vect-data-refs.c +++ b/gcc/tree-vect-data-refs.c @@ -927,6 +927,8 @@ vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo) { gimple stmt = DR_STMT (dr); stmt_vec_info stmt_info = vinfo_for_stmt (stmt); + enum machine_mode mode; + unsigned int align; /* For interleaving, only the alignment of the first access matters. */ if (STMT_VINFO_STRIDED_ACCESS (stmt_info) @@ -947,6 +949,12 @@ vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo) } return false; } + + mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info)); + align = GET_MODE_ALIGNMENT (mode); + if (cfun->hard_stack_alignment < align) + cfun->hard_stack_alignment = align; + if (supportable_dr_alignment != dr_aligned && vect_print_dump_info (REPORT_ALIGNMENT)) fprintf (vect_dump, "Vectorizing an unaligned access."); ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-10-16 20:27 ` H.J. Lu @ 2009-10-17 1:03 ` Ian Lance Taylor 2009-10-17 18:22 ` H.J. Lu 2009-10-17 7:09 ` Uros Bizjak 1 sibling, 1 reply; 59+ messages in thread From: Ian Lance Taylor @ 2009-10-17 1:03 UTC (permalink / raw) To: H.J. Lu; +Cc: Uros Bizjak, gcc-patches, Jakub Jelinek "H.J. Lu" <hjl.tools@gmail.com> writes: > @@ -947,6 +949,12 @@ vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo) > } > return false; > } > + > + mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info)); > + align = GET_MODE_ALIGNMENT (mode); > + if (cfun->hard_stack_alignment < align) > + cfun->hard_stack_alignment = align; > + > if (supportable_dr_alignment != dr_aligned > && vect_print_dump_info (REPORT_ALIGNMENT)) > fprintf (vect_dump, "Vectorizing an unaligned access."); I do not understand why this is the right place to set your new cfun field. Your patch is about a misaligned stack. The code you are changing here does not appear to have anything to do with a misaligned stack. This code is about the correct alignment of data. The data may be on the stack but it need not be. If the data is on the stack, then the compiler should be able to determine the alignment available on the stack. Ian ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-10-17 1:03 ` Ian Lance Taylor @ 2009-10-17 18:22 ` H.J. Lu 2009-10-17 19:02 ` Richard Guenther 0 siblings, 1 reply; 59+ messages in thread From: H.J. Lu @ 2009-10-17 18:22 UTC (permalink / raw) To: Ian Lance Taylor; +Cc: Uros Bizjak, gcc-patches, Jakub Jelinek On Fri, Oct 16, 2009 at 5:59 PM, Ian Lance Taylor <iant@google.com> wrote: > "H.J. Lu" <hjl.tools@gmail.com> writes: > >> @@ -947,6 +949,12 @@ vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo) >> } >> return false; >> } >> + >> + mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info)); >> + align = GET_MODE_ALIGNMENT (mode); >> + if (cfun->hard_stack_alignment < align) >> + cfun->hard_stack_alignment = align; >> + >> if (supportable_dr_alignment != dr_aligned >> && vect_print_dump_info (REPORT_ALIGNMENT)) >> fprintf (vect_dump, "Vectorizing an unaligned access."); > > I do not understand why this is the right place to set your new cfun > field. Your patch is about a misaligned stack. The code you are > changing here does not appear to have anything to do with a misaligned > stack. This code is about the correct alignment of data. The data > may be on the stack but it need not be. If the data is on the stack, > then the compiler should be able to determine the alignment available > on the stack. The issue here is to set proper incoming stack alignment. We have to do it before RTL expansion which uses this information to check if stack alignment is needed. To do that, we need to know what the minimum stack alignment required by hardware. We collect hard stack alignment when we put variables on stack and check the alignment generated by vectorizer. -- H.J. ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-10-17 18:22 ` H.J. Lu @ 2009-10-17 19:02 ` Richard Guenther 2009-10-17 19:21 ` H.J. Lu 0 siblings, 1 reply; 59+ messages in thread From: Richard Guenther @ 2009-10-17 19:02 UTC (permalink / raw) To: H.J. Lu; +Cc: Ian Lance Taylor, Uros Bizjak, gcc-patches, Jakub Jelinek On Sat, Oct 17, 2009 at 8:14 PM, H.J. Lu <hjl.tools@gmail.com> wrote: > On Fri, Oct 16, 2009 at 5:59 PM, Ian Lance Taylor <iant@google.com> wrote: >> "H.J. Lu" <hjl.tools@gmail.com> writes: >> >>> @@ -947,6 +949,12 @@ vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo) >>> } >>> return false; >>> } >>> + >>> + mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info)); >>> + align = GET_MODE_ALIGNMENT (mode); >>> + if (cfun->hard_stack_alignment < align) >>> + cfun->hard_stack_alignment = align; >>> + >>> if (supportable_dr_alignment != dr_aligned >>> && vect_print_dump_info (REPORT_ALIGNMENT)) >>> fprintf (vect_dump, "Vectorizing an unaligned access."); >> >> I do not understand why this is the right place to set your new cfun >> field. Your patch is about a misaligned stack. The code you are >> changing here does not appear to have anything to do with a misaligned >> stack. This code is about the correct alignment of data. The data >> may be on the stack but it need not be. If the data is on the stack, >> then the compiler should be able to determine the alignment available >> on the stack. > > The issue here is to set proper incoming stack alignment. We have > to do it before RTL expansion which uses this information to check > if stack alignment is needed. To do that, we need to know what the > minimum stack alignment required by hardware. We collect hard > stack alignment when we put variables on stack and check the alignment > generated by vectorizer. It's surely not the right function to do this. Nor is it a proper interface IMHO. Why don't you need to check whether the access is to stack memory at all? I think you want to instead see if any user-alignment on automatic variables requires stack realignment. Thus, wherever the vectorizer sets DECL_USER_ALIGN on automatic storage. Richard. ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-10-17 19:02 ` Richard Guenther @ 2009-10-17 19:21 ` H.J. Lu 2009-10-17 19:29 ` Richard Guenther 0 siblings, 1 reply; 59+ messages in thread From: H.J. Lu @ 2009-10-17 19:21 UTC (permalink / raw) To: Richard Guenther Cc: Ian Lance Taylor, Uros Bizjak, gcc-patches, Jakub Jelinek On Sat, Oct 17, 2009 at 11:51 AM, Richard Guenther <richard.guenther@gmail.com> wrote: > On Sat, Oct 17, 2009 at 8:14 PM, H.J. Lu <hjl.tools@gmail.com> wrote: >> On Fri, Oct 16, 2009 at 5:59 PM, Ian Lance Taylor <iant@google.com> wrote: >>> "H.J. Lu" <hjl.tools@gmail.com> writes: >>> >>>> @@ -947,6 +949,12 @@ vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo) >>>> } >>>> return false; >>>> } >>>> + >>>> + mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info)); >>>> + align = GET_MODE_ALIGNMENT (mode); >>>> + if (cfun->hard_stack_alignment < align) >>>> + cfun->hard_stack_alignment = align; >>>> + >>>> if (supportable_dr_alignment != dr_aligned >>>> && vect_print_dump_info (REPORT_ALIGNMENT)) >>>> fprintf (vect_dump, "Vectorizing an unaligned access."); >>> >>> I do not understand why this is the right place to set your new cfun >>> field. Your patch is about a misaligned stack. The code you are >>> changing here does not appear to have anything to do with a misaligned >>> stack. This code is about the correct alignment of data. The data >>> may be on the stack but it need not be. If the data is on the stack, >>> then the compiler should be able to determine the alignment available >>> on the stack. >> >> The issue here is to set proper incoming stack alignment. We have >> to do it before RTL expansion which uses this information to check >> if stack alignment is needed. To do that, we need to know what the >> minimum stack alignment required by hardware. We collect hard >> stack alignment when we put variables on stack and check the alignment >> generated by vectorizer. > > It's surely not the right function to do this. Nor is it a proper > interface IMHO. > Why don't you need to check whether the access is to stack memory at all? > > I think you want to instead see if any user-alignment on automatic > variables requires stack realignment. Thus, wherever the vectorizer > sets DECL_USER_ALIGN on automatic storage. > RTL expansion may generate automatic variables from vectorizer statements while vectorizer doesn't generate any automatic variables at all. How should I handle this case? -- H.J. ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-10-17 19:21 ` H.J. Lu @ 2009-10-17 19:29 ` Richard Guenther 2009-10-17 19:35 ` H.J. Lu 0 siblings, 1 reply; 59+ messages in thread From: Richard Guenther @ 2009-10-17 19:29 UTC (permalink / raw) To: H.J. Lu; +Cc: Ian Lance Taylor, Uros Bizjak, gcc-patches, Jakub Jelinek On Sat, Oct 17, 2009 at 9:02 PM, H.J. Lu <hjl.tools@gmail.com> wrote: > On Sat, Oct 17, 2009 at 11:51 AM, Richard Guenther > <richard.guenther@gmail.com> wrote: >> On Sat, Oct 17, 2009 at 8:14 PM, H.J. Lu <hjl.tools@gmail.com> wrote: >>> On Fri, Oct 16, 2009 at 5:59 PM, Ian Lance Taylor <iant@google.com> wrote: >>>> "H.J. Lu" <hjl.tools@gmail.com> writes: >>>> >>>>> @@ -947,6 +949,12 @@ vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo) >>>>> } >>>>> return false; >>>>> } >>>>> + >>>>> + mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info)); >>>>> + align = GET_MODE_ALIGNMENT (mode); >>>>> + if (cfun->hard_stack_alignment < align) >>>>> + cfun->hard_stack_alignment = align; >>>>> + >>>>> if (supportable_dr_alignment != dr_aligned >>>>> && vect_print_dump_info (REPORT_ALIGNMENT)) >>>>> fprintf (vect_dump, "Vectorizing an unaligned access."); >>>> >>>> I do not understand why this is the right place to set your new cfun >>>> field. Your patch is about a misaligned stack. The code you are >>>> changing here does not appear to have anything to do with a misaligned >>>> stack. This code is about the correct alignment of data. The data >>>> may be on the stack but it need not be. If the data is on the stack, >>>> then the compiler should be able to determine the alignment available >>>> on the stack. >>> >>> The issue here is to set proper incoming stack alignment. We have >>> to do it before RTL expansion which uses this information to check >>> if stack alignment is needed. To do that, we need to know what the >>> minimum stack alignment required by hardware. We collect hard >>> stack alignment when we put variables on stack and check the alignment >>> generated by vectorizer. >> >> It's surely not the right function to do this. Nor is it a proper >> interface IMHO. >> Why don't you need to check whether the access is to stack memory at all? >> >> I think you want to instead see if any user-alignment on automatic >> variables requires stack realignment. Thus, wherever the vectorizer >> sets DECL_USER_ALIGN on automatic storage. >> > > RTL expansion may generate automatic variables from vectorizer > statements while vectorizer doesn't generate any automatic variables > at all. How should I handle this case? The same way you would handle spills or you would handle user code using intrinsics. I don't see how the vectorizer is special. Richard. > > -- > H.J. > ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-10-17 19:29 ` Richard Guenther @ 2009-10-17 19:35 ` H.J. Lu 2009-10-17 19:46 ` Richard Guenther 0 siblings, 1 reply; 59+ messages in thread From: H.J. Lu @ 2009-10-17 19:35 UTC (permalink / raw) To: Richard Guenther Cc: Ian Lance Taylor, Uros Bizjak, gcc-patches, Jakub Jelinek On Sat, Oct 17, 2009 at 12:20 PM, Richard Guenther <richard.guenther@gmail.com> wrote: > On Sat, Oct 17, 2009 at 9:02 PM, H.J. Lu <hjl.tools@gmail.com> wrote: >> On Sat, Oct 17, 2009 at 11:51 AM, Richard Guenther >> <richard.guenther@gmail.com> wrote: >>> On Sat, Oct 17, 2009 at 8:14 PM, H.J. Lu <hjl.tools@gmail.com> wrote: >>>> On Fri, Oct 16, 2009 at 5:59 PM, Ian Lance Taylor <iant@google.com> wrote: >>>>> "H.J. Lu" <hjl.tools@gmail.com> writes: >>>>> >>>>>> @@ -947,6 +949,12 @@ vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo) >>>>>> } >>>>>> return false; >>>>>> } >>>>>> + >>>>>> + mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info)); >>>>>> + align = GET_MODE_ALIGNMENT (mode); >>>>>> + if (cfun->hard_stack_alignment < align) >>>>>> + cfun->hard_stack_alignment = align; >>>>>> + >>>>>> if (supportable_dr_alignment != dr_aligned >>>>>> && vect_print_dump_info (REPORT_ALIGNMENT)) >>>>>> fprintf (vect_dump, "Vectorizing an unaligned access."); >>>>> >>>>> I do not understand why this is the right place to set your new cfun >>>>> field. Your patch is about a misaligned stack. The code you are >>>>> changing here does not appear to have anything to do with a misaligned >>>>> stack. This code is about the correct alignment of data. The data >>>>> may be on the stack but it need not be. If the data is on the stack, >>>>> then the compiler should be able to determine the alignment available >>>>> on the stack. >>>> >>>> The issue here is to set proper incoming stack alignment. We have >>>> to do it before RTL expansion which uses this information to check >>>> if stack alignment is needed. To do that, we need to know what the >>>> minimum stack alignment required by hardware. We collect hard >>>> stack alignment when we put variables on stack and check the alignment >>>> generated by vectorizer. >>> >>> It's surely not the right function to do this. Nor is it a proper >>> interface IMHO. >>> Why don't you need to check whether the access is to stack memory at all? >>> >>> I think you want to instead see if any user-alignment on automatic >>> variables requires stack realignment. Thus, wherever the vectorizer >>> sets DECL_USER_ALIGN on automatic storage. >>> >> >> RTL expansion may generate automatic variables from vectorizer >> statements while vectorizer doesn't generate any automatic variables >> at all. How should I handle this case? > > The same way you would handle spills or you would handle user code > using intrinsics. I don't see how the vectorizer is special. > For --- void g(); int p[100]; int q[100]; void f() { int i; for (i = 0; i < 100; i++) p[i] = 0; g(); for (i = 0; i < 100; i++) q[i] = 0; } --- Vectorizor sets DECL_USER_ALIGN on p and g. But it may not generate any automatic variables in function f. How do I know function f needs stack alignment before RTL expansion? -- H.J. ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-10-17 19:35 ` H.J. Lu @ 2009-10-17 19:46 ` Richard Guenther 2009-10-17 20:01 ` H.J. Lu 0 siblings, 1 reply; 59+ messages in thread From: Richard Guenther @ 2009-10-17 19:46 UTC (permalink / raw) To: H.J. Lu; +Cc: Ian Lance Taylor, Uros Bizjak, gcc-patches, Jakub Jelinek On Sat, Oct 17, 2009 at 9:29 PM, H.J. Lu <hjl.tools@gmail.com> wrote: > On Sat, Oct 17, 2009 at 12:20 PM, Richard Guenther > <richard.guenther@gmail.com> wrote: >> On Sat, Oct 17, 2009 at 9:02 PM, H.J. Lu <hjl.tools@gmail.com> wrote: >>> On Sat, Oct 17, 2009 at 11:51 AM, Richard Guenther >>> <richard.guenther@gmail.com> wrote: >>>> On Sat, Oct 17, 2009 at 8:14 PM, H.J. Lu <hjl.tools@gmail.com> wrote: >>>>> On Fri, Oct 16, 2009 at 5:59 PM, Ian Lance Taylor <iant@google.com> wrote: >>>>>> "H.J. Lu" <hjl.tools@gmail.com> writes: >>>>>> >>>>>>> @@ -947,6 +949,12 @@ vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo) >>>>>>> } >>>>>>> return false; >>>>>>> } >>>>>>> + >>>>>>> + mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info)); >>>>>>> + align = GET_MODE_ALIGNMENT (mode); >>>>>>> + if (cfun->hard_stack_alignment < align) >>>>>>> + cfun->hard_stack_alignment = align; >>>>>>> + >>>>>>> if (supportable_dr_alignment != dr_aligned >>>>>>> && vect_print_dump_info (REPORT_ALIGNMENT)) >>>>>>> fprintf (vect_dump, "Vectorizing an unaligned access."); >>>>>> >>>>>> I do not understand why this is the right place to set your new cfun >>>>>> field. Your patch is about a misaligned stack. The code you are >>>>>> changing here does not appear to have anything to do with a misaligned >>>>>> stack. This code is about the correct alignment of data. The data >>>>>> may be on the stack but it need not be. If the data is on the stack, >>>>>> then the compiler should be able to determine the alignment available >>>>>> on the stack. >>>>> >>>>> The issue here is to set proper incoming stack alignment. We have >>>>> to do it before RTL expansion which uses this information to check >>>>> if stack alignment is needed. To do that, we need to know what the >>>>> minimum stack alignment required by hardware. We collect hard >>>>> stack alignment when we put variables on stack and check the alignment >>>>> generated by vectorizer. >>>> >>>> It's surely not the right function to do this. Nor is it a proper >>>> interface IMHO. >>>> Why don't you need to check whether the access is to stack memory at all? >>>> >>>> I think you want to instead see if any user-alignment on automatic >>>> variables requires stack realignment. Thus, wherever the vectorizer >>>> sets DECL_USER_ALIGN on automatic storage. >>>> >>> >>> RTL expansion may generate automatic variables from vectorizer >>> statements while vectorizer doesn't generate any automatic variables >>> at all. How should I handle this case? >> >> The same way you would handle spills or you would handle user code >> using intrinsics. I don't see how the vectorizer is special. >> > > For > > --- > void g(); > > int p[100]; > int q[100]; > > void f() > { > int i; > for (i = 0; i < 100; i++) p[i] = 0; > g(); > for (i = 0; i < 100; i++) q[i] = 0; > } > --- > > Vectorizor sets DECL_USER_ALIGN on p and g. But it may not generate > any automatic variables in function f. Right. What matters is if the vectorizer sets that on automatic variables of course. > How do I know function f > needs stack alignment before RTL expansion? Well, you simply decide. Then you have to cope with the consequences of the decision - properly set the alignment of any stack temporaries (or spill slots) you generate. Richard. ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-10-17 19:46 ` Richard Guenther @ 2009-10-17 20:01 ` H.J. Lu 2009-10-17 20:59 ` Richard Guenther 0 siblings, 1 reply; 59+ messages in thread From: H.J. Lu @ 2009-10-17 20:01 UTC (permalink / raw) To: Richard Guenther Cc: Ian Lance Taylor, Uros Bizjak, gcc-patches, Jakub Jelinek On Sat, Oct 17, 2009 at 12:35 PM, Richard Guenther <richard.guenther@gmail.com> wrote: > On Sat, Oct 17, 2009 at 9:29 PM, H.J. Lu <hjl.tools@gmail.com> wrote: >> On Sat, Oct 17, 2009 at 12:20 PM, Richard Guenther >> <richard.guenther@gmail.com> wrote: >>> On Sat, Oct 17, 2009 at 9:02 PM, H.J. Lu <hjl.tools@gmail.com> wrote: >>>> On Sat, Oct 17, 2009 at 11:51 AM, Richard Guenther >>>> <richard.guenther@gmail.com> wrote: >>>>> On Sat, Oct 17, 2009 at 8:14 PM, H.J. Lu <hjl.tools@gmail.com> wrote: >>>>>> On Fri, Oct 16, 2009 at 5:59 PM, Ian Lance Taylor <iant@google.com> wrote: >>>>>>> "H.J. Lu" <hjl.tools@gmail.com> writes: >>>>>>> >>>>>>>> @@ -947,6 +949,12 @@ vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo) >>>>>>>> } >>>>>>>> return false; >>>>>>>> } >>>>>>>> + >>>>>>>> + mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info)); >>>>>>>> + align = GET_MODE_ALIGNMENT (mode); >>>>>>>> + if (cfun->hard_stack_alignment < align) >>>>>>>> + cfun->hard_stack_alignment = align; >>>>>>>> + >>>>>>>> if (supportable_dr_alignment != dr_aligned >>>>>>>> && vect_print_dump_info (REPORT_ALIGNMENT)) >>>>>>>> fprintf (vect_dump, "Vectorizing an unaligned access."); >>>>>>> >>>>>>> I do not understand why this is the right place to set your new cfun >>>>>>> field. Your patch is about a misaligned stack. The code you are >>>>>>> changing here does not appear to have anything to do with a misaligned >>>>>>> stack. This code is about the correct alignment of data. The data >>>>>>> may be on the stack but it need not be. If the data is on the stack, >>>>>>> then the compiler should be able to determine the alignment available >>>>>>> on the stack. >>>>>> >>>>>> The issue here is to set proper incoming stack alignment. We have >>>>>> to do it before RTL expansion which uses this information to check >>>>>> if stack alignment is needed. To do that, we need to know what the >>>>>> minimum stack alignment required by hardware. We collect hard >>>>>> stack alignment when we put variables on stack and check the alignment >>>>>> generated by vectorizer. >>>>> >>>>> It's surely not the right function to do this. Nor is it a proper >>>>> interface IMHO. >>>>> Why don't you need to check whether the access is to stack memory at all? >>>>> >>>>> I think you want to instead see if any user-alignment on automatic >>>>> variables requires stack realignment. Thus, wherever the vectorizer >>>>> sets DECL_USER_ALIGN on automatic storage. >>>>> >>>> >>>> RTL expansion may generate automatic variables from vectorizer >>>> statements while vectorizer doesn't generate any automatic variables >>>> at all. How should I handle this case? >>> >>> The same way you would handle spills or you would handle user code >>> using intrinsics. I don't see how the vectorizer is special. >>> >> >> For >> >> --- >> void g(); >> >> int p[100]; >> int q[100]; >> >> void f() >> { >> int i; >> for (i = 0; i < 100; i++) p[i] = 0; >> g(); >> for (i = 0; i < 100; i++) q[i] = 0; >> } >> --- >> >> Vectorizor sets DECL_USER_ALIGN on p and g. But it may not generate >> any automatic variables in function f. > > Right. What matters is if the vectorizer sets that on automatic > variables of course. The problem I am trying to deal with is vectorizer does not generate any automatic variables. Instead, it generates vector statements on static array directly. RTL expansion will generate automatic variables from those vector statements, which is too late. I can't rely on automatic variables for vectorizer when I need the information before RTL expansion. -- H.J. ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-10-17 20:01 ` H.J. Lu @ 2009-10-17 20:59 ` Richard Guenther 2009-10-18 19:21 ` Michael Matz 0 siblings, 1 reply; 59+ messages in thread From: Richard Guenther @ 2009-10-17 20:59 UTC (permalink / raw) To: H.J. Lu; +Cc: Ian Lance Taylor, Uros Bizjak, gcc-patches, Jakub Jelinek On Sat, Oct 17, 2009 at 9:46 PM, H.J. Lu <hjl.tools@gmail.com> wrote: > On Sat, Oct 17, 2009 at 12:35 PM, Richard Guenther > <richard.guenther@gmail.com> wrote: >> On Sat, Oct 17, 2009 at 9:29 PM, H.J. Lu <hjl.tools@gmail.com> wrote: >>> On Sat, Oct 17, 2009 at 12:20 PM, Richard Guenther >>> <richard.guenther@gmail.com> wrote: >>>> On Sat, Oct 17, 2009 at 9:02 PM, H.J. Lu <hjl.tools@gmail.com> wrote: >>>>> On Sat, Oct 17, 2009 at 11:51 AM, Richard Guenther >>>>> <richard.guenther@gmail.com> wrote: >>>>>> On Sat, Oct 17, 2009 at 8:14 PM, H.J. Lu <hjl.tools@gmail.com> wrote: >>>>>>> On Fri, Oct 16, 2009 at 5:59 PM, Ian Lance Taylor <iant@google.com> wrote: >>>>>>>> "H.J. Lu" <hjl.tools@gmail.com> writes: >>>>>>>> >>>>>>>>> @@ -947,6 +949,12 @@ vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo) >>>>>>>>> } >>>>>>>>> return false; >>>>>>>>> } >>>>>>>>> + >>>>>>>>> + mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info)); >>>>>>>>> + align = GET_MODE_ALIGNMENT (mode); >>>>>>>>> + if (cfun->hard_stack_alignment < align) >>>>>>>>> + cfun->hard_stack_alignment = align; >>>>>>>>> + >>>>>>>>> if (supportable_dr_alignment != dr_aligned >>>>>>>>> && vect_print_dump_info (REPORT_ALIGNMENT)) >>>>>>>>> fprintf (vect_dump, "Vectorizing an unaligned access."); >>>>>>>> >>>>>>>> I do not understand why this is the right place to set your new cfun >>>>>>>> field. Your patch is about a misaligned stack. The code you are >>>>>>>> changing here does not appear to have anything to do with a misaligned >>>>>>>> stack. This code is about the correct alignment of data. The data >>>>>>>> may be on the stack but it need not be. If the data is on the stack, >>>>>>>> then the compiler should be able to determine the alignment available >>>>>>>> on the stack. >>>>>>> >>>>>>> The issue here is to set proper incoming stack alignment. We have >>>>>>> to do it before RTL expansion which uses this information to check >>>>>>> if stack alignment is needed. To do that, we need to know what the >>>>>>> minimum stack alignment required by hardware. We collect hard >>>>>>> stack alignment when we put variables on stack and check the alignment >>>>>>> generated by vectorizer. >>>>>> >>>>>> It's surely not the right function to do this. Nor is it a proper >>>>>> interface IMHO. >>>>>> Why don't you need to check whether the access is to stack memory at all? >>>>>> >>>>>> I think you want to instead see if any user-alignment on automatic >>>>>> variables requires stack realignment. Thus, wherever the vectorizer >>>>>> sets DECL_USER_ALIGN on automatic storage. >>>>>> >>>>> >>>>> RTL expansion may generate automatic variables from vectorizer >>>>> statements while vectorizer doesn't generate any automatic variables >>>>> at all. How should I handle this case? >>>> >>>> The same way you would handle spills or you would handle user code >>>> using intrinsics. I don't see how the vectorizer is special. >>>> >>> >>> For >>> >>> --- >>> void g(); >>> >>> int p[100]; >>> int q[100]; >>> >>> void f() >>> { >>> int i; >>> for (i = 0; i < 100; i++) p[i] = 0; >>> g(); >>> for (i = 0; i < 100; i++) q[i] = 0; >>> } >>> --- >>> >>> Vectorizor sets DECL_USER_ALIGN on p and g. But it may not generate >>> any automatic variables in function f. >> >> Right. What matters is if the vectorizer sets that on automatic >> variables of course. > > The problem I am trying to deal with is vectorizer does not > generate any automatic variables. Instead, it generates vector > statements on static array directly. RTL expansion will generate > automatic variables from those vector statements, which is too late. > I can't rely on automatic variables for vectorizer when I need > the information before RTL expansion. I don't see how this is too late (I also don't see where expand creates automatic variables, but well ... I can imagine reload creating spill slots). If non-aligned automatic variables are generated you simply use unaligned moves - what's the problem? Richard. ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-10-17 20:59 ` Richard Guenther @ 2009-10-18 19:21 ` Michael Matz 2009-10-18 19:45 ` Richard Guenther 2009-10-19 16:38 ` H.J. Lu 0 siblings, 2 replies; 59+ messages in thread From: Michael Matz @ 2009-10-18 19:21 UTC (permalink / raw) To: Richard Guenther Cc: H.J. Lu, Ian Lance Taylor, Uros Bizjak, gcc-patches, Jakub Jelinek Hi, On Sat, 17 Oct 2009, Richard Guenther wrote: > > I can't rely on automatic variables for vectorizer when I need > > the information before RTL expansion. > > I don't see how this is too late (I also don't see where expand creates > automatic variables, assign_temp/assign_stack_temp, all over. > but well ... I can imagine reload creating spill slots). If non-aligned > automatic variables are generated you simply use unaligned moves - > what's the problem? It's obvious: we don't want to generate unaligned moves. That's the whole point of H.J. patches. He has a phase ordering problem: (a) alignment of generated temporaries depends on known stack alignment (b) known stack alignment depends on what is put on stack (including late generated temporaries) He tries to solve this by pessimistically assuming that potentially everything imaginable could go on stack. What I don't understand is why we don't instead track hard_stack_alignment in assign_*_temp (where we then assume that the stack will be aligned perfectly), and expand stack realignment code _after_ having expanded everything else (plus examined local variables for the possibility of generating spill slots). Ciao, Michael. ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-10-18 19:21 ` Michael Matz @ 2009-10-18 19:45 ` Richard Guenther 2009-10-19 16:36 ` H.J. Lu 2009-10-19 16:38 ` H.J. Lu 1 sibling, 1 reply; 59+ messages in thread From: Richard Guenther @ 2009-10-18 19:45 UTC (permalink / raw) To: Michael Matz Cc: H.J. Lu, Ian Lance Taylor, Uros Bizjak, gcc-patches, Jakub Jelinek On Sun, Oct 18, 2009 at 9:19 PM, Michael Matz <matz@suse.de> wrote: > Hi, > > On Sat, 17 Oct 2009, Richard Guenther wrote: > >> > I can't rely on automatic variables for vectorizer when I need >> > the information before RTL expansion. >> >> I don't see how this is too late (I also don't see where expand creates >> automatic variables, > > assign_temp/assign_stack_temp, all over. > >> but well ... I can imagine reload creating spill slots). If non-aligned >> automatic variables are generated you simply use unaligned moves - >> what's the problem? > > It's obvious: we don't want to generate unaligned moves. That's the whole > point of H.J. patches. He has a phase ordering problem: > (a) alignment of generated temporaries depends on known stack alignment > (b) known stack alignment depends on what is put on stack (including late > generated temporaries) > > He tries to solve this by pessimistically assuming that potentially > everything imaginable could go on stack. What I don't understand is why > we don't instead track hard_stack_alignment in assign_*_temp (where we > then assume that the stack will be aligned perfectly), and expand stack > realignment code _after_ having expanded everything else (plus examined > local variables for the possibility of generating spill slots). I also don't understand how this problem can be solved in the vectorizer and how this problem cannot occur the same way when using intrinsics. Richard. ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-10-18 19:45 ` Richard Guenther @ 2009-10-19 16:36 ` H.J. Lu 2009-10-20 1:12 ` Michael Matz 0 siblings, 1 reply; 59+ messages in thread From: H.J. Lu @ 2009-10-19 16:36 UTC (permalink / raw) To: Richard Guenther Cc: Michael Matz, Ian Lance Taylor, Uros Bizjak, gcc-patches, Jakub Jelinek On Sun, Oct 18, 2009 at 12:44 PM, Richard Guenther <richard.guenther@gmail.com> wrote: > On Sun, Oct 18, 2009 at 9:19 PM, Michael Matz <matz@suse.de> wrote: >> Hi, >> >> On Sat, 17 Oct 2009, Richard Guenther wrote: >> >>> > I can't rely on automatic variables for vectorizer when I need >>> > the information before RTL expansion. >>> >>> I don't see how this is too late (I also don't see where expand creates >>> automatic variables, >> >> assign_temp/assign_stack_temp, all over. >> >>> but well ... I can imagine reload creating spill slots). If non-aligned >>> automatic variables are generated you simply use unaligned moves - >>> what's the problem? >> >> It's obvious: we don't want to generate unaligned moves. That's the whole >> point of H.J. patches. He has a phase ordering problem: >> (a) alignment of generated temporaries depends on known stack alignment >> (b) known stack alignment depends on what is put on stack (including late >> generated temporaries) >> >> He tries to solve this by pessimistically assuming that potentially >> everything imaginable could go on stack. What I don't understand is why >> we don't instead track hard_stack_alignment in assign_*_temp (where we >> then assume that the stack will be aligned perfectly), and expand stack >> realignment code _after_ having expanded everything else (plus examined >> local variables for the possibility of generating spill slots). > > I also don't understand how this problem can be solved in the vectorizer > and how this problem cannot occur the same way when using > intrinsics. > The issues are 1. The incoming stack alignment can't be changed after RTL expansion starts. 2. When -mstackrealign is used, we want to use 4 byte incoming stack alignment on functions which use SSE vector insns. 3. SSE vector insns may be generated by intrinsics, vectorizer and vector operations. Both intrinsics and and vector operations will always use automatic variables, which are handled by ix86_minimum_alignment. Since vectorizer may not always use automatic variables, I modified vectorizer to update hard_stack_alignment when vector statements are generated. Thanks. -- H.J. ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-10-19 16:36 ` H.J. Lu @ 2009-10-20 1:12 ` Michael Matz 2009-10-20 19:10 ` H.J. Lu 0 siblings, 1 reply; 59+ messages in thread From: Michael Matz @ 2009-10-20 1:12 UTC (permalink / raw) To: H.J. Lu Cc: Richard Guenther, Ian Lance Taylor, Uros Bizjak, gcc-patches, Jakub Jelinek Hi, On Mon, 19 Oct 2009, H.J. Lu wrote: > The issues are > > 1. The incoming stack alignment can't be changed after RTL expansion > starts. And that's the thing. Just move the prologue expansion to after the body expansion (see cfgexpand.c) and you should be set. No doubt you'll hit some ugly obstacles, but that should be the way forward. > 2. When -mstackrealign is used, we want to use 4 byte incoming stack > alignment on functions which use SSE vector insns. > 3. SSE vector insns may be generated by intrinsics, vectorizer and > vector operations. All of these will be automatically taken care off when (1) is solved. I really think it would be worth it. Fiddling with stack alignment in the vectorizer (though that's the first alignment requirers you'll hit) is not going to fly in the long run. Ciao, Michael. ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-10-20 1:12 ` Michael Matz @ 2009-10-20 19:10 ` H.J. Lu 0 siblings, 0 replies; 59+ messages in thread From: H.J. Lu @ 2009-10-20 19:10 UTC (permalink / raw) To: Michael Matz Cc: Richard Guenther, Ian Lance Taylor, Uros Bizjak, gcc-patches, Jakub Jelinek On Mon, Oct 19, 2009 at 5:48 PM, Michael Matz <matz@suse.de> wrote: > Hi, > > On Mon, 19 Oct 2009, H.J. Lu wrote: > >> The issues are >> >> 1. The incoming stack alignment can't be changed after RTL expansion >> starts. > > And that's the thing. Just move the prologue expansion to after the body > expansion (see cfgexpand.c) and you should be set. No doubt you'll hit > some ugly obstacles, but that should be the way forward. Currently, pass_thread_prologue_and_epilogue is processed well after the body expansion, IRA and some reload passes. I am not familiar with middle-end to move it that far. >> 2. When -mstackrealign is used, we want to use 4 byte incoming stack >> alignment on functions which use SSE vector insns. >> 3. SSE vector insns may be generated by intrinsics, vectorizer and >> vector operations. > > All of these will be automatically taken care off when (1) is solved. I > really think it would be worth it. Fiddling with stack alignment in the > vectorizer (though that's the first alignment requirers you'll hit) is not > going to fly in the long run. > The whole thing is to deal with a quirk in gcc implementation of i386 psABI when -mstackrealign is used. Sure, we may have to deal some more cases. But I don't expect any long term issue. -- H.J. ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-10-18 19:21 ` Michael Matz 2009-10-18 19:45 ` Richard Guenther @ 2009-10-19 16:38 ` H.J. Lu 2009-10-19 17:08 ` Ian Lance Taylor 2009-10-20 1:53 ` Michael Matz 1 sibling, 2 replies; 59+ messages in thread From: H.J. Lu @ 2009-10-19 16:38 UTC (permalink / raw) To: Michael Matz Cc: Richard Guenther, Ian Lance Taylor, Uros Bizjak, gcc-patches, Jakub Jelinek On Sun, Oct 18, 2009 at 12:19 PM, Michael Matz <matz@suse.de> wrote: > Hi, > > On Sat, 17 Oct 2009, Richard Guenther wrote: > >> > I can't rely on automatic variables for vectorizer when I need >> > the information before RTL expansion. >> >> I don't see how this is too late (I also don't see where expand creates >> automatic variables, > > assign_temp/assign_stack_temp, all over. > >> but well ... I can imagine reload creating spill slots). If non-aligned >> automatic variables are generated you simply use unaligned moves - >> what's the problem? > > It's obvious: we don't want to generate unaligned moves. That's the whole > point of H.J. patches. He has a phase ordering problem: > (a) alignment of generated temporaries depends on known stack alignment > (b) known stack alignment depends on what is put on stack (including late > generated temporaries) > > He tries to solve this by pessimistically assuming that potentially > everything imaginable could go on stack. What I don't understand is why > we don't instead track hard_stack_alignment in assign_*_temp (where we > then assume that the stack will be aligned perfectly), and expand stack > realignment code _after_ having expanded everything else (plus examined > local variables for the possibility of generating spill slots). > Vectorizer may not call assign_*_temp at all. Instead, x86 backend may call gen_reg_rtx to generate pseudo registers when expanding vector statement. -- H.J. ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-10-19 16:38 ` H.J. Lu @ 2009-10-19 17:08 ` Ian Lance Taylor 2009-10-19 17:26 ` H.J. Lu 2009-10-20 1:53 ` Michael Matz 1 sibling, 1 reply; 59+ messages in thread From: Ian Lance Taylor @ 2009-10-19 17:08 UTC (permalink / raw) To: H.J. Lu Cc: Michael Matz, Richard Guenther, Uros Bizjak, gcc-patches, Jakub Jelinek "H.J. Lu" <hjl.tools@gmail.com> writes: > Vectorizer may not call assign_*_temp at all. Instead, x86 backend > may call gen_reg_rtx to generate pseudo registers when expanding > vector statement. It simply does not make sense to change the vectorizer, of all things, to set the stack alignment. Stack alignment depends upon the types/modes of automatic variables stored on the stack. Therefore, you should set the required stack alignment based on the creation of automatic variables. Ideally you would set the required stack alignment for automatic variables stored on the stack, but apparently you can't change the stack alignment after expand (though I don't see why not). So if you can't set the stack alignment based on automatic variables stored on the stack, then you should set it based on the creation of automatic variables. Ian ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-10-19 17:08 ` Ian Lance Taylor @ 2009-10-19 17:26 ` H.J. Lu 2009-10-19 17:33 ` Ian Lance Taylor 0 siblings, 1 reply; 59+ messages in thread From: H.J. Lu @ 2009-10-19 17:26 UTC (permalink / raw) To: Ian Lance Taylor Cc: Michael Matz, Richard Guenther, Uros Bizjak, gcc-patches, Jakub Jelinek On Mon, Oct 19, 2009 at 10:05 AM, Ian Lance Taylor <iant@google.com> wrote: > "H.J. Lu" <hjl.tools@gmail.com> writes: > >> Vectorizer may not call assign_*_temp at all. Instead, x86 backend >> may call gen_reg_rtx to generate pseudo registers when expanding >> vector statement. > > It simply does not make sense to change the vectorizer, of all things, > to set the stack alignment. Stack alignment depends upon the > types/modes of automatic variables stored on the stack. Therefore, > you should set the required stack alignment based on the creation of > automatic variables. Ideally you would set the required stack > alignment for automatic variables stored on the stack, but apparently > you can't change the stack alignment after expand (though I don't see > why not). So if you can't set the stack alignment based on automatic > variables stored on the stack, then you should set it based on the > creation of automatic variables. > It is about setting the incoming stack alignment, which has to be done before RTL expansion. Vectorizer may not use any automatic variables. But the RTL expander may generate pseudo vector registers based on vector statement, at which time, it is too late to go back to change incoming stack alignment. I only modified vectorizer to record what it does. I didn't change any statements generated by vectorizer. The x86 backend uses this information to set the incoming stack alignment before RTL expansion. -- H.J. ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-10-19 17:26 ` H.J. Lu @ 2009-10-19 17:33 ` Ian Lance Taylor 2009-10-19 17:46 ` H.J. Lu 0 siblings, 1 reply; 59+ messages in thread From: Ian Lance Taylor @ 2009-10-19 17:33 UTC (permalink / raw) To: H.J. Lu Cc: Michael Matz, Richard Guenther, Uros Bizjak, gcc-patches, Jakub Jelinek "H.J. Lu" <hjl.tools@gmail.com> writes: > On Mon, Oct 19, 2009 at 10:05 AM, Ian Lance Taylor <iant@google.com> wrote: >> "H.J. Lu" <hjl.tools@gmail.com> writes: >> >>> Vectorizer may not call assign_*_temp at all. Instead, x86 backend >>> may call gen_reg_rtx to generate pseudo registers when expanding >>> vector statement. >> >> It simply does not make sense to change the vectorizer, of all things, >> to set the stack alignment. Stack alignment depends upon the >> types/modes of automatic variables stored on the stack. Therefore, >> you should set the required stack alignment based on the creation of >> automatic variables. Ideally you would set the required stack >> alignment for automatic variables stored on the stack, but apparently >> you can't change the stack alignment after expand (though I don't see >> why not). So if you can't set the stack alignment based on automatic >> variables stored on the stack, then you should set it based on the >> creation of automatic variables. >> > > It is about setting the incoming stack alignment, which has to be > done before RTL expansion. Vectorizer may not use any automatic > variables. But the RTL expander may generate pseudo vector registers > based on vector statement, at which time, it is too late to go back to > change incoming stack alignment. I only modified vectorizer to > record what it does. I didn't change any statements generated by > vectorizer. The x86 backend uses this information to set the > incoming stack alignment before RTL expansion. If somebody asked you "where does gcc set the required stack alignment?" would you expect the answer to be "in the vectorizer code?" Is there any way we can fix incoming stack alignment so that it can be controlled by automatic stack variables? I don't understand why this is not done by the prologue and epilogue code. Ian ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-10-19 17:33 ` Ian Lance Taylor @ 2009-10-19 17:46 ` H.J. Lu 2009-10-19 17:55 ` Ian Lance Taylor 0 siblings, 1 reply; 59+ messages in thread From: H.J. Lu @ 2009-10-19 17:46 UTC (permalink / raw) To: Ian Lance Taylor Cc: Michael Matz, Richard Guenther, Uros Bizjak, gcc-patches, Jakub Jelinek On Mon, Oct 19, 2009 at 10:30 AM, Ian Lance Taylor <iant@google.com> wrote: > "H.J. Lu" <hjl.tools@gmail.com> writes: > >> On Mon, Oct 19, 2009 at 10:05 AM, Ian Lance Taylor <iant@google.com> wrote: >>> "H.J. Lu" <hjl.tools@gmail.com> writes: >>> >>>> Vectorizer may not call assign_*_temp at all. Instead, x86 backend >>>> may call gen_reg_rtx to generate pseudo registers when expanding >>>> vector statement. >>> >>> It simply does not make sense to change the vectorizer, of all things, >>> to set the stack alignment. Stack alignment depends upon the >>> types/modes of automatic variables stored on the stack. Therefore, >>> you should set the required stack alignment based on the creation of >>> automatic variables. Ideally you would set the required stack >>> alignment for automatic variables stored on the stack, but apparently >>> you can't change the stack alignment after expand (though I don't see >>> why not). So if you can't set the stack alignment based on automatic >>> variables stored on the stack, then you should set it based on the >>> creation of automatic variables. >>> >> >> It is about setting the incoming stack alignment, which has to be >> done before RTL expansion. Vectorizer may not use any automatic >> variables. But the RTL expander may generate pseudo vector registers >> based on vector statement, at which time, it is too late to go back to >> change incoming stack alignment. I only modified vectorizer to >> record what it does. I didn't change any statements generated by >> vectorizer. The x86 backend uses this information to set the >> incoming stack alignment before RTL expansion. > > If somebody asked you "where does gcc set the required stack > alignment?" would you expect the answer to be "in the vectorizer > code?" Yes, vectorizer may put requirement on stack alignment. But it is up to backend to decide and it isn't the only place gcc may require certain stack alignment. > Is there any way we can fix incoming stack alignment so that it can be > controlled by automatic stack variables? I don't understand why this The incoming stack alignment can be controlled by automatic stack variables. But it can't be controlled by pseudo registers. > is not done by the prologue and epilogue code. > Because the prologue and epilogue code is generated after RTL expansion starts? -- H.J. ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-10-19 17:46 ` H.J. Lu @ 2009-10-19 17:55 ` Ian Lance Taylor 2009-10-19 19:16 ` H.J. Lu 0 siblings, 1 reply; 59+ messages in thread From: Ian Lance Taylor @ 2009-10-19 17:55 UTC (permalink / raw) To: H.J. Lu Cc: Michael Matz, Richard Guenther, Uros Bizjak, gcc-patches, Jakub Jelinek "H.J. Lu" <hjl.tools@gmail.com> writes: >> If somebody asked you "where does gcc set the required stack >> alignment?" would you expect the answer to be "in the vectorizer >> code?" > > Yes, vectorizer may put requirement on stack alignment. But it is up > to backend to decide and it isn't the only place gcc may require > certain stack alignment. It's the only place your patch changes. >> Is there any way we can fix incoming stack alignment so that it can be >> controlled by automatic stack variables? I don't understand why this > > The incoming stack alignment can be controlled by automatic stack > variables. But it can't be controlled by pseudo registers. > >> is not done by the prologue and epilogue code. >> > > Because the prologue and epilogue code is generated after > RTL expansion starts? That just restates my question without answering it. Ian ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-10-19 17:55 ` Ian Lance Taylor @ 2009-10-19 19:16 ` H.J. Lu 2009-10-19 21:15 ` Ian Lance Taylor 2009-10-20 1:23 ` Michael Matz 0 siblings, 2 replies; 59+ messages in thread From: H.J. Lu @ 2009-10-19 19:16 UTC (permalink / raw) To: Ian Lance Taylor Cc: Michael Matz, Richard Guenther, Uros Bizjak, gcc-patches, Jakub Jelinek On Mon, Oct 19, 2009 at 10:47 AM, Ian Lance Taylor <iant@google.com> wrote: > "H.J. Lu" <hjl.tools@gmail.com> writes: > >>> If somebody asked you "where does gcc set the required stack >>> alignment?" would you expect the answer to be "in the vectorizer >>> code?" >> >> Yes, vectorizer may put requirement on stack alignment. But it is up >> to backend to decide and it isn't the only place gcc may require >> certain stack alignment. > > It's the only place your patch changes. I also changed ix86_minimum_alignment to handle automatic stack variables. > >>> Is there any way we can fix incoming stack alignment so that it can be >>> controlled by automatic stack variables? I don't understand why this >> >> The incoming stack alignment can be controlled by automatic stack >> variables. But it can't be controlled by pseudo registers. >> >>> is not done by the prologue and epilogue code. >>> >> >> Because the prologue and epilogue code is generated after >> RTL expansion starts? > > That just restates my question without answering it. > We don't "fix" the incoming stack alignment. We just need to know what value it will be. Normally the incoming stack alignment is a constant specified by psABI. I don't think we should change the way we do RTL expansion just to work around a quirk in gcc implementation of i386 psABI. -- H.J. ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-10-19 19:16 ` H.J. Lu @ 2009-10-19 21:15 ` Ian Lance Taylor 2009-10-20 19:00 ` H.J. Lu 2009-10-20 1:23 ` Michael Matz 1 sibling, 1 reply; 59+ messages in thread From: Ian Lance Taylor @ 2009-10-19 21:15 UTC (permalink / raw) To: H.J. Lu Cc: Michael Matz, Richard Guenther, Uros Bizjak, gcc-patches, Jakub Jelinek "H.J. Lu" <hjl.tools@gmail.com> writes: > On Mon, Oct 19, 2009 at 10:47 AM, Ian Lance Taylor <iant@google.com> wrote: >> "H.J. Lu" <hjl.tools@gmail.com> writes: >> >>>> If somebody asked you "where does gcc set the required stack >>>> alignment?" would you expect the answer to be "in the vectorizer >>>> code?" >>> >>> Yes, vectorizer may put requirement on stack alignment. But it is up >>> to backend to decide and it isn't the only place gcc may require >>> certain stack alignment. >> >> It's the only place your patch changes. > > I also changed ix86_minimum_alignment to handle > automatic stack variables. OK, the only middle-end place. >>>> Is there any way we can fix incoming stack alignment so that it can be >>>> controlled by automatic stack variables? I don't understand why this >>> >>> The incoming stack alignment can be controlled by automatic stack >>> variables. But it can't be controlled by pseudo registers. >>> >>>> is not done by the prologue and epilogue code. >>>> >>> >>> Because the prologue and epilogue code is generated after >>> RTL expansion starts? >> >> That just restates my question without answering it. >> > > We don't "fix" the incoming stack alignment. We just need to > know what value it will be. Normally the incoming stack alignment > is a constant specified by psABI. I don't think we should > change the way we do RTL expansion just to work around a > quirk in gcc implementation of i386 psABI. I'm not sure I understand what you mean here. The incoming stack alignment is still a constant, isn't it? Does using a vector somehow change the incoming stack alignment? Ian ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-10-19 21:15 ` Ian Lance Taylor @ 2009-10-20 19:00 ` H.J. Lu 0 siblings, 0 replies; 59+ messages in thread From: H.J. Lu @ 2009-10-20 19:00 UTC (permalink / raw) To: Ian Lance Taylor Cc: Michael Matz, Richard Guenther, Uros Bizjak, gcc-patches, Jakub Jelinek On Mon, Oct 19, 2009 at 2:09 PM, Ian Lance Taylor <iant@google.com> wrote: >> >> We don't "fix" the incoming stack alignment. We just need to >> know what value it will be. Normally the incoming stack alignment >> is a constant specified by psABI. I don't think we should >> change the way we do RTL expansion just to work around a >> quirk in gcc implementation of i386 psABI. > > I'm not sure I understand what you mean here. The incoming stack > alignment is still a constant, isn't it? Does using a vector somehow > change the incoming stack alignment? > Using a vector doesn't change the incoming stack alignment, which is set by caller. But it may change the minimum value of the coming stack alignment, which leads to different code generation if stack alignment is required by hardware. The incoming stack alignment should be a constant when RTL expansion starts, especially before calling expand_call. In cfgexpand.c, there are comments: /* Call update_stack_boundary here to update incoming stack boundary before TARGET_FUNCTION_OK_FOR_SIBCALL is called. TARGET_FUNCTION_OK_FOR_SIBCALL needs to know the accurate incoming stack alignment to check if it is OK to perform sibcall optimization since sibcall optimization will only align the outgoing stack to incoming stack boundary. */ if (targetm.calls.update_stack_boundary) targetm.calls.update_stack_boundary (); In i386, there are /* If we need to align the outgoing stack, then sibcalling would unalign the stack, which may break the called function. */ if (ix86_incoming_stack_boundary < PREFERRED_STACK_BOUNDARY) return false; Before calling expand_stack_alignment in gimple_expand_cfg, we can generate any alignment on stack. expand_stack_alignment has crtl->stack_realign_needed = INCOMING_STACK_BOUNDARY < crtl->stack_alignment_estimated; At that point, we make the decision if we need to align the stack. We can always not align the stack at some later time. But we can't align the stack if stack_realign_needed is false since it will impact how stack pointer and frame pointer are used in IRA and reload. -- H.J. ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-10-19 19:16 ` H.J. Lu 2009-10-19 21:15 ` Ian Lance Taylor @ 2009-10-20 1:23 ` Michael Matz 2009-10-20 19:12 ` H.J. Lu 1 sibling, 1 reply; 59+ messages in thread From: Michael Matz @ 2009-10-20 1:23 UTC (permalink / raw) To: H.J. Lu Cc: Ian Lance Taylor, Richard Guenther, Uros Bizjak, gcc-patches, Jakub Jelinek [-- Attachment #1: Type: TEXT/PLAIN, Size: 2839 bytes --] Hi, On Mon, 19 Oct 2009, H.J. Lu wrote: > On Mon, Oct 19, 2009 at 10:47 AM, Ian Lance Taylor <iant@google.com> wrote: > > "H.J. Lu" <hjl.tools@gmail.com> writes: > > > >>> If somebody asked you "where does gcc set the required stack > >>> alignment?" would you expect the answer to be "in the vectorizer > >>> code?" For sake of completeness: obviously not. Any implementation where this answer would be true is broken (on one or the other level). > >>> Is there any way we can fix incoming stack alignment so that it can > >>> be controlled by automatic stack variables? Â I don't understand why > >>> this > >> > >> The incoming stack alignment can be controlled by automatic stack > >> variables. But it can't be controlled by pseudo registers. > >> > >>> is not done by the prologue and epilogue code. > >>> > >> > >> Because the prologue and epilogue code is generated after RTL > >> expansion starts? > > > > That just restates my question without answering it. > > > > We don't "fix" the incoming stack alignment. We just need to know what > value it will be. Normally the incoming stack alignment is a constant > specified by psABI. I don't think we should change the way we do RTL > expansion just to work around a quirk in gcc implementation of i386 > psABI. Actually I think that we should change whatever we need to change for the change (ahem) to make sense. You said: > >> The incoming stack alignment can be controlled by automatic stack > >> variables. But it can't be controlled by pseudo registers. That doesn't make sense verbatim. Why should the _incoming_ alignment (I understand this as the guarantee that the caller makes, "incoming" from the perspective of the called function) be influenced by anything the given function does or doesn't do to the stack. I guess there's some confusion with the terms, so please clarify. You also said: [Ian] > > > I don't understand why this > > > is not done by the prologue and epilogue code. [H.J.] > > Because the prologue and epilogue code is generated after RTL > > expansion starts? Somewhere we're going downhill with understanding. I assume that stack alignment work (if necessary) is done in the prologue. I further assume that required stack alignment depends on emitted calls and stack slots in the body. So if (as you say right above) prologue is emitted after the expansion of all real instructions (which then includes arbitrary temporaries, but not spill slots), it should be very well able to emit whatever insns necessary to guarantee the alignment required depending on the guaranteed incoming alignment. (Ignore for a moment that currently the function start is not emitted after everything else, but before. For discussion purpose assume that the function start is emitted after everything else.) Ciao, Michael. ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-10-20 1:23 ` Michael Matz @ 2009-10-20 19:12 ` H.J. Lu 0 siblings, 0 replies; 59+ messages in thread From: H.J. Lu @ 2009-10-20 19:12 UTC (permalink / raw) To: Michael Matz Cc: Ian Lance Taylor, Richard Guenther, Uros Bizjak, gcc-patches, Jakub Jelinek On Mon, Oct 19, 2009 at 6:12 PM, Michael Matz <matz@suse.de> wrote: > > Somewhere we're going downhill with understanding. I assume that stack > alignment work (if necessary) is done in the prologue. I further assume > that required stack alignment depends on emitted calls and stack slots in > the body. Stack alignment work is done all over middle-end, RA, reload as well as backend. prologue is just one part of it. > So if (as you say right above) prologue is emitted after the expansion of > all real instructions (which then includes arbitrary temporaries, but not > spill slots), it should be very well able to emit whatever insns necessary > to guarantee the alignment required depending on the guaranteed incoming > alignment. > The problem is we want to know what "the guaranteed incoming alignment" is. 4byte is the safest. But we have to realign the stack for every non-leaf function in order to generate 16byte outgoing stack alignment. When is it safe to assuming 16byte incoming stack alignment when the actual incoming stack alignment is really 4byte? We want to identify functions which can't tolerate misaligned stack. So far my list is: 1. Has any SSE automatic variable. 2. Has any vector statement. -- H.J. ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-10-19 16:38 ` H.J. Lu 2009-10-19 17:08 ` Ian Lance Taylor @ 2009-10-20 1:53 ` Michael Matz 2009-10-20 21:15 ` H.J. Lu 1 sibling, 1 reply; 59+ messages in thread From: Michael Matz @ 2009-10-20 1:53 UTC (permalink / raw) To: H.J. Lu Cc: Richard Guenther, Ian Lance Taylor, Uros Bizjak, gcc-patches, Jakub Jelinek [-- Attachment #1: Type: TEXT/PLAIN, Size: 1745 bytes --] Hi, On Mon, 19 Oct 2009, H.J. Lu wrote: > > He tries to solve this by pessimistically assuming that potentially > > everything imaginable could go on stack. Â What I don't understand is > > why we don't instead track hard_stack_alignment in assign_*_temp > > (where we then assume that the stack will be aligned perfectly), and > > expand stack realignment code _after_ having expanded everything else > > (plus examined local variables for the possibility of generating spill > > slots). > > Vectorizer may not call assign_*_temp at all. Instead, x86 backend may > call gen_reg_rtx to generate pseudo registers when expanding vector > statement. If that (and not assign_*_temp) is the problem, then it's obvious that fiddling with the vectorizer doesn't solve it. The expanders can create new pseudos for whatever they see fit. For expanding vector statements, for expanding block moves, for expanding string compares, for expanding additions, for anything. Mucking around with the vectorizer won't solve the problem. On the outset it's very easy: if anything (call it X) goes to the stack it requires some alignment for optimality. That alignment can only be generated in the prologue. It follows trivially that such prologue can only be generated after every X is done and emitted. You seem to want to work around this by guessing what might possibly be required from stack alignment. That's sort of fine, but if you go down that path you can't just change the vectorizer, you have to look at all possible sources of stack temporaries, and that simply includes the pseudos. Given all this: remind me why we don't simply adjust the prologue after reload, when all stack slots are allocated? Ciao, Michael. ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-10-20 1:53 ` Michael Matz @ 2009-10-20 21:15 ` H.J. Lu 2009-10-21 1:10 ` H.J. Lu 0 siblings, 1 reply; 59+ messages in thread From: H.J. Lu @ 2009-10-20 21:15 UTC (permalink / raw) To: Michael Matz Cc: Richard Guenther, Ian Lance Taylor, Uros Bizjak, gcc-patches, Jakub Jelinek [-- Attachment #1: Type: text/plain, Size: 2706 bytes --] On Mon, Oct 19, 2009 at 6:25 PM, Michael Matz <matz@suse.de> wrote: > Hi, > > On Mon, 19 Oct 2009, H.J. Lu wrote: > >> > He tries to solve this by pessimistically assuming that potentially >> > everything imaginable could go on stack. What I don't understand is >> > why we don't instead track hard_stack_alignment in assign_*_temp >> > (where we then assume that the stack will be aligned perfectly), and >> > expand stack realignment code _after_ having expanded everything else >> > (plus examined local variables for the possibility of generating spill >> > slots). >> >> Vectorizer may not call assign_*_temp at all. Instead, x86 backend may >> call gen_reg_rtx to generate pseudo registers when expanding vector >> statement. > > If that (and not assign_*_temp) is the problem, then it's obvious that > fiddling with the vectorizer doesn't solve it. The expanders can create > new pseudos for whatever they see fit. For expanding vector statements, > for expanding block moves, for expanding string compares, for expanding > additions, for anything. Mucking around with the vectorizer won't solve > the problem. > Here is a new patch. I added hard_stack_alignment, moved update_stack_boundary after RTL expansion and updated ix86_function_ok_for_sibcall to deal with it. Any comments? Thanks. -- H.J. --- gcc/ 2009-10-20 H.J. Lu <hongjiu.lu@intel.com> PR target/40836 * cfgexpand.c (get_decl_align_unit): Update hard_stack_alignment. (expand_one_var): Likewise. (gimple_expand_cfg): Initialize hard_stack_alignment to 0. Move update_stack_boundary call to ... (expand_stack_alignment): Here. * emit-rtl.c (gen_reg_rtx): Update hard_stack_alignment. * function.c (assign_stack_local_1): Likewise. (assign_parms): Likewise. (locate_and_pad_parm): Likewise. * function.h (rtl_data): Add hard_stack_alignment. * config/i386/i386.c (ix86_minimum_incoming_stack_boundary): New. (verride_options): Don't check ix86_force_align_arg_pointer here. (ix86_function_ok_for_sibcall): Use it. (ix86_update_stack_boundary): Likewise. * config/i386/i386.h (STACK_REALIGN_DEFAULT): Update comments. gcc/testsuite/ 2009-10-20 H.J. Lu <hongjiu.lu@intel.com> PR target/40838 * gcc.target/i386/incoming-6.c: New. * gcc.target/i386/incoming-7.c: Likewise. * gcc.target/i386/incoming-8.c: Likewise. * gcc.target/i386/incoming-9.c: Likewise. * gcc.target/i386/incoming-10.c: Likewise. * gcc.target/i386/incoming-11.c: Likewise. * gcc.target/i386/incoming-12.c: Likewise. * gcc.target/i386/incoming-13.c: Likewise. * gcc.target/i386/incoming-14.c: Likewise. * gcc.target/i386/incoming-15.c: Likewise. [-- Attachment #2: gcc-pr40838-8.patch --] [-- Type: text/plain, Size: 18213 bytes --] gcc/ 2009-10-20 H.J. Lu <hongjiu.lu@intel.com> PR target/40836 * cfgexpand.c (get_decl_align_unit): Update hard_stack_alignment. (expand_one_var): Likewise. (gimple_expand_cfg): Initialize hard_stack_alignment to 0. Move update_stack_boundary call to ... (expand_stack_alignment): Here. * emit-rtl.c (gen_reg_rtx): Update hard_stack_alignment. * function.c (assign_stack_local_1): Likewise. (assign_parms): Likewise. (locate_and_pad_parm): Likewise. * function.h (rtl_data): Add hard_stack_alignment. * config/i386/i386.c (ix86_minimum_incoming_stack_boundary): New. (verride_options): Don't check ix86_force_align_arg_pointer here. (ix86_function_ok_for_sibcall): Use it. (ix86_update_stack_boundary): Likewise. * config/i386/i386.h (STACK_REALIGN_DEFAULT): Update comments. gcc/testsuite/ 2009-10-20 H.J. Lu <hongjiu.lu@intel.com> PR target/40838 * gcc.target/i386/incoming-6.c: New. * gcc.target/i386/incoming-7.c: Likewise. * gcc.target/i386/incoming-8.c: Likewise. * gcc.target/i386/incoming-9.c: Likewise. * gcc.target/i386/incoming-10.c: Likewise. * gcc.target/i386/incoming-11.c: Likewise. * gcc.target/i386/incoming-12.c: Likewise. * gcc.target/i386/incoming-13.c: Likewise. * gcc.target/i386/incoming-14.c: Likewise. * gcc.target/i386/incoming-15.c: Likewise. diff --git a/gcc/cfgexpand.c b/gcc/cfgexpand.c index acd70c1..05df1fd 100644 --- a/gcc/cfgexpand.c +++ b/gcc/cfgexpand.c @@ -247,6 +247,9 @@ get_decl_align_unit (tree decl) gcc_assert(!crtl->stack_realign_processed); crtl->stack_alignment_estimated = align; } + + if (crtl->hard_stack_alignment < align) + crtl->hard_stack_alignment = align; } /* stack_alignment_needed > PREFERRED_STACK_BOUNDARY is permitted. @@ -994,6 +997,9 @@ expand_one_var (tree var, bool toplevel, bool really_expand) gcc_assert(!crtl->stack_realign_processed); crtl->stack_alignment_estimated = align; } + + if (crtl->hard_stack_alignment < align) + crtl->hard_stack_alignment = align; } if (TREE_CODE (origvar) == SSA_NAME) @@ -3472,6 +3478,19 @@ expand_stack_alignment (void) gcc_assert (crtl->stack_alignment_needed <= crtl->stack_alignment_estimated); + /* Call update_stack_boundary here again to update incoming stack + boundary. It may set incoming stack alignment to a different + value after RTL expansion. TARGET_FUNCTION_OK_FOR_SIBCALL may + use the minimum incoming stack alignment to check if it is OK + to perform sibcall optimization since sibcall optimization will + only align the outgoing stack to incoming stack boundary. */ + if (targetm.calls.update_stack_boundary) + targetm.calls.update_stack_boundary (); + + /* The incoming stack frame has to be aligned at least at + parm_stack_boundary. */ + gcc_assert (crtl->parm_stack_boundary <= INCOMING_STACK_BOUNDARY); + /* Update crtl->stack_alignment_estimated and use it later to align stack. We check PREFERRED_STACK_BOUNDARY if there may be non-call exceptions since callgraph doesn't collect incoming stack alignment @@ -3564,6 +3583,7 @@ gimple_expand_cfg (void) crtl->max_used_stack_slot_alignment = STACK_BOUNDARY; crtl->stack_alignment_estimated = STACK_BOUNDARY; crtl->preferred_stack_boundary = STACK_BOUNDARY; + crtl->hard_stack_alignment = 0; cfun->cfg->max_jumptable_ents = 0; @@ -3626,23 +3646,6 @@ gimple_expand_cfg (void) if (crtl->stack_protect_guard) stack_protect_prologue (); - /* Update stack boundary if needed. */ - if (SUPPORTS_STACK_ALIGNMENT) - { - /* Call update_stack_boundary here to update incoming stack - boundary before TARGET_FUNCTION_OK_FOR_SIBCALL is called. - TARGET_FUNCTION_OK_FOR_SIBCALL needs to know the accurate - incoming stack alignment to check if it is OK to perform - sibcall optimization since sibcall optimization will only - align the outgoing stack to incoming stack boundary. */ - if (targetm.calls.update_stack_boundary) - targetm.calls.update_stack_boundary (); - - /* The incoming stack frame has to be aligned at least at - parm_stack_boundary. */ - gcc_assert (crtl->parm_stack_boundary <= INCOMING_STACK_BOUNDARY); - } - expand_phi_nodes (&SA); /* Register rtl specific functions for cfg. */ diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 73913b8..3db9485 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -1905,6 +1905,7 @@ static bool ix86_valid_target_attribute_p (tree, tree, tree, int); static bool ix86_valid_target_attribute_inner_p (tree, char *[]); static bool ix86_can_inline_p (tree, tree); static void ix86_set_current_function (tree); +static unsigned int ix86_minimum_incoming_stack_boundary (bool); static enum calling_abi ix86_function_abi (const_tree); @@ -3239,12 +3240,10 @@ override_options (bool main_args_p) if (ix86_force_align_arg_pointer == -1) ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT; + ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY; + /* Validate -mincoming-stack-boundary= value or default it to MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */ - if (ix86_force_align_arg_pointer) - ix86_default_incoming_stack_boundary = MIN_STACK_BOUNDARY; - else - ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY; ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary; if (ix86_incoming_stack_boundary_string) { @@ -4277,7 +4276,8 @@ ix86_function_ok_for_sibcall (tree decl, tree exp) /* If we need to align the outgoing stack, then sibcalling would unalign the stack, which may break the called function. */ - if (ix86_incoming_stack_boundary < PREFERRED_STACK_BOUNDARY) + if (ix86_minimum_incoming_stack_boundary (true) + < PREFERRED_STACK_BOUNDARY) return false; if (decl) @@ -8196,37 +8196,57 @@ find_drap_reg (void) } } -/* Update incoming stack boundary and estimated stack alignment. */ +/* Return minimum incoming stack alignment. */ -static void -ix86_update_stack_boundary (void) +static unsigned int +ix86_minimum_incoming_stack_boundary (bool sibcall) { + unsigned int incoming_stack_boundary; + /* Prefer the one specified at command line. */ - ix86_incoming_stack_boundary - = (ix86_user_incoming_stack_boundary - ? ix86_user_incoming_stack_boundary - : ix86_default_incoming_stack_boundary); + if (ix86_user_incoming_stack_boundary) + incoming_stack_boundary = ix86_user_incoming_stack_boundary; + /* In 32bit, use MIN_STACK_BOUNDARY for incoming stack boundary if + -mstackrealign is used and it is called to check if sibcall is + OK or hard stack alignment is 128bit. */ + else if (!TARGET_64BIT + && ix86_force_align_arg_pointer + && (sibcall || crtl->hard_stack_alignment == 128)) + incoming_stack_boundary = MIN_STACK_BOUNDARY; + else + incoming_stack_boundary = ix86_default_incoming_stack_boundary; /* Incoming stack alignment can be changed on individual functions via force_align_arg_pointer attribute. We use the smallest incoming stack boundary. */ - if (ix86_incoming_stack_boundary > MIN_STACK_BOUNDARY + if (incoming_stack_boundary > MIN_STACK_BOUNDARY && lookup_attribute (ix86_force_align_arg_pointer_string, TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl)))) - ix86_incoming_stack_boundary = MIN_STACK_BOUNDARY; + incoming_stack_boundary = MIN_STACK_BOUNDARY; /* The incoming stack frame has to be aligned at least at parm_stack_boundary. */ - if (ix86_incoming_stack_boundary < crtl->parm_stack_boundary) - ix86_incoming_stack_boundary = crtl->parm_stack_boundary; + if (incoming_stack_boundary < crtl->parm_stack_boundary) + incoming_stack_boundary = crtl->parm_stack_boundary; /* Stack at entrance of main is aligned by runtime. We use the smallest incoming stack boundary. */ - if (ix86_incoming_stack_boundary > MAIN_STACK_BOUNDARY + if (incoming_stack_boundary > MAIN_STACK_BOUNDARY && DECL_NAME (current_function_decl) && MAIN_NAME_P (DECL_NAME (current_function_decl)) && DECL_FILE_SCOPE_P (current_function_decl)) - ix86_incoming_stack_boundary = MAIN_STACK_BOUNDARY; + incoming_stack_boundary = MAIN_STACK_BOUNDARY; + + return incoming_stack_boundary; +} + +/* Update incoming stack boundary and estimated stack alignment. */ + +static void +ix86_update_stack_boundary (void) +{ + ix86_incoming_stack_boundary + = ix86_minimum_incoming_stack_boundary (false); /* x86_64 vararg needs 16byte stack alignment for register save area. */ diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 33a5077..22187a9 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -706,9 +706,7 @@ enum target_cpu_default generate an alternate prologue and epilogue that realigns the runtime stack if nessary. This supports mixing codes that keep a 4-byte aligned stack, as specified by i386 psABI, with codes that - need a 16-byte aligned stack, as required by SSE instructions. If - STACK_REALIGN_DEFAULT is 1 and PREFERRED_STACK_BOUNDARY_DEFAULT is - 128, stacks for all functions may be realigned. */ + need a 16-byte aligned stack, as required by SSE instructions. */ #define STACK_REALIGN_DEFAULT 0 /* Boundary (in *bits*) on which the incoming stack is aligned. */ diff --git a/gcc/emit-rtl.c b/gcc/emit-rtl.c index b868298..c1ca40e 100644 --- a/gcc/emit-rtl.c +++ b/gcc/emit-rtl.c @@ -872,6 +872,9 @@ gen_reg_rtx (enum machine_mode mode) unsigned int min_align = MINIMUM_ALIGNMENT (NULL, mode, align); if (crtl->stack_alignment_estimated < min_align) crtl->stack_alignment_estimated = min_align; + + if (crtl->hard_stack_alignment < min_align) + crtl->hard_stack_alignment = min_align; } if (generating_concat_p diff --git a/gcc/function.c b/gcc/function.c index 35c0cfd..eb8ecfe 100644 --- a/gcc/function.c +++ b/gcc/function.c @@ -356,6 +356,10 @@ assign_stack_local_1 (enum machine_mode mode, HOST_WIDE_INT size, } } } + + if (!crtl->stack_realign_processed + && crtl->hard_stack_alignment < alignment_in_bits) + crtl->hard_stack_alignment = alignment_in_bits; } if (crtl->stack_alignment_needed < alignment_in_bits) @@ -3166,6 +3170,9 @@ assign_parms (tree fndecl) gcc_assert (!crtl->stack_realign_processed); crtl->stack_alignment_estimated = align; } + + if (crtl->hard_stack_alignment < align) + crtl->hard_stack_alignment = align; } if (cfun->stdarg && !TREE_CHAIN (parm)) @@ -3223,6 +3230,9 @@ assign_parms (tree fndecl) gcc_assert (!crtl->stack_realign_processed); crtl->stack_alignment_estimated = align; } + + if (crtl->hard_stack_alignment < align) + crtl->hard_stack_alignment = align; } } } @@ -3538,6 +3548,10 @@ locate_and_pad_parm (enum machine_mode passed_mode, tree type, int in_regs, && crtl->stack_realign_needed); } } + + if (!crtl->stack_realign_processed + && crtl->hard_stack_alignment < boundary) + crtl->hard_stack_alignment = boundary; } /* Remember if the outgoing parameter requires extra alignment on the diff --git a/gcc/function.h b/gcc/function.h index 4825d16..8b57c9a 100644 --- a/gcc/function.h +++ b/gcc/function.h @@ -341,6 +341,9 @@ struct GTY(()) rtl_data { local stack. */ unsigned int stack_alignment_estimated; + /* The largest hard alignment on the stack. */ + unsigned int hard_stack_alignment; + /* For reorg. */ /* If some insns can be deferred to the delay slots of the epilogue, the diff --git a/gcc/testsuite/gcc.target/i386/incoming-10.c b/gcc/testsuite/gcc.target/i386/incoming-10.c new file mode 100644 index 0000000..31d9e61 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/incoming-10.c @@ -0,0 +1,19 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -mstackrealign -fomit-frame-pointer -O3 -march=barcelona -mpreferred-stack-boundary=4" } */ + +struct s { + int x[8]; +}; + +void g(struct s *); + +void f() +{ + int i; + struct s s; + for (i = 0; i < sizeof(s.x) / sizeof(*s.x); i++) s.x[i] = 0; + g(&s); +} + +/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/incoming-11.c b/gcc/testsuite/gcc.target/i386/incoming-11.c new file mode 100644 index 0000000..e5787af --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/incoming-11.c @@ -0,0 +1,18 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -mstackrealign -fomit-frame-pointer -O3 -march=barcelona -mpreferred-stack-boundary=4" } */ + +void g(); + +int p[100]; +int q[100]; + +void f() +{ + int i; + for (i = 0; i < 100; i++) p[i] = 0; + g(); + for (i = 0; i < 100; i++) q[i] = 0; +} + +/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/incoming-12.c b/gcc/testsuite/gcc.target/i386/incoming-12.c new file mode 100644 index 0000000..d7ef103 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/incoming-12.c @@ -0,0 +1,20 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -mstackrealign -O2 -msse2 -mpreferred-stack-boundary=4" } */ + +typedef int v4si __attribute__ ((vector_size (16))); + +struct x { + v4si v; + v4si w; +}; + +void y(void *); + +v4si x(void) +{ + struct x x; + y(&x); +} + +/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/incoming-13.c b/gcc/testsuite/gcc.target/i386/incoming-13.c new file mode 100644 index 0000000..bbc8993 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/incoming-13.c @@ -0,0 +1,15 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -mstackrealign -O2 -mpreferred-stack-boundary=4" } */ + +extern double y(double *s3); + +extern double s1, s2; + +double x(void) +{ + double s3 = s1 + s2; + return y(&s3); +} + +/* { dg-final { scan-assembler-not "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/incoming-14.c b/gcc/testsuite/gcc.target/i386/incoming-14.c new file mode 100644 index 0000000..d27179d --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/incoming-14.c @@ -0,0 +1,15 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -mstackrealign -O2 -mpreferred-stack-boundary=4" } */ + +extern int y(int *s3); + +extern int s1, s2; + +int x(void) +{ + int s3 = s1 + s2; + return y(&s3); +} + +/* { dg-final { scan-assembler-not "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/incoming-15.c b/gcc/testsuite/gcc.target/i386/incoming-15.c new file mode 100644 index 0000000..e6a1749 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/incoming-15.c @@ -0,0 +1,15 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -mstackrealign -O2 -mpreferred-stack-boundary=4" } */ + +extern long long y(long long *s3); + +extern long long s1, s2; + +long long x(void) +{ + long long s3 = s1 + s2; + return y(&s3); +} + +/* { dg-final { scan-assembler-not "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/incoming-6.c b/gcc/testsuite/gcc.target/i386/incoming-6.c new file mode 100644 index 0000000..5cc4ab3 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/incoming-6.c @@ -0,0 +1,17 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -mstackrealign -O2 -msse2 -mpreferred-stack-boundary=4" } */ + +typedef int v4si __attribute__ ((vector_size (16))); + +extern v4si y(v4si *s3); + +extern v4si s1, s2; + +v4si x(void) +{ + v4si s3 = s1 + s2; + return y(&s3); +} + +/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/incoming-7.c b/gcc/testsuite/gcc.target/i386/incoming-7.c new file mode 100644 index 0000000..cdd6037 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/incoming-7.c @@ -0,0 +1,16 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -mstackrealign -O2 -msse2 -mpreferred-stack-boundary=4" } */ + +typedef int v4si __attribute__ ((vector_size (16))); + +extern v4si y(v4si, v4si, v4si, v4si, v4si); + +extern v4si s1, s2; + +v4si x(void) +{ + return y(s1, s2, s1, s2, s2); +} + +/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/incoming-8.c b/gcc/testsuite/gcc.target/i386/incoming-8.c new file mode 100644 index 0000000..2dd8800 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/incoming-8.c @@ -0,0 +1,18 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -mstackrealign -O3 -msse2 -mpreferred-stack-boundary=4" } */ + +float +foo (float f) +{ + float array[128]; + float x; + int i; + for (i = 0; i < sizeof(array) / sizeof(*array); i++) + array[i] = f; + for (i = 0; i < sizeof(array) / sizeof(*array); i++) + x += array[i]; + return x; +} + +/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/incoming-9.c b/gcc/testsuite/gcc.target/i386/incoming-9.c new file mode 100644 index 0000000..e43cbd6 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/incoming-9.c @@ -0,0 +1,18 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -mstackrealign -O3 -mno-sse -mpreferred-stack-boundary=4" } */ + +float +foo (float f) +{ + float array[128]; + float x; + int i; + for (i = 0; i < sizeof(array) / sizeof(*array); i++) + array[i] = f; + for (i = 0; i < sizeof(array) / sizeof(*array); i++) + x += array[i]; + return x; +} + +/* { dg-final { scan-assembler-not "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-10-20 21:15 ` H.J. Lu @ 2009-10-21 1:10 ` H.J. Lu 2009-10-21 9:54 ` Michael Matz 0 siblings, 1 reply; 59+ messages in thread From: H.J. Lu @ 2009-10-21 1:10 UTC (permalink / raw) To: Michael Matz Cc: Richard Guenther, Ian Lance Taylor, Uros Bizjak, gcc-patches, Jakub Jelinek [-- Attachment #1: Type: text/plain, Size: 3054 bytes --] On Tue, Oct 20, 2009 at 2:07 PM, H.J. Lu <hjl.tools@gmail.com> wrote: > On Mon, Oct 19, 2009 at 6:25 PM, Michael Matz <matz@suse.de> wrote: >> Hi, >> >> On Mon, 19 Oct 2009, H.J. Lu wrote: >> >>> > He tries to solve this by pessimistically assuming that potentially >>> > everything imaginable could go on stack. What I don't understand is >>> > why we don't instead track hard_stack_alignment in assign_*_temp >>> > (where we then assume that the stack will be aligned perfectly), and >>> > expand stack realignment code _after_ having expanded everything else >>> > (plus examined local variables for the possibility of generating spill >>> > slots). >>> >>> Vectorizer may not call assign_*_temp at all. Instead, x86 backend may >>> call gen_reg_rtx to generate pseudo registers when expanding vector >>> statement. >> >> If that (and not assign_*_temp) is the problem, then it's obvious that >> fiddling with the vectorizer doesn't solve it. The expanders can create >> new pseudos for whatever they see fit. For expanding vector statements, >> for expanding block moves, for expanding string compares, for expanding >> additions, for anything. Mucking around with the vectorizer won't solve >> the problem. >> > > Here is a new patch. I added hard_stack_alignment, moved > update_stack_boundary after RTL expansion and updated > ix86_function_ok_for_sibcall to deal with it. Any comments? > Here is the updated patch. For -mstackrealign, we don't need to check MIN_STACK_BOUNDARY for sibcall. Callee of the sibcall will get the same incoming stack alignment. I added a new testcase to check it. -- H.J. --- gcc/ 2009-10-20 H.J. Lu <hongjiu.lu@intel.com> PR target/40836 * cfgexpand.c (get_decl_align_unit): Update hard_stack_alignment. (expand_one_var): Likewise. (gimple_expand_cfg): Initialize hard_stack_alignment to 0. Move update_stack_boundary call to ... (expand_stack_alignment): Here. * emit-rtl.c (gen_reg_rtx): Update hard_stack_alignment. * function.c (assign_stack_local_1): Likewise. (assign_parms): Likewise. (locate_and_pad_parm): Likewise. * function.h (rtl_data): Add hard_stack_alignment. * config/i386/i386.c (ix86_minimum_incoming_stack_boundary): New. (verride_options): Don't check ix86_force_align_arg_pointer here. (ix86_function_ok_for_sibcall): Use it. (ix86_update_stack_boundary): Likewise. * config/i386/i386.h (STACK_REALIGN_DEFAULT): Update comments. gcc/testsuite/ 2009-10-20 H.J. Lu <hongjiu.lu@intel.com> PR target/40838 * gcc.target/i386/incoming-6.c: New. * gcc.target/i386/incoming-7.c: Likewise. * gcc.target/i386/incoming-8.c: Likewise. * gcc.target/i386/incoming-9.c: Likewise. * gcc.target/i386/incoming-10.c: Likewise. * gcc.target/i386/incoming-11.c: Likewise. * gcc.target/i386/incoming-12.c: Likewise. * gcc.target/i386/incoming-13.c: Likewise. * gcc.target/i386/incoming-14.c: Likewise. * gcc.target/i386/incoming-15.c: Likewise. * gcc.target/i386/pr37843-4.c: Likewise. [-- Attachment #2: gcc-pr40838-9.patch --] [-- Type: text/plain, Size: 18940 bytes --] gcc/ 2009-10-20 H.J. Lu <hongjiu.lu@intel.com> PR target/40836 * cfgexpand.c (get_decl_align_unit): Update hard_stack_alignment. (expand_one_var): Likewise. (gimple_expand_cfg): Initialize hard_stack_alignment to 0. Move update_stack_boundary call to ... (expand_stack_alignment): Here. * emit-rtl.c (gen_reg_rtx): Update hard_stack_alignment. * function.c (assign_stack_local_1): Likewise. (assign_parms): Likewise. (locate_and_pad_parm): Likewise. * function.h (rtl_data): Add hard_stack_alignment. * config/i386/i386.c (ix86_minimum_incoming_stack_boundary): New. (verride_options): Don't check ix86_force_align_arg_pointer here. (ix86_function_ok_for_sibcall): Use it. (ix86_update_stack_boundary): Likewise. * config/i386/i386.h (STACK_REALIGN_DEFAULT): Update comments. gcc/testsuite/ 2009-10-20 H.J. Lu <hongjiu.lu@intel.com> PR target/40838 * gcc.target/i386/incoming-6.c: New. * gcc.target/i386/incoming-7.c: Likewise. * gcc.target/i386/incoming-8.c: Likewise. * gcc.target/i386/incoming-9.c: Likewise. * gcc.target/i386/incoming-10.c: Likewise. * gcc.target/i386/incoming-11.c: Likewise. * gcc.target/i386/incoming-12.c: Likewise. * gcc.target/i386/incoming-13.c: Likewise. * gcc.target/i386/incoming-14.c: Likewise. * gcc.target/i386/incoming-15.c: Likewise. * gcc.target/i386/pr37843-4.c: Likewise. diff --git a/gcc/cfgexpand.c b/gcc/cfgexpand.c index 2678d7e..27475d7 100644 --- a/gcc/cfgexpand.c +++ b/gcc/cfgexpand.c @@ -247,6 +247,9 @@ get_decl_align_unit (tree decl) gcc_assert(!crtl->stack_realign_processed); crtl->stack_alignment_estimated = align; } + + if (crtl->hard_stack_alignment < align) + crtl->hard_stack_alignment = align; } /* stack_alignment_needed > PREFERRED_STACK_BOUNDARY is permitted. @@ -994,6 +997,9 @@ expand_one_var (tree var, bool toplevel, bool really_expand) gcc_assert(!crtl->stack_realign_processed); crtl->stack_alignment_estimated = align; } + + if (crtl->hard_stack_alignment < align) + crtl->hard_stack_alignment = align; } if (TREE_CODE (origvar) == SSA_NAME) @@ -3433,6 +3439,19 @@ expand_stack_alignment (void) gcc_assert (crtl->stack_alignment_needed <= crtl->stack_alignment_estimated); + /* Call update_stack_boundary here again to update incoming stack + boundary. It may set incoming stack alignment to a different + value after RTL expansion. TARGET_FUNCTION_OK_FOR_SIBCALL may + use the minimum incoming stack alignment to check if it is OK + to perform sibcall optimization since sibcall optimization will + only align the outgoing stack to incoming stack boundary. */ + if (targetm.calls.update_stack_boundary) + targetm.calls.update_stack_boundary (); + + /* The incoming stack frame has to be aligned at least at + parm_stack_boundary. */ + gcc_assert (crtl->parm_stack_boundary <= INCOMING_STACK_BOUNDARY); + /* Update crtl->stack_alignment_estimated and use it later to align stack. We check PREFERRED_STACK_BOUNDARY if there may be non-call exceptions since callgraph doesn't collect incoming stack alignment @@ -3525,6 +3544,7 @@ gimple_expand_cfg (void) crtl->max_used_stack_slot_alignment = STACK_BOUNDARY; crtl->stack_alignment_estimated = STACK_BOUNDARY; crtl->preferred_stack_boundary = STACK_BOUNDARY; + crtl->hard_stack_alignment = 0; cfun->cfg->max_jumptable_ents = 0; @@ -3587,23 +3607,6 @@ gimple_expand_cfg (void) if (crtl->stack_protect_guard) stack_protect_prologue (); - /* Update stack boundary if needed. */ - if (SUPPORTS_STACK_ALIGNMENT) - { - /* Call update_stack_boundary here to update incoming stack - boundary before TARGET_FUNCTION_OK_FOR_SIBCALL is called. - TARGET_FUNCTION_OK_FOR_SIBCALL needs to know the accurate - incoming stack alignment to check if it is OK to perform - sibcall optimization since sibcall optimization will only - align the outgoing stack to incoming stack boundary. */ - if (targetm.calls.update_stack_boundary) - targetm.calls.update_stack_boundary (); - - /* The incoming stack frame has to be aligned at least at - parm_stack_boundary. */ - gcc_assert (crtl->parm_stack_boundary <= INCOMING_STACK_BOUNDARY); - } - expand_phi_nodes (&SA); /* Register rtl specific functions for cfg. */ diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 6065f49..6603250 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -1905,6 +1905,7 @@ static bool ix86_valid_target_attribute_p (tree, tree, tree, int); static bool ix86_valid_target_attribute_inner_p (tree, char *[]); static bool ix86_can_inline_p (tree, tree); static void ix86_set_current_function (tree); +static unsigned int ix86_minimum_incoming_stack_boundary (bool); static enum calling_abi ix86_function_abi (const_tree); @@ -3239,12 +3240,10 @@ override_options (bool main_args_p) if (ix86_force_align_arg_pointer == -1) ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT; + ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY; + /* Validate -mincoming-stack-boundary= value or default it to MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */ - if (ix86_force_align_arg_pointer) - ix86_default_incoming_stack_boundary = MIN_STACK_BOUNDARY; - else - ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY; ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary; if (ix86_incoming_stack_boundary_string) { @@ -4277,7 +4276,8 @@ ix86_function_ok_for_sibcall (tree decl, tree exp) /* If we need to align the outgoing stack, then sibcalling would unalign the stack, which may break the called function. */ - if (ix86_incoming_stack_boundary < PREFERRED_STACK_BOUNDARY) + if (ix86_minimum_incoming_stack_boundary (true) + < PREFERRED_STACK_BOUNDARY) return false; if (decl) @@ -8196,37 +8196,58 @@ find_drap_reg (void) } } -/* Update incoming stack boundary and estimated stack alignment. */ +/* Return minimum incoming stack alignment. */ -static void -ix86_update_stack_boundary (void) +static unsigned int +ix86_minimum_incoming_stack_boundary (bool sibcall) { + unsigned int incoming_stack_boundary; + /* Prefer the one specified at command line. */ - ix86_incoming_stack_boundary - = (ix86_user_incoming_stack_boundary - ? ix86_user_incoming_stack_boundary - : ix86_default_incoming_stack_boundary); + if (ix86_user_incoming_stack_boundary) + incoming_stack_boundary = ix86_user_incoming_stack_boundary; + /* In 32bit, use MIN_STACK_BOUNDARY for incoming stack boundary if + -mstackrealign is used, it isn't used for sibcall check and hard + stack alignment is 128bit. */ + else if (!sibcall + && !TARGET_64BIT + && ix86_force_align_arg_pointer + && crtl->hard_stack_alignment == 128) + incoming_stack_boundary = MIN_STACK_BOUNDARY; + else + incoming_stack_boundary = ix86_default_incoming_stack_boundary; /* Incoming stack alignment can be changed on individual functions via force_align_arg_pointer attribute. We use the smallest incoming stack boundary. */ - if (ix86_incoming_stack_boundary > MIN_STACK_BOUNDARY + if (incoming_stack_boundary > MIN_STACK_BOUNDARY && lookup_attribute (ix86_force_align_arg_pointer_string, TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl)))) - ix86_incoming_stack_boundary = MIN_STACK_BOUNDARY; + incoming_stack_boundary = MIN_STACK_BOUNDARY; /* The incoming stack frame has to be aligned at least at parm_stack_boundary. */ - if (ix86_incoming_stack_boundary < crtl->parm_stack_boundary) - ix86_incoming_stack_boundary = crtl->parm_stack_boundary; + if (incoming_stack_boundary < crtl->parm_stack_boundary) + incoming_stack_boundary = crtl->parm_stack_boundary; /* Stack at entrance of main is aligned by runtime. We use the smallest incoming stack boundary. */ - if (ix86_incoming_stack_boundary > MAIN_STACK_BOUNDARY + if (incoming_stack_boundary > MAIN_STACK_BOUNDARY && DECL_NAME (current_function_decl) && MAIN_NAME_P (DECL_NAME (current_function_decl)) && DECL_FILE_SCOPE_P (current_function_decl)) - ix86_incoming_stack_boundary = MAIN_STACK_BOUNDARY; + incoming_stack_boundary = MAIN_STACK_BOUNDARY; + + return incoming_stack_boundary; +} + +/* Update incoming stack boundary and estimated stack alignment. */ + +static void +ix86_update_stack_boundary (void) +{ + ix86_incoming_stack_boundary + = ix86_minimum_incoming_stack_boundary (false); /* x86_64 vararg needs 16byte stack alignment for register save area. */ diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 33a5077..22187a9 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -706,9 +706,7 @@ enum target_cpu_default generate an alternate prologue and epilogue that realigns the runtime stack if nessary. This supports mixing codes that keep a 4-byte aligned stack, as specified by i386 psABI, with codes that - need a 16-byte aligned stack, as required by SSE instructions. If - STACK_REALIGN_DEFAULT is 1 and PREFERRED_STACK_BOUNDARY_DEFAULT is - 128, stacks for all functions may be realigned. */ + need a 16-byte aligned stack, as required by SSE instructions. */ #define STACK_REALIGN_DEFAULT 0 /* Boundary (in *bits*) on which the incoming stack is aligned. */ diff --git a/gcc/emit-rtl.c b/gcc/emit-rtl.c index b868298..c1ca40e 100644 --- a/gcc/emit-rtl.c +++ b/gcc/emit-rtl.c @@ -872,6 +872,9 @@ gen_reg_rtx (enum machine_mode mode) unsigned int min_align = MINIMUM_ALIGNMENT (NULL, mode, align); if (crtl->stack_alignment_estimated < min_align) crtl->stack_alignment_estimated = min_align; + + if (crtl->hard_stack_alignment < min_align) + crtl->hard_stack_alignment = min_align; } if (generating_concat_p diff --git a/gcc/function.c b/gcc/function.c index 35c0cfd..eb8ecfe 100644 --- a/gcc/function.c +++ b/gcc/function.c @@ -356,6 +356,10 @@ assign_stack_local_1 (enum machine_mode mode, HOST_WIDE_INT size, } } } + + if (!crtl->stack_realign_processed + && crtl->hard_stack_alignment < alignment_in_bits) + crtl->hard_stack_alignment = alignment_in_bits; } if (crtl->stack_alignment_needed < alignment_in_bits) @@ -3166,6 +3170,9 @@ assign_parms (tree fndecl) gcc_assert (!crtl->stack_realign_processed); crtl->stack_alignment_estimated = align; } + + if (crtl->hard_stack_alignment < align) + crtl->hard_stack_alignment = align; } if (cfun->stdarg && !TREE_CHAIN (parm)) @@ -3223,6 +3230,9 @@ assign_parms (tree fndecl) gcc_assert (!crtl->stack_realign_processed); crtl->stack_alignment_estimated = align; } + + if (crtl->hard_stack_alignment < align) + crtl->hard_stack_alignment = align; } } } @@ -3538,6 +3548,10 @@ locate_and_pad_parm (enum machine_mode passed_mode, tree type, int in_regs, && crtl->stack_realign_needed); } } + + if (!crtl->stack_realign_processed + && crtl->hard_stack_alignment < boundary) + crtl->hard_stack_alignment = boundary; } /* Remember if the outgoing parameter requires extra alignment on the diff --git a/gcc/function.h b/gcc/function.h index 4825d16..8b57c9a 100644 --- a/gcc/function.h +++ b/gcc/function.h @@ -341,6 +341,9 @@ struct GTY(()) rtl_data { local stack. */ unsigned int stack_alignment_estimated; + /* The largest hard alignment on the stack. */ + unsigned int hard_stack_alignment; + /* For reorg. */ /* If some insns can be deferred to the delay slots of the epilogue, the diff --git a/gcc/testsuite/gcc.target/i386/incoming-10.c b/gcc/testsuite/gcc.target/i386/incoming-10.c new file mode 100644 index 0000000..31d9e61 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/incoming-10.c @@ -0,0 +1,19 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -mstackrealign -fomit-frame-pointer -O3 -march=barcelona -mpreferred-stack-boundary=4" } */ + +struct s { + int x[8]; +}; + +void g(struct s *); + +void f() +{ + int i; + struct s s; + for (i = 0; i < sizeof(s.x) / sizeof(*s.x); i++) s.x[i] = 0; + g(&s); +} + +/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/incoming-11.c b/gcc/testsuite/gcc.target/i386/incoming-11.c new file mode 100644 index 0000000..e5787af --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/incoming-11.c @@ -0,0 +1,18 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -mstackrealign -fomit-frame-pointer -O3 -march=barcelona -mpreferred-stack-boundary=4" } */ + +void g(); + +int p[100]; +int q[100]; + +void f() +{ + int i; + for (i = 0; i < 100; i++) p[i] = 0; + g(); + for (i = 0; i < 100; i++) q[i] = 0; +} + +/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/incoming-12.c b/gcc/testsuite/gcc.target/i386/incoming-12.c new file mode 100644 index 0000000..d7ef103 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/incoming-12.c @@ -0,0 +1,20 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -mstackrealign -O2 -msse2 -mpreferred-stack-boundary=4" } */ + +typedef int v4si __attribute__ ((vector_size (16))); + +struct x { + v4si v; + v4si w; +}; + +void y(void *); + +v4si x(void) +{ + struct x x; + y(&x); +} + +/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/incoming-13.c b/gcc/testsuite/gcc.target/i386/incoming-13.c new file mode 100644 index 0000000..bbc8993 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/incoming-13.c @@ -0,0 +1,15 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -mstackrealign -O2 -mpreferred-stack-boundary=4" } */ + +extern double y(double *s3); + +extern double s1, s2; + +double x(void) +{ + double s3 = s1 + s2; + return y(&s3); +} + +/* { dg-final { scan-assembler-not "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/incoming-14.c b/gcc/testsuite/gcc.target/i386/incoming-14.c new file mode 100644 index 0000000..d27179d --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/incoming-14.c @@ -0,0 +1,15 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -mstackrealign -O2 -mpreferred-stack-boundary=4" } */ + +extern int y(int *s3); + +extern int s1, s2; + +int x(void) +{ + int s3 = s1 + s2; + return y(&s3); +} + +/* { dg-final { scan-assembler-not "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/incoming-15.c b/gcc/testsuite/gcc.target/i386/incoming-15.c new file mode 100644 index 0000000..e6a1749 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/incoming-15.c @@ -0,0 +1,15 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -mstackrealign -O2 -mpreferred-stack-boundary=4" } */ + +extern long long y(long long *s3); + +extern long long s1, s2; + +long long x(void) +{ + long long s3 = s1 + s2; + return y(&s3); +} + +/* { dg-final { scan-assembler-not "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/incoming-6.c b/gcc/testsuite/gcc.target/i386/incoming-6.c new file mode 100644 index 0000000..5cc4ab3 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/incoming-6.c @@ -0,0 +1,17 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -mstackrealign -O2 -msse2 -mpreferred-stack-boundary=4" } */ + +typedef int v4si __attribute__ ((vector_size (16))); + +extern v4si y(v4si *s3); + +extern v4si s1, s2; + +v4si x(void) +{ + v4si s3 = s1 + s2; + return y(&s3); +} + +/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/incoming-7.c b/gcc/testsuite/gcc.target/i386/incoming-7.c new file mode 100644 index 0000000..cdd6037 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/incoming-7.c @@ -0,0 +1,16 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -mstackrealign -O2 -msse2 -mpreferred-stack-boundary=4" } */ + +typedef int v4si __attribute__ ((vector_size (16))); + +extern v4si y(v4si, v4si, v4si, v4si, v4si); + +extern v4si s1, s2; + +v4si x(void) +{ + return y(s1, s2, s1, s2, s2); +} + +/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/incoming-8.c b/gcc/testsuite/gcc.target/i386/incoming-8.c new file mode 100644 index 0000000..2dd8800 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/incoming-8.c @@ -0,0 +1,18 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -mstackrealign -O3 -msse2 -mpreferred-stack-boundary=4" } */ + +float +foo (float f) +{ + float array[128]; + float x; + int i; + for (i = 0; i < sizeof(array) / sizeof(*array); i++) + array[i] = f; + for (i = 0; i < sizeof(array) / sizeof(*array); i++) + x += array[i]; + return x; +} + +/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/incoming-9.c b/gcc/testsuite/gcc.target/i386/incoming-9.c new file mode 100644 index 0000000..e43cbd6 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/incoming-9.c @@ -0,0 +1,18 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -mstackrealign -O3 -mno-sse -mpreferred-stack-boundary=4" } */ + +float +foo (float f) +{ + float array[128]; + float x; + int i; + for (i = 0; i < sizeof(array) / sizeof(*array); i++) + array[i] = f; + for (i = 0; i < sizeof(array) / sizeof(*array); i++) + x += array[i]; + return x; +} + +/* { dg-final { scan-assembler-not "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr37843-4.c b/gcc/testsuite/gcc.target/i386/pr37843-4.c new file mode 100644 index 0000000..8e5f51f --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr37843-4.c @@ -0,0 +1,13 @@ +/* Test for stack alignment with sibcall optimization. */ +/* { dg-do compile { target { ilp32 && nonpic } } } */ +/* { dg-options "-O2 -msse2 -mpreferred-stack-boundary=4 -mstackrealign" } */ +/* { dg-final { scan-assembler-not "andl\[\\t \]*\\$-16,\[\\t \]*%\[re\]?sp" } } */ +/* { dg-final { scan-assembler-not "call\[\\t \]*foo" } } */ +/* { dg-final { scan-assembler "jmp\[\\t \]*foo" } } */ + +extern int foo (void); + +int bar (void) +{ + return foo(); +} ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-10-21 1:10 ` H.J. Lu @ 2009-10-21 9:54 ` Michael Matz 2009-10-21 16:56 ` H.J. Lu 0 siblings, 1 reply; 59+ messages in thread From: Michael Matz @ 2009-10-21 9:54 UTC (permalink / raw) To: H.J. Lu Cc: Richard Guenther, Ian Lance Taylor, Uros Bizjak, gcc-patches, Jakub Jelinek [-- Attachment #1: Type: TEXT/PLAIN, Size: 358 bytes --] Hi, On Tue, 20 Oct 2009, H.J. Lu wrote: > > Here is a new patch. I added hard_stack_alignment, moved > > update_stack_boundary after RTL expansion and updated > > ix86_function_ok_for_sibcall to deal with it. Â Any comments? Well, I think this approach of tracking the required alignment (in gen_reg_rtx especially) is indeed better. Ciao, Michael. ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-10-21 9:54 ` Michael Matz @ 2009-10-21 16:56 ` H.J. Lu 2009-10-30 10:08 ` Richard Guenther 0 siblings, 1 reply; 59+ messages in thread From: H.J. Lu @ 2009-10-21 16:56 UTC (permalink / raw) To: Michael Matz Cc: Richard Guenther, Ian Lance Taylor, Uros Bizjak, gcc-patches, Jakub Jelinek [-- Attachment #1: Type: text/plain, Size: 1859 bytes --] On Wed, Oct 21, 2009 at 2:05 AM, Michael Matz <matz@suse.de> wrote: > Hi, > > On Tue, 20 Oct 2009, H.J. Lu wrote: > >> > Here is a new patch. I added hard_stack_alignment, moved >> > update_stack_boundary after RTL expansion and updated >> > ix86_function_ok_for_sibcall to deal with it. Any comments? > > Well, I think this approach of tracking the required alignment (in > gen_reg_rtx especially) is indeed better. > Here is the updated patch. We don't need to add hard_stack_alignment. We can initialize stack_alignment_estimated to 0 and check it instead. Tested it on Linux/x86-64 with -m32. OK for trunk? Thanks. -- H.J. --- gcc/ 2009-10-21 H.J. Lu <hongjiu.lu@intel.com> PR target/40836 * cfgexpand.c (expand_stack_alignment): Call update_stack_boundary first. Move assert on stack_alignment_estimated just before setting stack_realign_needed. (gimple_expand_cfg): Initialize stack_alignment_estimated to 0. Don't call update_stack_boundary. * config/i386/i386.c (ix86_minimum_incoming_stack_boundary): New. (verride_options): Don't check ix86_force_align_arg_pointer here. (ix86_function_ok_for_sibcall): Use it. (ix86_update_stack_boundary): Likewise. * config/i386/i386.h (STACK_REALIGN_DEFAULT): Update comments. gcc/testsuite/ 2009-10-21 H.J. Lu <hongjiu.lu@intel.com> PR target/40838 * gcc.target/i386/incoming-6.c: New. * gcc.target/i386/incoming-7.c: Likewise. * gcc.target/i386/incoming-8.c: Likewise. * gcc.target/i386/incoming-9.c: Likewise. * gcc.target/i386/incoming-10.c: Likewise. * gcc.target/i386/incoming-11.c: Likewise. * gcc.target/i386/incoming-12.c: Likewise. * gcc.target/i386/incoming-13.c: Likewise. * gcc.target/i386/incoming-14.c: Likewise. * gcc.target/i386/incoming-15.c: Likewise. * gcc.target/i386/pr37843-4.c: Likewise. [-- Attachment #2: gcc-pr40838-10.patch --] [-- Type: text/plain, Size: 16361 bytes --] gcc/ 2009-10-21 H.J. Lu <hongjiu.lu@intel.com> PR target/40836 * cfgexpand.c (expand_stack_alignment): Call update_stack_boundary first. Move assert on stack_alignment_estimated just before setting stack_realign_needed. (gimple_expand_cfg): Initialize stack_alignment_estimated to 0. Don't call update_stack_boundary. * config/i386/i386.c (ix86_minimum_incoming_stack_boundary): New. (verride_options): Don't check ix86_force_align_arg_pointer here. (ix86_function_ok_for_sibcall): Use it. (ix86_update_stack_boundary): Likewise. * config/i386/i386.h (STACK_REALIGN_DEFAULT): Update comments. gcc/testsuite/ 2009-10-21 H.J. Lu <hongjiu.lu@intel.com> PR target/40838 * gcc.target/i386/incoming-6.c: New. * gcc.target/i386/incoming-7.c: Likewise. * gcc.target/i386/incoming-8.c: Likewise. * gcc.target/i386/incoming-9.c: Likewise. * gcc.target/i386/incoming-10.c: Likewise. * gcc.target/i386/incoming-11.c: Likewise. * gcc.target/i386/incoming-12.c: Likewise. * gcc.target/i386/incoming-13.c: Likewise. * gcc.target/i386/incoming-14.c: Likewise. * gcc.target/i386/incoming-15.c: Likewise. * gcc.target/i386/pr37843-4.c: Likewise. diff --git a/gcc/cfgexpand.c b/gcc/cfgexpand.c index 2678d7e..a4b8e4f 100644 --- a/gcc/cfgexpand.c +++ b/gcc/cfgexpand.c @@ -3430,8 +3430,18 @@ expand_stack_alignment (void) || crtl->has_nonlocal_goto) crtl->need_drap = true; - gcc_assert (crtl->stack_alignment_needed - <= crtl->stack_alignment_estimated); + /* Call update_stack_boundary here again to update incoming stack + boundary. It may set incoming stack alignment to a different + value after RTL expansion. TARGET_FUNCTION_OK_FOR_SIBCALL may + use the minimum incoming stack alignment to check if it is OK + to perform sibcall optimization since sibcall optimization will + only align the outgoing stack to incoming stack boundary. */ + if (targetm.calls.update_stack_boundary) + targetm.calls.update_stack_boundary (); + + /* The incoming stack frame has to be aligned at least at + parm_stack_boundary. */ + gcc_assert (crtl->parm_stack_boundary <= INCOMING_STACK_BOUNDARY); /* Update crtl->stack_alignment_estimated and use it later to align stack. We check PREFERRED_STACK_BOUNDARY if there may be non-call @@ -3447,6 +3457,9 @@ expand_stack_alignment (void) if (preferred_stack_boundary > crtl->stack_alignment_needed) crtl->stack_alignment_needed = preferred_stack_boundary; + gcc_assert (crtl->stack_alignment_needed + <= crtl->stack_alignment_estimated); + crtl->stack_realign_needed = INCOMING_STACK_BOUNDARY < crtl->stack_alignment_estimated; crtl->stack_realign_tried = crtl->stack_realign_needed; @@ -3523,7 +3536,7 @@ gimple_expand_cfg (void) targetm.expand_to_rtl_hook (); crtl->stack_alignment_needed = STACK_BOUNDARY; crtl->max_used_stack_slot_alignment = STACK_BOUNDARY; - crtl->stack_alignment_estimated = STACK_BOUNDARY; + crtl->stack_alignment_estimated = 0; crtl->preferred_stack_boundary = STACK_BOUNDARY; cfun->cfg->max_jumptable_ents = 0; @@ -3587,23 +3600,6 @@ gimple_expand_cfg (void) if (crtl->stack_protect_guard) stack_protect_prologue (); - /* Update stack boundary if needed. */ - if (SUPPORTS_STACK_ALIGNMENT) - { - /* Call update_stack_boundary here to update incoming stack - boundary before TARGET_FUNCTION_OK_FOR_SIBCALL is called. - TARGET_FUNCTION_OK_FOR_SIBCALL needs to know the accurate - incoming stack alignment to check if it is OK to perform - sibcall optimization since sibcall optimization will only - align the outgoing stack to incoming stack boundary. */ - if (targetm.calls.update_stack_boundary) - targetm.calls.update_stack_boundary (); - - /* The incoming stack frame has to be aligned at least at - parm_stack_boundary. */ - gcc_assert (crtl->parm_stack_boundary <= INCOMING_STACK_BOUNDARY); - } - expand_phi_nodes (&SA); /* Register rtl specific functions for cfg. */ diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 6065f49..e222f72 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -1905,6 +1905,7 @@ static bool ix86_valid_target_attribute_p (tree, tree, tree, int); static bool ix86_valid_target_attribute_inner_p (tree, char *[]); static bool ix86_can_inline_p (tree, tree); static void ix86_set_current_function (tree); +static unsigned int ix86_minimum_incoming_stack_boundary (bool); static enum calling_abi ix86_function_abi (const_tree); @@ -3239,12 +3240,10 @@ override_options (bool main_args_p) if (ix86_force_align_arg_pointer == -1) ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT; + ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY; + /* Validate -mincoming-stack-boundary= value or default it to MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */ - if (ix86_force_align_arg_pointer) - ix86_default_incoming_stack_boundary = MIN_STACK_BOUNDARY; - else - ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY; ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary; if (ix86_incoming_stack_boundary_string) { @@ -4277,7 +4276,8 @@ ix86_function_ok_for_sibcall (tree decl, tree exp) /* If we need to align the outgoing stack, then sibcalling would unalign the stack, which may break the called function. */ - if (ix86_incoming_stack_boundary < PREFERRED_STACK_BOUNDARY) + if (ix86_minimum_incoming_stack_boundary (true) + < PREFERRED_STACK_BOUNDARY) return false; if (decl) @@ -8196,37 +8196,58 @@ find_drap_reg (void) } } -/* Update incoming stack boundary and estimated stack alignment. */ +/* Return minimum incoming stack alignment. */ -static void -ix86_update_stack_boundary (void) +static unsigned int +ix86_minimum_incoming_stack_boundary (bool sibcall) { + unsigned int incoming_stack_boundary; + /* Prefer the one specified at command line. */ - ix86_incoming_stack_boundary - = (ix86_user_incoming_stack_boundary - ? ix86_user_incoming_stack_boundary - : ix86_default_incoming_stack_boundary); + if (ix86_user_incoming_stack_boundary) + incoming_stack_boundary = ix86_user_incoming_stack_boundary; + /* In 32bit, use MIN_STACK_BOUNDARY for incoming stack boundary + if -mstackrealign is used, it isn't used for sibcall check and + estimated stack alignment is 128bit. */ + else if (!sibcall + && !TARGET_64BIT + && ix86_force_align_arg_pointer + && crtl->stack_alignment_estimated == 128) + incoming_stack_boundary = MIN_STACK_BOUNDARY; + else + incoming_stack_boundary = ix86_default_incoming_stack_boundary; /* Incoming stack alignment can be changed on individual functions via force_align_arg_pointer attribute. We use the smallest incoming stack boundary. */ - if (ix86_incoming_stack_boundary > MIN_STACK_BOUNDARY + if (incoming_stack_boundary > MIN_STACK_BOUNDARY && lookup_attribute (ix86_force_align_arg_pointer_string, TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl)))) - ix86_incoming_stack_boundary = MIN_STACK_BOUNDARY; + incoming_stack_boundary = MIN_STACK_BOUNDARY; /* The incoming stack frame has to be aligned at least at parm_stack_boundary. */ - if (ix86_incoming_stack_boundary < crtl->parm_stack_boundary) - ix86_incoming_stack_boundary = crtl->parm_stack_boundary; + if (incoming_stack_boundary < crtl->parm_stack_boundary) + incoming_stack_boundary = crtl->parm_stack_boundary; /* Stack at entrance of main is aligned by runtime. We use the smallest incoming stack boundary. */ - if (ix86_incoming_stack_boundary > MAIN_STACK_BOUNDARY + if (incoming_stack_boundary > MAIN_STACK_BOUNDARY && DECL_NAME (current_function_decl) && MAIN_NAME_P (DECL_NAME (current_function_decl)) && DECL_FILE_SCOPE_P (current_function_decl)) - ix86_incoming_stack_boundary = MAIN_STACK_BOUNDARY; + incoming_stack_boundary = MAIN_STACK_BOUNDARY; + + return incoming_stack_boundary; +} + +/* Update incoming stack boundary and estimated stack alignment. */ + +static void +ix86_update_stack_boundary (void) +{ + ix86_incoming_stack_boundary + = ix86_minimum_incoming_stack_boundary (false); /* x86_64 vararg needs 16byte stack alignment for register save area. */ diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 33a5077..22187a9 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -706,9 +706,7 @@ enum target_cpu_default generate an alternate prologue and epilogue that realigns the runtime stack if nessary. This supports mixing codes that keep a 4-byte aligned stack, as specified by i386 psABI, with codes that - need a 16-byte aligned stack, as required by SSE instructions. If - STACK_REALIGN_DEFAULT is 1 and PREFERRED_STACK_BOUNDARY_DEFAULT is - 128, stacks for all functions may be realigned. */ + need a 16-byte aligned stack, as required by SSE instructions. */ #define STACK_REALIGN_DEFAULT 0 /* Boundary (in *bits*) on which the incoming stack is aligned. */ diff --git a/gcc/testsuite/gcc.target/i386/incoming-10.c b/gcc/testsuite/gcc.target/i386/incoming-10.c new file mode 100644 index 0000000..31d9e61 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/incoming-10.c @@ -0,0 +1,19 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -mstackrealign -fomit-frame-pointer -O3 -march=barcelona -mpreferred-stack-boundary=4" } */ + +struct s { + int x[8]; +}; + +void g(struct s *); + +void f() +{ + int i; + struct s s; + for (i = 0; i < sizeof(s.x) / sizeof(*s.x); i++) s.x[i] = 0; + g(&s); +} + +/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/incoming-11.c b/gcc/testsuite/gcc.target/i386/incoming-11.c new file mode 100644 index 0000000..e5787af --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/incoming-11.c @@ -0,0 +1,18 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -mstackrealign -fomit-frame-pointer -O3 -march=barcelona -mpreferred-stack-boundary=4" } */ + +void g(); + +int p[100]; +int q[100]; + +void f() +{ + int i; + for (i = 0; i < 100; i++) p[i] = 0; + g(); + for (i = 0; i < 100; i++) q[i] = 0; +} + +/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/incoming-12.c b/gcc/testsuite/gcc.target/i386/incoming-12.c new file mode 100644 index 0000000..d7ef103 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/incoming-12.c @@ -0,0 +1,20 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -mstackrealign -O2 -msse2 -mpreferred-stack-boundary=4" } */ + +typedef int v4si __attribute__ ((vector_size (16))); + +struct x { + v4si v; + v4si w; +}; + +void y(void *); + +v4si x(void) +{ + struct x x; + y(&x); +} + +/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/incoming-13.c b/gcc/testsuite/gcc.target/i386/incoming-13.c new file mode 100644 index 0000000..bbc8993 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/incoming-13.c @@ -0,0 +1,15 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -mstackrealign -O2 -mpreferred-stack-boundary=4" } */ + +extern double y(double *s3); + +extern double s1, s2; + +double x(void) +{ + double s3 = s1 + s2; + return y(&s3); +} + +/* { dg-final { scan-assembler-not "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/incoming-14.c b/gcc/testsuite/gcc.target/i386/incoming-14.c new file mode 100644 index 0000000..d27179d --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/incoming-14.c @@ -0,0 +1,15 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -mstackrealign -O2 -mpreferred-stack-boundary=4" } */ + +extern int y(int *s3); + +extern int s1, s2; + +int x(void) +{ + int s3 = s1 + s2; + return y(&s3); +} + +/* { dg-final { scan-assembler-not "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/incoming-15.c b/gcc/testsuite/gcc.target/i386/incoming-15.c new file mode 100644 index 0000000..e6a1749 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/incoming-15.c @@ -0,0 +1,15 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -mstackrealign -O2 -mpreferred-stack-boundary=4" } */ + +extern long long y(long long *s3); + +extern long long s1, s2; + +long long x(void) +{ + long long s3 = s1 + s2; + return y(&s3); +} + +/* { dg-final { scan-assembler-not "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/incoming-6.c b/gcc/testsuite/gcc.target/i386/incoming-6.c new file mode 100644 index 0000000..5cc4ab3 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/incoming-6.c @@ -0,0 +1,17 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -mstackrealign -O2 -msse2 -mpreferred-stack-boundary=4" } */ + +typedef int v4si __attribute__ ((vector_size (16))); + +extern v4si y(v4si *s3); + +extern v4si s1, s2; + +v4si x(void) +{ + v4si s3 = s1 + s2; + return y(&s3); +} + +/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/incoming-7.c b/gcc/testsuite/gcc.target/i386/incoming-7.c new file mode 100644 index 0000000..cdd6037 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/incoming-7.c @@ -0,0 +1,16 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -mstackrealign -O2 -msse2 -mpreferred-stack-boundary=4" } */ + +typedef int v4si __attribute__ ((vector_size (16))); + +extern v4si y(v4si, v4si, v4si, v4si, v4si); + +extern v4si s1, s2; + +v4si x(void) +{ + return y(s1, s2, s1, s2, s2); +} + +/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/incoming-8.c b/gcc/testsuite/gcc.target/i386/incoming-8.c new file mode 100644 index 0000000..2dd8800 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/incoming-8.c @@ -0,0 +1,18 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -mstackrealign -O3 -msse2 -mpreferred-stack-boundary=4" } */ + +float +foo (float f) +{ + float array[128]; + float x; + int i; + for (i = 0; i < sizeof(array) / sizeof(*array); i++) + array[i] = f; + for (i = 0; i < sizeof(array) / sizeof(*array); i++) + x += array[i]; + return x; +} + +/* { dg-final { scan-assembler "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/incoming-9.c b/gcc/testsuite/gcc.target/i386/incoming-9.c new file mode 100644 index 0000000..e43cbd6 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/incoming-9.c @@ -0,0 +1,18 @@ +/* PR target/40838 */ +/* { dg-do compile { target { { ! *-*-darwin* } && ilp32 } } } */ +/* { dg-options "-w -mstackrealign -O3 -mno-sse -mpreferred-stack-boundary=4" } */ + +float +foo (float f) +{ + float array[128]; + float x; + int i; + for (i = 0; i < sizeof(array) / sizeof(*array); i++) + array[i] = f; + for (i = 0; i < sizeof(array) / sizeof(*array); i++) + x += array[i]; + return x; +} + +/* { dg-final { scan-assembler-not "andl\[\\t \]*\\$-16,\[\\t \]*%esp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr37843-4.c b/gcc/testsuite/gcc.target/i386/pr37843-4.c new file mode 100644 index 0000000..8e5f51f --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr37843-4.c @@ -0,0 +1,13 @@ +/* Test for stack alignment with sibcall optimization. */ +/* { dg-do compile { target { ilp32 && nonpic } } } */ +/* { dg-options "-O2 -msse2 -mpreferred-stack-boundary=4 -mstackrealign" } */ +/* { dg-final { scan-assembler-not "andl\[\\t \]*\\$-16,\[\\t \]*%\[re\]?sp" } } */ +/* { dg-final { scan-assembler-not "call\[\\t \]*foo" } } */ +/* { dg-final { scan-assembler "jmp\[\\t \]*foo" } } */ + +extern int foo (void); + +int bar (void) +{ + return foo(); +} ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-10-21 16:56 ` H.J. Lu @ 2009-10-30 10:08 ` Richard Guenther 0 siblings, 0 replies; 59+ messages in thread From: Richard Guenther @ 2009-10-30 10:08 UTC (permalink / raw) To: H.J. Lu Cc: Michael Matz, Ian Lance Taylor, Uros Bizjak, gcc-patches, Jakub Jelinek On Wed, Oct 21, 2009 at 5:53 PM, H.J. Lu <hjl.tools@gmail.com> wrote: > On Wed, Oct 21, 2009 at 2:05 AM, Michael Matz <matz@suse.de> wrote: >> Hi, >> >> On Tue, 20 Oct 2009, H.J. Lu wrote: >> >>> > Here is a new patch. I added hard_stack_alignment, moved >>> > update_stack_boundary after RTL expansion and updated >>> > ix86_function_ok_for_sibcall to deal with it. Any comments? >> >> Well, I think this approach of tracking the required alignment (in >> gen_reg_rtx especially) is indeed better. >> > > Here is the updated patch. We don't need to add > hard_stack_alignment. We can initialize > stack_alignment_estimated to 0 and check it instead. > Tested it on Linux/x86-64 with -m32. OK for trunk? Ok. Thanks, Richard. > Thanks. > > > -- > H.J. > --- > gcc/ > > 2009-10-21 H.J. Lu <hongjiu.lu@intel.com> > > PR target/40836 > * cfgexpand.c (expand_stack_alignment): Call update_stack_boundary > first. Move assert on stack_alignment_estimated just before > setting stack_realign_needed. > (gimple_expand_cfg): Initialize stack_alignment_estimated to 0. > Don't call update_stack_boundary. > > * config/i386/i386.c (ix86_minimum_incoming_stack_boundary): New. > (verride_options): Don't check ix86_force_align_arg_pointer here. > (ix86_function_ok_for_sibcall): Use it. > (ix86_update_stack_boundary): Likewise. > > * config/i386/i386.h (STACK_REALIGN_DEFAULT): Update comments. > > gcc/testsuite/ > > 2009-10-21 H.J. Lu <hongjiu.lu@intel.com> > > PR target/40838 > * gcc.target/i386/incoming-6.c: New. > * gcc.target/i386/incoming-7.c: Likewise. > * gcc.target/i386/incoming-8.c: Likewise. > * gcc.target/i386/incoming-9.c: Likewise. > * gcc.target/i386/incoming-10.c: Likewise. > * gcc.target/i386/incoming-11.c: Likewise. > * gcc.target/i386/incoming-12.c: Likewise. > * gcc.target/i386/incoming-13.c: Likewise. > * gcc.target/i386/incoming-14.c: Likewise. > * gcc.target/i386/incoming-15.c: Likewise. > * gcc.target/i386/pr37843-4.c: Likewise. > ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-10-16 20:27 ` H.J. Lu 2009-10-17 1:03 ` Ian Lance Taylor @ 2009-10-17 7:09 ` Uros Bizjak 1 sibling, 0 replies; 59+ messages in thread From: Uros Bizjak @ 2009-10-17 7:09 UTC (permalink / raw) To: H.J. Lu; +Cc: gcc-patches, Jakub Jelinek On 10/16/2009 10:12 PM, H.J. Lu wrote: >> OTOH, I think that this whole stuff should depend on -mstackrealing somehow, >> according to the comment: >> >> /* 1 if -mstackrealign should be turned on by default. �It will >> � generate an alternate prologue and epilogue that realigns the >> � runtime stack if nessary. �This supports mixing codes that keep a >> � 4-byte aligned stack, as specified by i386 psABI, with codes that >> � need a 16-byte aligned stack, as required by SSE instructions. �If >> � STACK_REALIGN_DEFAULT is 1 and PREFERRED_STACK_BOUNDARY_DEFAULT is >> � 128, stacks for all functions may be realigned. �*/ >> #define STACK_REALIGN_DEFAULT 0 >> >> As far as I understand from many comments from the PR40838 trail (especially >> comment #51), -mstackrealing is not effective in some cases involving >> automatic SSE variables on the stack. I think that -mstackrealign should be >> fixed in the way your patch outlines, so old/broken sources that assume >> 4byte alignment can be compiled using this option without penalizing >> new/fixed code that assumes 16byte alignment. >> >> Also, Jakub has its opinion here, I have also CC'd him. >> Uros. >> >> > Here is the patch to make -mstackrealing only align stack when > SSE registers are used. People who care about mixing binaries > compiled by the older gcc can use it without aligning stack for > every function. OK for trunk if regression passes? > Please also update the comment to STACK_REALIGN_DEFAULT in i386.h where it still says that stacks for _all_ functions may be realigned. Otherwise, this patch OK as far as x86 is concerned (please note that middle-end parts need another approval). IMO, this patch makes -mstackrealign really usable. Also, it is in line with -mstackrealign documentation that says that this option aligns the stack only if necessary [...] to support mixing of 4 and 16byte aligned code. Thanks, Uros. ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned
@ 2009-08-07 0:54 Mikulas Patocka
2009-08-07 7:13 ` Jakub Jelinek
0 siblings, 1 reply; 59+ messages in thread
From: Mikulas Patocka @ 2009-08-07 0:54 UTC (permalink / raw)
To: Jakub Jelinek; +Cc: gcc-patches, ubizjak
> > In 32bit, the incoming stack may not be 16 byte aligned. This patch
> > assumes the incoming stack is 4 byte aligned and realigns stack if any
> > SSE variable is put on stack. Any comments?
>
> IMHO this is wrong, I could live with a non-default option for those who
> don't care about performance and think a SCO document from 1996 has any
> relevance to Linux these days. In reality a Linux ABI for years assumes
> 16 byte stack alignment for 32-bit code.
>
> Jakub
Tell me which Linux distribution did you run with 16-byte stack alignment
checking (as proposed in bug 40838) and what was the result?
For me, the result was that 75% of binaries in /bin in Debian Lenny do not
align the stack on 16-byte boundary.
Mikulas
^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-08-07 0:54 Mikulas Patocka @ 2009-08-07 7:13 ` Jakub Jelinek 2009-08-07 12:53 ` H.J. Lu 2009-08-07 21:08 ` Mikulas Patocka 0 siblings, 2 replies; 59+ messages in thread From: Jakub Jelinek @ 2009-08-07 7:13 UTC (permalink / raw) To: Mikulas Patocka; +Cc: gcc-patches, ubizjak On Fri, Aug 07, 2009 at 02:54:46AM +0200, Mikulas Patocka wrote: > > > In 32bit, the incoming stack may not be 16 byte aligned. This patch > > > assumes the incoming stack is 4 byte aligned and realigns stack if any > > > SSE variable is put on stack. Any comments? > > > > IMHO this is wrong, I could live with a non-default option for those who > > don't care about performance and think a SCO document from 1996 has any > > relevance to Linux these days. In reality a Linux ABI for years assumes > > 16 byte stack alignment for 32-bit code. > > Tell me which Linux distribution did you run with 16-byte stack alignment > checking (as proposed in bug 40838) and what was the result? > > For me, the result was that 75% of binaries in /bin in Debian Lenny do not > align the stack on 16-byte boundary. Besides the obstack glibc bug which has been fixed since then you haven't reported anything particular. It is true that parts of i?86 glibc is compiled with -mpreferered-stack-boundary=2, but only parts that don't call callbacks. Async signals AFAIK will align the stack properly. I simply don't trust your 75% claim, lots of stuff would break if things weren't aligned properly. Jakub ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-08-07 7:13 ` Jakub Jelinek @ 2009-08-07 12:53 ` H.J. Lu 2009-08-07 22:30 ` H.J. Lu 2009-08-07 21:08 ` Mikulas Patocka 1 sibling, 1 reply; 59+ messages in thread From: H.J. Lu @ 2009-08-07 12:53 UTC (permalink / raw) To: Jakub Jelinek; +Cc: Mikulas Patocka, gcc-patches, ubizjak On Fri, Aug 7, 2009 at 12:13 AM, Jakub Jelinek<jakub@redhat.com> wrote: > On Fri, Aug 07, 2009 at 02:54:46AM +0200, Mikulas Patocka wrote: >> > > In 32bit, the incoming stack may not be 16 byte aligned. This patch >> > > assumes the incoming stack is 4 byte aligned and realigns stack if any >> > > SSE variable is put on stack. Any comments? >> > >> > IMHO this is wrong, I could live with a non-default option for those who >> > don't care about performance and think a SCO document from 1996 has any >> > relevance to Linux these days. In reality a Linux ABI for years assumes >> > 16 byte stack alignment for 32-bit code. >> >> Tell me which Linux distribution did you run with 16-byte stack alignment >> checking (as proposed in bug 40838) and what was the result? >> >> For me, the result was that 75% of binaries in /bin in Debian Lenny do not >> align the stack on 16-byte boundary. > > Besides the obstack glibc bug which has been fixed since then you haven't > reported anything particular. It is true that parts of i?86 glibc is > compiled with -mpreferered-stack-boundary=2, but only parts that don't call > callbacks. Async signals AFAIK will align the stack properly. > > I simply don't trust your 75% claim, lots of stuff would break if things > weren't aligned properly. > From gcc 3.4: /* Validate -mpreferred-stack-boundary= value, or provide default. The default of 128 bits is for Pentium III's SSE __m128, but we don't want additional code to keep the stack aligned when optimizing for code size. */ ix86_preferred_stack_boundary = (optimize_size ? TARGET_64BIT ? 128 : 32 : 128); If you compile code with -Os, you will get 4 byte stack alignment. Just step back, we changed stack alignment from 4 byte to 16byte for SSE since we couldn't realign stack at the time. Now we can realign the stack very efficiently. I think we should do it for SSE to support the existing Linux binaries which have 4 byte stack alignment. If it helps, I can compare -m32 -O3 -msse2 -mfp-math=sse results with SPEC CPU 2006, before and after my patch. Thanks. -- H.J. ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-08-07 12:53 ` H.J. Lu @ 2009-08-07 22:30 ` H.J. Lu 2009-08-08 17:35 ` Mikulas Patocka 2009-08-24 17:39 ` H.J. Lu 0 siblings, 2 replies; 59+ messages in thread From: H.J. Lu @ 2009-08-07 22:30 UTC (permalink / raw) To: Jakub Jelinek; +Cc: Mikulas Patocka, gcc-patches, ubizjak On Fri, Aug 7, 2009 at 5:53 AM, H.J. Lu<hjl.tools@gmail.com> wrote: > On Fri, Aug 7, 2009 at 12:13 AM, Jakub Jelinek<jakub@redhat.com> wrote: >> On Fri, Aug 07, 2009 at 02:54:46AM +0200, Mikulas Patocka wrote: >>> > > In 32bit, the incoming stack may not be 16 byte aligned. This patch >>> > > assumes the incoming stack is 4 byte aligned and realigns stack if any >>> > > SSE variable is put on stack. Any comments? >>> > >>> > IMHO this is wrong, I could live with a non-default option for those who >>> > don't care about performance and think a SCO document from 1996 has any >>> > relevance to Linux these days. In reality a Linux ABI for years assumes >>> > 16 byte stack alignment for 32-bit code. >>> >>> Tell me which Linux distribution did you run with 16-byte stack alignment >>> checking (as proposed in bug 40838) and what was the result? >>> >>> For me, the result was that 75% of binaries in /bin in Debian Lenny do not >>> align the stack on 16-byte boundary. >> >> Besides the obstack glibc bug which has been fixed since then you haven't >> reported anything particular. It is true that parts of i?86 glibc is >> compiled with -mpreferered-stack-boundary=2, but only parts that don't call >> callbacks. Async signals AFAIK will align the stack properly. >> >> I simply don't trust your 75% claim, lots of stuff would break if things >> weren't aligned properly. >> > > From gcc 3.4: > > /* Validate -mpreferred-stack-boundary= value, or provide default. > The default of 128 bits is for Pentium III's SSE __m128, but we > don't want additional code to keep the stack aligned when > optimizing for code size. */ > ix86_preferred_stack_boundary = (optimize_size > ? TARGET_64BIT ? 128 : 32 > : 128); > > If you compile code with -Os, you will get 4 byte stack alignment. > Just step back, we changed stack alignment from 4 byte to 16byte > for SSE since we couldn't realign stack at the time. Now we can > realign the stack very efficiently. I think we should do it for SSE > to support the existing Linux binaries which have 4 byte stack > alignment. If it helps, I can compare -m32 -O3 -msse2 -mfp-math=sse > results with SPEC CPU 2006, before and after my patch. > Here are the differences of -m32 -O3 -msse2 -mfpmath=sse -ffast-math -funroll-loops before and after my patch: 400.perlbench -0.384615% 401.bzip2 0% 403.gcc -0.362319% 429.mcf -0.813008% 445.gobmk 0.921659% 456.hmmer 0.549451% 458.sjeng -0.438596% 462.libquantum 0% 464.h264ref 0% 471.omnetpp -0.478469% 473.astar -0.645161% 483.xalancbmk -0.727273% SPECint(R)_base2006 -0.411523% 410.bwaves -0.406504% 416.gamess 0% 433.milc -1.36986% 434.zeusmp -0.44843% 435.gromacs 0% 436.cactusADM 0% 437.leslie3d -0.888889% 444.namd 1.20482% 447.dealII -0.350877% 450.soplex -0.31746% 453.povray 0.458716% 454.calculix 0% 459.GemsFDTD 0% 465.tonto 0% 470.lbm 0% 481.wrf 0.480769% 482.sphinx3 0.940439% SPECfp(R)_base2006 0% I think we should align stack if SSE variables are put on stack. -- H.J. ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-08-07 22:30 ` H.J. Lu @ 2009-08-08 17:35 ` Mikulas Patocka 2009-08-16 21:25 ` H.J. Lu 2009-08-24 17:39 ` H.J. Lu 1 sibling, 1 reply; 59+ messages in thread From: Mikulas Patocka @ 2009-08-08 17:35 UTC (permalink / raw) To: H.J. Lu; +Cc: Jakub Jelinek, gcc-patches, ubizjak [-- Attachment #1: Type: TEXT/PLAIN, Size: 2825 bytes --] > > From gcc 3.4: > > > > /* Validate -mpreferred-stack-boundary= value, or provide default. > > The default of 128 bits is for Pentium III's SSE __m128, but we > > don't want additional code to keep the stack aligned when > > optimizing for code size. */ > > ix86_preferred_stack_boundary = (optimize_size > > ? TARGET_64BIT ? 128 : 32 > > : 128); > > > > If you compile code with -Os, you will get 4 byte stack alignment. > > Just step back, we changed stack alignment from 4 byte to 16byte > > for SSE since we couldn't realign stack at the time. Now we can > > realign the stack very efficiently. I think we should do it for SSE > > to support the existing Linux binaries which have 4 byte stack > > alignment. If it helps, I can compare -m32 -O3 -msse2 -mfp-math=sse > > results with SPEC CPU 2006, before and after my patch. > > > > Here are the differences of -m32 -O3 -msse2 -mfpmath=sse -ffast-math > -funroll-loops > before and after my patch: > > 400.perlbench -0.384615% > 401.bzip2 0% > 403.gcc -0.362319% > 429.mcf -0.813008% > 445.gobmk 0.921659% > 456.hmmer 0.549451% > 458.sjeng -0.438596% > 462.libquantum 0% > 464.h264ref 0% > 471.omnetpp -0.478469% > 473.astar -0.645161% > 483.xalancbmk -0.727273% > SPECint(R)_base2006 -0.411523% > 410.bwaves -0.406504% > 416.gamess 0% > 433.milc -1.36986% > 434.zeusmp -0.44843% > 435.gromacs 0% > 436.cactusADM 0% > 437.leslie3d -0.888889% > 444.namd 1.20482% > 447.dealII -0.350877% > 450.soplex -0.31746% > 453.povray 0.458716% > 454.calculix 0% > 459.GemsFDTD 0% > 465.tonto 0% > 470.lbm 0% > 481.wrf 0.480769% > 482.sphinx3 0.940439% > SPECfp(R)_base2006 0% > > I think we should align stack if SSE variables are put on stack. > > -- > H.J. Your patch is buggy, it aligns the stack in functions with vector types but doesn't align if the autovectorizer creates 16-bit SSE instructions (that is even more dangerous than vector types because it may make crashes at random places). You must put that test somewhere else. Mikulas ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-08-08 17:35 ` Mikulas Patocka @ 2009-08-16 21:25 ` H.J. Lu 0 siblings, 0 replies; 59+ messages in thread From: H.J. Lu @ 2009-08-16 21:25 UTC (permalink / raw) To: Mikulas Patocka; +Cc: Jakub Jelinek, gcc-patches, ubizjak 2009/8/8 Mikulas Patocka <mikulas@artax.karlin.mff.cuni.cz>: >> > From gcc 3.4: >> > >> > /* Validate -mpreferred-stack-boundary= value, or provide default. >> > The default of 128 bits is for Pentium III's SSE __m128, but we >> > don't want additional code to keep the stack aligned when >> > optimizing for code size. */ >> > ix86_preferred_stack_boundary = (optimize_size >> > ? TARGET_64BIT ? 128 : 32 >> > : 128); >> > >> > If you compile code with -Os, you will get 4 byte stack alignment. >> > Just step back, we changed stack alignment from 4 byte to 16byte >> > for SSE since we couldn't realign stack at the time. Now we can >> > realign the stack very efficiently. I think we should do it for SSE >> > to support the existing Linux binaries which have 4 byte stack >> > alignment. If it helps, I can compare -m32 -O3 -msse2 -mfp-math=sse >> > results with SPEC CPU 2006, before and after my patch. >> > >> >> Here are the differences of -m32 -O3 -msse2 -mfpmath=sse -ffast-math >> -funroll-loops >> before and after my patch: >> >> 400.perlbench -0.384615% >> 401.bzip2 0% >> 403.gcc -0.362319% >> 429.mcf -0.813008% >> 445.gobmk 0.921659% >> 456.hmmer 0.549451% >> 458.sjeng -0.438596% >> 462.libquantum 0% >> 464.h264ref 0% >> 471.omnetpp -0.478469% >> 473.astar -0.645161% >> 483.xalancbmk -0.727273% >> SPECint(R)_base2006 -0.411523% >> 410.bwaves -0.406504% >> 416.gamess 0% >> 433.milc -1.36986% >> 434.zeusmp -0.44843% >> 435.gromacs 0% >> 436.cactusADM 0% >> 437.leslie3d -0.888889% >> 444.namd 1.20482% >> 447.dealII -0.350877% >> 450.soplex -0.31746% >> 453.povray 0.458716% >> 454.calculix 0% >> 459.GemsFDTD 0% >> 465.tonto 0% >> 470.lbm 0% >> 481.wrf 0.480769% >> 482.sphinx3 0.940439% >> SPECfp(R)_base2006 0% >> >> I think we should align stack if SSE variables are put on stack. >> >> -- >> H.J. > > Your patch is buggy, it aligns the stack in functions with vector types > but doesn't align if the autovectorizer creates 16-bit SSE instructions > (that is even more dangerous than vector types because it may make crashes > at random places). You must put that test somewhere else. > Please show the assembly output with my patch applied. Thanks. -- H.J. ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-08-07 22:30 ` H.J. Lu 2009-08-08 17:35 ` Mikulas Patocka @ 2009-08-24 17:39 ` H.J. Lu 2009-09-12 23:32 ` Mikulas Patocka 1 sibling, 1 reply; 59+ messages in thread From: H.J. Lu @ 2009-08-24 17:39 UTC (permalink / raw) To: Jakub Jelinek; +Cc: Mikulas Patocka, gcc-patches, ubizjak On Fri, Aug 7, 2009 at 3:30 PM, H.J. Lu<hjl.tools@gmail.com> wrote: > On Fri, Aug 7, 2009 at 5:53 AM, H.J. Lu<hjl.tools@gmail.com> wrote: >> On Fri, Aug 7, 2009 at 12:13 AM, Jakub Jelinek<jakub@redhat.com> wrote: >>> On Fri, Aug 07, 2009 at 02:54:46AM +0200, Mikulas Patocka wrote: >>>> > > In 32bit, the incoming stack may not be 16 byte aligned. This patch >>>> > > assumes the incoming stack is 4 byte aligned and realigns stack if any >>>> > > SSE variable is put on stack. Any comments? >>>> > >>>> > IMHO this is wrong, I could live with a non-default option for those who >>>> > don't care about performance and think a SCO document from 1996 has any >>>> > relevance to Linux these days. In reality a Linux ABI for years assumes >>>> > 16 byte stack alignment for 32-bit code. >>>> >>>> Tell me which Linux distribution did you run with 16-byte stack alignment >>>> checking (as proposed in bug 40838) and what was the result? >>>> >>>> For me, the result was that 75% of binaries in /bin in Debian Lenny do not >>>> align the stack on 16-byte boundary. >>> >>> Besides the obstack glibc bug which has been fixed since then you haven't >>> reported anything particular. It is true that parts of i?86 glibc is >>> compiled with -mpreferered-stack-boundary=2, but only parts that don't call >>> callbacks. Async signals AFAIK will align the stack properly. >>> >>> I simply don't trust your 75% claim, lots of stuff would break if things >>> weren't aligned properly. >>> >> >> From gcc 3.4: >> >> /* Validate -mpreferred-stack-boundary= value, or provide default. >> The default of 128 bits is for Pentium III's SSE __m128, but we >> don't want additional code to keep the stack aligned when >> optimizing for code size. */ >> ix86_preferred_stack_boundary = (optimize_size >> ? TARGET_64BIT ? 128 : 32 >> : 128); >> >> If you compile code with -Os, you will get 4 byte stack alignment. >> Just step back, we changed stack alignment from 4 byte to 16byte >> for SSE since we couldn't realign stack at the time. Now we can >> realign the stack very efficiently. I think we should do it for SSE >> to support the existing Linux binaries which have 4 byte stack >> alignment. If it helps, I can compare -m32 -O3 -msse2 -mfp-math=sse >> results with SPEC CPU 2006, before and after my patch. >> > > Here are the differences of -m32 -O3 -msse2 -mfpmath=sse -ffast-math > -funroll-loops > before and after my patch: > > 400.perlbench -0.384615% > 401.bzip2 0% > 403.gcc -0.362319% > 429.mcf -0.813008% > 445.gobmk 0.921659% > 456.hmmer 0.549451% > 458.sjeng -0.438596% > 462.libquantum 0% > 464.h264ref 0% > 471.omnetpp -0.478469% > 473.astar -0.645161% > 483.xalancbmk -0.727273% > SPECint(R)_base2006 -0.411523% > 410.bwaves -0.406504% > 416.gamess 0% > 433.milc -1.36986% > 434.zeusmp -0.44843% > 435.gromacs 0% > 436.cactusADM 0% > 437.leslie3d -0.888889% > 444.namd 1.20482% > 447.dealII -0.350877% > 450.soplex -0.31746% > 453.povray 0.458716% > 454.calculix 0% > 459.GemsFDTD 0% > 465.tonto 0% > 470.lbm 0% > 481.wrf 0.480769% > 482.sphinx3 0.940439% > SPECfp(R)_base2006 0% > > I think we should align stack if SSE variables are put on stack. > Darwin ia32 psABI specifies 16byte stack alignment and enforces it with #define PREFERRED_STACK_BOUNDARY \ MAX (STACK_BOUNDARY, ix86_preferred_stack_boundary) On other ia32 targets, 4byte outgoing stack alignment is correct and allowed. My patch assumes 4 byte incoming stack alignment only when SSE variables are put on stack. Automatic stack alignment implementation is quite efficient. Its performance impact is very limited as show in SPEC CPU 2006 results. It also fixed a regression: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=41156 OK for trunk? Thanks. -- H.J. ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-08-24 17:39 ` H.J. Lu @ 2009-09-12 23:32 ` Mikulas Patocka 2009-09-12 23:42 ` Mikulas Patocka 2009-09-13 1:55 ` H.J. Lu 0 siblings, 2 replies; 59+ messages in thread From: Mikulas Patocka @ 2009-09-12 23:32 UTC (permalink / raw) To: H.J. Lu; +Cc: Jakub Jelinek, gcc-patches, ubizjak [-- Attachment #1: Type: TEXT/PLAIN, Size: 8848 bytes --] On Mon, 24 Aug 2009, H.J. Lu wrote: > On Fri, Aug 7, 2009 at 3:30 PM, H.J. Lu<hjl.tools@gmail.com> wrote: > > On Fri, Aug 7, 2009 at 5:53 AM, H.J. Lu<hjl.tools@gmail.com> wrote: > >> On Fri, Aug 7, 2009 at 12:13 AM, Jakub Jelinek<jakub@redhat.com> wrote: > >>> On Fri, Aug 07, 2009 at 02:54:46AM +0200, Mikulas Patocka wrote: > >>>> > > In 32bit, the incoming stack may not be 16 byte aligned. This patch > >>>> > > assumes the incoming stack is 4 byte aligned and realigns stack if any > >>>> > > SSE variable is put on stack. Any comments? > >>>> > > >>>> > IMHO this is wrong, I could live with a non-default option for those who > >>>> > don't care about performance and think a SCO document from 1996 has any > >>>> > relevance to Linux these days. In reality a Linux ABI for years assumes > >>>> > 16 byte stack alignment for 32-bit code. > >>>> > >>>> Tell me which Linux distribution did you run with 16-byte stack alignment > >>>> checking (as proposed in bug 40838) and what was the result? > >>>> > >>>> For me, the result was that 75% of binaries in /bin in Debian Lenny do not > >>>> align the stack on 16-byte boundary. > >>> > >>> Besides the obstack glibc bug which has been fixed since then you haven't > >>> reported anything particular. It is true that parts of i?86 glibc is > >>> compiled with -mpreferered-stack-boundary=2, but only parts that don't call > >>> callbacks. Async signals AFAIK will align the stack properly. > >>> > >>> I simply don't trust your 75% claim, lots of stuff would break if things > >>> weren't aligned properly. > >>> > >> > >> From gcc 3.4: > >> > >> /* Validate -mpreferred-stack-boundary= value, or provide default. > >> The default of 128 bits is for Pentium III's SSE __m128, but we > >> don't want additional code to keep the stack aligned when > >> optimizing for code size. */ > >> ix86_preferred_stack_boundary = (optimize_size > >> ? TARGET_64BIT ? 128 : 32 > >> : 128); > >> > >> If you compile code with -Os, you will get 4 byte stack alignment. > >> Just step back, we changed stack alignment from 4 byte to 16byte > >> for SSE since we couldn't realign stack at the time. Now we can > >> realign the stack very efficiently. I think we should do it for SSE > >> to support the existing Linux binaries which have 4 byte stack > >> alignment. If it helps, I can compare -m32 -O3 -msse2 -mfp-math=sse > >> results with SPEC CPU 2006, before and after my patch. > >> > > > > Here are the differences of -m32 -O3 -msse2 -mfpmath=sse -ffast-math > > -funroll-loops > > before and after my patch: > > > > 400.perlbench -0.384615% > > 401.bzip2 0% > > 403.gcc -0.362319% > > 429.mcf -0.813008% > > 445.gobmk 0.921659% > > 456.hmmer 0.549451% > > 458.sjeng -0.438596% > > 462.libquantum 0% > > 464.h264ref 0% > > 471.omnetpp -0.478469% > > 473.astar -0.645161% > > 483.xalancbmk -0.727273% > > SPECint(R)_base2006 -0.411523% > > 410.bwaves -0.406504% > > 416.gamess 0% > > 433.milc -1.36986% > > 434.zeusmp -0.44843% > > 435.gromacs 0% > > 436.cactusADM 0% > > 437.leslie3d -0.888889% > > 444.namd 1.20482% > > 447.dealII -0.350877% > > 450.soplex -0.31746% > > 453.povray 0.458716% > > 454.calculix 0% > > 459.GemsFDTD 0% > > 465.tonto 0% > > 470.lbm 0% > > 481.wrf 0.480769% > > 482.sphinx3 0.940439% > > SPECfp(R)_base2006 0% > > > > I think we should align stack if SSE variables are put on stack. > > > > Darwin ia32 psABI specifies 16byte stack alignment and enforces it > with > > #define PREFERRED_STACK_BOUNDARY \ > MAX (STACK_BOUNDARY, ix86_preferred_stack_boundary) > > On other ia32 targets, 4byte outgoing stack alignment is > correct and allowed. My patch assumes 4 byte incoming > stack alignment only when SSE variables are put on stack. > Automatic stack alignment implementation is quite efficient. > Its performance impact is very limited as show in SPEC CPU > 2006 results. It also fixed a regression: > > http://gcc.gnu.org/bugzilla/show_bug.cgi?id=41156 > > OK for trunk? > > Thanks. > > -- > H.J. I tried this patch for 4.4.1 (that one posted in bugzilla 40838) with seamonkey 1.1.17 and it relly misses alignment in some parts of it. I used CFLAGS and CXXFLAGS "-O3 -fomit-frame-pointer -frename-registers -march=barcelona" I found the functions that miscompile, the trivial way to find them is to compile seamonkey, do find . -name "*.o"|xargs objdump -d|less, search for "movdqa.*esp" or "movaps.*esp" (or even "esp.*xmm" and "xmm.*esp", but it gives more false positives) and check the function that it has aligment code. There are a few functions that don't. Please look at these functions, the idea of your patch is good, it just needs to be fixed. BTW. gentoo documentation lists -O3 as problematic flag that leads to instability with gcc 4 (they don't way why, but the most likely reason is this alignment issue), it would be good if we could fix it and allow people to use SSE. Mikulas Some of the miscompiled functions are: pow5mult: a00: 55 push %ebp a01: 57 push %edi a02: 56 push %esi a03: 53 push %ebx a04: 89 d3 mov %edx,%ebx a06: 83 ec 6c sub $0x6c,%esp ... a7d: 0f 29 44 24 10 movaps %xmm0,0x10(%esp) a82: e8 79 f8 ff ff call 300 <Balloc> a87: 85 c0 test %eax,%eax a89: 89 44 24 4c mov %eax,0x4c(%esp) a8d: 66 0f 6f 44 24 10 movdqa 0x10(%esp),%xmm0 PR_strtod: 10a0: 55 push %ebp 10a1: 57 push %edi 10a2: 56 push %esi 10a3: 53 push %ebx 10a4: 81 ec fc 00 00 00 sub $0xfc,%esp ... 172c: 66 0f 6f 44 24 20 movdqa 0x20(%esp),%xmm0 PR_select: 3da0: 55 push %ebp 3da1: 57 push %edi 3da2: 56 push %esi 3da3: 89 ce mov %ecx,%esi 3da5: 53 push %ebx 3da6: 89 d3 mov %edx,%ebx 3da8: 81 ec ec 01 00 00 sub $0x1ec,%esp ... 3dcb: 0f 29 84 24 50 01 00 movaps %xmm0,0x150(%esp) 3dd2: 00 3dd3: 0f 29 84 24 60 01 00 movaps %xmm0,0x160(%esp) 3dda: 00 3ddb: 0f 29 84 24 70 01 00 movaps %xmm0,0x170(%esp) 3de2: 00 3de3: 0f 29 84 24 80 01 00 movaps %xmm0,0x180(%esp) 3dea: 00 3deb: 0f 29 84 24 90 01 00 movaps %xmm0,0x190(%esp) 3df2: 00 _ZL18wait_for_retrievalP13_GtkClipboardP17retrieval_context: 560: 55 push %ebp 561: 57 push %edi 562: 56 push %esi 563: 89 d6 mov %edx,%esi 565: 53 push %ebx 566: 81 ec bc 01 00 00 sub $0x1bc,%esp ... 5b0: 0f 29 44 24 20 movaps %xmm0,0x20(%esp) 5b5: 0f 29 44 24 30 movaps %xmm0,0x30(%esp) 5ba: 0f 29 44 24 40 movaps %xmm0,0x40(%esp) 5bf: 0f 29 44 24 50 movaps %xmm0,0x50(%esp) 5c4: 0f 29 44 24 60 movaps %xmm0,0x60(%esp) 5c9: 0f 29 44 24 70 movaps %xmm0,0x70(%esp) 5ce: 0f 29 84 24 80 00 00 movaps %xmm0,0x80(%esp) 5d5: 00 5d6: 0f 29 84 24 90 00 00 movaps %xmm0,0x90(%esp) 5dd: 00 _ZN13XRemoteClient7GetLockEmPi: 680: 55 push %ebp 681: 57 push %edi 682: 56 push %esi 683: 53 push %ebx 684: 89 c3 mov %eax,%ebx 686: 81 ec 0c 02 00 00 sub $0x20c,%esp ... 6ee: 0f 29 44 24 30 movaps %xmm0,0x30(%esp) ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-09-12 23:32 ` Mikulas Patocka @ 2009-09-12 23:42 ` Mikulas Patocka 2009-09-13 1:55 ` H.J. Lu 1 sibling, 0 replies; 59+ messages in thread From: Mikulas Patocka @ 2009-09-12 23:42 UTC (permalink / raw) To: H.J. Lu; +Cc: Jakub Jelinek, gcc-patches, ubizjak > _ZL18wait_for_retrievalP13_GtkClipboardP17retrieval_context: > 560: 55 push %ebp > 561: 57 push %edi > 562: 56 push %esi > 563: 89 d6 mov %edx,%esi > 565: 53 push %ebx > 566: 81 ec bc 01 00 00 sub $0x1bc,%esp > ... > 5b0: 0f 29 44 24 20 movaps %xmm0,0x20(%esp) ^^^ BTW. this is the place where it really crashed when I tried to run it. Mikulas > 5b5: 0f 29 44 24 30 movaps %xmm0,0x30(%esp) > 5ba: 0f 29 44 24 40 movaps %xmm0,0x40(%esp) > 5bf: 0f 29 44 24 50 movaps %xmm0,0x50(%esp) > 5c4: 0f 29 44 24 60 movaps %xmm0,0x60(%esp) > 5c9: 0f 29 44 24 70 movaps %xmm0,0x70(%esp) > 5ce: 0f 29 84 24 80 00 00 movaps %xmm0,0x80(%esp) > 5d5: 00 > 5d6: 0f 29 84 24 90 00 00 movaps %xmm0,0x90(%esp) > 5dd: 00 ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-09-12 23:32 ` Mikulas Patocka 2009-09-12 23:42 ` Mikulas Patocka @ 2009-09-13 1:55 ` H.J. Lu 2009-09-13 14:10 ` Mikulas Patocka 1 sibling, 1 reply; 59+ messages in thread From: H.J. Lu @ 2009-09-13 1:55 UTC (permalink / raw) To: Mikulas Patocka; +Cc: Jakub Jelinek, gcc-patches, ubizjak 2009/9/12 Mikulas Patocka <mikulas@artax.karlin.mff.cuni.cz>: > > I tried this patch for 4.4.1 (that one posted in bugzilla 40838) with > seamonkey 1.1.17 and it relly misses alignment in some parts of it. I used > CFLAGS and CXXFLAGS "-O3 -fomit-frame-pointer -frename-registers > -march=barcelona" > > I found the functions that miscompile, the trivial way to find them is to > compile seamonkey, do find . -name "*.o"|xargs objdump -d|less, search for > "movdqa.*esp" or "movaps.*esp" (or even "esp.*xmm" and "xmm.*esp", but it > gives more false positives) and check the function that it has aligment > code. There are a few functions that don't. Please look at these > functions, the idea of your patch is good, it just needs to be fixed. > > BTW. gentoo documentation lists -O3 as problematic flag that leads to > instability with gcc 4 (they don't way why, but the most likely reason is > this alignment issue), it would be good if we could fix it and allow > people to use SSE. > Please find a small testcase and upload it to PR 40838. Thank.s Thanks. -- H.J. ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-09-13 1:55 ` H.J. Lu @ 2009-09-13 14:10 ` Mikulas Patocka 0 siblings, 0 replies; 59+ messages in thread From: Mikulas Patocka @ 2009-09-13 14:10 UTC (permalink / raw) To: H.J. Lu; +Cc: Jakub Jelinek, gcc-patches, ubizjak On Sat, 12 Sep 2009, H.J. Lu wrote: > 2009/9/12 Mikulas Patocka <mikulas@artax.karlin.mff.cuni.cz>: > > > > I tried this patch for 4.4.1 (that one posted in bugzilla 40838) with > > seamonkey 1.1.17 and it relly misses alignment in some parts of it. I used > > CFLAGS and CXXFLAGS "-O3 -fomit-frame-pointer -frename-registers > > -march=barcelona" > > > > I found the functions that miscompile, the trivial way to find them is to > > compile seamonkey, do find . -name "*.o"|xargs objdump -d|less, search for > > "movdqa.*esp" or "movaps.*esp" (or even "esp.*xmm" and "xmm.*esp", but it > > gives more false positives) and check the function that it has aligment > > code. There are a few functions that don't. Please look at these > > functions, the idea of your patch is good, it just needs to be fixed. > > > > BTW. gentoo documentation lists -O3 as problematic flag that leads to > > instability with gcc 4 (they don't way why, but the most likely reason is > > this alignment issue), it would be good if we could fix it and allow > > people to use SSE. > > > > Please find a small testcase and upload it to PR 40838. > > Thank.s > > Thanks. OK. The misgenerated examples were similar to one of the two issues, so I posted two examples for them to bugzilla. So you can find the bug in your patch. There is another bug: The compiler shouldn't assume that arrays in the common section are aligned. They may be initialized in a different module compiled with different compiler that doesn't do alignment. Mikulas ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-08-07 7:13 ` Jakub Jelinek 2009-08-07 12:53 ` H.J. Lu @ 2009-08-07 21:08 ` Mikulas Patocka 2009-08-07 21:25 ` Richard Guenther 1 sibling, 1 reply; 59+ messages in thread From: Mikulas Patocka @ 2009-08-07 21:08 UTC (permalink / raw) To: Jakub Jelinek; +Cc: gcc-patches, ubizjak > > For me, the result was that 75% of binaries in /bin in Debian Lenny do not > > align the stack on 16-byte boundary. > > Besides the obstack glibc bug which has been fixed since then you haven't > reported anything particular. It is true that parts of i?86 glibc is > compiled with -mpreferered-stack-boundary=2, but only parts that don't call > callbacks. Async signals AFAIK will align the stack properly. > > I simply don't trust your 75% claim, lots of stuff would break if things > weren't aligned properly. > > Jakub The reason for that '75%' is that integer-only binaries are usually compiled with -mpreferred-stack-boundary=2. People used the switch "-mpreferred-stack-boundary=2" to tell the compiler "I don't care about performance of floating point". If gcc developers made that "-mpreferred-stack-boundary" switch, people used it. It is not surprising that distributors built packages that don't do much floating point calculations with it. Now it's hard to say that anyone who used the switch was wrong. It very much ressembles the situatino when mad man cries that he is the only one normal and all the other people are crazy because they can't tolerate his behavior. If you don't believe what I'm saying, just run the test on your own. Other things that misalign the stack (found during source review): - Mozilla (has assembler stuff for XPCOM calls that aligns only to 8-bytes, based on the reasoning that it uses only doubles) - Python (has a FFI library that does callbacks to C, it aligns only for Darwin ... could be trivially fixed by using the Darwin code for Linux) - Dosbox (generates machine code that calls back to C, doesn't align) - Mplayer+Wine (link Windows libraries that call back to C) - Intel C (aligns only parts that need it, doesn't align in integer functions) - Zip (uses some assembler routine that calls to C and doesn't align) Mikulas ^ permalink raw reply [flat|nested] 59+ messages in thread
* Re: PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned 2009-08-07 21:08 ` Mikulas Patocka @ 2009-08-07 21:25 ` Richard Guenther 0 siblings, 0 replies; 59+ messages in thread From: Richard Guenther @ 2009-08-07 21:25 UTC (permalink / raw) To: Mikulas Patocka; +Cc: Jakub Jelinek, gcc-patches, ubizjak On Fri, Aug 7, 2009 at 11:08 PM, Mikulas Patocka<mikulas@artax.karlin.mff.cuni.cz> wrote: >> > For me, the result was that 75% of binaries in /bin in Debian Lenny do not >> > align the stack on 16-byte boundary. >> >> Besides the obstack glibc bug which has been fixed since then you haven't >> reported anything particular. It is true that parts of i?86 glibc is >> compiled with -mpreferered-stack-boundary=2, but only parts that don't call >> callbacks. Async signals AFAIK will align the stack properly. >> >> I simply don't trust your 75% claim, lots of stuff would break if things >> weren't aligned properly. >> >> Jakub > > The reason for that '75%' is that integer-only binaries are usually > compiled with -mpreferred-stack-boundary=2. People used the switch > "-mpreferred-stack-boundary=2" to tell the compiler "I don't care about > performance of floating point". > > If gcc developers made that "-mpreferred-stack-boundary" switch, people > used it. It is not surprising that distributors built packages that don't > do much floating point calculations with it. > > Now it's hard to say that anyone who used the switch was wrong. It very > much ressembles the situatino when mad man cries that he is the only one > normal and all the other people are crazy because they can't tolerate his > behavior. > > > If you don't believe what I'm saying, just run the test on your own. > > > Other things that misalign the stack (found during source review): > > - Mozilla (has assembler stuff for XPCOM calls that aligns only to > 8-bytes, based on the reasoning that it uses only doubles) > - Python (has a FFI library that does callbacks to C, it aligns only for > Darwin ... could be trivially fixed by using the Darwin code for Linux) > - Dosbox (generates machine code that calls back to C, doesn't align) > - Mplayer+Wine (link Windows libraries that call back to C) > - Intel C (aligns only parts that need it, doesn't align in integer > functions) > - Zip (uses some assembler routine that calls to C and doesn't align) The only openSUSE packages that specify -mprefered-stack-boundary are fftw3:4 fftw:4 glibc.i686:2 glibc.i686:4 glibc:2 glibc:4 quickcam:2 valgrind:2 virtualbox-ose:2 where the value after the colon is the specified stack boundary. Richard. ^ permalink raw reply [flat|nested] 59+ messages in thread
end of thread, other threads:[~2009-10-30 9:51 UTC | newest] Thread overview: 59+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2009-08-06 21:42 PATCH: PR target/40838: gcc shouldn't assume that the stack is aligned H.J. Lu 2009-08-06 22:26 ` Jakub Jelinek 2009-08-06 22:52 ` H.J. Lu 2009-10-15 15:58 ` H.J. Lu 2009-10-15 18:45 ` Uros Bizjak 2009-10-15 19:22 ` H.J. Lu 2009-10-15 19:32 ` Uros Bizjak 2009-10-15 19:43 ` H.J. Lu 2009-10-15 19:48 ` Jakub Jelinek 2009-10-15 20:11 ` H.J. Lu 2009-10-15 19:53 ` Uros Bizjak 2009-10-15 21:01 ` H.J. Lu 2009-10-15 21:41 ` Uros Bizjak 2009-10-16 20:27 ` H.J. Lu 2009-10-17 1:03 ` Ian Lance Taylor 2009-10-17 18:22 ` H.J. Lu 2009-10-17 19:02 ` Richard Guenther 2009-10-17 19:21 ` H.J. Lu 2009-10-17 19:29 ` Richard Guenther 2009-10-17 19:35 ` H.J. Lu 2009-10-17 19:46 ` Richard Guenther 2009-10-17 20:01 ` H.J. Lu 2009-10-17 20:59 ` Richard Guenther 2009-10-18 19:21 ` Michael Matz 2009-10-18 19:45 ` Richard Guenther 2009-10-19 16:36 ` H.J. Lu 2009-10-20 1:12 ` Michael Matz 2009-10-20 19:10 ` H.J. Lu 2009-10-19 16:38 ` H.J. Lu 2009-10-19 17:08 ` Ian Lance Taylor 2009-10-19 17:26 ` H.J. Lu 2009-10-19 17:33 ` Ian Lance Taylor 2009-10-19 17:46 ` H.J. Lu 2009-10-19 17:55 ` Ian Lance Taylor 2009-10-19 19:16 ` H.J. Lu 2009-10-19 21:15 ` Ian Lance Taylor 2009-10-20 19:00 ` H.J. Lu 2009-10-20 1:23 ` Michael Matz 2009-10-20 19:12 ` H.J. Lu 2009-10-20 1:53 ` Michael Matz 2009-10-20 21:15 ` H.J. Lu 2009-10-21 1:10 ` H.J. Lu 2009-10-21 9:54 ` Michael Matz 2009-10-21 16:56 ` H.J. Lu 2009-10-30 10:08 ` Richard Guenther 2009-10-17 7:09 ` Uros Bizjak 2009-08-07 0:54 Mikulas Patocka 2009-08-07 7:13 ` Jakub Jelinek 2009-08-07 12:53 ` H.J. Lu 2009-08-07 22:30 ` H.J. Lu 2009-08-08 17:35 ` Mikulas Patocka 2009-08-16 21:25 ` H.J. Lu 2009-08-24 17:39 ` H.J. Lu 2009-09-12 23:32 ` Mikulas Patocka 2009-09-12 23:42 ` Mikulas Patocka 2009-09-13 1:55 ` H.J. Lu 2009-09-13 14:10 ` Mikulas Patocka 2009-08-07 21:08 ` Mikulas Patocka 2009-08-07 21:25 ` Richard Guenther
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).